'''Scrape the Douban Top 250 movie list.

The list is spread over ten index pages of 25 movies each, selected with the
``start`` query parameter:

- page 1:  https://movie.douban.com/top250?start=0&filter=
- page 2:  https://movie.douban.com/top250?start=25&filter=
- page 3:  https://movie.douban.com/top250?start=50&filter=
- page 10: https://movie.douban.com/top250?start=225&filter=

Steps:

1. Build the URL of every index page.
2. Send a request to each index page and read the response.
3. Locate each movie's ``<div class="item">`` in the returned HTML.
4. From each of those, extract the detail-page URL, the movie title, the
   rating and the number of ratings.
'''
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

# Paging of the Top 250 list: ten index pages of 25 movies each.
PAGE_SIZE = 25
TOTAL_MOVIES = 250

# One match per movie entry; the four groups are, in order:
# detail-page URL, title, rating, number of ratings.
# Compiled once here so it is not rebuilt/looked up for every page.
# re.S lets ``.`` span newlines, since one entry covers many HTML lines.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<span class="rating_num".*?>(.*?)</span>'
    r'.*?<span>(.*?)人评价',
    re.S,
)


# The three steps of a crawler
# 1. Send the request
def get_html(url, timeout=10):
    '''Fetch ``url`` with the module-level ``headers`` and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` for a successful request.

    Raises:
        requests.RequestException: On connection problems, on timeout, or
            when the server answers with a 4xx/5xx status.
    '''
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page: parsing it would yield no movies and
    # silently shift the rank of every movie that follows.
    response.raise_for_status()
    return response


# 2. Parse the data
def parse_html(response):
    '''Extract the movie entries from one index page.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of ``(url, name, point, commit)`` string tuples, one per
        matched entry, e.g.
        ``('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867')``.
        The list is empty when nothing matches.
    '''
    return _MOVIE_PATTERN.findall(response.text)


# 3. Save the data
def save_data(movie_data_list, num):
    '''Print one movie record and append it to ``douban_top250.txt``.

    Args:
        movie_data_list: A single ``(url, name, point, commit)`` tuple as
            produced by ``parse_html`` (despite the parameter name, this is
            one movie, not a list of movies).
        num: Rank number written into the record.

    Raises:
        ValueError: If ``movie_data_list`` does not hold exactly four items.
    '''
    # ('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867')
    url, name, point, commit = movie_data_list

    # Format the record: one labelled field per line.
    movie_data = f'''
    电影排名:{num}
    详情页url:{url}
    电影名字:{name}
    电影评分:{point}
    评价人数:{commit}
    '''
    print(movie_data)

    # Append mode: every call adds one record to the end of the file.
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(movie_data)


def _iter_movies():
    '''Yield every movie tuple from all index pages, in list order.'''
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        yield from parse_html(get_html(url))


def main():
    '''Scrape all index pages and save each movie with its running rank.'''
    # Ranks start at 1 and keep counting across page boundaries.
    for rank, movie_tuple in enumerate(_iter_movies(), start=1):
        save_data(movie_tuple, rank)


if __name__ == '__main__':
    main()
# Source: https://www.cnblogs.com/xichenHome/p/12153273.html