from concurrent.futures import ThreadPoolExecutor import requests import re import uuid pool = ThreadPoolExecutor(200) # 1.发送请求函数 def get_page(url): response = requests.get(url) return response # 2.解析主页获取视频ID号 def parse_index(response): id_list = re.findall( '<a href="video_(.*?)".*?>', response.text, re.S ) return id_list # 3.解析视频详情页获取真实 视频链接 def parse_detail(res): response = res.result() movie_detail_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0] print(f'往视频链接: {movie_detail_url}发送请求...') # 异步往视频详情页链接发送请求,把结果交给 pool.submit(get_page, movie_detail_url).add_done_callback(save_movie) return movie_detail_url # 4.往真实视频链接发送请求,获取数据并保存到本地 def save_movie(res): movie_response = res.result() # print(1111) # movie_response = get_page(movie_detail_url) # print(movie_response) name = str(uuid.uuid4()) print(f'{name}.mp4视频开始保存...') with open(f'{name}.mp4', 'wb') as f: f.write(movie_response.content) print('视频下载完毕!') if __name__ == '__main__': # 1.访问主页获取数据 index_response = get_page('https://www.pearvideo.com/') # # 2.解析主页获取所有的视频id号 id_list = parse_index(index_response) print(id_list) # 3.循环对每个视频详情页链接进行拼接 for id in id_list: print(id) detail_url = 'https://www.pearvideo.com/video_' + id # 异步提交爬取视频详情页,把返回的数据,交给parse_detail(回调函数) pool.submit(get_page, detail_url).add_done_callback(parse_detail)
来源:https://www.cnblogs.com/shenblog/p/11732712.html