A crawler using a thread pool and callback functions
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import uuid

pool = ThreadPoolExecutor(200)

# 1. Send a request and return the response
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the index page and extract the video ID numbers
def parse_index(response):
    id_list = re.findall(
        '<a href="video_(.*?)".*?>',
        response.text,
        re.S
    )
    return id_list

# 3. Parse the video detail page and extract the real video URL
def parse_detail(res):
    response = res.result()
    movie_detail_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    print(f'Sending request to video URL: {movie_detail_url} ...')
    # Asynchronously request the real video URL and hand the result to the save_movie callback
    pool.submit(get_page, movie_detail_url).add_done_callback(save_movie)
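The snippet above stops short of two pieces it depends on: the save_movie callback passed to add_done_callback, and the entry point that requests the index page and submits the detail-page tasks. Below is a minimal sketch of those pieces under a few assumptions not present in the original: the index and detail URLs (https://www.example.com/...) are placeholders for the real target site, each download is named with a random uuid (following the import uuid above), and a small queue.Queue handshake keeps the main thread alive until every video has been saved, since on Python 3.9+ the interpreter can otherwise start shutting down while callbacks are still submitting work to the pool.

import queue

# Completion queue used to block the main thread until every video is saved
saved = queue.Queue()

# 4. Save the downloaded video to disk (res is the Future returned by pool.submit)
def save_movie(res):
    response = res.result()
    # Filename assumption: a random uuid with an .mp4 extension
    filename = f'{uuid.uuid4()}.mp4'
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'Saved video as {filename}')
    saved.put(filename)

if __name__ == '__main__':
    # Placeholder URLs: substitute the real index page and detail-page pattern
    index_url = 'https://www.example.com/'
    id_list = parse_index(get_page(index_url))
    for video_id in id_list:
        detail_url = f'https://www.example.com/video_{video_id}'
        # Asynchronously request each detail page; parse_detail runs as its callback
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
    # Wait until every save has completed, so the main thread does not exit
    # while callbacks are still submitting new tasks to the pool
    for _ in id_list:
        saved.get(timeout=120)
    pool.shutdown()

The queue handshake is used instead of pool.shutdown(wait=True) because parse_detail submits new work from inside a callback; shutting the pool down while those callbacks are still pending would make the nested submit raise RuntimeError. The timeout simply makes the script fail loudly rather than hang if a detail page never yields a video URL.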