# Movie spider for dytt8.net (电影天堂 / "Movie Heaven").
# Walks every page of the 最新电影 listing, opens each movie's detail page,
# and prints the movie's metadata fields plus its download link.
import time

import requests
from lxml import etree

DOMAIN = "https://dytt8.net"
HEADERS = {
    "Referer": "https://dytt8.net/html/gndy/dyzz/index.html",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# Simple one-line metadata fields on a detail page. Each entry is the marker
# prefix as it appears in the page text after full-width spaces (U+3000) have
# been replaced with ASCII spaces; the printed label is the marker without the
# leading "◎". NOTE(review): marker spacing must match the live page text
# exactly — copied verbatim from the original literals.
_SIMPLE_FIELDS = (
    "◎片 名",
    "◎年 代",
    "◎产 地",
    "◎类 别",
    "◎语 言",
    "◎字 幕",
    "◎上映日期",
    "◎IMDb评分",
    "◎豆瓣评分",
    "◎文件格式",
    "◎视频尺寸",
    "◎文件大小",
    "◎片 长",
    "◎导 演",
    "◎编 剧",
    "◎标 签",
)


def get_page_info(url, flag=True, max_retries=10):
    """Fetch *url* and return it parsed as an lxml HTML element tree.

    :param url: absolute URL to download.
    :param flag: True  -> decode via requests' guessed encoding (listing pages);
                 False -> decode the raw bytes as GBK (the detail pages' charset).
    :param max_retries: attempts before giving up. The original looped forever
        on a permanently failing URL; a bounded retry with linear back-off is
        safer.
    :raises RuntimeError: when every attempt returns a non-200 status.
    """
    time.sleep(1)  # be polite: pause between consecutive page fetches
    for attempt in range(max_retries):
        response = requests.get(url=url, headers=HEADERS)
        if response.status_code == 200:
            text = response.text if flag else response.content.decode("gbk")
            return etree.HTML(text)
        # back off a little longer after each failed attempt (1s, 2s, 3s, ...)
        time.sleep(attempt + 1)
    raise RuntimeError(
        "failed to fetch {} after {} attempts".format(url, max_retries)
    )


def get_pages():
    """Return the total number of listing pages, read from the pager <select>."""
    url = DOMAIN + "/html/gndy/dyzz/index.html"
    html = get_page_info(url)
    pages = html.xpath("//select[@name='sldd']/option[last()]/text()")[0]
    return int(pages)


def get_movie_info(detail_url):
    """Print the metadata of one movie detail page, then its download link.

    The page body is a flat sequence of text nodes; most fields sit on a
    single "◎marker value" line, while 主演 (cast) spans several lines and
    简 介 (synopsis) lives on the line after its marker.
    """
    html = get_page_info(detail_url, False)  # detail pages are GBK-encoded
    infos = html.xpath("//div[@id='Zoom']//p/text()")
    for pos, raw in enumerate(infos):
        if raw == '':
            continue
        # Normalize full-width spaces so the markers below match.
        info = str(raw).replace(u'\u3000', u' ').strip()
        if info.startswith("◎译 名"):
            info = info.replace("◎译 名", "").strip()
            print("\n\n======================================译 名:{}===============================".format(info))
        elif info.startswith("◎主 演"):
            # Cast continues on the following lines until the next ◎ marker.
            actors = [info.replace("◎主 演", "").strip()]
            for follow in infos[pos + 1:]:
                follow = follow.strip()
                if follow.startswith("◎"):
                    break
                actors.append(follow)
            print("主演:{}".format(actors))
        elif info.startswith("◎简 介"):
            # The synopsis text is the node after the marker; guard against
            # the marker being the last node (the original would IndexError).
            if pos + 1 < len(infos):
                print("简 介:{}".format(infos[pos + 1].strip()))
        else:
            for marker in _SIMPLE_FIELDS:
                if info.startswith(marker):
                    # Label is the marker without "◎"; this also fixes the
                    # original's inconsistent "◎IMDb评分:" output label.
                    value = info.replace(marker, "").strip()
                    print("{}:{}".format(marker[1:], value))
                    break
    download_url = html.xpath("//table//td[@bgcolor='#fdfddf']/a/@href")
    if download_url:
        print("迅雷下载地址:{}".format(download_url[0]))


def get_detail_url():
    """Walk every listing page and crawl each movie's detail page."""
    for page in range(1, get_pages() + 1):
        url = "{}/html/gndy/dyzz/list_23_{}.html".format(DOMAIN, page)
        html = get_page_info(url)
        detail_urls = html.xpath("//table[@class='tbspan']//a[@class='ulink']/@href")
        for detail_url in detail_urls:
            get_movie_info(DOMAIN + detail_url)


if __name__ == '__main__':
    get_detail_url()