
1 # _author: Jolly
2 # date: 2019/8/28
3
4 import requests
5 import time
6 from lxml import etree
7
8
# Scheme and host that get_detail_url() prepends to the links it extracts.
BASE_DOMAIN = 'https://dytt8.net'
# Listing page that is fetched below and scanned for detail-page links.
dytt8_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
# Headers sent with every request made by this script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'Referer': 'https://www.dytt8.net/html/gndy/dyzz/list_23_2.html',
}
# The listing page is fetched and parsed once, at import time; get_detail_url()
# reads the resulting module-level ``html`` tree.
# The timeout keeps an unresponsive server from blocking the script forever
# (requests waits indefinitely when no timeout is given).
response = requests.get(dytt8_url, headers=HEADERS, timeout=10)
html = etree.HTML(response.text)
def get_detail_url():
    """Visit every movie detail page linked from the listing page.

    Reads the module-level ``html`` tree (the parsed listing page), takes the
    first link inside each ``table.tbspan`` under ``div.co_content8``,
    prefixes it with ``BASE_DOMAIN`` and passes the resulting URL to
    ``parse_detail_url``.  Returns nothing.
    """
    tables_tag = html.xpath('//div[@class="co_content8"]//table[@class="tbspan"]')
    for table in tables_tag:
        hrefs = table.xpath('.//a/@href')
        if not hrefs:
            # A table without any link has nothing to follow.  Indexing [0]
            # unconditionally would raise IndexError and abort the whole crawl.
            continue
        full_detail_url = BASE_DOMAIN + hrefs[0]
        # Wait half a second before each detail request
        # (presumably to throttle the crawl).
        time.sleep(0.5)
        parse_detail_url(full_detail_url)
27
28
29
def _parse_text_fields(text_infomations):
    """Extract the labelled fields from the text nodes of a detail page.

    ``text_infomations`` is the list of text strings found inside the page's
    ``Zoom`` div.  Returns a dict holding whichever of the keys
    ``translate_name``, ``year``, ``country``, ``language``, ``actors`` and
    ``profile`` could be found.
    """
    # Single-line fields: (marker the line starts with, key in the result).
    simple_fields = (
        ('◎译 名', 'translate_name'),
        ('◎年 代', 'year'),
        ('◎产 地', 'country'),
        ('◎字 幕', 'language'),
    )
    fields = {}
    for index, info in enumerate(text_infomations):
        if info.startswith('◎主 演'):
            # The first actor shares the marker line; the rest follow one per
            # text node until the next '◎' marker.
            actors = [info.replace('◎主 演', "").strip()]
            for following in text_infomations[index + 1:]:
                actor = following.strip()
                if actor.startswith('◎'):
                    break
                actors.append(actor)
            fields['actors'] = actors
        elif info.startswith('◎简 介'):
            # Gather every non-empty line after the marker, up to the
            # download section, and keep all of them.  Start after the marker
            # line so the marker itself is not part of the profile.
            paragraphs = []
            for following in text_infomations[index + 1:]:
                paragraph = following.strip()
                if paragraph.startswith('【下载地址】'):
                    break
                if paragraph:
                    paragraphs.append(paragraph)
            fields['profile'] = '\n'.join(paragraphs)
        else:
            for marker, key in simple_fields:
                if info.startswith(marker):
                    fields[key] = info.replace(marker, "").strip()
                    break
    return fields


def parse_detail_url(full_detail_url):
    """Download one movie detail page and print the information found on it.

    The page is fetched with the module-level ``HEADERS``, decoded as GBK and
    searched for the ``div`` with id ``Zoom``.  The printed dict contains:

    * ``cover`` / ``screenshot`` -- first and second image ``src`` inside the
      div (``screenshot`` falls back to ``cover`` when there is only one
      image; both are ``None`` when there is none),
    * the text fields extracted by ``_parse_text_fields``,
    * ``download_url`` -- list of every link ``href`` inside the div.

    Returns nothing.  A page without a ``Zoom`` div is reported and skipped.
    """
    # The timeout keeps an unresponsive server from stalling the crawl.
    response = requests.get(full_detail_url, headers=HEADERS, timeout=10)
    # errors='ignore': one byte that is not valid GBK would otherwise raise
    # UnicodeDecodeError and lose the whole page.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    zoom_divs = html.xpath('//div[@id="Zoom"]') if html is not None else []
    if not zoom_divs:
        # Nothing to parse; report and move on instead of failing on [0].
        print('skip {}: no Zoom container found'.format(full_detail_url))
        return
    div_tag = zoom_divs[0]

    movie_infomations = {}
    thumbnail = div_tag.xpath('.//img/@src')
    cover = thumbnail[0] if thumbnail else None  # cover image
    movie_infomations['cover'] = cover
    # Second image is the screenshot; fall back to the cover when absent.
    movie_infomations['screenshot'] = thumbnail[1] if len(thumbnail) >= 2 else cover

    text_infomations = div_tag.xpath('.//text()')  # text content of the div
    movie_infomations.update(_parse_text_fields(text_infomations))

    # Store the links so the query result is part of the output rather than
    # being computed and discarded.
    movie_infomations['download_url'] = div_tag.xpath('.//a/@href')

    print(movie_infomations)
88
89
if __name__ == "__main__":
    # Script entry point: walk the listing page and process each detail page.
    get_detail_url()
# Source: https://www.cnblogs.com/Jolly-hu/p/12227314.html
