1 """
2 爬取豆瓣电影TOP250 - 完整示例代码
3 """
4
5 import codecs
6
7 import requests
8 from bs4 import BeautifulSoup
9
10 DOWNLOAD_URL = 'http://movie.douban.com/top250/'
11
12
def download_page(url, timeout=10):
    """Download ``url`` and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            can block the caller indefinitely.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.RequestException: If the connection fails, the timeout
            expires, or the server answers with a 4xx/5xx status.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/47.0.2526.80 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors here instead of returning an error page as content.
    response.raise_for_status()
    return response.content
17
18
def parse_html(html):
    """Extract movie titles and the next-page URL from one listing page.

    Args:
        html: Markup of a single listing page.

    Returns:
        A ``(titles, next_url)`` tuple. ``titles`` contains, in document
        order, the text of the first ``span.title`` found inside the
        ``div.hd`` of each ``<li>`` under ``ol.grid_view``; entries lacking
        those elements are skipped, and the list is empty when the ``<ol>``
        itself is absent. ``next_url`` is ``DOWNLOAD_URL`` followed by the
        ``href`` of the link inside ``span.next``, or ``None`` when that
        link (or its ``href``) is missing.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    movie_name_list = []
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is not None:
        for movie_li in movie_list_soup.find_all('li'):
            detail = movie_li.find('div', attrs={'class': 'hd'})
            if detail is None:
                continue
            title = detail.find('span', attrs={'class': 'title'})
            if title is None:
                continue
            movie_name_list.append(title.getText())

    # The pager span may be missing, or present without a link inside it.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page is not None and next_page.get('href'):
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None
35
36
def main():
    """Crawl the listing page by page and save the titles to ``movies``.

    Starting at ``DOWNLOAD_URL``, each page is downloaded and parsed; the
    titles of that page are written to the UTF-8 file ``movies``, one per
    line. The loop ends when ``parse_html`` reports no next-page URL.
    """
    next_url = DOWNLOAD_URL

    with codecs.open('movies', 'wb', encoding='utf-8') as out:
        while next_url:
            titles, next_url = parse_html(download_page(next_url))
            page_text = '\n'.join(titles)
            out.write(u'{movies}\n'.format(movies=page_text))
45
46
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# Original article: https://zhuanlan.zhihu.com/p/20423182
# Thanks to the original poster.
# Source: https://www.cnblogs.com/jiaqi77/p/12020773.html