项目1:实现豆瓣电影TOP250标题爬取:
1 from urllib.request import urlopen
2 from bs4 import BeautifulSoup
3 import re
4
class doubanSpider():
    """Collect the movie titles shown on the Douban Top 250 listing pages.

    The listing is paginated; each page is addressed by a ``start`` query
    parameter holding the zero-based offset of its first movie.

    Attributes are plain instance fields:
        page    -- zero-based index of the page ``cur_url`` points at.
        cur_url -- URL of the page that the next ``downloadURL`` call fetches.
        datas   -- titles collected so far, in the order they were scraped.
    """

    # URL of one listing page; ``start`` is the offset of its first movie.
    URL_TEMPLATE = "http://movie.douban.com/top250?start={start}&filter=&type="
    # Offset step between consecutive pages (start=0, start=25, ...).
    PAGE_SIZE = 25
    # Number of pages visited by ``claw``.
    PAGE_COUNT = 10

    def __init__(self):
        """Start at the first page with an empty result list."""
        self.page = 0
        self.cur_url = self.URL_TEMPLATE.format(start=0)
        self.datas = []

    def claw(self):
        """Download every page in turn, then print the collected titles."""
        while self.page < self.PAGE_COUNT:
            self.downloadURL()
            self.updateURL()
        # Printed once after the loop: ``datas`` is cumulative, so printing
        # per page would repeat every earlier title.
        self.output()

    def updateURL(self):
        """Advance to the next page and point ``cur_url`` at it."""
        self.page += 1
        # Rebuild the URL from the page number. ``str.replace`` returns a new
        # string, so calling it without assigning the result changes nothing,
        # and a "start=0" pattern would stop matching after the first page.
        self.cur_url = self.URL_TEMPLATE.format(
            start=self.page * self.PAGE_SIZE
        )

    def downloadURL(self):
        """Fetch ``cur_url`` and append its primary movie titles to ``datas``.

        Any exception raised by ``urlopen`` propagates to the caller.
        """
        # The parser consumes the response inside the ``with`` block, so the
        # connection is closed as soon as the document has been read.
        with urlopen(self.cur_url) as response:
            bsObj = BeautifulSoup(response, "html.parser")
        for span in bsObj.find_all("span", {"class": "title"}):
            text = span.get_text()
            # Spans whose text contains "\xa0/\xa0" are skipped; presumably
            # these hold alternate titles -- confirm against the page markup.
            if "\xa0/\xa0" not in text:
                self.datas.append(text)

    def output(self):
        """Print each collected title prefixed with its 1-based rank."""
        for num, title in enumerate(self.datas, start=1):
            print("TOP" + str(num) + ": " + title)
41
if __name__ == "__main__":
    # Script entry point: print a banner, then scrape all pages and list
    # the ranked titles.
    print("豆瓣电影TOP250:python抓取")
    myspider = doubanSpider()
    myspider.claw()

来源:https://www.cnblogs.com/chenjz1993/p/7252057.html