import requests
import bs4

# Fetch the HTML source of a page
def gethtml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Guess the real encoding from the content, in case the declared charset is wrong
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # The request was rejected or failed; report it and return an empty
        # page so callers simply parse no content
        print("Failed to fetch " + url)
        return ""
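# Quick usage sketch for gethtml (hypothetical call, assumes the site is
# reachable; the catalog URL is the one used in __main__ below):
#   print(gethtml("http://www.bjkgjlu.com/303618kyi/catalog")[:200])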

# Extract the chapter text from a single page and save it to <name>.txt
def chapters(url, name):
    html = gethtml("http://www.bjkgjlu.com" + url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Open the file once so multiple chapter_content divs append to it
    # instead of overwriting one another
    with open(name + ".txt", "wb") as f:
        for i in soup.find_all("div", attrs={"class": "chapter_content"}):
            # Keep only the text before any stray "<" left in the extracted text
            f.write(i.text.split("<")[0].encode("utf-8"))
    print(name + " crawled and saved to file")
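# Usage sketch for chapters (the chapter path here is hypothetical; real
# paths come from the catalog page parsed in __main__ below):
#   chapters("/303618kyi/12345.html", "Chapter 1")  # writes "Chapter 1.txt"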

if __name__ == "__main__":
    url = "http://www.bjkgjlu.com/303618kyi/catalog"
    chapter_name_list = []
    chapter_url_list = []
    html = gethtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Collect chapter names and relative URLs from the catalog page
    for i in soup.find_all("div", attrs={"class": "col-xs-120 col-sm-60 col-md-40 col-lg-30"}):
        # Iterate over the <a> tags directly so stray whitespace nodes
        # among the children are skipped
        for j in i.find_all("a"):
            chapter_name_list.append(j.text)
            chapter_url_list.append(j.get("href"))
    print(chapter_name_list)
    # Fetch every chapter in catalog order
    for j in range(len(chapter_name_list)):
        chapters(chapter_url_list[j], chapter_name_list[j])