# A simple script for scraping a novel
'''
Scrapes a novel from the Dingdian novel site (顶点小说).
Site address: https://www.booktxt.net
This script is for learning purposes only.
'''
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

book_name = '5_5626'  # id of the novel as it appears in the site's URL
book_url = 'https://www.booktxt.net' + '/' + book_name + '/'  # index page of the novel
m = 5  # maximum number of chapters to download (kept small for testing)

# Number of leading index links dropped before the chapter list proper.
# NOTE(review): the original comment mentions 7 entries while the code drops
# 8 -- confirm against the live index page.
SKIP_LINKS = 8

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def fetch_soup(url):
    """Download *url* and return the page parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # taken from the response headers.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, features='html.parser')


def chapter_urls(index_soup, base_url):
    """Return the absolute chapter URLs listed on the novel's index page.

    Args:
        index_soup: parsed index page; chapters are the ``<dd><a href>``
            entries inside the element with ``id="list"``.
        base_url: URL of the index page, used to resolve relative hrefs.

    Raises:
        RuntimeError: if the page has no element with ``id="list"``.
    """
    catalog = index_soup.find(id='list')
    if catalog is None:
        raise RuntimeError('chapter list (id="list") not found at ' + base_url)

    urls = []
    for entry in catalog.find_all('dd'):
        link = entry.find('a')
        href = link.get('href') if link is not None else None
        if href:
            # urljoin copes with relative, root-relative and absolute hrefs.
            urls.append(urljoin(base_url, href))
    return urls[SKIP_LINKS:]


def chapter_lines(nodes):
    """Return the text lines of a chapter body.

    Each child node is rendered with ``str``; ``<br/>`` separators are
    dropped and leading whitespace is stripped from everything else.
    """
    return [str(node).lstrip() for node in nodes if str(node) != "<br/>"]


def main():
    """Download up to ``m`` chapters and write them to ``<book_name>.txt``."""
    urls = chapter_urls(fetch_soup(book_url), book_url)

    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding; the ``with`` block closes the file, no manual close needed.
    with open(book_name + '.txt', 'w', encoding='utf-8') as f:
        for n, url in enumerate(urls[:m], start=1):
            page = fetch_soup(url)
            title = page.find('h1')
            content = page.find(id='content')
            if title is None or content is None:
                raise RuntimeError('unexpected chapter page layout: ' + url)

            f.write(title.text + '\n')
            for line in chapter_lines(content):
                f.write(line + '\n')
            f.write('\n\n')
            print('第%d章写入完成!' % n)

            # Random pause between requests to avoid getting the IP banned.
            time.sleep(random.randint(3, 6))


if __name__ == '__main__':
    main()
# Source: https://www.cnblogs.com/MMTTBD/p/10514261.html