"""Download a novel from www.shujy.com chapter by chapter into '<title>.txt'.

Chapters on this site are usually split across two HTML pages; both halves
are fetched, stripped of HTML artifacts and concatenated before writing.
"""
import requests  # third-party: pip install requests
import re  # standard library

# Index page of the novel to download.
BASE_URL = 'https://www.shujy.com/5200/9613/'
# Browser-like User-Agent: without it the site tends to answer 403 (anti-bot).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"}


def _fetch(url):
    """GET *url* with browser-like headers and return the UTF-8 decoded body."""
    response = requests.get(url, headers=HEADERS)
    response.encoding = 'utf-8'
    return response.text


def _clean(raw):
    """Strip HTML artifacts and the pagination hint from raw chapter HTML.

    The original removed two kinds of blank ('\xa0' non-breaking space and a
    plain space — presumably a rendered '&nbsp;'; TODO confirm against a live
    page), both <br> spellings, and the '本章未完,请点击' page-break notice.
    """
    for junk in ('\xa0', ' ', '<br />', '<br/>', '本章未完,请点击'):
        raw = raw.replace(junk, '')
    return raw


def _chapter_text(chapter_url):
    """Return the cleaned full text of one chapter (both pages joined)."""
    html = _fetch(chapter_url)
    # First page: content runs up to the red "next page" anchor.
    first = re.findall(r'<div id="content">(.*?)<a style="color:red;', html, re.S)
    if not first:
        # Robustness fix: single-page chapter — content runs to the footer
        # div instead (the original indexed [0] and crashed here).
        whole = re.findall(r'<div id="content">(.*?)<div class="bottem2">', html, re.S)
        return _clean(whole[0]) if whole else ''
    text = _clean(first[0])
    # Second page, reached via the red "下一页" (next page) link, if present.
    next_href = re.findall(r'<a style="color:red;" href="(.*?)">下一页', html, re.S)
    if next_href:
        second_html = _fetch(BASE_URL + next_href[0])
        second = re.findall(r'<div id="content">(.*?)<div class="bottem2">', second_html, re.S)
        if second:
            text += _clean(second[0])
    return text


def main():
    """Scrape every chapter listed on the index page into '<book name>.txt'."""
    index_html = _fetch(BASE_URL)
    # Book name from the og:novel meta tag. Bug fix: findall returns a list,
    # so take [0] — the original formatted the whole list into the filename,
    # producing "['name'].txt".
    title = re.findall(r'<meta property="og:novel:book_name" content="(.*?)"/>', index_html)[0]
    # Chapter links live inside <div id="list"> ... </div>.
    listing = re.findall(r'<div id="list">.*?</div>', index_html, re.S)[0]
    # Each match is (relative href, chapter title).
    chapters = re.findall(r'href="(.*?)">(.*?)<', listing)
    # 'with' guarantees the file is closed (the original never closed it).
    with open('%s.txt' % title, 'w', encoding='utf-8') as out:
        for href, chapter_title in chapters:
            chapter_url = BASE_URL + href
            out.write(chapter_title)
            out.write(_chapter_text(chapter_url))
            out.write('\n')
            print(chapter_url)


if __name__ == '__main__':
    main()
# PS: The site's anti-scraping is aggressive — after a certain number of
# requests it starts returning empty pages.
# The VS Code terminal buffer is too small: at first it looked like pages
# had failed to download, but the output had simply been truncated.
# Disabled earlier draft kept for reference: a scraper for jingcaiyuedu.com,
# neutralized inside a module-level string literal (never executed).
'''
import requests
import re
#1.模拟浏览器读取主页
ag = {'User-Agent':'Mozilla/5.0'}
url = 'http://www.jingcaiyuedu.com/novel/DGCcu2/list.html'
r = requests.get(url, headers = ag)
r.encoding = 'utf-8'
html = r.text
#2.创建txt文件
title = re.findall(r'<meta name="description" content="(.*?)"/>',html)[0]
#print(title)
fb = open('%s.txt' % title, 'w', encoding='utf-8')#3.获得 href,名称 的列表
dl = re.findall(r'<dl class="panel-body panel-chapterlist">.*?</dl>',html, re.S)[0]
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<',dl)
for i in chapter_info_list:
chapter_url,chapter_title = i
ag = {'User-Agent':'Mozilla/5.0'}
chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url
response = requests.get(chapter_url, headers =ag)
response.encoding = 'utf-8'
#将每一章文本信息提取
book_html = response.text
chapter_content = re.findall(r'<div class="panel-body" id="htmlContent">(.*?)</div>',book_html, re.S)[0]
#清洗
chapter_content = chapter_content.replace('','')
chapter_content = chapter_content.replace('<br>','')
chapter_content = chapter_content.replace('<br />','')
chapter_content = chapter_content.replace('<p>','')
chapter_content = chapter_content.replace('</p>','')
chapter_content = chapter_content.replace(' ','')
#保存
fb.write(chapter_title)
fb.write(chapter_content)
print('成功:',chapter_url)
'''
# Disabled earlier draft kept for reference: a single-page-per-chapter version
# of the shujy.com scraper, neutralized inside a string literal (never executed).
'''
#库的准备
import requests #cmd pip install requests安装
import re #系统库
url = 'https://www.shujy.com/5200/9613/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",}
response=requests.get(url)
response.encoding='utf-8'
html=response.text
title=re.findall(r'<meta property="og:novel:book_name" content="(.*?)"/>',html)[0]
fb=open('%s.txt'% title,'w',encoding='utf-8')
dl=re.findall(r'<div id="list">.*?</div>',html,re.S)[0]
chapter_info_list=re.findall(r'<a href="(.*?)">(.*?)</a>',dl)
for chapter_info in chapter_info_list:
chapter_url,chapter_title=chapter_info
chapter_url="https://www.shujy.com/5200/9613/%s " % chapter_url
chapter_url=chapter_url.replace(' ','')
chapter_response=requests.get(chapter_url,headers=headers)
chapter_response.encoding='utf-8'
chapter_html=chapter_response.text
chapter_content=re.findall(r'<div id="content">(.*?)</div>',chapter_html,re.S)[0]
chapter_content=chapter_content.replace(' ','')
chapter_content=chapter_content.replace('<br />','')
chapter_content=chapter_content.replace('&t;','')
fb.write(chapter_title)
fb.write('\n')
fb.write(chapter_content)
fb.write('\n')
print(chapter_url,chapter_title)
'''
# Source: https://www.cnblogs.com/kyx599/p/12173806.html