The script below crawls the chapter index of a novel on xbiquge.la, then downloads every chapter and saves it as a .txt file inside a desktop folder named after the novel:

```python
# coding: utf-8
import os
import re

import requests
from bs4 import BeautifulSoup

names = []  # chapter titles
urls = []   # chapter URLs

url = 'http://www.xbiquge.la/20/20948/'
response = requests.get(url)
response.encoding = 'utf-8'  # set the encoding explicitly to avoid mojibake

# Parse the index page, locate the <div id="list"> block,
# and collect every <a> tag inside it.
soup = BeautifulSoup(response.text, 'lxml')
chapter_list = soup.find('div', id='list')
links = chapter_list.find_all('a')

# Record each chapter's title and absolute URL.
for link in links:
    names.append(link.string)
    urls.append('http://www.xbiquge.la' + link.get('href'))

# The novel's title sits in <div id="info">/<h1>; create a folder
# named after it on the desktop if it does not exist yet.
info = soup.find('div', id='info')
title = info.find('h1').string
folder = os.path.join('C:\\Users\\Administrator\\Desktop', title)
if not os.path.exists(folder):
    os.mkdir(folder)

# Fetch every chapter page, cut out the <div id="content"> block,
# strip the HTML tags, and save the text to a same-named .txt file.
for name, chapter_url in zip(names, urls):
    resp = requests.get(chapter_url)
    resp.encoding = 'utf-8'
    # re.S lets '.' match newlines, since the chapter body spans many lines.
    match = re.search(r'<div id="content">(.*?)</div>', resp.text, re.S)
    if match is None:
        continue
    body = BeautifulSoup(match.group(1), 'lxml').get_text()
    filename = os.path.join(folder, name + '.txt')
    print(name + '.txt')
    # 'with' closes the file even if a later iteration raises.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(name + '\n')
        f.write(body)
```
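Two practical gaps worth closing: chapter titles scraped from the page can contain characters that Windows forbids in filenames (e.g. `?` or `*`), and firing requests back-to-back with no error handling can get the crawler blocked or crash it mid-run. The sketch below is one way to harden the loop; `sanitize_filename` and `fetch_chapter` are helpers introduced here for illustration (not part of the original post), and the one-second delay is an arbitrary politeness value.

```python
import re
import time

import requests

def sanitize_filename(name):
    # Replace characters that Windows does not allow in filenames
    # (hypothetical helper, not in the original script).
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

def fetch_chapter(session, chapter_url, retries=3, delay=1.0):
    # Retry a few times on transient network errors, pausing between
    # attempts so the crawler stays polite to the server.
    for attempt in range(retries):
        try:
            resp = session.get(chapter_url, timeout=10)
            resp.encoding = 'utf-8'
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
```

With these in place, create one `session = requests.Session()` before the loop, fetch each page with `html = fetch_chapter(session, chapter_url)`, and build the output path as `os.path.join(folder, sanitize_filename(name) + '.txt')`.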
Source: 博客园 (cnblogs)
Author: 疾风不弃
Link: https://www.cnblogs.com/hfct/p/11652007.html