Saving a novel chapter by chapter with a Python scraper

Submitted by Anonymous (unverified) on 2019-12-02 22:51:30
# coding: utf-8
import os
import re

import requests
from bs4 import BeautifulSoup

names = []  # chapter titles
urls = []   # chapter links

url = 'http://www.xbiquge.la/20/20948/'
response = requests.get(url)
response.encoding = 'utf-8'  # set the encoding explicitly to avoid mojibake

# Parse the index page: the chapter list lives in <div id="list">,
# and every chapter is an <a> tag inside it.
soup = BeautifulSoup(response.text, 'lxml')
chapter_list = soup.find('div', id='list')
links = chapter_list.find_all('a')

# Collect each chapter title and build its absolute URL.
for a in links:
    names.append(a.string)
    urls.append('http://www.xbiquge.la' + a.get('href'))

# The novel title sits in the <h1> inside <div id="info">;
# create a folder with that name on the desktop if it does not exist yet.
info = soup.find('div', id='info')
title = info.find('h1').string
folder = 'C:\\Users\\Administrator\\Desktop\\%s' % title
if not os.path.exists(folder):
    os.mkdir(folder)

# Visit each chapter URL, extract the text inside <div id="content">,
# and save it to a .txt file named after the chapter.
for i in range(len(urls)):
    response1 = requests.get(url=urls[i])
    response1.encoding = 'utf-8'
    # re.S lets '.' match newlines, because the chapter body spans many lines
    content = re.findall('<div id="content">(.*?)</div>', response1.text, re.S)
    body = BeautifulSoup(str(content), 'lxml')
    filename = names[i] + '.txt'
    print(filename)
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
        f.write(names[i] + '\n')
        f.write(body.get_text())
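
As a side note, the re.findall step is not strictly necessary: BeautifulSoup can read the chapter body straight out of <div id="content"> on the parsed page. Below is a minimal sketch under the assumption that every chapter page on xbiquge.la keeps that layout; the fetch_chapter helper name is my own, not part of the original script.

# coding: utf-8
import requests
from bs4 import BeautifulSoup

def fetch_chapter(chapter_url):
    """Hypothetical helper: fetch one chapter page and return its plain text.

    Assumes the same xbiquge.la layout as the script above,
    i.e. the chapter body lives in <div id="content">.
    """
    resp = requests.get(chapter_url)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')
    content = soup.find('div', id='content')
    # separator='\n' preserves the line breaks that <br/> tags represent
    return content.get_text(separator='\n', strip=True)

# Usage: replaces the regex plus second BeautifulSoup pass in the loop above
# text = fetch_chapter(urls[0])  # first chapter from the list built earlier

Working from the parsed tree rather than a regex also survives small markup changes, such as extra attributes appearing on the content div.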