# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
from lxml import etree
class BookSpider(object):
    """Download every chapter listed on the site's index page into one text file.

    The index page at ``self.url`` is fetched, the chapter links and titles
    are extracted with XPath, and each chapter's paragraph text is appended
    to ``剑来.txt`` in the current working directory.
    """

    # Seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block the whole run indefinitely.
    REQUEST_TIMEOUT = 30

    # Substrings stripped from the joined chapter text before saving.
    _JUNK_FRAGMENTS = (
        '<p>',
        '</p>',
        '<script type="text/javascript" src="/tb.js"></script>',
        '<br />',
        '—',
    )

    def __init__(self):
        # Index page listing all chapters.
        self.url = "http://www.jianlaixiaoshuo.com/"
        # Base against which chapter hrefs are resolved.
        self.base_url = "http://www.jianlaixiaoshuo.com/"
        # The header name must be spelled "User-Agent"; any other key is sent
        # as an unrelated header and the default client User-Agent is used.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        }

    def get_html(self, url):
        """Fetch ``url`` with the spider's headers and return the body as text.

        The response bytes are decoded with ``bytes.decode()``'s default
        codec (UTF-8).
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    def get_xpath(self, html, pattern):
        """Parse ``html`` with lxml and return the result of XPath ``pattern``."""
        tree = etree.HTML(html)
        return tree.xpath(pattern)

    def save_data(self, data):
        """Append ``data`` to ``剑来.txt`` (UTF-8) in the current directory."""
        with open('剑来.txt', 'a', encoding='utf-8') as f:
            f.write(data)

    def down_load(self, url):
        """Download all chapters linked from the index page at ``url``.

        ``url`` is the chapter-index page; when it is empty or ``None`` the
        spider's own ``self.url`` is used instead. Each chapter is printed
        and appended to the output file via ``save_data``.
        """
        index_html = self.get_html(url or self.url)

        # Chapter link targets and chapter titles, taken from the same
        # anchors so the two lists pair up positionally.
        chapter_links = self.get_xpath(
            index_html, '//dl[@class="chapterlist"]/dd/a/@href'
        )
        chapter_names = self.get_xpath(
            index_html, '//dl[@class="chapterlist"]/dd/a/text()'
        )
        print(chapter_links)

        for chapter_name, chapter_link in zip(chapter_names, chapter_links):
            # urljoin copes with relative, root-relative and absolute hrefs,
            # where plain concatenation produced "//" or a malformed URL.
            chapter_url = urljoin(self.base_url, chapter_link)
            chapter_html = self.get_html(chapter_url)

            # Collect the text nodes of every paragraph in the chapter body
            # and merge them into a single string.
            paragraphs = self.get_xpath(
                chapter_html, '//div[@id="BookText"]/p/text()'
            )
            chapter_body = ''.join(paragraphs)
            for fragment in self._JUNK_FRAGMENTS:
                chapter_body = chapter_body.replace(fragment, '')

            chapter_text = chapter_name + '\n' + chapter_body + '\n'
            print('正在下载', chapter_name)
            print(chapter_text)
            self.save_data(chapter_text)

    def run(self):
        """Run the spider against its configured index page."""
        self.down_load(self.url)
# Script entry point: build a spider and start the download.
if __name__ == "__main__":
    BookSpider().run()
# Source: https://www.cnblogs.com/maxxu11/p/12631126.html