爬取多页数据存储在Excel里面

import urllib.request
import random
from lxml import etree
import  xlwt
import os

# Pool of browser User-Agent strings; one is picked at random for each request
# so the scraper does not always identify itself the same way.
head = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]

def get_head(head):
    """Disguise requests as a browser by installing a random User-Agent.

    Picks one entry of *head* at random, builds a urllib opener whose only
    header is that User-Agent, and installs it as the process-wide default
    opener, so later ``urllib.request.urlopen`` calls send it.

    Args:
        head: Sequence of User-Agent strings to choose from.
    """
    user_agent = random.choice(head)
    opener = urllib.request.build_opener()
    # Replace the opener's header list with the single chosen User-Agent.
    opener.addheaders = [('User-Agent', user_agent)]
    # Global effect: every subsequent urlopen() goes through this opener.
    urllib.request.install_opener(opener)

def geturl(url):
    """Fetch one listing page and extract novel names, links and authors.

    A random User-Agent is installed first via ``get_head(head)``. The
    response body is decoded strictly as GBK, so bytes that are not valid
    GBK raise ``UnicodeDecodeError``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A tuple ``(names, links, authors)`` of three lists:
        the ``title`` attributes of ``<a class="name">`` elements, their
        ``href`` attributes, and the text nodes of ``<div class="author">``
        elements. The lists are not guaranteed to have equal lengths.
    """
    # To verify which User-Agent is actually sent, inspect the traffic with
    # a capture tool such as Fiddler.
    get_head(head)
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding fails.
    with urllib.request.urlopen(url) as yemian:
        xiangying = yemian.read().decode('gbk')
    # Parse the document once and reuse the tree for all three queries.
    tree = etree.HTML(xiangying)
    xiaoshuo_name = tree.xpath('//a[@class="name"]/@title')
    xiaoshuo_write = tree.xpath('//div[@class="author"]/text()')
    xiaoshuo_link = tree.xpath('//a[@class="name"]/@href')
    return xiaoshuo_name, xiaoshuo_link, xiaoshuo_write

def zhi():
    """Scrape listing pages 1-3 and append their rows to ``xiaoshuo.txt``.

    Any existing ``xiaoshuo.txt`` is removed first, then each page's results
    are appended as one ``name,link,author`` line per novel.

    The file is written with the platform default text encoding; ``txt()``
    reads it back the same way, so the two must stay in sync.
    """
    cunzai().xiaoshuo()  # start from a clean file, since rows are appended below
    for i in range(1, 4):
        url = 'https://www.qb5.tw/list/' + str(i) + '.html'
        names, links, authors = geturl(url)
        with open('xiaoshuo.txt', 'a+') as f:
            # zip stops at the shortest list, so a page whose name/link/author
            # lists differ in length no longer aborts with IndexError halfway
            # through writing.
            for name, link, author in zip(names, links, authors):
                f.write("%s,%s,%s\n" % (name, link, author))

def txt():
    """Convert ``xiaoshuo.txt`` into the Excel workbook ``demo.xls``.

    Each non-empty line of ``xiaoshuo.txt`` is split on commas and written
    as one spreadsheet row below a header row, with field *n* going to
    column *n*. Any existing ``demo.xls`` is deleted first. The workbook is
    saved once after all rows are written, so a header-only file is
    produced when the input has no data lines.

    Raises:
        FileNotFoundError: If ``xiaoshuo.txt`` does not exist.
    """
    # The input is opened first so that a missing xiaoshuo.txt fails before
    # the old workbook is deleted; ``with`` guarantees it is closed.
    with open('xiaoshuo.txt') as fopen:
        cunzai().biaoge()  # drop the workbook left over from a previous run
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('小说')
        row0 = ["小说名字", "地址", "作者"]
        for col, title in enumerate(row0):
            ws.write(0, col, title)
        ws.col(0).width = 6000
        ws.col(1).width = 10000
        ws.col(2).width = 5000
        hang = 1  # data starts on the second row; the first holds the headers
        for line in fopen:
            line = line.strip('\n')
            if not line:
                # A blank line would otherwise produce an empty row.
                continue
            # Write field n to column n regardless of how many fields the
            # line has (the old arithmetic was only right for exactly three).
            for col, value in enumerate(line.split(',')):
                ws.write(hang, col, value)
            hang += 1
    # Save once at the end instead of after every single cell.
    wb.save('demo.xls')

class cunzai(object):
    """Remove output files left behind by a previous run."""

    @staticmethod
    def _discard(path):
        """Delete *path* if it is an existing file; otherwise do nothing."""
        if not os.path.isfile(path):
            return
        os.remove(path)

    def xiaoshuo(self):
        """Delete ``xiaoshuo.txt`` from the working directory if present."""
        self._discard('xiaoshuo.txt')

    def biaoge(self):
        """Delete ``demo.xls`` from the working directory if present."""
        self._discard('demo.xls')

if __name__ == '__main__':
    # The bare string literal below is disabled sample code (a manual
    # one-page check of geturl); as a string expression it does nothing.
    '''
    url='https://www.qb5.tw/list/1.html'
    aa=geturl(url) #测试geturl(url)函数
    print(aa)
    '''
    # Scrape the listing pages into xiaoshuo.txt, then turn that file into
    # the demo.xls workbook.
    zhi()
    txt()
爬取到的xiaoshuo.txt内容
在这里插入图片描述
来源：CSDN
作者：qq_39043100
链接：https://blog.csdn.net/qq_39043100/article/details/103730544
标签
mozilla
url