爬虫多线程模板,xpath,etree

China☆狼群 提交于 2019-12-05 04:29:19
class QuiShi:
    """Multi-threaded crawler for the joke pages on lovehhy.net.

    The work is organised as a three-stage pipeline connected by queues:

    * ``url_query``     -- page URLs waiting to be downloaded,
    * ``html_query``    -- downloaded pages (decoded text) waiting to be parsed,
    * ``content_query`` -- parsed item lists waiting to be output.

    NOTE: the imports are not part of this snippet.  The code expects
    ``Queue``, ``threading``, ``time``, ``requests`` and ``etree`` to be
    available at module level.

    Args:
        pages: iterable of page numbers substituted into ``temp_url``.
            Defaults to pages 1 through 4.
        timeout: timeout in seconds handed to ``requests.get`` so that a
            stalled connection cannot block the pipeline forever.
    """

    def __init__(self, pages=range(1, 5), timeout=10):
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
        self.pages = pages
        self.timeout = timeout
        # Queue of page URLs.
        self.url_query = Queue()
        # Queue of downloaded HTML documents.
        self.html_query = Queue()
        # Queue of parsed content lists.
        self.content_query = Queue()

    def get_url_list(self):
        """Put one formatted URL per configured page number on ``url_query``."""
        for page in self.pages:
            self.url_query.put(self.temp_url.format(page))

    def _fetch(self, url):
        """Download *url* and return the response body decoded as GBK."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode("gbk")

    def parse_url(self):
        """Worker loop: take URLs from ``url_query``, push HTML to ``html_query``.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            url = self.url_query.get()
            try:
                # The page is queued for the next stage *before* task_done()
                # so that the downstream queue is never observed as finished
                # while work is still in flight.
                self.html_query.put(self._fetch(url))
            except Exception as exc:
                # Thread top-level boundary: report the failure and keep the
                # worker alive for the remaining URLs.
                print("failed to fetch {0}: {1!r}".format(url, exc))
            finally:
                # Always balance get() with task_done(); otherwise
                # url_query.join() would block forever after a failure.
                self.url_query.task_done()

    def _extract_items(self, html_str):
        """Parse one HTML document into a list of item dicts.

        Each dict has the keys ``title`` and ``title_href`` (raw XPath result
        lists taken from the ``<a>`` inside each ``<h3>``) and ``content``
        (list of text fragments with ideographic spaces removed).
        """
        html_str = html_str.replace("<br />", "").strip()
        # etree.HTML turns the markup into an element tree.
        html = etree.HTML(html_str)
        content_list = []
        for h3 in html.xpath('//div[@id="footzoon"]/h3'):
            # NOTE(review): this collects the text of *every* following
            # sibling <div>, not only the nearest one -- confirm against the
            # page structure whether ``div[1]`` was intended.
            fragments = h3.xpath('./following-sibling::div/text()')
            content_list.append({
                "title": h3.xpath("./a/text()"),
                "title_href": h3.xpath("./a/@href"),
                "content": [text.replace("\u3000", "") for text in fragments],
            })
        return content_list

    def get_content_list(self):
        """Worker loop: parse documents from ``html_query`` into ``content_query``.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            html_str = self.html_query.get()
            try:
                self.content_query.put(self._extract_items(html_str))
            except Exception as exc:
                # Thread top-level boundary: one bad page must not stop the
                # parser stage.
                print("failed to parse page: {0!r}".format(exc))
            finally:
                self.html_query.task_done()

    def save_content_list(self):
        """Worker loop: print every item list taken from ``content_query``.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            cons = self.content_query.get()
            try:
                print(cons)
            finally:
                self.content_query.task_done()

    def run(self):
        """Start the pipeline threads and block until all queued work is done.

        One thread fills the URL queue, three threads download pages, one
        parses and one prints.  The stage workers are daemon threads: they
        loop forever, so they must not keep the interpreter alive once the
        queues have been drained.
        """
        producer = threading.Thread(
            target=self.get_url_list, name="QuiShi-urls", daemon=True
        )
        workers = [
            threading.Thread(
                target=self.parse_url,
                name="QuiShi-fetch-{0}".format(index),
                daemon=True,
            )
            for index in range(1, 4)
        ]
        workers.append(threading.Thread(
            target=self.get_content_list, name="QuiShi-parse", daemon=True
        ))
        workers.append(threading.Thread(
            target=self.save_content_list, name="QuiShi-save", daemon=True
        ))

        producer.start()
        for worker in workers:
            worker.start()

        # Wait until every URL has been enqueued; otherwise url_query.join()
        # could return immediately on a still-empty queue.
        producer.join()
        # Each stage enqueues its output before marking its input done, so
        # joining the queues in pipeline order waits for all work.
        for stage in (self.url_query, self.html_query, self.content_query):
            stage.join()


if __name__ == '__main__':
    started = time.time()
    quishi = QuiShi()
    quishi.run()
    # Elapsed wall-clock time in seconds.
    print(time.time() - started)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!