import threading
import time
from queue import Queue

import requests
from lxml import etree


class QuiShi:
    def __init__(self):
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
        }
        # 1. queue of URLs to fetch
        self.url_query = Queue()
        # queue of downloaded HTML pages
        self.html_query = Queue()
        # queue of extracted content
        self.content_query = Queue()

    def get_url_list(self):
        # put the first four list pages into the URL queue
        for i in range(1, 5):
            self.url_query.put(self.temp_url.format(i))

    def parse_url(self):
        while True:
            url = self.url_query.get()
            # the site is GBK-encoded, so decode the raw bytes explicitly
            self.html_query.put(
                requests.get(url, headers=self.headers).content.decode("gbk")
            )
            self.url_query.task_done()

    def get_content_list(self):
        while True:
            html_str = self.html_query.get()
            html_str = html_str.replace("<br />", "").strip()
            # etree.HTML turns the string into an element tree
            html = etree.HTML(html_str)
            h3_list = html.xpath('//div[@id="footzoon"]/h3')
            content_list = []
            for h3 in h3_list:
                item = {}
                item["title"] = h3.xpath("./a/text()")
                item["title_href"] = h3.xpath("./a/@href")
                item["content"] = []
                for text in h3.xpath('./following-sibling::div/text()'):
                    item["content"].append(text.replace("\u3000", ""))
                content_list.append(item)
            self.content_query.put(content_list)
            self.html_query.task_done()

    def save_content_list(self):
        while True:
            cons = self.content_query.get()
            print(cons)
            self.content_query.task_done()

    def run(self):
        # 1. one thread builds the URL list
        t1 = threading.Thread(target=self.get_url_list)
        # 2. three threads download pages concurrently
        t21 = threading.Thread(target=self.parse_url)
        t22 = threading.Thread(target=self.parse_url)
        t23 = threading.Thread(target=self.parse_url)
        # 3. one thread extracts content, one thread saves (prints) it
        t3 = threading.Thread(target=self.get_content_list)
        t4 = threading.Thread(target=self.save_content_list)
        # the workers loop forever, so run them as daemon threads
        # to let the process exit once all queues are drained
        for t in [t1, t21, t22, t23, t3, t4]:
            t.daemon = True
            t.start()
        # block until every queued item has been processed
        self.url_query.join()
        self.html_query.join()
        self.content_query.join()


if __name__ == '__main__':
    t1 = time.time()
    quishi = QuiShi()
    quishi.run()
    print(time.time() - t1)
Source: https://www.cnblogs.com/dreamer-zhang/p/11905889.html