#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import re
import time

import lxml.html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from redis_cache import RedisCache


class LagouSpider(object):
    def __init__(self):
        # Launch a Chrome browser via webdriver.Chrome()
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.detail_url = None

    def run(self):
        # Open the listing URL in the browser
        self.driver.get(self.url)
        while True:
            # Grab the current page source
            source = self.driver.page_source
            # Wait for the page to load; proceed once the pager has rendered
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="pager_container"]/span[last()]'))
            )
            # Hand the source to parse_list_page for parsing
            self.parse_list_page(source)
            try:
                # find_element_by_xpath was removed in Selenium 4;
                # the generic find_element(By.XPATH, ...) works in 3.x and 4.x
                next_btn = self.driver.find_element(
                    By.XPATH, '//div[@class="pager_container"]/span[last()]')
                # Stop when the "next page" button is disabled (last page reached)
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except Exception:
                print(source)
            time.sleep(1)

    def parse_list_page(self, source):
        """Parse the listing page.

        :param source: page source of the listing page
        :return:
        """
        html = lxml.html.fromstring(source)
        # Collect the detail-page links
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.detail_url = link
            self.requests_detail_page(link)
            time.sleep(1)

    def requests_detail_page(self, url):
        """Request a job detail page.

        :param url: detail-page URL
        :return:
        """
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="job-name"]//span[@class="name"]'))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the tab and return to the listing page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse a detail page."""
        html = lxml.html.fromstring(source)
        job_name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
        job_salary = html.xpath('//dd[@class="job_request"]/p//span[1]/text()')[0]
        job_city = html.xpath('//dd[@class="job_request"]/p//span[2]/text()')[0]
        job_city = re.sub(r"[\s/]", "", job_city)
        experience = html.xpath('//dd[@class="job_request"]/p//span[3]/text()')[0].strip()
        experience = re.sub(r"[\s/]", "", experience)
        education = html.xpath('//dd[@class="job_request"]/p//span[4]/text()')[0]
        education = re.sub(r"[\s/]", "", education)
        job_time = html.xpath('//dd[@class="job_request"]/p//span[5]/text()')[0]
        job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')[0]
        desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
        job_address = "".join(html.xpath('//div[@class="work_addr"]//text()'))
        # Strip whitespace/slashes, then drop the trailing 4 characters
        # (the "查看地图" map-link text appended to the address)
        job_address = re.sub(r"[\s/]", "", job_address)[0:-4]
        position = {
            'job_name': job_name,
            'job_salary': job_salary,
            'job_city': job_city,
            'experience': experience,
            'education': education,
            'job_advantage': job_advantage,
            'desc': desc,
            'job_address': job_address,
            'job_time': job_time,
        }
        # Cache the record in Redis, keyed by the detail-page URL
        rc = RedisCache()
        rc[self.detail_url] = position
        position_print = json.loads(rc[self.detail_url])
        print(self.detail_url)
        print(position_print)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
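The redis_cache module imported at the top is not included in the post. Judging from how it is used (dict-style assignment of a plain dict, then json.loads() on the value read back), a minimal sketch might look like the following. Only the class name RedisCache comes from the source; the redis-py backend, the connection parameters, and the JSON serialization are assumptions, and a local Redis server must be running for it to work.

# redis_cache.py -- hypothetical reconstruction of the helper imported above
import json

import redis


class RedisCache(object):
    """Dict-style JSON cache on top of redis-py (assumed implementation)."""

    def __init__(self, host='localhost', port=6379, db=0):
        # decode_responses=True makes get() return str rather than bytes,
        # so the spider's json.loads() call can consume it directly
        self.client = redis.StrictRedis(host=host, port=port, db=db,
                                        decode_responses=True)

    def __setitem__(self, key, value):
        # Serialize the position dict to JSON before storing
        self.client.set(key, json.dumps(value, ensure_ascii=False))

    def __getitem__(self, key):
        # Return the raw JSON string (or None if the key is absent);
        # the spider decodes it with json.loads()
        return self.client.get(key)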
Source: https://www.cnblogs.com/wenjiangtao/p/10963633.html