#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import re
import time

import lxml.html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from redis_cache import RedisCache


class LagouSpider(object):
    def __init__(self):
        # Launch a Chrome browser via webdriver.Chrome()
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.detail_url = None

    def run(self):
        # Open the listing URL in the browser
        self.driver.get(self.url)
        while True:
            # Grab the current page source
            source = self.driver.page_source
            # Wait for the page to load; proceed once the pager has rendered
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="pager_container"]/span[last()]'))
            )
            # Hand the source to parse_list_page for parsing
            self.parse_list_page(source)
            try:
                # find_element_by_xpath was removed in Selenium 4;
                # the generic find_element(By.XPATH, ...) works in 3.x and 4.x
                next_btn = self.driver.find_element(
                    By.XPATH, '//div[@class="pager_container"]/span[last()]')
                # Stop when the "next page" button is disabled (last page reached)
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except Exception:
                print(source)
            time.sleep(1)

    def parse_list_page(self, source):
        """Parse the listing page.

        :param source: page source of the listing page
        :return:
        """
        html = lxml.html.fromstring(source)
        # Collect the detail-page links
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.detail_url = link
            self.requests_detail_page(link)
            time.sleep(1)

    def requests_detail_page(self, url):
        """Request a job detail page.

        :param url: detail-page URL
        :return:
        """
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="job-name"]//span[@class="name"]'))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the tab and return to the listing page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse a detail page."""
        html = lxml.html.fromstring(source)
        job_name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
        job_salary = html.xpath('//dd[@class="job_request"]/p//span[1]/text()')[0]
        job_city = html.xpath('//dd[@class="job_request"]/p//span[2]/text()')[0]
        job_city = re.sub(r"[\s/]", "", job_city)
        experience = html.xpath('//dd[@class="job_request"]/p//span[3]/text()')[0].strip()
        experience = re.sub(r"[\s/]", "", experience)
        education = html.xpath('//dd[@class="job_request"]/p//span[4]/text()')[0]
        education = re.sub(r"[\s/]", "", education)
        job_time = html.xpath('//dd[@class="job_request"]/p//span[5]/text()')[0]
        job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')[0]
        desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
        job_address = "".join(html.xpath('//div[@class="work_addr"]//text()'))
        # Strip whitespace/slashes, then drop the trailing 4 characters
        # (the "查看地图" map-link text appended to the address)
        job_address = re.sub(r"[\s/]", "", job_address)[0:-4]
        position = {
            'job_name': job_name,
            'job_salary': job_salary,
            'job_city': job_city,
            'experience': experience,
            'education': education,
            'job_advantage': job_advantage,
            'desc': desc,
            'job_address': job_address,
            'job_time': job_time,
        }
        # Cache the record in Redis, keyed by the detail-page URL
        rc = RedisCache()
        rc[self.detail_url] = position
        position_print = json.loads(rc[self.detail_url])
        print(self.detail_url)
        print(position_print)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
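The redis_cache module imported at the top is not included in the post. Judging from how it is used (dict-style assignment of a plain dict, then json.loads() on the value read back), a minimal sketch might look like the following. Only the class name RedisCache comes from the source; the redis-py backend, the connection parameters, and the JSON serialization are assumptions, and a local Redis server must be running for it to work.

# redis_cache.py -- hypothetical reconstruction of the helper imported above
import json

import redis


class RedisCache(object):
    """Dict-style JSON cache on top of redis-py (assumed implementation)."""

    def __init__(self, host='localhost', port=6379, db=0):
        # decode_responses=True makes get() return str rather than bytes,
        # so the spider's json.loads() call can consume it directly
        self.client = redis.StrictRedis(host=host, port=port, db=db,
                                        decode_responses=True)

    def __setitem__(self, key, value):
        # Serialize the position dict to JSON before storing
        self.client.set(key, json.dumps(value, ensure_ascii=False))

    def __getitem__(self, key):
        # Return the raw JSON string (or None if the key is absent);
        # the spider decodes it with json.loads()
        return self.client.get(key)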
Source: https://www.cnblogs.com/wenjiangtao/p/10963633.html