passing selenium response url to scrapy

后端 未结 2 932
悲哀的现实
悲哀的现实 2020-11-27 13:56

I am learning Python and am trying to scrape this page for a specific value on the dropdown menu. After that I need to click each item on the resulted table to retrieve the

2条回答
  •  独厮守ぢ
    2020-11-27 14:50

    Here is a middleware for Scrapy and Selenium

    from scrapy.http import HtmlResponse
    from scrapy.utils.python import to_bytes
    from selenium import webdriver
    from scrapy import signals
    
    
    class SeleniumMiddleware(object):
        """Downloader middleware that loads every request in a Selenium Firefox.

        One Firefox instance is started when the spider opens and is shared by
        all requests of that spider; it is shut down when the spider closes.
        """

        # Set in spider_opened; None means no browser is currently running.
        driver = None

        @classmethod
        def from_crawler(cls, crawler):
            """Create the middleware and subscribe it to the spider lifecycle signals.

            :param crawler: the Scrapy crawler whose signal manager is used.
            :returns: the configured middleware instance.
            """
            middleware = cls()
            crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
            crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
            return middleware

        def process_request(self, request, spider):
            """Load ``request.url`` in the browser and return the rendered page.

            :param request: the Scrapy request being processed.
            :param spider: the spider that issued the request (unused).
            :returns: an ``HtmlResponse`` built from the browser's current URL
                and page source.
            """
            request.meta['driver'] = self.driver  # to access driver from response
            self.driver.get(request.url)
            body = to_bytes(self.driver.page_source)  # body must be of type bytes
            return HtmlResponse(
                self.driver.current_url,
                body=body,
                encoding='utf-8',
                request=request,
            )

        def spider_opened(self, spider):
            """Start the Firefox instance used for the lifetime of the spider."""
            self.driver = webdriver.Firefox()

        def spider_closed(self, spider):
            """Shut the browser down, if one was started."""
            if self.driver is None:
                return
            # quit() ends the whole WebDriver session (browser and driver
            # process); close() would only close the current window.
            self.driver.quit()
            self.driver = None
    

    You also need to add the following to settings.py:

    # Enables the Selenium middleware; replace 'youproject' with your own
    # project package. 200 is the middleware's order value -- choose it
    # according to the Scrapy documentation.
    DOWNLOADER_MIDDLEWARES = {
        'youproject.middlewares.selenium.SeleniumMiddleware': 200
    }
    

    Decide whether it should be 200 or some other value based on the docs.

    Update: Firefox headless mode with Scrapy and Selenium

    If you want to run firefox in headless mode then install xvfb

    sudo apt-get install -y xvfb
    

    and PyVirtualDisplay

    sudo pip install pyvirtualdisplay
    

    and use this middleware

    from shutil import which
    
    from pyvirtualdisplay import Display
    from scrapy import signals
    from scrapy.http import HtmlResponse
    from scrapy.utils.project import get_project_settings
    from selenium import webdriver
    from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
    
    # Project settings, loaded once at import time; the middleware reads
    # FIREFOX_EXE from them when the spider opens.
    settings = get_project_settings()
    
    # When True, a virtual display is started before Firefox is launched and
    # stopped again when the spider closes.
    HEADLESS = True
    
    
    class SeleniumMiddleware(object):
        """Downloader middleware that loads requests in Firefox on a virtual display.

        When ``HEADLESS`` is true, a virtual display is started before Firefox
        is launched, so the browser can run without a real screen. The browser
        (and the display, if any) is shared by all requests of the spider and is
        torn down when the spider closes.
        """

        # Set in spider_opened; None means "not started".
        driver = None
        display = None

        @classmethod
        def from_crawler(cls, crawler):
            """Create the middleware and subscribe it to the spider lifecycle signals.

            :param crawler: the Scrapy crawler whose signal manager is used.
            :returns: the configured middleware instance.
            """
            middleware = cls()
            crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
            crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
            return middleware

        def process_request(self, request, spider):
            """Load ``request.url`` in the browser and return the rendered page.

            :param request: the Scrapy request being processed.
            :param spider: the spider that issued the request (unused).
            :returns: an ``HtmlResponse`` built from the browser's current URL
                and page source.
            """
            self.driver.get(request.url)
            request.meta['driver'] = self.driver  # lets callbacks reach the browser
            # Encode explicitly as UTF-8 to match the encoding declared below.
            body = self.driver.page_source.encode('utf-8')
            return HtmlResponse(
                self.driver.current_url,
                body=body,
                encoding='utf-8',
                request=request,
            )

        def spider_opened(self, spider):
            """Start the virtual display (if headless) and launch Firefox."""
            if HEADLESS:
                self.display = Display(visible=0, size=(1280, 1024))
                self.display.start()
            # Prefer the configured executable; fall back to whatever is on PATH.
            binary = FirefoxBinary(settings.get('FIREFOX_EXE') or which('firefox'))
            # NOTE(review): the firefox_binary= keyword may be deprecated or
            # removed in newer Selenium releases -- confirm for your version.
            self.driver = webdriver.Firefox(firefox_binary=binary)

        def spider_closed(self, spider):
            """Shut down the browser and then the virtual display, if started.

            The display is stopped even when shutting down the browser raises,
            so a failing driver does not leave the display running.
            """
            try:
                if self.driver is not None:
                    # quit() ends the whole WebDriver session (browser and
                    # driver process); close() would only close the window.
                    self.driver.quit()
                    self.driver = None
            finally:
                # A display exists only if spider_opened started one.
                if self.display is not None:
                    self.display.stop()
                    self.display = None
    

    where settings.py contains

    # Path to the Firefox executable; the middleware reads it with
    # settings.get('FIREFOX_EXE') and falls back to which('firefox') if unset.
    FIREFOX_EXE = '/path/to/firefox.exe'
    

    The problem is that some versions of Firefox don't work with Selenium. To solve this, you can download Firefox version 47.0.1 (this version works flawlessly) from here, then extract it and put it inside your Scrapy project. Afterwards, change the Firefox path to:

    # Points at the Firefox build extracted inside the project directory.
    FIREFOX_EXE = '/path/to/your/scrapyproject/firefox/firefox.exe'
    

提交回复
热议问题