Executing Javascript Submit form functions using scrapy in python

前端 未结 3 1326
栀梦
栀梦 2020-12-02 07:13

I am scraping a site using the Scrapy framework and am having trouble clicking on a JavaScript link that opens another page.

I can identify the code on the page as:

3条回答
  •  孤街浪徒
    2020-12-02 07:29

    Check out the snippet below on how to use Scrapy with Selenium. Crawling will be slower, as you aren't just downloading the HTML, but you will get full access to the DOM.

    Note: I have copy-pasted this snippet as the links previously provided no longer work.

    # Snippet imported from snippets.scrapy.org (which no longer works)
    
    import time

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from scrapy.item import Item

    from selenium import selenium
    
    class SeleniumSpider(CrawlSpider):
        """Crawl ``.html`` links with Scrapy and re-open each page in Selenium.

        Scrapy downloads and follows the links; every page matched by ``rules``
        is then opened a second time in a Selenium-driven browser so that
        content created by JavaScript can be read from the live DOM.

        A Selenium session is started when the spider is constructed and
        stopped when the spider object is finalized.
        """

        name = "SeleniumSpider"
        start_urls = ["http://www.domain.com"]

        rules = (
            # Raw string: the pattern is a regex, and '\.' is not a valid
            # string escape (it warns on modern Python versions).
            Rule(SgmlLinkExtractor(allow=(r'\.html', )),
                 callback='parse_page', follow=True),
        )

        def __init__(self, *args, **kwargs):
            """Initialise the crawler and start the Selenium session.

            Positional and keyword arguments are forwarded unchanged to
            ``CrawlSpider.__init__`` so spider arguments supplied by the
            caller are not rejected.
            """
            CrawlSpider.__init__(self, *args, **kwargs)
            self.verificationErrors = []
            # Selenium RC client; arguments are presumably host, port, browser
            # launcher and base URL -- confirm against the Selenium RC docs.
            self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
            self.selenium.start()

        def __del__(self):
            """Stop the Selenium session and print any collected errors."""
            # __init__ may have failed before the session was created, in which
            # case there is nothing to stop.
            session = getattr(self, "selenium", None)
            if session is not None:
                session.stop()
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print(getattr(self, "verificationErrors", []))
            # Only chain to a base-class finalizer if one actually exists;
            # calling a missing attribute would raise AttributeError here.
            parent_del = getattr(CrawlSpider, "__del__", None)
            if parent_del is not None:
                parent_del(self)

        def parse_page(self, response):
            """Handle one crawled page with both Scrapy and Selenium.

            Args:
                response: the Scrapy response for a page matched by ``rules``.

            Yields:
                A single, empty ``Item`` per page (placeholder for real fields).
            """
            item = Item()

            hxs = HtmlXPathSelector(response)
            # Do some XPath selection with Scrapy (result unused in this example)
            hxs.select('//div').extract()

            sel = self.selenium
            sel.open(response.url)

            # Wait for JavaScript to load in Selenium (fixed delay)
            time.sleep(2.5)

            # Do some crawling of JavaScript-created content with Selenium
            # (result unused in this example)
            sel.get_text("//div")
            yield item
    

提交回复
热议问题