How to stop a Scrapy spider after a certain number of requests?

情书的邮戳 2021-02-04 20:01

I am developing a simple scraper to get 9GAG posts and their images, but due to some technical difficulties I am unable to stop the scraper and it keeps on scraping.

5 Answers
  •  不要未来只要你来
    2021-02-04 20:47

    First, use self.count, initialized outside of parse. Then don't prevent the parsing of the items; instead, stop generating new requests once the limit is reached. See the following code:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import Item, Field
    
    
    class GagItem(Item):
        entry_id = Field()
        url = Field()
        votes = Field()
        comments = Field()
        title = Field()
        img_url = Field()
    
    
    class FirstSpider(scrapy.Spider):
    
        name = "first"
        allowed_domains = ["9gag.com"]
        start_urls = ('http://www.9gag.com/', )
    
        last_gag_id = None
        COUNT_MAX = 30   # stop scheduling new requests after this many items
        count = 0        # incremented once per scraped item
    
        def parse(self, response):
    
            for article in response.xpath('//article'):
                gag_id = article.xpath('@data-entry-id').extract()
                ninegag_item = GagItem()
                ninegag_item['entry_id'] = gag_id[0]
                ninegag_item['url'] = article.xpath('@data-entry-url').extract()[0]
                ninegag_item['votes'] = article.xpath('@data-entry-votes').extract()[0]
                ninegag_item['comments'] = article.xpath('@data-entry-comments').extract()[0]
                ninegag_item['title'] = article.xpath('.//h2/a/text()').extract()[0].strip()
                ninegag_item['img_url'] = article.xpath('.//div[1]/a/img/@src').extract()
                self.last_gag_id = gag_id[0]
                self.count += 1
                yield ninegag_item
    
            # only follow the next page while we are under the limit
            if self.count < self.COUNT_MAX:
                next_url = 'http://9gag.com/?id=%s&c=10' % self.last_gag_id
                yield scrapy.Request(url=next_url, callback=self.parse)
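
    Scrapy also ships with a built-in CloseSpider extension for exactly this case, so you don't have to count by hand: setting CLOSESPIDER_PAGECOUNT (or CLOSESPIDER_ITEMCOUNT) shuts the crawler down after that many responses (or scraped items). Here is a minimal sketch of the same spider using that setting; the value 30 simply mirrors COUNT_MAX above:

    import scrapy


    class FirstSpider(scrapy.Spider):

        name = "first"
        allowed_domains = ["9gag.com"]
        start_urls = ('http://www.9gag.com/', )

        # The CloseSpider extension stops the crawl gracefully once
        # this many responses have been downloaded.
        custom_settings = {
            'CLOSESPIDER_PAGECOUNT': 30,
        }

        def parse(self, response):
            # ... same parsing and pagination logic as above ...
            pass

    You can also raise CloseSpider('your reason') from scrapy.exceptions inside a callback to stop on an arbitrary condition. Note that the shutdown is graceful: requests already scheduled or in flight are still processed, so a few extra pages may slip through after the limit is reached.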
    
