Question
I am trying to collect the populations of different sovereign states from the Wikipedia list of sovereign states and append them to an array as each response comes in. In the code below, allList should end up as a list of dicts with the country name under ['nation'] and the population under ['demographics']. Many thanks.
# -*- coding: utf-8 -*-
import scrapy
import logging
import csv
import pprint


class CrawlerSpider(scrapy.Spider):
    name = 'test2Crawler'
    allowed_domains = ['web']
    start_urls = ['https://en.wikipedia.org/wiki/List_of_sovereign_states']
    urlList = []
    output = []
    fields = ["nation", "demographics"]
    filename = "C:\\second_project\\testWrite.csv"

    def __init__(self):
        self.counter = 1
        pass

    def parse(self, response):
        list = []
        item = {}
        for resultHref in response.xpath(
                '//table[contains(@class, "wikitable")]//a[preceding-sibling::span[@class="flagicon"]]'):
            hrefRaw = resultHref.xpath('./@href').extract_first()
            href = response.urljoin(hrefRaw)
            nameC = resultHref.xpath('./text()').extract_first()
            item['href'] = href
            item['nameC'] = nameC
            self.urlList.append(item.copy())
        self.runSpider()

    def parse_item(self, response):
        i = {}
        print "getting called..", self.counter
        i['nation'] = response.meta['Country']
        i['demographics'] = response.xpath(
            '//tr[preceding-sibling::tr/th/a/text()="Population"]/td/text()').extract_first()
        yield i

    def passLinks(self, givenLink):
        self.counter = self.counter + 1
        if self.counter < 10:
            href = givenLink['href']
            nameC = givenLink['nameC']
            yield scrapy.Request(href, callback=self.parse_item, meta={'Country': nameC})
        else:
            pass

    def runSpider(self):
        allList = [list(self.passLinks(token)) for token in self.urlList]
        pprint.pprint(allList)
        with open(self.filename, 'wb') as f:
            writer = csv.DictWriter(f, self.fields)
            writer.writeheader()
            for xItem in allList:
                writer.writerow({'nation': xItem['nation'], 'demographics': xItem['demographics']})
Answer 1:
This is exactly what pipelines.py is set up for in Scrapy. The problem with the posted code is that the responses do not arrive in order, or soon enough, for runSpider to collect them into a separate array; worse, passLinks only produces Request objects, and consuming them into a list never hands them back to the Scrapy engine, so they are never actually scheduled. Requests have to be yielded from a callback, and once each response is received, the yielded item passes through the pipeline, where it can be processed and stored. See the Scrapy documentation on item pipelines, a very useful tool: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
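Below is a minimal sketch of that approach, assuming Python 2 to match the question's code (hence the 'wb' file mode). The spider yields the follow-up Requests straight from parse so the engine schedules them; the pipeline class name CsvExportPipeline and the myproject module path in the settings snippet are placeholders, not names from the question.

# Spider: yield Requests directly from parse() so Scrapy schedules them;
# the dicts yielded from parse_item flow into the enabled pipeline.
import scrapy

class CrawlerSpider(scrapy.Spider):
    name = 'test2Crawler'
    start_urls = ['https://en.wikipedia.org/wiki/List_of_sovereign_states']

    def parse(self, response):
        for link in response.xpath(
                '//table[contains(@class, "wikitable")]'
                '//a[preceding-sibling::span[@class="flagicon"]]'):
            href = response.urljoin(link.xpath('./@href').extract_first())
            name = link.xpath('./text()').extract_first()
            # hand the Request back to the engine instead of calling a
            # helper method yourself
            yield scrapy.Request(href, callback=self.parse_item,
                                 meta={'Country': name})

    def parse_item(self, response):
        yield {
            'nation': response.meta['Country'],
            'demographics': response.xpath(
                '//tr[preceding-sibling::tr/th/a/text()="Population"]'
                '/td/text()').extract_first(),
        }

# pipelines.py: collect every item as it arrives, then write the CSV
# once, after all responses have been processed.
import csv

class CsvExportPipeline(object):  # hypothetical name, not from the question
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        with open('C:\\second_project\\testWrite.csv', 'wb') as f:
            writer = csv.DictWriter(f, ['nation', 'demographics'])
            writer.writeheader()
            for item in self.items:
                writer.writerow(item)

The pipeline must be enabled in settings.py, e.g.:

ITEM_PIPELINES = {
    'myproject.pipelines.CsvExportPipeline': 300,
}

Because close_spider only runs after the spider has finished, the ordering and timing problem from the original code disappears: the file is written exactly once, with every collected item.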
Source: https://stackoverflow.com/questions/43610655/python-how-to-add-response-from-scrapy-request-from-yield-into-an-array