Question
I have created an item pipeline as an answer to this question.
It is supposed to create a new file for every page, based on the page_no value set in the item. This mostly works fine.
The problem is with the last CSV file generated by the pipeline/item exporter, page-10.csv: the last 10 items are never written, so the file stays empty.
What could be the reason for this behaviour?
pipelines.py
from scrapy.exporters import CsvItemExporter

class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field"""

    def open_spider(self, spider):
        self.filename_to_exporter = {}

    def spider_closed(self, spider):
        for exporter in self.filename_to_exporter.values():
            exporter.finish_exporting()

    def _exporter_for_item(self, item):
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            f = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(f, export_empty_fields=True)
            exporter.start_exporting()
            self.filename_to_exporter[filename] = exporter
        return self.filename_to_exporter[filename]

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item
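
Note that the pipeline hooks Scrapy invokes are open_spider, close_spider and process_item (plus from_crawler for instantiation); a method named spider_closed is not called automatically unless it is explicitly connected to the spider_closed signal. One likely consequence here is that finish_exporting() never runs and the file handles are never closed, so the rows still sitting in the write buffer of the last file never reach disk. A minimal sketch of what a working close hook could look like, assuming the pipeline is changed to keep the raw file object next to each exporter (an assumption; the original stores only the exporter):

    def _exporter_for_item(self, item):
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            f = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(f, export_empty_fields=True)
            exporter.start_exporting()
            # assumption: keep the file handle alongside the exporter so it can be closed later
            self.filename_to_exporter[filename] = (exporter, f)
        return self.filename_to_exporter[filename][0]

    def close_spider(self, spider):  # the method name Scrapy actually calls on pipelines
        for exporter, f in self.filename_to_exporter.values():
            exporter.finish_exporting()
            f.close()  # closing flushes the buffered rows of the last file to disk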
spider
import scrapy
from ..pipelines import PerFilenameExportPipeline

class spidey(scrapy.Spider):
    name = "idk"
    custom_settings = {
        'ITEM_PIPELINES': {
            PerFilenameExportPipeline: 100
        }
    }

    def start_requests(self):
        yield scrapy.Request("http://quotes.toscrape.com/", cb_kwargs={'page_no': 1})

    def parse(self, response, page_no):
        for qts in response.xpath("//*[@class=\"quote\"]"):
            yield {
                'page_no': page_no,
                'author': qts.xpath("./span[2]/small/text()").get(),
                'quote': qts.xpath("./*[@class=\"text\"]/text()").get()
            }

        next_pg = response.xpath('//li[@class="next"]/a/@href').get()
        if next_pg is not None:
            yield response.follow(next_pg, cb_kwargs={'page_no': page_no + 1})
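
For completeness, a hedged sketch of running this spider from a script with scrapy.crawler.CrawlerProcess; the import path spiders.idk is a placeholder and should point at whatever module actually defines spidey:

from scrapy.crawler import CrawlerProcess
from spiders.idk import spidey  # hypothetical module path

process = CrawlerProcess()
process.crawl(spidey)  # custom_settings on the spider wires in the pipeline
process.start()        # blocks until the crawl finishes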
Source: https://stackoverflow.com/questions/65007869/csvitemexporter-for-multiple-files-in-custom-item-pipeline-not-exporting-all-ite