问题
I am trying to scrape a website that contains many tables. When I use the two `for` loops below, the spider requests every year and every month as it should, but the scraped rows come back mixed together across different months and years instead of arriving in the order defined by the loops. Any idea how to solve this problem?
import scrapy
from ..items import RenItem
from scrapy.utils.response import open_in_browser
from scrapy.http import FormRequest
class ScrapeTableSpider(scrapy.Spider):
    """Scrape the monthly statistics tables for a range of years and months.

    The crawl is a three-step chain of form submissions:

    1. ``parse`` submits the page's form once per year.
    2. ``parse2`` submits the resulting form once per month.
    3. ``start_scraping`` reads the ``grid_row`` table rows of each result
       page and yields one ``RenItem`` per row.

    Scrapy downloads requests concurrently, so result pages reach
    ``start_scraping`` in arbitrary order, not in loop order. The year and
    month of every request are therefore carried along in ``Request.meta``
    so each response can be attributed to the period it belongs to.
    """

    name = 'scrape-table'
    # allowed_domains must contain bare domain names; Scrapy warns about and
    # ignores entries that are full URLs.
    allowed_domains = ['www.centrodeinformacao.ren.pt']
    start_urls = ['https://www.centrodeinformacao.ren.pt/PT/InformacaoExploracao/Paginas/EstatisticaMensal.aspx']

    # Common prefix of the form control names used on the page.
    _FORM_PREFIX = 'ctl00$m$g_9b99ffea_e036_46c7_9be7_88c49a7820ac$'
    # Periods to request (end values are exclusive).
    _YEARS = range(2007, 2009)
    _MONTHS = range(1, 4)
    # Item field names in table-column order: field k is read from td[k].
    _FIELDS = (
        'renmensal',
        'mes1',
        'acum1',
        'mes2',
        'acum2',
        'mes_variacao',
        'acum_variacao',
    )

    def parse(self, response):
        """Submit the form once per year, selecting that year in the dropdown.

        Yields one ``FormRequest`` per year; each is handled by ``parse2``.
        """
        for year in self._YEARS:
            yield FormRequest.from_response(
                response,
                formdata={self._FORM_PREFIX + 'ddlAnos': str(year)},
                callback=self.parse2,
                # Every request targets the same URL, so the duplicate
                # filter must be bypassed.
                dont_filter=True,
                meta={'year': year},
            )

    def parse2(self, response):
        """Submit the year-specific form once per month.

        Yields one ``FormRequest`` per month; each is handled by
        ``start_scraping``. The year received from ``parse`` is forwarded
        together with the month.
        """
        year = response.meta.get('year')
        for month in self._MONTHS:
            yield FormRequest.from_response(
                response,
                formdata={
                    self._FORM_PREFIX + 'ddlMeses': str(month),
                    # NOTE(review): the control name 'cmdCxecutar' is kept
                    # exactly as written - confirm it against the page's form.
                    self._FORM_PREFIX + 'cmdCxecutar': 'Executar',
                },
                callback=self.start_scraping,
                dont_filter=True,
                meta={'year': year, 'month': month},
            )

    def start_scraping(self, response):
        """Yield one ``RenItem`` per ``grid_row`` table row of a result page.

        Each field holds the list of text nodes found in the matching table
        cell, or a single space string when the cell has no text.
        """
        # Debugging aid: opens the downloaded page in a local web browser.
        open_in_browser(response)

        # Responses arrive out of order; record which period this one is for.
        self.logger.info(
            'Scraping table for year=%s month=%s',
            response.meta.get('year'),
            response.meta.get('month'),
        )

        for row in response.xpath('//tr[@class="grid_row"]'):
            # Build a fresh item per row: re-yielding one shared mutable item
            # lets later rows overwrite data still queued in the pipelines.
            item = RenItem()
            for column, field in enumerate(self._FIELDS, start=1):
                texts = row.xpath('td[%d]//text()' % column).extract()
                item[field] = texts or ' '
            yield item
来源:https://stackoverflow.com/questions/64282670/scrapy-can%c2%b4t-scrape-multiple-tables-at-once