问题
I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops.. It I take it out other errors occur. Not sure how to combine yield request with yield items for each
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
items = PrintersItem()
token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self,response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
url = response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action)
for items in self.postlogin2(response):
yield items
def action(self,response):
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
url = response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2)
for items in self.action(response):
yield items
def action2(self, response):
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
items['tcpip'] = tcpip
for items in self.action2(response):
yield items
回答1:
If you want to send items
from parse
to postlogin2
, etc. then add it as meta
data in Request
yield Request( ..., meta={"items": items})
and get it in other function
items = response.meta["items"]
and yield it only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35',
'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self, response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
#url=response.urljoin("/general/information.html?kind=item"),
url=response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action,
meta={"items": items})
def action(self, response):
items = response.meta["items"]
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
#url=response.urljoin("/net/wired/tcpip.html"),
url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2,
meta={"items": items})
def action2(self, response):
items = response.meta["items"]
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
items['tcpip'] = tcpip
yield items
来源:https://stackoverflow.com/questions/60385343/scrapy-yeild-items-from-multiple-requests