I am scraping listings with Scrapy. My script first parses the listing URLs with parse_node, then parses each listing with parse_listing. For each listing I fetch the agent pages with the requests library, create a dict with an agents list, and append the data scraped from those pages to that list:
import requests
from lxml import html

listing = {"title": "amazing listing", "agents": []}
agentUrls = ["list", "of", "urls", "from", "scraped", "page"]

for agentUrl in agentUrls:
    agentPage = requests.get(agentUrl)
    agentTree = html.fromstring(agentPage.content)
    # lxml's xpath() returns a plain list, so take the first match if any
    names = agentTree.xpath('//node[@id="AgentName"]/text()')
    emails = agentTree.xpath('//node[@id="AgentEmail"]/text()')
    agent = {"name": names[0] if names else None,
             "email": emails[0] if emails else None}
    listing["agents"].append(agent)

How can I build one Scrapy item that combines data from the listing page and from each of its agent pages, instead of making blocking requests calls?
This is a somewhat complicated issue: you need to assemble a single item from multiple different URLs. Scrapy lets you carry data from one request to the next through the request's meta attribute: whatever you put into a Request's meta dict comes back on response.meta in that request's callback.
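To see the mechanism in isolation, here is a minimal sketch (the method names, URL, and fields are placeholders, not part of your spider):

from scrapy import Request

def parse(self, response):
    item = {'listing_url': response.url}
    # hand the partially built item over to the next callback
    yield Request('https://example.com/detail', callback=self.parse_detail,
                  meta={'item': item})

def parse_detail(self, response):
    item = response.meta['item']  # the same dict, carried over from parse()
    item['detail'] = response.xpath('//title/text()').extract_first()
    yield item

Applied to your listing/agent flow, the whole chain looks like this: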
from collections import defaultdict
from scrapy import Request

def parse_node(self, response, node):
    yield Request('LISTING LINK', callback=self.parse_listing)

def parse_listing(self, response):
    item = defaultdict(list)
    item['id'] = response.xpath('//node[@id="ListingId"]/text()').extract_first()
    item['title'] = response.xpath('//node[@id="ListingTitle"]/text()').extract_first()
    # the agent urls are stored in one '^'-separated string
    agent_urls = (response.xpath('//node[@id="Agents"]/text()').extract_first() or "").split('^')
    # find all agent urls and start with the first one
    url = agent_urls.pop(0)
    # we want to go through the agent urls one by one and update a single item with agent data
    yield Request(url, callback=self.parse_agent,
                  meta={'item': item, 'agent_urls': agent_urls})
def parse_agent(self, response):
    item = response.meta['item']  # retrieve the item built up in previous requests
    agent = dict()
    agent['name'] = response.xpath('//node[@id="AgentName"]/text()').extract_first()
    agent['email'] = response.xpath('//node[@id="AgentEmail"]/text()').extract_first()
    item['agents'].append(agent)
    # check whether we have any agent urls left
    agent_urls = response.meta['agent_urls']
    if not agent_urls:  # we crawled all of the agents!
        yield item  # a bare `return item` inside a generator would never reach Scrapy
        return
    # if we do - crawl the next agent and carry over our current item
    url = agent_urls.pop(0)
    yield Request(url, callback=self.parse_agent,
                  meta={'item': item, 'agent_urls': agent_urls})
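Chaining the agent requests one by one like this keeps a single item alive across callbacks; if you yielded all agent requests at once they would run concurrently, and you would need extra bookkeeping to know when the last agent page had arrived before yielding the finished item. As a side note, if you are on Scrapy 1.7 or newer, Request.cb_kwargs is the recommended replacement for passing data through meta; the same hand-off would look roughly like this:

yield Request(url, callback=self.parse_agent,
              cb_kwargs={'item': item, 'agent_urls': agent_urls})

def parse_agent(self, response, item, agent_urls):
    # cb_kwargs entries arrive as real keyword arguments
    ...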