问题
I am having trouble getting my items displayed the way I want. My code is as follows:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import request
from scrapy.selector import HtmlXPathSelector
from texashealth.items import TexashealthItem
# CrawlSpider for jobs.texashealth.org: starts from the job-search page, follows
# links matching "search/", and passes each fetched page to parse_health.
# NOTE(review): indentation was lost in this listing; the class-body and
# method-body nesting must be restored before the code can run.
class texashealthspider(CrawlSpider):
name="texashealth"
allowed_domains=['jobs.texashealth.org']
start_urls=['http://jobs.texashealth.org/search/?&q=&title=Filter%3A%20title&facility=Filter%3A%20facility&location=Filter%3A%20city&date=Filter%3A%20date']
# Single crawl rule: extract links whose URL matches "search/", call
# parse_health on each response, and keep following links from those pages.
rules=(
Rule(SgmlLinkExtractor(allow=("search/",)), callback="parse_health", follow=True),
#Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
)
# Callback for the rule above: builds TexashealthItem objects from the
# response and returns them as a list.
def parse_health(self, response):
hxs=HtmlXPathSelector(response)
# NOTE: this selects every <td> cell under any <tbody>/<tr>, not the rows,
# so the loop below creates one item per table cell.
titles=hxs.select('//tbody/tr/td')
items = []
# The loop variable reuses the name `titles`, rebinding it from the list of
# cells to the current cell on each iteration.
for titles in titles:
item=TexashealthItem()
# Each XPath is relative to the current cell and only matches a <span> that
# is a direct child of it; fields whose span lives in a different cell come
# back as empty lists.
item['title']=titles.select('span[@class="jobTitle"]/a/text()').extract()
item['link']=titles.select('span[@class="jobTitle"]/a/@href').extract()
item['shifttype']=titles.select('span[@class="jobShiftType"]/text()').extract()
item['location']=titles.select('span[@class="jobLocation"]/text()').extract()
items.append(item)
# Python 2 print statement used for debugging. Whether it sits inside or
# after the loop cannot be told from this listing.
print items
return items
The output that is displayed (the printed list of items) looks as follows:
[
TexashealthItem(location=[], link=[u'/job/Fort-Worth-ULTRASONOGRAPHER-II-Job-TX-76101/31553900/'], shifttype=[], title=[u'ULTRASONOGRAPHER II Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Fort Worth'], title=[]),
TexashealthItem(location=[u'Fort Worth, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[u'/job/Kaufman-RN-Acute-ICU-Full-Time-Kaufman-Job-TX-75142/35466900/'], shifttype=[], title=[u'RN--Telemetry--Full Time--Kaufman Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Kaufman'], title=[]),
TexashealthItem(location=[u'Kaufman, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[u'/job/Fort-Worth-NURSE-PRACTITIONER-Occ-Med-Full-Time-Alliance-Job-TX-76101/35465400/'], shifttype=[], title=[u'NURSE PRACTITIONER-Occ Med-Full Time-Alliance Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Alliance'], title=[]),
TexashealthItem(location=[u'Fort Worth, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[])
]
As you can see above, the fields of each job are spread across separate items: the title and link appear in one item, while the shift type and location each appear in other items, followed by an empty one.
Is there a way to get all the fields of a job together in a single item?
Thank you for your help
回答1:
You should loop over the table rows (tr elements), not the table cells (td elements).
I suggest you select the rows with hxs.select('//table[@id="searchresults"]/tbody/tr') and then use relative .//span... XPaths inside each loop iteration, so that every field is looked up within the current row:
def parse_health(self, response):
    """Build one TexashealthItem per row of the search-results table.

    Iterating over the ``tr`` rows (instead of the individual ``td`` cells)
    keeps the title, link, shift type and location of a job together in a
    single item.

    :param response: the page handed to this callback by the crawl rule.
    :returns: a list of TexashealthItem objects, one per selected row.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//table[@id="searchresults"]/tbody/tr')
    items = []
    for row in rows:
        # Create a fresh item for every row; without this, `item` is
        # undefined here (or one object would be shared by all rows).
        item = TexashealthItem()
        # The leading `.//` makes each XPath relative to the current row and
        # lets it match spans nested at any depth inside the row's cells.
        item['title'] = row.select('.//span[@class="jobTitle"]/a/text()').extract()
        item['link'] = row.select('.//span[@class="jobTitle"]/a/@href').extract()
        item['shifttype'] = row.select('.//span[@class="jobShiftType"]/text()').extract()
        item['location'] = row.select('.//span[@class="jobLocation"]/text()').extract()
        items.append(item)
    return items
来源:https://stackoverflow.com/questions/21582340/scrapy-output-issue