scrapy items are not JSON serializable while storing them to couchdb

问题

items.py classes

import scrapy
from scrapy.item import Item, Field
import json


class Attributes(scrapy.Item):
    description = Field()
    pages=Field()
    author=Field()
class Vendor(scrapy.Item):
    title=Field()
    order_url=Field()

class bookItem(scrapy.Item):

    title = Field()
    url = Field()
    marketprice=Field()
    images=Field()
    price=Field()
    attributes=Field()
    vendor=Field()
    time_scraped=Field()

my scrapper

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapper.items import bookItem,Attributes,Vendor
import couchdb
import logging
import json
import time
from couchdb import Server


class libertySpider(CrawlSpider):
   

    couch = couchdb.Server()
    db = couch['python-tests']
    name = "libertybooks"
    allowed_domains = ["libertybooks.com"]
    unvisited_urls = []
    visited_urls = []
    start_urls = [
        "http://www.libertybooks.com"
    ]
    url=["http://www.kaymu.pk"]
    rules = [Rule(SgmlLinkExtractor(),  callback='parse_item', follow=True)]
    
    total=0
    productpages=0
    exceptionnum=0



    def parse_item(self,response):
        if response.url.find("pid")!=-1:
            with open("number.html","w") as w:
                self.total=self.total+1
                w.write(str(self.total)+","+str(self.productpages))
            itm=bookItem()
            attrib=Attributes()
            ven=Vendor()
            images=[]
            try:
                name=response.xpath('//span[@id="pagecontent_lblbookName"]/text()').extract()[0]
                name=name.encode('utf-8')
                
            except:
                name="name not found"
            try:
                price=response.xpath('//span[@id="pagecontent_lblPrice"]/text()').extract()[0]
                price=price.encode('utf-8')
            except:
                price=-1
            try:
                marketprice=response.xpath('//span[@id="pagecontent_lblmarketprice"]/text()').extract()[0]
                marketprice=marketprice.encode('utf-8')
            except:
                marketprice=-1
            try:
                pages=response.xpath('//span[@id="pagecontent_spanpages"]/text()').extract()[0]
                pages=pages.encode('utf-8')
            except:
                pages=-1
            try:
                author=response.xpath('//span[@id="pagecontent_lblAuthor"]/text()').extract()[0]
                author=author.encode('utf-8')
            except:
                author="author not found"
            try:
                description=response.xpath('//span[@id="pagecontent_lblbookdetail"]/text()').extract()[0]
                description=description.encode('utf-8')
            except:
                description="des: not found"
            try:
                image=response.xpath('//img[@id="pagecontent_imgProduct"]/@src').extract()[0]
                image=image.encode('utf-8')
            except:
                image="#"


            ven['title']='libertybooks'
            ven['order_url']=response.url
            itm['vendor']=ven
           
            itm['time_scraped']=time.ctime()
            



            itm['title']=name
            itm['url']=response.url




            itm['price']=price
            itm['marketprice']=marketprice
            itm['images']=images

            attrib['pages']=pages
            attrib['author']=author
            attrib['description']=description
            itm['attributes']=attrib
 
            self.saveindb(itm)
            return itm

    def saveindb(self,obj):
        logging.debug(obj)
        self.db.save(obj)

Stack trace

2014-12-09 13:57:37-0800 [libertybooks] ERROR: Spider error processing <GET http://www.libertybooks.com/bookdetail.aspx?pid=16532>
    Traceback (most recent call last):
      File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
        taskObj._oneWorkUnit()
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
        result = next(self._iterator)
      File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
        for x in result:
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
        cb_res = callback(response, **cb_kwargs) or ()
      File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 107, in parse_item
        self.saveindb(itm)
      File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 112, in saveindb
        self.db.save(obj)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/client.py", line 431, in save
        _, _, data = func(body=doc, **options)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 514, in post_json
        **params)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 533, in _request_json
        headers=headers, **params)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 529, in _request
        credentials=self.credentials)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 244, in request
        body = json.encode(body).encode('utf-8')
      File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 69, in encode
        return _encode(obj)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 135, in <lambda>
        dumps(obj, allow_nan=False, ensure_ascii=False)
      File "/usr/lib/python2.7/json/__init__.py", line 250, in dumps
        sort_keys=sort_keys, **kw).encode(obj)
      File "/usr/lib/python2.7/json/encoder.py", line 207, in encode
        chunks = self.iterencode(o, _one_shot=True)
      File "/usr/lib/python2.7/json/encoder.py", line 270, in iterencode
        return _iterencode(o, 0)
      File "/usr/lib/python2.7/json/encoder.py", line 184, in default
        raise TypeError(repr(o) + " is not JSON serializable")
    exceptions.TypeError: {'attributes': {'author': 'Tina Fey',
     'description': "Once in a generation a woman comes along who changes everything. Tina Fey is not that woman, but she met that woman once and acted weird around her.\r\n\r\nBefore 30 Rock, Mean Girls and 'Sarah Palin', Tina Fey was just a young girl with a dream: a recurring stress dream that she was being chased through a local airport by her middle-school gym teacher.\r\n\r\nShe also had a dream that one day she would be a comedian on TV. She has seen both these dreams come true.\r\n\r\nAt last, Tina Fey's story can be told. From her youthful days as a vicious nerd to her tour of duty on Saturday Night Live; from her passionately halfhearted pursuit of physical beauty to her life as a mother eating things off the floor; from her one-sided college romance to her nearly fatal honeymoon - from the beginning of this paragraph to this final sentence.\r\n\r\nTina Fey reveals all, and proves what we've all suspected: you're no one until someone calls you bossy.",
     'pages': '304 Pages'},
     'images': [],
     'marketprice': '1,095',
     'price': '986',
     'time_scraped': 'Tue Dec  9 13:57:37 2014',
     'title': 'Bossypants',
     'url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532',
     'vendor': {'order_url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532',
     'title': 'libertybooks'}} is not JSON serializable

i am a beginner to the scrapy and couchdb, i have also tried to convert the item object to the json object by using "json.dumps(itm, default=lambda o: o.dict, sort_keys=True, indent=4)" but got the same response , so please tell me is there a way for me to make my class json serializable so that they can be stored in the couchdb?

回答1:

Well, the shorter answer is just to use ScrapyJSONEncoder:

from scrapy.utils.serialize import ScrapyJSONEncoder
_encoder = ScrapyJSONEncoder()

    ...

    def saveindb(self,obj):
        logging.debug(obj)
        self.db.save(_encoder.encode(obj))

The longer version is: if you intend this spider to grow (if it's not supposed to be a one-time thing), you may want to use a pipeline to store the items in CouchDB and keeping the concerns separated (crawling/scraping in spider code, storing in database in pipeline code).

This may look like over-engineering at first, but it really helps when a project starts to grow and makes testing easier.

来源：https://stackoverflow.com/questions/27389925/scrapy-items-are-not-json-serializable-while-storing-them-to-couchdb

标签

python

json

couchdb

scrapy