一、文件保存
1.分类保存
def process_item(self, item, spider):
    """Append one chapter to its book's text file, grouped by category.

    The chapter is appended to ``novel1/<category>/<article_name>.txt``
    (relative to the current working directory) as the chapter title
    followed by the chapter body, each terminated by a newline.

    Args:
        item: Mapping providing the keys ``category``, ``article_name``,
            ``content_name`` and ``content`` (all text).
        spider: The spider that produced the item (unused).

    Returns:
        The unchanged ``item``.
    """
    category = "novel1/" + item['category']
    # makedirs also creates the "novel1" parent directory, which a plain
    # mkdir cannot do on the very first run.
    if not os.path.isdir(category):
        os.makedirs(category)
    fname = category + "/" + item['article_name'] + '.txt'
    # The handle is closed as soon as the chapter is written, so a long
    # crawl does not accumulate one open file descriptor per item.
    with codecs.open(fname, 'a', 'utf-8') as out:
        out.write(item['content_name'] + '\n')
        out.write(item['content'] + '\n')
    return item
2.直接保存
def __init__(self):
    """Open ``face.json`` in the working directory, truncating old output."""
    self.filename = codecs.open('face.json', 'wb+', 'utf-8')


def process_item(self, item, spider):
    """Serialise ``item`` as indented JSON and append it to the output file.

    Each record is written with sorted keys and non-ASCII characters kept
    as-is, followed by a comma and a newline.

    Args:
        item: Any object accepted by ``dict()``.
        spider: The spider that produced the item (unused).

    Returns:
        The unchanged ``item``.
    """
    line = json.dumps(dict(item), ensure_ascii=False, sort_keys=True, indent=4) + ",\n"
    self.filename.write(line)
    return item


def close_spider(self, spider):
    """Close the output file when the spider finishes.

    ``close_spider`` is the hook Scrapy invokes on item pipelines; without
    it the file is never closed unless a signal handler is wired manually.
    """
    self.filename.close()


def spider_closed(self, spider):
    """Close the output file; kept for code that calls the old hook name.

    Closing an already-closed file is a no-op, so it is safe for both
    hooks to run.
    """
    self.filename.close()
3.图片下载(在 settings.py 文件中设置保存路径:IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images'))
class Img699PicPipeline(object):
    """Pass-through pipeline that forwards every item unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` untouched; ``spider`` is unused."""
        return item
4
5
class Images699Pipeline(ImagesPipeline):
    """Image pipeline that files each download under its item's category."""

    def get_media_requests(self, item, info):
        """Build the download requests for ``item`` and tag each with it.

        Called before the image downloads are sent; the parent class
        creates the actual requests. The item is attached to every request
        so that ``file_path`` can look up the category later.

        Returns:
            A list of the requests produced by the parent class.
        """
        # Materialise the result so the requests can be tagged here and
        # still be iterated by the caller, even if the parent ever returns
        # a one-shot iterator.
        request_objs = list(
            super(Images699Pipeline, self).get_media_requests(item, info)
        )
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path of an image, relative to IMAGES_STORE.

        Called when an image is about to be stored. The default path
        ``full/<name>`` from the parent class is rewritten to
        ``<category>/<name>``.

        Args:
            request: The download request; carries ``item`` as set by
                ``get_media_requests``.
            response: Forwarded to the parent implementation.
            info: Forwarded to the parent implementation.
            item: The item being processed, when the framework passes it
                (newer Scrapy releases do); otherwise ``request.item`` is
                used.

        Returns:
            ``<category>/<name>``, or the parent's default path when the
            item has no category.
        """
        path = super(Images699Pipeline, self).file_path(request, response, info)
        source = item if item is not None else getattr(request, 'item', None)
        category = source.get('category') if source is not None else None
        if not category:
            # Without a category there is no folder to sort into; keep the
            # default location instead of failing on a None path component.
            return path
        image_name = path.replace("full/", "")
        # The path must stay relative: the file store joins it onto
        # IMAGES_STORE and creates missing directories itself. Returning a
        # path that already includes IMAGES_STORE would nest the whole
        # directory tree a second time on POSIX systems.
        return "%s/%s" % (category, image_name)
二、内容去重
class DuplicatesPipeline(object):
    """Drop items that contain a material id seen in an earlier item."""

    def __init__(self):
        # Ids of all materials belonging to items that were let through.
        self.face_set = set()

    def process_item(self, item, spider):
        """Pass ``item`` through unless one of its materials was seen before.

        Args:
            item: Mapping whose ``materials`` entry is an iterable of
                mappings, each with an ``id`` key.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item`` when none of its material ids is known.

        Raises:
            DropItem: If any material id belongs to a previously accepted
                item.
        """
        material_ids = [material['id'] for material in item['materials']]
        # Check every id before recording any of them: a dropped item must
        # not leave its new ids behind, or later items that share those
        # ids would be rejected even though nothing was ever emitted.
        if any(material_id in self.face_set for material_id in material_ids):
            raise DropItem("Duplicate book found:%s" % item)
        self.face_set.update(material_ids)
        return item
来源:https://www.cnblogs.com/ShadowXie/p/9699888.html