1、如何将一个scrapy爬虫项目修改成为一个简单的分布式爬虫项目
官方文档:https://scrapy-redis.readthedocs.io/en/stable/
只用修改scrapy项目的两个文件就可以了
一个是爬虫组件文件:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
# 自定义爬虫类的继承类不再是scrapy.spiders下面的爬虫类,
# 而是scrapy-redis.spiders下面的爬虫类
class DistributedSpiderSpider(RedisSpider):
name = 'distributed_spider'
allowed_domains = ['wh.lianjia.com']
# 将start_urls替换为redis_key
# start_urls = ['https://wh.lianjia.com/ershoufang/']
redis_key = "myspider:start_urls"
def parse(self, response):
print(response.meta)
item = dict()
item["province"] = "湖北"
item["city"] = "武汉"
blocks = response.xpath("//div[@class='position']/dl[2]/dd/div[1]/div/a")
for block in blocks:
title = blocks.xpath("./@title").get()
url = "https://wh.lianjia.com" + block.xpath("./@href").get()
print(title, url, "=====================================")
yield scrapy.Request(url=url, callback=self.parse_block, meta={"item": item})
def parse_block(self, response):
print(response.meta)
url = response.url + "pg%dco32/"
index = 1
while index < 100:
new_url = url % index
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse_item, meta={"item": response.meta["item"]})
index += 1
def parse_item(self, response):
print(response.meta)
sellLinks = response.css("ul.sellListContent>li>a")
for link in sellLinks:
url = link.xpath("./@href").get()
print(url)
yield scrapy.Request(url=url, callback=self.parse_detail, meta={"item": response.meta["item"]})
def parse_detail(self, response):
item = response.meta["item"]
print(response.meta)
item["url"] = response.url
item["block"] = response.css("div.areaName>span.info").xpath("./a[1]/text()").get().strip()
item["smallBlock"] = response.css("div.areaName>span.info").xpath("./a[2]/text()").get().strip()
item["price"] = response.xpath("//span[@class='total']/text()").get().strip() + "万"
item["unitPrice"] = response.xpath("//span[@class='unitPriceValue']/text()").get().strip() + "元/平米"
print(item)
另一个是设置文件(settings.py):
在设置文件中添加几个设置项就可以了
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
'DistributedSpider.pipelines.DistributedspiderPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400
}
REDIS_HOST = "localhost" # 要连接的redis数据库的地址
REDIS_PORT = "6379" # redis数据库启动时的默认端口
来源:oschina
链接:https://my.oschina.net/u/4342785/blog/4131901