1. Create a Scrapy project
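Judging from the imports used later in this post, the project is presumably named youyuan and the spider youyuancom, so the commands would look something like:

scrapy startproject youyuan
cd youyuan
scrapy genspider -t crawl youyuancom youyuan.com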
2. Install scrapy-redis
pip install scrapy-redis
3. Configure settings.py
3.1 Add ITEM_PIPELINES
ITEM_PIPELINES = {
    # scrapy-redis pipeline
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
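This pipeline serializes every yielded item into a Redis list; by default the key is "<spider name>:items" (here youyuancom:items), which is what the MySQL save script at the end of this post reads from. A quick way to check that items are arriving (a sketch, assuming a local Redis):

import redis

# Count how many items the RedisPipeline has pushed so far.
rediscli = redis.Redis(host="127.0.0.1", port=6379, db=0)
print(rediscli.llen("youyuancom:items"))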
3.2 Add the scrapy-redis settings
""" scrapy-redis配置 """
# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 调度器启用Redis存储Requests队列
#SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 确保所有的爬虫实例使用Redis进行重复过滤
#DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 将Requests队列持久化到Redis,可支持暂停或重启爬虫
#SCHEDULER_PERSIST = True
# Requests的调度策略,默认优先级队列
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
3.3 Add the Redis connection settings
# Redis host and port
REDIS_HOST = 'ip'
REDIS_PORT = port
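For example, with a Redis instance running locally on the default port (the values below are just an illustration):

REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379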
4. Modify the spider file
4.1 Replace the CrawlSpider parent class with RedisCrawlSpider
4.2 Set redis_key, the key through which Redis pushes crawl tasks to the spider
4.3 Set the allowed domains dynamically, or use a static allowed_domains; choose one of the two
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from youyuan.items import YouyuanItem
from scrapy_redis.spiders import RedisCrawlSpider


#class YouyuancomSpider(CrawlSpider):
class YouyuancomSpider(RedisCrawlSpider):
    name = 'youyuancom'
    allowed_domains = ['youyuan.com']
    # For distributed scrapy_redis crawling, redis_key replaces start_urls
    #start_urls = ['http://www.youyuan.com/find/zhejiang/mm18-0/advance-0-0-0-0-0-0-0/p1/']
    redis_key = "YouyuancomSpider:start_urls"

    rules = (
        Rule(LinkExtractor(allow=r'youyuan.com/find/zhejiang/mm18-0/p\d+/')),
        Rule(LinkExtractor(allow=r'/\d+-profile/'), callback='parse_personitem', follow=True),
    )

    # scrapy-redis dynamic allowed domains
    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(YouyuancomSpider, self).__init__(*args, **kwargs)

    def parse_personitem(self, response):
        item = YouyuanItem()
        item["username"] = response.xpath("//div[@class='con']/dl[@class='personal_cen']/dd/div/strong/text()").extract()
        item["introduce"] = response.xpath("//div[@class='con']/dl[@class='personal_cen']/dd/p/text()").extract()
        item["imgsrc"] = response.xpath("//div[@class='con']/dl[@class='personal_cen']/dt/img/@src").extract()
        item["persontag"] = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
        item["sourceUrl"] = response.url
        yield item

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
5. Go into the spider project directory and run scrapy runspider xx.py
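For this project that would look something like the following (the file name youyuancom.py is assumed to match the spider name; the optional -a domain argument feeds the dynamic allowed_domains handling in __init__ above):

cd youyuan/youyuan/spiders
scrapy runspider youyuancom.py -a domain=youyuan.com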
6. Push the crawl task from the Redis side
lpush redis_key url
redis_key is the value set in the spider file
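With the redis_key defined in the spider above, the redis-cli command would be lpush YouyuancomSpider:start_urls followed by the start URL. The same push can also be done from Python with redis-py (a minimal sketch, assuming Redis runs locally on the default port):

import redis

# Connect to the same Redis instance configured in settings.py (assumed local here).
rediscli = redis.Redis(host="127.0.0.1", port=6379, db=0)
# Push a start URL onto the list named by the spider's redis_key.
rediscli.lpush("YouyuancomSpider:start_urls",
               "http://www.youyuan.com/find/zhejiang/mm18-0/advance-0-0-0-0-0-0-0/p1/")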
Notes:
1. In scrapy runspider xx.py, xx is your spider file, i.e. the same xx you used in scrapy genspider xx.
2. If you created the project directly in PyCharm and import items.py with from ..items import xxItems,
you will get the error "attempted relative import with no known parent package", which roughly means the parent package cannot be found.
Solutions:
1. Select your spider sub-project -> right-click Mark Directory as -> choose Sources Root
2. Edit your spider file and change from ..items import xxItems to from <project name>.items import xxItems; the IDE will not autocomplete this, so you have to type it in manually.
Then run the runspider command again.
Output like the screenshot means the spider is up and waiting to receive tasks.
3. If the scrapy-redis log shows
DEBUG: Filtered offsite request to xxx
you need to add the following to settings.py:
SPIDER_MIDDLEWARES = {
    'youyuan.middlewares.YouyuanSpiderMiddleware': 543,
    # Disable the offsite middleware by setting it to None.
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
}
===================== Divider =====================
Saving from Redis into MySQL
Install the MySQL driver; for Python 3.7 that is pymysql: pip install pymysql
It is fairly simple, so here is the code directly:
import redis
import json
from pymysql import *


def process_item():
    # Connect to the Redis database
    rediscli = redis.Redis(host="", port=6379, db=0)
    mysqlcli = connect(host='127.0.0.1', port=3306, user='root', password='root', database='test', charset='utf8')
    offset = 0
    while True:
        # Pop the data out of Redis (blocks until an item is available)
        source, data = rediscli.blpop("youyuancom:items")
        # Create a MySQL cursor object to execute SQL statements
        cursor = mysqlcli.cursor()
        sql = "insert into scrapyredis_youyuan(username,persontag,imgsrc,url) values(%s,%s,%s,%s)"
        jsonitem = json.loads(data)
        params = [jsonitem["username"], jsonitem["persontag"], jsonitem["imgsrc"], jsonitem["sourceUrl"]]
        # Parameterized query
        result = cursor.execute(sql, params)
        mysqlcli.commit()
        cursor.close()
        offset += 1
        print("Saved to database: " + str(offset))


if __name__ == "__main__":
    process_item()
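The script assumes the target table already exists. A possible table definition matching the columns in the INSERT above (the column types and lengths are assumptions), created here with pymysql:

from pymysql import connect

# Minimal sketch: create the target table if it does not exist yet.
# Column names match the INSERT statement above; types and lengths are assumptions.
conn = connect(host='127.0.0.1', port=3306, user='root', password='root',
               database='test', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS scrapyredis_youyuan (
            id INT AUTO_INCREMENT PRIMARY KEY,
            username VARCHAR(255),
            persontag TEXT,
            imgsrc TEXT,
            url VARCHAR(512)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()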