tip:
Rough idea: fetch proxy IPs and ports from the API (URL: http://ip.jiangxianli.com/api/proxy_ips) and store them in a list; pick an IP from the list at a random index and set a request timeout and a retry limit; when an exception is caught, remove the dead proxy from the list and switch to a new one.
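For reference, the middleware below assumes the API returns JSON whose data.data field is a list of dicts with 'ip' and 'port' keys; the field names here are taken from the code below, not verified against the live API. A minimal standalone sketch of that fetch-and-parse step:

import json
import requests

res = requests.get('http://ip.jiangxianli.com/api/proxy_ips')
# assumed shape: {"data": {"data": [{"ip": "...", "port": "...", ...}, ...]}}
ip_list = json.loads(res.content)['data']['data']
for item in ip_list[:3]:
    print(item.get('ip'), item.get('port'))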
You also need to enable the downloader middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'tianmao.middlewares.TestDownloaderMiddleware': 543,
}
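The tip also mentions capping the number of attempts; the middleware below only sets a per-request timeout, so a hard retry limit would come from Scrapy's built-in retry settings. A minimal sketch for settings.py (the values are placeholders, not from the original post):

RETRY_ENABLED = True
RETRY_TIMES = 3          # retry a failed request at most 3 times
DOWNLOAD_TIMEOUT = 5     # same timeout the middleware sets per request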
code:
from scrapy import signals
import requests
import json
import random


class TestDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Fetch the proxy list from the API
        res = requests.get('http://ip.jiangxianli.com/api/proxy_ips')
        # Store the list of proxy dicts
        self.ip_list = json.loads(res.content)['data']['data']
        # Index of the proxy currently in use
        self.random_int = 0
        print('init method is running ...')

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        print('process request is running ...')
        # Attach a random proxy to the request
        self.get_proxy(request)
        return None

    def process_response(self, request, response, spider):
        print('process_response is running ...')
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        print('exception is %s' % exception)
        if exception:
            # Drop the proxy that just failed from ip_list
            self.ip_list.pop(self.random_int)
            # Pick another proxy and reschedule the request
            request = self.get_proxy(request)
            return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def get_proxy(self, request):
        num_ip = len(self.ip_list)
        print('There are %d proxy IPs available now' % num_ip)
        # Pick a random index (randint is inclusive, so the upper bound is num_ip - 1)
        self.random_int = random.randint(0, num_ip - 1)
        print('Random index: %d' % self.random_int)
        ip_dic = self.ip_list[self.random_int]
        print('Chosen proxy: %s' % ip_dic)
        ip = ip_dic.get('ip')
        port = ip_dic.get('port')
        ip_address = 'http://' + ip + ':' + str(port)
        # Set the proxy for this request
        request.meta['proxy'] = ip_address
        # Set the maximum download time for this request
        request.meta['download_timeout'] = 5
        return request
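To sanity-check that the proxy actually ends up on the request, you can call get_proxy directly on a throwaway Request. A hypothetical snippet (the target URL is just a placeholder, not part of the original project):

from scrapy.http import Request

mw = TestDownloaderMiddleware()            # fetches the proxy list in __init__
req = mw.get_proxy(Request('https://httpbin.org/ip'))
print(req.meta['proxy'], req.meta['download_timeout'])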