spider_project.py
from spider_project.items import SpiderProjectItem
from bs4 import BeautifulSoup
from scrapy.http import Request
import scrapy
import math
import lxml
class Pro_spider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['lianjia.com']
    base_url = 'https://hz.fang.lianjia.com/loupan/'
    # Hangzhou districts: URL slug -> Chinese name
    regions = {
        'xihu': '西湖',
        'xiacheng': '下城',
        'jianggan': '江干',
        'gongshu': '拱墅',
        'shangcheng': '上城',
        'binjiang': '滨江',
        'yuhang': '余杭',
        'xiaoshan': '萧山'
    }
    def start_requests(self):
        # request page 1 of each district; parse_page works out how many pages follow
        index = 1
        for region in self.regions:
            url = self.base_url + region + "/" + 'pg{0}'.format(index)
            yield Request(url, callback=self.parse_page, dont_filter=True,
                          meta={'region': region, 'index': index})
    def parse_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # the pager element carries the total listing count; 10 listings per page
        total_count = int(soup.find(attrs={'data-current': '1'})['data-total-count'])
        totalpage = math.ceil(total_count / 10)
        self.logger.info('region %s has %s pages', response.meta['region'], totalpage)
        for index in range(1, totalpage + 1):
            url = self.base_url + response.meta['region'] + "/" + 'pg{0}'.format(index)
            yield Request(url, callback=self.parse, dont_filter=True,
                          meta={'region': response.meta['region'], 'index': index})
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        loupans = soup.find_all('div', class_='resblock-desc-wrapper')
        self.logger.info('%s listings on %s', len(loupans), response.url)
        for loupan in loupans:
            Item = SpiderProjectItem()
            # property (loupan) name
            Item['name'] = loupan.find_all('a', class_='name')[-1].get_text()
            # detailed location, e.g. "滨江,长河,闻涛路绿城九龙仓柳岸晓风"
            location_info = loupan.find('div', class_='resblock-location').find_all(['span', 'a'])
            location = ''
            for tag in location_info:
                location += tag.get_text()
            Item['location'] = location
            # unit price per square meter
            Item['danjia'] = loupan.find('span', class_='number').string
            # total price (not extracted yet)
            # zongjia = loupan.find('div', class_='second').get_text()
            # building area
            try:
                Item['area'] = str(loupan.find('div', class_='resblock-area').get_text()).split()[-1]
            except Exception:
                Item['area'] = None
            # room layout: join the <span> texts with '/'
            spans = loupan.find('a', class_='resblock-room').find_all('span')
            room_type = ''
            for span in spans:
                room_type += span.get_text() + '/'
            Item['type'] = room_type if room_type else None
            # district slug carried over from the request meta
            Item['region'] = response.meta['region']
            yield Item
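The spider imports SpiderProjectItem from spider_project.items, which the post does not show. A minimal sketch consistent with the fields used above (the field set is inferred from the spider and the pipeline, not taken from the original items.py):

# spider_project/items.py (sketch)
import scrapy

class SpiderProjectItem(scrapy.Item):
    name = scrapy.Field()      # property name
    region = scrapy.Field()    # district slug, e.g. 'binjiang'
    location = scrapy.Field()  # detailed address text
    area = scrapy.Field()      # building area
    danjia = scrapy.Field()    # unit price per square meter
    type = scrapy.Field()      # room layout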
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
import random
import time

from fake_useragent import UserAgent
from scrapy import signals
class SpiderProjectSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class SpiderProjectDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # process_request() is left commented out here; the random User-Agent
    # logic lives in the RandomUserAgent middleware below.
    # def process_request(self, request, spider):
    #     # Called for each request that goes through the downloader middleware.
    #     # ua = UserAgent()
    #     # request.headers['User-Agent'] = ua.random
    #     # Must either:
    #     # - return None: continue processing this request
    #     # - or return a Response object
    #     # - or return a Request object
    #     # - or raise IgnoreRequest: process_exception() methods of
    #     #   installed downloader middleware will be called
    #     return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# Set a random User-Agent on every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        ua = UserAgent()
        request.headers['User-Agent'] = ua.random
# Throttle the crawl with a random per-request delay
class RandomDelayMiddleware(object):
    def __init__(self, delay_min, delay_max):
        self.delay_min = delay_min
        self.delay_max = delay_max

    @classmethod
    def from_crawler(cls, crawler):
        delay_min = crawler.settings.get("RANDOM_DELAY_MIN", 10)
        delay_max = crawler.settings.get("RANDOM_DELAY_MAX", 20)
        if not isinstance(delay_min, int):
            raise ValueError("RANDOM_DELAY_MIN needs an int")
        if not isinstance(delay_max, int):
            raise ValueError("RANDOM_DELAY_MAX needs an int")
        return cls(delay_min, delay_max)

    def process_request(self, request, spider):
        delay = random.randint(self.delay_min, self.delay_max)
        logging.info(">>> request.url is {0}, random delay: {1} s <<<".format(request.url, delay))
        time.sleep(delay)
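RandomDelayMiddleware reads RANDOM_DELAY_MIN and RANDOM_DELAY_MAX from the crawler settings, but the settings.py below only enables RandomUserAgent. A minimal sketch of the DOWNLOADER_MIDDLEWARES entry that would activate it as well (the priority value 553 is an arbitrary choice, not from the original project):

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'spider_project.middlewares.RandomUserAgent': 543,
    'spider_project.middlewares.RandomDelayMiddleware': 553,
}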
pipelines.py
# -*- coding: utf-8 -*-
import pymysql
# import time
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SpiderProjectPipeline(object):
    def open_spider(self, spider):
        self.connect = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='Xzl@K8sBASEserver!',
            database='spider_test',
            port=8635,
            charset='utf8',
            use_unicode=False
        )
        self.cursor = self.connect.cursor()
    def process_item(self, item, spider):
        insert_sql = (
            "INSERT INTO item_info(name, region, location, area, danjia, type) "
            "VALUES (%s, %s, %s, %s, %s, %s)"
        )
        params = (
            item['name'], item['region'], item['location'],
            item['area'], item['danjia'], item['type']
        )
        self.cursor.execute(insert_sql, params)
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
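The pipeline assumes an item_info table already exists in the spider_test database. A one-off setup sketch is shown below; the column names follow the INSERT above, while the column types and the placeholder password are assumptions, not from the original post:

# create_table.py (sketch, not part of the original project)
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root',
                       passwd='your-password',  # placeholder, use your own credentials
                       port=8635, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS spider_test DEFAULT CHARACTER SET utf8")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS spider_test.item_info (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            region VARCHAR(64),
            location VARCHAR(255),
            area VARCHAR(64),
            danjia VARCHAR(64),
            type VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
conn.close()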
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for spider_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'spider_project'
SPIDER_MODULES = ['spider_project.spiders']
NEWSPIDER_MODULE = 'spider_project.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spider_project (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# read by RandomDelayMiddleware in middlewares.py (when enabled)
RANDOM_DELAY_MIN = 20
RANDOM_DELAY_MAX = 30
# HTTPERROR_ALLOWED_CODES = [301,302]
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'spider_project.middlewares.SpiderProjectSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'spider_project.middlewares.SpiderProjectDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spider_project.pipelines.SpiderProjectPipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'spider_project.middlewares.RandomUserAgent': 543,
}
PROXIES = [
    {'ip_port': '223.199.26.27:8746', 'user_passwd': 'user1:pass1'},
    {'ip_port': '183.166.21.218:9999', 'user_passwd': 'user2:pass2'},
    {'ip_port': '223.199.18.87:9999', 'user_passwd': 'user3:pass3'},
    {'ip_port': '114.99.13.4:9999', 'user_passwd': 'user4:pass4'},
    {'ip_port': '47.112.214.45:8000', 'user_passwd': 'user5:pass5'},
]
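PROXIES is defined above but no middleware in the post actually uses it. A minimal sketch of a random-proxy downloader middleware that could consume it (the class name RandomProxy and the basic-auth handling are assumptions; it would also need its own DOWNLOADER_MIDDLEWARES entry):

# middlewares.py (sketch, not part of the original project)
import base64
import random

class RandomProxy(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('PROXIES', []))

    def process_request(self, request, spider):
        if not self.proxies:
            return
        proxy = random.choice(self.proxies)
        # Scrapy's HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = 'http://' + proxy['ip_port']
        if proxy.get('user_passwd'):
            # HTTP basic auth for the proxy, e.g. 'user1:pass1'
            creds = base64.b64encode(proxy['user_passwd'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + creds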
entrypoint.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'test'])
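entrypoint.py just launches the crawl from within Python (handy for running or debugging in an IDE); running scrapy crawl test from the project root is equivalent.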
Source: CSDN
Author: 等一场大️
Link: https://blog.csdn.net/weixin_43855694/article/details/104103680