Scraping Lianjia new-property listings with Python and the Scrapy framework

Submitted by 徘徊边缘 on 2020-01-30 00:51:03
spider_project.py
from spider_project.items import SpiderProjectItem
from bs4 import BeautifulSoup
from scrapy.http import Request
import scrapy
import math


class Pro_spider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['lianjia.com']
    base_url = 'https://hz.fang.lianjia.com/loupan/'
    regions = {
               'xihu':'西湖',
               'xiacheng':'下城',
               'jianggan':'江干',
               'gongshu':'拱墅',
               'shangcheng':'上城',
               'binjiang':'滨江',
               'yuhang':'余杭',
               'xiaoshan':'萧山'
    }

    def start_requests(self):
        # Request the first listings page of every district; parse_page then
        # works out how many pages each district actually has.
        index = 1
        for region in self.regions:
            url = self.base_url + region + '/' + 'pg{0}'.format(index)
            yield Request(url, callback=self.parse_page, dont_filter=True,
                          meta={'region': region, 'index': index})

    def parse_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # Each listings page shows 10 properties, so the page count is the
        # total count rounded up to the next multiple of 10.
        total_count = int(soup.find(attrs={'data-current': '1'})['data-total-count'])
        totalpage = math.ceil(total_count / 10)
        for index in range(1, totalpage + 1):
            url = self.base_url + response.meta['region'] + '/' + 'pg{0}'.format(index)
            yield Request(url, callback=self.parse, dont_filter=True,
                          meta={'region': response.meta['region'], 'index': index})
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        loupans = soup.find_all('div', class_='resblock-desc-wrapper')
        for loupan in loupans:
            # Build one item per property card so every listing on the page
            # is yielded, not just the last one.
            item = SpiderProjectItem()

            # Property name
            item['name'] = loupan.find_all('a', class_='name')[-1].get_text()

            # Full location string, e.g. "滨江,长河,闻涛路绿城九龙仓柳岸晓风"
            location_info = loupan.find('div', class_='resblock-location').find_all(['span', 'a'])
            item['location'] = ''.join(tag.get_text() for tag in location_info)

            # Unit price per square metre
            item['danjia'] = loupan.find('span', class_='number').string

            # Floor area, if the card shows one
            try:
                item['area'] = str(loupan.find('div', class_='resblock-area').get_text()).split()[-1]
            except Exception:
                item['area'] = None

            # Room layout, e.g. "2居/3居/4居/"
            rooms = loupan.find('a', class_='resblock-room').find_all('span')
            room_type = ''.join(span.get_text() + '/' for span in rooms)
            item['type'] = room_type if room_type else None

            # District key passed along from start_requests
            item['region'] = response.meta['region']

            yield item
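The spider imports SpiderProjectItem from spider_project.items, but the post does not include that file. Below is a minimal sketch of items.py covering only the six fields the spider fills; the field list is inferred from the code above, nothing else is from the original project.
items.py (sketch)
import scrapy


class SpiderProjectItem(scrapy.Item):
    name = scrapy.Field()      # property (楼盘) name
    region = scrapy.Field()    # district key, e.g. 'xihu'
    location = scrapy.Field()  # full location string
    area = scrapy.Field()      # floor-area text
    danjia = scrapy.Field()    # unit price per square metre
    type = scrapy.Field()      # room layout, e.g. '2居/3居/'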
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
import random
import time

from fake_useragent import UserAgent
from scrapy import signals


class SpiderProjectSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class SpiderProjectDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware. The random User-Agent logic lives in RandomUserAgent
        # below, so this default implementation just lets the request pass.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# Pick a random User-Agent for every outgoing request
class RandomUserAgent(object):
    def __init__(self):
        # Build the fake_useragent pool once instead of on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random

# Add a random delay between requests to throttle the crawl rate
class RandomDelayMiddleware(object):
    def __init__(self, delay_min, delay_max):
        self.delay_min = delay_min
        self.delay_max = delay_max

    @classmethod
    def from_crawler(cls, crawler):
        delay_min = crawler.settings.get("RANDOM_DELAY_MIN", 10)
        delay_max = crawler.settings.get("RANDOM_DELAY_MAX", 20)

        if not isinstance(delay_min, int):
            raise ValueError("RANDOM_DELAY_MIN must be an int")
        if not isinstance(delay_max, int):
            raise ValueError("RANDOM_DELAY_MAX must be an int")
        return cls(delay_min, delay_max)

    def process_request(self, request, spider):
        delay = random.randint(self.delay_min, self.delay_max)
        logging.info(">>> request.url is {0}, random delay: {1} s <<<".format(request.url, delay))
        time.sleep(delay)
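Note that RandomDelayMiddleware only takes effect if it is registered in DOWNLOADER_MIDDLEWARES; the settings.py shown below defines the RANDOM_DELAY_MIN / RANDOM_DELAY_MAX bounds but does not enable the middleware. If you want it active, an entry along these lines would be needed (the priority value 545 is an arbitrary choice, not something from the original post):
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'spider_project.middlewares.RandomUserAgent': 543,
    'spider_project.middlewares.RandomDelayMiddleware': 545,
}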
pipelines.py
# -*- coding: utf-8 -*-
import pymysql
# import time
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SpiderProjectPipeline(object):
    def open_spider(self, spider):
        self.connect = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='Xzl@K8sBASEserver!',
            database='spider_test',
            port=8635,
            charset='utf8',
            use_unicode=False
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        insert_sql = "INSERT INTO item_info(name,region," \
                     "location,area,danjia,type) " \
                     "VALUES (%s, %s, %s, %s, %s, %s)"
        params = (
            item['name'], item['region'], item['location'],
            item['area'], item['danjia'], item['type']
        )
        self.cursor.execute(insert_sql, params)
        self.connect.commit()
        return item

    # Scrapy calls close_spider (not spider_close) when the spider finishes
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
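The pipeline assumes an item_info table already exists in the spider_test database; its schema is not shown in the post. A one-off helper along these lines could create it (column types and lengths are assumptions derived from the INSERT statement, and the connection parameters simply mirror the ones above):
create_table.py (sketch, not part of the original project)
import pymysql

# Connection parameters copied from the pipeline above
connect = pymysql.connect(host='127.0.0.1', user='root', passwd='Xzl@K8sBASEserver!',
                          database='spider_test', port=8635, charset='utf8')
with connect.cursor() as cursor:
    # Column names match the INSERT in pipelines.py; types/lengths are guesses
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS item_info (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            region VARCHAR(64),
            location VARCHAR(255),
            area VARCHAR(64),
            danjia VARCHAR(64),
            type VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
connect.commit()
connect.close()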
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spider_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spider_project'

SPIDER_MODULES = ['spider_project.spiders']
NEWSPIDER_MODULE = 'spider_project.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spider_project (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# Bounds (in seconds) read by middlewares.RandomDelayMiddleware
RANDOM_DELAY_MIN = 20
RANDOM_DELAY_MAX = 30
# HTTPERROR_ALLOWED_CODES = [301,302]



# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'spider_project.middlewares.SpiderProjectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'spider_project.middlewares.SpiderProjectDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spider_project.pipelines.SpiderProjectPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'spider_project.middlewares.RandomUserAgent': 543,
}
# Proxy pool (not consumed by any middleware shown in this post)
PROXIES = [
    {'ip_port': '223.199.26.27:8746', 'user_passwd': 'user1:pass1'},
    {'ip_port': '183.166.21.218:9999', 'user_passwd': 'user2:pass2'},
    {'ip_port': '223.199.18.87:9999', 'user_passwd': 'user3:pass3'},
    {'ip_port': '114.99.13.4:9999', 'user_passwd': 'user4:pass4'},
    {'ip_port': '47.112.214.45:8000', 'user_passwd': 'user5:pass5'},
]
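Since nothing in the post actually uses PROXIES, here is a hedged sketch of a downloader middleware that would rotate through it; the class name RandomProxyMiddleware is hypothetical and not part of the original project, and it would still need to be added to middlewares.py and registered in DOWNLOADER_MIDDLEWARES. The user_passwd values in PROXIES are placeholders.
middlewares.py (sketch of a hypothetical proxy rotator)
import base64
import random


class RandomProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('PROXIES', []))

    def process_request(self, request, spider):
        if not self.proxies:
            return
        proxy = random.choice(self.proxies)
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = 'http://' + proxy['ip_port']
        if proxy.get('user_passwd'):
            creds = base64.b64encode(proxy['user_passwd'].encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + creds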
entrypoint.py
# Convenience launcher: equivalent to running `scrapy crawl test` from the project root
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'test'])

 
