Python——Scrapy爬取链家网站所有房源信息

浪子不回头ぞ 提交于 2021-02-06 09:49:57

 

用 Scrapy 爬取链家网全国各城市、各房源分类(二手房、新房、租房、商业办公、小区)的信息:

路径:

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianItem(scrapy.Item):
    """Placeholder item generated by the Scrapy project template; unused."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ErShouFangItem(scrapy.Item):
    """Item scraped from a second-hand home (二手房) detail page."""
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # total price (value + unit, e.g. "300万")
    total_price = scrapy.Field()
    # unit price (per square metre)
    single_price = scrapy.Field()
    # floor information
    room_info = scrapy.Field()
    # location (district - sub-district)
    region = scrapy.Field()
    # orientation and decoration status
    direction = scrapy.Field()
    # built-up area
    area = scrapy.Field()
    # building type
    house_struct = scrapy.Field()
    # house layout (e.g. "3室2厅")
    huxing = scrapy.Field()
    # purchase date
    buy_time = scrapy.Field()
    # detail-page url
    ershou_detail_url = scrapy.Field()


class NewHouseItem(scrapy.Item):
    """Item scraped from a new home (新房 / 楼盘) listing card."""
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # listing title
    title = scrapy.Field()
    # location
    region = scrapy.Field()
    # room information
    room_info = scrapy.Field()
    # built-up area
    area = scrapy.Field()
    # price
    price = scrapy.Field()
    # detail-page url
    newHouse_detail_url = scrapy.Field()


class RentHouseItem(scrapy.Item):
    """Item scraped from a rental home (租房) detail page."""
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # listing title
    title = scrapy.Field()
    # rent price
    price = scrapy.Field()
    # house information (layout, orientation, area, lease type)
    house_info = scrapy.Field()

    # publication date
    pub_time = scrapy.Field()
    # move-in date
    in_time = scrapy.Field()
    # lease term
    lease = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # elevator availability
    lift = scrapy.Field()
    # parking availability
    carport = scrapy.Field()
    # water supply type
    use_water = scrapy.Field()
    # electricity supply type
    use_electricity = scrapy.Field()
    # gas availability
    use_gas = scrapy.Field()
    # detail-page url
    rent_detail_url = scrapy.Field()

class OfficeHouseItem(scrapy.Item):
    """Item scraped from a commercial office (商业办公) listing card."""
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # listing title
    title = scrapy.Field()
    # price
    price = scrapy.Field()
    # quantity / unit count
    num = scrapy.Field()
    # area
    area = scrapy.Field()
    # detail-page url
    office_detail_url = scrapy.Field()

class XiaoquHouseItem(scrapy.Item):
    """Item scraped from a residential community (小区) detail page."""
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # community name
    title = scrapy.Field()
    # district
    region = scrapy.Field()
    # unit price
    single_price = scrapy.Field()
    # construction year
    build_time = scrapy.Field()
    # building type
    house_struct = scrapy.Field()
    # property-management fee
    service_fees = scrapy.Field()
    # property-management company
    service_company = scrapy.Field()
    # developer
    build_company = scrapy.Field()
    # number of buildings
    building_nums = scrapy.Field()
    # total number of homes
    house_nums = scrapy.Field()
    # detail-page url
    xiaoqu_detail_url = scrapy.Field()
View Code

 

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
from lian.items import ErShouFangItem, NewHouseItem,RentHouseItem,OfficeHouseItem,XiaoquHouseItem  # 已经导入成功,不用管


class LianPipeline(object):
    """Route each item type to its own JSON-lines output file.

    One ``JsonLinesItemExporter`` (and underlying file, opened in binary mode
    as the exporter requires) is kept per item class; ``process_item``
    dispatches on the item's type and ``close_spider`` closes every file.
    """

    def __init__(self):
        self.ershoufang_fp = open('ershoufang.json', 'wb')
        self.ershoufang_exporter = JsonLinesItemExporter(self.ershoufang_fp, ensure_ascii=False)

        self.newhouse_fp = open('newhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)

        self.renthouse_fp = open('renthouse.json', 'wb')
        self.renthouse_exporter = JsonLinesItemExporter(self.renthouse_fp, ensure_ascii=False)

        self.officehouse_fp = open('officehouse.json', 'wb')
        self.officehouse_exporter = JsonLinesItemExporter(self.officehouse_fp, ensure_ascii=False)

        self.xiaoquhouse_fp = open('xiaoquhouse.json', 'wb')
        self.xiaoquhouse_exporter = JsonLinesItemExporter(self.xiaoquhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        """Export *item* with the exporter matching its class and return it.

        Any item type not explicitly matched falls through to the xiaoqu
        exporter (same behavior as before).
        """
        if isinstance(item, ErShouFangItem):
            self.ershoufang_exporter.export_item(item)
        elif isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, RentHouseItem):
            self.renthouse_exporter.export_item(item)
        elif isinstance(item, OfficeHouseItem):
            self.officehouse_exporter.export_item(item)
        else:
            self.xiaoquhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close every output file when the spider finishes."""
        self.ershoufang_fp.close()
        self.newhouse_fp.close()
        self.renthouse_fp.close()
        # BUG FIX: this file was previously never closed — the original line
        # was commented out and also misspelled the call as `.closed()`.
        self.officehouse_fp.close()
        self.xiaoquhouse_fp.close()
View Code

 

 lian_spider.py

# -*- coding: utf-8 -*-
import scrapy
import re
from lian.items import ErShouFangItem,NewHouseItem,RentHouseItem,OfficeHouseItem,XiaoquHouseItem # 已经导入成功,不用管
class LianSpiderSpider(scrapy.Spider):
    """Crawl Lianjia (lianjia.com) listings nationwide.

    Starting from the national city index, the spider visits every city page
    and follows each listing category found in the city's navigation bar:
    second-hand homes (二手房), new homes (新房), rentals (租房), commercial
    offices (商业办公) and residential communities (小区), yielding one item
    type per category.
    """
    name = 'lian_spider'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']
    # Static request headers; the cookie was captured from a browser session
    # and will eventually expire — refresh it if requests start failing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'Cookie': 'select_city=510700; lianjia_uuid=8bd3d017-2c99-49a5-826e-986f56ce99b9; _smt_uid=5cd3cd13.44c49764; UM_distinctid=16a9b59145a158-0442ba7704d667-3b654406-c0000-16a9b59146011e; _jzqckmp=1; _ga=GA1.2.822868133.1557384475; _gid=GA1.2.801531476.1557384475; all-lj=ed5a77c9e9ec3809d0c1321ec78803ae; lianjia_ssid=50fd11a7-d48c-4dde-b281-287224c40487; TY_SESSION_ID=ae45e1a4-b6d9-46bb-81c8-7cff32931953; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1557384618,1557389971,1557392984,1557446598; _jzqc=1; _jzqy=1.1557384468.1557446599.1.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6.-; _qzjc=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22%24device_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _jzqa=1.1500973956232310800.1557384468.1557451920.1557454945.6; _jzqx=1.1557451920.1557454945.2.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/pag1/.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/; CNZZDATA1255604082=609852050-1557381958-https%253A%252F%252Fwww.baidu.com%252F%7C1557455869; CNZZDATA1254525948=1645681089-1557382543-https%253A%252F%252Fwww.baidu.com%252F%7C1557458144; CNZZDATA1255633284=262578687-1557381275-https%253A%252F%252Fwww.baidu.com%252F%7C1557458627; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1557459240; _qzja=1.677427564.1557384472885.1557451920228.1557454945305.1557459200351.1557459240226.0.0.0.62.6; _qzjb=1.1557454945305.13.0.0.0; _qzjto=33.3.0; _jzqb=1.13.10.1557454945.1'
    }

    # National city index: yield one request per city.
    def parse(self, response):
        lis = response.xpath('//div[@class="city_list_section"]/ul/li')
        for li in lis:
            province = li.xpath('.//div[@class="city_list_tit c_b"]/text()').extract_first()
            lis2 = li.xpath('.//div[@class="city_province"]/ul/li')
            for l in lis2:
                # BUG FIX: the original reused ONE dict across iterations and
                # appended it to a (never-read) list, so every entry aliased
                # the last city. Build a fresh dict per city; the unused
                # accumulator list was removed.
                city_info = {
                    'city': l.xpath('./a/text()').extract_first(),
                    'city_link': l.xpath('./a/@href').extract_first(),
                }
                yield scrapy.Request(
                    url=city_info['city_link'],
                    headers=self.headers,
                    callback=self.parse_rent_type,
                    meta={'city_name': (province, city_info['city'])}
                )
        # Overseas listings are identical on every city page, so they would
        # only need to be crawled once:
        # yield scrapy.Request(
        #     url='https://i.lianjia.com/us',
        #     headers=self.headers,
        #     callback=self.parse_haiwai
        # )

    # Dispatch on listing category (二手房, 新房, 租房, 商业办公, 小区).
    def parse_rent_type(self, response):
        province, city_name = response.meta.get('city_name')
        lis = response.xpath('//div[@class="nav typeUserInfo"]/ul/li')
        for li in lis:
            # Renamed from `type` to avoid shadowing the builtin.
            nav_type = li.xpath('./a/text()').extract_first()
            if nav_type == '二手房':
                ershoufang_link = li.xpath('./a/@href').extract_first()
                # Pages 1..100; enumerate replaces the manual counter.
                next_urls = [ershoufang_link + '/pg{}/'.format(i) for i in range(1, 101)]
                for page, url in enumerate(next_urls, start=1):
                    yield scrapy.Request(
                        url=url,
                        headers=self.headers,
                        callback=self.parse_ershoufang,
                        meta={'city_name': (province, city_name, page)}
                    )
            # No reliable page count on this category's index.
            elif nav_type == '新房':
                xinfang_link = li.xpath('./a/@href').extract_first()
                xinfang_link = xinfang_link + '/loupan/'
                yield scrapy.Request(
                    url=xinfang_link,
                    headers=self.headers,
                    callback=self.parse_xinfang,
                    meta={'city_name': (province, city_name)}
                )
            elif nav_type == '租房':
                zufang_link = li.xpath('./a/@href').extract_first()
                next_urls = [zufang_link + '/pg{}/'.format(i) for i in range(1, 101)]
                for page, url in enumerate(next_urls, start=1):
                    yield scrapy.Request(
                        url=url,
                        headers=self.headers,
                        callback=self.parse_zufang,
                        meta={'city_name': (url, province, city_name, page)}
                    )
            # No reliable page count on this category's index.
            elif nav_type == '商业办公':
                # TODO: the site redirects here, so only one page is crawled.
                shangyebangong_link = li.xpath('./a/@href').extract_first()
                # BUG FIX: test for a missing link BEFORE concatenating — the
                # original compared `str(x) + suffix` against None, which can
                # never match.
                if shangyebangong_link is None:
                    continue
                shangyebangong_link = shangyebangong_link + "/xzl/rent/mlist"
                yield scrapy.Request(
                    url=shangyebangong_link,
                    headers=self.headers,
                    callback=self.parse_shangyebangong,
                    meta={'city_name': (province, city_name)}
                )
            # No reliable page count on this category's index.
            elif nav_type == '小区':
                xiaoqu_link = li.xpath('./a/@href').extract_first()
                yield scrapy.Request(
                    url=xiaoqu_link,
                    headers=self.headers,
                    callback=self.parse_xiaoqu,
                    meta={'city_name': (province, city_name)}
                )

    # Second-hand index page: follow each listing's detail page.
    def parse_ershoufang(self, response):
        province, city_name, page = response.meta.get('city_name')
        lis = response.xpath('//ul[@class="sellListContent"]/li')
        for li in lis:
            ershou_detail_link = li.xpath('.//div[@class="title"]/a/@href').extract_first()
            # Some cards carry no link; skip them.
            if ershou_detail_link is None:
                continue
            yield scrapy.Request(
                url=ershou_detail_link,
                headers=self.headers,
                callback=self.parse_ershoufang_detail,
                meta={'city_name': (ershou_detail_link, province, city_name, page)}
            )

    # Second-hand detail page -> ErShouFangItem.
    def parse_ershoufang_detail(self, response):
        ershou_detail_link, province, city_name, page = response.meta.get('city_name')
        title = response.xpath('//div[@class="sellDetailHeader"]//div[@class="title"]/h1/text()').extract_first()
        # NOTE(review): these concatenations assume the first extract_first()
        # of each pair is never None on a valid detail page — confirm; a
        # missing node would raise TypeError here.
        total_price = response.xpath('//div[@class="price "]/span[@class="total"]/text()').extract_first() + str(response.xpath('//div[@class="price "]/span[@class="unit"]/span/text()').extract_first()).strip()
        single_price = response.xpath('//span[@class="unitPriceValue"]/text()').extract_first() + str(response.xpath('//span[@class="unitPriceValue"]/i/text()').extract_first())
        room_info = response.xpath('//div[@class="room"]/div[1]/text()').extract_first() + '-' + response.xpath('//div[@class="room"]/div[2]/text()').extract_first()
        region = response.xpath('//div[@class="areaName"]/span[@class="info"]/a[1]/text()').extract_first() + '-' + response.xpath('//div[@class="areaName"]/span[@class="info"]/a[2]/text()').extract_first()
        direction = response.xpath('//div[@class="type"]/div[1]/text()').extract_first() + '-' + response.xpath('//div[@class="type"]/div[2]/text()').extract_first()
        area = response.xpath('//div[@class="area"]/div[1]/text()').extract_first()
        house_struct = response.xpath('//div[@class="area"]/div[2]/text()').extract_first()
        huxing = response.xpath('//div[@class="introContent"]/div[1]/div[2]/ul/li[1]/text()').extract_first()
        buy_time = response.xpath('//div[@class="transaction"]/div[2]/ul/li[3]/span[2]/text()').extract_first()
        print("***第{}页*** 城市:{}   二手房   标题:{} 总价:{} 单价:{} 楼层:{} 住宅位置:{} 房屋朝向:{} 建筑面积:{} 建筑类型:{} 房屋户型:{} 购买时间:{}".format(page, city_name, title,total_price,single_price,room_info,region,direction,area,house_struct,huxing,buy_time))

        item = ErShouFangItem(
            province = province,
            city = city_name,
            total_price = total_price,
            single_price = single_price,
            room_info = room_info,
            region = region,
            direction = direction,
            area = area,
            house_struct = house_struct,
            huxing = huxing,
            buy_time = buy_time,
            ershou_detail_url = ershou_detail_link
        )
        yield item

    # New-home (楼盘) index page -> NewHouseItem per card.
    def parse_xinfang(self, response):
        province, city_name = response.meta.get('city_name')
        lis = response.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for li in lis:
            title = li.xpath('./a[@class="resblock-img-wrapper "]/@title').extract_first()

            region_infos = li.xpath('.//div[@class="resblock-location"]//text()').extract()
            region = ''
            for fragment in region_infos:
                region = region + fragment.replace('\n', '').strip(' ')

            room_infos = li.xpath('.//a[@class="resblock-room"]/span//text()').extract()
            room_info = ''
            for fragment in room_infos:
                room_info = room_info + fragment.strip(' ')

            area_infos = li.xpath('.//div[@class="main-price"]/span//text()').extract()
            area = ''
            for fragment in area_infos:
                area = area + fragment.strip(' ')

            # Append the unit and strip leading/trailing whitespace.
            price = li.xpath('.//div[@class="main-price"]/span[1]/text()').extract_first() + str(li.xpath('.//div[@class="main-price"]/span[2]/text()').extract_first()).strip()

            # BUG FIX: the original hard-coded the Beijing domain
            # ('https://bj.fang.lianjia.com') for every city; resolve the
            # relative href against the page actually being parsed.
            newhouse_detail_url = response.urljoin(str(li.xpath('./a[@class="resblock-img-wrapper "]/@href').extract_first()))
            print("城市:{}   新房  {}  {}".format(city_name,title, newhouse_detail_url))
            item = NewHouseItem(
                province=province,
                city = city_name,
                title = title,
                region = region,
                room_info = room_info,
                area = area,
                price = price,
                newHouse_detail_url = newhouse_detail_url
            )
            yield item

    # Rental index page: follow each listing's detail page.
    def parse_zufang(self, response):
        zufang_link, province, city_name, page = response.meta.get('city_name')
        # Strip the trailing '/pg<n>/' so only the city base URL remains
        # (the links built in parse_rent_type contain '/zufang//pg<n>/').
        zufang_link = re.findall(r'(.*?)/zufang//pg\d+/', zufang_link)[0]
        items = response.xpath('//div[@class="content__list"]/div')
        for zu in items:
            href = zu.xpath('./a[@class="content__list--item--aside"]/@href').extract_first()
            # BUG FIX: test for a missing href BEFORE concatenating — the
            # original checked base + str(href) against None, which can never
            # match, and would have requested a URL ending in 'None'.
            if href is None:
                continue
            zufang_detail_link = zufang_link + href
            yield scrapy.Request(
                url=zufang_detail_link,
                headers=self.headers,
                callback=self.parse_zufang_detail,
                meta={'city_name': (zufang_detail_link, province, city_name, page)}
            )

    # Rental detail page -> RentHouseItem.
    def parse_zufang_detail(self, response):
        zufang_detail_link, province, city_name, page = response.meta.get('city_name')
        title = response.xpath('//div[@class="content clear w1150"]/p/text()').extract_first()
        price = response.xpath('//div[@class="content__aside fr"]/p/span/text()').extract_first()
        house_infos = response.xpath('//ul[@class="content__aside__list"]/p//text()').extract()
        house_info = ''
        # BUG FIX: the original loop variable was `i`, clobbering the page
        # number unpacked from meta, so the final print showed a text
        # fragment instead of the page.
        for fragment in house_infos:
            house_info = house_info + fragment.replace('\n', '/').strip(' ')
        # Publication date: pull the first yyyy-mm-dd found in the subtitle.
        pub_time = str(response.xpath('string(//div[@class="content__subtitle"])').extract_first())
        pub_time = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', pub_time)
        if pub_time:
            pub_time = pub_time[0]
        else:
            pub_time = None
        # Move-in date
        in_time = response.xpath('//div[@class="content__article__info"]/ul/li[3]/text()').extract_first()
        # Lease term
        lease = response.xpath('//div[@class="content__article__info"]/ul/li[5]/text()').extract_first()
        # Floor
        floor = response.xpath('//div[@class="content__article__info"]/ul/li[8]/text()').extract_first()
        # Elevator availability
        lift = response.xpath('//div[@class="content__article__info"]/ul/li[9]/text()').extract_first()
        # Parking availability
        carport = response.xpath('//div[@class="content__article__info"]/ul/li[11]/text()').extract_first()
        use_water = response.xpath('//div[@class="content__article__info"]/ul/li[12]/text()').extract_first()
        use_electricity = response.xpath('//div[@class="content__article__info"]/ul/li[14]/text()').extract_first()
        use_gas = response.xpath('//div[@class="content__article__info"]/ul/li[15]/text()').extract_first()

        item = RentHouseItem(
            province = province,
            city = city_name,
            title = title,
            price = price,
            house_info = house_info,
            pub_time = pub_time,
            in_time = in_time,
            lease = lease,
            floor = floor,
            lift = lift,
            carport = carport,
            use_water = use_water,
            use_electricity = use_electricity,
            use_gas = use_gas,
            rent_detail_url = zufang_detail_link
        )
        yield item
        print("***第{}页*** 城市:{}   租房   {}   {}".format(page, city_name, title, price))

    # Overseas listings (disabled; see the note in parse()).
    # def parse_haiwai(self,response):
    #     items = response.xpath('//*[@id="env"]/div[4]/div/div[2]')
    #     for i in items:
    #         title = i.xpath('.//div[class="titles"]/a/div/text()').extract_first()
    #         price = i.xpath('.//span[@class="fr"]/text()').extract_first()
    #         print("城市:美国   标题:{}   价格:{}".format(title,price))

    # Commercial-office index page -> OfficeHouseItem per card.
    def parse_shangyebangong(self, response):
        province, city_name = response.meta.get('city_name')
        cards = response.xpath('//div[@class="result__ul"]/a')
        for card in cards:
            # BUG FIX: the original queried response.xpath('./@href') — a
            # relative path against the whole response, never extracted — so
            # the url field held an empty SelectorList. Extract it from the
            # card itself.
            office_detail_url = card.xpath('./@href').extract_first()
            title = card.xpath('./div/p[@class="result__li-title"]/text()').extract_first()
            area = card.xpath('./div/p[@class="result__li-features"]/text()').extract_first()
            nums = card.xpath('./div/p[@class="result__li-other"]/text()').extract_first()
            price = card.xpath('./div/p[@class="result__li-price"]/span/text()').extract_first()
            item = OfficeHouseItem(
                province = province,
                city = city_name,
                title = title,
                price = price,
                num = nums,
                area = area,
                office_detail_url = office_detail_url
            )
            yield item
            print("城市:{}   商业办公   标题:{}   面积:{}   数量:{}   价格:{}   url:{}".format(city_name, title, area, nums, price, office_detail_url))

    # Community (小区) index page: follow each community's detail page.
    def parse_xiaoqu(self, response):
        province, city_name = response.meta.get('city_name')
        ul = response.xpath('//ul[@class="listContent"]/li')
        for li in ul:
            xiaoqu_detail_link = li.xpath('.//a[@class="img"]/@href').extract_first()
            if xiaoqu_detail_link is None:
                continue
            yield scrapy.Request(
                url=xiaoqu_detail_link,
                headers=self.headers,
                callback=self.parse_xiaoqu_detail,
                meta={'city_name': (xiaoqu_detail_link, province, city_name)}
            )

    # Community detail page -> XiaoquHouseItem.
    def parse_xiaoqu_detail(self, response):
        xiaoqu_detail_link, province, city_name = response.meta.get('city_name')
        title = response.xpath('//h1[@class="detailTitle"]/text()').extract_first()
        region = response.xpath('//div[@class="detailDesc"]/text()').extract_first()
        single_price = response.xpath('//span[@class="xiaoquUnitPrice"]/text()').extract_first()

        # Nth row of the info table (span[2] holds the value).
        def info(n):
            return response.xpath('//div[@class="xiaoquInfo"]/div[{}]/span[2]/text()'.format(n)).extract_first()

        # Some communities have no construction-year row; when it is missing
        # every following row shifts up by one, so pick the index offset
        # after checking whether the first row contains digits (a year).
        build_time = str(info(1)).strip()
        if re.findall(r'[0-9]+', build_time):
            offset = 1
        else:
            build_time = None
            offset = 0
        house_struct = info(1 + offset)
        service_fees = info(2 + offset)
        service_company = info(3 + offset)
        build_company = info(4 + offset)
        building_nums = info(5 + offset)
        house_nums = info(6 + offset)

        item = XiaoquHouseItem(
            province=province,
            city = city_name,
            title=title,
            region=region,
            single_price=single_price,
            build_time=build_time,
            house_struct=house_struct,
            service_fees=service_fees,
            service_company=service_company,
            build_company=build_company,
            building_nums=building_nums,
            house_nums=house_nums,
            xiaoqu_detail_url=xiaoqu_detail_link
        )
        yield item
        print("省份:{} 城市:{}   小区   {}   {}   {}   {}   {}   {}   {}".format(province, city_name, build_time,house_struct,service_fees,service_company,build_company,building_nums,house_nums))
View Code

 

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for lian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lian'

SPIDER_MODULES = ['lian.spiders']
NEWSPIDER_MODULE = 'lian.spiders'

# Show only warnings and errors; the spider prints its own progress lines.
LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'

# Obey robots.txt rules
# NOTE(review): obeying robots.txt may filter out some listing pages on this
# site — confirm this is intended.
ROBOTSTXT_OBEY = True

# Accept responses whose body is shorter than Content-Length instead of
# failing the request with a data-loss error.
DOWNLOAD_FAIL_ON_DATALOSS = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lian.middlewares.LianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lian.middlewares.LianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Single pipeline that writes each item type to its own JSON-lines file.
ITEM_PIPELINES = {
   'lian.pipelines.LianPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
View Code

 

结果:

 

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!