I haven't figured out how to scrape the map coordinate points directly from the listing pages (scripts shared on Shenjianshou (神箭手) apparently can grab coordinates directly), so for now I'm scraping the other housing-related information from Lianjia. The body of the script follows this post by Tsukasa鱼, to whom many thanks.
### Initial setup
```python
import os
os.chdir("D:/")  # working directory
import json
import requests
import re
import pandas as pd
import time
import random
from itertools import chain
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import datetime
import numpy as np
import pymongo

def generate_allurl(user_in_nub, url):  # generate the list-page URLs
    for url_next in range(1, int(user_in_nub)):
        yield url.format(url_next)

def get_allurl_LJ(generate_allurl, wildcard):  # parse a list page and extract each detail-page URL
    ua = UserAgent()
    headers = {'User-Agent': ua.random}  # request header faked with fake_useragent
    time.sleep(random.uniform(0.5, 1.0))  # random delay to avoid errors from back-to-back requests
    get_url = requests.get(generate_allurl, headers=headers)
    if get_url.status_code == 200:
        re_set = re.compile(wildcard)
        re_get = re.findall(re_set, get_url.text)
        return re_get

def replace_column_LJ(info, info_column):  # strip extraneous characters from scraped fields
    for i in info_column:
        info[i] = info[i].replace(' ', '')
        info[i] = info[i].replace('\r', '')
        info[i] = info[i].replace('\n', '')
        info[i] = info[i].replace('㎡', '')
        info[i] = info[i].replace('元/平米', '')
    return info

def open_url_LJ(re_get):  # parse a detail page and extract the wanted fields
    ua = UserAgent()
    headers = {'User-Agent': ua.random}  # request header faked with fake_useragent
    time.sleep(random.uniform(0.5, 1.0))  # random delay to avoid errors from back-to-back requests
    res = requests.get(re_get, headers=headers)
    info = {}
    if res.status_code == 200 and "unitPriceValue" in res.text:
        soup = BeautifulSoup(res.text, 'lxml')
        info['挂牌网址'] = re_get       # listing URL
        info['爬取时间'] = time.time()  # scrape timestamp
        if "main" in res.text:
            info['房屋标题'] = soup.select('.main')[0].text           # listing title
            info['总计售价'] = soup.select('.total')[0].text           # total asking price
            info['单方售价'] = soup.select('.unitPriceValue')[0].text  # price per square metre
            info['建筑年代'] = soup.select('.subInfo')[2].text[:4]     # year built
            info['小区名称'] = soup.select('.info')[0].text            # community name
            info['所属政区'] = soup.select('.info a')[0].text          # district
            info['所属片区'] = soup.select('.info a')[1].text          # sub-area
            info['链家编号'] = soup.select('.houseRecord span')[1].text.replace('举报', '')  # Lianjia listing ID
        for i in soup.select('.base li'):  # "basic attributes" section
            i = str(i)
            if '</span>' in i:
                key, value = i.split('</span>')
                info[key[24:]] = value.rsplit('</li>')[0]  # the slice offsets strip the surrounding tag markup
        for i in soup.select('.transaction li'):  # "transaction attributes" section
            i = str(i)
            if '</span>' in i and '抵押信息' not in i:  # skip the mortgage-info row (different markup)
                key, value, drop = i.split('</span>')
                info[key[25:]] = value[7:].rsplit('</span>')[0]
        replace_column_LJ(info, ['套内面积', '建筑面积', '单方售价'])  # strip extraneous characters
    else:
        print('failed')
    return info
```
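Before committing to the full run, it can be worth sanity-checking the link pattern against a single list page. A minimal sketch using the two functions above; the URL is just page 1 of the same Beike search used below:

```python
# Quick sanity check: fetch one list page and confirm the pattern matches anything at all.
test_urls = get_allurl_LJ(
    'https://sz.ke.com/ershoufang/pg1co32ng1hu1nb1/',
    '<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"'
)
print(len(test_urls or []), (test_urls or [])[:3])  # expect roughly 30 detail-page URLs per list page
```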
### Collecting all the detail-page URLs to scrape

```python
wb_ttl = []
for i in generate_allurl(101, 'https://sz.ke.com/ershoufang/pg{}co32ng1hu1nb1/'):
    # url = 'https://sz.ke.com/ershoufang/pg{}co32ng1hu1nb1/'    # Beike: newest, excluding commercial/parking/basement
    # url = 'http://sz.lianjia.com/ershoufang/pg{}ng1hu1nb1tt2/' # Lianjia: newest, excluding commercial/parking/basement
    wb_tmp = get_allurl_LJ(i, '<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')  # pattern for the detail-page links on a list page
    if wb_tmp:  # guard against a failed fetch (get_allurl_LJ returns None on non-200)
        wb_ttl.append(wb_tmp)  # builds up a 2-D list across iterations
wb_ttl_alt = list(chain(*wb_ttl))  # flatten the 2-D list to 1-D
len_tmp = len(wb_ttl_alt)  # total number of detail pages to scrape
print("Need to scrape %d detail pages" % (len_tmp))
```
### Scraping the specified fields from each detail page

```python
myclient = pymongo.MongoClient(host='mongodb://localhost:27017/')  # open the connection
mydb = myclient['db_WebCrwr']  # database db_WebCrwr
mycol = mydb["cl_LJ_xhbn"]     # collection cl_LJ_xhbn
# dct_dt_tmp = {}  # alternative: build an empty dict and load the records into it row by row
c = 0
for i in wb_ttl_alt:
    print(i)
    doc = open_url_LJ(i)
    if doc:                    # skip failed pages rather than inserting empty documents
        mycol.insert_one(doc)  # write straight into MongoDB
    # dct_dt_tmp[i] = open_url_LJ(i)  # the in-memory dict alternative mentioned above
    c = c + 1
    if c % 1000 == 1:
        print(str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M')) + "; step %d of %d in total" % (c, len_tmp))
    time.sleep(2)  # avoid errors from back-to-back requests
```
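pandas is imported in the setup but never used in the loop; once the collection has filled up, here is a quick sketch (my addition) for pulling everything back into a DataFrame for cleaning. The output file name is just a placeholder:

```python
import pandas as pd
import pymongo

mycol = pymongo.MongoClient('mongodb://localhost:27017/')['db_WebCrwr']['cl_LJ_xhbn']
df = pd.DataFrame(list(mycol.find({}, {'_id': 0})))  # drop MongoDB's internal _id field
print(df.shape)
df.to_csv('lianjia_listings.csv', index=False, encoding='utf-8-sig')  # utf-8-sig keeps Chinese readable in Excel
```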
This has been running for a few weeks without problems. Next I'll match the community names (小区名称) against coordinate points from the Baidu Maps API to build the map visualization.
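As a rough preview of that step, a minimal geocoding sketch; it assumes you have registered a Baidu Maps API key (the `ak` below is a placeholder) and uses the v3 geocoding endpoint:

```python
import requests

def get_coords_baidu(name, city='深圳市', ak='your_ak_here'):  # 'your_ak_here' is a placeholder key
    """Look up a community name via the Baidu Maps geocoding API; returns (lng, lat) or None."""
    resp = requests.get('https://api.map.baidu.com/geocoding/v3/',
                        params={'address': name, 'city': city, 'output': 'json', 'ak': ak}).json()
    if resp.get('status') == 0:
        loc = resp['result']['location']  # note: coordinates come back in Baidu's BD-09 system
        return loc['lng'], loc['lat']
    return None

print(get_coords_baidu('某小区'))  # '某小区' is a hypothetical community name
```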
Source: CSDN
Author: 啊有猫
Link: https://blog.csdn.net/qq_39360343/article/details/81977416