This Lianjia (链家) crawler relies on a proxy pool; see my earlier articles for how to set one up.
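For reference, the script below assumes the pool exposes a GET /get endpoint on localhost:5000 that returns a bare ip:port string; that interface is an assumption carried over from my proxy-pool setup, so adjust it to match yours. A quick sanity check before running the crawler:

import requests

# Hypothetical check: the pool should answer with something like '1.2.3.4:8080'.
resp = requests.get('http://localhost:5000/get')
print('proxy from pool:', resp.text)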
import hashlib

import pymongo
import requests
from lxml import etree

def get_proxies():
    # Ask the local proxy pool for a proxy; return None if the pool is unreachable.
    try:
        response = requests.get('http://localhost:5000/get')
        proxies = {
            'http': 'http://' + response.text
        }
        return proxies
    except Exception:
        return None
# Request the url through the proxy and return the parsed lxml tree.
def get_xpath_by_requests(url, proxies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://bj.lianjia.com/?utm_source=baidu&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x',
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        return etree.HTML(response.text)
    except Exception:
        # The request failed, most likely a dead proxy: fetch a new one and retry.
        proxies_new = get_proxies()
        print('fetching a new proxy', proxies_new)
        return get_xpath_by_requests(url, proxies_new)
def get_text(text):
    # xpath() returns a list; take the first match or fall back to an empty string.
    if text:
        return text[0]
    return ''

def get_md5(value):
    md5 = hashlib.md5(bytes(value, encoding='utf-8'))
    return md5.hexdigest()
def write_to_mongo(item):
    # Upsert keyed on the md5 of the detail url, so re-running the crawl
    # updates existing listings instead of inserting duplicates.
    item['hash_url'] = get_md5(item['detail_url'])
    db['beijing'].update_one({'hash_url': item['hash_url']}, {'$set': item}, upsert=True)
def parse_page(div_list):
    for div in div_list:
        title = get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/text()')).strip()
        price = get_text(div.xpath('.//span[@class="content__list--item-price"]/em/text()'))
        # Still to extract: address, size, orientation, layout (see the helper below).
        # Detail page link
        detail_url = get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/@href'))
        item = {
            'title': title,
            'price': price,
            'detail_url': detail_url,
        }
        write_to_mongo(item)
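# The placeholder fields above (address, size, orientation, layout) all sit in
# one description line on the listing card. A hypothetical helper for splitting
# it -- the '/'-separated format is an assumption about how the page renders:
def split_desc(desc):
    parts = [p.strip() for p in desc.split('/')]
    # e.g. '朝阳-望京 / 89.00㎡ / 南 / 2室1厅1卫' -> address, size, orientation, layout
    return dict(zip(['address', 'size', 'orientation', 'layout'], parts))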
def parse_area(url):
    # Pagination, option 1: request pg1, pg2, ... until a page comes back empty.
    i = 1
    while True:
        page_url = url + 'pg{}'.format(i)
        html = get_xpath_by_requests(page_url, proxies)
        div_list = html.xpath('//div[@class="content__list"]/div')
        if not div_list:
            break
        parse_page(div_list)
        i += 1
    # Option 2: read the max page number from the pager, then loop over
    # range(1, max_page + 1).
    # Option 3: mimic clicking "next page": parse the current page, read the
    # next-page link, and if it is not empty, build the next url and recurse
    # (see the sketch below).
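# A sketch of option 3 as an unused alternative. The pager selector here is an
# assumption about Lianjia's markup and may need adjusting against the live page:
def parse_area_by_next(url):
    html = get_xpath_by_requests(url, proxies)
    parse_page(html.xpath('//div[@class="content__list"]/div'))
    # Assumed selector: the "next page" anchor carries class "next".
    next_href = get_text(html.xpath('//a[@class="next"]/@href'))
    if next_href:
        parse_area_by_next('https://bj.lianjia.com' + next_href)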
def main():
    base_url = 'https://bj.lianjia.com/zufang/'
    html = get_xpath_by_requests(base_url, proxies)
    # Grab the district links (skip the first li, which is the "no limit" filter).
    areas = html.xpath('//div[@id="filter"]/ul[2]/li[position()>1]/a/@href')
    for area in areas:
        area_url = 'https://bj.lianjia.com' + area
        parse_area(area_url)

if __name__ == '__main__':
    # Connect to MongoDB and select the database.
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client['lianjia']
    proxies = get_proxies()
    main()
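After a run you can spot-check what landed in MongoDB. A minimal sketch, assuming the same local MongoDB instance as above:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
coll = client['lianjia']['beijing']
print(coll.count_documents({}))        # how many listings were stored
print(coll.find_one({}, {'_id': 0}))   # peek at one stored document

Because write_to_mongo upserts on hash_url, running the crawler again should update these documents rather than duplicate them.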
Source: CSDN
Author: D_dalei
Link: https://blog.csdn.net/D_wart/article/details/103721306