1. Scraping the Lianjia data
(Lianjia caps second-hand-housing search results at 100 pages, i.e. at most 3,000 listings per query, so I crawl the results district by district. Chaoyang and Haidian exceed the cap even on their own, so they are split further with Lianjia's pN path segments, which narrow each query by price band.)
First, collect the detail-page URL of every listing from the search-result pages and store them in apartment_url.csv:
# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
#Some User Agents
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
{'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\
{'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},\
{'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},\
{'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
{'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
{'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},\
{'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},\
{'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]
# One search entry per district; Chaoyang and Haidian return more than 100
# pages on their own, so they are split by the pN price-band filters.
websites = ["dongcheng","xicheng","chaoyang/p1p2p3p4","chaoyang/p5","chaoyang/p6","chaoyang/p7p8","haidian/p1p2p3p4p5p6","haidian/p7p8","fengtai","shijingshan","tongzhou","changping","daxing","yizhuangkaifaqu","shunyi","fangshan","mentougou","pinggu","huairou","miyun"]
# Number of search-result pages under each entry above.
pages = [37,42,60,50,80,82,85,42,77,24,41,97,54,13,33,26,16,1,1,1]
pages_url = []         # every search-result page URL
detail_pages_url = []  # every listing detail-page URL
def generate_pages_url():
    # Build every search-result page URL, e.g.
    # https://bj.lianjia.com/ershoufang/dongcheng/pg1
    for i in range(len(websites)):
        for j in range(1, pages[i] + 1):
            pages_url.append("https://bj.lianjia.com/ershoufang/" + websites[i] + "/pg" + str(j))

def generate_apartments_url():
    # Scrape every search-result page, then dump the collected
    # detail-page URLs to apartment_url.csv.
    for url in pages_url:
        url_spider(url)
    apartment_w = open("apartment_url.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect="excel")
    for i in detail_pages_url:
        csv_writer.writerow([i])
    apartment_w.close()

def url_spider(url):
    # Fetch one search-result page and pull the detail-page link out of
    # each listing block (div.info.clear > div.title > a).
    try:
        req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
        source_code = urllib2.urlopen(req, timeout=5).read()
        plain_text = unicode(source_code, 'utf-8', 'ignore')
        soup = BeautifulSoup(plain_text, 'html.parser')
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return
    for cj in soup.findAll('div', {'class': 'info clear'}):
        title = cj.find('div', {'class': 'title'})
        detail_pages_url.append(title.find('a').get('href'))

if __name__ == "__main__":
    # Build the list of all search-result pages for Beijing (pages_url).
    generate_pages_url()
    # Collect the detail-page URL of every listing (detail_pages_url)
    # and write them out to apartment_url.csv.
    generate_apartments_url()
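The script above fires requests back to back, and Lianjia throttles aggressive clients. A minimal retry-with-backoff sketch (my addition, not part of the original; polite_read is a hypothetical helper that could replace the urllib2.urlopen(...).read() calls in either script):

import random
import time
import urllib2

def polite_read(url, headers_pool, retries=3):
    # Fetch a URL with a random User-Agent, a short pause between
    # requests, and exponential backoff on failure; None if all tries fail.
    for attempt in range(retries):
        try:
            time.sleep(random.uniform(0.5, 2.0))  # be gentle between requests
            req = urllib2.Request(url, headers=headers_pool[random.randint(0, len(headers_pool) - 1)])
            return urllib2.urlopen(req, timeout=5).read()
        except (urllib2.HTTPError, urllib2.URLError), e:
            print e
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...
    return None

Next, a second script visits each detail-page URL (read here from a file named 19000+.csv, evidently a batch of the URLs collected above) and writes the listing attributes to apartments19000+.csv and the cover-image URLs to img19000+.csv: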
# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
#Some User Agents
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
{'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\
{'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},\
{'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},\
{'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
{'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
{'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
{'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},\
{'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},\
{'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]
def apartment_spider(url):
    # Scrape one listing detail page and append one row to each CSV.
    print(url)
    try:
        req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
        source_code = urllib2.urlopen(req, timeout=5).read()
        plain_text = unicode(source_code, 'utf-8', 'ignore')
        soup = BeautifulSoup(plain_text, 'html.parser')
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return
    # Listing title
    title = soup.find('div',{'class':'sellDetailHeader'}).find('div',{'class':'title-wrapper'}).find('div',{'class':'content'}).find('div',{'class':'title'}).find('h1').get('title')
    overview = soup.find('div',{'class':'overview'}).find('div',{'class':'content'})
    # Cover-image URL (some listings have no photos)
    img_info = soup.find('div',{'class':'overview'}).find('div',{'class':'thumbnail'}).find('li')
    if img_info is not None:
        img = img_info.get('data-src')
    else:
        img = ""
    # Total price
    total_price = soup.find('div',{'class':'price'}).find('span',{'class':'total'}).next_element
    # Price per square metre
    unit_price = overview.find('div',{'class':'price'}).find('div',{'class':'text'}).find('div',{'class':'unitPrice'}).find('span').next_element
    # Orientation
    orientation = overview.find('div',{'class':'type'}).find('div',{'class':'mainInfo'}).next_element
    # Floor area in square metres ('平米' = sq m)
    area = overview.find('div',{'class':'area'}).find('div',{'class':'mainInfo'}).next_element.split('平米')[0]
    year_info = overview.find('div',{'class':'area'}).find('div',{'class':'subInfo'}).next_element
    aroundInfo = overview.find('div',{'class':'aroundInfo'})
    # Community (xiaoqu) name and its Lianjia id
    name_info = aroundInfo.find('div',{'class':'communityName'})
    name = name_info.find('a',{'class':'info'}).next_element
    name_id = name_info.find('a',{'class':'info'}).get('href').split('/')[2]
    subway_info = aroundInfo.find('div',{'class':'areaName'}).find('a',{'class':'supplement'}).next_element
    place_info = aroundInfo.find('div',{'class':'areaName'}).find('span',{'class':'info'}).findAll('a')
    # District and neighbourhood
    district = place_info[0].next_element
    place = place_info[1].next_element
    # Lianjia house-record id
    id = aroundInfo.find('div',{'class':'houseRecord'}).find('span',{'class':'info'}).next_element
    # Room layout and listing time, from the introduction block
    introContent = soup.find('div',{'class':'m-content'}).find('div',{'class':'box-l'}).find('div',{'id':'introduction'}).find('div',{'class':'introContent'})
    room_type = str(introContent.find('div',{'class':'content'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]
    time = str(introContent.find('div',{'class':'transaction'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]
    row = [title, total_price, unit_price, orientation, district, place, area, name, name_id, time,
           room_type, id, subway_info]
    csv_writer.writerow(row)
    csv_writer2.writerow([id, img])
if __name__ == "__main__":
    # CSV of listing attributes
    apartment_w = open("apartments19000+.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect="excel")
    # CSV of (id, cover-image URL) pairs
    img_w = open("img19000+.csv", "a")
    csv_writer2 = csv.writer(img_w, dialect="excel")
    # Detail-page URLs collected in step 1 (this batch saved as 19000+.csv)
    urls = open("19000+.csv", "r")
    for url in urls:
        apartment_spider(url.strip())  # strip the trailing newline
    apartment_w.close()
    img_w.close()
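The selector chain in apartment_spider is tied to Lianjia's page markup at the time of writing; any element that is missing (a listing without subway info, say) raises an AttributeError, and the bare except then drops the whole listing. A small guard helper (my sketch, not in the original script) would let a row survive one absent field:

def safe_text(node, default=""):
    # Return the element's leading text, or a default when the final
    # lookup in a selector chain came back as None.
    return node.next_element if node is not None else default

# e.g. guard the subway lookup, which is absent on some listings:
# subway_info = safe_text(aroundInfo.find('div', {'class': 'areaName'}).find('a', {'class': 'supplement'}))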
Finally, a third script downloads the cover images listed in img19000+.csv into an imgs/ directory:
# -*- coding: utf-8 -*-
import urllib

def img(url, id):
    # Download one image and save it as imgs/<id>.jpg
    # (the imgs/ directory must already exist).
    try:
        response = urllib.urlopen(url)
        get_img = response.read()
    except Exception, e:
        print e
        return
    with open('imgs/' + str(id) + '.jpg', 'wb') as fp:
        fp.write(get_img)
    print(id)

if __name__ == "__main__":
    lines = open("img19000+.csv", "r")
    count = 0
    for line in lines:
        # Each row is "id,url"; strip the trailing newline off the URL.
        id = line.split(',')[0]
        url = line.split(',')[1].strip()
        img(url, id)
        count = count + 1
        print count
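Downloading 19,000-odd images takes long enough that the run will almost certainly be interrupted at some point. A resumable variant (my addition, not in the original) simply skips files that are already on disk:

import os

def img_resumable(url, id):
    # Re-runnable wrapper around img(): skip anything already downloaded,
    # so the script can be restarted after a dropped connection.
    path = 'imgs/' + str(id) + '.jpg'
    if os.path.exists(path):
        return
    img(url, id)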
Source: CSDN
Author: nancy0314
Link: https://blog.csdn.net/nancy0314/article/details/78421409