To help a friend with a thesis, I needed to analyze Zhihu Q&A data. The volume was small, so I simply used Selenium's window.scrollTo scrolling to crawl questions and answers from Zhihu's "抑郁症" (depression) topic.
1. Modules imported at the top
import requests, json, random
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib
import os.path
try:
    from PIL import Image
except ImportError:
    pass
import time, csv, re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # the Options class for configuring Chrome
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

chrome_options = Options()  # instantiate an Options object
chrome_options.add_argument('--headless')  # put Chrome into silent (headless) mode
driver = webdriver.Chrome(options=chrome_options)  # Chrome as the engine, running quietly in the background
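A quick way to confirm the headless setup works (a minimal, illustrative check; any public page would do):

# smoke test: the page title should print without a browser window appearing
driver.get('https://www.zhihu.com')
print(driver.title)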
2. Logging in to Zhihu with Python
# Build the request headers
# the User-Agent pool would normally come from a config table
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.8.3.16721",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
]
ua = random.choice(ua_list)
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/topic/19564862/hot",
    'User-Agent': str(ua)
}
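If you want a freshly sampled User-Agent on every request instead of one per run, a small helper does the job (a sketch; fresh_headers is my name, not from the original):

def fresh_headers():
    # copy the base headers and swap in a newly sampled User-Agent
    h = dict(headers)
    h['User-Agent'] = random.choice(ua_list)
    return h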
# Proxy IPs
# URL of a proxy-pool API that returns JSON; fill in your own
url_ip = ''
resp = requests.get(url=url_ip)
proxies_list = []
if resp.status_code == 200:
    data_json = resp.json()
    for d in data_json['obj']:
        port = d['port']
        ip = d['ip']
        full_ip = ip + ':' + str(port)  # str() in case the API returns the port as a number
        proxy = {'http': full_ip}  # avoid shadowing the built-in dict
        proxies_list.append(proxy)
proxies = random.choice(proxies_list)
# print(proxies)
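Free proxy pools are unreliable, so it may be worth verifying a proxy actually responds before using it. A minimal sketch (httpbin.org/ip as the test URL and the 5-second timeout are my choices, not from the original):

def check_proxy(proxy):
    # True if the proxy can fetch a test page within 5 seconds
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
        return r.status_code == 200
    except requests.RequestException:
        return False

working = [p for p in proxies_list if check_proxy(p)]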
# Account and password
account = 'your Zhihu account'
secret = 'your Zhihu password'
cookie_file = 'cookie.txt'
header = {
    'Host': 'www.zhihu.com',
    'Referer': 'http://www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
}
# reuse saved login cookies through a session
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename=cookie_file)
try:
    session.cookies.load(ignore_discard=True)
except:
    print("Failed to load cookies")
def get_xsrf():
    '''_xsrf is a dynamically changing anti-CSRF parameter required for login'''
    index_url = 'https://www.zhihu.com'
    # fetch the home page so the server sets the _xsrf cookie
    index_page = session.get(index_url, headers=header)
    # alternative: pull it out of the HTML with
    # re.findall(r'name="_xsrf" value="(.*?)"', index_page.text)
    xsrf = index_page.request._cookies.get("_xsrf")
    return xsrf
# Fetch the login captcha
def get_captcha():
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=header)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # show the captcha with Pillow's Image;
    # without Pillow, open captcha.jpg in the source directory and type it in by hand
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        print(u'Please open captcha.jpg under %s and enter it manually' % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha\n>")
    return captcha
def isLogin():
    # check login state by requesting the personal settings page without following redirects
    url = "https://www.zhihu.com/settings/profile"
    login_code = session.get(url, headers=header, allow_redirects=False).status_code
    return login_code == 200
def login(secret, account):
    # an 11-digit number starting with 1 is treated as a phone number
    if re.match(r"^1\d{10}$", account):
        print("Logging in with a phone number\n")
        post_url = 'https://www.zhihu.com/login/phone_num'
        postdata = {
            '_xsrf': get_xsrf(),
            'password': secret,
            'remember_me': 'true',
            'phone_num': account,
        }
    elif "@" in account:
        print("Logging in with an email address\n")
        post_url = 'https://www.zhihu.com/login/email'
        postdata = {
            '_xsrf': get_xsrf(),
            'password': secret,
            'remember_me': 'true',
            'email': account,
        }
    else:
        print("There is a problem with the account you entered; please try again")
        return 0
    try:
        # no captcha needed: log in directly
        login_page = session.post(post_url, data=postdata, headers=header)
        login_code = login_page.text
        print(login_page.status_code)
        # print(login_code)
    except:
        # a captcha must be entered before login succeeds
        postdata["captcha"] = get_captcha()
        login_page = session.post(post_url, data=postdata, headers=header)
        login_code = eval(login_page.text)
    session.cookies.save(cookie_file)
# Python 2 compatibility shim (uncomment if running under Python 2):
# input = raw_input
# Print the question links on the main page to the shell
def getpage(url2):
    mainpage = session.get(url2, headers=header)
    soup = BeautifulSoup(mainpage.text, 'html.parser')
    tags = soup.find_all("a", class_="question_link")
    for tag in tags:
        print(tag.get_text())
def get_login_cookie(url):
    '''
    Load the saved cookies, logging in first if no cookie file exists,
    and return the ones whose domain matches the given url as a dict.
    '''
    if not os.path.exists(cookie_file):
        login(secret, account)
    try:
        cookie_jar = cookielib.LWPCookieJar(cookie_file)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        print('Load cookie succeeded')
    except cookielib.LoadError:
        return None
    else:
        cookie_d = {}
        for cookie in cookie_jar:
            domain = cookie.domain
            if url.find(domain) > 0:
                cookie_d[cookie.name] = cookie.value
        return cookie_d
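If you ever need the saved login state on a session other than the global one, the returned dict can be attached with requests' own helper (a minimal sketch; the URL is just an example):

cookies = get_login_cookie('https://www.zhihu.com')
if cookies:
    s = requests.Session()
    s.cookies = requests.utils.cookiejar_from_dict(cookies)  # reuse the saved login state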
if __name__ == '__main__':
    if isLogin():
        print('You are already logged in')
        url2 = 'https://www.zhihu.com'
        getpage(url2)
    else:
        login(secret, account)
3. Fetching the page data with window.scrollTo
url = 'https://www.zhihu.com/topic/19564862/questions'
driver.get(url)
time.sleep(3)
while True:
    res = driver.page_source
    html = BeautifulSoup(res, 'html.parser')
    items = html.find_all('div', class_='List-item TopicFeedItem')
    driver.execute_script('window.scrollTo(0,10000000)')
    time.sleep(1)  # give the lazy-loaded feed a moment to append new items
    print(len(items))
    # stop once enough items have loaded; set the threshold to however much data you need
    if len(items) >= 2000:
        break
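If you do not know the target count up front, a common alternative (a sketch, not from the original post) is to scroll until the page height stops growing:

last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # nothing new was appended
        break
    last_height = new_height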
4. Collecting the data with Python (I was too lazy to wrap the collection code into functions; since only a small amount of data was needed, I skipped the Scrapy framework and gevent/queue based multi-threaded or coroutine crawling. Wrap it up yourself if you need to.)
# final page, taken via the Selenium driver
res = driver.page_source
html = BeautifulSoup(res, 'html.parser')
items = html.find_all('div', class_='List-item TopicFeedItem')
for item in items:
    url1 = 'https://www.zhihu.com' + item.find('h2').find('a')['href']
    # e.g. https://www.zhihu.com/question/356682213/answer/908153106
    com_id = re.match(r".*question/(\d+)", url1)
    question_id = com_id.group(1)  # extract the question id with a regex
    driver.get(url1)
    res = driver.page_source
    html = BeautifulSoup(res, 'html.parser')
    # question detail page: questions without a description are skipped
    if html.find('div', class_='QuestionHeader-detail') is None:
        continue
    if html.find('button', class_='QuestionRichText-more') is not None:
        # a "read more" button exists: click it to expand the full description
        driver.find_element_by_class_name('QuestionRichText-more').click()
        time.sleep(1)
    title = driver.find_element_by_class_name('QuestionHeader').find_element_by_tag_name('h1').text
    des = driver.find_element_by_class_name('QuestionHeader-detail').text
    comment = html.find('button', class_='Button Button--plain Button--withIcon Button--withLabel').text.replace('\u200b', 'Comments: ')
    # the structured data sits in a <script> tag; parse it and pull out this question
    script = html.find('script', id='js-initialData').text
    data = eval(script)  # see the eval note in section 5 if this raises a NameError
    question = data['initialState']['entities']['questions'][str(question_id)]
    author_info = question['author']
    timeStamp = question['updatedTime']
    timeArray = time.localtime(timeStamp)
    otherTime = time.strftime("%Y--%m--%d %H:%M:%S", timeArray)  # convert the update timestamp to a readable format
    name = author_info['name']          # asker's nickname
    headline = author_info['headline']  # asker's one-line bio
    row = [otherTime, title, des, comment, name, headline]
    writer.writerow(row)
    print(title)
driver.close()
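As noted in the heading above, the loop body is a natural candidate for a function if you want to reuse it. One possible wrapping, as a sketch (parse_question is my name, and it uses json.loads instead of eval, which also sidesteps the note in section 5):

def parse_question(html, question_id):
    # pull the structured fields for one question out of its detail page
    script = html.find('script', id='js-initialData').text
    question = json.loads(script)['initialState']['entities']['questions'][str(question_id)]
    author_info = question['author']
    posted = time.strftime("%Y--%m--%d %H:%M:%S", time.localtime(question['updatedTime']))
    return posted, author_info['name'], author_info['headline']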
5. Saving the data to CSV
# open the output file and write the header row
film_file = open('zhihuzhuanti.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(film_file)
writer.writerow(['time', 'title', 'description', 'comments', 'author', 'bio'])
Important: this CSV setup must run before the collection code in section 4, which already uses writer; it is shown last here only for readability.
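Putting the pieces in actual running order, and remembering to close the file so buffered rows reach disk, the overall skeleton looks roughly like this (a sketch of the ordering only):

film_file = open('zhihuzhuanti.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(film_file)
writer.writerow(['time', 'title', 'description', 'comments', 'author', 'bio'])
# ... section 3: scroll the topic page until enough items have loaded ...
# ... section 4: visit each question and writer.writerow(row) ...
film_file.close()  # flush buffered rows to disk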
Notes:
If the eval call in section 4 raises a NameError, it is because the JSON literals true/false/null are not Python names; define them first:
global false, null, true
false = null = true = ""
(Alternatively, json.loads(script) parses the string without any workaround; json is already imported.)
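For reference, the two parsing options side by side on a tiny example (a minimal sketch):

s = '{"isLoading": false, "next": null}'
# option 1: json.loads understands JSON literals natively
data = json.loads(s)
# option 2: eval works only once the JSON keywords are predefined
false = null = true = ""
data = eval(s)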
Source: CSDN
Author: qq_40240315
Link: https://blog.csdn.net/qq_40240315/article/details/103243641