在上次的爬虫中,我们只是爬取了历史文章中加载更多的数据(https://www.cnblogs.com/jueshilaozhongyi/p/11656435.html),这次是历史文章中首页的数据
历史文章首页的数据是返回在html中的,再具体点在JavaScript中
本次代码的缺点:1.还是不能很智能,需要通过抓包工具获取首页的链接
2.有些公众号没有历史文章,这种公众号不能使用
3.有些公众号历史文章使用的是分类,这种也不能使用(下次分享这种的怎么处理)
好了,我们先来看看首页的链接吧:
https://mp.weixin.qq.com/mp/profile_ext?action=home&...（首页）
https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&...（加载更多）
对比地址,我们可以看到也就是访问的路径都一样,只是action的参数不一样,这次的action值是home,后面的参数都一样
下面开始放代码吧:
# 在之前我们的公众号名字是通过我们手动输入的,这次因为是在首页,可以通过正则表达式直接获取,新增加了获取公众号名的步骤
import requestsimport re, osimport time# 在之前的链接里我们封装的数据库操作,可以直接拿来用from conn.connect_mysql import insert_wechat_content,select_wechat_contentpath = os.getcwd()print(path)file_path = path + '//content_file'def get_content_text(url): """ 请求接口数据 :return: """ wechat_home_url = url headers = { "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) MicroMessenger/2.3.27(0x12031b13) MacWechat Chrome/39.0.2171.95 Safari/537.36 NetType/WIFI WindowsWechat", "cookie": "devicetype=android-28; lang=zh_CN; pass_ticket=sZNf5AG/C0AvageD87nRhK3W3AuVgYP3dYTvz3i57WFq718hIiDmMmA/ICUWA3W; version=2700073a; wap_sid2=CILAnPMFElxnMzRMNjdKbGpLdXYxZ0xzN2JfeldZX25JaGQ1a0EyLTNGUmE5SHZxNGRqTERPX1kybnd6a0Nwd2pONkJiLUxRbW84OU9kdkxjcHJjMHVZRXRxQUVDd2dFQUFBfjCMwsvtBTgNQJVO; wxuin=1583816706" } result = requests.get(url=wechat_home_url, headers=headers, verify=False) r = result.text return rdef write_content_file(url): """ 写入接口请求的数据 :param data: :return: """ data = get_content_text(url) f = open(file_path, 'w+', encoding='utf-8') f.write(data) f.close()def read_content_file(): """ 读取file数据 :return: text """ f = open(file_path, 'r', encoding='utf-8') text = f.read() f.close() return textdef find_msg(): """ 正则表达式获取msgList中的数据 :return: str """ r = read_content_file() msgList = re.findall(r"msgList = \'(.*)\'", str(r)) return str(msgList[0])def msg_replace(): """ 替换引号为单引号 :return: str """ msg = find_msg() msg_replace = msg.replace(""", "'") return msg_replacedef msg_json(): """ 将数据处理成json格式 :return: json """ import demjson msg = msg_replace() msg_json = demjson.decode(msg) return msg_json['list']def get_wechat_name(): """ 获取公众号名 :return: """ r = read_content_file() wechat_name = re.findall(r"nickname = \"(.*)\" |\|"";", str(r)) # wechat_name = "'{}'".format(wechat_name) # print(wechat_name) return wechat_name[0]def format_data(): """ 保存获取到的数据 :return: """ msg = msg_json() wechat_name = get_wechat_name() wechat_name = "'{}'".format(wechat_name) for i in 
msg: # 标题 title = i['app_msg_ext_info']['title'] title = "'{}'".format(title) # 文章地址 content_url = i['app_msg_ext_info']['content_url'] content_url = "'{}'".format(content_url) # 封面图 cover = i['app_msg_ext_info']['cover'] cover = "'{}'".format(cover) # 转载路径 source_url = i['app_msg_ext_info']['source_url'] source_url = "'{}'".format(source_url) # 转载公众号 source_name = i['app_msg_ext_info']['author'] source_name = "'{}'".format(source_name) # 发布时间 datetime = i['comm_msg_info']['datetime'] datetime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(datetime)) datetime = "'{}'".format(datetime) print(title, content_url, cover, source_url, source_name, datetime) if select_wechat_content(title) == 1: print("数据已经存在") else: insert_wechat_content(wechat_name, title, content_url, cover, source_url, source_name, datetime)def run(url): write_content_file(url) format_data()if __name__ == "__main__": url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI5NDY1MjQzNA==&uin=MTU4MzgxNjcwNg%3D%3D&key=b1719993cc296ec41a4aad024aa262db236a1b7242d12dd98e5d02bf751cb5e705f8ef8ef6cda9e235519a360bab4c42b4ab301a460e39a67ca76f0945e49ddf2cbaaf03553a73e079426924bbbe17ce&devicetype=iMac+MacBookPro15%2C1+OSX+OSX+10.14.5+build(18F203)&version=12031b13&lang=zh_CN&nettype=WIFI&a8scene=0&fontScale=100&pass_ticket=CSP6SWxOUwP4xAvrB01DuLNCJIO%2FR65vUpx4MFOWrJCce3JldcoyR1VZK4%2BQfXzn" run(url)大功告成,Over!