一. requests 模块的使用(requests 基于 urllib3 库封装)
1.requests.get() 的请求
import requests
# requests 模块基于 urllib3 封装
# urllib 是 python 内置的模块, 也是用来模拟发送 http 请求的库
# 模拟http请求, get post put delete
# 1 get 请求
# res = requests.get('https://www.baidu.com')
# # print(res) # <Response [200]>
#
# # 注意编码的问题
# res.encoding = 'utf-8'
# print(res.text) # 百度首页的内容 》》》 响应的内容
#
# with open('a.html', 'w')as f: # 百度首页登录的页面
#
# f.write(res.text)
2.requests.get() 的参数和编码的问题 报错‘gbk’ 》》》
1.'gbk' 编码格式的问题(响应内容用 res.encoding = 'utf-8' 解码; 在 Windows 上写文件时 open() 默认使用 gbk 编码, 需要传入 encoding='utf-8')
# 1 get 请求
# res = requests.get('https://www.baidu.com')
# # print(res) # <Response [200]>
#
# # 注意编码的问题
# res.encoding = 'utf-8'
2.必传参数
# 2. get 请求携带参数
# User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
# requests.get() 方法的参数
# def get(url, params=None, **kwargs):
# r"""Sends a GET request.
# https://www.baidu.com/s?wd=%E6%9C%80%E7%BE%8E%E9%A3%8E%E6%99%AF%E5%9B%BE
# res = requests.get('https://www.baidu.com/s',
# params={"wd": '最美风景图'},
# # 请求头的信息
# headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'})
#
# res.encoding = 'utf-8'
# # print(res.text) # 请求的内容
# # <div class="timeout-title">网络不给力,请稍后重试</div>
# with open('a.html', 'w')as f:
# f.write(res.text)
# <div class="timeout-title">网络不给力,请稍后重试</div> 无法访问的这样
3.模拟用户登录的实例 》》 华华手机
参数准备
(1)headers 中需要 User-Agent 和 Referer; cookie 通过 cookie = res.cookies.get_dict() 获取
格式案例:
# >>>> 模拟登录网站 User-Agent: Referer cookie
# Simulated login against www.aa7a.cn: POST the login form, then reuse the
# cookies from that response on a follow-up GET and look for the account
# name in the returned page.

# Headers sent with both requests (browser User-Agent plus a Referer).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Findex.php',
}

# Submit the login form.
# NOTE(review): the original username literal had a trailing space, which does
# not match the address checked below; it is removed here as a likely typo.
# NOTE(review): 'captcha' is a fixed value copied from one browser session --
# presumably the site issues a new captcha per session; confirm before reuse.
res = requests.post(
    'http://www.aa7a.cn/user.php',
    headers=headers,
    data={
        'username': '1024359512@qq.com',
        "password": 'mo1234',
        'captcha': 'mkab',
        'remember': 1,
        'ref': 'http://www.aa7a.cn/index.php',
        'act': 'act_login',
    },
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout=10,
)

# Cookies set by the login response, converted to a plain dict.
cookie = res.cookies.get_dict()

# Request the user page again, this time carrying the login cookies.
res = requests.get(
    'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
    headers=headers,
    cookies=cookie,
    timeout=10,
)

# The account name appearing in the page is used as the "logged in" signal.
if '1024359512@qq.com' in res.text:
    print('登录成功')
else:
    print('没有登录')
"""
""
username: koko
password: mo123
captcha: nkab
remember: 1
ref: http://www.aa7a.cn/
act: act_login
"""
4.requests.get() 爬取电影
# 爬取视频
# Request URL: https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=12&mrd=0.025906198752709164&filterIds=1625830,1625746,1625846,1626267,1626185,1625876,1626253,1626235,1626236,1626232,1626243,1626215,1626218,1626241,1625836
# Request URL: https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0 #从第零条开始
import re
# Download the videos listed on the first page of a pearvideo.com category:
# fetch the listing, extract each detail-page link, pull the mp4 URL out of
# the detail page, and save the file under its original name.

# Listing page; start=0 requests items from the first entry.
res = requests.get(
    'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0',
    timeout=10,
)

# Relative links to the individual video pages.
reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'
obj = re.findall(reg_text, res.text)
print(obj)

for url in obj:
    url = 'https://www.pearvideo.com/' + url
    res1 = requests.get(url, timeout=10)

    # The detail page embeds the real video address as srcUrl="...".
    obj1 = re.findall('srcUrl="(.*?)"', res1.text)
    if not obj1:
        # No video address on this page; skip it instead of raising
        # IndexError on obj1[0].
        continue
    # e.g. https://video.pearvideo.com/mp4/adshort/20191125/cont-1626267-14630490_adpkg-ad_hd.mp4
    print(obj1[0], 111)

    # Split once from the right on '/' and keep the last piece as the file
    # name, e.g. cont-1626267-14630490_adpkg-ad_hd.mp4
    name = obj1[0].rsplit('/', 1)[-1]
    print(name)

    # stream=True avoids loading the whole video into memory; an explicit
    # chunk size avoids iter_content()'s default of one byte per iteration.
    with requests.get(obj1[0], stream=True, timeout=10) as res2:
        with open(name, 'wb') as f:
            for line in res2.iter_content(chunk_size=8192):
                f.write(line)
·