import os
from urllib.parse import urljoin

from requests_html import HTMLSession
# Shared HTTP session; every request in this script is issued through it.
session = HTMLSession()
# http://www.xiaohuar.com/list-3-0.html
# Generate the index-page URLs
def get_index_page(pages=6):
    """Yield the URL of each video-list index page.

    Args:
        pages: Number of index pages to generate, numbered from 0.
            Defaults to 6, the count that used to be hard-coded.

    Yields:
        str: ``http://www.xiaohuar.com/list-3-<n>.html`` for every ``n`` in
        ``range(pages)``.
    """
    for page_no in range(pages):
        yield 'http://www.xiaohuar.com/list-3-%s.html' % page_no
# Scratch test: fetch one index page and print the detail-page links
# url= "http://www.xiaohuar.com/list-3-0.html"
# r = session.get(url=url)
# for element in r.html.find('#images a[class="imglink"]'):
# print(element.attrs.get('href'))
# Parse an index page to get the detail-page URLs
def get_detail_page(url):
    """Yield the detail-page URL of every video linked from an index page.

    Args:
        url: URL of the index page to fetch.

    Yields:
        str: Absolute URL taken from the ``href`` of each
        ``#images a[class="imglink"]`` anchor. Anchors without an ``href``
        are skipped.
    """
    r = session.get(url=url)
    for anchor in r.html.find('#images a[class="imglink"]'):
        href = anchor.attrs.get('href')
        if not href:
            # An anchor without a target would otherwise be yielded as None
            # and crash the next request.
            continue
        # Absolute hrefs pass through unchanged; relative ones are resolved
        # against the index page they were found on.
        yield urljoin(url, href)
# Scratch test: parse a detail page to get the video URL and name
# url = 'http://www.xiaohuar.com/p-3-136.html'
# r = session.get(url=url)
# r.html.encoding = "gbk"
# file_name = r.html.find('title',first=True).text.replace('\\','')
# print(file_name)
#
# element = r.html.find('#media source',first=True)
# if element:
# mp4_url = element.attrs.get('src')
# else:
# m3u8_url = r.html.search('var vHLSurl = "{}";')[0]
# print(m3u8_url)
# Parse a detail page to get the video URL and name
def get_url_name(url):
    """Fetch a detail page and extract the video's name, URL and type.

    The page is decoded as GBK. The name is the ``<title>`` text with the
    characters that are unsafe in file names removed; it is printed before
    returning. The video URL is the ``src`` of ``#media source`` when one is
    present (type ``'mp4'``), otherwise the value assigned to ``vHLSurl`` in
    the page source (type ``'m3u8'``).

    Args:
        url: URL of the detail page.

    Returns:
        tuple: ``(file_name, vurl, vtype)`` where ``vtype`` is ``'mp4'`` or
        ``'m3u8'``.

    Raises:
        ValueError: If the page contains neither a ``<source>`` with a
            ``src`` nor a ``vHLSurl`` assignment.
    """
    r = session.get(url=url)
    r.html.encoding = "gbk"

    # Path separators and other reserved characters in a title would make
    # open()/mkdir() fail or write outside the working directory.
    unsafe = str.maketrans('', '', '\\/:*?"<>|')
    title = r.html.find('title', first=True)
    file_name = title.text.translate(unsafe).strip() if title is not None else ''
    if not file_name:
        # No usable title: fall back to the last path segment of the URL.
        file_name = url.rstrip('/').rsplit('/', 1)[-1].translate(unsafe)
    print(file_name)

    element = r.html.find('#media source', first=True)
    src = element.attrs.get('src') if element is not None else None
    if src:
        return file_name, src, 'mp4'

    found = r.html.search('var vHLSurl = "{}";')
    if found is None:
        raise ValueError('no video source found on %s' % url)
    return file_name, found[0], 'm3u8'
# Save the video file
def save(file_name, vurl, vtype):
    """Download a video to disk according to its type.

    Args:
        file_name: Base name for the output. For ``'mp4'`` the file
            ``<file_name>.mp4`` is written; for ``'m3u8'`` it is passed to
            ``save_m3u8`` as the output directory.
        vurl: URL of the mp4 file or of the m3u8 playlist.
        vtype: ``'mp4'`` or ``'m3u8'``. Any other value is ignored.

    Raises:
        requests.HTTPError: If the mp4 request returns an error status.
    """
    if vtype == "mp4":
        file_name += ".mp4"
        # Stream the body so a large video is never held in memory at once.
        with session.get(url=vurl, stream=True) as r:
            # Fail loudly instead of saving an HTML error page as a video.
            r.raise_for_status()
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    elif vtype == "m3u8":
        save_m3u8(file_name, vurl)
# Handle m3u8 playlists
def save_m3u8(file_name, vurl):
    """Download an m3u8 playlist and every ``.ts`` segment it lists.

    A directory named ``file_name`` is created if needed. The playlist is
    stored there as ``playlist.m3u8`` and each segment is stored next to it
    under the segment's own base name.

    Args:
        file_name: Directory to write the playlist and segments into.
        vurl: URL of the m3u8 playlist.

    Raises:
        requests.HTTPError: If the playlist or a segment request returns an
            error status.
    """
    os.makedirs(file_name, exist_ok=True)

    r = session.get(url=vurl)
    r.raise_for_status()
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)

    # Iterate over lines: looping over the string itself yields single
    # characters, so no entry would ever end with 'ts'.
    for raw_line in r.text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#'):
            # Blank lines and playlist tags/comments are not segments.
            continue
        if not line.endswith('ts'):
            continue
        # Segment URIs are relative to the playlist URL; urljoin also copes
        # with absolute URIs and playlists not named 'playlist.m3u8'.
        ts_url = urljoin(vurl, line)
        ts_path = os.path.join(file_name, os.path.basename(line))
        r0 = session.get(url=ts_url)
        r0.raise_for_status()
        with open(ts_path, 'wb') as f:
            f.write(r0.content)
if __name__ == '__main__':
    # Flatten index pages -> detail links into one lazy stream, so pages are
    # still fetched one at a time, in order, as the loop advances.
    detail_links = (
        link
        for page_url in get_index_page()
        for link in get_detail_page(page_url)
    )
    for link in detail_links:
        # get_url_name returns (file_name, vurl, vtype), matching save()'s
        # positional parameters.
        save(*get_url_name(link))
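# The nested for loops above work because the functions are generators (yield); this style is recommended because it reads cleanly.
# Additional notes:
# print(str('电影'.encode('utf-8')).strip("b'").upper().replace('\X','%'))
# The line above illustrates how a front-end page percent-encodes Chinese query parameters.
# Videos whose URL ends in m3u8 need further processing: we must fetch the .ts segment files listed inside the playlist.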