基础练习与理解
# Fetch the CSDN editor page and pretty-print its parsed HTML tree.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

response = requests.get("https://editor.csdn.net/md")
page_text = response.text
parsed = BeautifulSoup(page_text, "html.parser")  # use the built-in HTML parser
print(parsed.prettify())
#BeautifulSoup库是解析、遍历、维护标签树的功能库
#<p class="title">...</p> :p是名字,前后都有,表示范围, class:表示ta的属性
#文档与标签树(BeautifulSoup类)一一对应
#Tag:标签
#Name:<tag>.name
#Attribution:name的属性
# Print the <title> tag's name and the name of its parent tag.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

page = requests.get("https://editor.csdn.net/md").text
doc = BeautifulSoup(page, "html.parser")
print(doc.title.name)
print(doc.title.parent.name)
# Downward traversal (.contents) and upward traversal (.parent) examples.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

markup = requests.get("https://editor.csdn.net/md").text
tree = BeautifulSoup(markup, "html.parser")
print(tree.head.contents)
print(len(tree.body.contents))  # downward traversal: direct children of <body>
print(tree.body.parent)
print(len(tree.body.parent))
# Walk every ancestor of <body>; the document root's parent is None.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

content = requests.get("https://editor.csdn.net/md").text
soup_doc = BeautifulSoup(content, "html.parser")
for ancestor in soup_doc.body.parents:
    # A None ancestor has no .name, so print it directly.
    print(ancestor if ancestor is None else ancestor.name)
# Sibling traversal only moves between nodes that share the same parent.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

body_html = requests.get("https://editor.csdn.net/md").text
dom = BeautifulSoup(body_html, "html.parser")
print(dom.body.next_sibling)
# Going one sibling back and one forward lands on the same node again.
import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class

raw = requests.get("https://editor.csdn.net/md").text
document = BeautifulSoup(raw, "html.parser")
print(document.body.previous_sibling.next_sibling)
# find_all() experiments against the python123 demo page.
import requests
import re
from bs4 import BeautifulSoup  # BeautifulSoup is a class

resp = requests.get("https://python123.io/ws/demo.html")
html_doc = resp.text
tree = BeautifulSoup(html_doc, "html.parser")
# Information-marking formats worth knowing: XML, JSON, YAML.
# find_all(True) yields every tag; a list such as ["a", "b"] matches several names.
'''for tag in soup.find_all(re.compile('b')): #打印由b开头的标签
print(tag.name)
print(soup.find_all('p','course'))
print(soup.find_all(id='link1'))
print(soup.find_all(re.compile(id='link')))'''
print(tree.find_all('a'))
print(tree.find_all('a', recursive=False))  # recursive=False searches direct children only
print(tree.find_all(string='Basic Python'))
实例0:唯一一个成功跑通的练习。
import requests
import re
from bs4 import BeautifulSoup #BeautifulSoup是一个类
import bs4
def getHTMLText(url):
    """Download *url* and return its decoded body, or "" on any failure."""
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()  # raise on 4xx/5xx responses
        resp.encoding = resp.apparent_encoding  # guess charset from the content
        return resp.text
    except:
        # Best-effort download: any failure yields an empty string.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in *html*, appending [rank, name, score] rows to *ulist*."""
    table = BeautifulSoup(html, "html.parser")
    for row in table.find('tbody').children:
        # .children also yields NavigableString whitespace nodes; keep real tags only.
        if not isinstance(row, bs4.element.Tag):
            continue
        cells = row('td')  # calling a tag is shorthand for find_all('td')
        ulist.append([cells[0].string, cells[1].string, cells[3].string])
def printUnivList(ulist, num):
    """Print the first *num* entries of *ulist* as an aligned three-column table."""
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # chr(12288) is the full-width (CJK) space, which keeps Chinese columns aligned.
    print("{0:^10}\t{1:{3}^6}\t{2:{3}^10}".format("排名","学校名称","总分",chr(12288)))
    for idx in range(num):
        entry = ulist[idx]
        print(tplt.format(entry[0], entry[1], entry[2], chr(12288)))
    print('Suc' + str(num))
def main():
    """Fetch the 2016 ranking page and print the top 20 universities."""
    records = []
    source = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    page = getHTMLText(source)
    fillUnivList(records, page)
    printUnivList(records, 20)
main()
# Regular expressions: a compact, generic notation for string patterns.
import requests
import re
from bs4 import BeautifulSoup  # BeautifulSoup is a class
import bs4

zip_hit = re.search(r'[1-9]\d{5}', 'BIT 100081')
first_hit = re.search(r'[1-9]\d{5}', 'BIT100081 TSU100084')
if zip_hit:
    print(zip_hit.group(0))
print(first_hit.re)        # the compiled pattern object
print(first_hit.pos)       # offset where the search started
print(first_hit.endpos)    # offset where the search stopped
print(first_hit.group(0))  # the matched text
print(first_hit.start())
print(first_hit.end())
print(first_hit.span())
# Same regex warm-up as above; the match objects are created but not printed here.
import requests
import re
from bs4 import BeautifulSoup  # BeautifulSoup is a class
import bs4

postal = re.search(r'[1-9]\d{5}', 'BIT 100081')
both = re.search(r'[1-9]\d{5}', 'BIT100081 TSU100084')
实例1:淘宝网
这个没有弄出来,可能需要 JS 解密——毕竟是 2017 年的视频了。
import requests
import re
from bs4 import BeautifulSoup #BeautifulSoup是一个类
import bs4
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
def getHTMLText(url):
    """Return the body text of *url* (30 s timeout), or "" if anything goes wrong."""
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        page.encoding = page.apparent_encoding
        return page.text
    except:
        return ""
def parsePage(ilt, html):
    """Extract ["price", "title"] pairs from Taobao result JSON embedded in *html*."""
    try:
        # The page embeds JSON such as "view_price":"189.00".
        prices = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        titles = re.findall(r' \"raw_title\"\:\".*?\"', html)
        for i in range(len(prices)):
            # NOTE(review): eval() on page content is unsafe in general; here it
            # only strips the surrounding quotes from the matched value.
            price = eval(prices[i].split(':')[1])
            title = eval(titles[i].split(':')[1])
            ilt.append([price, title])
    except:
        # Swallow parse errors so one bad page never aborts the crawl.
        print("")
def printGoodsList(ilt):
    """Print *ilt* as a numbered price/title table."""
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号","价格","商品名称"))
    for count, goods in enumerate(ilt, start=1):
        print(tplt.format(count, goods[0], goods[1]))
def main():
    """Crawl *depth* Taobao result pages for the query and print everything found."""
    goods = "书包"
    depth = 2
    start_url = 'https://s.taobao.com/search?q='+goods
    infoList = []
    for page_idx in range(depth):
        try:
            # Taobao paginates with an item offset of 44 per page.
            url = start_url + '&s='+str(44*page_idx)
            parsePage(infoList, getHTMLText(url))
        except:
            continue  # skip pages that fail; keep the rest of the crawl alive
    printGoodsList(infoList)
main()
淘宝网实现代码(感谢凯子哥)
import requests
import re
def getHTMLText(url):
    """Download *url* with full browser-like headers and return its body text.

    Taobao rejects bare requests, so the headers below mimic a Chrome session,
    including a captured login cookie. Returns "1" on any failure.
    NOTE(review): the cookie is session-bound and will expire; the "1" failure
    sentinel differs from the "" used by the other getHTMLText variants in
    this file — confirm callers treat it as "no data".
    """
    try:
        # Headers copied from a real logged-in browser session (anti-bot bypass).
        header = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
            'sec-fetch-dest': 'document',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'referer': 'https://www.taobao.com/',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': 'thw=cn; cna=VSbhFonmvwkCAXrzTK7a5A2T; _m_h5_tk=6b85056df16f6cbfae506e6994464aae_1582999057839; _m_h5_tk_enc=178918b33bbb9ac374d7cb14c6b2740b; t=5fe3ba1edf359b4aec3b03cce35e77f4; hng=CN%7Czh-CN%7CCNY%7C156; lgc=tmzclkhh; tracknick=tmzclkhh; tg=0; mt=ci=109_1; enc=NfVq7knSvFEMbo9ZI9ZLfVBCgKXkOenARaXDARMBSWNGmJ2CffGmwFWCZExWW4NGj%2FFqboNImFj26kUXsedegQ%3D%3D; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&nk2=F5tY2MUM5J8%3D&vt3=F8dBxd30vK9kZI043Vc%3D&id2=UU20tZkILrdQmg%3D%3D; uc4=nk4=0%40FYFG0uAb839Bwt8A8i9rPhKngg%3D%3D&id4=0%40U2%2Fz8ObhGZmIMvV1%2FMHRLhumAUtJ; _cc_=Vq8l%2BKCLiw%3D%3D; tfstk=cLHFBdZ78ppehKOZi-wzPlP4wrmCZp4ulOr4tjbeXQay8fVhi878jh_8buzvIWf..; v=0; cookie2=1fa71391ef4b6fe1b965bfc583362414; _tb_token_=f07e793bef7bb; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; uc1=cookie14=UoTUOa5q%2FxzG6Q%3D%3D; JSESSIONID=CA20EC2F429BBCF8261293C1ABE16081; l=dBQqvyeRQulc6ZkEBOCwourza77OSIRAnuPzaNbMi_5Kc6L_kpQOoRV_QFp62jWftx8B4o0COBp9-etkiKy06Pt-g3fPaxDc.; isg=BKGhnWUPWyu9pffeDmGL7O4-sG27ThVAIZzT_wN2nagHasE8S54lEM-oyZ5sgK14',
        }
        #kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=header)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return "1"
def parsePage(ilt, html):
    """Pull every "view_price"/"raw_title" pair out of *html* into *ilt*."""
    try:
        # Prices may contain a decimal point, hence [\d\.]*.
        price_tokens = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        # *? is a non-greedy match: the title stops at the first closing quote.
        title_tokens = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for idx in range(len(price_tokens)):
            # eval() strips the surrounding quotes from the matched value.
            ilt.append([eval(price_tokens[idx].split(':')[1]),
                        eval(title_tokens[idx].split(':')[1])])
    except:
        # Never abort the whole crawl because of one malformed page.
        print("2")
def printGoodsList(ilt):
    """Render *ilt* as a numbered table of prices and product names."""
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号","价格","商品名称"))
    for row_no, item in enumerate(ilt, start=1):
        print(tplt.format(row_no, item[0], item[1]))
def main():
    """Search Taobao for the query across two result pages and print the goods."""
    goods = '施华洛世奇'
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for page_no in range(depth):
        try:
            # Each results page is offset by 44 items.
            target = start_url + '&s=' + str(44 * page_no)
            parsePage(infoList, getHTMLText(target))
        except:
            continue  # a failed page is skipped, not fatal
    printGoodsList(infoList)
main()
实例2:财富网+百度股票通
这个我依然没有实现,根本搜不到百度股票通的网页版,但我还是把代码敲下来了。
import requests
import re
from bs4 import BeautifulSoup #BeautifulSoup是一个类
import bs4
import traceback
# 选取原则:保存在html中,非js代码生成
def getHTMLText(url):
    """GET *url* and return its body text, or "" on failure."""
    try:
        reply = requests.get(url)
        reply.raise_for_status()
        reply.encoding = reply.apparent_encoding
        return reply.text
    except:
        return ""
def getStockList(lst, stockURL):
    """Collect sh/sz stock codes from the listing page's anchors into *lst*.

    Bug fix: the attribute key was misspelled 'herf', so every lookup raised
    KeyError (silently caught below) and the list always came back empty.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']  # was 'herf' — the typo made this function a no-op
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            # Anchors without an href or a matching code are simply skipped.
            continue
def getStockInfo(lst, stockURL, fpath):
    """For each code in *lst*, scrape its detail page and append a dict line to *fpath*.

    Bug fix: the URL was built from the undefined name `tock`, which raised
    NameError on the first iteration (outside the try, so it was uncaught and
    crashed the program); it must be the loop variable `stock`.
    """
    for stock in lst:
        url = stockURL + stock + ".html"   # was `tock` — NameError
        html = getHTMLText(url)
        try:
            if html == "":
                continue  # download failed; move on to the next code
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            # <dt>/<dd> pairs hold field names and values in document order.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                infoDict[keyList[i].text] = valueList[i].text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            # Log the traceback but keep crawling the remaining codes.
            traceback.print_exc()
            continue
    return ""
def main():
    """Build the stock-code list, then scrape each code's info page to disk.

    Bug fix: the first call mistakenly invoked getStockInfo with the listing
    URL; it must be getStockList, otherwise `slist` stays empty and nothing
    is ever scraped.
    """
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)          # was getStockInfo(...)
    getStockInfo(slist, stock_info_url, output_file)
main()
实例2 小优化
提高用户体验,增加了一个动态进度条
import requests
import re
from bs4 import BeautifulSoup #BeautifulSoup是一个类
import bs4
import traceback
# 选取原则:保存在html中,非js代码生成
def getHTMLText(url, code='utf-8'):
    """GET *url* and decode it with charset *code*; return "" on failure.

    Letting the caller supply the charset skips the costly apparent_encoding
    scan performed by the earlier variant.
    """
    try:
        resp = requests.get(url)
        resp.raise_for_status()
        resp.encoding = code
        return resp.text
    except:
        return ""
def getStockList(lst, stockURL):
    """Fill *lst* with sh/sz stock codes scraped from the listing page's links.

    Bug fix: the attribute key was misspelled 'herf'; every lookup raised
    KeyError (swallowed below), leaving the list permanently empty.
    """
    html = getHTMLText(stockURL, 'GB2312')  # the eastmoney listing is GB2312-encoded
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']  # was 'herf'
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
def getStockInfo(lst, stockURL, fpath):
    """Scrape each stock's detail page into *fpath*, printing a live progress line.

    Bug fixes:
    - The URL used the undefined name `tock` (NameError on the first iteration,
      raised outside the try so it was uncaught); it must be `stock`.
    - '{:.2}' is the general format with 2 *significant digits*, so 100.0 would
      print as '1e+02'; '{:.2f}' gives a proper fixed-point percentage.
    - The message said 速度 (speed) although it reports progress (进度).
    """
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"   # was `tock` — NameError
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            # Field names live in <dt> tags, their values in parallel <dd> tags.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                infoDict[keyList[i].text] = valueList[i].text
            with open(fpath, 'a', encoding='utf-8') as f:
                count = count + 1
                f.write(str(infoDict) + '\n')
            # '\r' rewrites the same console line, producing a live progress bar.
            print('\r当前进度为:{:.2f}%'.format(count * 100 / len(lst)), end='')
        except:
            traceback.print_exc()
            count = count + 1
            print('\r当前进度为:{:.2f}%'.format(count * 100 / len(lst)), end='')
            continue
    return ""
def main():
    """Gather the stock-code list, then scrape every code's info page to disk.

    Bug fix: the first call must be getStockList (to populate `slist`), not a
    second getStockInfo — otherwise the list stays empty and no data is written.
    """
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)          # was getStockInfo(...)
    getStockInfo(slist, stock_info_url, output_file)
main()
来源:CSDN
作者:QXK_Jack
链接:https://blog.csdn.net/QXK_Jack/article/details/104560375