Python: Requests, BeautifulSoup, and re


Basic practice and understanding

Notes from the video course

import requests
from bs4 import BeautifulSoup  # BeautifulSoup is a class
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")  # use the built-in HTML parser
print(soup.prettify())
# The BeautifulSoup library parses, traverses, and maintains the tag tree.
# <p class="title">...</p> : p is the tag name; it appears in both the opening
# and closing tags and delimits the tag's scope. class is one of its attributes.
# A document corresponds one-to-one to a tag tree (a BeautifulSoup object).
# Tag: a tag in the tree
# Name: <tag>.name, the tag's name as a string
# Attrs: <tag>.attrs, a dict of the tag's attributes
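
A quick sketch of Tag, Name, and Attrs in practice, using the course's demo page (the CSDN editor URL above needs a logged-in session, so its HTML may just be a login shell):

import requests
from bs4 import BeautifulSoup

r = requests.get("https://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
tag = soup.a                   # the first <a> tag in the document
print(tag.name)                # the tag's name: 'a'
print(tag.attrs)               # dict of attributes, e.g. {'href': ..., 'class': [...], 'id': ...}
print(tag.attrs.get("class"))  # dict-style access; .get returns None for missing keys
print(tag.string)              # the NavigableString inside the tag
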
import requests
from bs4 import BeautifulSoup
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.title.name)         # name of the <title> tag: 'title'
print(soup.title.parent.name)  # name of its parent tag: 'head'
import requests
from bs4 import BeautifulSoup
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.head.contents)       # .contents: list of a tag's direct children
print(len(soup.body.contents))  # downward traversal: number of children of <body>
print(soup.body.parent)         # upward traversal: the parent tag (<html>)
print(len(soup.body.parent))    # len() of a tag is the length of its .contents
import requests
from bs4 import BeautifulSoup
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
for parent in soup.body.parents:  # upward traversal over all ancestors
    if parent is None:
        print(parent)
    else:
        print(parent.name)        # the soup object itself prints as '[document]'
import requests
from bs4 import BeautifulSoup
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.body.next_sibling)  # sibling traversal only works among nodes under the same parent
import requests
from bs4 import BeautifulSoup
r = requests.get("https://editor.csdn.net/md")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.body.previous_sibling.next_sibling)  # one sibling back, then forward: <body> itself again
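
bs4 also provides generator versions, .next_siblings and .previous_siblings, for iterating over all siblings at once; a minimal sketch against the demo page:

import requests
from bs4 import BeautifulSoup

r = requests.get("https://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
for sibling in soup.a.next_siblings:      # all later siblings of the first <a>
    print(repr(sibling))                  # repr makes whitespace text nodes visible
for sibling in soup.a.previous_siblings:  # all earlier siblings
    print(repr(sibling))
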
import requests
import re
from bs4 import BeautifulSoup
r = requests.get("https://python123.io/ws/demo.html")
# print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# print(soup.prettify())  # formatted output
# Information-marking formats: XML, JSON, YAML
# print(soup.find_all("a"))
# print(soup.find_all(["a", "b"]))  # a list argument matches any of the names
# for tag in soup.find_all(True):   # True matches every tag in the document
#     print(tag)
'''for tag in soup.find_all(re.compile('b')):  # tag names containing 'b' (here: b, body)
    print(tag.name)
print(soup.find_all('p', 'course'))           # <p> tags with class "course"
print(soup.find_all(id='link1'))              # tags whose id attribute is exactly 'link1'
print(soup.find_all(id=re.compile('link')))   # id attributes matching a regex'''
print(soup.find_all('a'))
print(soup.find_all('a', recursive=False))    # recursive=False searches direct children only
print(soup.find_all(string='Basic Python'))   # exact match against strings in the document
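
find_all also has a family of companion methods scoped to different parts of the tree; a minimal sketch of the common ones, all standard bs4 API:

import requests
import re
from bs4 import BeautifulSoup

r = requests.get("https://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
first_a = soup.find('a')                         # like find_all, but returns only the first match
print(first_a.find_parent().name)                # nearest ancestor: 'p'
print(first_a.find_next_sibling('a'))            # next <a> among its siblings
print(soup.find_all(href=re.compile('course')))  # filter any attribute with a regex
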

Example 0: the only exercise that actually worked...

import requests
import re
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()              # raise for non-200 status codes
        r.encoding = r.apparent_encoding  # guess the real encoding from the content
        return r.text
    except:
        return ""

def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString children (whitespace)
            tds = tr('td')                   # tr('td') is shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

def printUnivList(ulist, num):
    # {3} selects chr(12288), the full-width CJK space, as the fill character,
    # so Chinese school names line up in fixed-width columns
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))
    print('Suc' + str(num))  # number of records printed

def main():
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)

main()
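
The fill-character trick is easy to isolate: a format spec accepts an arbitrary fill character before the alignment flag, and chr(12288) is the full-width space. A minimal sketch with hypothetical names:

# hypothetical school names, just to show the fill behaviour
for name in ["清华大学", "哈尔滨工业大学"]:
    print("{0:{1}^10}".format(name, chr(12288)))  # centre within 10 full-width spaces
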

 

import re
# Regular expressions: a compact, general notation for string patterns
match = re.search(r'[1-9]\d{5}', 'BIT 100081')  # match a 6-digit postal code
m = re.search(r'[1-9]\d{5}', 'BIT100081 TSU100084')
if match:
    print(match.group(0))  # the matched substring: '100081'
print(m.re)        # the compiled pattern object used for the match
print(m.pos)       # start position of the search within the string
print(m.endpos)    # end position of the search within the string
print(m.group(0))  # the matched substring (search returns only the first match)
print(m.start())   # index where the match begins
print(m.end())     # index just past the match
print(m.span())    # (start, end) as a tuple
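
re.search is only one of the library's main functions; a minimal sketch of the others, reusing the postal-code pattern:

import re

pat = re.compile(r'[1-9]\d{5}')  # precompile when a pattern is reused
print(pat.match('100081 BIT'))   # match anchors at the start of the string
print(re.findall(r'[1-9]\d{5}', 'BIT100081 TSU100084'))  # all matches as a list
print(re.split(r'[1-9]\d{5}', 'BIT100081 TSU100084'))    # split around each match
for m in re.finditer(r'[1-9]\d{5}', 'BIT100081 TSU100084'):
    print(m.group(0))            # iterate over match objects one by one
print(re.sub(r'[1-9]\d{5}', ':zipcode', 'BIT100081 TSU100084'))  # replace every match
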



Example 1: Taobao

I never got this one working as-is; it probably needs JS decryption or login cookies now. The video is from 2017, after all...

import requests
import re
# Search-result URLs for 书包 (backpack); note the s= parameter paging in steps of 44:
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200229&ie=utf8&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def parsePage(ilt, html):
    try:  # the page embeds items as JSON, e.g. "view_price":"189.00"
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("")

def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))

def main():
    goods = "书包"
    depth = 2  # how many result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # 44 items per result page
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)

main()

Taobao code that works (thanks to 凯子哥)

import requests
import re
def getHTMLText(url):
    try:
        header = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
            'sec-fetch-dest': 'document',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'referer': 'https://www.taobao.com/',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': 'thw=cn; cna=VSbhFonmvwkCAXrzTK7a5A2T; _m_h5_tk=6b85056df16f6cbfae506e6994464aae_1582999057839; _m_h5_tk_enc=178918b33bbb9ac374d7cb14c6b2740b; t=5fe3ba1edf359b4aec3b03cce35e77f4; hng=CN%7Czh-CN%7CCNY%7C156; lgc=tmzclkhh; tracknick=tmzclkhh; tg=0; mt=ci=109_1; enc=NfVq7knSvFEMbo9ZI9ZLfVBCgKXkOenARaXDARMBSWNGmJ2CffGmwFWCZExWW4NGj%2FFqboNImFj26kUXsedegQ%3D%3D; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&nk2=F5tY2MUM5J8%3D&vt3=F8dBxd30vK9kZI043Vc%3D&id2=UU20tZkILrdQmg%3D%3D; uc4=nk4=0%40FYFG0uAb839Bwt8A8i9rPhKngg%3D%3D&id4=0%40U2%2Fz8ObhGZmIMvV1%2FMHRLhumAUtJ; _cc_=Vq8l%2BKCLiw%3D%3D; tfstk=cLHFBdZ78ppehKOZi-wzPlP4wrmCZp4ulOr4tjbeXQay8fVhi878jh_8buzvIWf..; v=0; cookie2=1fa71391ef4b6fe1b965bfc583362414; _tb_token_=f07e793bef7bb; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; uc1=cookie14=UoTUOa5q%2FxzG6Q%3D%3D; JSESSIONID=CA20EC2F429BBCF8261293C1ABE16081; l=dBQqvyeRQulc6ZkEBOCwourza77OSIRAnuPzaNbMi_5Kc6L_kpQOoRV_QFp62jWftx8B4o0COBp9-etkiKy06Pt-g3fPaxDc.; isg=BKGhnWUPWyu9pffeDmGL7O4-sG27ThVAIZzT_wN2nagHasE8S54lEM-oyZ5sgK14',
        }
        #kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=header)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return "1"

def parsePage(ilt,html):  # ilt: the list to fill with [price, title] entries
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        # extract prices; the character class allows digits and the decimal point
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)  # *? means minimal (non-greedy) match
        # "raw_title" is the key; capture up to the next closing double quote
        for i in range(len(plt)) :
            price =eval(plt[i].split(':')[1])
            title =eval(tlt[i].split(':')[1])
            ilt.append([price,title])
    except:
        print("2") #程序不会因为异常而退出,这样的程序更加可靠

def printGoodsList(ilt):
    tplt ="{:4}\t{:8}\t{:16}"
    print(tplt.format("序号","价格","商品名称"))
    count =0
    for g in ilt:
        count=count+1
        print(tplt.format(count,g[0],g[1]))


def main():
    #goods=str("施华洛世奇".encode('utf-8'))
    goods = '施华洛世奇'
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)
main()
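
To see what parsePage's two regexes and the eval trick actually do, here is a minimal sketch on a hand-written fragment of the embedded JSON (the fragment is made up for illustration):

import re

html = '"raw_title":"书包 双肩","view_price":"189.00","nick":"somestore"'  # made-up sample
plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
print(plt)  # ['"view_price":"189.00"']
print(tlt)  # ['"raw_title":"书包 双肩"']
price = eval(plt[0].split(':')[1])  # eval of '"189.00"' strips the quotes -> '189.00'
title = eval(tlt[0].split(':')[1])
print([price, title])
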

 

Example 2: Eastmoney + Baidu Stocks

I didn't get this one working either; the web version of Baidu Stocks seems to be gone entirely, but I typed out the code anyway.

import requests
import re
from bs4 import BeautifulSoup
import bs4
import traceback
# Site selection criterion: the data must live in the HTML itself, not be generated by JS

def getHTMLText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])  # codes like sh600000 / sz000001
        except:
            continue

def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')    # field names
            valueList = stockInfo.find_all('dd')  # field values
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            traceback.print_exc()
            continue

    return ""

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()

Example 2, a small optimization

Improves the user experience by adding a dynamic progress indicator.

import requests
import re
from bs4 import BeautifulSoup
import bs4
import traceback
# Site selection criterion: the data must live in the HTML itself, not be generated by JS

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code  # skip apparent_encoding when the charset is already known
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, 'GB2312')  # the list page uses GB2312
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue

def getStockInfo(lst, stockURL, fpath):
    count = 0

    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                count = count + 1
                f.write(str(infoDict) + '\n')
                print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
                # \r returns to the start of the line and end='' suppresses the
                # newline, so the percentage overwrites itself in place
        except:
            traceback.print_exc()
            count = count + 1
            print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
            continue

    return ""

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
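
The carriage-return trick generalizes to any loop; a minimal standalone sketch of a text progress bar in the same style (the scale of 50 and the sleep interval are arbitrary demo values):

import time

scale = 50
print('Starting'.center(scale // 2, '-'))
start = time.perf_counter()
for i in range(scale + 1):
    a = '*' * i                  # completed portion
    b = '.' * (scale - i)        # remaining portion
    c = (i / scale) * 100        # percentage complete
    dur = time.perf_counter() - start
    print('\r{:^3.0f}%[{}->{}]{:.2f}s'.format(c, a, b, dur), end='')
    time.sleep(0.1)
print('\n' + 'Done'.center(scale // 2, '-'))
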

 
