python3爬虫(一)
一 爬虫的错误处理
# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序一,两个错误的解决办法
# 2019-12-16
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    """Fetch *url* and return the first ``<h1>`` tag found under ``<body>``.

    Returns ``None`` when the page cannot be retrieved (error status from the
    server, or the server/resource cannot be reached at all) or when the
    parsed document has no ``<body>``. The value of ``bsobj.body.h1`` is
    returned unchanged otherwise, so callers should still check for ``None``.
    """
    # Imported locally so the function does not rely on the file-level import
    # list, which only brings in HTTPError.
    from urllib.error import URLError

    # Fetch the page. HTTPError is a subclass of URLError, so this one handler
    # covers both an error status and a server that cannot be reached.
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        return None

    # Parse the page. If the document has no <body>, ``bsobj.body`` is None
    # and the ``.h1`` lookup raises AttributeError.
    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        bsobj = BeautifulSoup(markup, "html.parser")
        title = bsobj.body.h1
    except AttributeError:
        return None
    return title
# Demo: fetch the heading of a sample page and report the outcome.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
# getTitle signals failure with None; compare by identity, not equality.
if title is None:
    print("Title could not be found")
else:
    print(title)
二 属性值的抽取
# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序二,class属性值下的抽取
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the sample page; the context manager closes the connection once
# the markup has been parsed.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, "html.parser")

# Collect the <h1> and <h2> headings.
titlelist = bsobj.find_all({"h1", "h2"})
# find_all(tag_name, tag_attributes): <span> tags with class "green" or "red".
namelist = bsobj.find_all("span", {"class": {"green", "red"}})
# print(titlelist)
# for name in namelist:
#     print(name.get_text())
# .get_text() strips every tag from the HTML being processed and leaves only
# the text, so it is usually applied last, when printing or storing results.

# Search for the exact text "the prince", keeping at most two matches.
sumlist = bsobj.find_all(limit=2, string="the prince")
print(len(sumlist))

# Elements whose id attribute is "text"; print the text of the first one.
allText = bsobj.find_all(id="text")
print(allText[0].get_text())
三 父子标签的处理
# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序三,父子标签
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Download the sample page; the context manager closes the connection once
# the markup has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, "html.parser")

# Look the gift table up once; both loops below walk it.
gift_table = bsobj.find("table", {"id": "giftList"})

# Everything that follows the table's first <tr> at the same level.
for sibling in gift_table.tr.next_siblings:
    print(sibling)

# The table's direct children.
for child in gift_table.children:
    print(child)

# From the img1.jpg image: go up to its parent, step to the parent's previous
# sibling, and print that element's text.
print(bsobj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

# Raw string keeps the backslashes for the regex engine; the pattern itself
# is unchanged.
images = bsobj.find_all("img", {"src": re.compile(r"\.\.\/img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

# A callable filter: keep every tag that has exactly two attributes.
print(bsobj.find_all(lambda tag: len(tag.attrs) == 2))
来源:CSDN
作者:weixin_42376686
链接:https://blog.csdn.net/weixin_42376686/article/details/103570440