python3爬虫（一）

一、爬虫的错误处理

# encoding: utf-8
# author: wht@92
# Crawler program 1: handling the two common fetch/parse errors
# 2019-12-16

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    """Fetch ``url`` and return the first ``<h1>`` found in its ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``body.h1`` element of the parsed page, or ``None`` when the
        page cannot be fetched or the document has no ``<body>``/``<h1>``.
    """
    # Local import: URLError is not among the file-level imports.
    from urllib.error import URLError

    # Error 1: the page cannot be retrieved. URLError is the base class of
    # HTTPError, so this covers HTTP error statuses as well as an
    # unreachable or non-existent server.
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        return None

    # Error 2: the expected tags are missing. If there is no <body>,
    # ``.body`` is None and the ``.h1`` lookup raises AttributeError.
    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        title = BeautifulSoup(markup, "html.parser").body.h1
    except AttributeError:
        return None
    return title
 
# Demo: look up the heading of a sample page and report the outcome.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
# Identity check: None is a singleton, and ``is`` does not go through the
# element's own equality logic.
if title is None:
    print("Title could not be found")
else:
    print(title)

二、属性值的抽取

# encoding: utf-8
# author: wht@92
# Crawler program 2: extracting elements by their class attribute value
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the sample page; the context manager closes the connection once
# the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, "html.parser")

# A list of names matches any of the listed tags.
titlelist = bsobj.find_all(["h1", "h2"])
# find_all(tag_name, tag_attributes): spans whose class is "green" or "red".
namelist = bsobj.find_all("span", {"class": ["green", "red"]})
# print(titlelist)
# for name in namelist:
#     # .get_text() strips every tag from the markup being processed, so it
#     # is normally called only at the very end, when printing or storing.
#     print(name.get_text())

# At most two text nodes whose content is exactly "the prince".
sumlist = bsobj.find_all(string="the prince", limit=2)
print(len(sumlist))

# Elements selected by their id attribute; print the text of the first one.
allText = bsobj.find_all(id="text")
print(allText[0].get_text())

三、父子标签的处理

# encoding: utf-8
# author: wht@92
# Crawler program 3: navigating parent, child and sibling tags
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup

import re

# Download the sample page; the context manager closes the connection once
# the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, "html.parser")

# Look the table up once and reuse it for both traversals below.
gift_table = bsobj.find("table", {"id": "giftList"})

# Siblings: every node that follows the table's first row.
for sibling in gift_table.tr.next_siblings:
    print(sibling)

# Children: the direct child nodes of the table.
for child in gift_table.children:
    print(child)

# Parent + sibling: from the image, go up to its parent and print the text
# of the node immediately before that parent.
print(bsobj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

# Raw string so the backslashes reach the regex engine unchanged instead of
# being treated as (invalid) string escape sequences.
images = bsobj.find_all("img", {"src": re.compile(r"\.\.\/img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

# A callable filter: keep every tag that has exactly two attributes.
print(bsobj.find_all(lambda tag: len(tag.attrs) == 2))

来源：CSDN

作者：weixin_42376686

链接：https://blog.csdn.net/weixin_42376686/article/details/103570440

标签

python爬虫

python3