python3爬虫(一)

痴心易碎 提交于 2019-12-17 00:25:51

python3爬虫(一)

一 爬虫的错误处理

# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序一,两个错误的解决办法
# 2019-12-16

from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup

def getTitle(url):
    """Fetch *url* and return the first ``<h1>`` inside its ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag object, or ``None`` when the page cannot be
        fetched (HTTP error status, unreachable server, missing local
        file) or when the document has no ``<body>`` or no ``<h1>`` in it.
    """
    # URLError covers an unreachable or nonexistent server; HTTPError (an
    # error status such as 404) is a subclass of it, so this single handler
    # catches both failure modes.
    try:
        response = urlopen(url)
    except URLError:
        return None

    # Read inside ``with`` so the connection is released even if read() fails.
    with response:
        markup = response.read()

    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    bsobj = BeautifulSoup(markup, "html.parser")

    # A document without <body> yields None here; check it directly instead
    # of relying on an AttributeError from ``None.h1``.
    body = bsobj.body
    if body is None:
        return None
    return body.h1
 
# Demo run: fetch the sample page and report its heading, or a fallback
# message when getTitle() signals failure by returning None.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:  # identity test is the correct way to compare with None
    print("Title could not be found")
else:
    print(title)

二 属性值的抽取

# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序二,class属性值下的抽取
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Script two: selecting tags by name and by attribute value.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    # Parse while the response is still open; the ``with`` block closes the
    # connection afterwards. The parser is named explicitly so the result
    # does not depend on which optional parser libraries are installed.
    bsobj = BeautifulSoup(html, "html.parser")

# Every <h1> and <h2> heading: a list of names matches any one of them.
titlelist = bsobj.find_all(["h1", "h2"])

# find_all(tag_name, tag_attributes): <span> tags whose class is green or red.
namelist = bsobj.find_all("span", {"class": ["green", "red"]})

# To inspect the results, print titlelist, or name.get_text() for each entry
# of namelist. get_text() strips all markup and leaves only the text, so it
# is normally applied last, right before printing or storing.

# At most two text nodes whose content is exactly "the prince".
sumlist = bsobj.find_all(string="the prince", limit=2)
print(len(sumlist))

# Elements with id="text". Guard the index so a changed page produces a
# readable message instead of an IndexError.
allText = bsobj.find_all(id="text")
if allText:
    print(allText[0].get_text())
else:
    print('No element with id="text" was found')

三 父子标签的处理

# -*- coding: utf-8 -*-
#author:wht@92
#爬虫程序三,父子标签
# 2019-12-16
from urllib.request import urlopen
from bs4 import BeautifulSoup

import re

# Script three: navigating between sibling, child and parent tags.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Parse while the response is still open; the ``with`` block closes the
    # connection afterwards. The parser is named explicitly so the result
    # does not depend on which optional parser libraries are installed.
    bsobj = BeautifulSoup(html, "html.parser")

# Look the table up once and reuse it for both traversals below.
gift_table = bsobj.find("table", {"id": "giftList"})

# Nodes that follow the table's first <tr> at the same level; the first row
# itself is not part of next_siblings.
for sibling in gift_table.tr.next_siblings:
    print(sibling)

# Direct children of the table.
for child in gift_table.children:
    print(child)

# From the image, step up to its parent, then back to the node just before
# that parent, and print that node's text.
print(bsobj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

# Raw string: the backslashes are regex escapes for literal dots, not
# Python string escapes (a non-raw "\." is an invalid escape sequence).
images = bsobj.find_all("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

# A callable filter: keep every tag that carries exactly two attributes.
print(bsobj.find_all(lambda tag: len(tag.attrs) == 2))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!