1. Method 1: requests + lxml
# Method 1: requests combined with lxml
import requests
from lxml import etree

# 1. Collect all the example links on the index page and yield each full URL
def get_html(url):
    # Fetch the index page
    html = requests.get(url)
    html.encoding = 'utf-8'  # the page is UTF-8; set it explicitly so html.text decodes cleanly
    # Parse the HTML with etree
    seq = etree.HTML(html.text)
    link_list = seq.xpath('//*[@id="content"]/ul/li/a/@href')
    for i in link_list:
        yield "http://www.runoob.com" + i

# 2. Extract the details from each example page
def get_html_link(link):
    for i in link:
        # Fetch the example page
        link_html = requests.get(i)
        # Parse it (passing bytes lets lxml detect the encoding itself)
        link_seq = etree.HTML(link_html.content)
        # Title
        title = link_seq.xpath('//*[@id="content"]/h1/text()')[0]
        # Problem statement: the 2nd and 3rd <p> tags
        subject = link_seq.xpath('//*[@id="content"]/p[position()>1 and position()<4]/text()')
        subject_list = '\n'.join(subject)
        yield (title, subject_list)

# 3. Save the data
def save_subject(title_subject):
    with open("G:/1.txt", 'a+', encoding='utf-8') as f:
        for title, subject_list in title_subject:
            f.write(title + '\n')
            f.write(subject_list + '\n')
            f.write("#" * 50 + '\n')

# 4. Chain the three steps together
def funcall(url):
    link = get_html(url)
    title_subject = get_html_link(link)
    save_subject(title_subject)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    funcall(url)

if __name__ == "__main__":
    main()

# Quick test of the link generator:
# for i in get_html('http://www.runoob.com/python/python-100-examples.html'):
#     print(i)
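The two XPath expressions are the heart of this version, and they are easy to verify offline. Below is a minimal sketch, using an invented HTML snippet (not copied from runoob.com), that exercises the same etree.HTML + xpath() calls without any network access:

# Offline sketch of the lxml calls used above; the HTML snippet is made up for illustration
from lxml import etree

sample = '''
<div id="content">
  <h1>Python 练习实例1</h1>
  <p>intro</p>
  <p>second paragraph</p>
  <p>third paragraph</p>
  <ul>
    <li><a href="/python/python-exercise-example1.html">例1</a></li>
    <li><a href="/python/python-exercise-example2.html">例2</a></li>
  </ul>
</div>
'''

seq = etree.HTML(sample)
# @href returns the attribute values directly as strings
print(seq.xpath('//*[@id="content"]/ul/li/a/@href'))
# position() is 1-based, so >1 and <4 keeps only the 2nd and 3rd <p>
print(seq.xpath('//*[@id="content"]/p[position()>1 and position()<4]/text()'))

Running it prints the two relative hrefs and then ['second paragraph', 'third paragraph'], which is exactly the shape that get_html and get_html_link rely on.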
2. Method 2: urllib.request + BeautifulSoup
# Method 2: urllib.request combined with BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup

# 1. Collect all the example links on the index page
def get_html(url):
    # Fetch the index page source
    html = urllib.request.urlopen(url).read()
    # Parse the HTML
    soup = BeautifulSoup(html, 'lxml')
    # First find the tag with id='content' and its <ul> children (there are two);
    # then walk each <ul> for its <li> children, and each <li> for its <a> tags,
    # yielding the full URL of every link
    for i in soup.find(id='content').find_all('ul'):
        for j in i.find_all('li'):
            for k in j.find_all('a'):
                yield 'http://www.runoob.com' + k['href']

# 2. Extract the details from each example page
def get_html_link(link):
    # Walk every collected link
    for i in link:
        # Fetch the example page
        link_html = urllib.request.urlopen(i).read()
        # Parse it
        soup = BeautifulSoup(link_html, 'lxml')
        # Find the tag with id='content'
        content = soup.find(id='content')
        if content:
            # Title from the <h1> tag
            title = content.find('h1').string
            # Text of the first three <p> tags
            content_list = content.find_all('p', limit=3)
            subject = ''
            for j in content_list:
                subject += j.get_text()
            yield (title, subject)

# 3. Save the data
def save_subject(title_content):
    with open('G:/2.txt', 'w+', encoding='utf-8') as f:
        for title, content in title_content:
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('#' * 80 + '\n')

# 4. Chain the three steps together
def fun_call(url):
    link = get_html(url)
    title_content = get_html_link(link)
    save_subject(title_content)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
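This version leans on three BeautifulSoup idioms: keyword filtering with find(id=...), capping results with find_all(..., limit=3), and the difference between .string and .get_text(). A minimal offline sketch with an invented HTML snippet shows each in isolation:

# Offline sketch of the BeautifulSoup calls used above; the HTML snippet is made up
from bs4 import BeautifulSoup

sample = '''
<div id="content">
  <h1>Python 练习实例1</h1>
  <p>first</p><p>second</p><p>third</p><p>fourth</p>
</div>
'''

soup = BeautifulSoup(sample, 'lxml')
content = soup.find(id='content')            # keyword id= filters on the HTML id attribute
print(content.find('h1').string)             # .string works when a tag has exactly one text child
for p in content.find_all('p', limit=3):     # limit=3 stops after the first three matches
    print(p.get_text())                      # .get_text() concatenates all nested text

Note that .string returns None for tags with mixed children, while .get_text() always returns a string; the scraper above uses .string for the bare <h1> and .get_text() for the paragraph bodies.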
3. Method 3: requests + BeautifulSoup + re
# Method 3: requests combined with BeautifulSoup and re
import requests, re
from bs4 import BeautifulSoup

# 1. Collect the example links by matching their href pattern
def get_html(url):
    html = requests.get(url)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')
    # Keep only the <a> tags whose href starts with /python/python-exercise
    for i in soup.find_all('a', href=re.compile('^/python/python-exercise')):
        yield 'http://www.runoob.com' + i.get('href')

# 2. Extract the details from each example page
def get_html_link(link_list):
    for link in link_list:
        html_link = requests.get(link)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.find('div', class_="article-intro").h1.string
        con = soup.find('div', class_="article-intro").find_all('p')
        # Collect <p> text from the 2nd tag on, stopping at the first marker
        # that signals the end of the problem statement
        n = 1
        list1 = []
        while n < len(con):
            if (re.match('程序源代码', con[n].text)
                    or re.match(' Python 100例', con[n].text)
                    or re.match('以上实例输出结果为', con[n].text)):
                break
            list1.append(con[n].text)
            n += 1
        yield (title, list1)

# 3. Save the data
def save_data(content_list):
    with open('G:/3.txt', 'w+', encoding='utf-8') as f:
        for title, content in content_list:
            f.write(title + '\n')
            for line in content:
                f.write(line + '\n')
            f.write('#' * 80 + '\n')

# 4. Chain the three steps together
def fun_call(url):
    link_list = get_html(url)
    content_list = get_html_link(link_list)
    save_data(content_list)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
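What sets this version apart is passing a compiled regex as the href filter, which replaces the nested ul/li/a traversal of the first two versions with a single find_all call. A minimal offline sketch with invented links:

# Offline sketch of regex-based link filtering, as used in get_html above; the links are made up
import re
from bs4 import BeautifulSoup

sample = '''
<a href="/python/python-exercise-example1.html">例1</a>
<a href="/python/python-100-examples.html">目录</a>
<a href="/java/java-tutorial.html">Java</a>
'''

soup = BeautifulSoup(sample, 'lxml')
# Only hrefs matching the pattern survive; the index page and unrelated links are dropped
for a in soup.find_all('a', href=re.compile('^/python/python-exercise')):
    print(a.get('href'))

Only the first link is printed; the other two fail the ^/python/python-exercise anchor and are skipped.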
Source: "爬虫实例--菜鸟教程" (a crawler example based on the Runoob tutorial site)