# Scrape images from Qiushibaike (first 5 pages) using a regular expression.
import os  # create the output directory
import re  # regular expressions
from urllib import request  # urlretrieve makes saving an image to disk convenient

import requests  # fetch the listing pages

# Directory the images are written to (spelling kept as in the original path).
IMG_DIR = "./imges"

# Spoof a browser User-Agent so the site is less likely to reject the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# re.S makes '.' match newlines too, so one match can span several lines of HTML.
# Compiled once here because the pattern is the same for every page.
IMG_PATTERN = re.compile(
    r'<div class="thumb">.*?<img src="(.*?)".*? height="auto">.*?</div>',
    re.S,
)

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(IMG_DIR, exist_ok=True)

# Running count of downloaded images across all pages; used in the file name.
k = 0
for i in range(1, 6):
    # Listing page address, fetched with a plain GET request.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # The timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    # findall returns a list holding the captured src value of every match.
    img_urls = IMG_PATTERN.findall(res.text)
    for img_url in img_urls:
        k += 1
        # The captured src has no scheme, so prepend one.
        img_url = 'https:' + img_url
        imgName = f"{IMG_DIR}/qiushi{i}{k}.jpg"
        request.urlretrieve(img_url, imgName)
## bs4 文档 https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
# Scrape images from Qiushibaike (first 5 pages) using BeautifulSoup (bs4).
import os  # create the output directory
from urllib import request  # urlretrieve makes saving an image to disk convenient

import requests  # fetch the listing pages
from bs4 import BeautifulSoup  # HTML parsing

# Directory the images are written to (spelling kept as in the original path).
IMG_DIR = "./imges"

# Spoof a browser User-Agent so the site is less likely to reject the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(IMG_DIR, exist_ok=True)

# Running count of downloaded images across all pages; used in the file name.
k = 0
for i in range(1, 6):
    # The page number is interpolated so each iteration requests a different page.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # The timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    text = res.text
    # Build the BeautifulSoup tree with the lxml parser.
    soup = BeautifulSoup(text, "lxml")
    # Collect every element carrying the "illustration" class.
    img_urls = soup.find_all(class_="illustration")
    for img_tag in img_urls:
        src = img_tag.get("src")
        # Tag.get returns None when the attribute is absent; skip such elements
        # rather than failing on the string concatenation below.
        if not src:
            continue
        k += 1
        # The src value has no scheme, so prepend one to get the full address.
        img_url = "https:" + src
        # Local path the image is saved under.
        imgName = f"{IMG_DIR}/qiushi{i}{k}.jpg"
        request.urlretrieve(img_url, imgName)
## lxml
# Scrape images from Qiushibaike (first 5 pages) using lxml and XPath.
import os  # create the output directory
from urllib import request  # urlretrieve makes saving an image to disk convenient

import requests  # fetch the listing pages
from lxml import etree  # HTML parsing

# Directory the images are written to (spelling kept as in the original path).
IMG_DIR = "./imges"

# Spoof a browser User-Agent so the site is less likely to reject the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(IMG_DIR, exist_ok=True)

# Running count of downloaded images across all pages; used in the file name.
k = 0
for i in range(1, 6):
    # The page number is interpolated so each iteration requests a different page.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # The timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    text = res.text
    # Parse the response text into an HTML element tree.
    html = etree.HTML(text)
    # Select the src attribute of every <img> nested under <div class="thumb">.
    img_urls = html.xpath("//div[@class='thumb']//img/@src")
    for img_url in img_urls:
        # The src value has no scheme, so prepend one to get the full address.
        img_url = "https:" + img_url
        k += 1
        imgName = f"{IMG_DIR}/qiushi{i}{k}.jpg"
        request.urlretrieve(img_url, imgName)
        print("正在下载ing:%s" % img_url)
# Source: https://www.cnblogs.com/helloboke/p/11494671.html