1. 使用 requests、urllib 构建简单的爬虫代码
"""爬取 唯美女生 网站图片"""
import requests
import re
import os
import time
from urllib.parse import urljoin  # resolves <img src> values against the page URL

# 1. Request the listing page.
myheaders = {'User-Agent': 'Mozilla/5.0'}
# Alternative entry point (site root):
# url = "http://pic.netbian.com"
url = 'http://pic.netbian.com/4kmeinv'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=myheaders, timeout=10)
# Fail early on an HTTP error instead of parsing an error page.
response.raise_for_status()

# 2. Process the response: pull every <img src="..."> value out with a regex.
html = response.text
img_urls = re.findall(r'<img src="(.*?)" alt=".*?">', html)
print(img_urls)

# 3. Download the images into a local folder.
save_dir = '彼岸图片'
# exist_ok=True replaces the separate exists()/mkdir() check-then-create pair.
os.makedirs(save_dir, exist_ok=True)
for img_url in img_urls:
    # The last path segment of the src is used as the local file name.
    img_name = img_url.split('/')[-1]
    if not img_name:
        # A src ending in "/" yields no usable file name; skip it.
        continue
    time.sleep(1)  # pause between downloads to avoid hammering the server
    # urljoin() handles root-relative ("/uploads/..."), relative and absolute
    # src values; plain `url + img_url` would wrongly append the path to the
    # listing-page URL.
    full_url = urljoin(url, img_url)
    img_response = requests.get(full_url, headers=myheaders, timeout=10)
    if not img_response.ok:
        # Do not save an HTTP error body to disk as if it were an image.
        print('download failed:', full_url, img_response.status_code)
        continue
    with open(os.path.join(save_dir, img_name), 'wb') as f:
        f.write(img_response.content)
"""使用 urllib 创建爬虫"""
from urllib.request import urlopen
from urllib.request import Request # 包装爬虫
url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}
# Build a Request object so that custom headers can be attached to the request.
request = Request(url, headers=headers, method='GET', data=None)
# Send the request. The with-block guarantees the response (and its socket)
# is closed even if reading or decoding fails; the timeout prevents an
# indefinite hang.
with urlopen(request, timeout=3) as response:
    # Print response metadata. getcode() and geturl() are the older spellings
    # of the .status and .url attributes (deprecated since Python 3.9); they
    # are kept here because the snippet demonstrates both forms.
    # NOTE(review): the last label reads "请求头" (request headers), but
    # getheaders() returns the *response* headers.
    print('状态码:', response.getcode(), '真实请求地址:', response.geturl(), '状态码:', response.status, '请求头:', response.getheaders())
    # Decode with the charset declared in Content-Type, falling back to UTF-8
    # (the previous hard-coded behaviour) when the server declares none.
    charset = response.headers.get_content_charset() or 'utf-8'
    info = response.read().decode(charset)
print(info)
2. GET 请求:当需要手动传参且参数包含中文时,可以使用 urllib.parse 中的 quote、urlencode 对参数进行 URL 编码(转码)
"""GET 请求,当需要手动传参,且参数为中文时,需要将参数转码
单个参数时可以使用:quote,转码
多个参数时:urlencode, 转码
"""
from urllib.request import urlopen, Request
from urllib.parse import quote
from urllib.parse import urlencode

# Browser-like User-Agent sent with the request below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

# Single parameter: percent-encode just the value with quote().
# Kept as a commented-out reference example:
# url = "https://www.baidu.com/s?wd={}".format(quote("科技"))
# request = Request(url, headers=headers)
# response = urlopen(request, timeout=3)
# info = response.read().decode()
# print(info)

# Multiple parameters: urlencode() percent-encodes every key/value pair and
# joins them with "&" into a ready-made query string.
args = {
    'wd': "科技",
    'ie': 'utf-8',
    # ...
}
url = 'https://www.baidu.com/s?{}'.format(urlencode(args))
print(url)
request2 = Request(url, headers=headers)
# The with-block closes the response (and its socket) even if reading or
# decoding raises.
with urlopen(request2, timeout=3) as response2:
    # Decode with the charset declared in Content-Type, falling back to UTF-8
    # (the previous hard-coded behaviour) when the server declares none.
    charset = response2.headers.get_content_charset() or 'utf-8'
    info2 = response2.read().decode(charset)
print(info2)
来源:oschina
链接:https://my.oschina.net/u/4351449/blog/4255094