爬虫基础(二)

北战南征 提交于 2020-02-23 01:33:21
#爬取百度网页代码:
# Fetch the Baidu homepage and print its raw HTML source.
from urllib import request

url = 'https://www.baidu.com/'
# Send a browser-like User-Agent so the request looks like a normal
# browser visit and is less likely to be blocked or served a
# stripped-down page intended for bots.
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

req = request.Request(url, headers=header)
response = request.urlopen(req)
# read() returns bytes; decode() converts them to str (UTF-8 by default).
print(response.read().decode())

以下是完整示例代码,可以直接运行:

# Fetch the Baidu homepage with browser-like headers, then extract and
# print the page <title> with BeautifulSoup.
from urllib import request
from bs4 import BeautifulSoup

url = 'https://www.baidu.com'
# Host + User-Agent together make the request resemble a real browser
# visit, reducing the chance of being blocked.
headers = {
    "Host": 'www.baidu.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

req = request.Request(url, headers=headers)
response = request.urlopen(req)
# Other useful inspection helpers on the response object:
#   response.getcode() -> HTTP status code, e.g. 200
#   response.headers   -> the response header block
#   response.geturl()  -> the final URL after any redirects
html = response.read().decode()  # page source as text

# Show the headers that were actually attached to the outgoing request.
print(req.header_items())

soup = BeautifulSoup(html, 'lxml')
title = soup.select('title')  # select() returns a list of matching tags
print(title[0].text)  # the text inside the first <title> tag

# Aside: str.join concatenates the items of an iterable of strings,
# inserting the separator between consecutive items.
li = ['a', 'b', 'c']
joined = '*'.join(li)
print(joined)  # a*b*c
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!