# Code to crawl the Baidu homepage:
from urllib import request
# url = 'http://www.baidu.com'  # the URL I typed myself
url = 'https://www.baidu.com/'  # the URL I copied from the browser
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
# The header is set so the crawler looks like a request from a normal browser, which helps avoid being blocked
req = request.Request(url, headers=header)
response = request.urlopen(req)
print(response.read().decode())
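If the request fails, urlopen raises an exception instead of returning a response. As a minimal, hedged sketch (not part of the original post), the same request can be wrapped with urllib.error handling; the timeout value is an illustrative choice:
from urllib import request, error
req = request.Request(url, headers=header)
try:
    response = request.urlopen(req, timeout=10)
    print(response.read().decode())
except error.HTTPError as e:
    print('HTTP error:', e.code)    # server answered with an error status code
except error.URLError as e:
    print('URL error:', e.reason)   # DNS failure, refused connection, etc.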
The following code can be used directly:
from urllib import request
from bs4 import BeautifulSoup
url = 'https://www.baidu.com'
headers = {
    "Host": "www.baidu.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
req = request.Request(url, headers=headers)
response = request.urlopen(req)
# print(response.getcode())  # 200
# print(response.headers)  # returns the response header information
# print(response.geturl())  # https://www.baidu.com
html = response.read().decode()  # get the page source
print(req.header_items())
# Returns the spoofed browser headers: [('Host', 'www.baidu.com'), ('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36')]
soup = BeautifulSoup(html, 'lxml')
title = soup.select('title')  # a list of matching tags
print(title[0].text)  # print the text of the first element in the title list -- 百度一下,你就知道
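The same soup object can select other elements as well. As a small hedged sketch (standard BeautifulSoup usage, not from the original post), this prints the text and href of every link on the page:
for a in soup.select('a'):  # every <a> tag in the document
    print(a.get_text(strip=True), a.get('href'))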
# By the way: how join works
li = ['a', 'b', 'c']
print('*'.join(li))  # a*b*c
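join is also handy when writing scraped results out; a tiny sketch under the same assumptions as above (the filename is made up for illustration):
links = [a.get('href') or '' for a in soup.select('a')]
with open('baidu_links.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(links))  # one link per line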
Source: CSDN
Author: qq_25871537
Link: https://blog.csdn.net/qq_25871537/article/details/104436796