爬虫基础(二)

北战南征 提交于 2020-02-23 01:33:21
#爬取百度网页代码:
# Fetch the Baidu homepage and print its raw HTML source.
from urllib import request

url = 'https://www.baidu.com/'
# Send a browser-like User-Agent so the request looks like a normal
# browser visit and is less likely to be blocked or served a
# stripped-down page intended for bots.
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

req = request.Request(url, headers=header)
response = request.urlopen(req)
# read() returns bytes; decode() converts them to str (UTF-8 by default).
print(response.read().decode())

以下是完整示例代码,可以直接运行:

# Fetch the Baidu homepage with browser-like headers, then extract and
# print the page <title> with BeautifulSoup.
from urllib import request
from bs4 import BeautifulSoup

url = 'https://www.baidu.com'
# Host + User-Agent together make the request resemble a real browser
# visit, reducing the chance of being blocked.
headers = {
    "Host": 'www.baidu.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

req = request.Request(url, headers=headers)
response = request.urlopen(req)
# Other useful inspection helpers on the response object:
#   response.getcode() -> HTTP status code, e.g. 200
#   response.headers   -> the response header block
#   response.geturl()  -> the final URL after any redirects
html = response.read().decode()  # page source as text

# Show the headers that were actually attached to the outgoing request.
print(req.header_items())

soup = BeautifulSoup(html, 'lxml')
title = soup.select('title')  # select() returns a list of matching tags
print(title[0].text)  # the text inside the first <title> tag

# Aside: str.join concatenates the items of an iterable of strings,
# inserting the separator between consecutive items.
li = ['a', 'b', 'c']
joined = '*'.join(li)
print(joined)  # a*b*c
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!