爬虫day1

心已入冬 提交于 2019-11-27 13:47:41
常用模块:requests、BeautifulSoup

1 汽车之家爬虫练习
import requests
from bs4 import BeautifulSoup

# 汽车之家新闻页爬虫:抓取新闻列表,下载每条新闻的配图保存到当前目录。
ret = requests.get(url="https://www.autohome.com.cn/news/")
ret.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
# The site is not UTF-8; use the detected encoding so Chinese text decodes correctly.
ret.encoding = ret.apparent_encoding

soup = BeautifulSoup(ret.text, 'html.parser')
div = soup.find(name='div', id='auto-channel-lazyload-article')
if div is None:
    raise SystemExit("news container not found; page layout may have changed")

for li in div.find_all(name='li'):
    # Some <li> entries are ads/placeholders without a headline; skip them.
    h3 = li.find(name='h3')
    if not h3:
        continue

    img = li.find('img')
    src = img.get('src') if img else None
    if not src:
        # Entry has no image, or the image uses a lazy-load attribute
        # (e.g. data-src) that this script does not handle.
        continue

    # Image URLs look like ".../xxx__640x480.jpg"; keep the part after the
    # last '__' as the local file name. Guard against URLs without '__'
    # which would otherwise raise IndexError.
    parts = src.rsplit('__', 1)
    if len(parts) < 2:
        continue
    file_name = parts[1]
    print(file_name)

    # src is protocol-relative ("//..."), so prefix the scheme before fetching.
    ret_img = requests.get(url='https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)

2 抽屉登录点赞练习

import requests
from bs4 import BeautifulSoup

# 抽屉(dig.chouti.com)自动登录并给第 3-4 页的新闻点赞。
# Shared browser User-Agent; the site rejects requests without one.
# NOTE(review): the UA string ends with a stray ')' in the original; value
# kept byte-identical to preserve behavior.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"
}

# 先访问页面,返回cookie
# Step 1: visit once to obtain the anonymous cookie. Chouti authorizes this
# first-visit cookie at login time, so it must be carried on every later request.
r1 = requests.get(url='https://dig.chouti.com/all/new/1', headers=HEADERS)
r1_cookie_dict = r1.cookies.get_dict()

# 登录请求,参数的设置
# Step 2: log in with the first-visit cookie attached.
# NOTE(review): credentials are hard-coded in source; they belong in
# environment variables or a config file, not in version control.
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        "phone": "8618387391326",
        "password": "zmq251010",
        'oneMonth': '1',  # "remember me" flag
    },
    headers=HEADERS,
    cookies=r1_cookie_dict,
)
print(response_login.text)

# Step 3: walk pages 3..4, extract each item's share-linkid and upvote it.
for page in range(3, 5):
    response_index = requests.get(
        url='https://dig.chouti.com/all/new/%s' % page,
        headers=HEADERS,
    )
    soup = BeautifulSoup(response_index.text, 'html.parser')
    div = soup.find(attrs={'id': 'content-list'})
    if div is None:
        continue  # page failed to load or layout changed
    for item in div.find_all(attrs={'class': 'item'}):
        # Guard: some items (ads/placeholders) lack the part2 tag; the
        # original crashed with AttributeError on tag.get(...) here.
        tag = item.find(attrs={'class': 'part2'})
        nid = tag.get('share-linkid') if tag else None
        if not nid:
            continue

        # 根据每个新闻 id 做点赞操作;授权过的首访 cookie 携带会话。
        ret = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
            headers=HEADERS,
            cookies=r1_cookie_dict,
        )
        print(ret.text)
3 requests 常用参数

import requests

requests.get(
    url='x',
    params={'nid': 1, 'name': 'x'},  # 拼接为 x?nid=1&name=x
    headers={},
    cookies={},
)

requests.post(
    url='x',
    data={
        'name': 'alex',
        'age': 18,
    },
    headers={},
    cookies={},
)

requests 请求常用参数:method、url、params、data、json、headers、cookies、files(上传文件)
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!