A crawling example using the urllib library

Submitted by 大憨熊 on 2019-11-29 15:07:08
from urllib import request
import random


def spider(url):
    # A small pool of desktop Chrome User-Agent strings; one is chosen at
    # random per request so successive requests do not all look identical.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    ]
    user_agent = random.choice(user_agent_list)
    print(user_agent, url)

    headers = {
        "User-Agent": user_agent
    }

    # Build the request with the chosen User-Agent header, fetch the page,
    # and decode the body as UTF-8 text.
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode("utf-8")
    # print(html)

    # Save the page locally, using the last path segment of the URL
    # (e.g. "index.html") prefixed with "05_" as the file name.
    file_name = "05_" + url.split("/")[-1]
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(html)


if __name__ == "__main__":
    url_list = [
        "http://www.langlang2017.com/index.html",
        "http://www.langlang2017.com/route.html",
        "http://www.langlang2017.com/FAQ.html"
    ]
    for url in url_list:
        spider(url)
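
The script assumes every request succeeds; a single unreachable page raises an exception and stops the whole loop. Below is a minimal sketch of how the fetch step could be hardened with urllib.error handling and a timeout. The fetch helper name, the 10-second timeout, and the printed messages are illustrative assumptions, not part of the original post.

from urllib import request, error

def fetch(url, headers, timeout=10):
    """Return the decoded page text, or None if the request fails."""
    # The timeout value and the exception handling here are illustrative
    # additions; the original example calls urlopen without either.
    req = request.Request(url, headers=headers)
    try:
        with request.urlopen(req, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except error.HTTPError as e:
        print("HTTP error", e.code, "for", url)
    except error.URLError as e:
        print("failed to reach", url, ":", e.reason)
    return None

With this helper, spider() could skip writing a file when fetch() returns None, so one bad URL does not abort the crawl of the remaining pages.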

