py3.6爬取图片

帅比萌擦擦* 提交于 2019-12-28 05:11:20

使用 Python 3.6 爬取图片,并支持断点续传:已下载完成的图集会记录到日志文件中,再次运行时自动跳过。

完整代码如下:
```python
import urllib.request
import bs4
import re
import os
from urllib import request
def getHtml(url):
    """Fetch *url* while presenting a desktop-browser User-Agent header.

    Args:
        url: Address to request.

    Returns:
        The raw, undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated unchanged from ``urlopen`` when the
            request cannot be completed.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.103 Safari/537.36"
    )
    headers = {"User-Agent": user_agent}
    # Named `req` (not `request`) so the module-level
    # `from urllib import request` is not shadowed inside this function.
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()

def parse(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The page bytes come from ``getHtml``; they are parsed with the built-in
    ``html.parser`` backend and UTF-8 is declared as the source encoding.
    """
    return bs4.BeautifulSoup(
        getHtml(url),
        features='html.parser',
        from_encoding="utf-8",
    )


def get_ok(uu, page_count):
    """Crawl the first *page_count* listing pages found under *uu*.

    Page 1 is the listing root *uu* itself; page N (N >= 2) is requested as
    ``uu + "N.html"``. Each page URL is handed to ``get_picmm``.

    Args:
        uu: Listing root URL. The page suffix is appended directly, so it is
            presumably expected to end with "/" (the script's own caller
            passes it that way).
        page_count: Number of listing pages to visit. Values of 0 or less
            visit nothing.
    """
    # The loop is driven by page_count instead of a fixed upper bound, so the
    # caller's argument actually controls how many pages are crawled.
    for page in range(1, page_count + 1):
        listing_url = uu if page == 1 else uu + str(page) + ".html"
        get_picmm(listing_url)
def get_picmm(x_url):
    """Print *x_url*, parse that page and pass its album list to ``cre_urll``.

    The album list is the first ``ul`` found inside the first element
    returned by ``find("div", "boxs")`` on the parsed page.
    """
    print('x_url', x_url)
    page = parse(x_url)
    container = page.find("div", "boxs")
    album_list = container.find("ul")
    cre_urll(album_list)
def cre_urll(name):
    """Start a download for every album entry found in *name*.

    *name* is iterated item by item. Items whose ``len()`` is 10 or less are
    ignored (presumably whitespace text nodes between list entries - verify
    against the page markup). For every other item the album id is taken from
    the first link's ``href``, the title from the ``alt`` text of the image in
    that link, and the image count from the last four characters of the title.
    """
    # Deletes the bracket and counter characters around the trailing number.
    strip_chars = str.maketrans('', '', '[]图列')
    for n in name:
        if len(n) <= 10:
            continue
        anchor = n.find('a')
        p_href = anchor.get('href')
        for fragment in ('https://www.meitulu.com/item/', '.html'):
            p_href = p_href.replace(fragment, '')
        p_title = anchor.find('img').get('alt')
        print('p_title', p_title)
        tail = p_title[-4:]
        p_url_list = tail.translate(strip_chars)
        get_foot_page(int(p_url_list), p_href, p_title)
def get_foot_page(p_url_list, p_href, p_title,
                  log_path="D:\\XX\\pic_nima\\muluguochan.txt"):
    """Download all images of one album unless the log marks it as done.

    The album's base image URL is looked up in the text log at *log_path*.
    If it is already recorded the album is skipped; otherwise images
    ``1.jpg`` .. ``<p_url_list>.jpg`` are fetched through ``save_pic`` and the
    base URL is appended to the log afterwards, so an interrupted album is
    retried on the next run.

    Args:
        p_url_list: Number of images in the album.
        p_href: Album id used to build the image URLs.
        p_title: Album title, forwarded to ``save_pic``.
        log_path: Text file listing the albums that finished downloading.
            Defaults to the location originally hard-coded here.
    """
    pic_url = 'https://mtl.gzhuibei.com/images/img/' + str(p_href) + "/"

    # Make sure the log's folder exists so the first run does not fail.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    try:
        with open(log_path, mode='r', encoding='utf-8') as log_file:
            finished = log_file.read()
    except FileNotFoundError:
        # No log yet: nothing has been downloaded so far.
        finished = ''

    # Membership test rather than an exact count: an album logged more than
    # once is still an album that has been downloaded.
    if pic_url in finished:
        print('已下载,跳过', pic_url)
        return

    print('当前下载', pic_url)
    for i in range(1, p_url_list + 1):
        save_pic(p_title, pic_url + str(i) + '.jpg', i)

    # Recorded only after every image was saved; the handle is closed at once
    # so the entry reaches the disk.
    with open(log_path, mode='a', encoding='utf-8') as log_file:
        log_file.write(pic_url + ",\n")
def save_pic(title, pic_url, i, base_dir="D:\\XX\\pic_nima\\国产\\"):
    """Download one image into the folder of its album.

    The file is stored as ``<base_dir>/<title>/<i>.jpg``; the album folder is
    created when it does not exist yet.

    Args:
        title: Album title, used as the folder name.
        pic_url: Address of the image to download.
        i: Image number inside the album, used as the file name.
        base_dir: Root folder for all albums. Defaults to the location
            originally hard-coded here.
    """
    our_dir = os.path.join(base_dir, title)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(our_dir, exist_ok=True)
    urllib.request.urlretrieve(
        pic_url, filename=os.path.join(our_dir, str(i) + ".jpg")
    )
if __name__ == '__main__':
    # Script entry point: start the crawl at the "guochan" listing root and
    # pass 199 as the page count.
    x_url = "https://www.meitulu.com/guochan/"
    get_ok(uu=x_url, page_count=199)
```

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!