用 Java 爬取视频和图片

<!-- Apache HttpClient: used by DownloadVideoAndImage.getHtmlByURL to issue the HTTP GET for each listing page. -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>

<!-- jsoup: parses the fetched HTML and selects the <video>/<img> elements with CSS selectors. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

<!-- Commons IO: provides FileUtils.copyURLToFile, which streams each media URL to a local file. -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.7</version>
</dependency>

<!-- Commons Lang 3. NOTE(review): the class shown in this article does not import anything
     from this library; confirm whether the dependency is actually needed. -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.10</version>
</dependency>

package com.download.util;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.StatusLine;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.UUID;

/**
 * Small crawler that fetches listing pages from ibaotu.com, extracts the video and
 * image URLs with jsoup selectors and saves every file under a local folder.
 *
 * <p>Three entry points are offered: {@link #downloadVideoByURL} for a single page and
 * {@link #downloadVideoByPage} / {@link #downloadImagePage} for a run of consecutive pages.
 * Every saved file gets a random UUID-based name, so repeated runs never overwrite
 * earlier downloads.
 *
 * @author Administrator
 */
public class DownloadVideoAndImage {

    /** Listing page URL is {@code PAGE_URL_PREFIX + pageNumber + PAGE_URL_SUFFIX}. */
    private static final String PAGE_URL_PREFIX = "https://ibaotu.com/shipin/7-0-0-0-0-";
    private static final String PAGE_URL_SUFFIX = ".html";

    /** CSS selectors for the media elements on a listing page. */
    private static final String VIDEO_SELECTOR = "div.media-list div.video-play video";
    private static final String IMAGE_SELECTOR = "div.media-list div.show-image>img ";

    /** Timeouts for a single media download, so a stalled server cannot block a thread forever. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
    private static final int READ_TIMEOUT_MILLIS = 60_000;

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /**
     * Demo entry point: downloads pages {@code startPage..endPage} (both inclusive) three
     * times in parallel, once through each of the public download methods.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // First page to fetch.
        final int startPage = 1;
        // Last page to fetch (inclusive).
        final int endPage = 5;
        // Root folder for everything that is downloaded.
        final String savePath = "D:" + File.separator + "Desktop" + File.separator + "download";
        // Charset used to decode the listing pages.
        final String charset = "utf-8";

        Thread thread1 = new Thread(() -> {
            for (int i = startPage; i <= endPage; i++) {
                String url = pageUrl(i);
                System.out.println("正在下载第" + i + "页video。");
                try {
                    downloadVideoByURL(url, savePath, charset);
                    // Only report success when the page really went through.
                    System.out.println("第" + i + "页video下载完成");
                } catch (Exception e) {
                    // A failing page must not stop the remaining pages.
                    e.printStackTrace();
                }
            }
        });

        Thread thread2 = new Thread(() -> {
            try {
                downloadVideoByPage(startPage, endPage, savePath, charset);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });

        Thread thread3 = new Thread(() -> {
            try {
                downloadImagePage(startPage, endPage, savePath, charset);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });

        thread1.start();
        thread2.start();
        thread3.start();
    }


    /**
     * Downloads every video found on one listing page into
     * {@code savePath/downloadVideoByURL}.
     *
     * @param url      address of the listing page
     * @param savePath root folder for downloads
     * @param charset  charset used to decode the page
     * @throws IOException if the page cannot be fetched or a video cannot be saved
     */
    public static void downloadVideoByURL(String url, String savePath, String charset) throws IOException {
        downloadMedia(url, VIDEO_SELECTOR, "src", "downloadVideoByURL", ".mp4", savePath, charset);
        System.out.println("video下载完毕");
    }

    /**
     * Downloads the videos of consecutive listing pages, starting at {@code page}, into
     * {@code savePath/downloadVideoByPage}. Crawling stops at the first page that
     * contains no video.
     *
     * @param page     first page number
     * @param savePath root folder for downloads
     * @param charset  charset used to decode the pages
     * @throws Exception if a page cannot be fetched or a video cannot be saved
     */
    public static void downloadVideoByPage(int page, String savePath, String charset) throws Exception {
        downloadVideoByPage(page, Integer.MAX_VALUE, savePath, charset);
    }

    /**
     * Same as {@link #downloadVideoByPage(int, String, String)} but never goes past
     * {@code endPage}.
     *
     * @param page     first page number
     * @param endPage  last page number (inclusive)
     * @param savePath root folder for downloads
     * @param charset  charset used to decode the pages
     * @throws Exception if a page cannot be fetched or a video cannot be saved
     */
    public static void downloadVideoByPage(int page, int endPage, String savePath, String charset) throws Exception {
        // Iterative on purpose: the former unconditional self-call had no base case.
        // A long counter keeps the loop well-defined even for endPage == Integer.MAX_VALUE.
        for (long current = page; current <= endPage; current++) {
            System.out.println("downloadVideoByPage开始下载第" + current + "页video。");
            int found = downloadMedia(pageUrl(current), VIDEO_SELECTOR, "src",
                    "downloadVideoByPage", ".mp4", savePath, charset);
            if (found == 0) {
                // An empty page is taken as the end of the listing.
                System.out.println("downloadVideoByPage第" + current + "页没有可下载的video，停止翻页。");
                return;
            }
            System.out.println("downloadVideoByPage第" + current + "页video下载完毕。");
        }
    }


    /**
     * Downloads the images of consecutive listing pages, starting at {@code page}, into
     * {@code savePath/downloadImagePage}. Crawling stops at the first page that
     * contains no image.
     *
     * @param page     first page number
     * @param savePath root folder for downloads
     * @param charset  charset used to decode the pages
     * @throws Exception if a page cannot be fetched or an image cannot be saved
     */
    public static void downloadImagePage(int page, String savePath, String charset) throws Exception {
        downloadImagePage(page, Integer.MAX_VALUE, savePath, charset);
    }

    /**
     * Same as {@link #downloadImagePage(int, String, String)} but never goes past
     * {@code endPage}.
     *
     * @param page     first page number
     * @param endPage  last page number (inclusive)
     * @param savePath root folder for downloads
     * @param charset  charset used to decode the pages
     * @throws Exception if a page cannot be fetched or an image cannot be saved
     */
    public static void downloadImagePage(int page, int endPage, String savePath, String charset) throws Exception {
        for (long current = page; current <= endPage; current++) {
            System.out.println("downloadImagePage正在下载第" + current + "页image。");
            int found = downloadMedia(pageUrl(current), IMAGE_SELECTOR, "data-url",
                    "downloadImagePage", ".jpg", savePath, charset);
            if (found == 0) {
                // An empty page is taken as the end of the listing.
                System.out.println("downloadImagePage第" + current + "页没有可下载的image，停止翻页。");
                return;
            }
            System.out.println("downloadImagePage第" + current + "页image下载完毕。");
        }
    }


    /**
     * Fetches a page and returns its body as text.
     *
     * @param url     address of the page
     * @param charset charset used when the response does not declare one
     * @return the response body
     * @throws IOException if the request fails, the status is not 200 or the body is missing
     */
    public static String getHtmlByURL(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Both the client and the response hold a connection; close them on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            StatusLine statusLine = httpResponse.getStatusLine();
            int statusCode = statusLine.getStatusCode();
            if (statusCode != HTTP_OK) {
                // Fail loudly instead of handing an error sentence to the HTML parser.
                throw new IOException("Unexpected HTTP status " + statusCode + " for " + url);
            }
            HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                throw new IOException("Empty response body for " + url);
            }
            return EntityUtils.toString(entity, charset);
        }
    }

    /**
     * Parses HTML text into a jsoup document.
     *
     * @param html the HTML text
     * @return the parsed document
     */
    public static Document getDocumentByHtml(String html) {
        return Jsoup.parse(html);
    }

    /**
     * Makes sure the download root folder exists, creating it (and any missing parents)
     * when necessary. A failure is reported on stderr only.
     *
     * @param savePath root folder for downloads
     */
    public static void createSaveFileFolder(String savePath) {
        File folder = new File(savePath);
        // mkdirs() also returns false when another thread created the folder first,
        // hence the final isDirectory() check before complaining.
        if (!folder.exists() && !folder.mkdirs() && !folder.isDirectory()) {
            System.err.println("无法创建保存目录: " + savePath);
        }
    }

    /**
     * Generates a random UUID string with the dashes removed.
     *
     * @return 32 hexadecimal characters
     */
    public static String getUUIDString() {
        return UUID.randomUUID().toString().replace("-", "");
    }

    /** Builds the listing page URL for a page number. */
    private static String pageUrl(long page) {
        return PAGE_URL_PREFIX + page + PAGE_URL_SUFFIX;
    }

    /**
     * Fetches one listing page and saves every media file it references.
     *
     * @param url       address of the listing page
     * @param selector  CSS selector matching the media elements
     * @param attribute attribute of a matched element that holds the media URL
     * @param subFolder folder below {@code savePath} that receives the files
     * @param extension file extension including the dot
     * @param savePath  root folder for downloads
     * @param charset   charset used to decode the page
     * @return number of media URLs found on the page
     * @throws IOException if the page cannot be fetched or a file cannot be saved
     */
    private static int downloadMedia(String url, String selector, String attribute, String subFolder,
                                     String extension, String savePath, String charset) throws IOException {
        String content = getHtmlByURL(url, charset);
        Document document = getDocumentByHtml(content);
        Elements elements = document.select(selector);
        createSaveFileFolder(savePath);

        int found = 0;
        for (Element element : elements) {
            String mediaUrl = toAbsoluteUrl(element.attr(attribute));
            if (mediaUrl == null) {
                // Element without a usable URL: nothing to download.
                continue;
            }
            found++;
            System.out.println(mediaUrl);
            File target = new File(savePath, subFolder + File.separator + getUUIDString() + extension);
            FileUtils.copyURLToFile(new URL(mediaUrl), target, CONNECT_TIMEOUT_MILLIS, READ_TIMEOUT_MILLIS);
        }
        return found;
    }

    /**
     * Turns an attribute value into a downloadable URL: absolute http(s) URLs are kept,
     * anything else (typically a protocol-relative {@code //host/path}) gets an
     * {@code https:} prefix.
     *
     * @return the URL, or {@code null} when the attribute is missing or blank
     */
    private static String toAbsoluteUrl(String rawUrl) {
        if (rawUrl == null) {
            return null;
        }
        String trimmed = rawUrl.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        if (trimmed.startsWith("https://") || trimmed.startsWith("http://")) {
            return trimmed;
        }
        return "https:" + trimmed;
    }
}

来源：oschina

链接：https://my.oschina.net/u/4459974/blog/4310648

标签

apache

element

jsoup

httpclient