This example was written before I knew about HttpClient. It does crawl successfully, but the fetching itself should really be done with HttpClient (see the sketch after the code below).
Code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import java.net.URL;

@RunWith(SpringRunner.class)
@SpringBootTest
public class JsoupTest {

    @Test
    public void testCastDom() throws Exception {
        Document document = Jsoup.parse(new URL("https://www.tupianzj.com/meinv/mm/dabomeinv/"), 10000);
        Elements ele = document.getElementsByTag("img"); // look up by tag name
        // Elements ele = document.getElementsByAttribute("href"); // look up by attribute
        // Elements ele = document.getElementsByAttributeValue("href", "/meinv/20200224/204775.html"); // look up by attribute name and value
        // Elements ele = document.select("head"); // select by id/class/attr; you must locate the tag first
        // Elements ele = document.select("span");
        // Set<String> str = document.classNames(); // read class names; you must locate the tag first
        // Elements ele = document.select("div#head"); // combined selector: div with id "head"
        // Elements ele = document.select("div.warp"); // combined selector: div with class "warp"
        // Elements ele = document.select("li a"); // combined selector: a tags under li
        // Elements ele = document.select("span[class]"); // span tags that carry a class attribute
        // Elements ele = document.select("li span"); // parent-child relationship: every span under li
        // ele.forEach(e -> System.out.println(e.toString()));
        for (Element e : ele) {
            System.out.println(e.toString());
            String src = e.attr("src"); // extract the src attribute
        }
    }
}
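As noted above, the fetching itself is better done with HttpClient, with Jsoup used purely as an HTML parser. Here is a minimal sketch, assuming Apache HttpClient 4.x is on the classpath; the class name HttpClientJsoupDemo is illustrative, not part of the original code:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class HttpClientJsoupDemo {

    public static void main(String[] args) throws Exception {
        // HttpClient handles the HTTP request; Jsoup only parses the returned HTML
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://www.tupianzj.com/meinv/mm/dabomeinv/");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                Document document = Jsoup.parse(html); // parse the fetched HTML string
                for (Element img : document.getElementsByTag("img")) {
                    System.out.println(img.attr("src"));
                }
            }
        }
    }
}

This split keeps the selector code identical to the test above while letting HttpClient handle connection pooling, headers, and retries.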
Example: crawling the Ghost Blade (鬼刀) Ice Princess wallpapers from toopic.cn. The code is not written to standard and is for reference only:
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取壁纸社鬼刀冰公主图片 {

    public static void main(String[] args) {
        int i = 157;
        while (i < 276) { // pages to crawl: 158 through 276
            i++;
            try {
                // 1. Build the URL of the page to crawl
                String urlPath = "https://www.toopic.cn/dnbz/" + i + ".html";
                // Fetch and parse the page
                Document document = Jsoup.connect(urlPath).get();
                // System.out.println(document); // print the parsed document
                // Locate the preview area, then the img tags inside it
                Elements elements = document.select(".preview-pic td");
                Elements element = elements.select("img");
                String alt = null;
                for (Element element2 : element) {
                    // Read the attributes of the img node
                    String src = element2.attr("src");
                    alt = element2.attr("alt");
                    // Open a connection to the image URL taken from src
                    URL url = new URL("https://www.toopic.cn" + src);
                    URLConnection urlConnection = url.openConnection();
                    // Read the image as a byte stream
                    InputStream is = urlConnection.getInputStream();
                    // Write the bytes out to a file
                    OutputStream os = new FileOutputStream(new File("E:\\图片", alt + ".jpg"));
                    int i2;
                    while ((i2 = is.read()) != -1) { // read() returns -1 at end of stream
                        os.write(i2);
                    }
                    os.close();
                    is.close();
                }
                // Print a progress message
                System.out.println(alt + "=========================================> download complete");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
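The byte-at-a-time copy loop above (is.read() / os.write()) works but issues one I/O call per byte. A java.nio copy is shorter, faster, and closes the stream even on failure. This is a sketch only; the helper name download and its parameters are illustrative, and the directory E:\图片 and .jpg suffix simply mirror the example above:

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class ImageDownloader {

    // Download one image to the target directory; names are example values
    static void download(String imageUrl, String fileName) throws Exception {
        Path target = Paths.get("E:\\图片", fileName + ".jpg");
        try (InputStream is = new URL(imageUrl).openStream()) {
            // Files.copy streams the bytes in larger chunks, and the
            // try-with-resources block closes the stream even if the copy fails
            Files.copy(is, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}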
Crawling a novel from Biquge (笔趣阁):
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取笔趣阁小说 {

    public static void main(String[] args) {
        String urlPath = "http://www.shuquge.com/txt/248/index.html";
        Map<String, String> path = new HashMap<String, String>();
        try {
            // Parse the table of contents and collect chapter title -> href
            Document document = Jsoup.connect(urlPath).get();
            Elements elements = document.select(".listmain a");
            for (Element element : elements) {
                String href = element.attr("href");
                String text = element.text();
                path.put(text, href);
            }
            int index = 0;
            for (Map.Entry<String, String> map : path.entrySet()) {
                // One .txt file per chapter
                File file = new File("E://小说//" + map.getKey() + ".txt");
                if (!file.exists()) {
                    file.createNewFile();
                }
                // Fetch the chapter page and pull out the title and body
                urlPath = "http://www.shuquge.com/txt/248/" + map.getValue();
                document = Jsoup.connect(urlPath).get();
                Elements div = document.select(".content");
                Elements h1 = div.select("h1");
                Elements content = div.select("#content");
                // Write the chapter out as GBK-encoded text
                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(file), "GBK"));
                bw.append(h1.text());
                bw.newLine();
                bw.append(content.text());
                bw.newLine();
                bw.flush();
                bw.close();
                index++;
                System.out.println(map.getKey() + "========================> chapter " + index + " done");
                Thread.sleep(10); // brief pause between requests
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
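Writing text files with an explicit charset is less error-prone with java.nio and try-with-resources, which guarantees the writer is closed even when a request fails mid-chapter. A minimal sketch of just the chapter-writing step; the helper name writeChapter and its parameters are illustrative, and GBK matches the encoding used in the example above:

import java.io.BufferedWriter;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ChapterWriter {

    static void writeChapter(String dir, String title, String body) throws Exception {
        // try-with-resources flushes and closes the writer automatically
        try (BufferedWriter bw = Files.newBufferedWriter(
                Paths.get(dir, title + ".txt"), Charset.forName("GBK"))) {
            bw.write(title);
            bw.newLine();
            bw.write(body);
        }
    }
}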
Example 3:
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取鬼刀高清图片 {

    public static void main(String[] args) {
        int index = 1;
        String urlPath = null;
        int i = 1;
        try {
            while (index <= 1210) { // 1210 pages in total
                System.out.println("\n\nPage " + index);
                if (index == 1) {
                    urlPath = "http://pic.netbian.com/index.html"; // the first page has no index suffix
                } else {
                    urlPath = "http://pic.netbian.com/index_" + index + ".html";
                }
                Document document = Jsoup.connect(urlPath).get(); // fetch and parse the page
                Elements elements = document.select("#main .clearfix"); // locate the list container
                Elements elements2 = elements.select("img"); // the img tags inside it
                for (Element element : elements2) { // loop over every img found
                    String src = element.attr("src"); // read the src attribute
                    System.out.println(src);
                    URL url = new URL("http://pic.netbian.com" + src); // full image URL
                    URLConnection urlConnection = url.openConnection(); // open the image
                    InputStream is = urlConnection.getInputStream(); // image bytes as a stream
                    OutputStream os = new FileOutputStream(
                            new File("E:\\图片", "4K高清壁纸" + i + ".jpg")); // output stream
                    int i1;
                    while ((i1 = is.read()) != -1) { // read() returns -1 at end of stream
                        os.write(i1); // write the byte to the file
                    }
                    os.close();
                    is.close();
                    System.out.println("4K高清壁纸" + i + "======================> download complete");
                    i++; // the counter keeps file names unique so nothing is overwritten
                }
                index++; // next page
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
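All three crawlers fetch pages with Jsoup's default connection settings and no delay between pages. Real sites sometimes reject Jsoup's default User-Agent or respond slowly, so it usually helps to set one explicitly, give the request a timeout, and pause between fetches. A minimal sketch; the User-Agent string, timeout, and sleep interval are arbitrary example values:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetch {

    static Document fetch(String url) throws Exception {
        // A browser-like User-Agent plus an explicit timeout; some sites
        // refuse requests carrying Jsoup's default User-Agent
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(10000)      // 10-second connect/read timeout
                .get();
        Thread.sleep(500);           // pause between requests to avoid hammering the site
        return doc;
    }
}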
Source: oschina
链接:https://my.oschina.net/u/4580084/blog/4913033