This example was written before I knew about HttpClient. It does crawl successfully, but the fetching itself should really be done with HttpClient (see the sketch after the code below).
Code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import java.net.URL;

@RunWith(SpringRunner.class)
@SpringBootTest
public class JsoupTest {

    @Test
    public void testCastDom() throws Exception {
        Document document = Jsoup.parse(new URL("https://www.tupianzj.com/meinv/mm/dabomeinv/"), 10000);
        Elements ele = document.getElementsByTag("img"); // look up by tag name
        // Elements ele = document.getElementsByAttribute("href"); // look up by attribute
        // Elements ele = document.getElementsByAttributeValue("href", "/meinv/20200224/204775.html"); // look up by attribute name and value
        // Elements ele = document.select("head"); // select by id/class/attr; you must locate the tag first
        // Elements ele = document.select("span");
        // Set<String> str = document.classNames(); // read class names; you must locate the tag first
        // Elements ele = document.select("div#head"); // combined selector: div with id "head"
        // Elements ele = document.select("div.warp"); // combined selector: div with class "warp"
        // Elements ele = document.select("li a"); // combined selector: a tags under li
        // Elements ele = document.select("span[class]"); // span tags that carry a class attribute
        // Elements ele = document.select("li span"); // parent-child relationship: every span under li
        // ele.forEach(e -> System.out.println(e.toString()));
        for (Element e : ele) {
            System.out.println(e.toString());
            String src = e.attr("src"); // extract the src attribute
        }
    }
}
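As noted above, the fetching itself is better done with HttpClient, with Jsoup used purely as an HTML parser. Here is a minimal sketch, assuming Apache HttpClient 4.x is on the classpath; the class name HttpClientJsoupDemo is illustrative, not part of the original code:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class HttpClientJsoupDemo {

    public static void main(String[] args) throws Exception {
        // HttpClient handles the HTTP request; Jsoup only parses the returned HTML
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://www.tupianzj.com/meinv/mm/dabomeinv/");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                Document document = Jsoup.parse(html); // parse the fetched HTML string
                for (Element img : document.getElementsByTag("img")) {
                    System.out.println(img.attr("src"));
                }
            }
        }
    }
}

This split keeps the selector code identical to the test above while letting HttpClient handle connection pooling, headers, and retries.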
Example: crawling the Ghost Blade (鬼刀) Ice Princess wallpapers from toopic.cn. The code is not written to standard and is for reference only:
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取壁纸社鬼刀冰公主图片 {

    public static void main(String[] args) {
        int i = 157;
        while (i < 276) { // pages to crawl: 158 through 276
            i++;
            try {
                // 1. Build the URL of the page to crawl
                String urlPath = "https://www.toopic.cn/dnbz/" + i + ".html";
                // Fetch and parse the page
                Document document = Jsoup.connect(urlPath).get();
                // System.out.println(document); // print the parsed document
                // Locate the preview area, then the img tags inside it
                Elements elements = document.select(".preview-pic td");
                Elements element = elements.select("img");
                String alt = null;
                for (Element element2 : element) {
                    // Read the attributes of the img node
                    String src = element2.attr("src");
                    alt = element2.attr("alt");
                    // Open a connection to the image URL taken from src
                    URL url = new URL("https://www.toopic.cn" + src);
                    URLConnection urlConnection = url.openConnection();
                    // Read the image as a byte stream
                    InputStream is = urlConnection.getInputStream();
                    // Write the bytes out to a file
                    OutputStream os = new FileOutputStream(new File("E:\\图片", alt + ".jpg"));
                    int i2;
                    while ((i2 = is.read()) != -1) { // read() returns -1 at end of stream
                        os.write(i2);
                    }
                    os.close();
                    is.close();
                }
                // Print a progress message
                System.out.println(alt + "=========================================> download complete");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
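The byte-at-a-time copy loop above (is.read() / os.write()) works but issues one I/O call per byte. A java.nio copy is shorter, faster, and closes the stream even on failure. This is a sketch only; the helper name download and its parameters are illustrative, and the directory E:\图片 and .jpg suffix simply mirror the example above:

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class ImageDownloader {

    // Download one image to the target directory; names are example values
    static void download(String imageUrl, String fileName) throws Exception {
        Path target = Paths.get("E:\\图片", fileName + ".jpg");
        try (InputStream is = new URL(imageUrl).openStream()) {
            // Files.copy streams the bytes in larger chunks, and the
            // try-with-resources block closes the stream even if the copy fails
            Files.copy(is, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}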
Crawling a novel from Biquge (笔趣阁):
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取笔趣阁小说 {

    public static void main(String[] args) {
        String urlPath = "http://www.shuquge.com/txt/248/index.html";
        Map<String, String> path = new HashMap<String, String>();
        try {
            // Parse the table of contents and collect chapter title -> href
            Document document = Jsoup.connect(urlPath).get();
            Elements elements = document.select(".listmain a");
            for (Element element : elements) {
                String href = element.attr("href");
                String text = element.text();
                path.put(text, href);
            }
            int index = 0;
            for (Map.Entry<String, String> map : path.entrySet()) {
                // One .txt file per chapter
                File file = new File("E://小说//" + map.getKey() + ".txt");
                if (!file.exists()) {
                    file.createNewFile();
                }
                // Fetch the chapter page and pull out the title and body
                urlPath = "http://www.shuquge.com/txt/248/" + map.getValue();
                document = Jsoup.connect(urlPath).get();
                Elements div = document.select(".content");
                Elements h1 = div.select("h1");
                Elements content = div.select("#content");
                // Write the chapter out as GBK-encoded text
                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(file), "GBK"));
                bw.append(h1.text());
                bw.newLine();
                bw.append(content.text());
                bw.newLine();
                bw.flush();
                bw.close();
                index++;
                System.out.println(map.getKey() + "========================> chapter " + index + " done");
                Thread.sleep(10); // brief pause between requests
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
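Writing text files with an explicit charset is less error-prone with java.nio and try-with-resources, which guarantees the writer is closed even when a request fails mid-chapter. A minimal sketch of just the chapter-writing step; the helper name writeChapter and its parameters are illustrative, and GBK matches the encoding used in the example above:

import java.io.BufferedWriter;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ChapterWriter {

    static void writeChapter(String dir, String title, String body) throws Exception {
        // try-with-resources flushes and closes the writer automatically
        try (BufferedWriter bw = Files.newBufferedWriter(
                Paths.get(dir, title + ".txt"), Charset.forName("GBK"))) {
            bw.write(title);
            bw.newLine();
            bw.write(body);
        }
    }
}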
Example 3:
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class 爬取鬼刀高清图片 {

    public static void main(String[] args) {
        int index = 1;
        String urlPath = null;
        int i = 1;
        try {
            while (index <= 1210) { // 1210 pages in total
                System.out.println("\n\nPage " + index);
                if (index == 1) {
                    urlPath = "http://pic.netbian.com/index.html"; // the first page has no index suffix
                } else {
                    urlPath = "http://pic.netbian.com/index_" + index + ".html";
                }
                Document document = Jsoup.connect(urlPath).get(); // fetch and parse the page
                Elements elements = document.select("#main .clearfix"); // locate the list container
                Elements elements2 = elements.select("img"); // the img tags inside it
                for (Element element : elements2) { // loop over every img found
                    String src = element.attr("src"); // read the src attribute
                    System.out.println(src);
                    URL url = new URL("http://pic.netbian.com" + src); // full image URL
                    URLConnection urlConnection = url.openConnection(); // open the image
                    InputStream is = urlConnection.getInputStream(); // image bytes as a stream
                    OutputStream os = new FileOutputStream(
                            new File("E:\\图片", "4K高清壁纸" + i + ".jpg")); // output stream
                    int i1;
                    while ((i1 = is.read()) != -1) { // read() returns -1 at end of stream
                        os.write(i1); // write the byte to the file
                    }
                    os.close();
                    is.close();
                    System.out.println("4K高清壁纸" + i + "======================> download complete");
                    i++; // the counter keeps file names unique so nothing is overwritten
                }
                index++; // next page
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
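All three crawlers fetch pages with Jsoup's default connection settings and no delay between pages. Real sites sometimes reject Jsoup's default User-Agent or respond slowly, so it usually helps to set one explicitly, give the request a timeout, and pause between fetches. A minimal sketch; the User-Agent string, timeout, and sleep interval are arbitrary example values:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetch {

    static Document fetch(String url) throws Exception {
        // A browser-like User-Agent plus an explicit timeout; some sites
        // refuse requests carrying Jsoup's default User-Agent
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(10000)      // 10-second connect/read timeout
                .get();
        Thread.sleep(500);           // pause between requests to avoid hammering the site
        return doc;
    }
}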
Source: oschina
链接:https://my.oschina.net/u/4580084/blog/4913033