Jsoup : How to parse multiple HTML files from local drive?

馋奶兔 提交于 2020-01-06 09:01:14

问题


I've got multiple HTML files on my hdd to parse with Jsoup. I've been able to parse one file but not multiple files. I would like to parse all the files of a folder.

I wrote this code, which extracts text (within certain ids) from an HTML file (named "file.htm" in the folder "C:/html"):

package jsouptest;

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses one local HTML file with Jsoup and prints the text of every
 * {@code <p>} element nested inside a {@code <div>} whose id starts with "desk".
 */
public class Main {

    /**
     * Parses {@code C:/html/file.htm} as UTF-8 and prints each matching paragraph,
     * preceded by a blank line.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File input = new File("C:/html/file.htm");

        try {
            // No base URI is supplied (empty string).
            Document doc = Jsoup.parse(input, "UTF-8", "");

            // "div[id^=desk] p": <p> descendants of any <div> whose id begins with "desk".
            Elements ids = doc.select("div[id^=desk] p");

            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report the failure instead of swallowing it, so a missing or
            // unreadable file does not look like "no matches".
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }

}

How can I apply this code to all the files in the folder "C:/html"? Thanks


回答1:


Extract the HTML-parsing code into its own method, then list the contents of your directory and call that method for each file:

   File input = new File("C:/html");
   // listFiles() returns null when the path is not a directory or cannot be read,
   // so guard before iterating to avoid a NullPointerException.
   File[] st = input.listFiles();
   if (st != null) {
       for (File candidate : st) {
           if (candidate.isFile()) { // other condition like name ends in html
               parse(candidate);
           }
       }
   }

so your code should look like this:

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses every regular file directly inside {@code C:/html} with Jsoup and prints
 * the text of each {@code <p>} nested in a {@code <div>} whose id starts with "desk".
 */
public class Main {

    /**
     * Lists {@code C:/html} and hands each regular file to {@link #parse(File)}.
     * Subdirectories are skipped, not descended into.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File input = new File("C:/html");
        File[] st = input.listFiles();
        if (st == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            System.err.println("Cannot list " + input + ": not a directory or not readable");
            return;
        }
        for (File candidate : st) {
            if (candidate.isFile()) { // other condition like name ends in html
                parse(candidate);
            }
        }
    }

    /**
     * Parses one HTML file as UTF-8 and prints each matching paragraph, preceded by a blank line.
     * A file that cannot be read is reported on stderr and skipped.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            // No base URI is supplied (empty string).
            Document doc = Jsoup.parse(input, "UTF-8", "");

            // "div[id^=desk] p": <p> descendants of any <div> whose id begins with "desk".
            Elements ids = doc.select("div[id^=desk] p");

            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report rather than swallow, so one bad file is visible but does not stop the run.
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }
}



回答2:


I have written a program that reads a folder and its subfolders, starting from a given path, and writes the results to a CSV file:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Walks a directory tree, parses every regular file it finds with Jsoup, and writes
 * each document's title and body text as rows of a CSV file. The tree is also
 * echoed to the console, indented by depth.
 */
public class fixingCode {

    /** Current recursion depth of {@link #Process}; controls the console indentation. */
    static int spc_count = -1;

    /**
     * Opens the CSV output file and walks the hard-coded source directory.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println("--------------------------Program started--------------------------");

        File input = new File(
                "C:\\My Web Sites\\\\library\\math");//reading file from parent folder 

        // try-with-resources flushes and closes the writer, and cannot hit a
        // NullPointerException when the output file fails to open.
        try (FileWriter writer = new FileWriter("c:\\Temp\\results.csv")) {//writing file on path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println("--------------------------Program End--------------------------");
    }

    /**
     * Prints {@code aFile} in the console tree. If it is a directory, exports every
     * regular file directly inside it and then recurses into each child exactly once.
     *
     * @param aFile  file or directory to process
     * @param writer open CSV writer that receives the title/text rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;
        try {
            StringBuilder indent = new StringBuilder();
            for (int i = 0; i < spc_count; i++) {
                indent.append(' ');
            }
            String spcs = indent.toString();

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());

                File[] listOfFiles = aFile.listFiles();
                if (listOfFiles == null) {
                    // listFiles() returns null on an I/O error; check before touching the array.
                    System.out.println(spcs + " [ACCESS DENIED]");
                    return;
                }

                for (int i = 0; i < listOfFiles.length; i++) {
                    File child = listOfFiles[i];
                    if (child.isFile()) {// other condition like name ends in
                        exportDocument(child, i, writer);
                    }
                    // Recurse once per child: prints the tree entry and descends into subfolders.
                    Process(child, writer);
                }
            }
        } finally {
            // Restore the depth even if something below threw.
            spc_count--;
        }
    }

    /** Parses one file and writes its title and body text to the console and the CSV. */
    private static void exportDocument(File file, int index, FileWriter writer) {
        try {
            // A null charset lets Jsoup determine the encoding itself.
            Document doc = Jsoup.parse(file, null);

            String title = doc.title();
            String text = doc.body().text();

            System.out.println("title : " + "[" + index + "]" + title);
            System.out.println("text" + text);

            writer.append("title : " + "[" + index + "]");
            writer.append(',');
            writer.append(csvField(title));
            writer.append('\n');

            writer.append("text : " + "[" + index + "]");
            writer.append(',');
            writer.append(csvField(text));
            writer.append('\n');
        } catch (Exception e) {
            // Best-effort per file: one unreadable or malformed file must not stop the walk.
            System.err.println("Skipping " + file + ": " + e);
        }
    }

    /** Quotes a value for CSV when it contains a comma, double quote, or line break. */
    private static String csvField(String value) {
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        // Embedded double quotes are escaped by doubling them.
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

}


来源:https://stackoverflow.com/questions/26407402/jsoup-how-to-parse-multiple-html-files-from-local-drive

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!