问题
I've got multiple HTML files on my hdd to parse with Jsoup. I've been able to parse one file but not multiple files. I would like to parse all the files of a folder.
I wrote this code, which extracts text (within certain ids) from an HTML file (named "file.htm" in the folder "C:/html"):
package jsouptest;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses one local HTML file with Jsoup and prints the text of every
 * {@code <p>} element nested inside a {@code <div>} whose id starts with
 * "desk".
 */
public class Main {
    /**
     * Reads {@code C:/html/file.htm} as UTF-8 and prints each matching
     * paragraph's text to standard output, preceded by a blank line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html/file.htm");
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements ids = doc.select("div[id^=desk] p");
            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report the failure instead of swallowing it; otherwise a missing
            // or unreadable file just produces no output at all.
            System.err.println("Could not parse " + input.getPath());
            e.printStackTrace();
        }
    }
}
How can I apply this code to all the files in the folder "C:/html"? Thanks
回答1:
Extract the HTML-parsing code into a method, then list the contents of your directory and call that method for each file:
File input = new File("C:/html");
// listFiles() returns null (not an empty array) when the path is not a
// directory or cannot be read, so guard before iterating.
File[] st = input.listFiles();
if (st != null) {
    for (int i = 0; i < st.length; i++) {
        if (st[i].isFile()) { // other condition like name ends in html
            parse(st[i]);
        }
    }
}
so your code should look like this:
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses every regular file directly inside {@code C:/html} with Jsoup and
 * prints the text of each {@code <p>} element nested inside a {@code <div>}
 * whose id starts with "desk".
 */
public class Main {
    /**
     * Lists the folder and hands each regular file to {@link #parse(File)}.
     * Sub-directories are skipped (no recursion).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html");
        File[] st = input.listFiles();
        if (st == null) {
            // listFiles() yields null when the path is not a directory or an
            // I/O error occurs; without this guard the loop throws an NPE.
            System.err.println("Cannot list directory " + input.getPath());
            return;
        }
        for (int i = 0; i < st.length; i++) {
            if (st[i].isFile()) { // other condition like name ends in html
                parse(st[i]);
            }
        }
    }

    /**
     * Parses one HTML file as UTF-8 and prints each matching paragraph's text
     * to standard output, preceded by a blank line.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements ids = doc.select("div[id^=desk] p");
            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report and carry on so one unreadable file does not silently
            // vanish from the output or stop the remaining files.
            System.err.println("Could not parse " + input.getPath());
            e.printStackTrace();
        }
    }
}
回答2:
I have written a program that reads a folder and its subfolders for a given path and writes the results into a CSV file:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Walks a directory tree, prints an indented listing of it, and for every
 * regular file found in a directory parses it with Jsoup and appends its
 * title and body text to a CSV file.
 */
public class fixingCode {
    /** Current recursion depth of {@link #Process}; -1 while no call is active. */
    static int spc_count = -1;

    /**
     * Opens the results CSV, walks the hard-coded root folder and closes the
     * CSV again.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("--------------------------Program started--------------------------");
        // NOTE(review): the path literal contains a doubled separator after
        // "My Web Sites" — confirm this is intended.
        File input = new File(
                "C:\\My Web Sites\\\\library\\math");//reading file from parent folder
        // try-with-resources closes (and thereby flushes) the writer, and does
        // not dereference it if the constructor itself failed.
        try (FileWriter writer = new FileWriter("c:\\Temp\\results.csv")) {//writing file on path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("--------------------------Program End--------------------------");
    }

    /**
     * Prints {@code aFile} indented by the current depth. If it is a
     * directory, first writes a title row and a text row for each regular
     * file directly inside it, then recurses once into every child.
     *
     * @param aFile  file or directory to process
     * @param writer destination for the CSV rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;
        try {
            StringBuilder indent = new StringBuilder();
            for (int i = 0; i < spc_count; i++) {
                indent.append(" ");
            }
            String spcs = indent.toString();

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());
                File[] listOfFiles = aFile.listFiles();
                // Check for null BEFORE touching the array: listFiles() returns
                // null when the directory cannot be read.
                if (listOfFiles == null) {
                    System.out.println(spcs + " [ACCESS DENIED]");
                    return;
                }
                for (int i = 0; i < listOfFiles.length; i++) {
                    if (listOfFiles[i].isFile()) {// other condition like name ends in
                        try {
                            appendParsedFile(listOfFiles[i], i, writer);
                        } catch (IOException e) {
                            // Keep going: one unreadable file should not drop
                            // the rest of the directory.
                            System.err.println("Could not process " + listOfFiles[i].getPath());
                            e.printStackTrace();
                        }
                    }
                }
                // Recurse exactly once per child, outside the loop above, so
                // each entry is visited a single time.
                for (int j = 0; j < listOfFiles.length; j++) {
                    Process(listOfFiles[j], writer);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            spc_count--;
        }
    }

    /** Parses one HTML file, echoes its title/text and appends both as CSV rows. */
    private static void appendParsedFile(File htmlFile, int index, FileWriter writer)
            throws IOException {
        Document doc = Jsoup.parse(htmlFile, null);
        String title = doc.title();
        String text = doc.body().text();

        System.out.println("title : " + "[" + index + "]" + title);
        System.out.println("text" + text);

        writer.append("title : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(title));
        writer.append('\n');

        writer.append("text : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(text));
        writer.append('\n');
    }

    /** Quotes a value when it contains a comma, quote or line break, doubling embedded quotes. */
    private static String csvField(String value) {
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
来源:https://stackoverflow.com/questions/26407402/jsoup-how-to-parse-multiple-html-files-from-local-drive