问题
I have about 1,000,000 XML files and I am using XpathExpression with Java language to walk through the XML tags and get my considered data.
Imagine I have about 5000 tags for name, 5000 tags for family name, 5000 tags for age, and only 1 tag for date in each file. Now I want to repeat date tag to 5000 times too.
Blow code is runnable for XML files with Java programming with less than 20MB size, but I have files with more than 20MB size and it takes so many times to run and in some cases, I got Out of memory error in eclipse( I tried adding vmargs in the run configuration of Eclipse but it takes so much time and still so low.) I am pretty sure there is a problem with my array for repeting date tag and it is not optimized, I really appreciate if you mind and have a look at my code, in addition I should say that i am newbie to java:
package TEST;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import java.util.Arrays;
public class Data {
//function started
public static void main(String[] args) throws Exception
{
//Get Files
String doc ="MyFileNew";
String dump="";
int number =200;
for (int i=1; i<=201; i++) {
number ++;
dump = doc+number;
String fileName= "/root/MyFiles/" + dump + ".xml";
Document document = getDocument(fileName);
;
FileWriter fw = null;
BufferedWriter bw = null;
PrintWriter pw = null;
//Using Document Builder
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
documentBuilderFactory.setNamespaceAware(true);
DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
Document doc1 = documentBuilder.parse(fileName);
/*******Get attribute values using xpath******/
XPathFactory xpathFactory = XPathFactory.newInstance();
XPath xpath = xpathFactory.newXPath();
try {
fw = new FileWriter("/root/Results/" + dump + ".txt");
bw = new BufferedWriter(fw);
pw = new PrintWriter(bw);
//Printing Name tags
pw.println( "Name"+ evaluateXPath(document, "/xml/item/item[@key='Name']/text()") );
//Counting Name tags
XPathExpression expr1 = xpath.compile("count(/xml/item/item[@key='Name']/)");
Number result1 = (Number) expr1.evaluate(doc1, XPathConstants.NUMBER);
int n = result1.intValue();
//Printing FamilyName tags
pw.println( "FamilyName: " + evaluateXPath(document, "/xml/item/item[@key='FamilyName']/text() \n") );
//Printing Age tags
pw.println( "Age: " + evaluateXPath(document, "/xml/item/item[@key='Age']/text() \n") );
//Repeating Date based on counting name tags
String[] strArray = new String[0];
for (int q=0; q<n;q++){
List<String> strArraytmp = evaluateXPath(document,"/xml/item/item[@key='date']/text()");
String[] strings = strArraytmp.stream().toArray(String[]::new);
strArray= Stream.of(strArray, strings ).flatMap(Stream::of).toArray(String[]::new);
}
pw.println("date: " + Arrays.toString(strArray));
System.out.println("this file goes to path:" + "/root/Results/Data/" + dump + ".txt");
pw.flush();
}
catch (IOException e)
{ e.printStackTrace(); } }
}
private static List<String> evaluateXPath(Document document, String xpathExpression) throws Exception
{
// Create XPathFactory object
XPathFactory xpathFactory = XPathFactory.newInstance();
// Create XPath object
XPath xpath = xpathFactory.newXPath();
List<String> values = new ArrayList<>();
try
{
// Create XPathExpression object
XPathExpression expr = xpath.compile(xpathExpression);
// Evaluate expression result on XML document
NodeList nodes = (NodeList) expr.evaluate(document, XPathConstants.NODESET);
for (int i = 0; i < nodes.getLength(); i++) {
values.add(nodes.item(i).getNodeValue());
}
} catch (XPathExpressionException e) {
e.printStackTrace();
}
return values;
}
private static Document getDocument(String fileName) throws Exception
{
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(fileName);
return doc;
}
}
来源:https://stackoverflow.com/questions/62245567/evaluatexpath-runs-slow-for-repeating-1-xml-element-in-java