I have to parse a bunch of XML files in Java that sometimes -- and invalidly -- contain HTML entities such as `&mdash;`, `&gt;` and so forth. I [...]
I did something similar yesterday: I needed to load values from an unzipped XML stream into a database.
// Imports (I'm not sure all of them are necessary)
import java.io.FileInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
// I haven't tested this code just now because I'm at work, but it should work;
// you may need to make a few small changes.
// Build a DOM parser with the default factory configuration.
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();

// Parse "test.xml" into a DOM tree. try-with-resources closes the file
// stream even when parse() throws, so the file handle is never leaked.
// NOTE(review): an XML parser is expected to reject undeclared entity
// references (e.g. &mdash;) during parse(), in which case the unescape step
// below is never reached for such files -- confirm against real input.
Document doc;
try (FileInputStream in = new FileInputStream("test.xml")) {
    InputSource is = new InputSource(in);
    doc = db.parse(is);
}

// Read the string value of /foo/bar from the parsed document.
XPathFactory xpf = XPathFactory.newInstance();
XPath xpath = xpf.newXPath();
String words = xpath.evaluate("/foo/bar", doc.getDocumentElement());

// Keep the unescaped text; the original call discarded the return value.
String decoded = ParsingHexToChar.parseToChar(words);
// Library used: commons-lang3.jar (Apache Commons Lang 3)
// Method that does the unescaping
/**
 * Unescapes the given text by delegating to Apache Commons Lang 3
 * {@code StringEscapeUtils.unescapeHtml4}.
 *
 * @param words text to unescape
 * @return whatever {@code unescapeHtml4} returns for {@code words}
 */
public static String parseToChar(String words) {
    return org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(words);
}