Is this the best way to get all elements & attributes in an XML file using Saxon?

送分小仙女□ 提交于 2020-08-10 19:17:20

问题


Our program displays a tree control showing the metadata structure of the XML file that the user has selected as a datasource. It therefore displays all elements & attributes in use in the XML file, like this:

Employees
  Employee
    FirstName
    LastName
Orders
  Order
    OrderId

For the case where the user does not pass us an XSD file, we need to walk the XML file and build up the metadata structure.

The full code for this is at SaxonQuestions.zip, TestBuildTree.java and is also listed below.

I am concerned that my code is not the most efficient, or maybe even wrong. It works, but I have only tested it on 3 XML files. My questions are:

  1. What is the best way to get the root element from the DOM? Is it walking the children of the DOM root node?
  2. What is the best way to determine if an element has data (as opposed to just child elements)? The best approach I could come up with throws an exception when the element has no data, and I don't think code executing the happy path should throw an exception.
  3. What is the best way to get the class of the data held in an element or attribute? Is it: ((XdmAtomicValue)((XdmNode)currentNode).getTypedValue()).getValue().getClass();
  4. Is the best way to walk all the nodes to use XdmNode.axisIterator? And to do so as I have in this code?

TestBuildTree.java

import net.sf.saxon.s9api.*;

import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Infers the element/attribute structure ("metadata tree") of an XML document by walking
 * its Saxon {@link XdmNode} tree.
 *
 * <p>Repeated elements with the same local name under the same parent are merged into a single
 * {@link Element} whose attributes and children are the union of everything seen.
 */
public class TestBuildTree {

    /**
     * Loads {@code files/SouthWind.xml}, infers its structure and prints it to standard output.
     *
     * @param args ignored
     * @throws Exception if the file cannot be read or parsed
     * @throws IllegalStateException if the parsed document has no element child
     */
    public static void main(String[] args) throws Exception {

        XmlDatasource datasource;
        // XmlDatasource builds the complete tree inside its constructor, so the stream is no
        // longer needed once construction returns and can be closed right away.
        try (FileInputStream xmlStream =
                     new FileInputStream(new File("files", "SouthWind.xml").getCanonicalPath())) {
            datasource = new XmlDatasource(xmlStream, null);
        }

        XdmNode rootNode = findRootElement(datasource.getXmlRootNode());
        if (rootNode == null) {
            throw new IllegalStateException("XML document has no root element");
        }

        TestBuildTree buildTree = new TestBuildTree(rootNode);
        Element root = buildTree.addNode();

        System.out.println("Schema:");
        printElement("", root);
    }

    /**
     * Returns the first element child of the given document node.
     *
     * @param documentNode the node whose children are searched
     * @return the first child of kind {@code ELEMENT}, or {@code null} if there is none
     */
    private static XdmNode findRootElement(XdmNode documentNode) {
        // The document node may also have comment / processing-instruction children,
        // so the node kind has to be checked.
        for (XdmNode child : documentNode.children()) {
            if (child.getNodeKind() == XdmNodeKind.ELEMENT) {
                return child;
            }
        }
        return null;
    }

    /**
     * Prints an element, then its attributes and child elements indented by two more spaces.
     *
     * @param indent  prefix written before the element line
     * @param element the element to print
     */
    private static void printElement(String indent, Element element) {
        System.out.println(indent + "<" + element.name + "> (" + typeName(element.type) + ")");
        indent += "  ";
        for (Attribute attr : element.attributes) {
            System.out.println(indent + "=" + attr.name + " (" + typeName(attr.type) + ")");
        }
        for (Element child : element.children) {
            printElement(indent, child);
        }
    }

    /** Returns the simple name of {@code type}, or the text {@code "null"} when it is unknown. */
    private static String typeName(Class<?> type) {
        return type == null ? "null" : type.getSimpleName();
    }

    /** The node this builder starts from; expected to be an element {@link XdmNode}. */
    protected XdmItem currentNode;

    /**
     * @param currentNode the element node whose structure {@link #addNode()} describes
     */
    public TestBuildTree(XdmItem currentNode) {
        this.currentNode = currentNode;
    }

    /**
     * Builds the metadata tree for {@link #currentNode} and everything below it.
     *
     * @return the description of the current element
     * @throws SaxonApiException kept in the signature for callers; the current implementation
     *                           handles typed-value failures itself
     */
    private Element addNode() throws SaxonApiException {
        return buildElement((XdmNode) currentNode);
    }

    /**
     * Recursively describes one element: its name, value class, attributes and child elements.
     *
     * <p>The node being described is passed explicitly instead of being tracked in a mutable
     * cursor, so a subtree without element children cannot leave the traversal positioned on a
     * text node.
     *
     * @param elementNode an element node
     * @return the merged description of {@code elementNode}
     */
    private static Element buildElement(XdmNode elementNode) {
        Element element = new Element(
                elementNode.getNodeName().getLocalName(), valueClassOf(elementNode), null);

        // add in attributes
        XdmSequenceIterator<XdmNode> attributeNodes = elementNode.axisIterator(Axis.ATTRIBUTE);
        while (attributeNodes.hasNext()) {
            XdmNode attributeNode = attributeNodes.next();
            element.attributes.add(new Attribute(
                    attributeNode.getNodeName().getLocalName(), valueClassOf(attributeNode), null));
        }

        // add in children elements; text, comment and PI children are skipped
        XdmSequenceIterator<XdmNode> childNodes = elementNode.axisIterator(Axis.CHILD);
        while (childNodes.hasNext()) {
            XdmNode childNode = childNodes.next();
            if (childNode.getNodeKind() != XdmNodeKind.ELEMENT) {
                continue;
            }
            Element child = buildElement(childNode);
            Element existing = element.getChildByName(child.name);
            if (existing == null) {
                element.children.add(child);
            } else {
                // same element name seen before: add in anything this occurrence has extra
                existing.addNewItems(child);
            }
        }

        return element;
    }

    /**
     * Determines the Java class of the value held by an element or attribute node.
     *
     * <p>The typed value is requested exactly once. A node for which Saxon reports no typed
     * value, or whose typed value is not a single atomic item (empty or a list), yields
     * {@code null} rather than an exception.
     *
     * @param node an element or attribute node
     * @return the class of the node's single atomic value, or {@code null} if it has none
     */
    private static Class<?> valueClassOf(XdmNode node) {
        Object typedValue;
        try {
            typedValue = node.getTypedValue();
        } catch (SaxonApiException ignored) {
            // Saxon signals "this node has no typed value" with an exception;
            // for this purpose that simply means "no data".
            return null;
        }
        if (typedValue instanceof XdmAtomicValue) {
            return ((XdmAtomicValue) typedValue).getValue().getClass();
        }
        return null;
    }

    /** Common data for anything shown in the metadata tree. */
    static class Node {
        /** Local name of the element or attribute. */
        String name;
        /** Java class of the value, or {@code null} when unknown / not applicable. */
        Class<?> type;
        /** Optional human-readable description; may be {@code null}. */
        String description;

        public Node(String name, Class<?> type, String description) {
            this.name = name;
            this.type = type;
            this.description = description;
        }
    }

    /** An element together with the distinct attributes and child elements seen for it. */
    static class Element extends Node {
        List<Element> children;
        List<Attribute> attributes;

        public Element(String name, Class<?> type, String description) {
            super(name, type, description);
            children = new ArrayList<>();
            attributes = new ArrayList<>();
        }

        /**
         * @param name local name to look for
         * @return the direct child with that name, or {@code null} if there is none
         */
        public Element getChildByName(String name) {
            for (Element child : children) {
                if (child.name.equals(name)) {
                    return child;
                }
            }
            return null;
        }

        /**
         * Merges another occurrence of this element into this one: attributes and children
         * not yet known by name are added, children already known are merged recursively.
         *
         * @param child the other occurrence whose items are merged in
         */
        public void addNewItems(Element child) {
            for (Attribute candidate : child.attributes) {
                if (!hasAttribute(candidate.name)) {
                    attributes.add(candidate);
                }
            }

            for (Element candidate : child.children) {
                Element existing = getChildByName(candidate.name);
                if (existing == null) {
                    children.add(candidate);
                } else {
                    existing.addNewItems(candidate);
                }
            }
        }

        /** Returns whether an attribute with the given name has already been recorded. */
        private boolean hasAttribute(String name) {
            for (Attribute attribute : attributes) {
                if (attribute.name.equals(name)) {
                    return true;
                }
            }
            return false;
        }
    }

    /** An attribute of an {@link Element}. */
    static class Attribute extends Node {
        public Attribute(String name, Class<?> type, String description) {
            super(name, type, description);
        }
    }
}

XmlDatasource.java

import com.saxonica.config.EnterpriseConfiguration;
import com.saxonica.ee.s9api.SchemaValidatorImpl;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.FeatureKeys;
import net.sf.saxon.s9api.*;
import net.sf.saxon.type.SchemaException;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;

/**
 * Parses an XML stream into a Saxon tree (optionally validating against a schema) and exposes
 * the tree together with an {@link XPathCompiler} that has the document's namespaces declared.
 */
public class XmlDatasource {

    /** Root of the parsed tree; all queries are evaluated against it. */
    private XdmNode xmlRootNode;

    /** XPath compiler created from the same processor that built the tree. */
    private XPathCompiler xPathCompiler;

    /** Namespace prefix -> namespace URI. */
    private HashMap<String, String> prefixToUriMap = new HashMap<>();

    /** Namespace URI -> namespace prefix. */
    private HashMap<String, String> uriToPrefixMap = new HashMap<>();


    /**
     * Parses {@code xmlData} and prepares the XPath compiler.
     *
     * @param xmlData    the XML document to load
     * @param schemaFile an XSD to validate against, or {@code null} for no validation
     */
    public XmlDatasource (InputStream xmlData, InputStream schemaFile) throws SAXException, SchemaException, SaxonApiException, IOException {

        final boolean schemaSupplied = (schemaFile != null);

        // The configuration has to exist before any other Saxon class is instantiated.
        Configuration saxonConfig = createEnterpriseConfiguration();

        if (schemaSupplied) {
            Source schemaSource = new StreamSource(schemaFile);
            saxonConfig.addSchemaSource(schemaSource);
        }

        Processor saxonProcessor = new Processor(saxonConfig);
        DocumentBuilder treeBuilder = saxonProcessor.newDocumentBuilder();

        // Parse through the hardened reader rather than whatever parser Saxon would pick.
        SAXSource xmlSource = new SAXSource(createXMLReader(), new InputSource(xmlData));

        if (schemaSupplied) {
            treeBuilder.setSchemaValidator(new SchemaValidatorImpl(saxonProcessor));
        }
        xmlRootNode = treeBuilder.build(xmlSource);

        xPathCompiler = saxonProcessor.newXPathCompiler();
        if (schemaSupplied) {
            xPathCompiler.setSchemaAware(true);
        }

        declareNameSpaces();
    }

    /** @return the root of the parsed tree */
    public XdmNode getXmlRootNode() {
        return xmlRootNode;
    }

    /** @return the XPath compiler with the document's namespaces declared */
    public XPathCompiler getxPathCompiler() {
        return xPathCompiler;
    }

    /**
     * Creates an XMLReader configured to disallow XXE attacks.
     *
     * @return a reader with DOCTYPE declarations and external entities switched off
     * @throws SAXException if the reader cannot be created or rejects one of the features
     */
    public static XMLReader createXMLReader() throws SAXException {

        XMLReader safeReader = XMLReaderFactory.createXMLReader();

        // stop XXE https://www.owasp.org/index.php/XML_External_Entity_(XXE)_Prevention_Cheat_Sheet#JAXP_DocumentBuilderFactory.2C_SAXParserFactory_and_DOM4J
        safeReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        safeReader.setFeature("http://xml.org/sax/features/external-general-entities", false);
        safeReader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

        return safeReader;
    }

    /**
     * Registers the {@code saxon} prefix plus every namespace found in the document, both in
     * the two lookup maps and on the XPath compiler. The unnamed (default) namespace is
     * registered under the prefix {@code def}; the first URI seen for a prefix wins.
     */
    private void declareNameSpaces() throws SaxonApiException {

        // Saxon's own extension functions live in this namespace.
        prefixToUriMap.put("saxon", "http://saxon.sf.net");
        uriToPrefixMap.put("http://saxon.sf.net", "saxon");

        XdmValue namespaceNodes = xPathCompiler.evaluate("//namespace::*", xmlRootNode);
        if (namespaceNodes == null) {
            return;
        }

        for (int i = 0, count = namespaceNodes.size(); i < count; i++) {
            XdmNode namespaceNode = (XdmNode) namespaceNodes.itemAt(i);

            String prefix = "";
            if (namespaceNode.getNodeName() != null) {
                prefix = namespaceNode.getNodeName().getLocalName();
            }

            if (isStructuralPrefix(prefix)) {
                continue;
            }

            // the default namespace has no prefix of its own, so give it one
            if (prefix.isEmpty()) {
                prefix = "def";
            }

            // the query yields the same namespace once per element; keep the first occurrence
            if (prefixToUriMap.containsKey(prefix)) {
                continue;
            }

            String uri = namespaceNode.getStringValue();
            if (uri == null || uri.isEmpty()) {
                continue;
            }

            xPathCompiler.declareNamespace(prefix, uri);
            prefixToUriMap.put(prefix, uri);
            uriToPrefixMap.put(uri, prefix);
        }
    }

    /** xml, xsd and xsi belong to the XML machinery itself and are not registered. */
    private static boolean isStructuralPrefix(String prefix) {
        return prefix.equals("xml") || prefix.equals("xsd") || prefix.equals("xsi");
    }

    /**
     * Creates a licensed Saxon EE configuration with XPath warnings suppressed.
     *
     * <p>NOTE(review): {@code deobfuscate} and {@code key} are not defined in the visible
     * source; they are assumed to be supplied elsewhere in this class.
     *
     * @return the new configuration
     */
    public static EnterpriseConfiguration createEnterpriseConfiguration()
    {
        EnterpriseConfiguration licensedConfig = new EnterpriseConfiguration();

        BufferedReader licenseReader = new BufferedReader(new java.io.StringReader(deobfuscate(key)));
        licensedConfig.supplyLicenseKey(licenseReader);
        licensedConfig.setConfigurationProperty(FeatureKeys.SUPPRESS_XPATH_WARNINGS, Boolean.TRUE);

        return licensedConfig;
    }
}

来源:https://stackoverflow.com/questions/63100478/is-this-the-best-way-to-get-all-elements-attributes-in-an-xml-file-using-saxon

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!