Convert XML file to CSV in java

后端 未结 8 888
时光说笑
时光说笑 2020-12-05 01:17

@Before There will be probably some duplicate questions suggestions, I don\'t think that is the case maybe read this first, I\'ll try to be as brief as possible. Title gives

8条回答
  •  隐瞒了意图╮
    2020-12-05 01:44

    The code provided should be considered a sketch rather than the definitive article. I am not an expert on SAX and the implementation could be improved for better performance, simpler code etc. That said SAX should be able to cope with streaming large XML files.

    I would approach this problem with 2 passes using the SAX parser. (Incidentally, I would also use a CSV generating library to create the output as this would deal with all the fiddly character escaping that CSV involves but I haven't implemented this in my sketch).

    First pass: Establish number of header columns

    Second pass: Output CSV

    I assume that the XML file is well formed. I assume that we don't have a scheme/DTD with a predefined order.

    In the first pass I have assumed that a CSV column will be added for every XML element containing text content or for any attribute (I have assumed attributes will contain something!).

    The second pass, having established the number of target columns, will do the actual CSV output.

    Based on your example XML my code sketch would produce:

    ItemID,StartTime,EndTime,ViewItemURL,AverageTime,category,category,type,type,AveragePrice
    4504216603,10:00:10.000Z,10:00:30.000Z,http://url,,,,,,
    4504216604,10:30:10.000Z,11:00:10.000Z,http://url,value1,9823,9112,TX,TY,value2
    

    Please note I have used the google collections LinkedHashMultimap as this is helpful when associating multiple values with a single key. I hope you find this useful!

    import com.google.common.collect.LinkedHashMultimap;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.LinkedHashMap;
    import java.util.Map.Entry;
    import org.xml.sax.Attributes;
    import org.xml.sax.InputSource;
    import org.xml.sax.SAXException;
    import org.xml.sax.XMLReader;
    import org.xml.sax.helpers.DefaultHandler;
    import org.xml.sax.helpers.XMLReaderFactory;
    
    public class App {
    
        public static void main(String[] args) throws SAXException, FileNotFoundException, IOException {
            // First pass - to determine headers
            XMLReader xr = XMLReaderFactory.createXMLReader();
            HeaderHandler handler = new HeaderHandler();
            xr.setContentHandler(handler);
            xr.setErrorHandler(handler);
            FileReader r = new FileReader("test1.xml");
            xr.parse(new InputSource(r));
    
            LinkedHashMap headers = handler.getHeaders();
            int totalnumberofcolumns = 0;
            for (int headercount : headers.values()) {
                totalnumberofcolumns += headercount;
            }
            String[] columnheaders = new String[totalnumberofcolumns];
            int i = 0;
            for (Entry entry : headers.entrySet()) {
                for (int j = 0; j < entry.getValue(); j++) {
                    columnheaders[i] = entry.getKey();
                    i++;
                }
            }
            StringBuilder sb = new StringBuilder();
            for (String h : columnheaders) {
                sb.append(h);
                sb.append(',');
            }
            System.out.println(sb.substring(0, sb.length() - 1));
    
            // Second pass - collect and output data
    
            xr = XMLReaderFactory.createXMLReader();
    
            DataHandler datahandler = new DataHandler();
            datahandler.setHeaderArray(columnheaders);
    
            xr.setContentHandler(datahandler);
            xr.setErrorHandler(datahandler);
            r = new FileReader("test1.xml");
            xr.parse(new InputSource(r));
        }
    
        public static class HeaderHandler extends DefaultHandler {
    
            private String content;
            private String currentElement;
            private boolean insideElement = false;
            private Attributes attribs;
            private LinkedHashMap itemHeader;
            private LinkedHashMap accumulativeHeader = new LinkedHashMap();
    
            public HeaderHandler() {
                super();
            }
    
            private LinkedHashMap getHeaders() {
                return accumulativeHeader;
            }
    
            private void addItemHeader(String headerName) {
                if (itemHeader.containsKey(headerName)) {
                    itemHeader.put(headerName, itemHeader.get(headerName) + 1);
                } else {
                    itemHeader.put(headerName, 1);
                }
            }
    
            @Override
            public void startElement(String uri, String name,
                    String qName, Attributes atts) {
                if ("item".equalsIgnoreCase(qName)) {
                    itemHeader = new LinkedHashMap();
                }
                currentElement = qName;
                content = null;
                insideElement = true;
                attribs = atts;
            }
    
            @Override
            public void endElement(String uri, String name, String qName) {
                if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) {
                    if (content != null && qName.equals(currentElement) && content.trim().length() > 0) {
                        addItemHeader(qName);
                    }
                    if (attribs != null) {
                        int attsLength = attribs.getLength();
                        if (attsLength > 0) {
                            for (int i = 0; i < attsLength; i++) {
                                String attName = attribs.getLocalName(i);
                                addItemHeader(attName);
                            }
                        }
                    }
                }
                if ("item".equalsIgnoreCase(qName)) {
                    for (Entry entry : itemHeader.entrySet()) {
                        String headerName = entry.getKey();
                        Integer count = entry.getValue();
                        //System.out.println(entry.getKey() + ":" + entry.getValue());
                        if (accumulativeHeader.containsKey(headerName)) {
                            if (count > accumulativeHeader.get(headerName)) {
                                accumulativeHeader.put(headerName, count);
                            }
                        } else {
                            accumulativeHeader.put(headerName, count);
                        }
                    }
                }
                insideElement = false;
                currentElement = null;
                attribs = null;
            }
    
            @Override
            public void characters(char ch[], int start, int length) {
                if (insideElement) {
                    content = new String(ch, start, length);
                }
            }
        }
    
        public static class DataHandler extends DefaultHandler {
    
            private String content;
            private String currentElement;
            private boolean insideElement = false;
            private Attributes attribs;
            private LinkedHashMultimap dataMap;
            private String[] headerArray;
    
            public DataHandler() {
                super();
            }
    
            @Override
            public void startElement(String uri, String name,
                    String qName, Attributes atts) {
                if ("item".equalsIgnoreCase(qName)) {
                    dataMap = LinkedHashMultimap.create();
                }
                currentElement = qName;
                content = null;
                insideElement = true;
                attribs = atts;
            }
    
            @Override
            public void endElement(String uri, String name, String qName) {
                if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) {
                    if (content != null && qName.equals(currentElement) && content.trim().length() > 0) {
                        dataMap.put(qName, content);
                    }
                    if (attribs != null) {
                        int attsLength = attribs.getLength();
                        if (attsLength > 0) {
                            for (int i = 0; i < attsLength; i++) {
                                String attName = attribs.getLocalName(i);
                                dataMap.put(attName, attribs.getValue(i));
                            }
                        }
                    }
                }
                if ("item".equalsIgnoreCase(qName)) {
                    String data[] = new String[headerArray.length];
                    int i = 0;
                    for (String h : headerArray) {
                        if (dataMap.containsKey(h)) {
                            Object[] values = dataMap.get(h).toArray();
                            data[i] = (String) values[0];
                            if (values.length > 1) {
                                dataMap.removeAll(h);
                                for (int j = 1; j < values.length; j++) {
                                    dataMap.put(h, values[j]);
                                }
                            } else {
                                dataMap.removeAll(h);
                            }
                        } else {
                            data[i] = "";
                        }
                        i++;
                    }
                    StringBuilder sb = new StringBuilder();
                    for (String d : data) {
                        sb.append(d);
                        sb.append(',');
                    }
                    System.out.println(sb.substring(0, sb.length() - 1));
                }
                insideElement = false;
                currentElement = null;
                attribs = null;
            }
    
            @Override
            public void characters(char ch[], int start, int length) {
                if (insideElement) {
                    content = new String(ch, start, length);
                }
            }
    
            public void setHeaderArray(String[] headerArray) {
                this.headerArray = headerArray;
            }
        }
    }
    

提交回复
热议问题