R: convert XML data to data frame

前端 未结 4 584
醉话见心
醉话见心 2020-12-08 04:52

For a homework assignment I am attempting to convert an XML file into a data frame in R. I have tried many different things, and I have searched for ideas on the internet bu

相关标签:
4条回答
  • 2020-12-08 05:08

    It may not be as verbose as the XML package but xml2 doesn't have the memory leaks and is laser-focused on data extraction. I use trimws which is a really recent addition to R core.

    library(xml2)

    pg <- read_xml("http://www.ggobi.org/book/data/olive.xml")

    # get all the <record>s
    recs <- xml_find_all(pg, "//record")

    # extract and clean the text of each record (one string of fields per record)
    vals <- trimws(xml_text(recs))

    # extract and clean (if needed) the area names
    labs <- trimws(xml_attr(recs, "label"))

    # mine the column names from the variable descriptions:
    # this XPath grabs either the <categoricalvariable> or <realvariable> tags
    # and we then read the 'name' attribute of each
    var_xpath <- "//data/variables/*[self::categoricalvariable or self::realvariable]"
    cols <- xml_attr(xml_find_all(pg, var_xpath), "name")

    # split each record on runs of whitespace and convert the fields to numeric.
    # (The previous pattern "\ +" is not a valid R string literal: "\ " is an
    # unrecognized escape and the script fails to parse.)
    rows <- lapply(strsplit(vals, "[[:space:]]+"), function(x) {
      # the file spells missing values as "na"; map them to NA up front so
      # as.numeric() does not emit a coercion warning for each one
      x[x == "na"] <- NA
      setNames(as.numeric(x), cols)
    })

    # bind the named numeric vectors into a matrix once, then convert to a
    # data frame (instead of building and rbind-ing one data frame per record)
    dat <- as.data.frame(do.call(rbind, rows))

    # then assign the area name column to the data frame
    dat$area_name <- labs

    head(dat)
    ##   region area palmitic palmitoleic stearic oleic linoleic linolenic
    ## 1      1    1     1075          75     226  7823      672        NA
    ## 2      1    1     1088          73     224  7709      781        31
    ## 3      1    1      911          54     246  8113      549        31
    ## 4      1    1      966          57     240  7952      619        50
    ## 5      1    1     1051          67     259  7771      672        50
    ## 6      1    1      911          49     268  7924      678        51
    ##   arachidic eicosenoic    area_name
    ## 1        60         29 North-Apulia
    ## 2        61         29 North-Apulia
    ## 3        63         29 North-Apulia
    ## 4        78         35 North-Apulia
    ## 5        80         46 North-Apulia
    ## 6        70         44 North-Apulia
    

    UPDATE

    I'd probably do the last bit this way now:

    library(tidyverse)

    # Split every record into its fields, turn each record into a one-row
    # tibble whose columns are named by `cols`, row-bind them, and finally add
    # the area labels. as_tibble() replaces as_data_frame(), which tibble has
    # deprecated. The fields are not converted here, so the value columns stay
    # character.
    strsplit(vals, "[[:space:]]+") %>%
      map_df(~ as_tibble(as.list(setNames(., cols)))) %>%
      mutate(area_name = labs)
    
    0 讨论(0)
  • 2020-12-08 05:09

    For me the canonical answer is

    # Parse the file, select every <record> node with XPath, and hand those
    # nodes to xmlToDataFrame() to build the data frame.
    doc <- xmlParse("Olive_py.xml")
    record_nodes <- getNodeSet(doc, "//record")
    xmldf <- xmlToDataFrame(nodes = record_nodes)
    

    which is somehow hidden in @Parfait's answer.

    However, this will fail if some of the nodes have multiple child nodes of the same type. In such cases an extractor function will solve the problem:

    example data

    <?xml version="1.0" encoding="UTF-8"?>
    <testrun duration="25740" footerText="Generated by IntelliJ IDEA on 11/20/19, 9:21 PM" name="All in foo">
        <suite duration="274" locationUrl="java:suite://com.foo.bar.LoadBla" name="LoadBla"
               status="passed">
            <test duration="274" locationUrl="java:test://com.foo.bar.LoadBla/testReadWrite"
                  name="LoadBla.testReadWrite" status="passed">
                <output type="stdout">ispsum ..</output>
            </test>
        </suite>
        <suite duration="9298" locationUrl="java:suite://com.foo.bar.TestFooSearch" name="TestFooSearch"
               status="passed">
            <test duration="7207" locationUrl="java:test://com.foo.bar.TestFooSearch/TestFooSearch"
                  name="TestFooSearch.TestFooSearch" status="passed">
                <output type="stdout"/>
            </test>
            <test duration="2091" locationUrl="java:test://com.foo.bar.TestFooSearch/testSameSearch"
                  name="TestFooSearch.testSameSearch" status="passed"/>
        </suite>
    </testrun>
    

    code

    require(XML)
    require(tidyr)
    require(dplyr)
    
    # Convert one XML node into a one-row tibble: one column per attribute of
    # the node plus a `stdout` column holding the text of its first <output>
    # child (NA when the node has no <output> child).
    node2df <- function(node) {
      # (Optionally) read out a property of an optional child node
      output_nodes <- getNodeSet(node, "output")
      std_out <- if (length(output_nodes) > 0) {
        xmlValue(output_nodes[[1]])
      } else {
        NA_character_  # typed NA keeps the column character in every row
      }

      # xmlAttrs() yields a named character vector; as.list() + as_tibble()
      # spreads it into one column per attribute. This replaces the former
      # data_frame()/set_names()/pivot_wider() round trip: data_frame() is
      # deprecated and set_names() is not provided by XML, tidyr or dplyr.
      node %>%
        xmlAttrs() %>%
        as.list() %>%
        as_tibble() %>%
        mutate(stdout = std_out)
    }
    
    # Parse the document named by xmlFile, apply node2df to every <test> node
    # found under /testrun/suite, and stack the per-node results into one table.
    per_test_rows <- getNodeSet(
      xmlParse(xmlFile),
      "/testrun/suite/test",
      fun = node2df
    )
    testResults <- bind_rows(per_test_rows)
    
    0 讨论(0)
  • 2020-12-08 05:11

    Here's what I came up with. It matches the olive oil csv file that is also available on the same page. They show X as the first column name, but I don't see it in the xml so I just added it manually.

    It will probably be best to break it up into sections, then assemble the final data frame once we've got all the parts. We can also use the [.XML* shortcuts for XPath, and the other [[ convenience accessor functions.

    library(XML)
    url <- "http://www.ggobi.org/book/data/olive.xml"

    ## parse the xml document and get the top-level XML node
    doc <- xmlParse(url)
    top <- xmlRoot(doc)

    ## collect the pieces first
    ## - the label attribute of every record (used as the first column, X)
    record_labels <- unlist(doc["//record//@label"], use.names = FALSE)
    ## - the text of the 'records' node, as input for read.table()
    record_text <- xmlValue(top[["data"]][["records"]])
    ## - the column names, read from the 'name' attribute under 'variables'
    column_names <- xmlSApply(top[["data"]][["variables"]], xmlGetAttr, "name")

    ## then assemble the data frame; "na" in the records is read as NA
    df <- cbind(
        X = record_labels,
        read.table(
            text = record_text,
            col.names = column_names,
            na.strings = "na"
        )
    )

    ## result
    head(df)
    #              X region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
    # 1 North-Apulia      1    1     1075          75     226  7823      672        NA        60         29
    # 2 North-Apulia      1    1     1088          73     224  7709      781        31        61         29
    # 3 North-Apulia      1    1      911          54     246  8113      549        31        63         29
    # 4 North-Apulia      1    1      966          57     240  7952      619        50        78         35
    # 5 North-Apulia      1    1     1051          67     259  7771      672        50        80         46
    # 6 North-Apulia      1    1      911          49     268  7924      678        51        70         44

    ## clean up
    free(doc)
    rm(doc, top)
    gc()
    
    0 讨论(0)
  • 2020-12-08 05:20

    Great answers above! For future readers, anytime you face a complex XML needing R import, consider re-structuring the XML document using XSLT (a special-purpose declarative programming language that manipulates XML content into various end-use needs). Then simply use R's xmlToDataFrame() function from XML package.

    Unfortunately, R does not have a dedicated XSLT package available on CRAN-R across all operating systems. The listed SXLT seems to be a Linux package and not able to be used on Windows. See unanswered SO questions here and here. I understand @hrbrmstr (above) maintains a GitHub XSLT project. Nonetheless, nearly all general-purpose languages maintain XSLT processors including Java, C#, Python, PHP, Perl, and VB.

    Below is the open-source Python route. Because the XML document is pretty nuanced, two XSLTs are used (XSLT gurus can surely combine them into one, but try as I might, I couldn't get that to work).

    FIRST XSLT (using a recursive template)

    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output omit-xml-declaration="yes" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- Identity transform: copy every node and attribute as-is unless one of
         the more specific templates below matches it. -->
    <xsl:template match="node()|@*">
        <xsl:copy>
           <xsl:apply-templates select="node()|@*"/>
        </xsl:copy>
    </xsl:template>

    <!-- Recursive tokenizer for the text of a <record>.
         Parameters:
           text      : the string still to be split (defaults to the matched text node)
           separator : the delimiter to split on (defaults to a single space)
         Emits one <data> element per token. Note that two consecutive
         separators produce an empty <data> element. -->
    <xsl:template match="record/text()" name="tokenize">
        <xsl:param name="text" select="."/>
        <xsl:param name="separator" select="' '"/>
        <xsl:choose>
            <!-- Base case: no separator left, so the remainder is the last token -->
            <xsl:when test="not(contains($text, $separator))">
                <data>
                    <xsl:value-of select="normalize-space($text)"/>
                </data>
            </xsl:when>
            <!-- Recursive case: emit the token before the separator, then
                 tokenize whatever follows it -->
            <xsl:otherwise>
                <data>
                    <xsl:value-of select="normalize-space(substring-before($text, $separator))"/>
                </data>
                <xsl:call-template name="tokenize">
                    <xsl:with-param name="text" select="substring-after($text, $separator)"/>
                    <!-- forward the separator so a caller-supplied value is
                         not reset to the default on recursion -->
                    <xsl:with-param name="separator" select="$separator"/>
                </xsl:call-template>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <!-- Empty template: these elements (and their content) are dropped from the output -->
    <xsl:template match="description|variables|categoricalvariable|realvariable">
    </xsl:template>

    </xsl:stylesheet>
    

    SECOND XSLT

    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

        <!-- Copy the <records> element itself and process its children and attributes -->
        <xsl:template match="records">
            <xsl:copy>
               <xsl:apply-templates select="node()|@*"/>
            </xsl:copy>
        </xsl:template>

        <!-- Rebuild each <record>: the label attribute becomes <area_name> and
             the positional <data> children become named elements.
             NOTE(review): the variable order in olive.xml is region, area, ...
             so region is read from data[1] and area from data[2]; confirm
             against the file. The output element order is unchanged. -->
        <xsl:template match="record">
            <record>
                <area_name><xsl:value-of select="@label"/></area_name>
                <area><xsl:value-of select="data[2]"/></area>
                <region><xsl:value-of select="data[1]"/></region>
                <palmitic><xsl:value-of select="data[3]"/></palmitic>
                <palmitoleic><xsl:value-of select="data[4]"/></palmitoleic>
                <stearic><xsl:value-of select="data[5]"/></stearic>
                <oleic><xsl:value-of select="data[6]"/></oleic>
                <linoleic><xsl:value-of select="data[7]"/></linoleic>
                <linolenic><xsl:value-of select="data[8]"/></linolenic>
                <arachidic><xsl:value-of select="data[9]"/></arachidic>
                <eicosenoic><xsl:value-of select="data[10]"/></eicosenoic>
            </record>
       </xsl:template>

    </xsl:stylesheet>
    

    Python (using lxml module)

    import os

    import lxml.etree as ET

    # Directory containing this script; the stylesheets and the output file
    # are resolved relative to it.
    cd = os.path.dirname(os.path.abspath(__file__))


    def apply_xslt(source, stylesheet, destination):
        """Transform `source` with the XSLT file `stylesheet` and write the result.

        source      -- path or URL of the XML document to parse
        stylesheet  -- path of the XSLT stylesheet to apply
        destination -- path of the file the transformed XML is written to
        """
        dom = ET.parse(source)
        transform = ET.XSLT(ET.parse(stylesheet))
        newdom = transform(dom)

        tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)

        # `with` closes the file even if the write fails
        with open(destination, 'wb') as xmlfile:
            xmlfile.write(tree_out)


    # FIRST TRANSFORMATION
    apply_xslt('http://www.ggobi.org/book/data/olive.xml',
               os.path.join(cd, 'Olive.xsl'),
               os.path.join(cd, 'Olive_py.xml'))

    # SECOND TRANSFORMATION (reads the first result and overwrites it in place;
    # the input is fully parsed before the output file is opened)
    apply_xslt(os.path.join(cd, 'Olive_py.xml'),
               os.path.join(cd, 'Olive2.xsl'),
               os.path.join(cd, 'Olive_py.xml'))
    

    R

    library(XML)

    # Load the transformed XML into an R data frame: parse the file, pick out
    # the <record> nodes, convert them, then open the result in the viewer.
    doc <- xmlParse("Olive_py.xml")
    records <- getNodeSet(doc, "//record")
    xmldf <- xmlToDataFrame(nodes = records)
    View(xmldf)
    

    Output

    area_name   area    region  palmitic    palmitoleic stearic oleic   linoleic    linolenic   arachidic   eicosenoic
    North-Apulia 1      1       1075        75          226     7823        672          na                     60
    North-Apulia 1      1       1088        73          224     7709        781          31          61         29
    North-Apulia 1      1       911         54          246     8113        549          31          63         29
    North-Apulia 1      1       966         57          240     7952        619          50          78         35
    North-Apulia 1      1       1051        67          259     7771        672          50          80         46
       ...
    

    (slight cleanup on very first record is needed as an extra space was added after "na" in xml doc, so arachidic and eicosenoic were shifted forward)

    0 讨论(0)
提交回复
热议问题