Getting a memory error when parsing a large XML file in Python

大兔子大兔子 提交于 2019-12-10 23:43:24

问题


My XML file looks like this:

<root>
<group from="1" to="100">
    <link target="1"/>
    ...
    <link target="100"/>
</group>
...
</root>

I have 6000 <group> elements and 5M <link> elements. I want to build a dictionary with the tuple (from, to) as keys and a list of the <link>s' target attributes as values, but I get a memory error with the following code:

from lxml import etree
from gzip import open as gopen

def extractTargets(fin):
    """Collect the link targets of every <group> into a dict keyed by (from, to).

    ``fin`` is handed to gzip's ``open``; the resulting stream is parsed
    incrementally with ``etree.iterparse`` restricted to ``group`` elements.

    NOTE(review): as written there is no ``return`` statement, so the
    ``targets`` dict is never handed back to the caller.
    NOTE(review): lxml documents that XPath string results are "smart
    strings" that reference their parent element; presumably that is what
    keeps the parsed tree alive here despite the cleanup below -- confirm.
    """
    targets = dict()

    with gopen(fin) as xml:
        context = etree.iterparse(xml, tag="group")

        for event, elem in context:
            # Key: the group's "from"/"to" attributes; value: the XPath result
            # for the "target" attribute of each direct <link> child.
            targets[(elem.get("from"), elem.get("to"))] = elem.xpath("link/@target")
            # Drop the group's own content once its targets have been stored.
            elem.clear()

            # Remove preceding siblings from the parent so that processed
            # groups do not accumulate in the tree.
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

回答1:


Try the following code, which avoids building an element tree altogether:

lxml.etree

import lxml.etree
from gzip import open as gopen

class GroupDictTarget(object):
    """Event handler that files <link> targets under their enclosing <group>.

    The dict passed to the constructor is filled in place: every ``group``
    start event adds a ``(from, to)`` key with an empty list, and every
    following ``link`` start event appends its ``target`` attribute to the
    list of the most recently opened group.
    """

    def __init__(self, d):
        # Caller-owned dict that receives the results.
        self.d = d

    def start(self, tag, attrib):
        """Handle an element start event given its tag name and attributes."""
        if tag == 'link':
            self.group.append(attrib['target'])
            return
        if tag == 'group':
            links = []
            # Remember the list first, then register it under the group's key.
            self.group = links
            self.d[attrib['from'], attrib['to']] = links

    def close(self):
        """Nothing to finalise; the results live in the caller's dict."""
        return None

def extractTargets(fin):
    """Return a dict of link targets per group from the gzipped XML file ``fin``.

    The file is parsed with an lxml target parser, so events are delivered to
    a GroupDictTarget instance, which fills the returned dict.
    """
    collected = {}
    with gopen(fin) as stream:
        target_parser = lxml.etree.XMLParser(target=GroupDictTarget(collected))
        lxml.etree.parse(stream, target_parser)
    return collected

xml.parsers.expat

import xml.parsers.expat
from gzip import open as gopen

class GroupDictTarget(object):
    # Same as above: reuse the GroupDictTarget body from the lxml.etree example.

def extractTargets(fin):
    """Return a dict of link targets per group from the gzipped XML file ``fin``.

    Uses the expat parser directly: only start-element events are handled,
    and they are routed to GroupDictTarget.start, which fills the dict.
    """
    collected = {}
    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = GroupDictTarget(collected).start
    with gopen(fin) as stream:
        parser.ParseFile(stream)
    return collected

xml.sax

import xml.sax
from gzip import open as gopen

class GroupDictTarget(object):
    # Same as above: reuse the GroupDictTarget body from the lxml.etree example.

def extractTargets(fin):
    """Return a dict of link targets per group from the gzipped XML file ``fin``.

    Uses a SAX parse: a stock ContentHandler gets its startElement callback
    replaced by GroupDictTarget.start, which fills the dict.
    """
    collected = {}
    sax_handler = xml.sax.handler.ContentHandler()
    sax_handler.startElement = GroupDictTarget(collected).start
    with gopen(fin) as stream:
        xml.sax.parse(stream, sax_handler)
    return collected



回答2:


I had the same problem today, and for me it worked after I removed the "tag" parameter:

context = etree.iterparse(xml)

# Without a tag filter, iterparse reports the end of every element, so each
# <link> is seen before its enclosing <group>.  Skip everything except
# groups: clearing a <link> here would erase its "target" attribute (and the
# sibling cleanup would delete earlier links) before the group could read it.
for event, elem in context:
    if elem.tag != "group":
        continue

    # smart_strings=False yields plain strings; lxml's default "smart"
    # string results keep a reference to their parent element, which would
    # keep the already-processed tree alive.
    targets[(elem.get("from"), elem.get("to"))] = elem.xpath(
        "link/@target", smart_strings=False
    )

    # Free the group's content, then drop the groups that came before it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
del context


来源:https://stackoverflow.com/questions/17252010/getting-a-memory-error-when-parsing-a-large-xml-file-in-python

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!