Converting an xml doc into a specific dot-expanded json structure

爱⌒轻易说出口 提交于 2019-12-06 02:19:43

You can use recursion here. One way is to build up the paths progressively as you recurse through the XML document, and return a result dictionary at the end, which can then be serialized to JSON.

The demo below uses the standard library module xml.etree.ElementTree to parse the XML document.

Demo:

from xml.etree.ElementTree import ElementTree
from pprint import pprint

# Load the sample document and keep a handle on its root element
tree = ElementTree(file="sample.xml")
root = tree.getroot()

def collect_xml_paths(root, path=None, result=None):
    """Flatten an XML element tree into a dict keyed by dot-separated paths.

    Every descendant of ``root`` contributes its stripped text under the key
    ``"Parent.Child"`` and each of its attributes under
    ``"Parent.Child[@attr]"``.  The tag of ``root`` itself is not part of the
    descendant keys; the attributes of ``root`` are stored under
    ``"RootTag[@attr]"``.

    Args:
        root: Element whose subtree is flattened.
        path: List of tags used as the key prefix for the children of
            ``root``.  Defaults to no prefix.
        result: Dict mapping keys to lists of values, updated in place.  A
            fresh dict is created on every call when omitted, so repeated
            calls never share state.  The attributes of ``root`` are only
            recorded when this dict is empty.

    Returns:
        A new dict in which a key that collected exactly one value maps to
        that value, and any other key maps to its list of values.
    """
    path = [] if path is None else path
    result = {} if result is None else result

    # Record the root's own attributes only when starting from a clean slate
    if not result:
        for name, value in root.attrib.items():
            root_key = "%s[@%s]" % (root.tag, name)
            result.setdefault(root_key, []).append(value)

    _collect_child_paths(root, path, result)

    # Separate single values from list values (done once, after the walk)
    return {k: v[0] if len(v) == 1 else v for k, v in result.items()}


def _collect_child_paths(element, path, result):
    """Record text and attributes of every descendant of ``element``."""
    for child in element:
        child_path = path + [child.tag]
        key = ".".join(child_path)

        # Attributes first, so their keys precede the text key as before
        for name, value in child.attrib.items():
            result.setdefault("%s[@%s]" % (key, name), []).append(value)

        # ``text`` is None for elements without any text content
        text = (child.text or "").strip()
        if text:
            result.setdefault(key, []).append(text)

        _collect_child_paths(child, child_path, result)

# Pretty-print the flattened path -> value mapping built from the parsed root
pprint(collect_xml_paths(root))

Output:

{'Genres.Genre': ['Comedy', 'TV-Show'],
 'Genres.Genre[@FacebookID]': ['6003161475030', '6003172932634'],
 'Item[@ID]': '288917',
 'Main.Platform': 'iTunes',
 'Main.PlatformID': '353736518',
 'Products.Product.Offers.Offer.Currency': ['CAD', 'CAD', 'EUR', 'EUR'],
 'Products.Product.Offers.Offer.Price': ['3.49', '2.49', '2.49', '1.99'],
 'Products.Product.Offers.Offer[@Type]': ['HDBUY', 'SDBUY', 'HDBUY', 'SDBUY'],
 'Products.Product.Rating': 'Tout public',
 'Products.Product.URL': ['https://itunes.apple.com/ca/tv-season/id353187108?i=353736518',
                      'https://itunes.apple.com/fr/tv-season/id353187108?i=353736518'],
 'Products.Product[@Country]': ['CA', 'FR']}

If you want to serialize this dictionary to JSON, you can use json.dumps():

from json import dumps

# Serialize the flattened mapping to a JSON string and print it
print(dumps(collect_xml_paths(root)))
# {"Item[@ID]": "288917", "Main.Platform": "iTunes", "Main.PlatformID": "353736518", "Genres.Genre[@FacebookID]": ["6003161475030", "6003172932634"], "Genres.Genre": ["Comedy", "TV-Show"], "Products.Product[@Country]": ["CA", "FR"], "Products.Product.URL": ["https://itunes.apple.com/ca/tv-season/id353187108?i=353736518", "https://itunes.apple.com/fr/tv-season/id353187108?i=353736518"], "Products.Product.Offers.Offer[@Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"], "Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"], "Products.Product.Offers.Offer.Currency": ["CAD", "CAD", "EUR", "EUR"], "Products.Product.Rating": "Tout public"}

This is a bit verbose, but it wasn't too hard to format this as a flat dict. Here is an example:

node = etree.fromstring(file_data.encode('utf-8'), parser=parser)
# Flat mapping of key -> value, or key -> list of values when a key repeats
data = OrderedDict()
# Stack of (element, dotted path of its ancestors including the root tag).
# Elements are popped one at a time instead of removing from / appending to
# the list while a for-loop iterates over it, which skipped entries per pass.
nodes = [(node, '')]

while nodes:

    sub, prefix = nodes.pop()

    # remove the root tag from the prefix; the root tag is only used for the
    # root element's own attributes
    tag_prefix = '.'.join(prefix.split('.')[1:]) if ('.' in prefix) else ''
    atr_prefix = sub.tag if (sub == node) else tag_prefix

    entries = []

    # tag: text is None for elements without text content
    # NOTE(review): when tag_prefix is '' (the root and its direct children)
    # the text key starts with '.', e.g. '.Tag' -- confirm this is intended.
    text = (sub.text or '').strip()
    if text:
        entries.append((tag_prefix + '.' + sub.tag, text))

    # atr
    # NOTE(review): for non-root elements the attribute key is built from the
    # ancestor path only, without the element's own tag -- confirm intended.
    for k, v in sub.attrib.items():
        entries.append((atr_prefix + '[@%s]' % k, v))

    for _prefix, _value in entries:
        if _prefix in data:  # convert it to a list if multiple values
            if not isinstance(data[_prefix], list):
                data[_prefix] = [data[_prefix]]
            data[_prefix].append(_value)
        else:
            data[_prefix] = _value

    # Push children in reverse so they are popped in document order
    _prefix = (prefix + '.' + sub.tag).strip('.')
    nodes.extend((s, _prefix) for s in reversed(list(sub)))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!