Parse indented text tree in Java

后端 未结 4 1344
旧时难觅i
旧时难觅i 2020-12-19 14:38

I have an indented file that I need to parsed using java, I need some way to place this in a Section class as shown below

    root
     root1
       text1
           


        
相关标签:
4条回答
  • 2020-12-19 14:53

    I added a parent pointer as well. Maybe the text can be parsed without it, but parent pointers make it easier. First of all, you need to have more constructors:

    static final int root_depth = 4; // assuming 4 whitespaces precede the tree root
    
    public Section(String text, int depth) {
        this.text     = text;
        this.depth    = depth;
        this.children = new ArrayList<Section>();
        this.parent   = null;
    }
    
    public Section(String text, int depth, Section parent) {
        this.text     = text;
        this.depth    = depth;
        this.children = new ArrayList<Section>();
        this.parent   = parent;
    }
    

    Then, when you start parsing the file, read it line by line:

    Section prev = null;
    for (String line; (line = bufferedReader.readLine()) != null; ) {
        if (prev == null && line begins with root_depth whitespaces) {
            Section root = new Section(text_of_line, root_depth);
            prev = root;
        }
        else {
            int t_depth = no. of whitespaces at the beginning of this line;
            if (t_depth > prev.getDepth())
                // assuming that empty sections are not allowed
                Section t_section = new Section(text_of_line, t_depth, prev);
                prev.addChild(t_section);
            }
            else if (t_depth == prev.getDepth) {
                Section t_section = new Section(text_of_line, t_depth, prev.getParent());
                prev.getParent().addChild(t_section);
            }
            else {
                while (t_depth < prev.getDepth()) {
                    prev = prev.getParent();
                }
                // at this point, (t_depth == prev.getDepth()) = true
                Section t_section = new Section(text_of_line, t_depth, prev.getParent());
                prev.getParent().addChild(t_section);
            }
        }
    }
    

    I have glossed over some finer points of the pseudo-code, but I think you get the overall idea of how to go about this parsing. Do remember to implement the methods addChild(), getDepth(), getParent(), etc.

    0 讨论(0)
  • 2020-12-19 14:56

    An implementation in C#

    Based on this answer, I've created a C# solution.

    It allows for multiple roots and assumes an input structure like:

    Test
        A
        B
        C
            C1
            C2
        D
    Something
        One
        Two
        Three
    

    An example usage of the code would be:

    var lines = new[]
    {
        "Test",
        "\tA",
        "\tB",
        "\tC",
        "\t\tC1",
        "\t\tC2",
        "\tD",
        "Something",
        "\tOne",
        "\tTwo",
        "\tThree"
    };
    
    var roots = IndentedTextToTreeParser.Parse(lines, 0, '\t');
    
    var dump = IndentedTextToTreeParser.Dump(roots);
    Console.WriteLine(dump);
    

    You can specify the root indention (zero by default) and also the idention character (a tab \t by default).

    The full code:

    namespace MyNamespace
    {
        using System;
        using System.Collections.Generic;
        using System.Diagnostics;
        using System.Text;
    
        public static class IndentedTextToTreeParser
        {
            // https://stackoverflow.com/questions/21735468/parse-indented-text-tree-in-java
    
            public static List<IndentTreeNode> Parse(IEnumerable<string> lines, int rootDepth = 0, char indentChar = '\t')
            {
                var roots = new List<IndentTreeNode>();
    
                // --
    
                IndentTreeNode prev = null;
    
                foreach (var line in lines)
                {
                    if (string.IsNullOrEmpty(line.Trim(indentChar)))
                        throw new Exception(@"Empty lines are not allowed.");
    
                    var currentDepth = countWhiteSpacesAtBeginningOfLine(line, indentChar);
    
                    if (currentDepth == rootDepth)
                    {
                        var root = new IndentTreeNode(line, rootDepth);
                        prev = root;
    
                        roots.Add(root);
                    }
                    else
                    {
                        if (prev == null)
                            throw new Exception(@"Unexpected indention.");
                        if (currentDepth > prev.Depth + 1)
                            throw new Exception(@"Unexpected indention (children were skipped).");
    
                        if (currentDepth > prev.Depth)
                        {
                            var node = new IndentTreeNode(line.Trim(), currentDepth, prev);
                            prev.AddChild(node);
    
                            prev = node;
                        }
                        else if (currentDepth == prev.Depth)
                        {
                            var node = new IndentTreeNode(line.Trim(), currentDepth, prev.Parent);
                            prev.Parent.AddChild(node);
    
                            prev = node;
                        }
                        else
                        {
                            while (currentDepth < prev.Depth) prev = prev.Parent;
    
                            // at this point, (currentDepth == prev.Depth) = true
                            var node = new IndentTreeNode(line.Trim(indentChar), currentDepth, prev.Parent);
                            prev.Parent.AddChild(node);
                        }
                    }
                }
    
                // --
    
                return roots;
            }
    
            public static string Dump(IEnumerable<IndentTreeNode> roots)
            {
                var sb = new StringBuilder();
    
                foreach (var root in roots)
                {
                    doDump(root, sb, @"");
                }
    
                return sb.ToString();
            }
    
            private static int countWhiteSpacesAtBeginningOfLine(string line, char indentChar)
            {
                var lengthBefore = line.Length;
                var lengthAfter = line.TrimStart(indentChar).Length;
                return lengthBefore - lengthAfter;
            }
    
            private static void doDump(IndentTreeNode treeNode, StringBuilder sb, string indent)
            {
                sb.AppendLine(indent + treeNode.Text);
                foreach (var child in treeNode.Children)
                {
                    doDump(child, sb, indent + @"    ");
                }
            }
        }
    
        [DebuggerDisplay(@"{Depth}: {Text} ({Children.Count} children)")]
        public class IndentTreeNode
        {
            public IndentTreeNode(string text, int depth = 0, IndentTreeNode parent = null)
            {
                Text = text;
                Depth = depth;
                Parent = parent;
            }
    
            public string Text { get; }
            public int Depth { get; }
            public IndentTreeNode Parent { get; }
            public List<IndentTreeNode> Children { get; } = new List<IndentTreeNode>();
    
            public void AddChild(IndentTreeNode child)
            {
                if (child != null) Children.Add(child);
            }
        }
    }
    

    I've also included an method Dump() to convert the tree back to a string in order to better debug the algorithm itself.

    0 讨论(0)
  • 2020-12-19 14:59

    I implemented an alternative solution, using recursive function calls. As far as I can estimate, it will have worse performance than Max Seo's suggestion, especially on deep hierachies. However it is easier to understand (in my opinion), and hence modify to you specific needs. Have a look and let me know if you have any suggestions.

    One benefit is that it can handle trees with multiple roots, as it is.

    Problem description - just to be clear about it...

    Assuming we have a construct Node, which can contain data and have zero or more children, which are also Nodes. Based on a text input, we want to build trees of nodes, where the data of each node is the content from a line, and the position of the node in tree is indicated by the line position and indentation, so that a line which is indented is the child of the first previous line, which is less indented.

    Algorithm description

    Assuming we have a list of lines, define a function which:

    • If the input list has at least two lines:
      • Removes the first line from the list
      • Removes all lines from the list which satisfy all:
        • Has a higher indent than the first line
        • Occurs before the next line with indent less than or equal to the first line
      • Passes these lines recursively to the function, and sets the result as children of the first line
      • If it has remaining lines, passes these recursively to the function, and combines them with the first line, as the result
      • If there are no more remaining lines, returns a list with the first line as the single element
    • If the input list has one line:
      • Sets the children of that one line to an empty list
      • Returns the list
    • If the input list has no elements
      • Returns an empty list
    • Remove the first line from the list

    Calling the function with a list of lines, will result in a list of trees, based on their indent. If the tree has only one root, the resulting tree will be the first element of the result list.

    Pseudocode

    List<Node> LinesToTree( List<Line> lines )
    {
        if(lines.count >= 2)
        {
            firstLine = lines.shift
            nextLine = lines[0]
            children = List<Line>
    
            while(nextLine != null && firstLine.indent < nextLine.indent)
            {
                children.add(lines.shift)
                nextLine = lines[0]
            }
    
            firstLineNode = new Node
            firstLineNode.data = firstLine.data
            firstLineNode.children = LinesToTree(children)
    
            resultNodes = new List<Node>
            resultNodes.add(firstLineNode)
    
            if(lines.count > 0)
            {
                siblingNodes = LinesToTree(lines)
                resultNodes.addAll(siblingNodes)
                return resultNodes
            }
            else
            {
                return resultNodes
            }
        }
        elseif()
        {
            nodes = new List<Node>
            node = new Node
            node.data = lines[0].data
            node.children = new List<Node>
            return nodes
        }
        else
        {
            return new List<Node>
        }
    }
    

    PHP implementation using arrays

    The implementation is customisable through the delegate to get the indent, and the name of the children field in output array.

    public static function IndentedLinesToTreeArray(array $lineArrays, callable $getIndent = null, $childrenFieldName = "children")
    {
        //Default function to get element indentation
        if($getIndent == null){
            $getIndent = function($line){
                return $line["indent"];
            };
        }
    
        $lineCount = count($lineArrays);
    
        if($lineCount >= 2)
        {
            $firstLine = array_shift($lineArrays);
            $children = [];
            $nextLine = $lineArrays[0];
    
            while($getIndent($firstLine) < $getIndent($nextLine)){
                $children[] = array_shift($lineArrays);
                if(!isset($lineArrays[0])){
                    break;
                }
                $nextLine = $lineArrays[0];
            }
    
            $firstLine[$childrenFieldName] = self::IndentedLinesToTreeArray($children, $getIndent, $childrenFieldName);
    
            if(count($lineArrays)){
                return array_merge([$firstLine],self::IndentedLinesToTreeArray($lineArrays, $getIndent, $childrenFieldName));
            }else{
                return [$firstLine];
            }
        }
        elseif($lineCount == 1)
        {
            $lineArrays[0][$childrenFieldName] = [];
            return $lineArrays;
        }
        else
        {
            return [];
        }
    }
    
    0 讨论(0)
  • 2020-12-19 15:00

    Surprisingly complex problem... but here's a pseudocode

    intialize a stack
    push first line to stack
    while (there are more lines to read) {
     S1 = top of stack // do not pop off yet
     S2 = read a line
     if depth of S1 < depth of S2 {
      add S2 as child of S1
      push S2 into stack
     }
     else {
      while (depth of S1 >= depth of S2 AND there are at least 2 elements in stack) {
       pop stack
       S1 = top of stack // do not pop
      }
      add S2 as child of S1
      push S2 into stack
     }
    }
    return bottom element of stack
    

    where depth is the # leading whitespaces. You might have to either modify or wrap the Section class to store the depth of the line.

    0 讨论(0)
提交回复
热议问题