Given a regular expression, how would I generate all strings that match it?

后端 未结 4 723
不思量自难忘°
不思量自难忘° 2020-12-15 00:07

I'm using a simple language consisting only of (), |, spaces, and alphabetic characters.
Given a regular expression like the following:

(hello         


        
4条回答
  •  感动是毒
    2020-12-15 00:26

    Somewhat following Kieveli's advice, I have come up with a working solution. Although not previously mentioned, it was important for me to also get a count of how many results could potentially be generated. I was using a Python script called "exrex", which I had found on GitHub. Embarrassingly, I did not realize that it also had the capability to count. Nonetheless, I implemented it as best I could in C++ using my simplified regular expression language. If you are interested in my solution, please read on.

    From an object-oriented standpoint, I wrote a scanner to take the regular expression (a string) and convert it into a list of tokens (a vector of strings). The list of tokens was then sent to a parser, which generated an n-ary tree. All of this was packed inside an "expression generator" class that could take an expression and hold the parse tree, as well as the generated count.
    object overview
    The scanner was important because it tokenized the empty string case which you can see in my question appearing as "|)". Scanning also created a pattern of [word] [operation] [word] [operation] ... [word].
    For example, scanning: "(hello|goodbye) (world(s|)|)"
    will create: [][(][hello][|][goodbye][)][ ][(][world][(][s][|][][)][][|][][)][]

    The parse tree was a vector of nodes. Each node contains a vector of vectors of nodes. (Figure: parse structure)
    The orange cells represent the "or"s, and the other boxes that draw the connections, represent the "and"s. Below is my code.

    Node header

    #pragma once
    #include <string>
    #include <vector>

    // One node of the parse tree.
    //
    // A node is either a plain word (more == false, text held in `value`) or a
    // parenthesised group (more == true). For a group, `children` holds one entry
    // per "|"-separated alternative, and each alternative is the sequence of nodes
    // that are concatenated to form it.
    class Function_Expression_Node{

    public:
        // value_in: literal text of a word node (may be empty).
        // more_in:  true when the node is a group whose alternatives live in `children`.
        Function_Expression_Node(std::string const& value_in = "", bool const& more_in = false);

        std::string value;
        bool more;
        // Outer vector: alternatives ("or"); inner vector: concatenated nodes ("and").
        std::vector<std::vector<Function_Expression_Node>> children;

    };
    

    Node source

    #include "function_expression_node.hpp"
    
        Function_Expression_Node::Function_Expression_Node(std::string const& value_in, bool const& more_in)
        : value(value_in)
        , more(more_in)
        {}
    

    Scanner header

    #pragma once
    #include <string>
    #include <vector>

    // Stateless tokenizer for the simplified expression language
    // ("(", "|", ")", spaces and alphabetic characters). Not instantiable.
    class Function_Expression_Scanner{

        public: Function_Expression_Scanner() = delete;

        // Splits `expression` into an alternating [word][operator][word]... token
        // list; a word token may be the empty string.
        public: static std::vector<std::string> Scan(std::string const& expression);

    };
    

    Scanner source

    #include "function_expression_scanner.hpp"

    #include <cctype>
    
    // Tokenizes `expression`.
    //
    // Every operator character ("(", "|" or ")") first flushes the word collected
    // so far (possibly empty) and is then emitted as its own one-character token.
    // The word collected after the last operator is always emitted at the end, so
    // the result alternates [word][operator][word]...[word] and is never empty.
    // Characters that are neither an operator, a space nor alphabetic are skipped.
    std::vector<std::string> Function_Expression_Scanner::Scan(std::string const& expression){

        std::vector<std::string> tokens;
        std::string word;

        for (char const ch: expression){

            if (ch == '(' || ch == '|' || ch == ')'){
                tokens.push_back(word);
                tokens.push_back(std::string(1, ch));
                word.clear();
            }

            // The cast is required: passing a negative char to std::isalpha is
            // undefined behaviour.
            else if (ch == ' ' || std::isalpha(static_cast<unsigned char>(ch))){
                word += ch;
            }

        }

        tokens.push_back(word);

        return tokens;
    }
    

    Parser header

    #pragma once
    #include <string>
    #include <vector>
    #include "function_expression_node.hpp"

    // Stateless parser that turns a scanner token list into a parse tree and
    // counts how many strings that tree can generate. Not instantiable.
    class Function_Expression_Parser{

        Function_Expression_Parser() = delete;

    //get parse tree
    // Parse: entry point; writes the number of generated strings into `amount`.
    public: static std::vector<std::vector<Function_Expression_Node>> Parse(std::vector<std::string> const& tokens, unsigned int & amount);
        // Build_Parse_Tree: handles the top level of the expression, up to `end`.
        private: static std::vector<std::vector<Function_Expression_Node>> Build_Parse_Tree(std::vector<std::string>::const_iterator & it, std::vector<std::string>::const_iterator const& end, unsigned int & amount);
            // Recursive_Build: handles one parenthesised group; multiplies `total` by the group's count.
            private: static Function_Expression_Node Recursive_Build(std::vector<std::string>::const_iterator & it, int & total); //<- recursive

        //utility
        private: static bool Is_Word(std::string const& it);
    };
    

    Parser source

    #include "function_expression_parser.hpp"
    
    // A token is a word unless it is one of the operator tokens "(", "|" or ")".
    // The empty string therefore counts as a word.
    bool Function_Expression_Parser::Is_Word(std::string const& it){
        const bool is_operator = (it == "(" || it == "|" || it == ")");
        return !is_operator;
    }
    // Builds the node for one parenthesised group.
    //
    // it:    on entry, the first token after the opening "("; on return, the
    //        matching ")" (the caller steps past it).
    // total: count of the sequence that contains this group; it is multiplied by
    //        the number of strings this group can produce.
    //
    // Precondition: the group is closed by a ")" token. No end iterator is
    // available here, so unbalanced input would read past the end of the tokens.
    Function_Expression_Node Function_Expression_Parser::Recursive_Build(std::vector<std::string>::const_iterator & it, int & total){

        Function_Expression_Node sub_root("",true); //<- group node; children are its alternatives
        std::vector<Function_Expression_Node> sequence; //nodes of the alternative being read

        int or_count = 0;   //sum of the counts of the alternatives stored so far
        int sub_amount = 1; //count of the alternative being read

        while(*it != ")"){

            //when we see a "WORD", add it.
            if(Is_Word(*it)){
                sequence.push_back(Function_Expression_Node(*it));
            }

            //when we see a "(", build the subtree; it scales sub_amount itself
            else if (*it == "("){
                ++it;
                sequence.push_back(Recursive_Build(it,sub_amount));
            }

            //else we see an "OR": close the current alternative and start a new one
            else{
                sub_root.children.push_back(sequence);
                sequence.clear();

                or_count += sub_amount;
                sub_amount = 1;
            }

            ++it;
        }

        //add the last alternative, if there is any
        if (!sequence.empty()){
            sub_root.children.push_back(sequence);
            or_count += sub_amount;
        }

        //alternatives add up; the group as a whole multiplies the enclosing sequence
        if (or_count > 0){
            total *= or_count;
        }

        return sub_root;
    }
    std::vector> Function_Expression_Parser::Build_Parse_Tree(std::vector::const_iterator & it, std::vector::const_iterator const& end, unsigned int & amount){
    
        std::vector> full_root;
        std::vector root;
    
        const auto begin = it;
    
        //calculate the amount
        std::vector adds;
        int sub_amount = 1;
        int total = 0;
    
        while (it != end){
    
            //when we see a "WORD", add it.
            if(Is_Word(*it)){
                root.push_back(Function_Expression_Node(*it));
            }
    
            //when we see a "(", build the subtree,
            else if (*it == "("){
                ++it;
                root.push_back(Recursive_Build(it,sub_amount));
    
            }
    
            //else we see an "OR" and we do the split
            else{
                full_root.push_back(root);
                root.clear();
    
                //store the sub amount
                adds.push_back(sub_amount);
                sub_amount = 1;
            }
    
            ++it;
        }
    
        //add the last bit, if there is any
        if (!root.empty()){
            full_root.push_back(root);
    
            //store the sub amount
            adds.push_back(sub_amount);
            sub_amount = 1;
        }
    
        //calculate sub total
        for (auto const& it: adds){ total+=it; }
    
        /*
        std::cout << "---ROOT FUNCTION---\n";
        for (auto it: adds){std::cout << "[" << it << "] ";}std::cout << '\n';
        std::cout << total << std::endl << '\n';
        */
        amount = total;
    
        return full_root;
    }
    std::vector> Function_Expression_Parser::Parse(std::vector const& tokens, unsigned int & amount){
    
        auto it = tokens.cbegin();
        auto end = tokens.cend();
        auto parse_tree = Build_Parse_Tree(it,end,amount);
        return parse_tree;
    }
    

    Generator header

    #pragma once
    #include "function_expression_node.hpp"
    
    class Function_Expression_Generator{
    
        //constructors
        public: Function_Expression_Generator(std::string const& expression);
        public: Function_Expression_Generator();
    
        //transformer
        void Set_New_Expression(std::string const& expression);
    
        //observers
        public: unsigned int Get_Count();
        //public: unsigned int Get_One_Word_Name_Count();
        public: std::vector Get_Generations();
            private: std::vector Generate(std::vector> const& parse_tree);
                private: std::vector Sub_Generate(std::vector const& nodes);
    
    private:
        std::vector> m_parse_tree;
        unsigned int amount;
    
    };
    

    Generator source

    #include "function_expression_generator.hpp"
    
    #include "function_expression_scanner.hpp"
    #include "function_expression_parser.hpp"
    
    //constructors
    // Scans and parses `expression`; Parse also writes the match count into `amount`.
    Function_Expression_Generator::Function_Expression_Generator(std::string const& expression){
        m_parse_tree = Function_Expression_Parser::Parse(
            Function_Expression_Scanner::Scan(expression),
            amount);
    }
    // Creates a generator with no expression: empty parse tree and a count of 0.
    // `amount` is initialized explicitly because Get_Count reads it and an
    // uninitialized unsigned int has an indeterminate value.
    Function_Expression_Generator::Function_Expression_Generator()
        : m_parse_tree()
        , amount(0)
    {}
    
    //transformer
    // Re-scans and re-parses `expression`, replacing both the stored parse tree
    // and the stored count.
    void Function_Expression_Generator::Set_New_Expression(std::string const& expression){
        const auto token_list = Function_Expression_Scanner::Scan(expression);
        m_parse_tree = Function_Expression_Parser::Parse(token_list, amount);
    }
    
    //observers
    unsigned int Function_Expression_Generator::Get_Count(){
        return amount;
    }
    std::vector Function_Expression_Generator::Get_Generations(){
        return Generate(m_parse_tree);
    }
    std::vector Function_Expression_Generator::Generate(std::vector> const& parse_tree){
        std::vector results;
        std::vector more;
    
        for (auto it: parse_tree){
            more = Sub_Generate(it);
            results.insert(results.end(), more.begin(), more.end());
        }
    
        return results;
    }
    std::vector Function_Expression_Generator::Sub_Generate(std::vector const& nodes){
        std::vector results;
        std::vector more;
        std::vector new_results;
    
        results.push_back("");
        for (auto it: nodes){
            if (!it.more){
                for (auto & result: results){
                    result+=it.value;
                }
            }
            else{
                more = Generate(it.children);
                for (auto m: more){
                    for (auto r: results){
                        new_results.push_back(r+m);
                    }
                }
                more.clear();
                results = new_results;
                new_results.clear();
            }
        }
    
        return results;
    }
    

    In conclusion, I recommend using exrex, or any other programs mentioned in this thread, if you need to generate matches for regular expressions.

提交回复
热议问题