Grammar balancing issue

问题

Is it possible to force Boost.Spirit Qi to behave in such way, that generated grammar would be adjustable in compliance with some runtime-calculable conditions/rules/rates? For example, the input consists of language constructs, that cause the different alternatives during parsing, some more frequently, others -- less. But the order of the alternatives affects on the efficiency, i.e. runtime optimality of the grammar. In some cases it is impossible to determine in advance which alternative will be chosen more often in the case of arbitrary input (which may be strongly clustered).

I know that it is possible to append symbols to qi::symbols at runtime, but similar behavior would be desirable for some other parsers.

回答1:

You sadly forgot (?) to include a sample grammar. So I made up my own. It parses a language like this:

begin
    declare x : int;
    declare y : string;

    let x = 42;
    let y = "Life the universe and everything";

    for(ch : y)
    begin
        if (call is_alpha(ch))
        begin
            declare z : string;
            let z = call to_upper(ch);
            call print(z);
        end; 
        else call print("?");
    end;
end;

Now. You may notice that each "language construct" (as you refer to it in the OP) has an introducing keyword. This is on purpose.

Because, now, we can use qi::symbols with these introducer keywords to dispatch rules (this is known as the Nabialek trick):

// let's have some language constructs
feature_vardecl    = identifier >> ':' >> type >> ';';
feature_assignment = identifier >> "=" >> expression >> ';';
feature_block      = *statement >> kw["end"] >> ';' | statement;
feature_forloop    = '(' >> identifier >> ':' >> identifier > ')' >> statement;
feature_func_call  = invocation > ';';
feature_if         = ('(' > expression > ')' > statement) >> (kw["else"] > statement);

language_constructs.add
    ("declare", &feature_vardecl)
    ("let",     &feature_assignment)
    ("begin",   &feature_block)
    ("if",      &feature_if)
    ("for",     &feature_forloop)
    ("call",    &feature_func_call);

You can see, we store the address of the corresponding grammar rule as the value in the dictionary. Now, we employ the Nabialek trick (uses qi::_a local to invoke the subrule):

statement  = 
      (kw[language_constructs] [ qi::_a = qi::_1 ] > qi::lazy(*qi::_a))
    | (expression > ';');

If you wanted a more 'lightweight' grammar, you'd simply remove some features:

language_constructs.add
    ("let",     &feature_assignment)
    ("begin",   &feature_block)
    ("call",    &feature_func_call);

You could even dynamically add features to the language_constructs in response to input (e.g. a version identifier in the input, or just when parsing failed). I'm not sure whether this would be a good idea, but... it's all possible.

A fully functional sample that parses the above program (if supplied in input.txt) complete with adhoc "unit tests", distinct keyword checking support, debugging etc.:

See it Live on Coliru

#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_distinct.hpp>
#include <fstream>

namespace qi = boost::spirit::qi;
namespace phx= boost::phoenix;

using It      = std::string::const_iterator;
using Skipper = qi::space_type;
using Rule    = qi::rule<It, Skipper>;

template <typename G, size_t N>
    bool test(const char (&raw)[N], const G& grammar)
{
    std::string const input(raw, raw+N-1);
    auto f(std::begin(input)), l(std::end(input));
    try
    {
        bool ok = qi::phrase_parse(f, l, grammar, qi::space);
        // if (f!=l) std::cerr << "remaining unparsed: '" << std::string(f,l) << "'\n";
        return ok && (f == l); 
    } catch (qi::expectation_failure<It> const& e)
    {
        // std::cout << "Expectation failure '" << e.what() << "' at '" << std::string(e.first, e.last) << "'\n";
        return false;
    }
}

template <typename It, typename Skipper>
struct toy_grammar : qi::grammar<It, Skipper>
{
    toy_grammar() : toy_grammar::base_type(start)
    {
        using boost::spirit::repository::distinct;
        static const auto kw = distinct(qi::char_("a-zA-Z_0-9"));

        keywords.add("let")("declare")("begin")("end")("for")("call")("if")("else");

        identifier = !kw[keywords] >> qi::lexeme [ qi::alpha >> *qi::char_("a-zA-Z_0-9") ];

        assert( test("z", identifier));
        assert( test("Afgjkj_123123", identifier));
        assert(!test("1", identifier));

        type       = qi::lexeme [ kw["int"] | kw["double"]| kw["string"] | kw["boolean"]];

        assert( test("int",     type));
        assert( test("double",  type));
        assert( test("string",  type));
        assert( test("boolean", type));
        assert(!test("intzies", type));
        assert(!test("Int",     type));

        literal    = qi::lexeme [
                    qi::real_parser<double, qi::strict_real_policies<double>>()
                    | qi::int_
                    | qi::as_string ['"' >> *~qi::char_('"') >> '"']
                    | kw [ qi::bool_ ]
                    ];

        assert( test("42",     literal));
        assert( test("42.",    literal));
        assert( test(".0",     literal));
        assert( test("-3e+7",  literal));
        assert( test("-inf",   literal));
        assert( test("-99",    literal));
        assert( test("\"\"",   literal));
        assert( test("\"\0\"", literal));
        assert( test("true",   literal));
        assert( test("false",  literal));
        assert(!test("trueish",literal));
        assert(!test("yes",    literal));

        invocation = identifier > '(' > -(expression % ',') > ')';

        // arhem, this part left as an exercise for the reader :)
        expression = literal | identifier | (kw["call"] > invocation); 

        assert( test("-99",       expression));
        assert( test("\"santa\"", expression));
        assert( test("clause",    expression));
        assert( test("true",      expression));
        assert( test("call foo()",    expression));
        assert( test("call foo(bar, inf, false)", expression));
        assert(!test("call 42()",     expression));

        // let's have some language constructs
        feature_vardecl    = identifier >> ':' >> type >> ';';
        feature_assignment = identifier >> "=" >> expression >> ';';
        feature_block      = *statement >> kw["end"] >> ';' | statement;
        feature_forloop    = '(' >> identifier >> ':' >> identifier > ')' >> statement;
        feature_func_call  = invocation > ';';
        feature_if_else    = ('(' > expression > ')' > statement) >> (kw["else"] > statement);

        language_constructs.add
            ("declare", &feature_vardecl)
            ("let",     &feature_assignment)
            ("begin",   &feature_block)
            ("if",      &feature_if_else)
            ("for",     &feature_forloop)
            ("call",    &feature_func_call);

        statement  = 
              (kw[language_constructs] [ qi::_a = qi::_1 ] > qi::lazy(*qi::_a))
            | (expression > ';');

        assert( test("declare x : int;"                                       , statement));
        assert( test("let y = true;"                                          , statement));
        assert( test("call foo();",                                             statement));
        assert( test("call foo(bar, inf, false);",                              statement));

        assert( test("begin end;",                                              statement));
        assert( test("begin let y = x; end;",                                   statement));
        assert( test("begin let y = x; call foo(y); end;",                      statement));
        assert( test("for (x : collection) begin let y = x; call foo(y); end;", statement));

        BOOST_SPIRIT_DEBUG_NODES((identifier)(type)(literal)(expression)(invocation)(statement)
                (feature_vardecl)(feature_assignment)(feature_block)(feature_forloop)(feature_func_call)(feature_if_else)
                );

        start = statement;
    }
  private:
    qi::symbols<char, Rule const*> language_constructs;
    qi::symbols<char, qi::unused_type> keywords;

    Rule start,
         identifier, type, literal, expression, invocation, 
         feature_assignment, feature_vardecl, feature_block, feature_forloop, feature_func_call, feature_if_else;

    qi::rule<It, Skipper, qi::locals<Rule const*> > statement;
};

int main()
{
    using namespace std;
    ifstream ifs("input.txt", ios::binary);
    string const input(istreambuf_iterator<char>(ifs), {});

    auto f(begin(input)), l(end(input));
    try
    {
        static const toy_grammar<It, Skipper> p;
        bool ok = qi::phrase_parse(
                f, l,
                p,
                qi::space);

        assert(ok);

        if (f!=l)
            cout << "Program remaining unparsed: '" << string(f,l) << "'\n";
    } catch (qi::expectation_failure<It> const& e)
    {
        cout << "Expectation failure '" << e.what() << "' at '" << string(e.first, e.last) << "'\n";
    }
}

来源：https://stackoverflow.com/questions/20846491/grammar-balancing-issue

标签

c++

boost-spirit

boost-spirit-qi

tree-balancing