Using boost::spirit::qi to parse numbers with separators

问题

I am attempting to use boost::spirit::qi to do some parsing. It's actually going quite well, and I successfully have managed to parse numbers in various bases based on a suffix. Examples: 123, c12h, 777o, 110101b.

I then wanted to add the ability to allow a completely ignored separator character, to allow values like 123_456 or 1101_0011b to parse. I tried using the skip parser, but I highly suspect that I completely misunderstood how it was to be used. It compiles just fine, but my attempt to make it ignore the underscore does absolutely nothing at all. Any suggestions on how to make this do what I want would be appreciated. My test code is included below:

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using qi::_val;
using qi::_1;
using qi::skip;
using qi::uint_parser;
using ascii::char_;

// Grammar that parses an unsigned integer literal whose radix is chosen by a
// trailing suffix character and exposes the value as a uint64_t attribute.
template <typename Iterator>
struct unsigned_parser : qi::grammar<Iterator, uint64_t()> {

    unsigned_parser() : unsigned_parser::base_type(start) {
        // One numeric parser per supported radix (second template argument).
        uint_parser<uint64_t, 10> dec_parser;
        uint_parser<uint64_t, 16> hex_parser;
        uint_parser<uint64_t, 8> oct_parser;
        uint_parser<uint64_t, 2> bin_parser;

        // The alternatives are wrapped in skip[] with '_' as the skipper.
        // NOTE(review): as the accompanying question reports, this does NOT
        // make '_' inside a number ignored -- presumably the numeric parsers
        // consume their digits without consulting the skipper; confirm
        // against the Spirit documentation.
        start = skip(char_('_'))[
            /* binary with suffix */
            (bin_parser[_val=_1] >> char_("bByY"))
            /* octal with suffix */
            | (oct_parser[_val=_1] >> char_("qQoO"))
            /* hexadecimal with suffix */
            | (hex_parser[_val=_1] >> char_("hHxX"))
            /* decimal with optional suffix */
            | (dec_parser[_val=_1] >> -char_("dDtT"))
            ];
    }

    // Entry rule; also the grammar's start rule (passed to base_type above).
    qi::rule<Iterator, uint64_t()> start;
};

// Entry point: parses the single command-line argument with unsigned_parser
// and reports the outcome on stdout.
//
// Returns 0 only when the whole argument was consumed by the grammar; a usage
// error, a partial match, or a failed parse all return 1.
//
// The parameters use the conventional names (argc = count, argv = vector);
// the original had the two names swapped, which worked but misled readers.
int main(int argc, char *argv[]) {
    typedef std::string::const_iterator iter;

    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <input>" << std::endl;
        return 1;
    }

    const std::string test(argv[1]);
    iter i = test.begin();
    const iter end = test.end();

    unsigned_parser<iter> up;
    uint64_t val = 0;  // defined value even if the parser never assigns it
    const bool rv = parse(i, end, up, val);

    if (!rv) {
        std::cout << "Failed." << std::endl;
        return 1;
    }
    if (i != end) {
        // The grammar matched a prefix but left trailing input unconsumed.
        std::cout << "Failed partial parse: " << val << std::endl;
        return 1;
    }
    std::cout << "Succeeded: " << val << std::endl;
    return 0;
}

回答1:

Aw. Nobody should have to bother with implementation details like Spirit parser contexts unless you're extending the library and implementing your own parser directives.

Until that time, phoenix::function<>, phoenix::bind or even BOOST_PHOENIX_ADAPT_FUNCTION should be plenty for anyone.

Here are two approaches to your question without any patches to the library.

Straightforward parsing Live On Coliru

This could be viewed as the "naive" way of parsing the different styles of integers using just Qi and simple semantic actions:

// Alternatives are tried in the order written. Each one first resets _val to
// 0, then folds every matched digit into _val (radix * accumulated value +
// decoded digit) while accepting '_' anywhere among the digits without it
// contributing to the value. The trailing character set selects the radix;
// only the decimal suffix is optional.
start = 
      eps [_val=0] >> +(char_("0-9a-fA-F") [ _val = _val*16 + _decode(_1) ] | '_')>>  char_("hHxX") /* hexadecimal with suffix */
    | eps [_val=0] >> +(char_("0-7")       [ _val = _val* 8 + _decode(_1) ] | '_')>>  char_("qQoO") /* octal       with suffix */
    | eps [_val=0] >> +(char_("01")        [ _val = _val* 2 + _decode(_1) ] | '_')>>  char_("bByY") /* binary      with suffix */
    | eps [_val=0] >> +(char_("0-9")       [ _val = _val*10 + _decode(_1) ] | '_')>> -char_("dDtT") /* decimal     with optional suffix */
    ;

Of course, you will want to know what _decode looks like. Well you define it yourself:

// Function object that maps one alphanumeric digit character to its numeric
// value: '0'-'9' -> 0-9, 'a'-'z' and 'A'-'Z' -> 10-35.
struct decode {
    // Result-type protocol: the call always yields int.
    template <typename> struct result { typedef int type; };

    // Returns the digit value of `ch`. Callers are expected to pass only
    // characters already matched as digits; anything else trips the assert
    // in debug builds and yields 0 otherwise.
    template <typename Ch> int operator()(Ch ch) const {
        if (ch>='0' && ch<='9') return ch - '0';
        if (ch>='a' && ch<='z') return ch - 'a' + 10;
        if (ch>='A' && ch<='Z') return ch - 'A' + 10;
        assert(false && "decode: character is not an alphanumeric digit");
        // Without this, control would fall off the end of a non-void function
        // (undefined behaviour) whenever assert is compiled out via NDEBUG.
        return 0;
    }
};
// Wraps decode so it can be called inside semantic actions, e.g. _decode(_1).
boost::phoenix::function<decode> _decode;

Using BOOST_PHOENIX_ADAPT_FUNCTION macro Live On Coliru

Instead of defining the function object you can use the macro

// Maps one alphanumeric digit character to its numeric value:
// '0'-'9' -> 0-9, 'a'-'z' and 'A'-'Z' -> 10-35.
//
// Callers are expected to pass only characters already matched as digits;
// anything else trips the assert in debug builds and yields 0 otherwise.
int decode(char ch) {
    if (ch>='0' && ch<='9') return ch - '0';
    if (ch>='a' && ch<='z') return ch - 'a' + 10;
    if (ch>='A' && ch<='Z') return ch - 'A' + 10;
    assert(false && "decode: character is not an alphanumeric digit");
    // Without this, control would fall off the end of a non-void function
    // (undefined behaviour) whenever assert is compiled out via NDEBUG.
    return 0;
}

// Adapts the free function decode (one argument, returning int) into a
// Phoenix-callable named _decode for use inside semantic actions.
BOOST_PHOENIX_ADAPT_FUNCTION(int, _decode, decode, 1)

Using std::strtoul Live On Coliru

Of course, the above may be a tad "complex" because it requires you to deal with nitty gritty details of integer arithmetics and digit decoding.

Also, the "naive" approach does some duplicate work in case the literal is a decimal value like "101_101". It will calculate the subresult for the hex, octal and binary branches before realizing it was a decimal.

So we could change the order around:

// Each alternative captures the run of digits and '_' separators as a raw
// range, requires the radix suffix (optional for decimal only), and then
// converts the captured range in one step with _strtoul(range, base).
// Alternatives are tried in the order written, hexadecimal first.
start = 
        (raw[+char_("_0-9a-fA-F")] >>  char_("hHxX")) [ _val = _strtoul(_1,16) ] /* hexadecimal with suffix */
      | (raw[+char_("_0-7")]       >>  char_("qQoO")) [ _val = _strtoul(_1, 8) ] /* octal       with suffix */
      | (raw[+char_("_01")]        >>  char_("bByY")) [ _val = _strtoul(_1, 2) ] /* binary      with suffix */
      | (raw[+char_("_0-9")]       >> -char_("dDtT")) [ _val = _strtoul(_1,10) ] /* decimal     with optional suffix */
      ;

Again, you will be curious how we implemented _strtoul. It's a function that takes the synthesized attribute from raw (which is an iterator range) and the base, which is definitely known by then:

// Function object that converts a matched digit range, possibly containing
// '_' separators, into an unsigned 64-bit value in the given base.
struct strtoul_f {
    // Result-type protocol: the call always yields uint64_t.
    template <typename, typename> struct result { typedef uint64_t type; };

    // raw  - any range exposing begin()/end() over characters
    // base - numeric radix forwarded to the C conversion routine
    template <typename Raw, typename Int> uint64_t operator()(Raw raw, Int base) const {
        std::string digits(raw.begin(), raw.end());
        // Separators carry no value; drop them before converting.
        digits.erase(std::remove(digits.begin(), digits.end(), '_'), digits.end());
        // strtoull rather than strtoul: unsigned long is only 32 bits wide on
        // some platforms, so strtoul could not produce the full uint64_t
        // range there. The end pointer is not needed, hence nullptr.
        return std::strtoull(digits.c_str(), nullptr, static_cast<int>(base));
    }
};
// Wraps strtoul_f so it can be called inside semantic actions, e.g.
// _strtoul(_1, 16).
boost::phoenix::function<strtoul_f> _strtoul;

As you can see, the only complexity is removing the _ from the range first.

回答2:

If you really want to do this the "nice" way, you'd have to hack it into extract_int in numeric_utils.hpp.

Even better, you'd want to make it a strategy class, much like the real_policies used by real_parser, because mixing more branches into the existing general-purpose integer handling code complicates it and has the potential to slow down all integer parsing.

I have not done this. However, I do have a proof-of-concept approach here:

https://github.com/boostorg/spirit/compare/master...sehe:so_q29132809

Mind you, this is not well tested and not fit for serious use for the reasons stated, but you can use it as inspiration. You might want to just duplicate the uint_parser directive as-a-whole and stick it in your Spirit Repository location.

The patch

It's relatively straightforward. If you define ALLOW_SO_UNDERSCORE_HACK you will get the bypass for underscore inserted into the loop unrolling macros:

// Opt-in hook for the integer extraction loops: when ALLOW_SO_UNDERSCORE_HACK
// is defined, the macro expands to code that steps `it` past a '_' and
// continues the enclosing loop; otherwise it expands to nothing.
#if defined(ALLOW_SO_UNDERSCORE_HACK)
#   define SPIRIT_SO_SKIP_UNDERSCORE_HACK()                                   \
                if ('_' == *it) {                                             \
                    ++it;                                                     \
                    continue;                                                 \
                }
#else
#   define SPIRIT_SO_SKIP_UNDERSCORE_HACK()
#endif

The only real complexity there comes from "seeing through" the optimizations made in that translation unit.

There's a rather arbitrary choice to (dis)allow underscores among the leading zeros. I have opted to allow them:

#if defined(ALLOW_SO_UNDERSCORE_HACK)
                // skip leading zeros
                for(;it != last;++it) {
                    if ('0' == *it && leading_zeros < MaxDigits) {
                        ++leading_zeros;
                        continue;
                    } else if ('_' == *it) {
                        continue;
                    }
                    break;
                }
#else

Finally, underscores are not counted towards the MinDigits and MaxDigits limits.

DEMO

The following test program demonstrates things. Note the reordering of branches.

#include <boost/spirit/include/qi.hpp>

namespace qi = boost::spirit::qi;

// Grammar that parses an unsigned integer literal whose radix is chosen by a
// trailing suffix character and exposes the value as a uint64_t attribute.
// NOTE(review): per the surrounding text, underscore separators are only
// accepted here when the patched numeric parser is built with
// ALLOW_SO_UNDERSCORE_HACK defined -- nothing in this grammar handles '_'.
template <typename Iterator>
struct unsigned_parser : qi::grammar<Iterator, uint64_t()> {

    unsigned_parser() : unsigned_parser::base_type(start) {
        using namespace qi;
        // One numeric parser per supported radix (second template argument).
        uint_parser<uint64_t, 10> dec_parser;
        uint_parser<uint64_t, 16> hex_parser;
        uint_parser<uint64_t, 8> oct_parser;
        uint_parser<uint64_t, 2> bin_parser;

        // Alternatives are tried in the order written, hexadecimal first.
        // NOTE(review): eps(false) looks like a never-matching placeholder so
        // that every real alternative can start with '|' -- confirm.
        // The suffix character is wrapped in omit[] so that it does not
        // become part of the rule's attribute.
        start = eps(false)
            | (hex_parser >> omit[ char_("hHxX")]) /* hexadecimal with suffix */
            | (oct_parser >> omit[ char_("qQoO")]) /* octal with suffix */
            | (bin_parser >> omit[ char_("bByY")]) /* binary with suffix */
            | (dec_parser >> omit[-char_("dDtT")]) /* decimal with optional suffix */
            ;
    }

    // Entry rule; also the grammar's start rule (passed to base_type above).
    qi::rule<Iterator, uint64_t()> start;
};

// Entry point: parses every command-line argument with unsigned_parser and
// prints, per argument, whether the parse succeeded, the resulting value, and
// any input the grammar left unconsumed.
//
// The parameters use the conventional names (argc = count, argv = vector);
// the original had the two names swapped, which worked but misled readers.
int main(int argc, char *argv[]) {
    typedef std::string::const_iterator iter;
    unsigned_parser<iter> up;

    // Skip argv[0] (the program name); everything after it is a test input.
    for (auto const& test : std::vector<std::string>(argv+1, argv+argc)) {
        iter i = test.begin(), end = test.end();

        // Initialised so that the report below never reads an indeterminate
        // value when the parse fails without assigning the attribute.
        uint64_t val = 0;
        bool rv = parse(i, end, up, val);

        std::cout << (rv?"Successful":"Failed") << " parse: '" << test << "' -> " << val << "\n";

        if (i != end)
            std::cout << " ** Remaining unparsed: '" << std::string(i,end) << "'\n";
    }
}

If you call it with command line arguments 123_456 123456 1_bh 0_010Q 1010_1010_0111_0111_b it will print:

Successful parse: '123_456' -> 123456
Successful parse: '123456' -> 123456
Successful parse: '1_bh' -> 27
Successful parse: '0_010Q' -> 8
Successful parse: '1010_1010_0111_0111_b' -> 43639

LISTING

Full patch (on boost-1.57.0 tag) for preservation on SO:

commit 24b16304f436bfd0f6e2041b2b7be0c8677c7e75
Author: Seth Heeren <sgheeren@gmail.com>
Date:   Thu Mar 19 01:44:55 2015 +0100

    http://stackoverflow.com/questions/29132809/using-boostspiritqi-to-parse-numbers-with-separators

    rough patch for exposition of my answer only

diff --git a/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp b/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
index 5137f87..1ced164 100644
--- a/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
+++ b/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
@@ -262,10 +262,21 @@ namespace boost { namespace spirit { namespace qi { namespace detail
    ///////////////////////////////////////////////////////////////////////////
    //  extract_int: main code for extracting integers
    ///////////////////////////////////////////////////////////////////////////
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+#   define SPIRIT_SO_SKIP_UNDERSCORE_HACK()                                   \
+                if ('_' == *it) {                                             \
+                    ++it;                                                     \
+                    continue;                                                 \
+                }
+#else
+#   define SPIRIT_SO_SKIP_UNDERSCORE_HACK()
+#endif
+
#define SPIRIT_NUMERIC_INNER_LOOP(z, x, data)                                 \
        if (!check_max_digits<MaxDigits>::call(count + leading_zeros)         \
            || it == last)                                                    \
            break;                                                            \
+        SPIRIT_SO_SKIP_UNDERSCORE_HACK()                                      \
        ch = *it;                                                             \
        if (!radix_check::is_valid(ch) || !extractor::call(ch, count, val))   \
            break;                                                            \
@@ -301,12 +312,25 @@ namespace boost { namespace spirit { namespace qi { namespace detail
            std::size_t leading_zeros = 0;
            if (!Accumulate)
            {
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+                // skip leading zeros
+                for(;it != last;++it) {
+                    if ('0' == *it && leading_zeros < MaxDigits) {
+                        ++leading_zeros;
+                        continue;
+                    } else if ('_' == *it) {
+                        continue;
+                    }
+                    break;
+                }
+#else
                // skip leading zeros
                while (it != last && *it == '0' && leading_zeros < MaxDigits)
                {
                    ++it;
                    ++leading_zeros;
                }
+#endif
            }

            typedef typename
@@ -366,6 +390,7 @@ namespace boost { namespace spirit { namespace qi { namespace detail
#define SPIRIT_NUMERIC_INNER_LOOP(z, x, data)                                 \
        if (it == last)                                                       \
            break;                                                            \
+        SPIRIT_SO_SKIP_UNDERSCORE_HACK()                                      \
        ch = *it;                                                             \
        if (!radix_check::is_valid(ch))                                       \
            break;                                                            \
@@ -399,12 +424,25 @@ namespace boost { namespace spirit { namespace qi { namespace detail
            std::size_t count = 0;
            if (!Accumulate)
            {
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+                // skip leading zeros
+                for(;it != last;++it) {
+                    if ('0' == *it) {
+                        ++count;
+                        continue;
+                    } else if ('_' == *it) {
+                        continue;
+                    }
+                    break;
+                }
+#else
                // skip leading zeros
                while (it != last && *it == '0')
                {
                    ++it;
                    ++count;
                }
+#endif

                if (it == last)
                {
@@ -472,6 +510,7 @@ namespace boost { namespace spirit { namespace qi { namespace detail
    };

#undef SPIRIT_NUMERIC_INNER_LOOP
+#undef SPIRIT_SO_SKIP_UNDERSCORE_HACK

    ///////////////////////////////////////////////////////////////////////////
    // Cast an signed integer to an unsigned integer

来源：https://stackoverflow.com/questions/29132809/using-boostspiritqi-to-parse-numbers-with-separators

标签

c++

boost-spirit

boost-spirit-qi