问题
I am attempting to use boost::spirit::qi to do some parsing. It's actually going quite well, and I successfully have managed to parse numbers in various bases based on a suffix. Examples: 123, c12h, 777o, 110101b.
I then wanted to add the ability to allow a completely ignored separator character, to allow values like 123_456 or 1101_0011b to parse. I tried using the skip parser, but I highly suspect that I completely misunderstood how it was to be used. It compiles just fine, but my attempt to make it ignore the underscore does absolutely nothing at all. Any suggestions on how to make this do what I want would be appreciated. My test code is included below:
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using qi::_val;
using qi::_1;
using qi::skip;
using qi::uint_parser;
using ascii::char_;
// Grammar that yields a uint64_t from a numeric literal whose base is chosen
// by a trailing suffix letter (b/o/h, with a plain or d/t-suffixed decimal
// as the last alternative).
// NOTE(review): per the accompanying question text, the skip(char_('_'))
// wrapper below does NOT make underscores inside a number disappear; it is
// kept here exactly as asked.
template <typename Iterator>
struct unsigned_parser : qi::grammar<Iterator, uint64_t()> {
// Builds the single `start` rule; the four local parsers differ only in radix.
unsigned_parser() : unsigned_parser::base_type(start) {
uint_parser<uint64_t, 10> dec_parser;
uint_parser<uint64_t, 16> hex_parser;
uint_parser<uint64_t, 8> oct_parser;
uint_parser<uint64_t, 2> bin_parser;
// Alternatives are tried top to bottom; each stores the parsed number into
// the rule's attribute (_val) through a semantic action.
start = skip(char_('_'))[
/* binary with suffix */
(bin_parser[_val=_1] >> char_("bByY"))
/* octal with suffix */
| (oct_parser[_val=_1] >> char_("qQoO"))
/* hexadecimal with suffix */
| (hex_parser[_val=_1] >> char_("hHxX"))
/* decimal with optional suffix */
| (dec_parser[_val=_1] >> -char_("dDtT"))
];
}
// Entry rule handed to the grammar base class; synthesizes the parsed value.
qi::rule<Iterator, uint64_t()> start;
};
/// Command-line driver: parses the single argument with unsigned_parser and
/// reports whether all of it, only a prefix, or none of it was consumed.
///
/// @param argc number of command-line arguments (program name included)
/// @param argv argument vector; argv[1] is the text to parse
/// @return 0 when the whole input parsed, 1 otherwise (including bad usage)
int main(int argc, const char *argv[]) {
    typedef std::string::const_iterator iter;

    // Exactly one user-supplied argument is expected.
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <input>" << '\n';
        return 1;
    }

    const std::string test(argv[1]);
    unsigned_parser<iter> up;
    uint64_t val = 0;

    iter i = test.begin();
    const iter end = test.end();
    const bool rv = parse(i, end, up, val);

    // Full match: the parser succeeded and consumed every character.
    if (rv && i == end) {
        std::cout << "Succeeded: " << val << '\n';
        return 0;
    }
    // The parser succeeded but left trailing input behind.
    if (rv) {
        std::cout << "Failed partial parse: " << val << '\n';
        return 1;
    }
    std::cout << "Failed." << '\n';
    return 1;
}
回答1:
Aw. Nobody should have to bother with implementation details like Spirit parser contexts unless you're extending the library and implementing your own parser directives.
Until that time, phoenix::function<>
, phoenix::bind
or even BOOST_PHOENIX_ADAPT_FUNCTION
should be plenty for anyone.
Here are two approaches to your question without any patches to the library.
Straightforward parsing Live On Coliru
This could be viewed as the "naive" way of parsing the different styles of integers using just Qi and simple semantic actions:
// Every alternative first resets _val, then folds each digit into it while
// consuming '_' separators without touching the value; the suffix that
// follows decides which radix applies.
start =
      /* hexadecimal with suffix */
      eps[_val = 0]
          >> +(char_("0-9a-fA-F")[_val = _val * 16 + _decode(_1)] | '_')
          >> char_("hHxX")
      /* octal with suffix */
    | eps[_val = 0]
          >> +(char_("0-7")[_val = _val * 8 + _decode(_1)] | '_')
          >> char_("qQoO")
      /* binary with suffix */
    | eps[_val = 0]
          >> +(char_("01")[_val = _val * 2 + _decode(_1)] | '_')
          >> char_("bByY")
      /* decimal with optional suffix */
    | eps[_val = 0]
          >> +(char_("0-9")[_val = _val * 10 + _decode(_1)] | '_')
          >> -char_("dDtT")
    ;
Of course, you will want to know what
_decode
looks like. Well you define it yourself:
/// Function object (wrapped below as a lazy Phoenix function) that converts
/// one digit character into its numeric value.
struct decode {
    /// Result-type protocol: every call yields an int.
    template <typename> struct result { typedef int type; };

    /// @param ch a digit '0'-'9' or a letter; letters map to 10..35,
    ///           independent of case
    /// @return the digit value, or -1 for any other character (asserts in
    ///         debug builds)
    template <typename Ch>
    int operator()(Ch ch) const {
        if (ch >= '0' && ch <= '9') return ch - '0';
        if (ch >= 'a' && ch <= 'z') return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'Z') return ch - 'A' + 10;
        assert(false);
        // Explicit return so that builds with NDEBUG (assert compiled out)
        // do not run off the end of a non-void function.
        return -1;
    }
};
boost::phoenix::function<decode> _decode;
Using
BOOST_PHOENIX_ADAPT_FUNCTION
macro Live On Coliru. Instead of defining the function object, you can use the macro:
/// Converts one digit character into its numeric value.
///
/// @param ch a digit '0'-'9' or a letter; letters map to 10..35,
///           independent of case
/// @return the digit value, or -1 for any other character (asserts in
///         debug builds)
int decode(char ch) {
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'z') return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'Z') return ch - 'A' + 10;
    assert(false);
    // Explicit return so that builds with NDEBUG (assert compiled out) do not
    // run off the end of a non-void function.
    return -1;
}
// Exposes decode() as the lazy one-argument Phoenix function _decode.
BOOST_PHOENIX_ADAPT_FUNCTION(int, _decode, decode, 1)
Using
std::strtoul
Live On Coliru. Of course, the above may be a tad "complex" because it requires you to deal with the nitty-gritty details of integer arithmetic and digit decoding.
Also, the "naive" approach does some duplicate work in case the literal is a decimal value like "101_101". It will calculate the subresult for the hex, octal and binary branches before realizing it was a decimal.
So we could change the order around:
// Each alternative captures the raw digit/underscore run, checks the suffix,
// and only then converts the captured text with the matching base.
start =
      /* hexadecimal with suffix */
      (raw[+char_("_0-9a-fA-F")] >> char_("hHxX"))[_val = _strtoul(_1, 16)]
      /* octal with suffix */
    | (raw[+char_("_0-7")] >> char_("qQoO"))[_val = _strtoul(_1, 8)]
      /* binary with suffix */
    | (raw[+char_("_01")] >> char_("bByY"))[_val = _strtoul(_1, 2)]
      /* decimal with optional suffix */
    | (raw[+char_("_0-9")] >> -char_("dDtT"))[_val = _strtoul(_1, 10)]
    ;
Again you will be curious how we implemented
_strtoul
? It's a function that takes the synthesized attributes from raw
(which is an iterator range) and the base, which is definitely known by then:
/// Function object (wrapped below as a lazy Phoenix function) that converts a
/// matched character range into an unsigned 64-bit value.
struct strtoul_f {
    /// Result-type protocol: every call yields a uint64_t.
    template <typename, typename> struct result { typedef uint64_t type; };

    /// @param raw  range exposing begin()/end() over the matched characters;
    ///             may contain '_' separators
    /// @param base numeric base forwarded to the C conversion routine
    /// @return the value of the text with all '_' removed
    template <typename Raw, typename Int>
    uint64_t operator()(Raw raw, Int base) const {
        std::string s(raw.begin(), raw.end());
        // Drop the separators; the C conversion would stop at the first '_'.
        s.erase(std::remove(s.begin(), s.end(), '_'), s.end());
        // strtoull rather than strtoul: unsigned long is only 32 bits on some
        // platforms, which would not cover the full uint64_t range.
        return std::strtoull(s.c_str(), nullptr, base);
    }
};
boost::phoenix::function<strtoul_f> _strtoul;
As you can see, the only complexity is removing the
_
from the range first.
回答2:
If you really want to do this the "nice" way, you'd have to hack it into extract_int
in numeric_utils.hpp.
Even better, you'd want to make it a strategy class much like with the real_policies
used by real_parser
. Because just mixing in more branches with the existing general purpose integer handling code just complicates that and has the potential to slow down any integer parsing.
I have not done this. However, I do have a proof-of-concept approach here:
- https://github.com/boostorg/spirit/compare/master...sehe:so_q29132809
Mind you, this is not well tested and not fit for serious use for the reasons stated, but you can use it as inspiration. You might want to just duplicate the uint_parser
directive as-a-whole and stick it in your Spirit Repository location.
The patch
It's relatively straightforward. If you define
ALLOW_SO_UNDERSCORE_HACK
you will get the bypass for underscore inserted into the loop unrolling macros:
// With the hack enabled this expands to "step over '_' and continue";
// otherwise it expands to nothing.
#if defined(ALLOW_SO_UNDERSCORE_HACK)
# define SPIRIT_SO_SKIP_UNDERSCORE_HACK()                                     \
    if ('_' == *it) {                                                         \
        ++it;                                                                 \
        continue;                                                             \
    }
#else
# define SPIRIT_SO_SKIP_UNDERSCORE_HACK()
#endif
The only real complexity there is from "seeing through" the optimizations made in that translation unit.
There's a rather arbitrary choice to (dis)allow underscores among the leading zeros. I have opted to allow them:
#if defined(ALLOW_SO_UNDERSCORE_HACK) // skip leading zeros for(;it != last;++it) { if ('0' == *it && leading_zeros < MaxDigits) { ++leading_zeros; continue; } else if ('_' == *it) { continue; } break; } #else
Finally, underscores are not counted towards the
MinDigits
and MaxDigits
limits
DEMO
The following test program demonstrates things. Note the reordering of branches.
#include <boost/spirit/include/qi.hpp>
namespace qi = boost::spirit::qi;
// Demo grammar: yields a uint64_t from a numeric literal, trying the
// hexadecimal, octal and binary suffixed forms before falling back to
// decimal. Nothing here mentions '_'; per the surrounding text, separator
// support comes from building against the patched numeric parser with
// ALLOW_SO_UNDERSCORE_HACK defined.
template <typename Iterator>
struct unsigned_parser : qi::grammar<Iterator, uint64_t()> {
// Builds the single `start` rule; the four local parsers differ only in radix.
unsigned_parser() : unsigned_parser::base_type(start) {
using namespace qi;
uint_parser<uint64_t, 10> dec_parser;
uint_parser<uint64_t, 16> hex_parser;
uint_parser<uint64_t, 8> oct_parser;
uint_parser<uint64_t, 2> bin_parser;
// NOTE(review): eps(false) never matches; it looks like a formatting aid so
// that every real alternative can start with '|'. The suffix characters are
// wrapped in omit[] so only the number contributes to the attribute.
start = eps(false)
| (hex_parser >> omit[ char_("hHxX")]) /* hexadecimal with suffix */
| (oct_parser >> omit[ char_("qQoO")]) /* octal with suffix */
| (bin_parser >> omit[ char_("bByY")]) /* binary with suffix */
| (dec_parser >> omit[-char_("dDtT")]) /* decimal with optional suffix */
;
}
// Entry rule handed to the grammar base class; synthesizes the parsed value.
qi::rule<Iterator, uint64_t()> start;
};
/// Demo driver: parses every command-line argument with unsigned_parser and
/// prints the outcome, the resulting value and any unconsumed tail.
///
/// @param argc number of command-line arguments (program name included)
/// @param argv argument vector; argv[1..argc-1] are the inputs to parse
int main(int argc, const char *argv[]) {
    typedef std::string::const_iterator iter;
    unsigned_parser<iter> up;

    for (auto const& test : std::vector<std::string>(argv + 1, argv + argc)) {
        iter i = test.begin(), end = test.end();
        // Initialised because the value is printed even when parsing fails;
        // reading it uninitialised would be undefined behaviour.
        uint64_t val = 0;
        bool rv = parse(i, end, up, val);
        std::cout << (rv?"Successful":"Failed") << " parse: '" << test << "' -> " << val << "\n";
        // Report whatever the parser did not consume.
        if (i != end)
            std::cout << " ** Remaining unparsed: '" << std::string(i,end) << "'\n";
    }
}
If you call it with command line arguments 123_456 123456 1_bh 0_010Q 1010_1010_0111_0111_b
it will print:
Successful parse: '123_456' -> 123456
Successful parse: '123456' -> 123456
Successful parse: '1_bh' -> 27
Successful parse: '0_010Q' -> 8
Successful parse: '1010_1010_0111_0111_b' -> 43639
LISTING
Full patch (on boost-1.57.0
tag) for preservation on SO:
commit 24b16304f436bfd0f6e2041b2b7be0c8677c7e75
Author: Seth Heeren <sgheeren@gmail.com>
Date: Thu Mar 19 01:44:55 2015 +0100
http://stackoverflow.com/questions/29132809/using-boostspiritqi-to-parse-numbers-with-separators
rough patch for exposition of my answer only
diff --git a/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp b/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
index 5137f87..1ced164 100644
--- a/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
+++ b/include/boost/spirit/home/qi/numeric/detail/numeric_utils.hpp
@@ -262,10 +262,21 @@ namespace boost { namespace spirit { namespace qi { namespace detail
///////////////////////////////////////////////////////////////////////////
// extract_int: main code for extracting integers
///////////////////////////////////////////////////////////////////////////
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+# define SPIRIT_SO_SKIP_UNDERSCORE_HACK() \
+ if ('_' == *it) { \
+ ++it; \
+ continue; \
+ }
+#else
+# define SPIRIT_SO_SKIP_UNDERSCORE_HACK()
+#endif
+
#define SPIRIT_NUMERIC_INNER_LOOP(z, x, data) \
if (!check_max_digits<MaxDigits>::call(count + leading_zeros) \
|| it == last) \
break; \
+ SPIRIT_SO_SKIP_UNDERSCORE_HACK() \
ch = *it; \
if (!radix_check::is_valid(ch) || !extractor::call(ch, count, val)) \
break; \
@@ -301,12 +312,25 @@ namespace boost { namespace spirit { namespace qi { namespace detail
std::size_t leading_zeros = 0;
if (!Accumulate)
{
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+ // skip leading zeros
+ for(;it != last;++it) {
+ if ('0' == *it && leading_zeros < MaxDigits) {
+ ++leading_zeros;
+ continue;
+ } else if ('_' == *it) {
+ continue;
+ }
+ break;
+ }
+#else
// skip leading zeros
while (it != last && *it == '0' && leading_zeros < MaxDigits)
{
++it;
++leading_zeros;
}
+#endif
}
typedef typename
@@ -366,6 +390,7 @@ namespace boost { namespace spirit { namespace qi { namespace detail
#define SPIRIT_NUMERIC_INNER_LOOP(z, x, data) \
if (it == last) \
break; \
+ SPIRIT_SO_SKIP_UNDERSCORE_HACK() \
ch = *it; \
if (!radix_check::is_valid(ch)) \
break; \
@@ -399,12 +424,25 @@ namespace boost { namespace spirit { namespace qi { namespace detail
std::size_t count = 0;
if (!Accumulate)
{
+#if defined(ALLOW_SO_UNDERSCORE_HACK)
+ // skip leading zeros
+ for(;it != last;++it) {
+ if ('0' == *it) {
+ ++count;
+ continue;
+ } else if ('_' == *it) {
+ continue;
+ }
+ break;
+ }
+#else
// skip leading zeros
while (it != last && *it == '0')
{
++it;
++count;
}
+#endif
if (it == last)
{
@@ -472,6 +510,7 @@ namespace boost { namespace spirit { namespace qi { namespace detail
};
#undef SPIRIT_NUMERIC_INNER_LOOP
+#undef SPIRIT_SO_SKIP_UNDERSCORE_HACK
///////////////////////////////////////////////////////////////////////////
// Cast an signed integer to an unsigned integer
来源:https://stackoverflow.com/questions/29132809/using-boostspiritqi-to-parse-numbers-with-separators