问题
Once again, I find myself reaching for boost spirit. Once again I find myself defeated by it.
A HTTP header value takes the general form:
text/html; q=1.0, text/*; q=0.8, image/gif; q=0.6, image/jpeg; q=0.6, image/*; q=0.5, */*; q=0.1
i.e. value *OWS [; *OWS name *OWS [= *OWS possibly_quoted_value] *OWS [...]] *OWS [ , <another value> ...]
so in my mind, this header decodes to:
value[0]:
text/html
params:
name : q
value : 1.0
value[1]:
text/*
params:
name : q
value : 0.8
...
and so on.
I am certain that to anyone who knows how, the boost::spirit::qi syntax for this is trivial.
I humbly ask for your assistance.
for example, here's the outline of the code which decodes the Content-Type
header, which is limited to one value of the form type/subtype
, with any number of parameters of the form <sp> ; <sp> token=token|quoted_string
template<class Iter>
void parse(ContentType& ct, Iter first, Iter last)
{
ct.mutable_type()->append(to_lower(consume_token(first, last)));
consume_lit(first, last, '/');
ct.mutable_subtype()->append(to_lower(consume_token(first, last)));
while (first != last) {
skipwhite(first, last);
if (consume_char_if(first, last, ';'))
{
auto p = ct.add_parameters();
skipwhite(first, last);
p->set_name(to_lower(consume_token(first, last)));
skipwhite(first, last);
if (consume_char_if(first, last, '='))
{
skipwhite(first, last);
p->set_value(consume_token_or_quoted(first, last));
}
else {
// no value on this parameter
}
}
else if (consume_char_if(first, last, ','))
{
// normally we should get the next value-token here but in the case of Content-Type
// we must barf
throw std::runtime_error("invalid use of ; in Content-Type");
}
}
}
ContentType& populate(ContentType& ct, const std::string& header_value)
{
parse(ct, header_value.begin(), header_value.end());
return ct;
}
回答1:
I've taken the code as posted by OP and given it a review.
there's no need to specify
void()
. In fact it's preferable to useqi::unused_type
in such cases, which is what rules will default to if no attribute type is declared.there no need for
char_
if you don't wish to expose the attribute. Uselit
instead.there is no need to wrap every char parser in a
rule
. That hurts performance. It's best to leave the proto expression tree un-evaluated as long so Qi can optimize parser expressions more, and the compiler can inline more.Also, Qi doesn't have move semantics on attributes, so avoiding redundant rules eliminates redundant copies of sub-attributes that get concatenated in the containing rules.
Sample alternative spelling (caution, see Assigning parsers to auto variables)
auto CR = qi::lit('\r'); auto LF = qi::lit('\n'); auto CRLF = qi::lit("\r\n"); auto HT = qi::lit('\t'); auto SP = qi::lit(' '); auto LWS = qi::copy(-CRLF >> +(SP | HT)); // deepcopy UPALPHA = char_('A', 'Z'); LOALPHA = char_('a', 'z'); ALPHA = UPALPHA | LOALPHA; DIGIT = char_('0', '9'); //CTL = char_(0, 31) | char_(127); TEXT = char_("\t\x20-\x7e\x80-\xff");
Since you didn't have to use
char_
, you also don't have kill the attribute usingqi::omit[]
.When you are in a Qi domain expression template, raw string/char literals are implicitly wrapped in a
qi::lit
so, you can simply things likequoted_pair = omit[char_('\\')] >> char_; quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
to just
quoted_pair = '\\' >> char_; quoted_string = '"' >> *(qdtext | quoted_pair) >> '"';
instead of spelling out skipping spaces with
omit[*SP]
all the time, just declare the rule with a skipper. Now, you can simplifynvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value; any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only); content_type_rule = type_subtype_rule >> *any_parameter;
to just
nvp = token >> '=' >> value; any_parameter = ';' >> (nvp | name_only); content_type_rule = type_subtype_rule >> qi::skip(spaces)[*any_parameter];
Note that any subrule invocations of rules that are declared without a skipper are implicitly lexeme: Boost spirit skipper issues
there were many redundant/unused headers
- recent compilers + boost versions make BOOST_FUSION_ADAPT_STRUCT much simpler by using
decltype
The results of simplifying are much less noisy:
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapted.hpp>
struct parameter {
boost::optional<std::string> name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>@,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, binary_parameter(), Skipper> nvp;
qi::rule<Iterator, parameter(), Skipper> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
};
See it Live On Coliru (with the same test cases)
BONUS
I'd prefer a simpler AST in a case like this. By injecting some attribute values using qi::attr
you can avoid using boost::variant and/or even avoid boost::optional:
struct parameter {
bool have_name;
std::string name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(parameter, have_name, name, value)
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
namespace qi = boost::spirit::qi;
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>@,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = qi::attr(false) >> qi::attr("") >> token;
nvp = qi::attr(true) >> token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
};
回答2:
OK, after an heroic 24 hours of struggle (well, not really - more like reading the manual over and over again...), I've found a way that works.
I am by no means competent with boost::spirit
. If someone out there can improve on this answer, please do post it.
This spirit state machine takes the value of a header (with one, optionally parameterised, value) and turns it into a content_type
structure.
My amateur reading of the HTTP standard indicates that some headers have the form (spaces here indicate any amount of white space, values may be quoted or not:
Header-Name: tokena/tokenb [; param1 = "value" [; param2 = value]...]
whereas others have the more general form:
Header-Name: token [; param1 = "value"[; param2 = value]...] [ , token ...]
This code covers the first case - i.e. the HTTP Content-Type
header value. I will need to extend it to cater for the Accept
header (which can advertise multiple values with parameters) - that will come later.
So here's the code. Please by all means show me how to improve it!!
#define BOOST_SPIRIT_DEBUG
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_char.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <utility>
#include <vector>
#include <string>
#include <boost/variant.hpp>
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using unary_parameter = std::string;
struct binary_parameter
{
std::string name;
std::string value;
};
BOOST_FUSION_ADAPT_STRUCT(binary_parameter,
(std::string, name)
(std::string, value))
using parameter = boost::variant<unary_parameter, binary_parameter>;
struct type_subtype
{
std::string type;
std::string subtype;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype,
(std::string, type)
(std::string, subtype))
using content_type_pair = std::pair<std::string, std::string>;
struct content_type
{
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(content_type,
(type_subtype, type)
(std::vector<parameter>, params))
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using ascii::char_;
using qi::omit;
using qi::eoi;
CR = char_('\r');
LF = char_('\n');
CRLF = CR >> LF;
SP = char_(' ');
HT = char_('\t');
LWS = -CRLF >> +(SP | HT);
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
CTL = char_(0, 31) | char_(127);
QUOT = char_('"');
TEXT = (char_ - CTL) | HT;
separator = char_('(') | ')' | '<' | '>' | '@'
| ',' | ';' | ':' | '\\' | '"'
| '/' | '[' | ']' | '?' | '='
| '{' | '}' | SP | HT;
end_sequence = separator | space;
token = +(char_ - separator);
qdtext = char_ - char_('"') - '\\';
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
value = quoted_string | token ;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
BOOST_SPIRIT_DEBUG_NODES((qdtext)(quoted_pair)(quoted_string)(value)(token)(separator));
}
qi::rule<Iterator, void()> CR, LF, CRLF, SP, HT, LWS, CTL, QUOT;
qi::rule<Iterator, char()> UPALPHA, LOALPHA, ALPHA, DIGIT, TEXT, qdtext, quoted_pair;
qi::rule<Iterator, void()> separator, space, end_sequence;
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
qi::rule<Iterator, binary_parameter()> nvp;
qi::rule<Iterator, parameter()> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
};
TEST(spirit_test, test1)
{
token_grammar<std::string::const_iterator> grammar{};
std::string test = R"__test(application/json )__test";
content_type ct;
bool r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("application", ct.type.type);
EXPECT_EQ("json", ct.type.subtype);
EXPECT_EQ(0, ct.params.size());
ct = {};
test = R"__test(text/html ; charset = "ISO-8859-5")__test";
qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("text", ct.type.type);
EXPECT_EQ("html", ct.type.subtype);
ASSERT_EQ(1, ct.params.size());
ASSERT_EQ(typeid(binary_parameter), ct.params[0].type());
auto& x = boost::get<binary_parameter>(ct.params[0]);
EXPECT_EQ("charset", x.name);
EXPECT_EQ("ISO-8859-5", x.value);
}
来源:https://stackoverflow.com/questions/37840682/decode-http-header-value-fully-with-boost-spirit