How to incrementally parse (and act on) a large file with Boost.Spirit.Qi?

十年热恋 提交于 2019-12-01 06:48:42
sehe

Just use streaming iterators.

Or operate on a memory mapped file.

On the processing side, push actions onto a queue from inside a semantic action.

Note: you could run into a suspected bug where the backtrack buffers are not cleared properly; you might want to check for this and take preventative measures as described in this answer: Boost spirit memory leak using flush_multi_pass

Live Demo

#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/io.hpp>

namespace model
{
    namespace qi = boost::spirit::qi;
    namespace px = boost::phoenix;

    /// One `SPEC` line of a cluster record.
    ///
    /// The numeric members carry default initializers so that a
    /// default-constructed spectrum never holds indeterminate values.
    struct spectrum {
        std::string comment;                   ///< free text between '#' and "File:"
        std::string file;                      ///< quoted value following "File:"
        std::string nativeId;                  ///< quoted value following "NativeID:"
        double      precursorMz        = 0.0;
        int         precursorCharge    = 0;
        double      precursorIntensity = 0.0;
    };

    /// One `=Cluster=` record: its identifier plus the spectra listed under it.
    struct cluster {
        /// Text that follows `id=` on the record's second line.
        std::string id;

        /// The `SPEC` lines belonging to this cluster, in file order.
        std::vector<spectrum> spectra;
    };
}

// Expose the model structs as Fusion sequences so that Qi rules can use them as
// attributes. The members are listed in declaration order; Qi fills them
// positionally, so this order has to line up with the attributes synthesized by
// the corresponding grammar rules.
BOOST_FUSION_ADAPT_STRUCT(model::spectrum, comment, file, nativeId, precursorMz, precursorCharge, precursorIntensity)
BOOST_FUSION_ADAPT_STRUCT(model::cluster, id, spectra)

namespace model
{
    /// Qi grammar for a clustering file: a `name=` header line, an empty line,
    /// and then one or more `=Cluster=` records separated by end-of-line.
    ///
    /// Each cluster is handed to the handler supplied at construction, together
    /// with the file-level name, from a semantic action that runs as soon as that
    /// cluster has been matched. This lets the caller act on clusters
    /// incrementally instead of collecting the whole file first.
    ///
    /// The rules are chained with the expectation operator `>`, so malformed
    /// input is reported by a thrown qi::expectation_failure<Iterator> rather
    /// than by a `false` parse result. Clusters already submitted before the
    /// failure are not retracted.
    ///
    /// @tparam Iterator  input iterator type the grammar is instantiated for
    template <typename Iterator>
    struct cluster_parser : qi::grammar<Iterator>
    {
        /// @param handler  callback receiving (name, cluster) for every parsed cluster
        cluster_parser(std::function<void(std::string const&, model::cluster const&)> handler) 
            :   cluster_parser::base_type(start),
                submit_(handler)
        {
            using namespace qi;

            // lexeme[] suspends the blank skipper so spaces inside the quotes are kept.
            quoted_string %= lexeme['"' > +(char_ - '"') > '"'];

            // The boolean column has no counterpart in model::spectrum, so it is
            // matched but omitted. Without omit[] the sequence would synthesize
            // seven attributes for a six-member struct and, because members are
            // filled positionally, every numeric field would receive the value
            // of the column to its left.
            spectrum_start %=
                lit("SPEC") >
                "#" > +(char_ - "File:") >
                "File:" > quoted_string > lit(",") >
                "NativeID:" > quoted_string >
                omit[bool_] > double_ > int_ > double_;

            cluster_start %= 
                "=Cluster=" > eol >
                "id=" > +(char_ - eol) > eol >
                spectrum_start % eol;

            // Remember the file-level name in the rule's local (_a), then submit
            // each cluster together with that name as soon as it has been parsed.
            clusters %= 
                "name=" > qi::as_string[ +(char_ - eol) ][ name_ = _1 ] > eol > eol >
                cluster_start [ submit_(name_, _1) ] % eol;

            // The grammar itself is skipper-less; blanks (not newlines) are
            // skipped inside, because end-of-line is significant in this format.
            start = skip(blank) [clusters];

            BOOST_SPIRIT_DEBUG_NODES((start)(clusters)(cluster_start)(quoted_string)(spectrum_start))
        }
      private:
        qi::_a_type name_;  // placeholder for the first local of `clusters`
        px::function<std::function<void(std::string const&, model::cluster const&)> > submit_;  // lazy wrapper around the handler

        qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
        qi::rule<Iterator, cluster(), qi::blank_type> cluster_start;
        qi::rule<Iterator, spectrum(), qi::blank_type> spectrum_start;
        qi::rule<Iterator, qi::locals<std::string>, qi::blank_type> clusters;
        qi::rule<Iterator> start;
    };
}

int main()
{
    using namespace model;

    cluster_parser<boost::spirit::istream_iterator> g([&](auto const&...){std::cout << "handled\n";}); // Our grammar
    std::string str;
    //std::ifstream input("c:/test/Mo_tai.clustering");

    std::istringstream input(R"(name=GreedyClustering_0.99

=Cluster=
id=9c8c5830-5841-4f77-b819-64180509615b
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f4.mgf#id=index=219#title=Mo_Tai_iTRAQ_f4.1254.1254.2 File:"Mo_Tai_iTRAQ_f4.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1254"   true        300.1374    2           0.0
=Cluster=
id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=560#title=Mo_Tai_iTRAQ_f9.1666.1666.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1666"   true        300.14413   3           0.0
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=520#title=Mo_Tai_iTRAQ_f9.1621.1621.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1621"   true        300.14197   3           0.0
=Cluster=
id=b84b79e1-44bc-44c0-a9af-5391ca02582d
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f2.mgf#id=index=7171#title=Mo_Tai_iTRAQ_f2.12729.12729.2 File:"Mo_Tai_iTRAQ_f2.raw", NativeID:"controllerType=0 controllerNumber=1 scan=12729"   true        300.15695   2           0.0)");
    input.unsetf(std::ios::skipws);
    boost::spirit::istream_iterator begin(input);
    boost::spirit::istream_iterator end;

    bool r = phrase_parse(begin, end, g, qi::blank);

    if (r && begin == end) {
        std::cout << "Parsing succeeded\n";
    }
    else {
        std::cout << "Parsing failed\n";
    }

    if (begin!=end) {
        std::cout << "Unparsed remaining input: '" << std::string(begin, end) << "\n";
    }

    return (r && begin==end)? 0 : 1;
}

Prints

handled
handled
handled
Parsing succeeded

BONUS: Threaded workers

Here's a version that dispatches the clusters for asynchronous processing on a thread pool.

Note that the submit method posts a lambda to the service. The lambda captures its arguments by value because they must stay alive until the asynchronous processing has finished, which may be long after submit has returned.

Live On Coliru

#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;

/// Small worker pool that runs cluster-processing jobs on a Boost.Asio io_service.
///
/// The constructor starts the worker threads, each of which runs the service.
/// The destructor drops the `work` object that keeps the service alive and then
/// joins all workers, so destruction waits for the workers to finish.
struct Processing {
    Processing() {
        // hardware_concurrency() may report 0 when the value is not available.
        // Always start at least one worker, otherwise posted jobs would never run.
        unsigned const detected = boost::thread::hardware_concurrency();
        unsigned const workers  = detected ? detected : 1u;

        for (unsigned i = 0; i < workers; ++i)
            _threads.create_thread([this] { _svc.run(); });
    }

    ~Processing() {
        _work.reset();
        _threads.join_all();
    }

    /// Queue one cluster for processing on a worker thread.
    ///
    /// @param name     file-level name the cluster belongs to
    /// @param cluster  the parsed cluster
    ///
    /// Both arguments are copied into the posted job because the job runs
    /// after this call has returned.
    void submit(std::string const& name, model::cluster const& cluster) {
        _svc.post([this, name, cluster] { do_processing(name, cluster); });
    }

  private:
    /// Stand-in for real work: reports the cluster size, then sleeps for 950 ms.
    void do_processing(std::string const& name, model::cluster const& cluster) {
        std::cout << "Thread " << boost::this_thread::get_id() << ": " << name << " cluster of " << cluster.spectra.size() << " spectra\n";
        boost::this_thread::sleep_for(boost::chrono::milliseconds(950));
    }

    ba::io_service _svc;
    boost::optional<ba::io_service::work> _work = ba::io_service::work(_svc);
    boost::thread_group _threads;
};

[...snip...] and in main:

// Declared before the grammar, and captured by reference in the handler below.
Processing processing;
// Forwards every (name, cluster) pair reported by the grammar to the worker pool.
auto handler = [&processing](auto&... args) { processing.submit(args...); };

cluster_parser<boost::spirit::istream_iterator> g(handler); // Our grammar

The rest is unmodified, and now it prints (e.g.):

Thread 7f0144a5b700: GreedyClustering_0.99 cluster of 1 spectra
Thread 7f014425a700: GreedyClustering_0.99 cluster of 2 spectra
Parsing succeeded
Thread 7f0143a59700: GreedyClustering_0.99 cluster of 1 spectra
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!