C++ algorithm like python's 'groupby'

后端 未结 6 601
再見小時候
再見小時候 2020-12-15 22:20

Are there any C++ transformations which are similar to itertools.groupby()?

Of course I could easily write my own, but I\'d prefer to leverage the idiomatic behavior

相关标签:
6条回答
  • 2020-12-15 22:43

    How about this?

    template <typename StructType, typename FieldSelectorUnaryFn>
    auto GroupBy(const std::vector<StructType>& instances, const FieldSelectorUnaryFn& fieldChooser)
    {
        StructType _;
        using FieldType = decltype(fieldChooser(_));
        std::map<FieldType, std::vector<StructType>> instancesByField;
        for (auto& instance : instances)
        {
            instancesByField[fieldChooser(instance)].push_back(instance);
        }
        return instancesByField;
    }
    

    and use it like this:

    auto itemsByX = GroupBy(items, [](const auto& item){ return item.x; });
    
    0 讨论(0)
  • 2020-12-15 22:51

    I wrote a C++ library to address this problem in an elegant way. Given your struct

    struct foo
    {
        int x;
        std::string y;
        float z;
    };
    

    To group by y you simply do:

    std::vector<foo> dataframe;
    ...
    auto groups = group_by(dataframe, &foo::y);
    

    You can also group by more than one variable:

    auto groups = group_by(dataframe, &foo::y, &foo::x);
    

    And then iterate through the groups normally:

    for(auto& [key, group]: groups)
    {
        // do something
    }
    

    It also has other operations such as: subset, concat, and others.

    0 讨论(0)
  • 2020-12-15 22:54

    This doesn't really answer your question, but for the fun of it, I implemented a group_by iterator. Maybe someone will find it useful:

    #include <assert.h>
    #include <iostream>
    #include <set>
    #include <sstream>
    #include <string>
    #include <vector>
    
    using std::cout;
    using std::cerr;
    using std::multiset;
    using std::ostringstream;
    using std::pair;
    using std::vector;
    
    struct Foo
    {
      int x;
      std::string y;
      float z;
    };
    
    struct FooX {
      typedef int value_type;
      value_type operator()(const Foo &f) const { return f.x; }
    };
    
    
    
    template <typename Iterator,typename KeyFunc>
    struct GroupBy {
      typedef typename KeyFunc::value_type KeyValue;
    
      struct Range {
        Range(Iterator begin,Iterator end)
        : iter_pair(begin,end)
        {
        }
    
        Iterator begin() const { return iter_pair.first; }
        Iterator end() const { return iter_pair.second; }
    
        private:
          pair<Iterator,Iterator> iter_pair;
      };
    
      struct Group {
        KeyValue value;
        Range range;
    
        Group(KeyValue value,Range range)
        : value(value), range(range)
        {
        }
      };
    
    
      struct GroupIterator {
        typedef Group value_type;
    
        GroupIterator(Iterator iter,Iterator end,KeyFunc key_func)
        : range_begin(iter), range_end(iter), end(end), key_func(key_func)
        {
          advance_range_end();
        }
    
        bool operator==(const GroupIterator &that) const
        {
          return range_begin==that.range_begin;
        }
    
        bool operator!=(const GroupIterator &that) const
        {
          return !(*this==that);
        }
    
        GroupIterator operator++()
        {
          range_begin = range_end;
          advance_range_end();
          return *this;
        }
    
        value_type operator*() const
        {
          return value_type(key_func(*range_begin),Range(range_begin,range_end));
        }
    
    
        private:
          void advance_range_end()
          {
            if (range_end!=end) {
              typename KeyFunc::value_type value = key_func(*range_end++);
              while (range_end!=end && key_func(*range_end)==value) {
                ++range_end;
              }
            }
          }
    
          Iterator range_begin;
          Iterator range_end;
          Iterator end;
          KeyFunc key_func;
      };
    
      GroupBy(Iterator begin_iter,Iterator end_iter,KeyFunc key_func)
      : begin_iter(begin_iter),
        end_iter(end_iter),
        key_func(key_func)
      {
      }
    
      GroupIterator begin() { return GroupIterator(begin_iter,end_iter,key_func); }
    
      GroupIterator end() { return GroupIterator(end_iter,end_iter,key_func); }
    
      private:
        Iterator begin_iter;
        Iterator end_iter;
        KeyFunc key_func;
    };
    
    
    template <typename Iterator,typename KeyFunc>
    inline GroupBy<Iterator,KeyFunc>
      group_by(
        Iterator begin,
        Iterator end,
        const KeyFunc &key_func = KeyFunc()
      )
    {
      return GroupBy<Iterator,KeyFunc>(begin,end,key_func);
    }
    
    
    static void test()
    {
      vector<Foo> foos;
      foos.push_back({5,"bill",2.1});
      foos.push_back({5,"rick",3.7});
      foos.push_back({3,"tom",2.5});
      foos.push_back({7,"joe",3.4});
      foos.push_back({5,"bob",7.2});
    
      ostringstream out;
    
      for (auto group : group_by(foos.begin(),foos.end(),FooX())) {
        out << group.value << ":";
        for (auto elem : group.range) {
          out << " " << elem.y;
        }
        out << "\n";
      }
    
      assert(out.str()==
        "5: bill rick\n"
        "3: tom\n"
        "7: joe\n"
        "5: bob\n"
      );
    }
    
    int main(int argc,char **argv)
    {
      test();
      return 0;
    }
    
    0 讨论(0)
  • 2020-12-15 22:54

    Eric Niebler's ranges library provides a group_by view.

    according to the docs it is a header only library and can be included easily.

    It's supposed to go into the standard C++ space, but can be used with a recent C++11 compiler.

    minimal working example:

    #include <map>
    #include <vector>
    #include <range/v3/all.hpp>
    using namespace std;
    using namespace ranges;
    
    int main(int argc, char **argv) {
        vector<int> l { 0,1,2,3,6,5,4,7,8,9 };
        ranges::v3::sort(l);
        auto x = l | view::group_by([](int x, int y) { return x / 5 == y / 5; });
        map<int, vector<int>> res;
    
        auto i = x.begin();
        auto e = x.end();
        for (;i != e; ++i) {
          auto first = *((*i).begin());
          res[first / 5] = to_vector(*i);
        }
    
        // res = { 0 : [0,1,2,3,4], 1: [5,6,7,8,9] }
    }
    

    (I compiled this with clang 3.9.0. and --std=c++11)

    0 讨论(0)
  • 2020-12-15 22:58

    I recently discovered cppitertools.

    It fulfills this need exactly as described.

    https://github.com/ryanhaining/cppitertools#groupby

    0 讨论(0)
  • 2020-12-15 23:07

    What is the point of bloating standard C++ library with an algorithm that is one line of code?

    for (const auto & foo : foos) foos_by_x[foo.x].push_back(foo);
    

    Also, take a look at std::multimap, it might be just what you need.

    UPDATE:

    The one-liner I have provided is not well-optimized for the case when your vector is already sorted. A number of map lookups can be reduced if we remember the iterator of previously inserted object, so it the "key" of the next object and do a lookup only when the key is changing. For example:

    #include <map>
    #include <vector>
    #include <string>
    #include <algorithm>
    #include <iostream>
    
    struct foo {
        int         x;
        std::string y;
        float       z;
    };
    
    class optimized_inserter {
      public:
        typedef std::map<int, std::vector<foo> > map_type;
    
        optimized_inserter(map_type & map) : map(&map), it(map.end()) {}
    
        void operator()(const foo & obj) {
            typedef map_type::value_type value_type;
            if (it != map->end() && last_x == obj.x) {
                it->second.push_back(obj);
                return;
            }
            last_x = obj.x;
            it = map->insert(value_type(obj.x, std::vector<foo>({ obj }))).first;
        }
    
      private:
        map_type          *map;
        map_type::iterator it;
        int                last_x;
    };
    
    int main()
    {
        std::vector<foo> foos;
        std::map<int, std::vector<foo>> foos_by_x;
    
        foos.push_back({ 1, "one", 1.0 });
        foos.push_back({ 3, "third", 2.5 });
        foos.push_back({ 1, "one.. but third", 1.5 });
        foos.push_back({ 2, "second", 1.8 });
        foos.push_back({ 1, "one.. but second", 1.5 });
    
        std::sort(foos.begin(), foos.end(), [](const foo & lhs, const foo & rhs) {
                return lhs.x < rhs.x;
            });
    
        std::for_each(foos.begin(), foos.end(), optimized_inserter(foos_by_x));
    
        for (const auto & p : foos_by_x) {
            std::cout << "--- " << p.first << "---\n";
            for (auto & f : p.second) {
                std::cout << '\t' << f.x << " '" << f.y << "' / " << f.z << '\n';
            }
        }
    }
    
    0 讨论(0)
提交回复
热议问题