Are there any C++ transformations which are similar to itertools.groupby()?
Of course I could easily write my own, but I\'d prefer to leverage the idiomatic behavior
How about this?
template <typename StructType, typename FieldSelectorUnaryFn>
auto GroupBy(const std::vector<StructType>& instances, const FieldSelectorUnaryFn& fieldChooser)
{
StructType _;
using FieldType = decltype(fieldChooser(_));
std::map<FieldType, std::vector<StructType>> instancesByField;
for (auto& instance : instances)
{
instancesByField[fieldChooser(instance)].push_back(instance);
}
return instancesByField;
}
and use it like this:
auto itemsByX = GroupBy(items, [](const auto& item){ return item.x; });
I wrote a C++ library to address this problem in an elegant way. Given your struct
struct foo
{
int x;
std::string y;
float z;
};
To group by y
you simply do:
std::vector<foo> dataframe;
...
auto groups = group_by(dataframe, &foo::y);
You can also group by more than one variable:
auto groups = group_by(dataframe, &foo::y, &foo::x);
And then iterate through the groups normally:
for(auto& [key, group]: groups)
{
// do something
}
It also has other operations such as: subset, concat, and others.
This doesn't really answer your question, but for the fun of it, I implemented a group_by iterator. Maybe someone will find it useful:
#include <assert.h>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using std::cout;
using std::cerr;
using std::multiset;
using std::ostringstream;
using std::pair;
using std::vector;
struct Foo
{
int x;
std::string y;
float z;
};
struct FooX {
typedef int value_type;
value_type operator()(const Foo &f) const { return f.x; }
};
template <typename Iterator,typename KeyFunc>
struct GroupBy {
typedef typename KeyFunc::value_type KeyValue;
struct Range {
Range(Iterator begin,Iterator end)
: iter_pair(begin,end)
{
}
Iterator begin() const { return iter_pair.first; }
Iterator end() const { return iter_pair.second; }
private:
pair<Iterator,Iterator> iter_pair;
};
struct Group {
KeyValue value;
Range range;
Group(KeyValue value,Range range)
: value(value), range(range)
{
}
};
struct GroupIterator {
typedef Group value_type;
GroupIterator(Iterator iter,Iterator end,KeyFunc key_func)
: range_begin(iter), range_end(iter), end(end), key_func(key_func)
{
advance_range_end();
}
bool operator==(const GroupIterator &that) const
{
return range_begin==that.range_begin;
}
bool operator!=(const GroupIterator &that) const
{
return !(*this==that);
}
GroupIterator operator++()
{
range_begin = range_end;
advance_range_end();
return *this;
}
value_type operator*() const
{
return value_type(key_func(*range_begin),Range(range_begin,range_end));
}
private:
void advance_range_end()
{
if (range_end!=end) {
typename KeyFunc::value_type value = key_func(*range_end++);
while (range_end!=end && key_func(*range_end)==value) {
++range_end;
}
}
}
Iterator range_begin;
Iterator range_end;
Iterator end;
KeyFunc key_func;
};
GroupBy(Iterator begin_iter,Iterator end_iter,KeyFunc key_func)
: begin_iter(begin_iter),
end_iter(end_iter),
key_func(key_func)
{
}
GroupIterator begin() { return GroupIterator(begin_iter,end_iter,key_func); }
GroupIterator end() { return GroupIterator(end_iter,end_iter,key_func); }
private:
Iterator begin_iter;
Iterator end_iter;
KeyFunc key_func;
};
template <typename Iterator,typename KeyFunc>
inline GroupBy<Iterator,KeyFunc>
group_by(
Iterator begin,
Iterator end,
const KeyFunc &key_func = KeyFunc()
)
{
return GroupBy<Iterator,KeyFunc>(begin,end,key_func);
}
static void test()
{
vector<Foo> foos;
foos.push_back({5,"bill",2.1});
foos.push_back({5,"rick",3.7});
foos.push_back({3,"tom",2.5});
foos.push_back({7,"joe",3.4});
foos.push_back({5,"bob",7.2});
ostringstream out;
for (auto group : group_by(foos.begin(),foos.end(),FooX())) {
out << group.value << ":";
for (auto elem : group.range) {
out << " " << elem.y;
}
out << "\n";
}
assert(out.str()==
"5: bill rick\n"
"3: tom\n"
"7: joe\n"
"5: bob\n"
);
}
int main(int argc,char **argv)
{
test();
return 0;
}
Eric Niebler's ranges library provides a group_by view.
according to the docs it is a header only library and can be included easily.
It's supposed to go into the standard C++ space, but can be used with a recent C++11 compiler.
minimal working example:
#include <map>
#include <vector>
#include <range/v3/all.hpp>
using namespace std;
using namespace ranges;
int main(int argc, char **argv) {
vector<int> l { 0,1,2,3,6,5,4,7,8,9 };
ranges::v3::sort(l);
auto x = l | view::group_by([](int x, int y) { return x / 5 == y / 5; });
map<int, vector<int>> res;
auto i = x.begin();
auto e = x.end();
for (;i != e; ++i) {
auto first = *((*i).begin());
res[first / 5] = to_vector(*i);
}
// res = { 0 : [0,1,2,3,4], 1: [5,6,7,8,9] }
}
(I compiled this with clang 3.9.0. and --std=c++11
)
I recently discovered cppitertools.
It fulfills this need exactly as described.
https://github.com/ryanhaining/cppitertools#groupby
What is the point of bloating standard C++ library with an algorithm that is one line of code?
for (const auto & foo : foos) foos_by_x[foo.x].push_back(foo);
Also, take a look at std::multimap
, it might be just what you need.
UPDATE:
The one-liner I have provided is not well-optimized for the case when your vector is already sorted. A number of map lookups can be reduced if we remember the iterator of previously inserted object, so it the "key" of the next object and do a lookup only when the key is changing. For example:
#include <map>
#include <vector>
#include <string>
#include <algorithm>
#include <iostream>
struct foo {
int x;
std::string y;
float z;
};
class optimized_inserter {
public:
typedef std::map<int, std::vector<foo> > map_type;
optimized_inserter(map_type & map) : map(&map), it(map.end()) {}
void operator()(const foo & obj) {
typedef map_type::value_type value_type;
if (it != map->end() && last_x == obj.x) {
it->second.push_back(obj);
return;
}
last_x = obj.x;
it = map->insert(value_type(obj.x, std::vector<foo>({ obj }))).first;
}
private:
map_type *map;
map_type::iterator it;
int last_x;
};
int main()
{
std::vector<foo> foos;
std::map<int, std::vector<foo>> foos_by_x;
foos.push_back({ 1, "one", 1.0 });
foos.push_back({ 3, "third", 2.5 });
foos.push_back({ 1, "one.. but third", 1.5 });
foos.push_back({ 2, "second", 1.8 });
foos.push_back({ 1, "one.. but second", 1.5 });
std::sort(foos.begin(), foos.end(), [](const foo & lhs, const foo & rhs) {
return lhs.x < rhs.x;
});
std::for_each(foos.begin(), foos.end(), optimized_inserter(foos_by_x));
for (const auto & p : foos_by_x) {
std::cout << "--- " << p.first << "---\n";
for (auto & f : p.second) {
std::cout << '\t' << f.x << " '" << f.y << "' / " << f.z << '\n';
}
}
}