Given two key-value lists, I am trying to combine the two sides by matching the keys and applying a function to the two values when the keys match. In my case I want to multiply the two values.
You can actually do all you want using one thrust::set_intersection_by_key
call.
However, some prerequisites need to be met:
First, the easy one:
You need to zip Lvalsv
and Rvalsv
into a single thrust::zip_iterator
and pass this as the values to thrust::set_intersection_by_key.
You could already run this:
// Inner join of the two sorted key lists: keeps the keys present on both
// sides and collects the left and right value of every matching key.
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_values_left(min_size);
thrust::device_vector<int> result_values_right(min_size);
// set_intersection_by_key only forwards values that run parallel to the FIRST
// key range. A zip of (Lvalsv, Rvalsv) would therefore pair Rvalsv[i] with
// Lkeysv[i] by position, which is the right partner only when a matching key
// happens to sit at the same index in both lists. Intersect once per side
// instead so that every value is selected by its own key.
// Pass 1: matching keys and their left-hand values.
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), result_keys.begin(), result_values_left.begin());
// Pass 2: roles swapped, so the right-hand values are collected; the same
// matching keys are written to result_keys again.
thrust::set_intersection_by_key(Rkeysv.begin(), Rkeysv.end(), Lkeysv.begin(), Lkeysv.end(), Rvalsv.begin(), result_keys.begin(), result_values_right.begin());
This would yield two result vectors, which you would need to multiply element-wise to get your final result.
But wait, wouldn't it be great if you could avoid having to store these two vectors as the result, then read each element again for multiplying them and then store the final result in a third vector?
Let's do that. The concept I adapted is from here.
The transform_output_iterator is an iterator that wraps another OutputIterator. When a value is written to the transform_output_iterator, a UnaryFunction is applied to that value, and the result is then written to the wrapped OutputIterator.
This allows us to pass the result from thrust::set_intersection_by_key through the Multiplier functor and then store the products in a single result_values vector.
The following code implements this idea:
#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_adaptor.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
// Calls print() with the source spelling of the argument as the label
// (#name stringizes it) and the argument itself as the container.
#define PRINTER(name) print(#name, (name))
// Writes `name`, a colon and a tab to std::cout, then every element of `v`
// followed by a tab, and ends the line with std::endl (which also flushes).
// The elements are streamed out with thrust::copy.
template <template <typename...> class V, typename T, typename... Args>
void print(const char* name, const V<T, Args...>& v)
{
    std::ostream_iterator<T> tab_separated(std::cout, "\t");

    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), tab_separated);
    std::cout << std::endl;
}
// Assignment proxy: assigning a value `x` to a Proxy writes `fun(x)` through
// the wrapped output iterator `out`.
//
// The functor and the iterator are stored BY VALUE. Holding references (as an
// earlier revision did) leaves the proxy dangling as soon as it outlives the
// objects it was built from, e.g. when it is produced from a temporary
// iterator and used after that temporary is gone.
//
// Template parameters may be const-qualified. The functor is invoked from a
// const member function, so its operator() must be callable on a const object.
template <typename OutputIterator, typename UnaryFunction>
class Proxy
{
    UnaryFunction fun;   // private copy of the transformation
    OutputIterator out;  // private copy of the destination iterator
public:
    // Copies `fun` and `out`; the arguments need not outlive the proxy.
    __host__ __device__
    Proxy(UnaryFunction& fun, OutputIterator& out) : fun(fun), out(out) {}

    // Stores fun(x) at the position `out` designates and returns a copy of
    // this proxy.
    template <typename T>
    __host__ __device__
    Proxy operator=(const T& x) const
    {
        *out = fun(x);
        return *this;
    }
};
// This iterator is a wrapper around another OutputIterator which
// applies a UnaryFunction before writing to the OutputIterator.
//
// It derives from thrust::iterator_adaptor (CRTP: the class passes itself as
// the first template argument) with OutputIterator as the adapted base. Three
// adaptor parameters are left at thrust::use_default; the last one supplied
// replaces the adaptor's `reference` type with a Proxy, which is what
// dereference() returns.
template <typename OutputIterator, typename UnaryFunction>
class transform_output_iterator : public thrust::iterator_adaptor<
transform_output_iterator<OutputIterator, UnaryFunction>
, OutputIterator
, thrust::use_default
, thrust::use_default
, thrust::use_default
, Proxy<const OutputIterator, const UnaryFunction> >
{
// the transformation applied to every value written through this iterator
UnaryFunction fun;
public:
// lets thrust reach the private dereference() below
friend class thrust::iterator_core_access;
// shorthand for the name of the iterator_adaptor we're deriving from
typedef thrust::iterator_adaptor<
transform_output_iterator<OutputIterator, UnaryFunction>,
OutputIterator, thrust::use_default, thrust::use_default, thrust::use_default, Proxy<const OutputIterator, const UnaryFunction>
> super_t;
// Wraps `out`; every value later written through this iterator is passed
// through `fun` first.
__host__ __device__
transform_output_iterator(OutputIterator out, UnaryFunction fun) : super_t(out), fun(fun)
{
}
private:
// Builds the Proxy from this iterator's functor and its current base
// iterator; assigning to that Proxy performs the transformed write.
__host__ __device__
typename super_t::reference dereference() const
{
return Proxy<const OutputIterator, const UnaryFunction>(fun, this->base_reference());
}
};
// Functor that multiplies the first two elements of a tuple.
struct Multiplier
{
    // Returns element 0 of `t` times element 1 of `t`; the result type is
    // whatever that product expression yields.
    template <typename Tuple>
    __host__ __device__
    auto operator()(Tuple t) const -> decltype(thrust::get<0>(t) * thrust::get<1>(t))
    {
        auto& lhs = thrust::get<0>(t);
        auto& rhs = thrust::get<1>(t);
        return lhs * rhs;
    }
};
template <typename OutputIterator, typename UnaryFunction>
transform_output_iterator<OutputIterator, UnaryFunction>
__host__ __device__
make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
{
return transform_output_iterator<OutputIterator, UnaryFunction>(out, fun);
}
int main()
{
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_values(min_size);
auto zipped_values = thrust::make_zip_iterator(thrust::make_tuple(Lvalsv.begin(), Rvalsv.begin()));
auto output_it = make_transform_output_iterator(result_values.begin(), Multiplier());
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), zipped_values, result_keys.begin(), output_it);
std::size_t new_size = result_pair.first - result_keys.begin();
result_keys.resize(new_size);
result_values.resize(new_size);
PRINTER(result_keys);
PRINTER(result_values);
}
output
$ nvcc -std=c++11 main.cu && ./a.out
result_keys: 1 4 5 6
result_values: 6 1 8 1
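Well, I'm not sure this is the best method (@m.s. usually comes up with better approaches than I), but one possible approach would be (method 1): run set_intersection_by_key twice, once with the left list as the first range (giving the matching keys and their left values) and once with the right list as the first range (giving the matching right values), then multiply the two resulting value vectors element-wise with thrust::transform.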
I don't know what your skill level is with thrust but I can provide a trivial worked example of the above if desired.
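Another possible approach (method 2): merge the two lists with merge_by_key, run thrust::transform over each item and its right-hand neighbour to multiply the values where the two keys are equal and mark those positions, then use remove_if to drop the unmarked items.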
My sense is the second method might be faster, but I haven't carefully thought through it. In any event, it's better to benchmark test cases than to work off of (my) intuition.
Based on a comment below, here is a description of what is happening starting with the 2nd step of method 2, using your example dataset:
The output of step 1 (the merge_by_key operation) would look something like this:
keys: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
Let's construct two versions, the first being "the item" and the second being "the next item to the right":
keys1: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6 };
values1: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1 };
keys2: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values2: { 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
The actual "construction" is trivial. keys1 is just [keys.begin(), keys.end()-1), and keys2 is just [keys.begin()+1, keys.end()). And likewise for values1 and values2.
We'll zip keys1 and values1 together and we'll zip keys2 and values2 together. Then we'll pass these two zipped entities to a transform operation that has a special functor that will do the following:
If keys1 == keys2, do the desired math operation on the values1 and values2 quantities, and place a one in the marker array. If not, place a 0 in a marker array. The output of this operation would be:
keys: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 6, 4, 1, 1, 1, 8, 4, 1, 1, 2 };
marker: { 1, 0, 0, 1, 0, 1, 0, 1, 0, 0 };
Now zip the 3 vectors above together, and pass that to remove_if. The remove_if functor would indicate removal of any items for which marker == 0, leaving:
keys: { 1, 4, 5, 6 };
values: { 6, 1, 8, 1 };
marker: { 1, 1, 1, 1 };
Here is a fully worked example demonstrating both methods:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
#include <assert.h>
// Binary functor used with thrust::transform over two zipped ranges.
// Element 0 of both tuples is compared; when equal, the product of the two
// element-1 values is returned and element 2 of z1 (the marker slot) is set
// to 1. Otherwise 0 is returned and the marker slot is left untouched.
struct mark_mpy_func
{
    template <typename T1, typename T2>
    __host__ __device__
    int operator()(T1 &z1, T2 &z2){
        // keys differ: nothing to multiply, nothing to mark
        if (!(thrust::get<0>(z1) == thrust::get<0>(z2))) return 0;

        const int product = thrust::get<1>(z1) * thrust::get<1>(z2);
        thrust::get<2>(z1) = 1;
        return product;
    }
};
// Predicate on a marker value: true exactly when the marker is 0.
struct mtest_func
{
    __host__ __device__
    bool operator()(int t){
        return t == 0;
    }
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 1
thrust::device_vector<int> Lkeysvo(Lsize);
thrust::device_vector<int> Lvalsvo(Lsize);
thrust::device_vector<int> Rkeysvo(Rsize);
thrust::device_vector<int> Rvalsvo(Rsize);
size_t Lsizeo = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Lkeysvo.begin(), Lvalsvo.begin()).first - Lkeysvo.begin();
size_t Rsizeo = thrust::set_intersection_by_key(Rkeysv.begin(), Rkeysv.end(), Lkeysv.begin(), Lkeysv.end(), Rvalsv.begin(), Rkeysvo.begin(), Rvalsvo.begin()).first - Rkeysvo.begin();
assert(Lsizeo == Rsizeo);
thrust::device_vector<int> res1(Lsizeo);
thrust::transform(Lvalsvo.begin(), Lvalsvo.begin()+Lsizeo, Rvalsvo.begin(), res1.begin(), thrust::multiplies<int>());
std::cout << "Method 1 result:" << std::endl << "keys: ";
thrust::copy_n(Lkeysvo.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res1.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> marker(Lsize + Rsize - 1);
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin(), marker.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1, marker.end())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), marker.begin(), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 1 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$
If it is acceptable to use a marker value (say, -1) in the output data to inform the remove_if operation, then the separate marker array can be dispensed with. Here's a modified version of method 2 that works this way:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
// Sentinel written to the output in place of a product; entries equal to it
// are removed afterwards. Parenthesised so the negative literal always
// expands as a single operand.
#define MARK_VAL (-1)
// Binary functor used with thrust::transform over two zipped (key, value)
// ranges: returns the product of the two values when the keys are equal and
// MARK_VAL otherwise.
struct mark_mpy_func
{
    template <typename T1, typename T2>
    __host__ __device__
    int operator()(T1 &z1, T2 &z2){
        if (thrust::get<0>(z1) == thrust::get<0>(z2)){
            const int product = thrust::get<1>(z1) * thrust::get<1>(z2);
            return product;
        }
        return MARK_VAL;
    }
};
// Predicate on a tuple: true exactly when element 1 equals MARK_VAL.
struct mtest_func
{
    template <typename T>
    __host__ __device__
    bool operator()(T t){
        return thrust::get<1>(t) == MARK_VAL;
    }
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1)), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$
I think two set intersections are required, as suggested in the first answer. The other solutions won't work in general; it is only a coincidence of the input data that they produce the correct result. For example, if the second (key, value) pair is removed from the left set, the computed result changes, although it shouldn't. Here is the code:
$ cat inner_join.cu
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <iostream>
int main()
{
int _Lkeys[] = {1, 4, 5, 6};
int _Lvals[] = {3, 1, 2, 1};
int _Rkeys[] = {1, 3, 4, 5, 6, 7};
int _Rvals[] = {2, 1, 1, 4, 1, 2};
size_t Lsize = sizeof(_Lkeys) / sizeof(int);
size_t Rsize = sizeof(_Rkeys) / sizeof(int);
thrust::device_vector<int> Lkeys(_Lkeys, _Lkeys + Lsize);
thrust::device_vector<int> Lvals(_Lvals, _Lvals + Lsize);
thrust::device_vector<int> Rkeys(_Rkeys, _Rkeys + Rsize);
thrust::device_vector<int> Rvals(_Rvals, _Rvals + Rsize);
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_Rvals(min_size);
thrust::device_vector<int> result_Lvals(min_size);
// set intersection keys, and left set values
size_t intersection_size =
thrust::set_intersection_by_key(Lkeys.begin(), Lkeys.end(), Rkeys.begin(),
Rkeys.end(), Lvals.begin(),
result_keys.begin(), result_Lvals.begin())
.first -
result_keys.begin();
// set intersection keys, and right set values
thrust::set_intersection_by_key(Rkeys.begin(), Rkeys.end(), Lkeys.begin(),
Lkeys.end(), Rvals.begin(),
result_keys.begin(), result_Rvals.begin());
result_Lvals.resize(intersection_size);
result_keys.resize(intersection_size);
thrust::device_vector<int> result_values(intersection_size);
// join left and right intersection values
thrust::transform(result_Lvals.begin(), result_Lvals.end(),
result_Rvals.begin(), result_values.begin(),
thrust::multiplies<int>());
std::cout << "keys: ";
thrust::copy_n(result_keys.begin(), intersection_size,
std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(result_values.begin(), intersection_size,
std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
}
output
$ nvcc inner_join.cu -run
keys: 1,4,5,6,
vals: 6,1,8,1,