CUDA Thrust and sort_by_key

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and returns an array of keys B for that array A. I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged. What can I do?

My code is:

void sortCUDA(double V[], int P[], int N) {

        real_t *Vcpy = (double*) malloc(N*sizeof(double));
        memcpy(Vcpy,V,N*sizeof(double));

        thrust::sort_by_key(V, V + N, P);
        free(Vcpy);
}

i'm comparing the thrust algorithm against others that i have on sequencial cpu

N               mergesort       sortCUDA
113             0.000008        0.000010
226             0.000018        0.000016
452             0.000036        0.000020
905             0.000061        0.000034
1810            0.000135        0.000071
3621            0.000297        0.000156
7242            0.000917        0.000338
14484           0.001421        0.000853
28968           0.003069        0.001931
57937           0.006666        0.003939
115874          0.014435        0.008025
231749          0.031059        0.016718
463499          0.067407        0.039848
926999          0.148170        0.118003
1853998         0.329005        0.260837
3707996         0.731768        0.544357
7415992         1.638445        1.073755
14831984        3.668039        2.150179
115035495       39.276560       19.812200
230070990       87.750377       39.762915
460141980       200.940501      74.605219

Thrust performance is not bad, but I think if I use OMP can probably get easily a better CPU time

I think this is because to memcpy

SOLUTION:

void thrustSort(double V[], int P[], int N)
{
        thrust::device_vector<int> d_P(N);
        thrust::device_vector<double> d_V(V, V + N);
        thrust::sequence(d_P.begin(), d_P.end());

        thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());

        thrust::copy(d_P.begin(),d_P.end(),P);
}

where V is a my double values to sort

You can modify comparison operator to sort keys instead of values. @Robert Crovella correctly pointed that a raw device pointer cannot be assigned from the host. The modified algorithm is below:

struct cmp : public binary_function<int,int,bool>
{
  cmp(const double *ptr) : rawA(ptr) { }

  __host__ __device__ bool operator()(const int i, const int j) const 
  {return rawA[i] > rawA[j];}

   const double *rawA; // an array in global mem
}; 

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
  double *rawA = thrust::raw_pointer_cast(devA.data());

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp(rawA));
  // B now contains the sorted keys
 }

And here is alternative with arrayfire. Though I am not sure which one is more efficient since arrayfire solution uses two additional arrays:

void sortkeys(double *A, int n) {
   af::array devA(n, A, af::afHost);
   af::array vals, indices;
   // sort and populate vals/indices arrays
   af::sort(vals, indices, devA);
   std::cout << devA << "\n" << indices << "\n";
}

How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.

Building on the answer provided by @asm (I wasn't able to get it working), this code seemed to work for me, and does sort only the keys. However, I believe it is limited to the case where the keys are in sequence 0, 1, 2, 3, 4 ... corresponding to the (double) values. Since this is a "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However I'm not sure the process of generating the index sequence and then rearranging the original keys will be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

using namespace std;

__device__  double *rawA; // an array in global mem

struct cmp : public binary_function<int, int, bool>
{
  __host__ __device__  bool operator()(const int i, const int j) const
  {return ( rawA[i] < rawA[j]);}
};

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
//  rawA = thrust::raw_pointer_cast(&(devA[0]));
  double *test = raw_pointer_cast(devA.data());
  cudaMemcpyToSymbol(rawA, &test, sizeof(double *));

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp());
  // B now contains the sorted keys
  thrust::host_vector<int> hostB = B;
  for (int i=0; i<hostB.size(); i++)
    std::cout << hostB[i] << " ";
  std::cout<<std::endl;
  for (int i=0; i<hostB.size(); i++)
    std::cout << A[hostB[i]] << " ";
  std::cout<<std::endl;
 }


int main(){

  double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
  sortkeys(C, 9);
  std::cout << std::endl;
  return 0;
}

来源：https://stackoverflow.com/questions/13515308/cuda-thrust-and-sort-by-key

标签

sorting

cuda

permutation

thrust