Sorting (small) arrays by key in CUDA


Question


I'm trying to write a function that takes a block of unsorted key/value pairs such as

<7, 4>
<2, 8>
<3, 1>
<2, 2>
<1, 5>
<7, 1>
<3, 8>
<7, 2>

and sorts them by key while reducing the values of pairs with the same key:

<1, 5>
<2, 10>
<3, 9>
<7, 7>

Currently, I'm using a __device__ function like the one below. It is essentially a bitonic sort that combines the values of pairs sharing the same key and overwrites the stale pairs with a sentinel key (effectively infinity; I'm just using 99 for now), so that a subsequent bitonic pass sifts them to the bottom and the array can then be truncated by the count accumulated in int *removed.

__device__ void interBitonicSortReduce(int2 *sdata, int tid, int recordNum, int *removed) {
  int n = MIN(DEFAULT_DIMBLOCK, recordNum);
  for (int k = 2; k <= n; k *= 2) {
    for (int j = k / 2; j > 0; j /= 2) {
      int ixj = tid ^ j;
      if (ixj > tid) {
        // --- Fold the partner's value into this pair when the keys match and
        //     mark the partner with the sentinel so later passes sink it
        if (sdata[tid].x == sdata[ixj].x && sdata[tid].x < 99) {
          atomicAdd(&sdata[tid].y, sdata[ixj].y);
          sdata[ixj].x = 99;
          sdata[ixj].y = 99;
          atomicAdd(removed, 1);
        }
        // --- Standard bitonic compare-and-swap step
        if ((tid & k) == 0 && sdata[tid].x > sdata[ixj].x)
          swapData2(sdata[tid], sdata[ixj]);
        if ((tid & k) != 0 && sdata[tid].x < sdata[ixj].x)
          swapData2(sdata[tid], sdata[ixj]);
      }
      // --- Barrier moved out of the divergent branch: __syncthreads() must be
      //     reached by every thread in the block
      __syncthreads();
    }
  }
}

This works just fine for small sets of data but with larger sets (though still within the size of a single block) a single call just won't do it.

Is it wise to try to combine the sorting and the reduction in the same function? Obviously the function would need to be called more than once, but is it possible to determine, based on the size of the data, exactly how many calls are needed to exhaust it?

Or should I perform the reduction separately, with something like this:

__device__ int interReduce(int2 *sdata, int tid) {
  // --- Assumes sdata is already sorted by key: walk backwards to find the
  //     first element (head) of this thread's run of equal keys
  int index = tid;
  while (sdata[index].x == sdata[tid].x) {
    index--;
    if (index < 0)
      break;
  }
  // --- Non-head threads fold their value into the head of the run and
  //     mark their own slot with the sentinel
  if (index + 1 != tid) {
    atomicAdd(&sdata[index + 1].y, sdata[tid].y);
    sdata[tid].x = 99;
    sdata[tid].y = 99;
    return 1;
  }
  return 0;
}

I'm trying to come up with the most efficient solution, but my experience with CUDA and parallel algorithms is limited.


Answer 1:


You can use thrust to do this.

Use thrust::sort_by_key followed by thrust::reduce_by_key

Here's an example:

#include <iostream>
#include <iterator>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#define N 12
typedef thrust::device_vector<int>::iterator dintiter;
int main(){

  thrust::device_vector<int> keys(N);
  thrust::device_vector<int> values(N);
  thrust::device_vector<int> new_keys(N);
  thrust::device_vector<int> new_values(N);
  thrust::sequence(keys.begin(), keys.end());
  thrust::sequence(values.begin(), values.end());

  keys[3] = 1;
  keys[9] = 1;
  keys[8] = 2;
  keys[7] = 4;

  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  thrust::pair<dintiter, dintiter> new_end;
  new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());

  std::cout << "results  values:" << std::endl;
  thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl << "results keys:" << std::endl;
  thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl;

  return 0;
}
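
Applied to the exact pairs from the question, a minimal sketch (the variable names are mine) prints <1, 5> <2, 10> <3, 9> <7, 7>:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>

typedef thrust::device_vector<int>::iterator dintiter;
int main(){

  // --- The unsorted key/value pairs from the question
  int raw_keys[]   = {7, 2, 3, 2, 1, 7, 3, 7};
  int raw_values[] = {4, 8, 1, 2, 5, 1, 8, 2};
  const int n = 8;

  thrust::device_vector<int> keys(raw_keys, raw_keys + n);
  thrust::device_vector<int> values(raw_values, raw_values + n);
  thrust::device_vector<int> new_keys(n);
  thrust::device_vector<int> new_values(n);

  // --- Group equal keys together, then sum the values of each run
  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  thrust::pair<dintiter, dintiter> new_end;
  new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());

  const int numReduced = new_end.first - new_keys.begin();
  for (int i = 0; i < numReduced; i++)
    std::cout << "<" << new_keys[i] << ", " << new_values[i] << ">" << std::endl;

  return 0;
}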



Answer 2:


From your post, it seems that you need to sort many small arrays by key. Quoting your question:

This works just fine for small sets of data but with larger sets (though still within the size of a single block) a single call just won't do it.

Below you will find a fully worked example constructed around my answer to Sorting many small arrays in CUDA and using cub::BlockRadixSort.

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad      <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadIntT;
    typedef cub::BlockLoad      <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadFloatT;
    typedef cub::BlockStore     <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreIntT;
    typedef cub::BlockStore     <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreFloatT;
    typedef cub::BlockRadixSort <int ,   BLOCK_THREADS, ITEMS_PER_THREAD, float>                 BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ union {
        typename BlockLoadIntT      ::TempStorage loadInt;
        typename BlockLoadFloatT    ::TempStorage loadFloat;
        typename BlockStoreIntT     ::TempStorage storeInt;
        typename BlockStoreFloatT   ::TempStorage storeFloat;
        typename BlockRadixSortT    ::TempStorage sort;
    } temp_storage;

    // --- Obtain this block's segment of consecutive keys (blocked across threads)
    int   thread_keys[ITEMS_PER_THREAD];
    float thread_values[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    BlockLoadIntT(temp_storage.loadInt).Load(d_keys   + block_offset, thread_keys);
    BlockLoadFloatT(temp_storage.loadFloat).Load(d_values + block_offset, thread_values);
    __syncthreads(); 

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(thread_keys, thread_values);
    __syncthreads(); 

    // --- Store the sorted segment
    BlockStoreIntT(temp_storage.storeInt).Store(d_keys_result   + block_offset, thread_keys);
    BlockStoreFloatT(temp_storage.storeFloat).Store(d_values_result + block_offset, thread_values);

}

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValues[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, float>  BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k] = d_values[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys[threadIdx.x * ITEMS_PER_THREAD + k]   = d_keys[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)  [ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys   + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<float(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayValues + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write the sorted data back to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_result[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k];
        d_keys_result  [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys  [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_values = (float *)malloc(N * sizeof(float));
    int *h_keys     = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_values[i] = rand() % RANGE;
        h_keys[i]   = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys[k * numElemsPerArray + i], h_values[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_result1 = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_values;            gpuErrchk(cudaMalloc((void **)&d_values,         N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_result1;    gpuErrchk(cudaMalloc((void **)&d_values_result1, N * sizeof(float)));
    float *d_values_result2;    gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;      gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel no shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result1, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_result1, d_values_result1, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel no shared\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_result1[k * numElemsPerArray + i]);

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result2, d_keys_result2); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_result2, d_values_result2, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result2,   d_keys_result2,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel shared\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result2[k * numElemsPerArray + i], h_values_result2[k * numElemsPerArray + i]);

    return 0;
}
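
This and the following examples rely on the gpuErrchk macro from the author's Utilities.cuh. If you don't have that file, the standard CUDA error-checking idiom is an adequate stand-in:

// Minimal stand-in for the gpuErrchk macro provided by "Utilities.cuh"
// (drop the #include "Utilities.cuh" line if you use this instead)
#include <cstdio>
#include <cstdlib>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}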



Answer 3:


I recently had the problem of extending the approach above to the case when multiple arrays must be ordered according to the same key.

It seems that, due to its prototype, it is not possible to use cub::BlockRadixSort by "packing" the arrays using zip iterators and tuples; see C++ operating on “packed” arrays. Accordingly, I have exploited the helper-index approach suggested in the quoted post: the keys are sorted together with an array of helper indices, and the sorted indices are then used to gather the value arrays.
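
As an aside, the same sort-once/gather-many idea can also be expressed at the device-wide (rather than block-wide) level with thrust; the helper below is only an illustrative sketch of the pattern, not part of the block-level solution that follows:

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>

// Sort several value arrays by one key array: sort (key, index) pairs once,
// then gather each value array through the sorted index map.
void sortTwoByKey(thrust::device_vector<int>   &keys,
                  thrust::device_vector<float> &valuesA,
                  thrust::device_vector<float> &valuesB)
{
    const int n = keys.size();
    thrust::device_vector<int> indices(n);
    thrust::sequence(indices.begin(), indices.end());   // 0, 1, 2, ...

    thrust::sort_by_key(keys.begin(), keys.end(), indices.begin());

    thrust::device_vector<float> tmp(n);
    thrust::gather(indices.begin(), indices.end(), valuesA.begin(), tmp.begin());
    valuesA = tmp;
    thrust::gather(indices.begin(), indices.end(), valuesB.begin(), tmp.begin());
    valuesB = tmp;
}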

Here is the example I worked out:

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, int>    BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [threadIdx.x * ITEMS_PER_THREAD + k] = d_keys   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k] =                          threadIdx.x * ITEMS_PER_THREAD + k ;
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Gather the values through the sorted indices and write the results back to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys                             [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA    = (float *)malloc(N * sizeof(float));
    float *h_valuesB    = (float *)malloc(N * sizeof(float));
    int *h_keys         = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA  = (float *)malloc(N * sizeof(float));
    float *h_values_resultB  = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1    = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;           gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;           gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA;    gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB;    gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}



Answer 4:


Following my second answer, I want to provide a further extension to the case when CUB is used to sort elements stored in a linear shared memory array that is filled by a 2D block of threads. Accordingly, cub::BlockRadixSort is specialized for a 2D thread block (via its BLOCK_DIM_Y template parameter) instead of the 1D thread block of the previous answer. Here is a fully worked example:

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCKSIZE_X, int BLOCKSIZE_Y, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type
    typedef cub::BlockRadixSort <int , BLOCKSIZE_X, ITEMS_PER_THREAD, int, 4, false, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCKSIZE_Y> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_keys   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] =                          (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k ;
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Gather the values through the sorted indices and write the results back to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys                             [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int blockSize_x       = 2;
    const int blockSize_y       = 4;
    const int numElemsPerArray  = blockSize_x * blockSize_y;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = numElemsPerArray / (blockSize_x * blockSize_y);

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA    = (float *)malloc(N * sizeof(float));
    float *h_valuesB    = (float *)malloc(N * sizeof(float));
    int *h_keys         = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA  = (float *)malloc(N * sizeof(float));
    float *h_values_resultB  = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1    = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;           gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;           gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA;    gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB;    gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    // --- Launch with an explicit 2D thread block so that threadIdx.y is meaningful
    shared_BlockSortKernel<blockSize_x, blockSize_y, numElemsPerThread><<<numArrays, dim3(blockSize_x, blockSize_y)>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}


Source: https://stackoverflow.com/questions/17597346/sorting-small-arrays-by-key-in-cuda
