How to improve computational time for sorting with thrust?


Question


I found the 'vectorized/batch sort' and 'nested sort' methods at the link below: How to use Thrust to sort the rows of a matrix?

When I tried these methods on 500 rows of 1000 elements each, the results were:

  1. vectorized/batch sort : 66ms
  2. nested sort : 3290ms

I am using a 1080 Ti HOF to do this operation, but it takes far too long compared to your case.
In the link below, the same kind of operation takes less than 10 ms, and in some cases close to 100 microseconds.
(How to find median value in 2d array for each column with CUDA?)

Could you recommend how to optimize this method to reduce the operation time?

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 500
#define DSIZE 1000

int my_mod_start = 0;
int my_mod() {
    return (my_mod_start++) / DSIZE;
}

bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2) {
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}


struct sort_functor
{
    thrust::device_ptr<int> data;
    int dsize;
    __host__ __device__
        void operator()(int start_idx)
    {
        thrust::sort(thrust::device, data + (dsize*start_idx), data + (dsize*(start_idx + 1)));
    }
};

#include <time.h>
#include <windows.h>

// returns elapsed wall-clock time in milliseconds (despite the "usec" name);
// minutes are included so the result stays valid across a minute boundary
unsigned long long dtime_usec(LONG start) {

    SYSTEMTIME timer2;
    GetSystemTime(&timer2);
    LONG end = (timer2.wMinute * 60000) + (timer2.wSecond * 1000) + timer2.wMilliseconds;

    return (end - start);
}

int main() {
    for (int i = 0; i < 3; i++) {
        SYSTEMTIME timer1;
        cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16 * DSIZE*NSORTS));
        thrust::host_vector<int> h_data(DSIZE*NSORTS);
        thrust::generate(h_data.begin(), h_data.end(), rand);
        thrust::device_vector<int> d_data = h_data;

        // first time a loop
        thrust::device_vector<int> d_result1 = d_data;
        thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
        GetSystemTime(&timer1);
        LONG time_ms1 = (timer1.wMinute * 60000) + (timer1.wSecond * 1000) + timer1.wMilliseconds;
        for (int i = 0; i < NSORTS; i++)
            thrust::sort(r1ptr + (i*DSIZE), r1ptr + ((i + 1)*DSIZE));
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;

        //vectorized sort
        thrust::device_vector<int> d_result2 = d_data;
        thrust::host_vector<int> h_segments(DSIZE*NSORTS);
        thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
        thrust::device_vector<int> d_segments = h_segments;
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wMinute * 60000) + (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
        thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;

        //nested sort
        thrust::device_vector<int> d_result3 = d_data;
        sort_functor f = { d_result3.data(), DSIZE };
        thrust::device_vector<int> idxs(NSORTS);
        thrust::sequence(idxs.begin(), idxs.end());
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wMinute * 60000) + (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::for_each(idxs.begin(), idxs.end(), f);
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;

    }
    return 0;
}

Answer 1:


The main takeaway from your Thrust experience is that you should never compile a debug project, or compile with the device debug switch (-G), when you are interested in performance. Compiling device debug code causes the compiler to omit many performance optimizations. The difference in your case was quite dramatic: about a 30x improvement going from debug to release code.
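For example, assuming your source file is saved as t1760.cu (a hypothetical name), the two build modes would look roughly like this on the nvcc command line; only the optimized build is meaningful for timing:

$ nvcc -G -o t1760_debug t1760.cu        (device-debug build: device-code optimizations disabled)
$ nvcc -O3 -o t1760_release t1760.cu     (optimized build: use this when measuring performance)

In a Visual Studio CUDA project this roughly corresponds to building the Release configuration instead of Debug, since the Debug configuration passes -G to nvcc by default.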

Here is a segmented CUB sort, where we launch 500 blocks and each block handles a separate 1024-element array. The CUB code is lifted from here.

$ cat t1761.cu
#include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
#include <iostream>
const int ipt=8;
const int tpb=128;
__global__ void ExampleKernel(int *data)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads owning 8 integer items each
    typedef cub::BlockRadixSort<int, tpb, ipt> BlockRadixSort;
    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_keys[ipt];
    // just create some synthetic data in descending order 1023 1022 1021 1020 ...
    for (int i = 0; i < ipt; i++) thread_keys[i] = (tpb-1-threadIdx.x)*ipt+i;
    // Collectively sort the keys
    BlockRadixSort(temp_storage).Sort(thread_keys);
    __syncthreads();
    // write results to output array
    for (int i = 0; i < ipt; i++) data[blockIdx.x*ipt*tpb + threadIdx.x*ipt+i] = thread_keys[i];
}


int main(){

    const int blks = 500;
    int *data;
    cudaMalloc(&data, blks*ipt*tpb*sizeof(int));
    ExampleKernel<<<blks,tpb>>>(data);
    int *h_data = new int[blks*ipt*tpb];
    cudaMemcpy(h_data, data, blks*ipt*tpb*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10; i++) std::cout << h_data[i] << " ";
    std::cout << std::endl;
}

$ nvcc -o t1761 t1761.cu -I/path/to/cub/cub-1.8.0
$ CUDA_VISIBLE_DEVICES="2" nvprof ./t1761
==13713== NVPROF is profiling process 13713, command: ./t1761
==13713== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
0 1 2 3 4 5 6 7 8 9
==13713== Profiling application: ./t1761
==13713== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.35%  308.66us         1  308.66us  308.66us  308.66us  [CUDA memcpy DtoH]
                   39.65%  202.79us         1  202.79us  202.79us  202.79us  ExampleKernel(int*)
      API calls:   98.39%  210.79ms         1  210.79ms  210.79ms  210.79ms  cudaMalloc
                    0.72%  1.5364ms         1  1.5364ms  1.5364ms  1.5364ms  cudaMemcpy
                    0.32%  691.15us         1  691.15us  691.15us  691.15us  cudaLaunchKernel
                    0.28%  603.26us        97  6.2190us     400ns  212.71us  cuDeviceGetAttribute
                    0.24%  516.56us         1  516.56us  516.56us  516.56us  cuDeviceTotalMem
                    0.04%  79.374us         1  79.374us  79.374us  79.374us  cuDeviceGetName
                    0.01%  13.373us         1  13.373us  13.373us  13.373us  cuDeviceGetPCIBusId
                    0.00%  5.0810us         3  1.6930us     729ns  2.9600us  cuDeviceGetCount
                    0.00%  2.3120us         2  1.1560us     609ns  1.7030us  cuDeviceGet
                    0.00%     748ns         1     748ns     748ns     748ns  cuDeviceGetUuid
$

(CUDA 10.2.89, RHEL 7)

Above I am running on a Tesla K20x, which has performance that is "closer" to your 1080ti than a Tesla V100. We see that the kernel execution time is ~200us. If I run the exact same code on a Tesla V100, the kernel execution time drops to ~35us:

$ CUDA_VISIBLE_DEVICES="0" nvprof ./t1761
==13814== NVPROF is profiling process 13814, command: ./t1761
0 1 2 3 4 5 6 7 8 9
==13814== Profiling application: ./t1761
==13814== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   82.33%  163.43us         1  163.43us  163.43us  163.43us  [CUDA memcpy DtoH]
                   17.67%  35.073us         1  35.073us  35.073us  35.073us  ExampleKernel(int*)
      API calls:   98.70%  316.92ms         1  316.92ms  316.92ms  316.92ms  cudaMalloc
                    0.87%  2.7879ms         1  2.7879ms  2.7879ms  2.7879ms  cuDeviceTotalMem
                    0.19%  613.75us        97  6.3270us     389ns  205.37us  cuDeviceGetAttribute
                    0.19%  601.61us         1  601.61us  601.61us  601.61us  cudaMemcpy
                    0.02%  72.718us         1  72.718us  72.718us  72.718us  cudaLaunchKernel
                    0.02%  59.905us         1  59.905us  59.905us  59.905us  cuDeviceGetName
                    0.01%  37.886us         1  37.886us  37.886us  37.886us  cuDeviceGetPCIBusId
                    0.00%  4.6830us         3  1.5610us     546ns  2.7850us  cuDeviceGetCount
                    0.00%  1.9900us         2     995ns     587ns  1.4030us  cuDeviceGet
                    0.00%     677ns         1     677ns     677ns     677ns  cuDeviceGetUuid
$

You'll note there is no "input" array; I'm just synthesizing data in the kernel, since we are primarily interested in performance. If you need to handle an array size like 1000, you should probably just pad each array to 1024 (e.g. pad with a very large number, then ignore the last values in the sorted result), as sketched below.
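Here is a minimal sketch of that padding idea (my own illustration, not part of the answer above): the host pads each 1000-element row to 1024 with INT_MAX, a variant of the kernel sorts real input in place instead of synthesizing data, and only the first 1000 values of each padded segment are used afterwards. The kernel name PaddedSortKernel and the constants are assumptions for this example.

// padded_sort_sketch.cu -- illustrative only
#include <cub/cub.cuh>
#include <climits>
#include <cstdlib>
#include <vector>
#include <iostream>

const int ipt  = 8;            // items per thread
const int tpb  = 128;          // threads per block -> tpb*ipt = 1024 keys per block
const int ROW  = 1000;         // real elements per row
const int PAD  = tpb * ipt;    // padded row width (1024)
const int NSEG = 500;          // number of rows, one block per row

// hypothetical variant of ExampleKernel that sorts real (padded) input in place
__global__ void PaddedSortKernel(int *data)
{
    typedef cub::BlockRadixSort<int, tpb, ipt> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    int thread_keys[ipt];
    // blocked arrangement: thread t of block b owns keys [b*PAD + t*ipt, ... + ipt)
    int base = blockIdx.x * PAD + threadIdx.x * ipt;
    for (int i = 0; i < ipt; i++) thread_keys[i] = data[base + i];
    BlockRadixSort(temp_storage).Sort(thread_keys);
    __syncthreads();
    for (int i = 0; i < ipt; i++) data[base + i] = thread_keys[i];
}

int main(){
    // host-side padding: real data in the first ROW slots of each segment,
    // INT_MAX in the remaining PAD-ROW slots so the padding sorts to the end
    std::vector<int> h_padded(NSEG * (size_t)PAD, INT_MAX);
    for (int r = 0; r < NSEG; r++)
        for (int c = 0; c < ROW; c++)
            h_padded[r * PAD + c] = rand();

    int *d_data;
    cudaMalloc(&d_data, h_padded.size() * sizeof(int));
    cudaMemcpy(d_data, h_padded.data(), h_padded.size() * sizeof(int), cudaMemcpyHostToDevice);
    PaddedSortKernel<<<NSEG, tpb>>>(d_data);
    cudaMemcpy(h_padded.data(), d_data, h_padded.size() * sizeof(int), cudaMemcpyDeviceToHost);

    // the sorted row r is h_padded[r*PAD .. r*PAD+ROW-1]; print a few values of row 0
    for (int i = 0; i < 10; i++) std::cout << h_padded[i] << " ";
    std::cout << std::endl;
    cudaFree(d_data);
    return 0;
}

Compile it in release mode (no -G), as described above; each block still sorts exactly one padded 1024-element segment, so the kernel cost should be comparable to the synthetic version.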



Source: https://stackoverflow.com/questions/62935564/how-to-find-median-value-in-2d-array-for-each-column-with-cuda
