What is the most efficient way to transpose a matrix in CUDA?

后端 未结 3 1785
予麋鹿
予麋鹿 2021-01-06 16:27

I have an M*N host memory matrix, and upon copying into device memory, I need it to be transposed into an N*M matrix. Is there any CUDA (cuBLAS, ...) routine that performs this transposition?

3条回答
  •  遥遥无期
    2021-01-06 17:10

    To answer your question on efficiency, I have compared two ways to perform matrix transposition, one using the Thrust library and one using cublas<t>geam, as suggested by Robert Crovella. The result of the comparison is the following on a Kepler K20c card:

    | Matrix size   | Thrust [ms]   | cuBLAS [ms]   |
    | ------------- | ------------- | ------------- |
    | 32x32         | 0.015         | 0.016         |
    | 64x64         | 0.015         | 0.017         |
    | 128x128       | 0.019         | 0.017         |
    | 256x256       | 0.028         | 0.017         |
    | 512x512       | 0.088         | 0.042         |
    | 1024x1024     | 0.34          | 0.13          |
    | 2048x2048     | 1.24          | 0.48          |
    | 4096x4096     | 11.02         | 1.98          |
    

    As can be seen, cublas<t>geam outperforms the version using Thrust. Below is the code to perform the comparison.

    #include <cstdio>
    #include <cstdlib>
    #include <cassert>
    #include <iostream>
    #include <iomanip>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/gather.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    
    /**********************/
    /* cuBLAS ERROR CHECK */
    /**********************/
    #ifndef cublasSafeCall
    #define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
    #endif
    
    // Reports a cuBLAS failure together with the call site captured by the
    // cublasSafeCall macro, then resets the device and aborts.
    inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
    {
        if (CUBLAS_STATUS_SUCCESS != err) {
            // Bug fix: report the caller's location (the `file`/`line`
            // parameters filled in by the macro), not this helper's own
            // __FILE__/__LINE__, which always pointed here.
            fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n", file, line, err);
            // Non-standard conio getch() removed: it is not declared by any
            // standard header and made the error path Windows-only.
            cudaDeviceReset();
            assert(0);
        }
    }
    
    // Convert a linear index into a matrix with n columns to the linear
    // index of the corresponding element in the transposed (m-column,
    // row-major) layout. Template arguments restored: HTML scraping had
    // stripped the <size_t, size_t> from unary_function.
    struct transpose_index : public thrust::unary_function<size_t, size_t>
    {
        size_t m, n;
    
        __host__ __device__
        transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
    
        // For input index k in a row-major matrix with n columns,
        // (i, j) = (k / n, k % n); the result m * j + i is the row-major
        // index of element (j, i) in a matrix with m columns.
        __host__ __device__
        size_t operator()(size_t linear_index) const   // const: required by transform_iterator
        {
            size_t i = linear_index / n;   // source row
            size_t j = linear_index % n;   // source column
    
            return m * j + i;
        }
    };
    
    // convert a linear index to a row index
    struct row_index : public thrust::unary_function
    {
        size_t n;
    
        __host__ __device__
        row_index(size_t _n) : n(_n) {}
    
        __host__ __device__
    
        size_t operator()(size_t i)
        {
            return i / n;
        }
    };
    
    // transpose an M-by-N array
    // Transpose an m-by-n, row-major matrix `src` into the n-by-m, row-major
    // matrix `dst` (dst must already be sized to m * n elements).
    // Template parameter list and thrust template arguments restored: HTML
    // scraping had stripped everything between angle brackets.
    template <typename T>
    void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst)
    {
        thrust::counting_iterator<size_t> indices(0);
    
        // For each destination index k, gather src[transpose_index(n, m)(k)]:
        // the functor maps an index in the n-by-m destination back to the
        // matching index in the m-by-n source.
        thrust::gather
        (thrust::make_transform_iterator(indices, transpose_index(n, m)),
        thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
        src.begin(), dst.begin());
    }
    
    // print an M-by-N array
    template 
    void print(size_t m, size_t n, thrust::device_vector& d_data)
    {
        thrust::host_vector h_data = d_data;
    
        for(size_t i = 0; i < m; i++)
        {
            for(size_t j = 0; j < n; j++)
                std::cout << std::setw(8) << h_data[i * n + j] << " ";
                std::cout << "\n";
        }
    }
    
    // Demonstrates and compares the Thrust and cuBLAS transposition paths
    // on a small 5x4 double matrix, printing input and both results.
    int main(void)
    {
        size_t m = 5; // number of rows
        size_t n = 4; // number of columns
    
        // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
        // (template arguments restored; scraping had stripped <double>)
        thrust::device_vector<double> data(m * n, 1.);
        data[1] = 2.;
        data[3] = 3.;
    
        std::cout << "Initial array" << std::endl;
        print(m, n, data);
    
        std::cout << "Transpose array - Thrust" << std::endl;
        thrust::device_vector<double> transposed_thrust(m * n);
        transpose(m, n, data, transposed_thrust);
        print(n, m, transposed_thrust);
    
        std::cout << "Transpose array - cuBLAS" << std::endl;
        thrust::device_vector<double> transposed_cuBLAS(m * n);
        double* dv_ptr_in  = thrust::raw_pointer_cast(data.data());
        double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
        double alpha = 1.;
        double beta  = 0.;
        cublasHandle_t handle;
        cublasSafeCall(cublasCreate(&handle));
        // geam computes C = alpha*op(A) + beta*op(B); with beta == 0 the B
        // term is ignored, so this performs the out-of-place transpose.
        // cuBLAS is column-major, hence lda = n for the row-major input.
        // size_t -> int narrowing made explicit (cublasDgeam takes int dims).
        cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T,
                                   static_cast<int>(m), static_cast<int>(n),
                                   &alpha, dv_ptr_in, static_cast<int>(n),
                                   &beta,  dv_ptr_in, static_cast<int>(n),
                                   dv_ptr_out, static_cast<int>(m)));
        print(n, m, transposed_cuBLAS);
    
        // Leak fix: release the cuBLAS handle (original never destroyed it).
        cublasSafeCall(cublasDestroy(handle));
    
        // Non-standard conio getch() removed; it is undeclared by any
        // standard header and blocked exit on non-Windows platforms.
        return 0;
    }
    

提交回复
热议问题