cuBLAS matrix inverse much slower than MATLAB


As @RobertCrovella said, you should not use batched small matrix APIs for a single large matrix inversion.

Basically you can use the same approach as in your code, but with the non-batched versions of getrf() and getri() to maximize performance for a large matrix.

You can find getrf() here:

http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf

For getri(): although the CUDA toolkit does not provide a getri() to solve AX=I, where A has been LU-factored by getrf(), it does provide getrs() to solve AX=B. All you need to do is set B=I before calling getrs(), and the solution X is the inverse.

http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs
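
Below is a minimal sketch of how the two calls could be combined for double precision (cusolverDnDgetrf / cusolverDnDgetrs). It assumes the cuSOLVER handle is already created, A is already on the device in column-major layout, and error checking is omitted for brevity:

```c
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <stdlib.h>

// Invert an n x n double matrix stored column-major on the device.
// d_A is overwritten by its LU factors; the inverse is written to d_Ainv.
void invert_on_device(cusolverDnHandle_t handle, double *d_A, double *d_Ainv, int n)
{
    int lwork = 0;
    cusolverDnDgetrf_bufferSize(handle, n, n, d_A, n, &lwork);

    double *d_work; int *d_ipiv, *d_info;
    cudaMalloc(&d_work, lwork * sizeof(double));
    cudaMalloc(&d_ipiv, n * sizeof(int));
    cudaMalloc(&d_info, sizeof(int));

    // LU factorization with partial pivoting: P*A = L*U
    cusolverDnDgetrf(handle, n, n, d_A, n, d_work, d_ipiv, d_info);

    // Build B = I on the host and copy it to d_Ainv
    double *h_I = (double *)calloc((size_t)n * n, sizeof(double));
    for (int i = 0; i < n; ++i) h_I[i + (size_t)i * n] = 1.0;
    cudaMemcpy(d_Ainv, h_I, (size_t)n * n * sizeof(double), cudaMemcpyHostToDevice);
    free(h_I);

    // Solve A * X = I using the LU factors; X (the inverse) overwrites d_Ainv
    cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, d_A, n, d_ipiv, d_Ainv, n, d_info);

    cudaFree(d_work); cudaFree(d_ipiv); cudaFree(d_info);
}
```

Both calls operate on a single large matrix, so they use the full GPU rather than the many-small-problems path the batched APIs are tuned for.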
