CUBLAS: Incorrect inversion for matrix with zero pivot

前端 未结 1 1876
误落风尘
误落风尘 2020-12-01 23:11

Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched, respectively).

1条回答
  •  一向
    一向 (楼主)
    2020-12-01 23:19

    There seems to be a bug in the current CUBLAS library implementation of cublasgetrfBatched for matrices of dimension (n) such that 3<=n<=16, when there is a "zero pivot" as you say.

    A possible workaround is to "identity-extend" your A matrix to be inverted, when n<17, to a size of 17x17 (using matlab nomenclature):

     LU = getrf( [A 0 ; 0 I]);
    

    continuing, you can then use cublasgetriBatched in an "ordinary" fashion:

     invA = getri( LU(1:3,1:3) )
    

    (You can also leave everything at n=17, call getri that way, and then extract the result as the leading 3x3 block, i.e. the first 3 rows and first 3 columns, of invA.)

    Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:

    $ cat t340.cu
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    /*
     * cudacall(call): evaluates a CUDA runtime call exactly once and checks its
     * cudaError_t result. On anything other than cudaSuccess it prints the file,
     * line and cudaGetErrorString() text to stderr, resets the device and exits
     * with EXIT_FAILURE. The do { ... } while (0) wrapper makes the macro behave
     * as a single statement; the `break` only leaves that wrapper.
     */
    #define cudacall(call)                                                          \
        do                                                                          \
        {                                                                           \
            const cudaError_t cudacall_rc_ = (call);                                \
            if(cudacall_rc_ == cudaSuccess)                                         \
                break;                                                              \
            fprintf(stderr,                                                         \
                    "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n",             \
                    __FILE__, __LINE__, cudaGetErrorString(cudacall_rc_));          \
            cudaDeviceReset();                                                      \
            exit(EXIT_FAILURE);                                                     \
        }                                                                           \
        while (0)
    
    /*
     * cublascall(call): evaluates a cuBLAS call exactly once and checks its
     * cublasStatus_t result. On anything other than CUBLAS_STATUS_SUCCESS it
     * prints the file, line and numeric status code to stderr, resets the device
     * and exits with EXIT_FAILURE. The do { ... } while(0) wrapper makes the
     * macro behave as a single statement; the `break` only leaves that wrapper.
     */
    #define cublascall(call)                                                        \
        do                                                                          \
        {                                                                           \
            const cublasStatus_t cublascall_rc_ = (call);                           \
            if(cublascall_rc_ == CUBLAS_STATUS_SUCCESS)                             \
                break;                                                              \
            fprintf(stderr,                                                         \
                    "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n",             \
                    __FILE__, __LINE__, cublascall_rc_);                            \
            cudaDeviceReset();                                                      \
            exit(EXIT_FAILURE);                                                     \
        }                                                                           \
        while(0)
    
    
    /*
     * Inverts the leading n x n block of an identity-extended matrix on the GPU.
     *
     * Workaround for batched LU on small matrices with a zero pivot: the caller
     * supplies the matrix padded to 17 x 17 as [A 0; 0 I], the LU factorization
     * is run on the full padded matrix, and only the leading n x n block of the
     * factors is then inverted.
     *
     * src_d  device buffer holding the padded 17 x 17 matrix (leading dimension
     *        17); it is passed to cublasSgetrfBatched as the matrix to factor.
     *        NOTE(review): cuBLAS documents getrfBatched as in-place, so src_d
     *        is presumably overwritten with the LU factors - confirm.
     * dst_d  device buffer receiving the n x n inverse (leading dimension n).
     * n      dimension of the block to invert, 0 <= n <= 17.
     *
     * On any CUDA/cuBLAS error, or when either routine reports a non-zero info
     * value, a message is printed to stderr, the device is reset and the process
     * exits with EXIT_FAILURE.
     */
    void invert_device(float* src_d, float* dst_d, int n)
    {
        // Size of the identity-extended matrix the caller supplies in src_d.
        const int paddedDim = 17;
        const int batchSize = 1;
        const int lda = paddedDim;

        // Fail early with a clear message instead of a bare cuBLAS status code.
        if(n < 0 || n > paddedDim)
        {
            fprintf(stderr, "Invalid dimension n = %d (expected 0 <= n <= %d)\n", n, paddedDim);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

        cublasHandle_t handle;
        cublascall(cublasCreate_v2(&handle));

        // Pivot indices (paddedDim per matrix) and one info value per matrix.
        int *P, *INFO;
        cudacall(cudaMalloc(&P,paddedDim * batchSize * sizeof(int)));
        cudacall(cudaMalloc(&INFO,batchSize * sizeof(int)));

        // The batched API takes a device-resident array of matrix pointers.
        float *A[] = { src_d };
        float** A_d;
        cudacall(cudaMalloc(&A_d,sizeof(A)));
        cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

        cublascall(cublasSgetrfBatched(handle,paddedDim,A_d,lda,P,INFO,batchSize));

        int INFOh = 0;
        cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

        // info = i > 0 means U(i,i) is exactly zero for ANY i, not only the last
        // pivot; a negative value flags an illegal argument. Either way the
        // factors cannot be used for inversion.
        if(INFOh != 0)
        {
            fprintf(stderr, "Factorization Failed: Matrix is singular\n");
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

        float* C[] = { dst_d };
        float** C_d;
        cudacall(cudaMalloc(&C_d,sizeof(C)));
        cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

        // Invert only the leading n x n block of the factors; the input keeps
        // the padded leading dimension while the output is densely packed.
        cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));

        cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

        if(INFOh != 0)
        {
            fprintf(stderr, "Inversion Failed: Matrix is singular\n");
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

        // Release every device allocation made here, including the pointer arrays.
        cudacall(cudaFree(C_d));
        cudacall(cudaFree(A_d));
        cudacall(cudaFree(INFO));
        cudacall(cudaFree(P));
        cublascall(cublasDestroy_v2(handle));
    }
    
    /*
     * Host-side wrapper: uploads an identity-extended 17 x 17 matrix, inverts its
     * leading n x n block on the device via invert_device(), and downloads the
     * n x n result.
     *
     * src  host buffer of 17 * 17 floats holding the padded matrix.
     * dst  host buffer of at least n * n floats receiving the inverse.
     * n    dimension of the block to invert.
     *
     * Any CUDA error terminates the process through cudacall.
     */
    void invert(float* src, float* dst, int n)
    {
        // The input is always the padded matrix; only the output scales with n.
        const int paddedDim = 17;
        const size_t srcBytes = static_cast<size_t>(paddedDim) * paddedDim * sizeof(float);
        const size_t dstBytes = static_cast<size_t>(n) * n * sizeof(float);

        float* src_d = NULL;
        float* dst_d = NULL;

        cudacall(cudaMalloc(&src_d,srcBytes));
        cudacall(cudaMemcpy(src_d,src,srcBytes,cudaMemcpyHostToDevice));
        cudacall(cudaMalloc(&dst_d,dstBytes));

        invert_device(src_d,dst_d,n);

        cudacall(cudaMemcpy(dst,dst_d,dstBytes,cudaMemcpyDeviceToHost));

        // Check the frees too, so a failure here is not silently dropped.
        cudacall(cudaFree(src_d));
        cudacall(cudaFree(dst_d));
    }
    
    void test_invert()
    {
        const int n = 3;
    
        //Random matrix with full pivots
    /*    float full_pivots[n*n] = { 0.5, 3, 4,
                                    1, 3, 10,
                                    4 , 9, 16 };
    
        //Almost same as above matrix with first pivot zero
        float zero_pivot[n*n] = { 0, 3, 4,
                                  1, 3, 10,
                                  4 , 9, 16 };
    
        float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                            3, 3, 9,
                                            4 , 10, 16 };
    */
        float zero_pivot_war[17*17] = {
                                            0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                            1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                            4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                            0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                            0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
                                            0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
                                            0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
                                            0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
                                            0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
                                            0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
                                            0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
                                            0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
                                            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
                                            0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
                                            0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
                                            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
                                            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
    /*
        float another_zero_pivot[n*n] = { 0, 3, 4,
                                          1, 5, 6,
                                          9, 8, 2 };
    
        float another_full_pivot[n * n] = { 22, 3, 4,
                                            1, 5, 6,
                                            9, 8, 2 };
    
        float singular[n*n] = {1,2,3,
                               4,5,6,
                               7,8,9};
    */
        float result[n*n];
    
        //Select matrix by setting "a"
        float* a = zero_pivot_war;
    
        fprintf(stdout, "Input:\n\n");
        for(int i=0; i

    The above result appears to me to be correct based on a simple test elsewhere.

    I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).

    0 讨论(0)
提交回复
热议问题