QR decomposition to solve linear systems in CUDA

前端 未结 3 416
离开以前
离开以前 2020-12-29 16:06

I\'m writing an image restoration algorithm on GPU, details in

Cuda: least square solving , poor in speed

The QR decomposition method to solve the linear sy

3条回答
  •  失恋的感觉
    2020-12-29 16:56

    As of February 2015, CUDA 7.0 (now in Release Candidate) offers the new cuSOLVER library including the possibility of calculating the QR decomposition of a matrix. This, in conjunction with the cuBLAS library, enables to solve a linear system according to the guidelines expounded in Appendix C of the cuSOLVER User's Guide.

    The steps you have to follow are three:

    1) geqrf: it calculates the QR decomposition of the matrix by returning the upper triangular matrix R in the upper triangular part of A and the matrix Q in the form of Householder's vectors stored in the lower triangular part of A, while the scaling factors of the Householder's vectors are returned by the TAU parameter;

    2) ormqr: it returns the product of Q and a matrix C by overwriting C;

    3) trsm: it solves an upper triangular linear system.

    Below, I'm providing a full example of the usage of those routines.

    #include "cuda_runtime.h"
    #include "device_launch_paraMeters.h"
    
    #include
    #include
    #include
    #include
    #include
    #include
    
    #include 
    #include 
    #include 
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    #define BLOCK_SIZE 32
    
    #define prec_save 10
    
    /***************/
    /* COPY KERNEL */
    /***************/
    __global__ void copy_kernel(const double * __restrict d_in, double * __restrict d_out, const int M, const int N) {
    
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        const int j = blockIdx.y * blockDim.y + threadIdx.y;
    
        if ((i < N) && (j < N)) d_out[j * N + i] = d_in[j * M + i];
    }
    
    /****************************************************/
    /* LOAD INDIVIDUAL REAL MATRIX FROM txt FILE TO CPU */
    /****************************************************/
    // --- Load individual real matrix from txt file
    template 
    void loadCPUrealtxt(T * __restrict h_out, const char *filename, const int M) {
    
        std::ifstream infile;
        infile.open(filename);
        for (int i = 0; i < M; i++) {
            double temp;
            infile >> temp;
            h_out[i] = (T)temp;
        }
    
        infile.close();
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM GPU TO FILE */
    /************************************/
    template 
    void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    
        T *h_in = (T *)malloc(M * sizeof(T));
    
        gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        // --- Extension of Appendix C.1 of cuSOLVER library User's Guide
        // --- See also http://www.netlib.org/lapack/lug/node40.html
    
        // --- ASSUMPTION Nrows >= Ncols
        const int Nrows = 500;
        const int Ncols = 500;
    
        TimingGPU timerGPU;
        double timingQR, timingSolve;
    
        // --- cuSOLVE input/output parameters/arrays
        int work_size = 0;
        int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    
        // --- CUDA solver initialization
        cusolverDnHandle_t solver_handle;
        cusolveSafeCall(cusolverDnCreate(&solver_handle));
    
        // --- CUBLAS initialization
        cublasHandle_t cublas_handle;
        cublasSafeCall(cublasCreate(&cublas_handle));
    
        /***********************/
        /* SETTING THE PROBLEM */
        /***********************/
        // --- Setting the host, Nrows x Ncols matrix
        double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
        loadCPUrealtxt(h_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testMatrix.txt", Nrows * Ncols);
    
        // --- Setting the device matrix and moving the host matrix to the device
        double *d_A;            gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
        gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
    
        // --- Initializing the data matrix C (Of course, this step could be done by a kernel function directly on the device).
        // --- Notice that, in this case, only the first column of C contains actual data, the others being empty (zeroed). However, cuBLAS trsm
        //     has the capability of solving triangular linear systems with multiple right hand sides.
        double *h_C = (double *)calloc(Nrows * Nrows, sizeof(double));
        loadCPUrealtxt(h_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testVector.txt", Nrows);
    
        double *d_C;            gpuErrchk(cudaMalloc(&d_C, Nrows * Nrows * sizeof(double)));
        gpuErrchk(cudaMemcpy(d_C, h_C, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));
    
        /**********************************/
        /* COMPUTING THE QR DECOMPOSITION */
        /**********************************/
        timerGPU.StartCounter();
    
        // --- CUDA QR GEQRF preliminary operations
        double *d_TAU;      gpuErrchk(cudaMalloc((void**)&d_TAU, min(Nrows, Ncols) * sizeof(double)));
        cusolveSafeCall(cusolverDnDgeqrf_bufferSize(solver_handle, Nrows, Ncols, d_A, Nrows, &work_size));
        double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
    
        // --- CUDA GEQRF execution: The matrix R is overwritten in upper triangular part of A, including diagonal 
        //     elements. The matrix Q is not formed explicitly, instead, a sequence of householder vectors are
        //     stored in lower triangular part of A.
        cusolveSafeCall(cusolverDnDgeqrf(solver_handle, Nrows, Ncols, d_A, Nrows, d_TAU, work, work_size, devInfo));
        int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
        if (devInfo_h != 0) std::cout << "Unsuccessful gerf execution\n\n";
    
        timingQR = timerGPU.GetCounter();
        printf("Timing for QR calculation = %f [ms]\n", timingQR);
    
        /*****************************/
        /* SOLVING THE LINEAR SYSTEM */
        /*****************************/
        timerGPU.StartCounter();
    
        // --- CUDA ORMQR execution: Computes the multiplication Q^T * C and stores it in d_C
        cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_C, Nrows, work, work_size, devInfo));
    
        // --- Reducing the linear system size
        double *d_R; gpuErrchk(cudaMalloc(&d_R, Ncols * Ncols * sizeof(double)));
        double *d_B; gpuErrchk(cudaMalloc(&d_B, Ncols * sizeof(double)));
        dim3 Grid(iDivUp(Ncols, BLOCK_SIZE), iDivUp(Ncols, BLOCK_SIZE));
        dim3 Block(BLOCK_SIZE, BLOCK_SIZE);
        copy_kernel << > >(d_A, d_R, Nrows, Ncols);
        gpuErrchk(cudaMemcpy(d_B, d_C, Ncols * sizeof(double), cudaMemcpyDeviceToDevice));
    
        // --- Solving an upper triangular linear system - compute x = R \ Q^T * B
        const double alpha = 1.;
        cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
            CUBLAS_DIAG_NON_UNIT, Ncols, 1, &alpha, d_R, Ncols, d_B, Ncols));
    
        timingSolve = timerGPU.GetCounter();
        printf("Timing for solution of the linear system = %f [ms]\n", timingSolve);
        printf("Overall timing = %f [ms]\n", timingQR + timingSolve);
    
        /************************/
        /* CHECKING THE RESULTS */
        /************************/
        // --- The upper triangular part of A contains the elements of R. Showing this.
        saveGPUrealtxt(d_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_R.txt", Nrows * Ncols);
    
        // --- The first Nrows elements of d_C contain the result of Q^T * C
        saveGPUrealtxt(d_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_QTC.txt", Nrows);
    
        // --- Initializing the output Q matrix (Of course, this step could be done by a kernel function directly on the device)
        double *h_Q = (double *)malloc(Nrows * Nrows * sizeof(double));
        for (int j = 0; j < Nrows; j++)
            for (int i = 0; i < Nrows; i++)
                if (j == i) h_Q[j + i*Nrows] = 1.;
                else        h_Q[j + i*Nrows] = 0.;
    
        double *d_Q;            gpuErrchk(cudaMalloc(&d_Q, Nrows * Nrows * sizeof(double)));
        gpuErrchk(cudaMemcpy(d_Q, h_Q, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));
    
        // --- Calculation of the Q matrix
        cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_Q, Nrows, work, work_size, devInfo));
    
        // --- d_Q contains the elements of Q. Showing this.
        saveGPUrealtxt(d_Q, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_Q.txt", Nrows * Nrows);
    
        // --- At this point, d_C contains the elements of Q^T * C, where C is the data vector. Showing this.
        // --- According to the above, only the first column of d_C makes sense.
        //gpuErrchk(cudaMemcpy(h_C, d_C, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
        //printf("\n\n");
        //for (int j = 0; j < Nrows; j++)
        //  for (int i = 0; i < Nrows; i++)
        //      printf("C[%i, %i] = %f\n", j, i, h_C[j + i*Nrows]);
    
        // --- Check final result
        saveGPUrealtxt(d_B, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_B.txt", Ncols);
    
        cusolveSafeCall(cusolverDnDestroy(solver_handle));
    
        return 0;
    }
    

    The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.

    Data can be generated and results can be checked by the following Matlab code:

    clear all
    close all
    clc
    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % GENERATE RANDOM NON-SQUARE MATRIX WITH DESIRED CONDITION NUMBER %
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % --- Credit to https://math.stackexchange.com/questions/198515/can-we-generate-random-singular-matrices-with-desired-condition-number-using-mat
    Nrows = 500;                % --- Number of rows
    Ncols = 500;                % --- Number of columns
    % condNumber = 10 * sqrt(2);  % --- Desired condition number
    % A = randn(Nrows, Ncols);
    % [U, S, V] = svd(A);
    % S(S~=0) = linspace(condNumber, 1, min(Nrows, Ncols));
    % A = U * S * V';
    
    % --- Setting the problem solution
    x = ones(Ncols, 1);
    
    % y = A * x;
    % 
    % Asave = reshape(A, Nrows * Ncols, 1);
    % save testMatrix.txt Asave -ascii -double
    % save testVector.txt y -ascii -double
    
    load testMatrix.txt
    load testVector.txt
    A = reshape(testMatrix, Nrows, Ncols);
    y = testVector;
    
    [Q, R] = qr(A);
    
    xMatlab = R \ (Q.' * y);
    
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
    
    fprintf('Percentage rms of Q * R - A %f\n', 100 * sqrt(sum(sum(abs(Q * R - A).^2)) / sum(sum(abs(A).^2))));
    
    load d_R.txt
    d_R = reshape(d_R, Nrows, Ncols);
    
    d_R = d_R(1 : Ncols, :);
    R   = R(1 : Ncols, :);
    
    fprintf('Percentage rms of matrix R between Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(triu(R) - triu(d_R)).^2)) / sum(sum(abs(triu(d_R)).^2))));
    
    load d_QTC.txt
    fprintf('Percentage rms of Q^T * y - d_QTC %f\n', 100 * sqrt(sum(sum(abs(Q.' * y - d_QTC).^2)) / sum(sum(abs(d_QTC).^2))));
    
    load d_B.txt
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(d_B - x).^2)) / sum(sum(abs(x).^2))));
    

    Please, comment/uncomment rows as required.

    Timing

    Timings (in ms) (tests performed on a GTX960 card, cc. 5.2):

    Size         QR decomposition       Solving system       Overall
    100x100      0.89                   1.41                 2.30
    200x200      5.97                   3.23                 9.20
    500x500      17.08                  21.6                 38.7
    

提交回复
热议问题