Rcpp and CULA: segmentation fault

问题

I extracted the relevant bits from the gputools R-package to run a QR decomposition on my GPU using Rcpp by dynamically loading a shared library that links to culatools. Everything runs smoothly in the terminal and R.app on my Mac. The results agree with R's qr() function, but the problem is that a segmentation fault occurs on exiting R.app (the error does not occur when using the terminal):

*** caught segfault ***
address 0x10911b050, cause 'memory not mapped'

I think I narrowed down the problem to the pointers 'a' and 'tau' in the .c file that links to culatools:

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

I compiled the .c file on my Mac using:

/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -c -I. -I/usr/local/cula/include -m64 -Xcompiler -fPIC gpuQR.c -o gpuQR.o
/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -shared -m64 -Xlinker -rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_core -lcula_lapack -lcublas -o gpuQR.so gpuQR.o

I wrote a .cpp file that uses Rcpp and dynamically loads the shared library gpuQR.so:

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    dlclose(lib_handle);

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { y[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

I compiled the .cpp file in R using:

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

and ran an example:

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)
A <- matrix(rnorm(9), 3, 3)
n_row <- nrow(A)
n_col <- ncol(A)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

Does anybody have an idea how to fix the segmentation fault?

回答1:

The problem is solved by deleting

dlclose(lib_handle);

from .cpp file. This yields the following:

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

The .cpp file can be compiled in R using:

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

The actual .c file linking to culatools is:

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

It can be compiled using:

gcc -c -I/usr/local/cula/include gpuQR.c
gcc -shared -Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack -o gpuQR.so gpuQR.o

The QR decomposition can then be performed in R using:

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)

n_row <- 3
n_col <- 3
A <- matrix(rnorm(n_row * n_col), n_row, n_col)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

Here are the results from a benchmark using a NVIDIA GeForce 9400M GPU with 16 CUDA cores:

n_row <- 1000; n_col <- 1000
A <- matrix(rnorm(n_row * n_col), n_row, n_col)
B <- A; dim(B) <- NULL

res <- benchmark(.Call('gpuQR_Rcpp', B, n_row, n_col), qr(A), columns = c('test', 'replications', 'elapsed', 'relative'), order = 'relative')

                                  test replications elapsed relative
1 .Call("gpuQR_Rcpp", B, n_row, n_col)          100  38.037    1.000
2                                qr(A)          100 152.575    4.011

回答2:

There is actually no need to dynamically load a shared library linking to culatools. I thought about this initially, but I did not get the .cpp file using Rcpp compiled. Anyway, the new .cpp file is:

#include<Rcpp.h>
#include<cula.h>

using namespace Rcpp;
using namespace std;

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);  
    vector<float> scale(n_cols);

    culaInitialize();   
    culaSgeqrf(n_rows, n_cols, &(x[0]), n_rows, &(scale[0]));
    culaShutdown();

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

The .cpp file is compiled using:

library(Rcpp)  
PKG_LIBS <- sprintf('-Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack %s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('-I/usr/local/cula/include %s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_inc.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

where I set the appropriate path to culatools. The whole thing does not run faster, but there is no need any longer to compile the shared library linking to culatools and dynamically loading it.

I think that this is a nice alternative to the gputools R-package to extend R with C++ and perform linear algebra operations on the GPU.

来源：https://stackoverflow.com/questions/17925733/rcpp-and-cula-segmentation-fault

标签

cuda

rcpp

cula