Rcpp and CULA: segmentation fault

浪子不回头ぞ 提交于 2019-12-07 23:58:08

问题


I extracted the relevant bits from the gputools R-package to run a QR decomposition on my GPU using Rcpp by dynamically loading a shared library that links to culatools. Everything runs smoothly in the terminal and R.app on my Mac. The results agree with R's qr() function, but the problem is that a segmentation fault occurs on exiting R.app (the error does not occur when using the terminal):

*** caught segfault ***
address 0x10911b050, cause 'memory not mapped'

I think I narrowed down the problem to the pointers 'a' and 'tau' in the .c file that links to culatools:

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

I compiled the .c file on my Mac using:

/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -c -I. -I/usr/local/cula/include -m64 -Xcompiler -fPIC gpuQR.c -o gpuQR.o
/usr/local/cuda/bin/nvcc -gencode arch=compute_10,code=sm_10 -gencode arch=compute_11,code=sm_11 -gencode arch=compute_12,code=sm_12 -gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -shared -m64 -Xlinker -rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_core -lcula_lapack -lcublas -o gpuQR.so gpuQR.o

I wrote a .cpp file that uses Rcpp and dynamically loads the shared library gpuQR.so:

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    dlclose(lib_handle);

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { y[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

I compiled the .cpp file in R using:

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

and ran an example:

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)
A <- matrix(rnorm(9), 3, 3)
n_row <- nrow(A)
n_col <- ncol(A)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

Does anybody have an idea how to fix the segmentation fault?


回答1:


The problem is solved by deleting

dlclose(lib_handle);

from .cpp file. This yields the following:

#include <Rcpp.h>
#include <dlfcn.h>

using namespace Rcpp;
using namespace std;

typedef void (*func)(int*, int*, float*, int*, float*);

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);
    vector<float> scale(n_cols);

    void* lib_handle = dlopen("path/gpuQR.so", RTLD_LAZY);
    if (!lib_handle) 
    { 
        Rcout << dlerror() << endl; 
    } else {
        func gpuQR = (func) dlsym(lib_handle, "gpuQR");  
        gpuQR(&n_rows, &n_cols, &(x[0]), &n_rows, &(scale[0]));
    }

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

The .cpp file can be compiled in R using:

library(Rcpp)  
PKG_LIBS <- sprintf('%s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('%s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_Rcpp.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

The actual .c file linking to culatools is:

#include<cula.h>

void gpuQR(const int *m, const int *n, float *a, const int *lda, float *tau)
{
    culaInitialize();   
    culaSgeqrf(m[0], n[0], a, lda[0], tau);
    culaShutdown();
}

It can be compiled using:

gcc -c -I/usr/local/cula/include gpuQR.c
gcc -shared -Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack -o gpuQR.so gpuQR.o

The QR decomposition can then be performed in R using:

dyn.load('path/gpuQR_Rcpp.so')

set.seed(100)

n_row <- 3
n_col <- 3
A <- matrix(rnorm(n_row * n_col), n_row, n_col)

res <- .Call('gpuQR_Rcpp', c(A), n_row, n_col)
matrix(res, n_row, n_col)

           [,1]       [,2]       [,3]
[1,]  0.5250958 -0.8666927  0.8594266
[2,] -0.2504899 -0.3878644 -0.1277837
[3,]  0.1502908  0.4742033 -0.8804248

qr(A)$qr

          [,1]       [,2]       [,3]
[1,]  0.5250957 -0.8666925  0.8594266
[2,] -0.2504899 -0.3878643 -0.1277838
[3,]  0.1502909  0.4742033 -0.8804247

Here are the results from a benchmark using a NVIDIA GeForce 9400M GPU with 16 CUDA cores:

n_row <- 1000; n_col <- 1000
A <- matrix(rnorm(n_row * n_col), n_row, n_col)
B <- A; dim(B) <- NULL

res <- benchmark(.Call('gpuQR_Rcpp', B, n_row, n_col), qr(A), columns = c('test', 'replications', 'elapsed', 'relative'), order = 'relative')

                                  test replications elapsed relative
1 .Call("gpuQR_Rcpp", B, n_row, n_col)          100  38.037    1.000
2                                qr(A)          100 152.575    4.011



回答2:


There is actually no need to dynamically load a shared library linking to culatools. I thought about this initially, but I did not get the .cpp file using Rcpp compiled. Anyway, the new .cpp file is:

#include<Rcpp.h>
#include<cula.h>

using namespace Rcpp;
using namespace std;

RcppExport SEXP gpuQR_Rcpp(SEXP x_, SEXP n_rows_, SEXP n_cols_)
{       
    vector<float> x = as<vector<float> >(x_);
    int n_rows = as<int>(n_rows_);
    int n_cols = as<int>(n_cols_);  
    vector<float> scale(n_cols);

    culaInitialize();   
    culaSgeqrf(n_rows, n_cols, &(x[0]), n_rows, &(scale[0]));
    culaShutdown();

    for(int ii = 1; ii < n_rows; ii++)
    {
        for(int jj = 0; jj < n_cols; jj++)
        {
            if(ii > jj) { x[ii + jj * n_rows] *= scale[jj]; }
        }
    }

    return wrap(x);
}

The .cpp file is compiled using:

library(Rcpp)  
PKG_LIBS <- sprintf('-Wl,-rpath,/usr/local/cula/lib64 -L/usr/local/cula/lib64 -lcula_lapack %s $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)', Rcpp:::RcppLdFlags()) 
PKG_CPPFLAGS <- sprintf('-I/usr/local/cula/include %s', Rcpp:::RcppCxxFlags())  
Sys.setenv(PKG_LIBS = PKG_LIBS , PKG_CPPFLAGS = PKG_CPPFLAGS) 
R <- file.path(R.home(component = 'bin'), 'R') 
file <- 'path/gpuQR_inc.cpp'
cmd <- sprintf('%s CMD SHLIB %s', R, paste(file, collapse = ' '))
system(cmd)

where I set the appropriate path to culatools. The whole thing does not run faster, but there is no need any longer to compile the shared library linking to culatools and dynamically loading it.

I think that this is a nice alternative to the gputools R-package to extend R with C++ and perform linear algebra operations on the GPU.



来源:https://stackoverflow.com/questions/17925733/rcpp-and-cula-segmentation-fault

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!