Calculate Euclidean distance matrix using a big.matrix object

前端 未结 1 1782
你的背包
你的背包 2020-12-10 06:05

I have an object of class big.matrix in R with dimension 778844 x 2. The values are all integers (kilometres). My objective is to calc

相关标签:
1条回答
  • 2020-12-10 06:51

    Here is a way using RcppArmadillo. Much of this is very similar to the RcppGallery example. This will return a big.matrix with the associated pairwise (by row) Euclidean distances. I like to wrap my big.matrix functions in an R wrapper to create a cleaner syntax (i.e. avoid the @address and other initializations).

    Note - as we are using bigmemory (and are therefore concerned with RAM usage), this example returns an (N-1) x (N-1) matrix holding only the lower triangular elements. You could modify this, but this is what I threw together.

    euc_dist.cpp

    // To enable the functionality provided by Armadillo's various macros,
    // simply include them before you include the RcppArmadillo headers.
    #define ARMA_NO_DEBUG
    
    #include <RcppArmadillo.h>
    // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]]
    
    using namespace Rcpp;
    using namespace arma;
    
    // The following header file provides the definitions for the BigMatrix
    // object
    #include <bigmemory/BigMatrix.h>
    
    // C++11 plugin
    // [[Rcpp::plugins(cpp11)]]
    
    template <typename T>
    void BigArmaEuclidean(const Mat<T>& inBigMat, Mat<T> outBigMat) {
    
      int W = inBigMat.n_rows;
    
      for(int i = 0; i < W - 1; i++){
          for(int j=i+1; j < W; j++){
              outBigMat(j-1,i) = sqrt(sum(pow((inBigMat.row(i) - inBigMat.row(j)),2)));
          }
      }
    }
    
    // Entry point called from R: computes pairwise row distances of the
    // big.matrix behind `pInBigMat` and writes them into the big.matrix behind
    // `pOutBigMat`.
    //
    // pInBigMat  - external pointer to the input BigMatrix.
    // pOutBigMat - external pointer to the output BigMatrix; it must have the
    //              same element type as the input.
    //
    // Throws Rcpp::exception when the two matrices differ in type or when the
    // input type is not one of the handled codes (1, 2, 4, 8).
    //
    // NOTE(review): both buffers are wrapped as one contiguous block of
    // nrow * ncol elements; presumably this does not hold for sub-matrices or
    // separated-column big.matrix objects - confirm before using it on those.
    // [[Rcpp::export]]
    void BigArmaEuc(SEXP pInBigMat, SEXP pOutBigMat) {
      // Tell Rcpp that the objects we have been given are external pointers.
      XPtr<BigMatrix> xpMat(pInBigMat);
      XPtr<BigMatrix> xpOutMat(pOutBigMat);

      const int type = xpMat->matrix_type();

      // The output buffer is reinterpreted with the input's element type below,
      // so a mismatch would read/write it with the wrong element width.
      if (xpOutMat->matrix_type() != type) {
        throw Rcpp::exception("Input and output big.matrix objects must have the same type");
      }

      // Wrap both buffers in Armadillo matrices without copying
      // (copy_aux_mem = false) and dispatch on the element type.
      switch(type) {
          case 1:
            BigArmaEuclidean(
                arma::Mat<char>(static_cast<char *>(xpMat->matrix()), xpMat->nrow(), xpMat->ncol(), false),
                arma::Mat<char>(static_cast<char *>(xpOutMat->matrix()), xpOutMat->nrow(), xpOutMat->ncol(), false)
            );
            return;

          case 2:
            BigArmaEuclidean(
              arma::Mat<short>(static_cast<short *>(xpMat->matrix()), xpMat->nrow(), xpMat->ncol(), false),
              arma::Mat<short>(static_cast<short *>(xpOutMat->matrix()), xpOutMat->nrow(), xpOutMat->ncol(), false)
            );
            return;

          case 4:
            BigArmaEuclidean(
              arma::Mat<int>(static_cast<int *>(xpMat->matrix()), xpMat->nrow(), xpMat->ncol(), false),
              arma::Mat<int>(static_cast<int *>(xpOutMat->matrix()), xpOutMat->nrow(), xpOutMat->ncol(), false)
            );
            return;

          case 8:
            BigArmaEuclidean(
              arma::Mat<double>(static_cast<double *>(xpMat->matrix()), xpMat->nrow(), xpMat->ncol(), false),
              arma::Mat<double>(static_cast<double *>(xpOutMat->matrix()), xpOutMat->nrow(), xpOutMat->ncol(), false)
            );
            return;

          default:
            // Any type code not handled above ends up here.
            throw Rcpp::exception("Undefined type for provided big.matrix");
      }

    }
    

    My little wrapper

    # Pairwise (by row) Euclidean distances of a big.matrix.
    #
    # Allocates a zero-initialised square big.matrix with one row and one
    # column fewer than `bigMat` has rows, using the same type as `bigMat`,
    # hands both address slots to BigArmaEuc() to be filled in, and returns
    # the result.
    bigMatrixEuc <- function(bigMat){
        side <- nrow(bigMat) - 1
        distances <- big.matrix(nrow = side,
                                ncol = side,
                                init = 0,
                                type = typeof(bigMat))
        # BigArmaEuc() receives the address slots rather than the R objects.
        BigArmaEuc(bigMat@address, distances@address)
        distances
    }
    

    The test

    library(Rcpp)
    library(bigmemory)
    
    # Compile and load the C++ routine
    sourceCpp("euc_dist.cpp")
    
    # Small reproducible example: 4 observations with 4 columns each
    set.seed(123)
    mat <- matrix(rnorm(16), nrow = 4)
    bm <- as.big.matrix(mat)
    
    # Distances from the new function, pulled out as an ordinary R matrix
    bm_out <- bigMatrixEuc(bm)[]
    
    # Reference result from dist(): zero the diagonal and upper triangle, then
    # drop the first row and last column to get the same 3 x 3 layout
    n <- nrow(mat)
    distMat <- as.matrix(dist(mat))
    distMat[upper.tri(distMat, diag = TRUE)] <- 0
    distMat <- distMat[-1, -n]
    
    # Both results should agree
    all.equal(bm_out, distMat, check.attributes = FALSE)
    [1] TRUE
    
    0 讨论(0)
提交回复
热议问题