Is there a way to speed up the combn
command to get all unique combinations of 2 elements taken from a vector?
Usually this would be set up like this:
# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table", build_vignettes = FALSE)
library(data.table)
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
# Transform data
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})
However, combn
is 10 times slower (23sec versus 3 sec on my computer) than calculating all possible combinations using data.table.
system.time({
d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
})
Dealing with very large vectors, I am searching for a way to save memory by only calculating the unique combinations (like combn
), but with the speed of data.table (see second code snippet).
I appreciate any help.
You could use combnPrim
from gRbase
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})
# user system elapsed
# 27.322 0.585 27.674
system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
})
# user system elapsed
# 2.317 0.110 2.425
identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
Here's a way using data.table
function foverlaps()
, that also turns out to be fast!
require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)
system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
# 0.603 0.062 0.717
Note that foverlaps()
does not calculate all permutations. The subset xid != yid
is needed to remove self overlaps. The subset could be internally handled more efficiently by implementing ignoreSelf
argument - similar to IRanges::findOverlaps
.
Now it's just a matter of performing a subset using the ids obtained:
system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
# 0.576 0.047 0.662
So totally, ~1.4 seconds.
The advantage is that you can do the same way even if your data.table d
has more than 1 column on which you've to get the combinations for, and using the same amount of memory (since we return the indices). In that case, you'd just do:
cbind(d[olaps$xid, your_cols, with=FALSE], d[olaps$yid, your_cols, with=FALSE])
But it's limited to replacing just combn(., 2L)
. Not more than 2L.
Here is a solution using Rcpp.
library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector){
int len = inputVector.size();
int retLen = len * (len-1) / 2;
Rcpp::CharacterVector outputVector1(retLen);
Rcpp::CharacterVector outputVector2(retLen);
int start = 0;
for (int i = 0; i < len; ++i){
for (int j = i+1; j < len; ++j){
outputVector1(start) = inputVector(i);
outputVector2(start) = inputVector(j);
++start;
}
}
return(Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
Rcpp::Named("neighbor") = outputVector2));
};
')
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
system.time({
d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
})
# 1.908 0.397 2.389
system.time({
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)
olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
})
# 0.653 0.038 0.705
system.time(ans2 <- combi2(d$id))
# 1.377 0.108 1.495
Using the Rcpp function to get the indices and then form the data.table, works better.
cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector){
const int len = inputVector.size();
const int retLen = len * (len-1) / 2;
Rcpp::IntegerVector outputVector1(retLen);
Rcpp::IntegerVector outputVector2(retLen);
int indexSkip;
for (int i = 0; i < len; ++i){
indexSkip = len * i - ((i+1) * i)/2;
for (int j = 0; j < len-1-i; ++j){
outputVector1(indexSkip+j) = i+1;
outputVector2(indexSkip+j) = i+j+1+1;
}
}
return(Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
Rcpp::Named("yid") = outputVector2));
};
')
system.time({
indices <- combi2inds(d$id)
ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
})
# 0.389 0.027 0.425
A post with any variation of the word Fast in the title is incomplete without benchmarks. Before we post any benchmarks, I would just like to mention that since this question was posted, two highly optimized packages, arrangements
and RcppAlgos
(I am the author) for generating combinations have been released for R
.
To give you an idea of their speed over combn
and gRbase::combnPrim
, here is a basic benchmark:
microbenchmark(arrangements::combinations(20, 10),
combn(20, 10),
gRbase::combnPrim(20, 10),
RcppAlgos::comboGeneral(20, 10),
unit = "relative")
Unit: relative
expr min lq mean median uq max neval
arrangements::combinations(20, 10) 1.364092 1.244705 1.198256 1.265019 1.192174 3.658389 100
combn(20, 10) 82.672684 61.589411 52.670841 59.976063 58.584740 67.596315 100
gRbase::combnPrim(20, 10) 6.650843 5.290714 5.024889 5.303483 5.514129 4.540966 100
RcppAlgos::comboGeneral(20, 10) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
Now, we benchmark the other functions posted for the very specific case of producing combinations choose 2 and producing a data.table
object.
The functions are as follows:
funAkraf <- function(d) {
a <- comb2.int(length(d$id)) ## comb2.int from the answer given by @akraf
data.table(V1 = d$id[a[,1]], V2 = d$id[a[,2]])
}
funAnirban <- function(d) {
indices <- combi2inds(d$id)
ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
ans2
}
funArrangements <- function(d) {as.data.table(arrangements::combinations(x = d$id, k = 2))}
funArun <- function(d) {
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)
olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
ans
}
funGRbase <- function(d) {as.data.table(t(gRbase::combnPrim(d$id,2)))}
funOPCombn <- function(d) {as.data.table(t(combn(d$id, 2)))}
funRcppAlgos <- function(d) {as.data.table(RcppAlgos::comboGeneral(d$id, 2))}
And here are the benchmarks on the example given by the OP:
d <- data.table(id=as.character(paste0("A", 10001:15000)))
microbenchmark(funAkraf(d),
funAnirban(d),
funArrangements(d),
funArun(d),
funGRbase(d),
funOPCombn(d),
funRcppAlgos(d),
times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funAkraf(d) 2.961790 2.869365 2.612028 2.948955 2.215608 2.352351 10
funAnirban(d) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
funArrangements(d) 1.384152 1.427382 1.473522 1.854861 1.258471 1.233715 10
funArun(d) 2.785375 2.543434 2.353724 2.793377 1.883702 2.013235 10
funGRbase(d) 4.309175 3.909820 3.359260 3.921906 2.727707 2.465525 10
funOPCombn(d) 22.810793 21.722210 17.989826 21.492045 14.079908 12.933432 10
funRcppAlgos(d) 1.359991 1.551938 1.434623 1.727857 1.318949 1.176934 10
We see that the function provided by @AnirbanMukherjee is the fastest for this task, followed by RcppAlgos
/arrangements
(very close timings).
They all give the same result as well:
identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE
## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)
## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE
Thanks to @Frank for pointing out how to compare two data.tables
without going through the pains of creating new data.tables
and then arranging them:
fsetequal(funRcppAlgos(d), funGRbase(d))
[1] TRUE
Here are two base-R solutions if you don't want to use additional dependencies:
comb2.int
usesrep
and other sequence generating functions to generate the desired output.comb2.mat
creates a matrix, usesupper.tri()
to get the upper triangle andwhich(..., arr.ind = TRUE)
to obtain the column and row indices => all combinations.
Possibility 1: comb2.int
comb2.int <- function(n, rep = FALSE){
if(!rep){
# e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
x <- rep(1:n,(n:1)-1)
i <- seq_along(x)+1
o <- c(0,cumsum((n-2):1))
y <- i-o[x]
}else{
# e.g. n=3 => (1,2), (1,3), (2,3)
x <- rep(1:n,n:1)
i <- seq_along(x)
o <- c(0,cumsum(n:2))
y <- i-o[x]+x-1
}
return(cbind(x,y))
}
Possibility 2: comb2.mat
comb2.mat <- function(n, rep = FALSE){
# Use which(..., arr.ind = TRUE) to get coordinates.
m <- matrix(FALSE, nrow = n, ncol = n)
idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
return(idxs)
}
The functions give the same result as combn(.)
:
for(i in 2:8){
# --- comb2.int ------------------
stopifnot(comb2.int(i) == t(combn(i,2)))
# => Equal
# --- comb2.mat ------------------
m <- comb2.mat(i)
colnames(m) <- NULL # difference 1: colnames
m <- m[order(m[,1]),] # difference 2: output order
stopifnot(m == t(combn(i,2)))
# => Equal up to above differences
}
But I have other elements in my vector than sequencial integers!
Use the return values as indices:
v <- LETTERS[1:5]
c <- comb2.int(length(v))
cbind(v[c[,1]], v[c[,2]])
#> [,1] [,2]
#> [1,] "A" "B"
#> [2,] "A" "C"
#> [3,] "A" "D"
#> [4,] "A" "E"
#> [5,] "B" "C"
#> [6,] "B" "D"
#> [7,] "B" "E"
#> [8,] "C" "D"
#> [9,] "C" "E"
#> [10,] "D" "E"
Benchmark:
time(combn
) = ~5x time(comb2.mat
) = ~80x time(comb2.int
):
library(microbenchmark)
n <- 800
microbenchmark({
comb2.int(n)
},{
comb2.mat(n)
},{
t(combn(n, 2))
})
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> { comb2.int(n) } 4.394051 4.731737 6.350406 5.334463 7.22677 14.68808 100
#> { comb2.mat(n) } 20.131455 22.901534 31.648521 24.411782 26.95821 297.70684 100
#> { t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305 100
来源:https://stackoverflow.com/questions/26828301/faster-version-of-combn