I have a big (12 million rows) data.table
which looks like this:
library(data.table)
set.seed(123)
# small stand-in with the same structure as the real 12-million-row table (assumed):
dt <- data.table(id = rep(1:5, each = 5),
                 y  = sample(letters[1:5], 25, replace = TRUE))
All three approaches below spread y into 0/1 indicator columns, one row per id. For small data sets the table() approach seems to be the most efficient, but on large data sets dcast() seems to be the most efficient and convenient option.
TableFunction <- function() {
    # cross-tabulate id vs y, clamp counts above 1 so repeated (id, y) pairs
    # still end up as a 0/1 indicator, then put the ids back as a column
    df <- as.data.frame.matrix(table(dt$id, dt$y))
    df[df > 1] <- 1
    df <- cbind(id = as.numeric(row.names(df)), df)
    setDT(df)
}
AnsFunction <- function() {
    # one row per id, a 0L column for every value of y, then flag observed
    # (id, y) pairs by reference with set(); i = id assumes the ids are 1..N,
    # i.e. that they match row numbers in ans
    ans <- data.table(id = unique(dt$id))[, unique(dt$y) := 0L][]
    dt[, {set(ans, i = id, j = unique(y), value = 1L); NULL}, by = id]
    ans
}
dcastFunction <- function() {
    # fun.aggregate returns 1L for every observed (id, y) combination and
    # fill = 0L covers the missing ones, so the result is already 0/1
    df <- dcast.data.table(dt, id ~ y, fun.aggregate = function(x) 1L,
                           fill = 0L, value.var = "y")
}
library(data.table)
library(microbenchmark)
set.seed(123)
N = 10000
dt <- data.table(id = rep(1:N, each = 5), y = sample(letters[1:5], N*5, replace = TRUE))
microbenchmark(
    "dcast" = dcastFunction(),
    "Table" = TableFunction(),
    "Ans"   = AnsFunction()
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dcast 42.48367 45.39793 47.56898 46.83755 49.33388 60.72327 100 b
Table 28.32704 28.74579 29.14043 29.00010 29.23320 35.16723 100 a
Ans 120.80609 123.95895 127.35880 126.85018 130.12491 156.53289 100 c
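All three functions return the same 0/1 table. A quick sanity check (a sketch: test1, test2 and test3 are assumed here to hold the dcast, table() and set()-based results, in that order):

test1 <- dcastFunction()
test2 <- TableFunction()
test3 <- AnsFunction()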
> all(test1 == test2)
[1] TRUE
> all(test1 == test3)
[1] TRUE
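Note that == compares element by element, so the check above only works because rows and columns happen to line up. A stricter, order-insensitive comparison (a sketch; norm() is a helper introduced here, not part of the code above):

norm <- function(x) {
    # put columns in a fixed order (id first, then sorted y values), order rows by id,
    # and coerce to an integer matrix so table()'s numeric counts match dcast()'s integers
    m <- as.matrix(x[order(id), c("id", sort(setdiff(names(x), "id"))), with = FALSE])
    storage.mode(m) <- "integer"
    m
}
all(norm(test1) == norm(test2))
all(norm(test1) == norm(test3))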
The picture flips on a larger data set (10 million rows, ~100,000 ids, ten distinct 20-character y values):

y  = apply(matrix(sample(letters, 10L*20L, TRUE), ncol = 20L), 1L, paste, collapse = "")
dt = data.table(id = sample(1e5, 1e7, TRUE), y = sample(y, 1e7, TRUE))
microbenchmark(
    "dcast" = dcastFunction(),
    "Table" = TableFunction(),
    "Ans"   = AnsFunction()
)
Unit: seconds
expr min lq mean median uq max neval cld
dcast 1.985969 2.064964 2.189764 2.216138 2.266959 2.643231 100 a
Table 5.022388 5.403263 5.605012 5.580228 5.830414 6.318729 100 c
Ans 2.234636 2.414224 2.586727 2.599156 2.645717 2.982311 100 b