As a relatively inexperienced user of the data.table package in R, I\'ve been trying to process one text column into a large number of indicator columns (dummy variables), w
UPDATE : VERSION 3
Found even faster way. This function is also highly memory efficient.
Primary reason previous function was slow because of copy/assignments happening inside lapply loop as well as rbinding of the result.
In following version, we preallocate matrix with appropriate size, and then change values at appropriate coordinates, which makes it very fast compared to other looping versions.
funcGT3 <- function() {
#Get list of column names in result
resCol <- unique(dt[, unlist(strsplit(messy_string, split="\\$"))])
#Get dimension of result
nresCol <- length(resCol)
nresRow <- nrow(dt)
#Create empty matrix with dimensions same as desired result
mat <- matrix(rep(0, nresRow * nresCol), nrow = nresRow, dimnames = list(as.character(1:nresRow), resCol))
#split each messy_string by $
ll <- strsplit(dt[,messy_string], split="\\$")
#Get coordinates of mat which we need to set to 1
coords <- do.call(rbind, lapply(1:length(ll), function(i) cbind(rep(i, length(ll[[i]])), ll[[i]] )))
#Set mat to 1 at appropriate coordinates
mat[coords] <- 1
#Bind the mat to original data.table
return(cbind(dt, mat))
}
result <- funcGT3() #result for 1000 rows in dt
result
ID messy_string zn tc sv db yx st ze qs wq oe cv ut is kh kk im le qg rq po wd kc un ft ye if zl zt wy et rg iu
1: 1 zn$tc$sv$db$yx 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2: 2 st$ze$qs$wq 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3: 3 oe$cv$ut$is 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4: 4 kh$kk$im$le$qg 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5: 5 rq$po$wd$kc 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0
---
996: 996 rp$cr$tb$sa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
997: 997 cz$wy$rj$he 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
998: 998 cl$rr$bm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
999: 999 sx$hq$zy$zd 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1000: 1000 bw$cw$pw$rq 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
Benchmark againt version 2 suggested by Ricardo (this is for 250K rows in data) :
Unit: seconds
expr min lq median uq max neval
GT2 104.68672 104.68672 104.68672 104.68672 104.68672 1
GT3 15.15321 15.15321 15.15321 15.15321 15.15321 1
VERSION 1 Following is version 1 of suggested answer
set.seed(10)
elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))
random_string <- function(min_length, max_length, separator) {
selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)
return(selection)
}
dt <- data.table(ID = c(1:1000), messy_string = "")
dt[ , messy_string := random_string(2, 5, "$"), by = ID]
myFunc <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")
COLS <- do.call(rbind,
lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]]))
)
}
)
)
res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
dt <- cbind(dt, res)
for (j in names(dt))
set(dt,which(is.na(dt[[j]])),j,0)
return(dt)
}
create_indicators <- function(search_list, searched_string) {
y <- rep(0, length(search_list))
for(j in 1:length(search_list)) {
x <- regexpr(search_list[j], searched_string)
x <- x[1]
y[j] <- ifelse(x > 0, 1, 0)
}
return(y)
}
OPFunc <- function() {
indicators <- matrix(0, nrow = nrow(dt), ncol = length(elements_list))
for(n in 1:nrow(dt)) {
indicators[n, ] <- dt[n, create_indicators(elements_list, messy_string)]
}
indicators <- data.table(indicators)
setnames(indicators, elements_list)
dt <- cbind(dt, indicators)
return(dt)
}
library(plyr)
plyrFunc <- function() {
indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i)
dt[i,
data.frame(t(as.matrix(table(strsplit(messy_string,
split = "\\$")))))
]))
dt = cbind(dt, indicators)
#dt[is.na(dt)] = 0 #THIS DOESN'T WORK. USING FOLLOWING INSTEAD
for (j in names(dt))
set(dt,which(is.na(dt[[j]])),j,0)
return(dt)
}
BENCHMARK
system.time(res <- myFunc())
## user system elapsed
## 1.01 0.00 1.01
system.time(res2 <- OPFunc())
## user system elapsed
## 21.58 0.00 21.61
system.time(res3 <- plyrFunc())
## user system elapsed
## 1.81 0.00 1.81
VERSION 2 : Suggested by Ricardo
I'm posting this here instead of in my answer as the framework is really @GeekTrader's -Rick_
myFunc.modified <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")
## MODIFICATIONS:
# using `rbindlist` instead of `do.call(rbind.. )`
COLS <- rbindlist( lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]])),
# MODICIATION: Not coercing to factors
stringsAsFactors = FALSE
)
}
)
)
# MODIFICATION: Preserve as matrix, the output of tapply
res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )
# FLATTEN into a data.table
resdt <- data.table(r=c(res2))
# FIND & REPLACE NA's of single column
resdt[is.na(r), r:=0L]
# cbind with dt, a matrix, with the same attributes as `res2`
cbind(dt,
matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
}
### Benchmarks:
orig = quote({dt <- copy(masterDT); myFunc()})
modified = quote({dt <- copy(masterDT); myFunc.modified()})
microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)
# Unit: milliseconds
# expr min lq median uq max
# 1 Modified 895.025 971.0117 1011.216 1189.599 2476.972
# 2 Orig 1953.638 2009.1838 2106.412 2230.326 2356.802