问题
Ok I have a complex function built using data.frames and in trying to speed it up I've turned to data.table. I'm totally new to this so I'm quite befuddled. Anyhow I've made a much much simpler toy example of what I want to do, but I cannot work out how to translate it into data.table format. Here is the example in data.frame form:
rows <- 10
data1 <- data.frame( id =1:rows,
a = seq(0.2, 0.55, length.out = rows),
b = seq(0.35, 0.7, length.out = rows),
c = seq(0.4, 0.83, length.out = rows),
d = seq(0.6, 0.87, length.out = rows),
e = seq(0.7, 0.99, length.out = rows),
f = seq(0.52, 0.90, length.out = rows)
)
DT1 <- data.table(data1) #for later
data2 <- data.frame( id =3:1,
a = rep(3, 3),
d = rep(2, 3),
f = rep(1, 3)
)
m.names <- c("a", "d", "f")
data1[match(data2$id, data1$id),m.names] <- data1[match(data2$id, data1$id),m.names] + data2[match(data2$id, data1$id),m.names]
So note in the last step that I want perform addition between the pre-existing figures and the new data and its vectorised across several columns.
In a data.table format I've only gotten this far:
DT1[id %in% data2$id, m.names, with=FALSE]
This selects the values I want to add to but after that I am lost. I would appreciate any help !
EDIT:
Ok I've figure out part of it - I can use the last line of code above to achieve the vectorised addition part using using data2 to store the added values as follows:
data2[,m.names] <- data2[,m.names] + data.frame(DT1[id %in% data2$id, m.names, with=FALSE])
Even with 2.5million rows (in DT1) and 10,000 rows in data2 and 6 matching columns this only takes 0.004sec, but I still need to assign the new data2 to the appropriate dynamically assigned columns in data 1
回答1:
Here's another way, using devel version data.table v1.9.5:
require(data.table) ## v1.9.5+
setDT(data1) ## data1 is now a data.table
cols1 = c("a", "d", "f")
cols2 = paste0("i.", cols1)
setkey(data1, id) ## setkey and prepare for join
data1[data2, (cols1) := mapply(`+`, mget(cols1), mget(cols2), SIMPLIFY=FALSE)]
# id a b c d e f
# 1: 1 3.2000000 0.3500000 0.4000000 2.60 0.7000000 1.5200000
# 2: 2 3.2388889 0.3888889 0.4477778 2.63 0.7322222 1.5622222
# 3: 3 3.2777778 0.4277778 0.4955556 2.66 0.7644444 1.6044444
# 4: 4 0.3166667 0.4666667 0.5433333 0.69 0.7966667 0.6466667
# 5: 5 0.3555556 0.5055556 0.5911111 0.72 0.8288889 0.6888889
# 6: 6 0.3944444 0.5444444 0.6388889 0.75 0.8611111 0.7311111
# 7: 7 0.4333333 0.5833333 0.6866667 0.78 0.8933333 0.7733333
# 8: 8 0.4722222 0.6222222 0.7344444 0.81 0.9255556 0.8155556
# 9: 9 0.5111111 0.6611111 0.7822222 0.84 0.9577778 0.8577778
# 10: 10 0.5500000 0.7000000 0.8300000 0.87 0.9900000 0.9000000
The join of the form x[i]
is performed on the key column id
. For each row of data2's id
column, the corresponding matching rows in data1 is found. For example, for id = 2
from data2, the matching row is the 2nd row in data1.
Once we've all the matching rows, we evaluate the expression in j
, which updates the data1 columns provided in col1
by adding the values from mget(cols1)
and mget(cols2)
.
cols2
was generated with a i.
prefix, which fetches the values from the data.table i -- here data2.
HTH
回答2:
One way would be to use set
in a for
loop as this involves multiple columns. Convert the second dataset to 'data.table' (DT2), set the key with the 'id' column, and join with 'data1'. Create column index vectors for the 'm.names' column in the dataset ('indx1') and i.
columns created after the join ('indx2'). Using a for
loop, set
the NA
elements in 'm.names' columns to '0', and then sum the corresponding columns based on the 'indx1' and 'indx2'.
DT2 <- as.data.table(data2)
DTNew <- setkey(DT2, id)[data1]
indx1 <- match(m.names, names(DTNew))
indx2 <- grep('i\\.', names(DTNew))
for(k in seq_along(indx1)){
set(DTNew, i=which(is.na(DTNew[[indx1[k]]])), j= indx1[k], value=0)
set(DTNew, i=NULL, j= indx2[k], value = DTNew[[indx1[k]]]+
DTNew[[indx2[k]]])
}
res <- DTNew[,2:4 := NULL]
setnames(res, names(data1))
Checking with the modified 'data1'
data1[match(data2$id, data1$id),m.names] <- data1[match(data2$id,
data1$id),m.names] + data2[match(data2$id, data1$id),m.names]
all.equal(setDF(res), data1)
#[1] TRUE
Benchmarks
On a 1e6 dataset,
set.seed(24)
data1 <- cbind(id=1:1e6,as.data.frame(matrix(rnorm(1e6*10), ncol=10,
dimnames=list(NULL, letters[1:10])) ))
set.seed(46)
data2 <- data.frame(id= sample(1:1000, 100, replace=FALSE),
a= rnorm(100), d=rnorm(100), f= rnorm(100))
m.names <- c("a", "d", "f")
DT2 <- as.data.table(data2)
system.time({
DTNew <- setkey(DT2, id)[data1]
indx1 <- match(m.names, names(DTNew))
indx2 <- grep('i\\.', names(DTNew))
for(k in seq_along(indx1)){
set(DTNew, i=which(is.na(DTNew[[indx1[k]]])), j= indx1[k], value=0)
set(DTNew, i=NULL, j= indx2[k], value = DTNew[[indx1[k]]]+
DTNew[[indx2[k]]])
}
res <- DTNew[,2:4 := NULL]
setnames(res, names(data1))
})
# user system elapsed
# 0.082 0.005 0.086
回答3:
Ok thanks to @David Arenburg for his suggestion. I've modified it slightly to arrive at the following for my preferred solution
text <- NULL
for(i in 1:length(m.names)){
text <- paste0(text, m.names[i], " = ", m.names[i], " + i.", m.names[i], ", ")
}
expr <- parse(text = paste0("\":=\"(", substr(text, 1, nchar(text)-2), ")" ))
res2 <- DT1[data2, eval(expr)]
来源:https://stackoverflow.com/questions/29321218/complex-data-table-subset-and-vectorised-maniulation