I have a data frame with 3 columns: custId, saleDate, DelivDateTime.
> head(events22)
custId saleDate DelivDate
1 280356593 2012-11-1
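(The head() output above is truncated and the real data isn't posted, so here is a small made-up events22 with the same three columns, just so the snippets below can be run; the IDs and dates are invented and only the shape matches the question.)
# Toy stand-in for events22: invented values, 20 rows, 4 customers
set.seed(1)
events22 <- data.frame(
  custId    = rep(c(280356593, 280367076, 280380386, 280380466), each = 5),
  saleDate  = as.Date("2012-11-01") + sample(0:364, 20),
  DelivDate = as.Date("2012-11-05") + sample(0:364, 20)
)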
Between ddply and aggregate, I suppose aggregate would be the faster of the two, especially on huge data like yours. However, the fastest of all would be data.table.
require(data.table)
dt <- data.table(events22)
dt[, .SD[which.max(saleDate),], by=custId]
From ?data.table: .SD is a data.table containing the subset of x's data for each group, excluding the group column(s).
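If that description feels abstract, one quick way to see it is to print .SD for each group (using the toy events22 sketched above, or any small sample): the custId column is gone and only saleDate and DelivDate remain.
# Peek at what .SD holds for each custId group (custId itself is excluded)
dt[, print(.SD), by = custId]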
I, too, would recommend data.table here, but since you asked for an aggregate solution, here is one which combines aggregate and merge to get all the columns:
merge(events22, aggregate(saleDate ~ custId, events22, max))
Or just aggregate if you only want the "custId" and "DelivDate" columns:
aggregate(list(DelivDate = events22$saleDate),
list(custId = events22$custId),
function(x) events22[["DelivDate"]][which.max(x)])
Finally, here's an option using sqldf:
library(sqldf)
sqldf("select custId, DelivDate, max(saleDate) `saleDate`
from events22 group by custId")
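That query leans on SQLite (sqldf's default backend) filling the bare DelivDate column from the row where max(saleDate) is reached. If you'd rather spell that out, a rough, more portable formulation (ignoring ties) would join each custId's maximum saleDate back to the table:
# Same idea, written as an explicit join against the per-customer max
sqldf("select e.custId, e.DelivDate, e.saleDate
       from events22 e
       join (select custId, max(saleDate) saleDate
             from events22 group by custId) m
       on e.custId = m.custId and e.saleDate = m.saleDate")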
I'm not a benchmarking or data.table expert, but it surprised me that data.table is not faster here. My suspicion is that the results would be quite different on a larger dataset, say your 400k-line one. Anyway, here's some benchmarking code modeled after @mnel's answer here, so you can run some tests on your actual dataset for future reference.
library(rbenchmark)
library(plyr)   # needed for ddply() below
First, set up your functions for what you want to benchmark.
DDPLY <- function() {
  x <- ddply(events22, .(custId), .inform = T,
             function(x) {
               x[x$saleDate == max(x$saleDate), "DelivDate"]
             })
}
DATATABLE <- function() { x <- dt[, .SD[which.max(saleDate), ], by = custId] }
AGG1 <- function() {
  x <- merge(events22, aggregate(saleDate ~ custId, events22, max))
}
AGG2 <- function() {
  x <- aggregate(list(DelivDate = events22$saleDate),
                 list(custId = events22$custId),
                 function(x) events22[["DelivDate"]][which.max(x)])
}
SQLDF <- function() {
  x <- sqldf("select custId, DelivDate, max(saleDate) `saleDate`
              from events22 group by custId")
}
DOCALL <- function() {
  do.call(rbind,
          lapply(split(events22, events22$custId), function(x) {
            x[which.max(x$saleDate), ]
          }))
}
Second, do the benchmarking.
benchmark(DDPLY(), DATATABLE(), AGG1(), AGG2(), SQLDF(), DOCALL(),
order = "elapsed")[1:5]
# test replications elapsed relative user.self
# 4 AGG2() 100 0.285 1.000 0.284
# 3 AGG1() 100 0.891 3.126 0.896
# 6 DOCALL() 100 1.202 4.218 1.204
# 2 DATATABLE() 100 1.251 4.389 1.248
# 1 DDPLY() 100 1.254 4.400 1.252
# 5 SQLDF() 100 2.109 7.400 2.108
This should be pretty fast, but data.table is likely faster:
do.call(rbind,
        lapply(split(events22, events22$custId), function(x) {
          x[which.max(x$saleDate), ]
        }))
Here's a much faster data.table function:
DATATABLE <- function() {
  # Keying sorts by custId then saleDate, so within each custId group
  # the last row (1:.N == .N) is the one with the latest saleDate
  dt <- data.table(events, key = c('custId', 'saleDate'))
  dt[, maxrow := 1:.N == .N, by = custId]
  return(dt[maxrow == TRUE, list(custId, DelivDate)])
}
Note that this function creates a data.table and sorts the data, which is a step you'd only need to perform once. If you remove this step (perhaps you have a multi-step data-processing pipeline and create the data.table once, as a first step), the function is more than twice as fast.
I also modified all the previous functions to return the result, for easier comparison:
DDPLY <- function() {
  return(ddply(events, .(custId), .inform = T,
               function(x) {
                 x[x$saleDate == max(x$saleDate), "DelivDate"]
               }))
}
AGG1 <- function() {
  return(merge(events, aggregate(saleDate ~ custId, events, max)))
}
SQLDF <- function() {
  return(sqldf("select custId, DelivDate, max(saleDate) `saleDate`
                from events group by custId"))
}
DOCALL <- function() {
  return(do.call(rbind,
                 lapply(split(events, events$custId), function(x) {
                   x[which.max(x$saleDate), ]
                 })))
}
Here are the results for 10k rows, repeated 10 times:
library(rbenchmark)
library(plyr)
library(data.table)
library(sqldf)
# ~10k rows: the 20-row sample stacked 500 times, with fresh custId values
events <- do.call(rbind, lapply(1:500, function(x) events22))
events$custId <- sample(1:nrow(events), nrow(events))
benchmark(a <- DDPLY(), b <- DATATABLE(), c <- AGG1(), d <- SQLDF(),
e <- DOCALL(), order = "elapsed", replications=10)[1:5]
test replications elapsed relative user.self
2 b <- DATATABLE() 10 0.13 1.000 0.13
4 d <- SQLDF() 10 0.42 3.231 0.41
3 c <- AGG1() 10 12.11 93.154 12.03
1 a <- DDPLY() 10 32.17 247.462 32.01
5 e <- DOCALL() 10 56.05 431.154 55.85
Since all the functions return their results, we can verify they all return the same answer:
c <- c[order(c$custId),]
dim(a); dim(b); dim(c); dim(d); dim(e)
all(a$V1==b$DelivDate)
all(a$V1==c$DelivDate)
all(a$V1==d$DelivDate)
all(a$V1==e$DelivDate)
/Edit: On the smaller, 20-row dataset, data.table is still the fastest, but by a thinner margin:
test replications elapsed relative user.self
2 b <- DATATABLE() 100 0.22 1.000 0.22
3 c <- AGG1() 100 0.42 1.909 0.42
5 e <- DOCALL() 100 0.48 2.182 0.49
1 a <- DDPLY() 100 0.55 2.500 0.55
4 d <- SQLDF() 100 1.00 4.545 0.98
/Edit2: If we remove the data.table creation from the function, we get the following results:
# Create and key the data.table once, outside the benchmarked function
dt <- data.table(events, key = c('custId', 'saleDate'))
DATATABLE2 <- function() {
  dt[, maxrow := 1:.N == .N, by = custId]
  return(dt[maxrow == TRUE, list(custId, DelivDate)])
}
benchmark(a <- DDPLY(), b <- DATATABLE2(), c <- AGG1(), d <- SQLDF(),
e <- DOCALL(), order = "elapsed", replications=10)[1:5]
test replications elapsed relative user.self
2 b <- DATATABLE() 10 0.09 1.000 0.08
4 d <- SQLDF() 10 0.41 4.556 0.39
3 c <- AGG1() 10 11.73 130.333 11.67
1 a <- DDPLY() 10 31.59 351.000 31.50
5 e <- DOCALL() 10 55.05 611.667 54.91