Say I have data that looks like
date, user, items_bought, event_number
2013-01-01, x, 2, 1
2013-01-02, x, 1, 2
2013-01-03, x, 0, 3
2013-01-04, x, 0, 4
2013-0
I like James' answer better, but here's an alternative:
with(data,{
sapply(split(data,user),function(x){
sapply(x$date,function(y) sum(x$items_bought[x$date %in% c(y,y-1,y-2)]))
})
})
The following looks valid:
unlist(lapply(split(data, data$user),
function(x) {
ave(x$items_bought,
cumsum(c(0, diff(x$date)) >= 3), FUN = cumsum)
}))
#x1 x2 x3 x4 y1 y2 y3 y4
# 2 3 3 4 1 6 6 7
Where data
:
data = structure(list(date = structure(c(15706, 15707, 15710, 15711,
15706, 15707, 15710, 15711), class = "Date"), user = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"),
items_bought = c(2L, 1L, 3L, 1L, 1L, 5L, 6L, 1L)), .Names = c("date",
"user", "items_bought"), row.names = c(NA, -8L), class = "data.frame")
It seems like packages xts
and zoo
contain functions that do what you want, although you may have the same problems with the size of your actual dataset as with @alexis_laz answer. Using the functions from the xts
answer to this question seem to do the trick.
First I took the code from the answer I link to above and made sure it worked for just one user
. I include the apply.daily
function because I believe from your edits/comments that you have multiple observations for some days for some users - I added an extra line to the toy dataset to reflect this.
# Make dataset with two observations for one date for "y" user
dat <- structure(list(
date = structure(c(15706, 15707, 15708, 15709, 15710, 15711,
15706, 15707, 15708, 15709, 15710, 15711, 15711), class = "Date"),
user = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y", "y"),
items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L, 0L)),
.Names = c("date", "user", "items_bought"),
row.names = c(NA, -13L),
class = "data.frame")
# Load xts package (also loads zoo)
require(xts)
# See if this works for one user
dat1 = subset(dat, user == "y")
# Create "xts" object for use with apply.daily()
dat1.1 = xts(dat1$items_bought, dat1$date)
dat2 = apply.daily(dat1.1, sum)
# Now use rollapply with a 3-day window
# The "partial" argument appears to only work with zoo objects, not xts
sum.itemsbought = rollapply(zoo(dat2), 3, sum, align = "right", partial = TRUE)
I thought the output could look nicer (more like example output from your question). I haven't worked with zoo
objects much, but the answer to this question gave me some pointers for putting the info into a data.frame
.
data.frame(Date=time(sum.itemsbought), sum.itemsbought, row.names=NULL)
Once I had this worked out for one user
, it was straightforward to expand this to the entire toy dataset. This is where speed could become an issue. I use lapply
and do.call
for this step.
allusers = lapply(unique(dat$user), function(x) {
dat1 = dat[dat$user == x,]
dat1.1 = xts(dat1$items_bought, dat1$date)
dat2 = apply.daily(dat1.1, sum)
sum.itemsbought = rollapply(zoo(dat2), 3, sum, align = "right", partial = TRUE)
data.frame(Date=time(sum.itemsbought), user = x, sum.itemsbought, row.names=NULL)
} )
do.call(rbind, allusers)
I'd like to propose an additional data.table
approach combined with zoo
package rollapplyr
function
First, we will aggregate items_bought
column per user
per unique date
(as you pointed out that there could be more than one unique date per user)
library(data.table)
data <- setDT(data)[, lapply(.SD, sum), by = c("user", "date"), .SDcols = "items_bought"]
Next, we will compute rollapplyr
combined with sum
and partial = TRUE
in order to cover up for margins (thanks for the advice @G. Grothendieck) in 3 days intervals
library(zoo)
data[, cum_items_bought_3_days := lapply(.SD, rollapplyr, 3, sum, partial = TRUE), .SDcols = "items_bought", by = user]
# user date items_bought cum_items_bought_3_days
# 1: x 2013-01-01 2 2
# 2: x 2013-01-02 1 3
# 3: x 2013-01-03 0 3
# 4: x 2013-01-04 0 1
# 5: x 2013-01-05 3 3
# 6: x 2013-01-06 1 4
# 7: y 2013-01-01 1 1
# 8: y 2013-01-02 1 2
# 9: y 2013-01-03 0 2
# 10: y 2013-01-04 5 6
# 11: y 2013-01-05 6 11
# 12: y 2013-01-06 1 12
This is the data set I've used
data <- structure(list(date = structure(c(15706, 15707, 15708, 15709, 15710, 15711, 15706, 15707, 15708, 15709, 15710, 15711), class = "Date"), user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"), items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L)), .Names = c("date", "user", "items_bought"), row.names = c(NA, -12L), class = "data.frame")
Here is a fairly simple method:
# replicate your data, shifting the days ahead by your required window,
# and rbind into a single data frame
d <- do.call(rbind,lapply(0:2, function(x) transform(data,date=date+x)))
# use aggregate to add it together, subsetting out "future" days
aggregate(items_bought~date+user,subset(d,date<=max(data$date)),sum)
date user items_bought
1 2013-01-01 x 2
2 2013-01-02 x 3
3 2013-01-03 x 3
4 2013-01-04 x 1
5 2013-01-05 x 3
6 2013-01-06 x 4
7 2013-01-01 y 1
8 2013-01-02 y 2
9 2013-01-03 y 2
10 2013-01-04 y 6
11 2013-01-05 y 11
12 2013-01-06 y 12
Here is an approach that doesn't use cumsum but a nested lapply
instead. The first one goes over the users and then for each user the second lapply
constructs the desired data frame by summing all items bought from within the last 2 days of each date. Note that if data$date
were not sorted, it would have to be sorted in ascending order first.
data <- structure(list(
date = structure(c(15706, 15707, 15708, 15709, 15710, 15711,
15706, 15707, 15708, 15709, 15710, 15711), class = "Date"),
user = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y"),
items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L)),
.Names = c("date", "user", "items_bought"),
row.names = c(NA, -12L),
class = "data.frame")
do.call(rbind, lapply(unique(data$user),
function(u) {
subd <- subset(data, user == u)
do.call(rbind, lapply(subd$date,
function(x) data.frame(date = x,
user = u, items_bought =
sum(subd[subd$date %in% (x - 2):x, "items_bought"]))))
}))
Edit
To deal with the issue of having several timestamps for each day (more than 1 row per date) I would first aggregate by summing all items bought during at each time in the same day. You can do that e.g. using the built-in function aggregate
but if your data is too large you can also use data.table
for speed. I'll call your original data frame (with more than 1 row per date) predata
and the aggregated one (1 row per date) data
. So by calling
predt <- data.table(predata)
setkey(predt, date, user)
data <- predt[, list(items_bought = sum(items_bought)), by = key(predt)]
you get a data frame containing one row per date and columns date, user, items_bought. Now, I think the following way will be faster than the nested lapply
above, but I am not sure since I cannot test it on your data. I am using data.table because it is meant to be fast (if used the right way, which I am not sure this is). The inner loop will be replaced by a function f
. I do not know if there is a neater way, avoiding this function and replacing the double loop with only one call to data.table, or how to write a data.table call that would execute faster.
library(data.table)
dt <- data.table(data)
setkey(dt, user)
f <- function(d, u) {
do.call(rbind, lapply(d$date, function(x) data.frame(date = x,
items_bought = d[date %in% (x - 2):x, sum(items_bought)])))
}
data <- dt[, f(.SD, user), by = user]
Another way, which doesn't use data.table, assuming that you have enough RAM (again, I don't know the size of your data), is to store items bought 1 day before in a vector, then items bought 2 days before in another vector, etc, and to sum them up in the end. Something like
sumlist <- vector("list", 2) # this will hold one vector, which contains items
# bought 1 or 2 days ago
for (i in 1:2) {
# tmpstr will be used to find the items that a given user bought i days ago
tmpstr <- paste(data$date - i, data$user, sep = "|")
tmpv <- data$items_bought[
match(tmpstr, paste(data$date, data$user, sep = "|"))]
# if a date is not in the original data, assume no purchases
tmpv[is.na(tmpv)] <- 0
sumlist[[i]] <- tmpv
}
# finally, add up items bought in the past as well as the present day
data$cum_items_bought_3_days <-
rowSums(as.data.frame(sumlist)) + data$items_bought
A final thing I would try would be to parallelize the lapply
calls, e.g. by using the function mclapply
instead, or by re-writing the code using the parallel functionality of foreach
or plyr
. Depending on the strength of your PC and the size of the task, this may outperform the data.table single-core performance...