dplyr: grouping and summarizing/mutating data with rolling time windows

大城市里の小女人 提交于 2019-11-29 04:32:58

This can be done using SQL:

library(sqldf)

# Parse the date column into Date values before handing the table to sqldf.
dd <- transform(data, date = as.Date(date))
# Self-join dd on id: each row `a` is paired with the rows `b` of the same id
# whose date lies between a.date - 30 and a.date and whose rowid is not
# greater than a's. Grouping by a.rowid then yields, for every row of dd, the
# number of paired rows (n_trans30) and the sum of their n_widgets
# (total_widgets30).
# NOTE(review): `a.date - 30` presumably relies on sqldf passing Date columns
# to SQLite as day numbers -- confirm against the sqldf documentation.
sqldf("select a.*, count(*) n_trans30, sum(b.n_widgets) 'total_widgets30' 
       from dd a 
       left join dd b on b.date between a.date - 30 and a.date 
                         and b.id = a.id
                         and b.rowid <= a.rowid
       group by a.rowid")

giving:

  id       date n_widgets n_trans30 total_widgets30
1  1 2015-01-01         1         1               1
2  1 2015-01-01         2         2               3
3  1 2015-01-05         3         3               6
4  1 2015-01-25         4         4              10
5  2 2015-05-05         5         1               5
6  2 2015-01-01         2         1               2
7  3 2015-08-01         4         1               4
8  4 2015-01-01         5         1               5

Another approach is to expand your dataset to contain all possible days (using tidyr::complete), then use a rolling function (RcppRoll::roll_sum)

The fact that you have multiple observations per day is probably creating an issue though...

library(tidyr)
library(RcppRoll)
# Working copy of df whose date column has been parsed with as.Date().
df2 <- mutate(df, date = as.Date(date))

## Expand to every id/day combination, starting 30 days before the earliest
## observation; combinations that were not observed get n_widgets = 0.
## df2$date is already a Date, so it is used as-is.
df_full <- complete(
  df2,
  id,
  date = seq(from = min(df2$date) - 30, to = max(df2$date), by = 1),
  fill = list(n_widgets = 0)
)

## Right-aligned rolling sums over 30 rows within each id (positions without a
## full window are filled with 0), then join back onto df2 so that only the
## originally observed rows remain.
df_roll <- df_full %>%
  group_by(id) %>%
  mutate(
    n_trans_30 = roll_sum(n_widgets != 0, n = 30, align = "right", fill = 0),
    total_widgets_30 = roll_sum(n_widgets, n = 30, align = "right", fill = 0)
  ) %>%
  ungroup() %>%
  right_join(df2, by = c("date", "id", "n_widgets"))

The result is the same as yours (by chance)

     id       date n_widgets n_trans_30 total_widgets_30
  <dbl>     <date>     <dbl>      <dbl>            <dbl>
1     1 2015-01-01         1          1                1
2     1 2015-01-01         2          2                3
3     1 2015-01-05         3          3                6
4     1 2015-01-25         4          4               10
5     1 2015-02-15         4          2                8
6     2 2015-05-05         5          1                5
7     2 2015-01-01         2          1                2
8     3 2015-08-01         4          1                4
9     4 2015-01-01         5          1                5

But as noted above, it will fail for some days, because it sums over the last 30 rows (observations) rather than the last 30 days. So you may want to summarise the data by day first, and then apply this.

I found a way to do this while working on this question

# Example data: nine rows of id, date (stored as text) and n_widgets.
df <- data.frame(
  id = rep(c(1, 2, 3, 4), times = c(5, 2, 1, 1)),
  date = c(
    "2015-01-01", "2015-01-01", "2015-01-05", "2015-01-25", "2015-02-15",
    "2015-05-05", "2015-01-01",
    "2015-08-01",
    "2015-01-01"
  ),
  n_widgets = c(1, 2, 3, 4, 4, 5, 2, 4, 5)
)

#' Count the rows of `df` for one id inside a trailing date window.
#'
#' @param df Data frame with columns `id` and `date`.
#' @param date2 Last date of the window (inclusive).
#' @param w Window length; the window starts at `date2 - w` (inclusive).
#' @param id2 The id whose rows are counted.
#' @return Integer number of rows of `df` with `id == id2` and
#'   `date2 - w <= date <= date2`.
count_window <- function(df, date2, w, id2) {
  min_date <- date2 - w
  # Explicit column extraction instead of a data-masked filter(): a column of
  # `df` that happens to be called min_date, date2 or id2 can no longer
  # shadow the values computed from the arguments.
  in_window <- df[["id"]] == id2 &
    df[["date"]] >= min_date &
    df[["date"]] <= date2
  # Rows whose comparison is NA are not counted, matching filter().
  sum(in_window, na.rm = TRUE)
}

# Vectorised over date2 and id2 only; `df` and `w` are passed whole to every
# call.
v_count_window <- Vectorize(count_window, vectorize.args = c("date2", "id2"))

#' Sum `n_widgets` over the rows of `df` for one id inside a trailing date
#' window.
#'
#' @param df Data frame with columns `id`, `date` and `n_widgets`.
#' @param date2 Last date of the window (inclusive).
#' @param w Window length; the window starts at `date2 - w` (inclusive).
#' @param id2 The id whose rows are summed.
#' @return Sum of `n_widgets` over the rows of `df` with `id == id2` and
#'   `date2 - w <= date <= date2` (0 when no row matches).
sum_window <- function(df, date2, w, id2) {
  min_date <- date2 - w
  # Explicit column extraction instead of a data-masked filter(): a column of
  # `df` that happens to be called min_date, date2 or id2 can no longer
  # shadow the values computed from the arguments.
  in_window <- df[["id"]] == id2 &
    df[["date"]] >= min_date &
    df[["date"]] <= date2
  # which() drops rows whose comparison is NA, matching filter().
  sum(df[["n_widgets"]][which(in_window)])
}

# Vectorised over date2 and id2 only; `df` and `w` are passed whole to every
# call.
v_sum_window <- Vectorize(sum_window, vectorize.args = c("date2", "id2"))

# For every row: the number of rows (n_trans) and the widget total
# (total_widgets) of the same id dated within the 30 days up to and including
# that row's date.
res <- df %>%
  # Parse the text dates with base as.Date(); no extra package is needed.
  mutate(date = as.Date(date)) %>%
  # Kept as a separate stage so that `.` below is the table with parsed dates.
  mutate(
    n_trans = v_count_window(., date, 30, id),
    total_widgets = v_sum_window(., date, 30, id)
  ) %>%
  select(id, date, n_widgets, n_trans, total_widgets)
res


id       date n_widgets n_trans total_widgets

1  1 2015-01-01         1       2             3
2  1 2015-01-01         2       2             3
3  1 2015-01-05         3       3             6
4  1 2015-01-25         4       4            10
5  1 2015-02-15         4       2             8
6  2 2015-05-05         5       1             5
7  2 2015-01-01         2       1             2
8  3 2015-08-01         4       1             4
9  4 2015-01-01         5       1             5

This version is fairly case specific but you could probably make a version of the functions that is more general.

Gopala

EDITED based on comment below.

You can try something like this for up to 5 days:

# Total widgets per id over the rows dated at most 5 days before today.
df %>%
  # Work with Date values: comparing Sys.Date() against text dates mixes a UTC
  # midnight with a local-time midnight, which makes the 5-day cut-off depend
  # on the session's time zone.
  mutate(date = as.Date(date)) %>%
  group_by(id) %>%
  filter(as.numeric(difftime(Sys.Date(), date, units = "days")) <= 5) %>%
  summarise(n_total_widgets = sum(n_widgets))

In this case, no dates fall within five days of the current date, so the result is empty.

To get last five days for each ID, you can do something like this:

# Total widgets per id over the rows dated at most 5 days before that id's
# latest date.
df %>%
  # Work with Date values so that max() and the day difference operate on
  # calendar dates rather than on text / local clock times.
  mutate(date = as.Date(date)) %>%
  group_by(id) %>%
  filter(as.numeric(difftime(max(date), date, units = "days")) <= 5) %>%
  summarise(n_total_widgets = sum(n_widgets))

Resulting output will be:

Source: local data frame [4 x 2]

     id n_total_widgets
  (dbl)           (dbl)
1     1               4
2     2               5
3     3               4
4     4               5
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!