Cumulative sum in a window (or running window sum) based on a condition in R

前端 未结 4 696
傲寒
傲寒 2020-12-17 17:19

I am trying to calculate cumulative sum for a given window based on a condition. I have seen threads where the solution does conditional cumulative sum (Calculate a conditio

相关标签:
4条回答
  • 2020-12-17 17:57

    1) rollapply. Create a Sum function that takes FY and Rev as a 2-column matrix (converting its argument to one if it is not already) and sums the revenues of the years that fall within the k-year window ending at the last year. Then convert DFI to a data table, sum the rows having the same Customer/Product/FY, and run rollapplyr with Sum for each Customer/Product group.

    library(data.table)
    library(zoo)
    
    # Window length, in fiscal years.
    k <- 5

    # Total the revenue of the rows whose fiscal year lies in the k-year
    # window ending at the last fiscal year supplied.
    #   x: a 2-column matrix (column 1 = FY, column 2 = Rev), or a vector that
    #      reshapes into one (e.g. a single FY/Rev pair).
    # Returns one number. Reads `k` from the enclosing environment.
    Sum <- function(x) {
      m <- matrix(x, ncol = 2)
      years <- m[, 1]
      revenue <- m[, 2]
      # Earliest fiscal year still inside the window that ends at the last row.
      earliest <- years[length(years)] - k + 1
      in_window <- years >= earliest
      sum(revenue[in_window])
    }
    # Collapse DFI to one row per Customer/Product/FY, totalling Rev.
    DT <- as.data.table(DFI)[, .(Rev = sum(Rev)), by = .(Customer, Product, FY)]
    # Within each Customer/Product group, slide a right-aligned window of up to
    # k rows over the FY/Rev columns and let Sum() keep only the rows whose FY
    # is inside the k-year span. partial = TRUE also evaluates the shorter
    # windows at the start of each group.
    DT[, cumsum := rollapplyr(.SD, k, Sum, by.column = FALSE, partial = TRUE),
       by = .(Customer, Product), .SDcols = c("FY", "Rev")]
    

    giving:

     > DT
        Customer Product   FY Rev cumsum
     1:    13575       A 2011   4      4
     2:    13575       A 2012   3      7
     3:    13575       A 2013   3     10
     4:    13575       A 2015   1     11
     5:    13575       A 2016   2      9
     6:    13575       B 2011   3      3
     7:    13575       B 2012   3      6
     8:    13575       B 2013   4     10
     9:    13575       B 2014   5     15
    10:    13575       B 2015   6     21
    11:    13578       A 2010   3      3
    12:    13578       A 2016   2      2
    13:    13578       B 2013   2      2
    14:    13578       C 2014   4      4
    15:    13578       D 2015   2      2
    16:    13578       E 2010   2      2
    

    2) data.table only

    First sum rows that have the same Customer/Product/FY and then, grouping by Customer/Product, for each FY value, fy, pick out the Rev values whose FY values are between fy-k+1 and fy and sum.

    library(data.table)
    
    # Window length, in fiscal years.
    k <- 5
    # Collapse DFI to one row per Customer/Product/FY, totalling Rev.
    DT <- as.data.table(DFI)[, .(Rev = sum(Rev)), by = .(Customer, Product, FY)]
    # For every fiscal year in a Customer/Product group, add up the group's Rev
    # over the years between (year - k + 1) and year.
    DT[, cumsum := sapply(FY, function(yr) {
         in_window <- between(FY, yr - k + 1, yr)
         sum(Rev[in_window])
       }),
       by = .(Customer, Product)]
    

    giving:

    > DT
        Customer Product   FY Rev cumsum
     1:    13575       A 2011   4      4
     2:    13575       A 2012   3      7
     3:    13575       A 2013   3     10
     4:    13575       A 2015   1     11
     5:    13575       A 2016   2      9
     6:    13575       B 2011   3      3
     7:    13575       B 2012   3      6
     8:    13575       B 2013   4     10
     9:    13575       B 2014   5     15
    10:    13575       B 2015   6     21
    11:    13578       A 2010   3      3
    12:    13578       A 2016   2      2
    13:    13578       B 2013   2      2
    14:    13578       C 2014   4      4
    15:    13578       D 2015   2      2
    16:    13578       E 2010   2      2
    
    0 讨论(0)
  • 2020-12-17 18:08

    A solution using dplyr, tidyr, and zoo.

    # Load packages
    library(dplyr)
    library(tidyr)
    library(zoo)
    
    # Flatten the matrix of per-window cumulative sums into one vector:
    # the whole first row, followed by the last column of every later row.
    #   x: a matrix with one row per window.
    # Returns a vector of length ncol(x) + nrow(x) - 1.
    cumsum_roll <- function(x){
      first_window <- x[1, ]
      window_totals <- x[, ncol(x)]
      c(first_window, window_totals[-1])
    }
    
    # Build DFI2: for every Customer/Product, the total Rev over the trailing
    # 5-fiscal-year window ending at each year that has revenue.
    DFI2 <- DFI %>%
      # Group by every column except Rev (FY, Customer, Product)
      group_by_at(vars(-Rev)) %>%
      # Calculate the total Rev of each group
      summarise(Rev = sum(Rev)) %>%
      ungroup() %>%
      group_by(Customer) %>%
      # Expand the data frame based on FY and Product
      # Fill the Rev to be 0
      complete(FY = full_seq(FY, period = 1), Product, fill = list(Rev = 0)) %>%
      # Sort the data frame by Customer, Product, and FY
      arrange(Customer, Product, FY) %>%
      ungroup() %>%
      group_by(Customer, Product) %>%
      # Right-aligned rolling sum over a window of 5 rows (one row per year).
      # partial = TRUE sums the shorter windows at the start of each group, so
      # this also works when a Customer covers fewer than 5 fiscal years.
      mutate(cumsum = rollapplyr(Rev, 5, sum, partial = TRUE)) %>%
      # Remove Rev = 0
      filter(Rev != 0) %>%
      # Reorder the columns
      select(FY, Customer, Product, Rev, cumsum) %>%
      ungroup() %>%
      as.data.frame()

    DFI2
    #      FY Customer Product Rev cumsum
    # 1  2011    13575       A   4      4
    # 2  2012    13575       A   3      7
    # 3  2013    13575       A   3     10
    # 4  2015    13575       A   1     11
    # 5  2016    13575       A   2      9
    # 6  2011    13575       B   3      3
    # 7  2012    13575       B   3      6
    # 8  2013    13575       B   4     10
    # 9  2014    13575       B   5     15
    # 10 2015    13575       B   6     21
    # 11 2010    13578       A   3      3
    # 12 2016    13578       A   2      2
    # 13 2013    13578       B   2      2
    # 14 2014    13578       C   4      4
    # 15 2015    13578       D   2      2
    # 16 2010    13578       E   2      2
    
    0 讨论(0)
  • 2020-12-17 18:08

    Not a new tidyverse answer but I think nest helps with readability

    library(tidyverse)
    library(zoo)
    
    # Add a trailing rolling sum of Rev to a table of yearly revenue.
    #   df:    a data frame with numeric columns FY and Rev.
    #   width: number of consecutive fiscal years in the window (default 5).
    # Returns df with a row for every FY between its minimum and maximum
    # (Rev is NA on the rows that were added) plus a roll_cumsum column.
    # NOTE: a column of df named `width` would mask this argument inside
    # mutate().
    roll_cumsum <- function(df, width = 5) {
      df %>%
        # Insert the missing years so that one row corresponds to one year.
        complete(FY = full_seq(FY, period = 1)) %>%
        # na.rm = TRUE skips the inserted rows; partial = TRUE sums the
        # shorter windows at the start.
        mutate(roll_cumsum = rollapplyr(Rev, width, sum, na.rm = TRUE, partial = TRUE))
    }
    
    # Total Rev for each combination of the non-Rev columns.
    yearly_rev <- DFI %>%
      group_by_at(vars(-Rev)) %>%
      summarise(Rev = sum(Rev))

    # Nest FY/Rev per Customer/Product, run roll_cumsum() on each nested table,
    # flatten again, and drop the rows whose Rev is NA.
    yearly_rev %>%
      group_by(Customer, Product) %>%
      nest(FY, Rev) %>%
      mutate(data = map(data, roll_cumsum)) %>%
      unnest() %>%
      filter(!is.na(Rev)) %>%
      arrange(Customer, Product, FY)
    
    # A tibble: 16 x 5
       # Customer Product    FY   Rev roll_cumsum
          # <dbl> <chr>   <dbl> <dbl>       <dbl>
     # 1    13575 A        2011  4.00        4.00
     # 2    13575 A        2012  3.00        7.00
     # 3    13575 A        2013  3.00       10.0 
     # 4    13575 A        2015  1.00       11.0 
     # 5    13575 A        2016  2.00        9.00
     # 6    13575 B        2011  3.00        3.00
     # 7    13575 B        2012  3.00        6.00
     # 8    13575 B        2013  4.00       10.0 
     # 9    13575 B        2014  5.00       15.0 
    # 10    13575 B        2015  6.00       21.0 
    # 11    13578 A        2010  3.00        3.00
    # 12    13578 A        2016  2.00        2.00
    # 13    13578 B        2013  2.00        2.00
    # 14    13578 C        2014  4.00        4.00
    # 15    13578 D        2015  2.00        2.00
    # 16    13578 E        2010  2.00        2.00 
    
    0 讨论(0)
  • 2020-12-17 18:15

    My solution stays on the tidyverse side of things; however, if your source data is not excessive, the performance difference may not be an issue.

    I will start with declaring a function to calculate the rolling sum using tibbletime::rollify and expand the data frame to include missing FY values. Then group and summarise while applying the rolling sum.

    library(tidyr)
    library(dplyr)
    
    # Rolling sum over a window of 5 values; the first 4 results are NA.
    rollsum_5 <- tibbletime::rollify(sum, window = 5)

    # `df` is the input data with columns FY, Customer, Product and Rev
    # (presumably the data the other answers call DFI -- confirm).
    df %>%
      # One row per fiscal year for every Customer/Product. full_seq() also
      # adds years that are absent from the whole data set, so 5 rows always
      # span 5 consecutive years.
      complete(FY = full_seq(FY, period = 1), Customer, Product) %>%
      # Rows created by complete() carry no revenue.
      replace_na(list(Rev = 0)) %>%
      arrange(Customer, Product, FY) %>%
      group_by(Customer, Product, FY) %>%
      # Total Rev per year; summarise() drops the FY grouping level, so the
      # rolling sum below runs within each Customer/Product.
      summarise(Rev = sum(Rev)) %>%
      mutate(cumsum = rollsum_5(Rev)) %>%
      ungroup() %>%
      # Drop the zero-revenue rows again.
      filter(Rev != 0)
    
    # # A tibble: 16 x 5
    #    Customer Product    FY   Rev cumsum
    #       <dbl> <chr>   <dbl> <dbl>  <dbl>
    #  1    13575 A        2011  4.00  NA   
    #  2    13575 A        2012  3.00  NA   
    #  3    13575 A        2013  3.00  NA   
    #  4    13575 A        2015  1.00  11.0 
    #  5    13575 A        2016  2.00   9.00
    #  6    13575 B        2011  3.00  NA   
    #  7    13575 B        2012  3.00  NA   
    #  8    13575 B        2013  4.00  NA   
    #  9    13575 B        2014  5.00  15.0 
    # 10    13575 B        2015  6.00  21.0 
    # 11    13578 A        2010  3.00  NA   
    # 12    13578 A        2016  2.00   2.00
    # 13    13578 B        2013  2.00  NA   
    # 14    13578 C        2014  4.00   4.00
    # 15    13578 D        2015  2.00   2.00
    # 16    13578 E        2010  2.00  NA 
    

    N.B. The rolling sum in this case will only appear in the rows where the full window (5 rows) is intact. It could be misleading to suggest that partial values are equal to a five-year sum.

    0 讨论(0)
提交回复
热议问题