How to create a lag variable within each group?

前端 未结 5 1447
没有蜡笔的小新
没有蜡笔的小新 2020-11-22 04:45

I have a data.table:

set.seed(1)
data <- data.table(time = c(1:3, 1:4),
                   groups = c(rep(c(\"b\", \"a\"), c(3, 4))),
                   v         


        
相关标签:
5条回答
  • 2020-11-22 05:13

    If you wanted to make sure that you avoided any issue with ordering the data, you can do this, using dplyr, manually with something like:

    df <- data.frame(Names = c(rep('Dan',50),rep('Dave',100)),
                Dates = c(seq(1,100,by=2),seq(1,100,by=1)),
                Values = rnorm(150,0,1))
    
    df <- df %>% group_by(Names) %>% mutate(Rank=rank(Dates),
                                        RankDown=Rank-1)
    
    df <- df %>% left_join(select(df,Rank,ValueDown=Values,Names),by=c('RankDown'='Rank','Names')
    ) %>% select(-Rank,-RankDown)
    
    head(df)
    

    Or alternatively I like the idea of putting it in a function with a chosen grouping variable(s), ranking column (like Date or otherwise), and chosen number of lags. This also requires lazyeval as well as dplyr.

    groupLag <- function(mydf,grouping,ranking,lag){
      df <- mydf
      groupL <- lapply(grouping,as.symbol)
    
      names <- c('Rank','RankDown')
      foos <- list(interp(~rank(var),var=as.name(ranking)),~Rank-lag)
    
      df <- df %>% group_by_(.dots=groupL) %>% mutate_(.dots=setNames(foos,names))
    
      selectedNames <- c('Rank','Values',grouping)
      df2 <- df %>% select_(.dots=selectedNames)
      colnames(df2) <- c('Rank','ValueDown',grouping)
    
      df <- df %>% left_join(df2,by=c('RankDown'='Rank',grouping)) %>% select(-Rank,-RankDown)
    
      return(df)
    }
    
    groupLag(df,c('Names'),c('Dates'),1)
    
    0 讨论(0)
  • 2020-11-22 05:21

    In base R, this will do the job:

    data$lag.value <- c(NA, data$value[-nrow(data)])
    data$lag.value[which(!duplicated(data$groups))] <- NA
    

    The first line adds a string of lagged (+1) observations. The second string corrects the first entry of each group, as the lagged observation is from previous group.

    Note that data is of format data.frame to not use data.table.

    0 讨论(0)
  • 2020-11-22 05:27

    Using package dplyr:

    library(dplyr)
    data <- 
        data %>%
        group_by(groups) %>%
        mutate(lag.value = dplyr::lag(value, n = 1, default = NA))
    

    gives

    > data
    Source: local data table [7 x 4]
    Groups: groups
    
      time groups       value   lag.value
    1    1      a  0.07614866          NA
    2    2      a -0.02784712  0.07614866
    3    3      a  1.88612245 -0.02784712
    4    1      b  0.26526825          NA
    5    2      b  1.23820506  0.26526825
    6    3      b  0.09276648  1.23820506
    7    4      b -0.09253594  0.09276648
    

    As noted by @BrianD, this implicitly assumes that value is sorted by group already. If not, either sort it by group, or use the order_by argument in lag. Also note that due to an existing issue with some versions of dplyr, for safety, arguments and the namespace should be explicitly given.

    0 讨论(0)
  • 2020-11-22 05:28

    You could do this within data.table

     library(data.table)
     data[, lag.value:=c(NA, value[-.N]), by=groups]
      data
     #   time groups       value   lag.value
     #1:    1      a  0.02779005          NA
     #2:    2      a  0.88029938  0.02779005
     #3:    3      a -1.69514201  0.88029938
     #4:    1      b -1.27560288          NA
     #5:    2      b -0.65976434 -1.27560288
     #6:    3      b -1.37804943 -0.65976434
     #7:    4      b  0.12041778 -1.37804943
    

    For multiple columns:

    nm1 <- grep("^value", colnames(data), value=TRUE)
    nm2 <- paste("lag", nm1, sep=".")
    data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
     data
    #    time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    Update

    From data.table versions >= v1.9.5, we can use shift with type as lag or lead. By default, the type is lag.

    data[, (nm2) :=  shift(.SD), by=groups, .SDcols=nm1]
    #   time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    If you need the reverse, use type=lead

    nm3 <- paste("lead", nm1, sep=".")
    

    Using the original dataset

      data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
      #  time groups      value     value1      value2 lead.value lead.value1
      #1:    1      b -0.6264538  0.7383247  1.12493092  0.1836433   0.5757814
      #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.8356286  -0.3053884
      #3:    3      b -0.8356286 -0.3053884 -0.01619026         NA          NA
      #4:    1      a  1.5952808  1.5117812  0.94383621  0.3295078   0.3898432
      #5:    2      a  0.3295078  0.3898432  0.82122120 -0.8204684  -0.6212406
      #6:    3      a -0.8204684 -0.6212406  0.59390132  0.4874291  -2.2146999
      #7:    4      a  0.4874291 -2.2146999  0.91897737         NA          NA
     #   lead.value2
     #1: -0.04493361
     #2: -0.01619026
     #3:          NA
     #4:  0.82122120
     #5:  0.59390132
     #6:  0.91897737
     #7:          NA
    

    data

     set.seed(1)
     data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
                 value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
    
    0 讨论(0)
  • 2020-11-22 05:32

    I wanted to complement the previous answers by mentioning two ways in which I approach this problem in the important case when you are not guaranteed that each group has data for every time period. That is, you still have a regularly spaced time series, but there might be missings here and there. I will focus on two ways to improve the dplyr solution.

    We start with the same data that you used...

    library(dplyr)
    library(tidyr)
    
    set.seed(1)
    data_df = data.frame(time   = c(1:3, 1:4),
                         groups = c(rep(c("b", "a"), c(3, 4))),
                         value  = rnorm(7))
    data_df
    #>   time groups      value
    #> 1    1      b -0.6264538
    #> 2    2      b  0.1836433
    #> 3    3      b -0.8356286
    #> 4    1      a  1.5952808
    #> 5    2      a  0.3295078
    #> 6    3      a -0.8204684
    #> 7    4      a  0.4874291
    

    ... but now we delete a couple of rows

    data_df = data_df[-c(2, 6), ]
    data_df
    #>   time groups      value
    #> 1    1      b -0.6264538
    #> 3    3      b -0.8356286
    #> 4    1      a  1.5952808
    #> 5    2      a  0.3295078
    #> 7    4      a  0.4874291
    

    Simple dplyr solution no longer works

    data_df %>% 
      arrange(groups, time) %>% 
      group_by(groups) %>% 
      mutate(lag.value = lag(value)) %>% 
      ungroup()
    #> # A tibble: 5 x 4
    #>    time groups  value lag.value
    #>   <int> <fct>   <dbl>     <dbl>
    #> 1     1 a       1.60     NA    
    #> 2     2 a       0.330     1.60 
    #> 3     4 a       0.487     0.330
    #> 4     1 b      -0.626    NA    
    #> 5     3 b      -0.836    -0.626
    

    You see that, although we don't have the value for the case (group = 'a', time = '3'), the above still shows a value for the lag in the case of (group = 'a', time = '4'), which is actually the value at time = 2.

    Correct dplyr solution

    The idea is that we add the missing (group, time) combinations. This is VERY memory-inefficient when you have lots of possible (groups, time) combinations, but the values are sparsely captured.

    dplyr_correct_df = expand.grid(
      groups = sort(unique(data_df$groups)),
      time   = seq(from = min(data_df$time), to = max(data_df$time))
    ) %>% 
      left_join(data_df, by = c("groups", "time")) %>% 
      arrange(groups, time) %>% 
      group_by(groups) %>% 
      mutate(lag.value = lag(value)) %>% 
      ungroup()
    dplyr_correct_df
    #> # A tibble: 8 x 4
    #>   groups  time   value lag.value
    #>   <fct>  <int>   <dbl>     <dbl>
    #> 1 a          1   1.60     NA    
    #> 2 a          2   0.330     1.60 
    #> 3 a          3  NA         0.330
    #> 4 a          4   0.487    NA    
    #> 5 b          1  -0.626    NA    
    #> 6 b          2  NA        -0.626
    #> 7 b          3  -0.836    NA    
    #> 8 b          4  NA        -0.836
    

    Notice that we now have a NA at (group = 'a', time = '4'), which should be the expected behaviour. Same with (group = 'b', time = '3').

    Tedious but also correct solution using the class zoo::zooreg

    This solution should work better in terms of memory when the amount of cases is very large, because instead of filling the missing cases with NA's, it uses indices.

    library(zoo)
    
    zooreg_correct_df = data_df %>% 
      as_tibble() %>% 
      # nest the data for each group
      # should work for multiple groups variables
      nest(-groups, .key = "zoo_ob") %>%
      mutate(zoo_ob = lapply(zoo_ob, function(d) {
    
        # create zooreg objects from the individual data.frames created by nest
        z = zoo::zooreg(
          data      = select(d,-time),
          order.by  = d$time,
          frequency = 1
        ) %>% 
          # calculate lags
          # we also ask for the 0'th order lag so that we keep the original value
          zoo:::lag.zooreg(k = (-1):0) # note the sign convention is different
    
        # recover df's from zooreg objects
        cbind(
          time = as.integer(zoo::index(z)),
          zoo:::as.data.frame.zoo(z)
        )
    
      })) %>% 
      unnest() %>% 
      # format values
      select(groups, time, value = value.lag0, lag.value = `value.lag-1`) %>% 
      arrange(groups, time) %>% 
      # eliminate additional periods created by lag
      filter(time <= max(data_df$time))
    zooreg_correct_df
    #> # A tibble: 8 x 4
    #>   groups  time   value lag.value
    #>   <fct>  <int>   <dbl>     <dbl>
    #> 1 a          1   1.60     NA    
    #> 2 a          2   0.330     1.60 
    #> 3 a          3  NA         0.330
    #> 4 a          4   0.487    NA    
    #> 5 b          1  -0.626    NA    
    #> 6 b          2  NA        -0.626
    #> 7 b          3  -0.836    NA    
    #> 8 b          4  NA        -0.836
    

    Finally, lets check that both correct solutions are actually equal:

    all.equal(dplyr_correct_df, zooreg_correct_df)
    #> [1] TRUE
    
    0 讨论(0)
提交回复
热议问题