R: Rolling window function with adjustable window and step-size for irregularly spaced observations

后端 未结 5 2078
萌比男神i
萌比男神i 2020-12-30 11:34

Say there is a 2-column data frame with a time or distance column which sequentially increases and an observation column which may have NAs here and there. How can I effici

相关标签:
5条回答
  • 2020-12-30 12:15

    Ok, how about this.

    library(data.table)
    dat <- data.table(dat)
    setkey(dat, time)
    
    # function to compute a given stat over a time window on a given data.table
    window_summary <- function(start_tm, window_len, stat_fn, my_dt) {
      pos_vec <- my_dt[, which(time>=start_tm & time<=start_tm+window_len)]
      return(stat_fn(my_dt$measure[pos_vec]))
    }
    
    # a vector of window start times
    start_vec <- seq(from=-2.5, to=dat$time[nrow(dat)], by=2.5)
    
    # sapply'ing the function above over vector of start times 
    # (in this case, getting mean over 5 second windows)
    result <- sapply(start_vec, window_summary, 
                     window_len=5, stat_fn=mean, my_dt=dat)
    

    On my machine, it processes the first 20,000 rows of your large dataset in 13.06781 secs; all rows in 51.58614 secs

    0 讨论(0)
  • 2020-12-30 12:17

    Here is a function that gives the same result for your small data frame. It's not particularly quick: it takes several seconds to run on one of the larger datasets in your second dat example.

    rolling_summary <- function(DF, time_col, fun, window_size, step_size, min_window=min(DF[, time_col])) {
        # time_col is name of time column
        # fun is function to apply to the subsetted data frames
        # min_window is the start time of the earliest window
    
        times <- DF[, time_col]
    
        # window_starts is a vector of the windows' minimum times
        window_starts <- seq(from=min_window, to=max(times), by=step_size)
    
        # The i-th element of window_rows is a vector that tells us the row numbers of
        # the data-frame rows that are present in window i 
        window_rows <- lapply(window_starts, function(x) { which(times>=x & times<x+window_size) })
    
        window_summaries <- sapply(window_rows, function(w_r) fun(DF[w_r, ]))
        data.frame(start_time=window_starts, end_time=window_starts+window_size, summary=window_summaries)
    }
    
    rolling_summary(DF=dat,
                    time_col="time",
                    fun=function(DF) mean(DF$measure),
                    window_size=5,
                    step_size=2.5,
                    min_window=-2.5)
    
    0 讨论(0)
  • 2020-12-30 12:23

    Here is an attempt with Rcpp. The function assumes that data is sorted according to time. More testing would be advisable and adjustments could be made.

    #include <Rcpp.h>
    using namespace Rcpp;
    
    
    // [[Rcpp::export]]
    NumericVector rollAverage(const NumericVector & times, 
                              NumericVector & vals, 
                              double start,
                              const double winlen, 
                              const double winshift) {
      int n = ceil((max(times) - start) / winshift);
      NumericVector winvals;
      NumericVector means(n);
      int ind1(0), ind2(0);
      for(int i=0; i < n; i++) {
        if (times[0] < (start+winlen)) {
          while((times[ind1] <= start) & 
                    (times[ind1+1] <= (start+winlen)) & 
                    (ind1 < (times.size() - 1))) {
            ind1++;
          }    
    
          while((times[ind2+1] <= (start+winlen)) & (ind2 < (times.size() - 1))) {
            ind2++;
          }  
    
          if (times[ind1] >= start) {
            winvals = vals[seq(ind1, ind2)];
            means[i] = mean(winvals);
          } else {
            means[i] = NA_REAL;
          }
          } else {
            means[i] = NA_REAL;
        }
    
        start += winshift;    
      }
    
       return means;
    }
    

    Testing it:

    set.seed(42)
    dat <- data.frame(time = seq(1:20)+runif(20,0,1))
    dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
    dat$measure[sample(1:19,2)] <- NA_real_
    
    rollAverage(dat$time, dat$measure, -2.5, 5.0, 2.5)
    #[1] 1.0222694        NA        NA 1.0126639 0.9965048 0.9514456 1.0518228        NA        NA        NA
    

    With your list of data.frames (using data.table):

    set.seed(42)
    dat <- data.frame(time = seq(1:50000)+runif(50000, 0.025, 1))
    dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
    dat$measure[sample(1:50000,1000)] <- NA_real_
    dat$measure[c(350:450,3000:3300, 20000:28100)] <- NA_real_
    dat <- dat[-c(1000:2000, 30000:35000),] 
    
    # a list with a realistic number of observations:
    dat <- lapply(1:300,function(x) dat)
    
    library(data.table)
    dat <- lapply(dat, setDT)
    for (ind in seq_along(dat)) dat[[ind]][, i := ind]
    #possibly there is a way to avoid these copies?
    
    dat <- rbindlist(dat)
    
    system.time(res <- dat[, rollAverage(time, measure, -2.5, 5.0, 2.5), by=i])
    #user  system elapsed 
    #1.51    0.02    1.54 
    print(res)
    #           i        V1
    #      1:   1 1.0217126
    #      2:   1 0.9334415
    #      3:   1 0.9609050
    #      4:   1 1.0123473
    #      5:   1 0.9965922
    #     ---              
    #6000596: 300 1.1121296
    #6000597: 300 0.9984581
    #6000598: 300 1.0093060
    #6000599: 300        NA
    #6000600: 300        NA
    
    0 讨论(0)
  • 2020-12-30 12:25

    Here's another attempt to use pure data.table approach and its between function.

    Have compared Rprof against the above answers (except @Rolands answer) and it seems the most optimized one. Haven't tested for bugs though, but if you"ll like it, I'll expand the answer.

    Using your dat from above

    library(data.table)
    Rollfunc <- function(dat, time, measure, wind = 5, slide = 2.5, FUN = mean, ...){
      temp <- seq.int(-slide, max(dat$time), by = slide)
      temp <- cbind(temp, temp + wind)
      setDT(dat)[, apply(temp, 1, function(x) FUN(measure[between(time, x[1], x[2])], ...))]
    }
    
    Rollfunc(dat, time, measure, 5, 2.5)
    
    ## [1] 1.0222694        NA        NA 1.0126639 0.9965048 0.9514456 1.0518228        NA        NA
    ## [10]        NA
    

    You can also specify the functions and its arguments, i.e., for example:

    Rollfunc(dat, time, measure, 5, 2.5, max, na.rm = TRUE)
    

    will also work

    Edit: I did some benchnarks against @Roland and his method clearly wins (by far), so I would go with the Rcpp aproach

    0 讨论(0)
  • 2020-12-30 12:30

    Here are some functions that will give the same output on your first example:

    partition <- function(x, window, step = 0){
        a = x[x < step]    
        b = x[x >= step]
        ia = rep(0, length(a))
        ib = cut(b, seq(step, max(b) + window, by = window))    
        c(ia, ib)
    }
    
    roll <- function(df, window, step = 0, fun, ...){
        tapply(df$measure, partition(df$time, window, step), fun, ...)
    }
    
    roll_steps <- function(df, window, steps, fun, ...){
        X = lapply(steps, roll, df = df, window = window, fun = fun, ...)
        names(X) = steps
        X
    }
    

    Output for your first example:

    > roll_steps(dat, 5, c(0, 2.5), mean)
    $`0`
            1         2         3         4         5 
           NA 1.0126639 0.9514456        NA        NA 
    
    $`2.5`
            0         1         2         3         4 
    1.0222694        NA 0.9965048 1.0518228        NA
    

    You can also ignore missing values this way easily:

    > roll_steps(dat, 5, c(0, 2.5), mean, na.rm = TRUE)
    $`0`
            1         2         3         4         5 
    0.7275438 1.0126639 0.9514456 0.9351326       NaN 
    
    $`2.5`
            0         1         2         3         4 
    1.0222694 0.8138012 0.9965048 1.0518228 0.6122983 
    

    This can also be used for a list of data.frames:

    > x = lapply(dat2, roll_steps, 5, c(0, 2.5), mean)
    
    0 讨论(0)
提交回复
热议问题