Say there is a 2-column data frame with a time or distance column which sequentially increases and an observation column which may have NAs here and there. How can I effici
Here is an attempt with Rcpp. The function assumes that data is sorted according to time. More testing would be advisable and adjustments could be made.
#include
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector rollAverage(const NumericVector & times,
NumericVector & vals,
double start,
const double winlen,
const double winshift) {
int n = ceil((max(times) - start) / winshift);
NumericVector winvals;
NumericVector means(n);
int ind1(0), ind2(0);
for(int i=0; i < n; i++) {
if (times[0] < (start+winlen)) {
while((times[ind1] <= start) &
(times[ind1+1] <= (start+winlen)) &
(ind1 < (times.size() - 1))) {
ind1++;
}
while((times[ind2+1] <= (start+winlen)) & (ind2 < (times.size() - 1))) {
ind2++;
}
if (times[ind1] >= start) {
winvals = vals[seq(ind1, ind2)];
means[i] = mean(winvals);
} else {
means[i] = NA_REAL;
}
} else {
means[i] = NA_REAL;
}
start += winshift;
}
return means;
}
Testing it:
set.seed(42)
dat <- data.frame(time = seq(1:20)+runif(20,0,1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:19,2)] <- NA_real_
rollAverage(dat$time, dat$measure, -2.5, 5.0, 2.5)
#[1] 1.0222694 NA NA 1.0126639 0.9965048 0.9514456 1.0518228 NA NA NA
With your list of data.frames (using data.table):
set.seed(42)
dat <- data.frame(time = seq(1:50000)+runif(50000, 0.025, 1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:50000,1000)] <- NA_real_
dat$measure[c(350:450,3000:3300, 20000:28100)] <- NA_real_
dat <- dat[-c(1000:2000, 30000:35000),]
# a list with a realistic number of observations:
dat <- lapply(1:300,function(x) dat)
library(data.table)
dat <- lapply(dat, setDT)
for (ind in seq_along(dat)) dat[[ind]][, i := ind]
#possibly there is a way to avoid these copies?
dat <- rbindlist(dat)
system.time(res <- dat[, rollAverage(time, measure, -2.5, 5.0, 2.5), by=i])
#user system elapsed
#1.51 0.02 1.54
print(res)
# i V1
# 1: 1 1.0217126
# 2: 1 0.9334415
# 3: 1 0.9609050
# 4: 1 1.0123473
# 5: 1 0.9965922
# ---
#6000596: 300 1.1121296
#6000597: 300 0.9984581
#6000598: 300 1.0093060
#6000599: 300 NA
#6000600: 300 NA