I want to group a vector based on the sum of the elements being less than or equal to n. Assume the following,
set.seed(1)
x <- sample(10, 20, replace = TRUE)
Here is my Rcpp solution (close to Khashaa's solution but a bit shorter/stripped down). Since you said speed was important, Rcpp is probably the way to go:
# create the data
set.seed(1)  # fix the RNG state so the sample below is reproducible
x <- sample(10, 20, replace = TRUE)  # 20 draws from 1:10, with replacement
# expected group ids for cutoff 15 (hand-derived reference answer)
y <- c(1, 1, 1, 2, 2, 3, 4, 5 ,5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 10)
# create the Rcpp function
library(Rcpp)
cppFunction('
// Assign consecutive group ids (1, 2, ...) so that the running sum of x
// within each group never exceeds `cutoff`.  A new group is opened at the
// first element that would push the running sum past the cutoff, and the
// running sum restarts with that element.
//
// `cutoff` is a double (not int) so that non-integer cutoffs, e.g. 15.5,
// are honoured instead of being silently truncated; integer arguments
// such as 15 are converted exactly, so existing callers are unaffected.
//
// Note: a single element larger than the cutoff still receives its own
// group -- its restarted running sum is already above the cutoff, so the
// next element immediately opens a fresh group.
IntegerVector sotosGroup(NumericVector x, double cutoff) {
  IntegerVector groupVec (x.size());
  int group = 1;
  double runSum = 0;
  for (int i = 0; i < x.size(); i++) {
    runSum += x[i];
    if (runSum > cutoff) {   // adding x[i] exceeded the cap -> new group
      group++;
      runSum = x[i];         // restart the running sum with this element
    }
    groupVec[i] = group;
  }
  return groupVec;
}
')
# use the function as usual
y_cpp <- sotosGroup(x, 15)
# per-group sums: each is <= 15, confirming the cutoff is respected
# NOTE(review): the printed values below assume the pre-R-3.6 sample()
# RNG; with the current default sample.kind the draws (and sums) differ
sapply(split(x, y_cpp), sum)
#> 1 2 3 4 5 6 7 8 9 10
#> 13 13 9 10 15 12 12 13 14 8
# the Rcpp result matches the hand-derived reference grouping
all.equal(y, y_cpp)
#> [1] TRUE
In case anyone needs to be convinced of the speed:
# Speed Benchmarks
library(data.table)
library(microbenchmark)
dt <- data.table(x)  # the data.table approach needs x as a table column
# data.table alternative: finds group boundaries with a rolling join on
# the cumulative sum instead of an element-by-element scan.
frank <- function(DT, n = 15) {
  # running total of x; group boundaries are expressed in terms of xc
  DT[, xc := cumsum(x)]
  # For each row, look up the first row whose cumsum reaches the previous
  # row's cumsum + n + 1 (roll=-Inf rolls the match forward to the next
  # observation), i.e. the index where the next group must start.
  b = DT[.(shift(xc, fill=0) + n + 1), on=.(xc), roll=-Inf, which=TRUE]
  # Follow the chain of group-start indices beginning at row 1 until it
  # runs off the end of the table (b[z] becomes NA).
  z = 1; res = z
  while (!is.na(z))
    res <- c(res, z <- b[z])
  # Group id for each row = number of group starts seen so far.
  DT[, g := cumsum(.I %in% res)][]
}
# time both implementations over 100 runs each
# (timings below are machine-specific; relative order is what matters)
microbenchmark(
frank(dt),
sotosGroup(x, 15),
times = 100
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> frank(dt) 1720.589 1831.320 2148.83096 1878.0725 1981.576 13728.830 100 b
#> sotosGroup(x, 15) 2.595 3.962 6.47038 7.5035 8.290 11.579 100 a