问题
I have a large data frame with more than 100 000 records where the values are sorted
For example, consider the following dummy data set
df <- data.frame(values = c(1,1,2,2,3,4,5,6,6,7))
I want to create 3 groups of above values (in sequence only) such that the sum of each group is more or less the same
So for the above group, if I decide to divide the sorted df
in 3 groups as follows, their sums will be
1. 1 + 1 + 2 +2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can create this optimization in R? any logic?
回答1:
So, let's use pruning. I think other solutions are giving a good solution, but not the best one.
First, we want to minimize where S_n is the cumulative sum of the first n elements.
computeD <- function(p, q, S) {
n <- length(S)
S.star <- S[n] / 3
if (all(p < q)) {
(S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
} else {
stop("You shouldn't be here!")
}
}
I think the other solutions optimize over p and q independently, which won't give a global minima (expected for some particular cases).
optiCut <- function(v) {
S <- cumsum(v)
n <- length(v)
S_star <- S[n] / 3
# good starting values
p_star <- which.min((S - S_star)^2)
q_star <- which.min((S - 2*S_star)^2)
print(min <- computeD(p_star, q_star, S))
count <- 0
for (q in 2:(n-1)) {
S3 <- S[n] - S[q] - S_star
if (S3*S3 < min) {
count <- count + 1
D <- computeD(seq_len(q - 1), q, S)
ind = which.min(D);
if (D[ind] < min) {
# Update optimal values
p_star = ind;
q_star = q;
min = D[ind];
}
}
}
c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes a lot the iterations based on the condition S3*S3 < min
. But, it gives the optimal solution, see optiCut(c(1, 2, 3, 3, 5, 10))
.
For the solution with K >= 3, I basically reimplemented trees with nested tibbles, that was fun!
optiCut_K <- function(v, K) {
S <- cumsum(v)
n <- length(v)
S_star <- S[n] / K
# good starting values
p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i*S_star)^2))
min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)
compute_children <- function(level, ind, val) {
# leaf
if (level == 1) {
val <- val + (S[ind] - S_star)^2
if (val > min_first) {
return(NULL)
} else {
return(val)
}
}
P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
inds <- which(P_all < min_first)
if (length(inds) == 0) return(NULL)
node <- tibble::tibble(
level = level - 1,
ind = inds,
val = P_all[inds]
)
node$children <- purrr::pmap(node, compute_children)
node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
`if`(nrow(node) == 0, NULL, node)
}
compute_children(K, n, 0)
}
This gives you all the solution that are least better than the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
tmp <- try(tidyr::unnest(tbl), silent = TRUE)
`if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
回答2:
Here is one approach:
splitter <- function(values, N){
inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
dif = diff(inds)
re = rep(1:length(dif), times = dif)
return(split(values, re))
}
how good is it:
# I calculate the mean and sd of the maximal difference of the sums in the
#splits of 100 runs:
#split on 15 parts
set.seed(5)
z1 = as.data.frame(matrix(1:15, nrow=1))
repeat{
values = sort(sample(1:1000, 1000000, replace = T))
z = splitter(values, 15)
z = lapply(z, sum)
z = unlist(z)
z1 = rbind(z1, z)
if (nrow(z1)>101){
break
}
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
[1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
[1] 210.6653
#with less splits (4)
set.seed(5)
z1 = as.data.frame(matrix(1:4, nrow=1))
repeat{
values = sort(sample(1:1000, 1000000, replace = T))
z = splitter(values, 4)
z = lapply(z, sum)
z = unlist(z)
z1 = rbind(z1, z)
if (nrow(z1)>101){
break
}
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
#632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
#260.9864
library(microbenchmark)
1M:
values = sort(sample(1:1000, 1000000, replace = T))
microbenchmark(
sp_27 = splitter(values, 27),
sp_3 = splitter(values, 3),
)
Unit: milliseconds
expr min lq mean median uq max neval cld
sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044 100 b
sp_3 108.3283 116.2223 209.4777 173.0522 291.8669 409.7050 100 a
btw F. Privé is correct this function does not give the globally optimal split. It is greedy which is not a good characteristic for such a problem. It will give splits with sums closer to global sum / n in the initial part of the vector but behaving as so will compromise the splits in the later part of the vector.
Here is a test comparison of the three functions posted so far:
db = function(values, N){
temp = floor(sum(values)/N)
inds = c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N-1)], length(values))
dif = diff(inds)
re = rep(1:length(dif), times = dif)
return(split(values, re))
} #had to change it a bit since the posted one would not work - the core
#which calculates the splitting positions is the same
missuse <- function(values, N){
inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
dif = diff(inds)
re = rep(1:length(dif), times = dif)
return(split(values, re))
}
prive = function(v, N){ #added dummy N argument because of the tester function
dummy = N
computeD <- function(p, q, S) {
n <- length(S)
S.star <- S[n] / 3
if (all(p < q)) {
(S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
} else {
stop("You shouldn't be here!")
}
}
optiCut <- function(v, N) {
S <- cumsum(v)
n <- length(v)
S_star <- S[n] / 3
# good starting values
p_star <- which.min((S - S_star)^2)
q_star <- which.min((S - 2*S_star)^2)
print(min <- computeD(p_star, q_star, S))
count <- 0
for (q in 2:(n-1)) {
S3 <- S[n] - S[q] - S_star
if (S3*S3 < min) {
count <- count + 1
D <- computeD(seq_len(q - 1), q, S)
ind = which.min(D);
if (D[ind] < min) {
# Update optimal values
p_star = ind;
q_star = q;
min = D[ind];
}
}
}
c(p_star, q_star, computeD(p_star, q_star, S), count)
}
z3 = optiCut(v)
inds = c(0, z3[1:2], length(v))
dif = diff(inds)
re = rep(1:length(dif), times = dif)
return(split(v, re))
} #added output to be more in line with the other two
Function for testing:
tester = function(split, seed){
set.seed(seed)
z1 = as.data.frame(matrix(1:3, nrow=1))
repeat{
values = sort(sample(1:1000, 1000000, replace = T))
z = split(values, 3)
z = lapply(z, sum)
z = unlist(z)
z1 = rbind(z1, z)
if (nrow(z1)>101){
break
}
}
m = mean(apply(z1, 1, function(x) max(x) - min(x)))
s = sd(apply(z1, 1, function(x) max(x) - min(x)))
return(c("mean" = m, "sd" = s))
} #tests 100 random 1M length vectors with elements drawn from 1:1000
tester(db, 5)
#mean sd
#779.5686 349.5717
tester(missuse, 5)
#mean sd
#481.4804 216.9158
tester(prive, 5)
#mean sd
#451.6765 174.6303
prive is the clear winner - however it takes quite a bit longer than the other 2. and can handle splitting on 3 elements only.
microbenchmark(
missuse(values, 3),
prive(values, 3),
db(values, 3)
)
Unit: milliseconds
expr min lq mean median uq max neval cld
missuse(values, 3) 100.85978 111.1552 185.8199 120.1707 304.0303 393.4031 100 a
prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357 100 b
db(values, 3) 96.86879 104.5141 194.0085 117.6270 306.7143 500.6455 100 a
回答3:
N = 3
temp = floor(sum(df$values)/N)
inds = c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N-1)], NROW(df))
split(df$values, rep(1:N, ifelse(N == 1, NROW(df), diff(inds))))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
来源:https://stackoverflow.com/questions/46431527/split-a-vector-into-chunks-such-that-sum-of-each-chunk-is-approximately-constant