问题
I'm working on a data frame that contains:
counts per cluster (flow cytometry data)
of several files
- and mean, max, min, total for lots of variables recorded by the machine.
In the case that I want to reduce the number of groups (pool similar clusters together) I would want to merge all the information in a file for group 'a' and 'b' by file
So far, following this SO Question I have already worked out the min, max, and total, but am stuck on how to get the following calculation working in this structure (mutate_at
) using a custom function which would do:
(counts of 'a' * mean of 'a' + counts of 'b' * mean of 'b') / sum(counts for 'a', counts of 'b' )
in order to recalculate the new mean
for each of the mean_i columns, where "mean" in the equation refers to 1 of the columns containing mean values I'm calling with vars(mean_cols)
The code so far:
library(dplyr)
set.seed(123)
df <- data.frame(ID = 1:20,
total_X = runif(20),
min_X = runif(20),
max_X = runif(20),
mean_X = runif(20),
total_Y = runif(20),
min_Y = runif(20),
max_Y = runif(20),
mean_Y = runif(20),
Counts = runif(20)*1000,
category = rep(letters[1:5], 4),
file = as.factor(sort(rep(1:4, 5))))
total_cols = names(df)[which(grepl('total', names(df)))]
min_cols = names(df)[which(grepl('min', names(df)))]
max_cols = names(df)[which(grepl('max', names(df)))]
mean_cols = names(df)[which(grepl('total', names(df)))]
recalmean <- function() { sum(Counts * vars)/sum(Counts)}
#counts of 'a' * mean of 'a' + counts of 'b' * mean of 'b' / sum(counts for 'a', counts of 'b' )
x <- df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate_at(vars(total_cols), sum) %>%
mutate_at(vars(min_cols), min) %>%
mutate_at(vars(max_cols), max) %>%
# mutate_at(vars(mean_cols), recalmean) %>% ## this line needs to do the custom weighed mean calculation
mutate(category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
回答1:
got to admit it was challenging...you should reconsider the data structure
library(tidyverse)
set.seed(123)
df <- data.frame(ID = 1:20,
total_X = runif(20),
min_X = runif(20),
max_X = runif(20),
mean_X = runif(20),
total_Y = runif(20),
min_Y = runif(20),
max_Y = runif(20),
mean_Y = runif(20),
Counts = runif(20)*1000,
category = rep(letters[1:5], 4),
file = as.factor(sort(rep(1:4, 5))))
x <- df %>% bind_rows(
gather(df,metric,value,-ID,-file,-category,-Counts) %>%
mutate(group=str_extract(metric,"[A-Z]$"),metric = str_replace(metric,"_.$","")) %>%
filter(category %in% c('a' , 'b')) %>%
spread(metric,value) %>%
group_by(file,group) %>%
summarise(Counts = mean(Counts),
category = paste0(category,collapse = ''),
max = max(max),
min = min(min),
total = sum(total),
mean = sum(Counts * mean)/sum(Counts)) %>%
ungroup() %>%
gather(metric,value,-file,-group,-category,-Counts) %>%
mutate(metric = paste(metric,group,sep='_'),group=NULL) %>%
spread(metric,value) %>%
mutate(ID=0)
) %>% mutate(ID = row_number())
来源:https://stackoverflow.com/questions/56972840/recalculate-the-new-weighted-mean-when-merging-two-factors-by-group-and-keep-or