问题
I want to aggregate data by group. However, each observation can belong to several groups (e.g. observation 1 belongs to both group A and group B). I could not find a nice way to achieve this with `data.table`. Currently, I create a logical variable for each of the possible groups, which takes the value `TRUE` if the observation belongs to that group. I am looking for a better way to do this than the one presented below. I would also like to know how I could achieve this with the `tidyverse`.
library(data.table)
# Data: a small reproducible example with 20 observations.
# The seed is fixed so that the random draws below give the same values on
# every run.
set.seed(1)
TF <- c(TRUE, FALSE)
# Four time points with five observations each.
time <- rep(1:4, each = 5)
# x is the value to aggregate. groupA/groupB/groupC are logical membership
# flags drawn independently of each other, so one observation can belong to
# several groups at once (or to none).
df <- data.table(time = time, x = rnorm(20), groupA = sample(TF, size = 20, replace = TRUE),
groupB = sample(TF, size = 20, replace = TRUE),
groupC = sample(TF, size = 20, replace = TRUE))
# Current approach, which should be nicer and less repetitive: for each group,
# keep the member rows and sum x per time point, then join the three per-group
# results on "time". Where a group has no members at a time point the joined
# result holds NA (see column A at time 1 in the desired output below).
df[groupA == TRUE, .(A = sum(x)), by = time][
df[groupB == TRUE, .(B = sum(x)), by = time], on = "time"][
df[groupC == TRUE, .(C = sum(x)), by = time], on = "time"]
# desired output
time A B C
1: 1 NA 0.9432955 0.1331984
2: 2 1.2257538 0.2427420 0.1882493
3: 3 -0.1992284 -0.1992284 1.9016244
4: 4 0.5327774 0.9438362 0.9276459
回答1:
Here is a solution with `data.table`:
# For every time point, multiply each membership flag by x (FALSE counts as 0,
# TRUE as 1) and add the products up, giving one sum per group column.
df[, lapply(list(groupA = groupA, groupB = groupB, groupC = groupC),
            function(member) sum(member * x)),
   by = time]
# Result -- note that a group without any member at a time point sums to 0
# here (groupA at time 1), not NA:
#    time     groupA     groupB    groupC
# 1:    1  0.0000000  0.9432955 0.1331984
# 2:    2  1.2257538  0.2427420 0.1882493
# 3:    3 -0.1992284 -0.1992284 1.9016244
# 4:    4  0.5327774  0.9438362 0.9276459
Or, more programmatically (thanks to @chinsoon12 for the comment):
# Same per-time sums, but the membership columns are selected by constructing
# their names ("groupA", "groupB", "groupC") instead of typing each one out.
df[, lapply(.SD * x, sum), by = "time", .SDcols = paste0("group", LETTERS[1:3])]
If you want the result in the long format you can do:
# Long format: the three per-group sums of each time point are returned as a
# single column, i.e. three rows per time point.
df[, colSums(.SD * x), by = "time", .SDcols = paste0("group", LETTERS[1:3])]
# Long format with an additional column that labels which group each sum
# belongs to.
df[, list(colSums(.SD * x), LETTERS[1:3]), by = "time",
   .SDcols = paste0("group", LETTERS[1:3])]
回答2:
I think it's easier here to work in long format. First I gather the observations into long format, then keep only the rows where the observation belongs to the corresponding group. Then I remove the logical column and rename the groups to single letters. Then I aggregate by group and time (`summarise` in `dplyr`). Finally, I spread back to wide format.
library(dplyr)
library(tidyr)
# Recreate the example data as a plain data.frame. The seed and the order of
# the random draws are the same as in the question.
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.frame(
  time = time,
  x = rnorm(20),
  groupA = sample(TF, size = 20, replace = TRUE),
  groupB = sample(TF, size = 20, replace = TRUE),
  groupC = sample(TF, size = 20, replace = TRUE)
)

df %>%
  # Long format: one row per observation and group column.
  gather(key = group, value = belongs, groupA:groupC) %>%
  # Keep only the rows where the observation is a member, then drop the flag.
  filter(belongs) %>%
  select(-belongs) %>%
  # Strip the "group" prefix so the groups are named A, B, C, and group by
  # time point and group in the same step.
  group_by(time, group = gsub("group", "", group)) %>%
  summarise(x = sum(x)) %>%
  # Back to wide format: one column per group.
  spread(key = group, value = x)
Output
# A tibble: 4 x 4
# Groups: time [4]
time A B C
<int> <dbl> <dbl> <dbl>
1 1 NA 0.943 0.133
2 2 1.23 0.243 0.188
3 3 -0.199 -0.199 1.90
4 4 0.533 0.944 0.928
回答3:
An option is to use the `tidyr` and `dplyr` packages in combination with `data.table`. Work on the data in long format first and then convert it to wide format.
# melt() with `id.vars` is the data.table reshaping function, so data.table
# has to be attached for this snippet to run on its own.
library(data.table)
library(dplyr)
library(tidyr)

# Reshape to long format: one row per observation and group column, where
# `variable` holds the group column name and `value` the membership flag.
# as.data.table() ensures the data.table melt method is used even when `df`
# is a plain data.frame.
melt(as.data.table(df), id.vars = c("time", "x")) %>%
  # Keep only the rows where the observation is a member of the group.
  filter(value) %>%
  # Sum x per time point and group.
  group_by(time, variable) %>%
  summarise(sum = sum(x)) %>%
  # Back to wide format: one column per group.
  spread(variable, sum)
# # A tibble: 4 x 4
# # Groups: time [4]
# time groupA groupB groupC
# * <int> <dbl> <dbl> <dbl>
# 1 1 NA 0.943 0.133
# 2 2 1.23 0.243 0.188
# 3 3 - 0.199 -0.199 1.90
# 4 4 0.533 0.944 0.928
来源:https://stackoverflow.com/questions/50483582/aggregating-if-each-observation-can-belong-to-multiple-groups