问题
I had a similar question about by(), where I accepted the fact that I had to replace the resulting NAs manually. Now I would like to aggregate my data.frame and keep its structure. E.g., my larger data set has factors for 100 countries * 10 years * 5 segments, so it should reduce to 5000 rows. But sometimes some of the segment factors are empty and I only get <5000 rows. I cannot get my head around it...
My MWE still applies:
# Minimal example: aggregate() drops factor levels that have no rows.
#
# The earlier version of these calls passed `ordered_results = True` to cut().
# That is not a cut() argument (the real one is `ordered_result`) and `True`
# is not an R object; it was swallowed unevaluated by `...`. It is dropped
# here, so both factors remain plain (unordered) factors as before.

# All 3 categories are used
df1 <- data.frame(
  val = rep(1:4, 3),
  factor = cut(
    rep(1:4, 3),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

# Third category is not used
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

# df1 reduces to 3 rows as each category is used
aggregate(df1$val, by = list(df1$factor), FUN = sum)

# df2 reduces to 2 rows because C is empty
aggregate(df2$val, by = list(df2$factor), FUN = sum)

# I would like
data.frame(Group.1 = LETTERS[1:3], x = c(12, 12, 0))
#   Group.1  x
# 1       A 12
# 2       B 12
# 3       C  0
回答1:
library(dplyr)

# Create the dataset. The bogus `ordered_results = True` argument of the
# earlier version is dropped: cut() has no such parameter (it is
# `ordered_result`), so it never had any effect.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

levels(df2$factor) %>%              # all levels, including unused ones
  data.frame(factor = .) %>%        # one row per level
  left_join(                        # attach the per-level sums
    df2 %>%
      group_by(factor) %>%          # only levels that actually occur
      summarise(x = sum(val)),      # sum column val
    by = "factor"
  ) %>%
  mutate(x = coalesce(x, 0L))       # levels without rows: NA -> 0
# factor x
# 1 A 12
# 2 B 12
# 3 C 0
Or without any package
# Base R only: left-merge the complete list of levels onto the aggregate so
# that levels without rows are kept (as NA), then replace those NAs with 0.
# Uses `df2` as created in the preceding snippet.
dd <- merge(
  data.frame(Group.1 = levels(df2$factor)),
  aggregate(df2$val, by = list(df2$factor), FUN = sum),
  by = "Group.1",
  all.x = TRUE  # keep every level, matched or not
)
dd$x[is.na(dd$x)] <- 0
dd
# Group.1 x
# 1 A 12
# 2 B 12
# 3 C 0
Or using data.table package to check if it's faster
library(data.table)

# Assuming you start with a data frame. The earlier `ordered_results = True`
# argument is dropped: cut() has no such parameter (it is `ordered_result`),
# so it never had any effect.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

# Data table with every level of "factor", keyed on that column
dt_levels <- data.table(factor = levels(df2$factor), key = "factor")

# Turn df2 into a keyed data table (setDT works by reference) and aggregate
dt_sum <- setDT(df2, key = "factor")[, list(Sum = sum(val)), by = "factor"]

# Left join onto the full level list; the join column is named explicitly
# rather than relying on whatever key dt_sum carries after aggregation
dt_result <- dt_sum[dt_levels, on = "factor"]

# Levels without rows come back as NA; set them to 0 by reference
dt_result[is.na(Sum), Sum := 0L]
dt_result[]
# factor Sum
# 1: A 12
# 2: B 12
# 3: C 0
回答2:
You can use the complete function from tidyr to explicitly show the missing values in the results:
library(dplyr)
library(tidyr)

# Sum val per observed level, then let complete() add a row for each level
# of `factor` that is missing from the summary, with x filled in as 0.
# Uses `df2` as created in an earlier snippet.
df2 %>%
  group_by(factor) %>%
  summarise(x = sum(val)) %>%
  complete(factor, fill = list(x = 0))
# Source: local data frame [3 x 2]
# factor x
# <fctr> <dbl>
# 1 A 12
# 2 B 12
# 3 C 0
With aggregate function:
# Same idea on top of aggregate(): complete() adds the missing levels of
# Group.1 and fills their x with 0. Uses `df2` from an earlier snippet.
tidyr::complete(
  aggregate(df2$val, list(df2$factor), sum),
  Group.1,
  fill = list(x = 0)
)
# Source: local data frame [3 x 2]
# Group.1 x
# <fctr> <dbl>
#1 A 12
#2 B 12
#3 C 0
回答3:
So this is pretty basic, but I just made a new data frame with 2 columns.
One with each factor level, and one with all 0. Then I used rbind to put my new frame and df2 together, and then used aggregate.
# Build the example data. The earlier `ordered_results = True` argument is
# dropped: cut() has no such parameter (it is `ordered_result`), so it never
# had any effect.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

# One padding row per factor level with val = 0, so every level has at least
# one row without changing any sum
dat <- data.frame(val = 0, factor = levels(df2$factor))
df3 <- rbind(df2, dat)

# Sum val per level; the padding rows keep the otherwise-empty levels
aggregate(val ~ factor, data = df3, FUN = sum)
# factor val
#1 A 12
#2 B 12
#3 C 0
来源:https://stackoverflow.com/questions/38898276/aggregate-with-empty-factor-but-keep-row