问题
I would like to count all combinations in a data.frame.
The data look like this
9 10 11 12
1 1 1 1 1
2 0 0 0 0
3 0 0 0 0
4 1 1 1 1
5 1 1 1 1
6 0 0 0 0
7 1 0 0 1
8 1 0 0 1
9 1 1 1 1
10 1 1 1 1
The output I want is simply
comb n
1 1 1 1 5
0 0 0 0 3
1 0 0 1 2
Do you know any simple function to do that ?
Thanks
dt = structure(list(`9` = c(1, 0, 0, 1, 1, 0, 1, 1, 1, 1), `10` = c(1,
0, 0, 1, 1, 0, 0, 0, 1, 1), `11` = c(1, 0, 0, 1, 1, 0, 0, 0,
1, 1), `12` = c(1, 0, 0, 1, 1, 0, 1, 1, 1, 1)), .Names = c("9",
"10", "11", "12"), class = "data.frame", row.names = c(NA, -10L
))
回答1:
We can either use data.table or dplyr. These are very efficient. We convert the 'data.frame' to 'data.table' (setDT(dt)), grouped by all the columns of 'dt' (names(dt)), we get the nrow (.N) as the 'Count'
library(data.table)
setDT(dt)[,list(Count=.N) ,names(dt)]
Or we can use a similar methodology using dplyr.
library(dplyr)
names(dt) <- make.names(names(dt))
dt %>%
group_by_(.dots=names(dt)) %>%
summarise(count= n())
Benchmarks
In case somebody wants to look at some metrics (and also to backup my claim earlier (efficient!)),
set.seed(24)
df1 <- as.data.frame(matrix(sample(0:1, 1e6*6, replace=TRUE), ncol=6))
akrunDT <- function() {
as.data.table(df1)[,list(Count=.N) ,names(df1)]
}
akrunDplyr <- function() {
df1 %>%
group_by_(.dots=names(df1)) %>%
summarise(count= n())
}
cathG <- function() {
aggregate(cbind(n = 1:nrow(df1))~., df1, length)
}
docendoD <- function() {
as.data.frame(table(comb = do.call(paste, df1)))
}
deena <- function() {
table(apply(df1, 1, paste, collapse = ","))
}
Here are the microbenchmark results
library(microbenchmark)
microbenchmark(akrunDT(), akrunDplyr(), cathG(), docendoD(), deena(),
unit='relative', times=20L)
# Unit: relative
# expr min lq mean median uq max neval cld
# akrunDT() 1.000000 1.000000 1.000000 1.00000 1.000000 1.0000000 20 a
# akrunDplyr() 1.512354 1.523357 1.307724 1.45907 1.365928 0.7539773 20 a
# cathG() 43.893946 43.592062 37.008677 42.10787 38.556726 17.9834245 20 c
# docendoD() 18.778534 19.843255 16.560827 18.85707 17.296812 8.2688541 20 b
# deena() 90.391417 89.449547 74.607662 85.16295 77.316143 34.6962954 20 d
回答2:
A base R solution with aggregate:
aggregate(seq(nrow(dt))~., data=dt, FUN=length)
# 9 10 11 12 seq(nrow(dt))
#1 0 0 0 0 3
#2 1 0 0 1 2
#3 1 1 1 1 5
edit
To get colnames more conformed to your output, you can do:
`colnames<-`(aggregate(seq(nrow(dt))~., data=dt, FUN=length), c("c", "o", "m", "b", "n"))
# c o m b n
#1 0 0 0 0 3
#2 1 0 0 1 2
#3 1 1 1 1 5
Or, shorter:
aggregate(cbind(n = 1:nrow(dt))~., dt, length)
# 9 10 11 12 n
#1 0 0 0 0 3
#2 1 0 0 1 2
#3 1 1 1 1 5
回答3:
You could try the following approach using only base R:
as.data.frame(table(comb = do.call(paste, dt)))
# comb Freq
#1 0 0 0 0 3
#2 1 0 0 1 2
#3 1 1 1 1 5
回答4:
Maybe that too : table(apply(dt, 1, paste, collapse = ","))
回答5:
Also in base R:
Use unique.matrix to get the list of unique combinations.
uncs <- unique.matrix(as.matrix(df), MARGIN = 1)
Then make comparisons and count:
cnts <- colSums(apply(uncs, 1, function(r) apply(dt, 1, function(r2) all(r == r2))))
cbind(comb = apply(uncs, 1, paste), n = cnts)
来源:https://stackoverflow.com/questions/34312324/r-count-all-combinations