dplyr - filter by group size

后端 未结 6 1912
渐次进展
渐次进展 2020-11-28 15:12

What is the best way to filter a data.frame so that only groups of a given size (say, 5) are kept?

So my data looks as follows:

require(dplyr)
n <- 1e5
x <- rnorm(n         


        
6条回答
  •  再見小時候
    2020-11-28 15:45

    Here's another dplyr approach you can try:

    # Count the rows in each `cat`, keep the categories occurring exactly 5
    # times, then retain only the rows of `dat` that fall in those categories.
    dat %>%
      semi_join(filter(count(dat, cat), n == 5), by = "cat")
    

    --

    Here's another approach based on OP's original approach with a little modification:

    # Number of rows in the simulated data set.
    # NOTE(review): no set.seed() is visible here, so each run draws new data.
    n <- 1e5
    x <- rnorm(n)
    # Category size ranging each from 1 to 5
    # NOTE(review): n/3 is not a whole number, and the sampled sizes need not
    # sum to at least n; when they fall short, `[1:n]` pads the tail with NA.
    cat <- rep(seq_len(n/3), sample(1:5, n/3, replace = TRUE))[1:n]
    
    dat <- data.frame(x = x, cat = cat)
    
    # Second data set for the dt approach (same x and cat as `dat`)
    dat2 <- data.frame(x = x, cat = cat)
    
    # Keep only the rows of `dat` whose `cat` group has exactly 5 rows.
    #
    # dat: data frame with a `cat` column identifying each row's group.
    # Returns the selected rows, taken from the data grouped by `cat`.
    #
    # Every row is mapped to its group through group_indices(), so the
    # selection is correct even when rows of one `cat` are not contiguous
    # (the earlier rep()-based index assumed the data was sorted by `cat`).
    sol_floo0 <- function(dat){
      dat <- group_by(dat, cat)
      # Group number of every row, in row order.
      all_ind <- group_indices(dat)
      # Group numbers whose size is exactly 5.
      take_only <- which(group_size(dat) == 5L)
      dat[all_ind %in% take_only, ]
    }
    
    # Keep only the rows of `dat` whose `cat` group has exactly 5 rows,
    # without the `%in%` lookup used by sol_floo0().
    #
    # dat: data frame with a `cat` column identifying each row's group.
    # Returns the selected rows of the (ungrouped) input `dat`.
    #
    # The per-group flag is looked up per row via group_indices(), so rows
    # do not have to be sorted by `cat` (the earlier rep(g == 5, g) mask
    # only lined up with the rows when they were).
    sol_floo0_v2 <- function(dat){
      grouped <- group_by(dat, cat)
      # TRUE for each group whose size is exactly 5, in group order.
      is_five <- group_size(grouped) == 5
      # Expand the per-group flag to one value per row.
      dat[is_five[group_indices(grouped)], ]
    }
    
    
    
    # Time both implementations on data sets with identical contents,
    # evaluating each expression 10 times.
    microbenchmark::microbenchmark(times = 10,
                                   sol_floo0(dat),
                                   sol_floo0_v2(dat2))
    # Output recorded by the author (timings vary by machine and run):
    #Unit: milliseconds
    #               expr      min       lq     mean   median       uq      max neval cld
    #     sol_floo0(dat) 43.72903 44.89957 45.71121 45.10773 46.59019 48.64595    10   b
    # sol_floo0_v2(dat2) 29.83724 30.56719 32.92777 31.97169 34.10451 38.31037    10  a 
    # Check that the two implementations produce equal results.
    all.equal(sol_floo0(dat), sol_floo0_v2(dat2))
    #[1] TRUE
    

提交回复
热议问题