Count unique values for every column

前端未结

关注

 8  1079

I would like to return the count of the unique values for every column in a table. For example, if I have the table:

 Testdata <- data.frame(var_1 = c(\"a


                      
              相关标签:


      
      
        
          8条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  再見小時候        
                
              
                            
                2020-12-05 14:54
              
            
            
                                                                       
In dplyr:

Testdata %>% summarise_all(n_distinct)


                                                                    

                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  星月不相逢        
                
              
                            
                2020-12-05 14:55
              
            
            
                                                                       
I tried all of the solutions above, and two of them did not work for me: the one using aggregate and the one using tidyr. I think using data.table is a good choice:

setDT(Testdata)[, lapply(.SD, uniqueN), .SDcols=c("var_1","var_2","var_3")]
   #    var_1 var_2 var_3
   # 1:     1     1     3


I also tried to compare them with each other:

library(microbenchmark)
Mycomp = microbenchmark(
  apply = apply(Testdata, 2, function(x)length(unique(x))),
  lapply = lapply(Testdata, function(x)length(unique(x))),
  sapply = sapply(Testdata, function(x)length(unique(x))),
  #base = aggregate(values ~ ind, unique(stack(Testdata)), length),
  datatable = setDT(Testdata)[, lapply(.SD, uniqueN), .SDcols=c("var_1","var_2","var_3")],
  times=50
)

#Unit: microseconds
#      expr     min      lq     mean   median      uq     max neval cld
#     apply 163.315 176.678 192.0435 181.7915 192.047 608.859    50  b 
#    lapply 138.217 147.339 157.9684 153.0640 165.829 254.145    50 a  
#    sapply 160.338 169.124 178.1486 174.3965 185.548 203.419    50  b 
# datatable 667.937 684.650 698.1306 696.0160 703.390 874.073    50   c

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  情书的邮戳        
                
              
                            
                2020-12-05 14:56
              
            
            
                                                                       
library(purrr)
Testdata %>% map_dbl(n_distinct)
var_1 var_2 var_3 
    1     1     3 

# in your format
Testdata %>% map_dbl(n_distinct)%>%melt(value.name = "unique_counts")
      unique_counts
var_1             1
var_2             1
var_3             3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  长发绾君心        
                
              
                            
                2020-12-05 15:03
              
            
            
                                                                       
This is actually an improvement on the comment by @Ananda Mahto. It didn't fit in a comment, so I decided to add it as an answer.

sapply is actually marginally faster than lapply, and gives the output in a more compact form, just like the output from apply.

A test run result on actual data:

> start <- Sys.time()
> apply(datafile, 2, function(x)length(unique(x)))
          symbol.           date     volume 
             1371            261      53647 
> Sys.time() - start
Time difference of 1.619567 secs
> 
> start <- Sys.time()
> lapply(datafile, function(x)length(unique(x)))
$symbol.
[1] 1371

$date
[1] 261

$volume
[1] 53647

> Sys.time() - start
Time difference of 0.07129478 secs
> 
> start <- Sys.time()
> sapply(datafile, function(x)length(unique(x)))
          symbol.              date             volume 
             1371               261              53647 
> Sys.time() - start
Time difference of 0.06939292 secs


The datafile has around 3.5 million rows.

Quoting the help text:


  sapply is a user-friendly version and wrapper of lapply by default
  returning a vector, matrix or, if simplify = "array", an array if
  appropriate, by applying simplify2array(). sapply(x, f, simplify =
  FALSE, USE.NAMES = FALSE) is the same as lapply(x, f).

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  北荒        
                
              
                            
                2020-12-05 15:03
              
            
            
                                                                       
Here, I've used dplyr and tidyr to count (using your Testdata data frame):

Testdata %>% 
  gather(var, value) %>% 
  distinct() %>% 
  count(var)

# # A tibble: 3 × 2
#     var     n
#   <chr> <int>
# 1 var_1     1
# 2 var_2     1
# 3 var_3     3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  孤城傲影        
                
              
                            
                2020-12-05 15:04
              
            
            
                                                                       
Using the lengths function:

lengths(lapply(Testdata, unique))

# var_1 var_2 var_3 
#     1     1     3 

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     1
2
下一页
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复