Count of unique elements of each row in a data frame in R

前端未结

关注

 3  450

I have a data frame like below:

Group1  Group2  Group3  Group4
A       B       A       B   
A       C       B       A   
B       B       B       B   
A


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  被撕碎了的回忆        
                
              
                            
                2020-12-19 14:55
              
            
            
                                                                       
duplicated in base R:

df$Count <- apply(df,1,function(x) sum(!duplicated(x)))

#  Group1 Group2 Group3 Group4 Count
#1      A      B      A      B     2
#2      A      C      B      A     3
#3      B      B      B      B     1
#4      A      C      B      D     4
#5      A      D      C      A     3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  死守一世寂寞        
                
              
                            
                2020-12-19 14:57
              
            
            
                                                                       
Athough there are some pretty great solutions mentioned over here, You can also use, data.table :

DATA:

df <- data.frame(g1 = c("A","A","B","A","A"),g2 = c("B", "C", "B","C","D"),g3 = c("A","B","B","B","C"),g4 = c("B","A","B","D","A"),stringsAsFactors = F)


Code:

EDIT: After the David Arenberg's comment,added (.I) instead of 1:nrow(df). Thanks for valuable comments

library(data.table)
setDT(df)[, id := .I ]
df[, count := uniqueN(c(g1, g2, g3, g4)), by=id ]
df


Output:

> df
   g1 g2 g3 g4 id count
1:  A  B  A  B  1     2
2:  A  C  B  A  2     3
3:  B  B  B  B  3     1
4:  A  C  B  D  4     4
5:  A  D  C  A  5     3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  感动是毒        
                
              
                            
                2020-12-19 15:05
              
            
            
                                                                       
We can use apply with MARGIN =1 to loop over the rows

df1$Count <- apply(df1, 1, function(x) length(unique(x)))
df1$Count
#[1] 2 3 1 4 3




Or using tidyverse

library(dplyr)
df1 %>%
    rowwise() %>%
    do(data.frame(., Count = n_distinct(unlist(.))))
# A tibble: 5 × 5
#   Group1 Group2 Group3 Group4 Count
#*  <chr>  <chr>  <chr>  <chr> <int>
#1      A      B      A      B     2
#2      A      C      B      A     3
#3      B      B      B      B     1
#4      A      C      B      D     4
#5      A      D      C      A     3




We can also use regex to do this in a faster way.  It is based on the assumption that there is only a single character per each cell

nchar(gsub("(.)(?=.*?\\1)", "", do.call(paste0, df1), perl = TRUE))
#[1] 2 3 1 4 3


More detailed explanation is given here
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复