Count unique values of a column by pairwise combinations of another column in R

前端未结

关注

 4  1248

Let\'s say I have the following data frame:


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  猫巷女王i        
                
              
                            
                2020-12-07 04:07
              
            
            
                                                                       
Using base only:

df <- data.frame(ID=c(1,1,1,2,2,2,3,3,3,3,4,4), 
                 code=c("A", "B", "C", "B", "C", "D", "C", "A", "D", "B", "D", "B"), stringsAsFactors =FALSE)
# Create data.frame of unique combinations of codes
e <- expand.grid(df$code, df$code)
e <- e[e[,1]!=e[,2],]
e1 <- as.data.frame(unique(t(apply(e, 1, sort))), stringsAsFactors = FALSE)

# Count the occurrence of each code combination across IDs
e1$count <- apply(e1, 1, function(y) 
                  sum(sapply(unique(df$ID), function(x) 
                             sum(y[1] %in% df$code[df$ID==x] & y[2] %in% df$code[df$ID==x]))))

# Turn the codes into a string and print output
out <- data.frame(Code.Combinations=do.call(paste, c(e1[,1:2], sep=", ")),
                  Count.of.ID=e1$count, stringsAsFactors = FALSE)


out
#   Code.Combinations Count.of.ID
# 1              A, B           2
# 2              A, C           2
# 3              A, D           1
# 4              B, C           3
# 5              B, D           3
# 6              C, D           2

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野趣味        
                
              
                            
                2020-12-07 04:18
              
            
            
                                                                       
Below makes use of combinations from the gtools package as well as count from the plyr package.

library(gtools)
library(plyr)

PairWiseCombo <- function(df) {
    myID <- df$ID
    BreakDown <- rle(myID)
    Unis <- BreakDown$values
    numUnis <- BreakDown$lengths
    Len <- length(Unis)
    e <- cumsum(numUnis)
    s <- c(1L, e + 1L)

    ## more efficient to generate outside of the "do.call(c, lapply(.."
    ## below. This allows me to reference a particular combination 
    ## rather than re-generating the same combination multiple times
    myCombs <- lapply(2:max(numUnis), function(x) combinations(x,2L))

    tempDF <- plyr::count(do.call(c, lapply(1:Len, function(i) {
                myRange <- s[i]:e[i]
                combs <- myCombs[[numUnis[i]-1L]]
                vapply(1:nrow(combs), function(j) paste(sort(df$Code[myRange[combs[j,]]]), collapse = ","), "A,D")
              })))

    names(tempDF) <- c("Code.Combinations", "Count.of.ID")
    tempDF
}


Below are some metrics. I didn't test the solution by @Carl as it was giving different results than the other solutions.

set.seed(537)
ID <- do.call(c, lapply(1:100, function(x) rep(x, sample(2:26,1))))
temp <- rle(ID)
Code <- do.call(c, lapply(1:100, function(x) LETTERS[sample(temp$lengths[x])]))
TestDF <- data.frame(ID, Code, stringsAsFactors = FALSE)

system.time(t1 <- Noah(TestDF))
 user  system elapsed 
97.05    0.31   97.42

system.time(t2 <- DTSolution(TestDF))
 user  system elapsed 
0.43    0.00    0.42

system.time(t3 <- PairWiseCombo(TestDF))
 user  system elapsed 
0.42    0.00    0.42

identical(sort(t3[,2]),sort(t2$IdCount))
TRUE

identical(sort(t3[,2]),sort(t1[,2]))
TRUE


Using microbenchmark we have:

library(microbenchmark)
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF), times = 10L)
Unit: milliseconds
  expr      min       lq     mean   median       uq      max neval
Joseph 420.1090 433.9471 442.0133 446.4880 450.4420 452.7852    10
Psidom 396.8444 413.4933 416.3315 418.5573 420.9669 423.6303    10


Overall, the data.table solution provided by @Psidom was the fastest (not surprisingly). Both my solution and the data.table solution performed similarly on really large examples. However, the solution provided from @Noah is extremely memory intensive and couldn't be tested on larger data frames.

sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1




Update
After tweaking @Carl's solution, the dplyr approach is by far the fastest. Below is the code (you will see what parts I altered):

DPLYRSolution <- function(df) {
    df <- df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)

    ## These two lines were added by me to remove "duplicate" rows
    df <- mutate(df, Code=ifelse(Code.x < Code.y, paste(Code.x, Code.y), paste(Code.y, Code.x)))
    df[which(!duplicated(df$Code)), ]
}


Below are the new metrics:

system.time(t4 <- DPLYRSolution(TestDF))
 user  system elapsed 
 0.03    0.00    0.03     ### Wow!!! really fast

microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF),
               Carl = DPLYRSolution(TestDF), times = 10L)
Unit: milliseconds
  expr       min       lq      mean    median        uq       max neval
Joseph 437.87235 442.7348 450.91085 452.77204 457.09465 461.85035    10
Psidom 407.81519 416.9444 422.62793 425.26041 429.02064 434.38881    10
  Carl  44.33698  44.8066  48.39051  45.35073  54.06513  59.35653    10

## Equality Check
identical(sort(c(t4[,3])[[1]]), sort(t1[,2]))
[1] TRUE

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  长发绾君心        
                
              
                            
                2020-12-07 04:19
              
            
            
                                                                       
Assuming your data.frame is named df and using dplyr

df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)


Join the df with itself and then count by the groups
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  刺人心        
                
              
                            
                2020-12-07 04:27
              
            
            
                                                                       
Here is a data.table way to solve the problem. Use combn function to pick up all possible combinations of Code and then count ID for each unique CodeComb:

library(data.table)
setDT(df)[, .(CodeComb = sapply(combn(Code, 2, simplify = F), 
                                function(cmb) paste(sort(cmb), collapse = ", "))), .(ID)]
# list all combinations of Code for each ID
         [, .(IdCount = .N), .(CodeComb)]    
# count number of unique id for each code combination

#    CodeComb IdCount
# 1:     A, B       2
# 2:     A, C       2
# 3:     B, C       3
# 4:     B, D       3
# 5:     C, D       2
# 6:     A, D       1

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复