Find all sequences with the same column value

后端未结

关注

 9  972

误落风尘 2020-12-17 18:35

I have the following data frame:

╔══════╦═════════╗
║ Code ║ Airline ║
╠══════╬═════════╣
║    1 ║ AF      ║
║    1 ║ KL      ║
║    8 ║ AR      ║
║    8 ║ A


      
      
        
          9条回答        

        
                    
            
            
                         
                
              
              
                
                   鱼传尺愫
                                             
                
                
                (楼主)
            
              
              
                2020-12-17 18:49
              

            
            
                        
Several options using the data.table package:

1) Using strsplit, paste & operate by row:

library(data.table)
setDT(dat)[, Airline := trimws(Airline)  # this step is needed to remove the leading and trailing whitespaces
           ][, sharedwith := paste(Airline, collapse = ','), Code
            ][, sharedwith := paste(unlist(strsplit(sharedwith,','))[!unlist(strsplit(sharedwith,',')) %in% Airline], 
                                    collapse = ','), 1:nrow(dat)]


which gives:

> dat
   Code Airline sharedwith
1:    1      AF         KL
2:    1      KL         AF
3:    8      AR      AZ,DL
4:    8      AZ      AR,DL
5:    8      DL      AR,AZ


2) Using strsplit & paste with mapply instead of by = 1:nrow(dat):

setDT(dat)[, Airline := trimws(Airline)
           ][, sharedwith := paste(Airline, collapse = ','), Code
             ][, sharedwith := mapply(function(s,a) paste(unlist(strsplit(s,','))[!unlist(strsplit(s,',')) %in% a], 
                                                          collapse = ','),
                                      sharedwith, Airline)][]


which will give you the same result.

3) Or by using the CJ function with paste (inspired by the expand.grid solution of @zx8754):

library(data.table)
setDT(dat)[, Airline := trimws(Airline)
           ][, CJ(air=Airline, Airline,  unique=TRUE)[air!=V2][, .(shared=paste(V2,collapse=',')), air],
             Code]


which gives:

   Code air shared
1:    1  AF     KL
2:    1  KL     AF
3:    8  AR  AZ,DL
4:    8  AZ  AR,DL
5:    8  DL  AR,AZ




A solution with dplyr & tidyr to get the desired solution (inspired by @jaimedash):

library(dplyr)
library(tidyr)

dat <- dat %>% mutate(Airline = trimws(as.character(Airline)))

dat %>%
  mutate(SharedWith = Airline) %>% 
  group_by(Code) %>%
  nest(-Code, -Airline, .key = SharedWith) %>%
  left_join(dat, ., by = 'Code') %>%
  unnest() %>%
  filter(Airline != SharedWith) %>%
  group_by(Code, Airline) %>%
  summarise(SharedWith = toString(SharedWith))


which gives:

   Code Airline SharedWith
  (int)   (chr)      (chr)
1     1      AF         KL
2     1      KL         AF
3     8      AR     AZ, DL
4     8      AZ     AR, DL
5     8      DL     AR, AZ

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它9个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复