Split columns in dataframe with NA

前端未结

关注

 5  2897

I have a df like this:

# Example data: a single column whose entries are "|"-separated values.
df <- data.frame(FOO = c("A|B|C", "A|B", "B|C", "A", "C"))

> df
    FOO
1 A|B|C
2   A|B
3   B|C
4     A
5     C


                      
              相关标签:


      
      
        
          5条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  半阙折子戏        
                
              
                            
                2021-02-20 05:21
              
            
            
                                                                       
Use unique and strsplit to find all unique values (A, B and C in this case). Use grep to search for the unique values, and return the values when there's a match or character(0) otherwise. cbind the resulting characters. Use apply and ifelse to replace character(0) with NA.

# Distinct values across all rows, in order of first appearance.
# Each value becomes one column of the result.
vals <- unique(unlist(strsplit(as.character(df$FOO), "|", fixed = TRUE)))

# Build the result column by column: for every value, ask each row whether
# it contains that value.
out <- NULL
for (i in vals) {
  hits <- lapply(as.character(df$FOO), function(x) {
    # The row itself ("A|B|C") is used as a regex alternation. Anchoring it
    # forces a whole-value match, so "A|B" does not match a value "AB" and
    # an empty row does not match everything.
    # NOTE: values containing regex metacharacters are still interpreted
    # as regex syntax by this approach.
    grep(paste0("^(", x, ")$"), i, value = TRUE)
  })
  # as.character() on a list deparses empty matches to "character(0)".
  out <- cbind(out, as.character(hits))
}

# Turn the "character(0)" placeholders into real missing values.
apply(out, 2, function(x) ifelse(x == "character(0)", NA, x))

     [,1] [,2] [,3]
[1,] "A"  "B"  "C" 
[2,] "A"  "B"  NA  
[3,] NA   "B"  "C" 
[4,] "A"  NA   NA  
[5,] NA   NA   "C" 

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  醉梦人生        
                
              
                            
                2021-02-20 05:23
              
            
            
                                                                       
Simply do:

# Split each row on the literal "|" separator.
splt <- strsplit(as.character(df$FOO), "|", fixed = TRUE)
# Sorted distinct values; each one becomes a column of the result.
all_val <- sort(unique(as.character(unlist(splt))))
# For every row, start from the full set of values and blank out (NA) the
# ones the row does not contain. vapply() pins the per-row result to a
# character vector of length(all_val).
filled <- vapply(
  splt,
  function(x) {
    row <- all_val
    row[!(row %in% x)] <- NA
    row
  },
  character(length(all_val))
)
# Rebuild the shape explicitly so the result is always rows x values, even
# when there is only one distinct value (where t(sapply(...)) would return
# a 1 x n matrix instead of n x 1).
matrix(
  as.vector(filled),
  nrow = length(splt),
  ncol = length(all_val),
  byrow = TRUE
)


#     [,1] [,2] [,3]
#[1,] "A"  "B"  "C" 
#[2,] "A"  "B"  NA  
#[3,] NA   "B"  "C" 
#[4,] "A"  NA   NA  
#[5,] NA   NA   "C" 


data:

# Sample input: one column of "|"-separated values.
df <- data.frame(
  FOO = c("A|B|C", "A|B", "B|C", "A", "C")
)


Please note:

My version is base:: (no libraries needed) and general:

It would also work with:

# Extended sample input: the last row introduces values (D, F) that no
# other row contains.
df <- data.frame(
  FOO = c("A|B|C", "A|B", "B|C", "A", "C", "B|D|F")
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  没有蜡笔的小新        
                
              
                            
                2021-02-20 05:23
              
            
            
                                                                       
You can try a tidyverse approach as well:

library(tidyverse)
df %>%
  # Keep the original row number so values can be regrouped per input row.
  rownames_to_column() %>%
  # One row per individual value.
  separate_rows(FOO, sep = "[|]") %>%
  # Map each distinct value to a column label X1, X2, ... following the
  # factor's level order.
  mutate(L = factor(FOO, labels = paste0("X", seq_along(unique(FOO))))) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE orders
  # the new columns by factor level rather than by first appearance.
  pivot_wider(names_from = L, values_from = FOO, names_sort = TRUE) %>%
  # Drop the helper row-number column.
  select(-1)
    X1   X2   X3
1    A    B    C
2    A    B <NA>
3 <NA>    B    C
4    A <NA> <NA>
5 <NA> <NA>    C


It also works in general, e.g. with df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F')). In addition, you can set the level order yourself (e.g. B > C > A) by passing levels = c("B", "C", "A") to factor() in the mutate step.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  南旧        
                
              
                            
                2021-02-20 05:29
              
            
            
                                                                       
Overlooked that OP asked for a base R solution. Please try @AndreElrico's, @r.user.05apr's or @milan's solutions.



This can be done with cSplit_e from the splitstackshape package:

library(splitstackshape)
# Expand FOO into one FOO_<value> column per distinct value. Cells for
# values a row does not contain are filled with a single space, and the
# original FOO column is dropped.
cSplit_e(
  df,
  split.col = "FOO", sep = "|",
  type = "character", mode = "value",
  drop = TRUE, fill = " "
)
#  FOO_A FOO_B FOO_C
#1     A     B     C
#2     A     B      
#3           B     C
#4     A            
#5                 C




This also works for the following df (see OP's comment above).

# Extended sample input; the bare name on the last line prints it.
df1 <- data.frame(
  FOO = c("A|B|C", "A|B", "B|C", "A", "C", "B|D|F")
)
df1
#    FOO
#1 A|B|C
#2   A|B
#3   B|C
#4     A
#5     C
#6 B|D|F

# Same call as for df, with every option spelled out by name.
cSplit_e(
  df1,
  split.col = "FOO",
  sep = "|",
  mode = "value",
  type = "character",
  drop = TRUE,
  fill = " "
)
#  FOO_A FOO_B FOO_C FOO_D FOO_F
#1     A     B     C            
#2     A     B                  
#3           B     C            
#4     A                        
#5                 C            
#6           B           D     F

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  死守一世寂寞        
                
              
                            
                2021-02-20 05:43
              
            
            
                                                                       
In base R:

df <- data.frame(FOO = c("A|B|C", "A|B", "B|C", "A", "C"))

# Split each row on the literal "|" separator.
dummy <- strsplit(as.character(df$FOO), "|", fixed = TRUE)

# Sorted distinct values; each one becomes a column of the result.
values <- sort(unique(as.character(unlist(dummy))))

# For every row, keep the values it contains and blank the others with " ".
# vapply() returns one column per input row, so the cells are laid back out
# row-wise below.
cells <- vapply(
  dummy,
  function(parts) {
    row <- values
    row[!(values %in% parts)] <- " "
    row
  },
  character(length(values))
)

# Pure base R reshape (no reshape2 needed). Every input row is kept, even
# one that contains no values at all.
want <- as.data.frame(
  matrix(
    as.vector(cells),
    nrow = length(dummy),
    ncol = length(values),
    byrow = TRUE
  ),
  stringsAsFactors = FALSE
)
names(want) <- paste0("X", seq_along(values))
want
# X1 X2 X3
#1  A  B  C
#2  A  B   
#3     B  C
#4  A      
#5        C

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复