Create a new column with non-null columns' names

前端未结

关注

 4  1111

My data set looks like this one:

library(data.table)

df <- data.table(a = c(1,2,3,4,5),
                 b = c(1,0,2,5,1),
                 c = c(0,1,1,


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  逝去的感伤        
                
              
                            
                2020-12-11 06:23
              
            
            
                                                                       
Assuming nrow >> ncol, you could work columnwise

ff = function(x)
{
    ans = character(nrow(x))
    for(j in seq_along(x)) {
        i = x[[j]] > 0L
        ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
    }
    return(gsub("^_", "", ans))
}
ff(df[, -1L, with = FALSE]) #or, `df[, ff(.SD), .SDcols = -1L]` from David Arenburg
#[1] "b_d" "c"   "b_c" "b_d" "b_d"

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  自闭症患者        
                
              
                            
                2020-12-11 06:27
              
            
            
                                                                       
This can be a bit lengthy.

For every row finding a column whose value is not 0 and then pasting the column names together.

data.table(a= df$a, z = lapply(apply(df, 1, 
           function(x) which(x[-1]!= 0)), 
           function(x) paste0(names(x), collapse = "_")))


#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  刺人心        
                
              
                            
                2020-12-11 06:32
              
            
            
                                                                       
One option would be to convert the format from 'wide' to 'long' using melt.  Grouped by 'a', we paste the 'variable' elements that corresponds to non-zero elements in 'value' (provided as logical condition in 'i').  

melt(df, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d




Or instead of melting, we can group by 'a', unlist the Subset of Data.table (.SD) and paste the names of the columns that corresponds to non-zero elements ('i1').

df[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]


Benchmarks

set.seed(24)
df1 <- data.table(a=1:1e6, b = sample(0:5, 1e6, 
   replace=TRUE), c = sample(0:4, 1e6, replace=TRUE), 
    d = sample(0:3, 1e6, replace=TRUE))

akrun1 <- function() {
   melt(df1, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
    }

 akrun2 <- function() {
   df1[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]
   }

 ronak <- function() {
    data.table(z = lapply(apply(df1, 1, function(x)
                which(x[-1]!= 0)), 
       function(x) paste0(names(x), collapse = "_")))
   }

eddi <- function(){
 df1[, newcol := gsub("NA_|_NA|NA", "",                          
   do.call(function(...) paste(..., sep = "_"),            
     Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
 , .SDcols = b:d]

 }

alexis = function(x)
   {
   ans = character(nrow(x))
   for(j in seq_along(x)) {
    i = x[[j]] > 0L
    ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
   }
  return(gsub("^_", "", ans))
}





system.time(akrun1())
#   user  system elapsed 
#  22.04    0.15   22.36 
 system.time(akrun2())
#   user  system elapsed 
# 26.33    0.00   26.41 
 system.time(ronak())
#   user  system elapsed 
#  25.60    0.26   25.96 


system.time(alexis(df1[, -1L, with = FALSE]))
#   user  system elapsed 
#   1.92    0.06    2.09 

system.time(eddi())
#  user  system elapsed 
#   2.41    0.06    3.19 

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦毁少年i        
                
              
                            
                2020-12-11 06:40
              
            
            
                                                                       
Here's a direct approach:

df[, newcol := gsub("NA_|_NA|NA", "",                           # remove unwanted text
       do.call(function(...) paste(..., sep = "_"),             # paste colnames together
         Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # convert data to colnames
   , .SDcols = b:d]
#   a b c d newcol
#1: 1 1 0 1    b_d
#2: 2 0 1 0      c
#3: 3 2 1 0    b_c
#4: 4 5 0 2    b_d
#5: 5 1 0 2    b_d


On akrun's test data it's >10x faster.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复