Create a new column with non-null columns' names

前端 未结 4 1109
醉话见心
醉话见心 2020-12-11 06:16

My data set looks like this one:

library(data.table)

df <- data.table(a = c(1,2,3,4,5),
                 b = c(1,0,2,5,1),
                 c = c(0,1,1,         


        
相关标签:
4条回答
  • 2020-12-11 06:23

    Assuming nrow >> ncol, you could work columnwise

    ff = function(x)
    {
        ans = character(nrow(x))
        for(j in seq_along(x)) {
            i = x[[j]] > 0L
            ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
        }
        return(gsub("^_", "", ans))
    }
    ff(df[, -1L, with = FALSE]) #or, `df[, ff(.SD), .SDcols = -1L]` from David Arenburg
    #[1] "b_d" "c"   "b_c" "b_d" "b_d"
    
    0 讨论(0)
  • 2020-12-11 06:27

    This can be a bit lengthy.

    For every row finding a column whose value is not 0 and then pasting the column names together.

    data.table(a= df$a, z = lapply(apply(df, 1, 
               function(x) which(x[-1]!= 0)), 
               function(x) paste0(names(x), collapse = "_")))
    
    
    #   a   z
    #1: 1 b_d
    #2: 2   c
    #3: 3 b_c
    #4: 4 b_d
    #5: 5 b_d
    
    0 讨论(0)
  • 2020-12-11 06:32

    One option would be to convert the format from 'wide' to 'long' using melt. Grouped by 'a', we paste the 'variable' elements that corresponds to non-zero elements in 'value' (provided as logical condition in 'i').

    melt(df, id.var='a')[value!=0, 
          .(z=paste(variable, collapse="_")), keyby =a]
    #   a   z
    #1: 1 b_d
    #2: 2   c
    #3: 3 b_c
    #4: 4 b_d
    #5: 5 b_d
    

    Or instead of melting, we can group by 'a', unlist the Subset of Data.table (.SD) and paste the names of the columns that corresponds to non-zero elements ('i1').

    df[, {i1 <- !!unlist(.SD)
           paste(names(.SD)[i1], collapse="_")} , by= a]
    

    Benchmarks

    set.seed(24)
    df1 <- data.table(a=1:1e6, b = sample(0:5, 1e6, 
       replace=TRUE), c = sample(0:4, 1e6, replace=TRUE), 
        d = sample(0:3, 1e6, replace=TRUE))
    
    akrun1 <- function() {
       melt(df1, id.var='a')[value!=0, 
          .(z=paste(variable, collapse="_")), keyby =a]
        }
    
     akrun2 <- function() {
       df1[, {i1 <- !!unlist(.SD)
           paste(names(.SD)[i1], collapse="_")} , by= a]
       }
    
     ronak <- function() {
        data.table(z = lapply(apply(df1, 1, function(x)
                    which(x[-1]!= 0)), 
           function(x) paste0(names(x), collapse = "_")))
       }
    
    eddi <- function(){
     df1[, newcol := gsub("NA_|_NA|NA", "",                          
       do.call(function(...) paste(..., sep = "_"),            
         Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
     , .SDcols = b:d]
    
     }
    
    alexis = function(x)
       {
       ans = character(nrow(x))
       for(j in seq_along(x)) {
        i = x[[j]] > 0L
        ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
       }
      return(gsub("^_", "", ans))
    }
    
    
    
    
    
    system.time(akrun1())
    #   user  system elapsed 
    #  22.04    0.15   22.36 
     system.time(akrun2())
    #   user  system elapsed 
    # 26.33    0.00   26.41 
     system.time(ronak())
    #   user  system elapsed 
    #  25.60    0.26   25.96 
    
    
    system.time(alexis(df1[, -1L, with = FALSE]))
    #   user  system elapsed 
    #   1.92    0.06    2.09 
    
    system.time(eddi())
    #  user  system elapsed 
    #   2.41    0.06    3.19 
    
    0 讨论(0)
  • 2020-12-11 06:40

    Here's a direct approach:

    df[, newcol := gsub("NA_|_NA|NA", "",                           # remove unwanted text
           do.call(function(...) paste(..., sep = "_"),             # paste colnames together
             Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # convert data to colnames
       , .SDcols = b:d]
    #   a b c d newcol
    #1: 1 1 0 1    b_d
    #2: 2 0 1 0      c
    #3: 3 2 1 0    b_c
    #4: 4 5 0 2    b_d
    #5: 5 1 0 2    b_d
    

    On akrun's test data it's >10x faster.

    0 讨论(0)
提交回复
热议问题