Split a string column into several dummy variables


As a relatively inexperienced user of the data.table package in R, I've been trying to process one text column into a large number of indicator columns (dummy variables) …

6 Answers

      # split the `messy_string` and create a long table, keeping track of the id
      DT2 <- setkey(DT[, list(val=unlist(strsplit(messy_string, "\\$"))), by=list(ID, messy_string)], "val")
    
      # add the columns, initialize to 0
      DT2[, c(elements_list) := 0L]
      # a warning is expected here, re: adding a large number of columns
    
    
      # iterate over each value in elements_list, assigning 1s as appropriate
      for (el in elements_list)
         DT2[el, c(el) := 1L]
    
      # sum by ID
      DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]
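
    As a quick usage illustration, here is a tiny self-contained run of the same steps on made-up toy data (the values and the three-element `elements_list` below are purely for illustration):

      library(data.table)
      elements_list <- c("aa", "bb", "cc")
      DT <- data.table(ID = 1:3,
                       messy_string = c("aa$bb", "cc", "aa$cc$bb"),
                       key = "ID")

      # same steps as above: long table keyed on the split values ...
      DT2 <- setkey(DT[, list(val = unlist(strsplit(messy_string, "\\$"))),
                       by = list(ID, messy_string)], "val")
      # ... indicator columns initialized to 0, flipped to 1 where present ...
      DT2[, c(elements_list) := 0L]
      for (el in elements_list) DT2[el, c(el) := 1L]
      # ... and summed back to one row per ID with 0/1 dummy columns
      DT2[, lapply(.SD, sum), by = list(ID, messy_string), .SDcols = elements_list]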
    

    Note that we carry along the messy_string column because doing so is cheaper than leaving it behind and then joining on ID to get it back. If you don't need it in the final output, just delete it above.
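
    For instance, a minimal sketch (reusing `DT2` and `elements_list` from above) that leaves `messy_string` out entirely would be to aggregate by `ID` alone:

      # aggregate by ID only, so messy_string never appears in the result
      res <- DT2[, lapply(.SD, sum), by=ID, .SDcols=elements_list]

      # alternatively, keep the original call and drop the column afterwards:
      # res <- DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]
      # res[, messy_string := NULL]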


    Benchmarks:

    Creating the sample data:

    # sample data, using OP's example
    set.seed(10)
    N <- 1e6  # number of rows
    elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))  
    messy_string_vec <- random_string_fast(N, 2, 5, "$")   # Create the messy strings in a single shot. 
    masterDT <- data.table(ID = c(1:N), messy_string = messy_string_vec, key="ID")   # create the data.table
    

    Side Note: It is significantly faster to create all of the random strings in one shot and assign the result as a single column than to call the function N times and assign each value one by one (for contrast, see the sketch after the function below).

      # Faster way to create the `messy_string` values
      random_string_fast <- function(N, min_length, max_length, separator) {  
        ints <- seq(from=min_length, to=max_length)
        replicate(N, paste(sample(elements_list, sample(ints, 1)), collapse=separator))
      }
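
    For contrast, here is a rough sketch of the one-value-at-a-time approach that the note above warns against (it reuses the `random_string` function listed further down under "Functions Used in benchmarking"):

      # slow alternative: call random_string() once per row and assign each value individually
      masterDT_slow <- data.table(ID = 1:N, messy_string = character(N), key = "ID")
      for (i in 1:N)
        set(masterDT_slow, i, "messy_string", random_string(2, 5, "$"))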
    

    Comparing Four Methods:

    • this answer -- "DT.RS"
    • @eddi's answer -- "Plyr.eddi"
    • @GeekTrader's answer -- "DT.GT"
    • @GeekTrader's answer with some modifications -- "DT.GT_Mod"

    Here is the setup:

    library(data.table); library(plyr); library(microbenchmark)
    
    # data.table method - RS
    usingDT.RS <- quote({DT <- copy(masterDT);
                        DT2 <- setkey(DT[, list(val=unlist(strsplit(messy_string, "\\$"))), by=list(ID, messy_string)], "val"); DT2[, c(elements_list) := 0L]
                        for (el in elements_list) DT2[el, c(el) := 1L]; DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]})
    
    # data.table method - GeekTrader
    usingDT.GT <- quote({dt <- copy(masterDT); myFunc()})
    
    # data.table method - GeekTrader, modified by RS
    usingDT.GT_Mod <- quote({dt <- copy(masterDT); myFunc.modified()})
    
    # plyr method (from @eddi's answer below)
    usingPlyr.eddi <- quote({dt <- copy(masterDT); indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i) dt[i, data.frame(t(as.matrix(table(strsplit(messy_string, split = "\\$"))))) ])); 
                        dt = cbind(dt, indicators); dt[is.na(dt)] = 0; dt })
    

    Here are the benchmark results:

    microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), usingPlyr.eddi=eval(usingPlyr.eddi), times=5L)
    
    
      On smaller data: 
    
      N = 600
      Unit: milliseconds
                  expr       min        lq    median        uq       max
      1     usingDT.GT 1189.7549 1198.1481 1200.6731 1202.0972 1203.3683
      2 usingDT.GT_Mod  581.7003  591.5219  625.7251  630.8144  650.6701
      3     usingDT.RS 2586.0074 2602.7917 2637.5281 2819.9589 3517.4654
      4 usingPlyr.eddi 2072.4093 2127.4891 2225.5588 2242.8481 2349.6086
    
    
      N = 1,000 
      Unit: seconds
           expr      min       lq   median       uq      max
      1 usingDT.GT 1.941012 2.053190 2.196100 2.472543 3.096096
      2 usingDT.RS 3.107938 3.344764 3.903529 4.010292 4.724700
      3  usingPlyr 3.297803 3.435105 3.625319 3.812862 4.118307
    
      N = 2,500
      Unit: seconds
                  expr      min       lq   median       uq       max
      1     usingDT.GT 4.711010 5.210061 5.291999 5.307689  7.118794
      2 usingDT.GT_Mod 2.037558 2.092953 2.608662 2.638984  3.616596
      3     usingDT.RS 5.253509 5.334890 6.474915 6.740323  7.275444
      4 usingPlyr.eddi 7.842623 8.612201 9.142636 9.420615 11.102888
    
      N = 5,000
                  expr       min        lq    median        uq       max
      1     usingDT.GT  8.900226  9.058337  9.233387  9.622531 10.839409
      2 usingDT.GT_Mod  4.112934  4.293426  4.460745  4.584133  6.128176
      3     usingDT.RS  8.076821  8.097081  8.404799  8.800878  9.580892
      4 usingPlyr.eddi 13.260828 14.297614 14.523016 14.657193 16.698229
    
      # dropping the slower two from the tests:
      microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), times=6L)
    
      N = 10,000
      Unit: seconds
                  expr       min        lq    median        uq       max
      1 usingDT.GT_Mod  8.426744  8.739659  8.750604  9.118382  9.848153
      2     usingDT.RS 15.260702 15.564495 15.742855 16.024293 16.249556
    
      N = 25,000
      ... (still running)
    

    -----------------

    Functions Used in benchmarking:

      # original random string function
      random_string <- function(min_length, max_length, separator) {  
          selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)  
          return(selection)  
      }  
    
      # GeekTrader's function
      myFunc <- function() {
        ll <- strsplit(dt[,messy_string], split="\\$")
    
    
        COLS <- do.call(rbind, 
                        lapply(1:length(ll), 
                               function(i) {
                                 data.frame(
                                   ID= rep(i, length(ll[[i]])),
                                   COL = ll[[i]], 
                                   VAL= rep(1, length(ll[[i]]))
                                   )
                                 }
                               )
                        )
    
        res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
        dt <- cbind(dt, res)
        for (j in names(dt))
          set(dt,which(is.na(dt[[j]])),j,0)
        return(dt)
      }
    
    
      # Improvements to @GeekTrader's `myFunc` -- RS
      myFunc.modified <- function() {
        ll <- strsplit(dt[,messy_string], split="\\$")
    
        ## MODIFICATIONS: 
        # using `rbindlist` instead of `do.call(rbind.. )`
        COLS <- rbindlist( lapply(1:length(ll), 
                               function(i) {
                                 data.frame(
                                   ID= rep(i, length(ll[[i]])),
                                   COL = ll[[i]], 
                                   VAL= rep(1, length(ll[[i]])), 
      # MODIFICATION: not coercing to factors
                                   stringsAsFactors = FALSE
                                   )
                                 }
                               )
                        )
    
      # MODIFICATION: keep the output of tapply as a matrix
        res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )
    
      # FLATTEN into a data.table
        resdt <- data.table(r=c(res2))
    
      # FIND & REPLACE NA's of single column
        resdt[is.na(r), r:=0L]
    
      # cbind dt with a matrix rebuilt from the flattened values, reusing the dimensions and dimnames of `res2`
        cbind(dt, 
              matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
      }
    
    
      ### Benchmarks comparing the two versions of GeekTrader's function: 
      orig = quote({dt <- copy(masterDT); myFunc()})
      modified = quote({dt <- copy(masterDT); myFunc.modified()})
      microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)
    
      #  Unit: milliseconds
      #        expr      min        lq   median       uq      max
      #  1 Modified  895.025  971.0117 1011.216 1189.599 2476.972
      #  2     Orig 1953.638 2009.1838 2106.412 2230.326 2356.802
    
