Split a string column into several dummy variables

前端 未结 6 682
醉梦人生
醉梦人生 2020-12-01 22:13

As a relatively inexperienced user of the data.table package in R, I\'ve been trying to process one text column into a large number of indicator columns (dummy variables), w

6条回答
  •  执念已碎
    2020-12-01 22:19

    UPDATE : VERSION 3

    Found even faster way. This function is also highly memory efficient. Primary reason previous function was slow because of copy/assignments happening inside lapply loop as well as rbinding of the result.

    In following version, we preallocate matrix with appropriate size, and then change values at appropriate coordinates, which makes it very fast compared to other looping versions.

    funcGT3 <- function() {
        #Get list of column names in result
        resCol <- unique(dt[, unlist(strsplit(messy_string, split="\\$"))])
    
        #Get dimension of result
        nresCol <- length(resCol)
        nresRow <- nrow(dt)
    
        #Create empty matrix with dimensions same as desired result
        mat <- matrix(rep(0, nresRow * nresCol), nrow = nresRow, dimnames = list(as.character(1:nresRow), resCol))
    
        #split each messy_string by $
        ll <- strsplit(dt[,messy_string], split="\\$")
    
        #Get coordinates of mat which we need to set to 1
        coords <- do.call(rbind, lapply(1:length(ll), function(i) cbind(rep(i, length(ll[[i]])), ll[[i]] )))
    
        #Set mat to 1 at appropriate coordinates
        mat[coords] <- 1    
    
        #Bind the mat to original data.table
        return(cbind(dt, mat))
    
    }
    
    
    result <- funcGT3()  #result for 1000 rows in dt
    result
            ID   messy_string zn tc sv db yx st ze qs wq oe cv ut is kh kk im le qg rq po wd kc un ft ye if zl zt wy et rg iu
       1:    1 zn$tc$sv$db$yx  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
       2:    2    st$ze$qs$wq  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
       3:    3    oe$cv$ut$is  0  0  0  0  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
       4:    4 kh$kk$im$le$qg  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
       5:    5    rq$po$wd$kc  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0
      ---                                                                                                                    
     996:  996    rp$cr$tb$sa  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
     997:  997    cz$wy$rj$he  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
     998:  998       cl$rr$bm  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
     999:  999    sx$hq$zy$zd  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    1000: 1000    bw$cw$pw$rq  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
    

    Benchmark againt version 2 suggested by Ricardo (this is for 250K rows in data) :

    Unit: seconds
     expr       min        lq    median        uq       max neval
      GT2 104.68672 104.68672 104.68672 104.68672 104.68672     1
      GT3  15.15321  15.15321  15.15321  15.15321  15.15321     1
    

    VERSION 1 Following is version 1 of suggested answer

    set.seed(10)  
    elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))  
    random_string <- function(min_length, max_length, separator) {  
      selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)  
      return(selection)  
    }  
    dt <- data.table(ID = c(1:1000), messy_string = "")  
    dt[ , messy_string := random_string(2, 5, "$"), by = ID]  
    
    
    myFunc <- function() {
      ll <- strsplit(dt[,messy_string], split="\\$")
    
    
      COLS <- do.call(rbind, 
                      lapply(1:length(ll), 
                             function(i) {
                               data.frame(
                                 ID= rep(i, length(ll[[i]])),
                                 COL = ll[[i]], 
                                 VAL= rep(1, length(ll[[i]]))
                                 )
                               }
                             )
                      )
    
      res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
      dt <- cbind(dt, res)
      for (j in names(dt))
        set(dt,which(is.na(dt[[j]])),j,0)
      return(dt)
    }
    
    
    create_indicators <- function(search_list, searched_string) {  
      y <- rep(0, length(search_list))  
      for(j in 1:length(search_list)) {  
        x <- regexpr(search_list[j], searched_string)  
        x <- x[1]  
        y[j] <- ifelse(x > 0, 1, 0)  
      }  
      return(y)  
    }  
    OPFunc <- function() {
    indicators <- matrix(0, nrow = nrow(dt), ncol = length(elements_list))  
    for(n in 1:nrow(dt)) {  
      indicators[n, ] <- dt[n, create_indicators(elements_list, messy_string)]  
    }  
    indicators <- data.table(indicators)  
    setnames(indicators, elements_list)  
    dt <- cbind(dt, indicators)
    return(dt)
    }
    
    
    
    library(plyr)
    plyrFunc <- function() {
      indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i)
        dt[i,
           data.frame(t(as.matrix(table(strsplit(messy_string,
                                                 split = "\\$")))))
           ]))
      dt = cbind(dt, indicators)
      #dt[is.na(dt)] = 0 #THIS DOESN'T WORK. USING FOLLOWING INSTEAD
    
      for (j in names(dt))
        set(dt,which(is.na(dt[[j]])),j,0)
    
      return(dt)  
    }
    

    BENCHMARK

    system.time(res <- myFunc())
    ## user  system elapsed 
    ## 1.01    0.00    1.01
    
    system.time(res2 <- OPFunc())
    ## user  system elapsed 
    ## 21.58    0.00   21.61
    
    system.time(res3 <- plyrFunc())
    ## user  system elapsed 
    ## 1.81    0.00    1.81 
    

    VERSION 2 : Suggested by Ricardo

    I'm posting this here instead of in my answer as the framework is really @GeekTrader's -Rick_

      myFunc.modified <- function() {
        ll <- strsplit(dt[,messy_string], split="\\$")
    
        ## MODIFICATIONS: 
        # using `rbindlist` instead of `do.call(rbind.. )`
        COLS <- rbindlist( lapply(1:length(ll), 
                               function(i) {
                                 data.frame(
                                   ID= rep(i, length(ll[[i]])),
                                   COL = ll[[i]], 
                                   VAL= rep(1, length(ll[[i]])), 
      # MODICIATION:  Not coercing to factors                             
                                   stringsAsFactors = FALSE
                                   )
                                 }
                               )
                        )
    
      # MODIFICATION: Preserve as matrix, the output of tapply
        res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )
    
      # FLATTEN into a data.table
        resdt <- data.table(r=c(res2))
    
      # FIND & REPLACE NA's of single column
        resdt[is.na(r), r:=0L]
    
      # cbind with dt, a matrix, with the same attributes as `res2`  
        cbind(dt, 
              matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
      }
    
    
     ### Benchmarks: 
    
    orig = quote({dt <- copy(masterDT); myFunc()})
    modified = quote({dt <- copy(masterDT); myFunc.modified()})
    microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)
    
    #  Unit: milliseconds
    #        expr      min        lq   median       uq      max
    #  1 Modified  895.025  971.0117 1011.216 1189.599 2476.972
    #  2     Orig 1953.638 2009.1838 2106.412 2230.326 2356.802
    

提交回复
热议问题