R plyr, data.table, apply certain columns of data.frame

怎甘沉沦 提交于 2019-12-04 08:30:20

Here's a data.table solution using set.

require(data.table)
DT <- data.table(data)
for (j in cols_to_fix) {
    set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
    set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}

DT
#    col1 col2 col3 col4
# 1:    1    1   aa   vv
# 2:    1    2   bb   ww
# 3:    1    3   cc   xx
# 4:    1    4   dd   yy
# 5:    2    1   ee   zz
# 6:    2    2   NA   NA
# 7:    2    3   NA   NA
# 8:    2    4   NA   NA

First line reads: set in DT for all i(=NULL), and column=j the value gsub(..).
Second line reads: set in DT where i(=condn) and column=j with value NA_character_.

Note: Using PCRE (perl=TRUE) has nice speed-up, especially on bigger vectors.

marbel

Here is a data.table solution, should be faster if your table is large. The concept of := is an "update" of the columns. I believe that because of this you aren't copying the table internally again as a "normal" dataframe solution would.

require(data.table)
DT <- data.table(data)

fxn = function(col) {
  col = gsub("[ _]", "", col, perl = TRUE)
  col[which(col == "n/a")] <- NA_character_
  col
}

cols = c("col3", "col4");

# lapply your function
DT[, (cols) := lapply(.SD, fxn), .SDcols = cols]
print(DT)

No need for loops (for or *ply):

tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]), perl=TRUE)
tmp[tmp=="n/a"] <- NA
data[,cols_to_fix] <- tmp

Benchmarks

I only benchmark Arun's data.table solution and my matrix solution. I assume that many columns need to be fixed.

Benchmark code:

options(stringsAsFactors=FALSE)

set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, K, TRUE), 
                   id2=sample(5, K, TRUE)
)
data <- cbind(data, matrix(sample(bar(K), N, TRUE), ncol=N/K))

cols_to_fix <- as.character(seq_len(N/K))
library(data.table)

benchfun <- function() {
  time1 <- system.time({
    DT <- data.table(data)
    for (j in cols_to_fix) {
      set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
      set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
    }
  })

  data2 <- data
  time2 <- system.time({
    tmp <- gsub("[_ ]", "", as.matrix(data2[,cols_to_fix]), perl=TRUE)
    tmp[tmp=="n/a"] <- NA   
    data2[,cols_to_fix] <- tmp
  })

  list(identical= identical(as.data.frame(DT), data2),
       data.table_timing= time1[[3]],
       matrix_timing=time2[[3]])
}

replicate(3, benchfun())

Benchmark results:

#100 columns to fix, nrow=1e5
#                  [,1]   [,2]  [,3]  
#identical         TRUE   TRUE  TRUE  
#data.table_timing 6.001  5.571 5.602 
#matrix_timing     17.906 17.21 18.343

#1000 columns to fix, nrow=1e4
#                  [,1]   [,2]   [,3]  
#identical         TRUE   TRUE   TRUE  
#data.table_timing 4.509  4.574  4.857 
#matrix_timing     13.604 14.219 13.234

#1000 columns to fix, nrow=100
#                  [,1]  [,2]  [,3] 
#identical         TRUE  TRUE  TRUE 
#data.table_timing 0.052 0.052 0.055
#matrix_timing     0.134 0.128 0.127

#100 columns to fix, nrow=1e5 and including 
#data1 <- as.data.frame(DT) in the timing
#                           [,1]  [,2]  [,3]   [,4]   [,5]   [,6]   [,7]   [,8]   [,9]   [,10] 
#identical                  TRUE  TRUE  TRUE   TRUE   TRUE   TRUE   TRUE   TRUE   TRUE   TRUE  
#data.table_timing          5.642 5.58  5.762  5.382  5.419  5.633  5.508  5.578  5.634  5.397 
#data.table_returnDF_timing 5.973 5.808 5.817  5.705  5.736  5.841  5.759  5.833  5.689  5.669 
#matrix_timing              20.89 20.3  19.988 20.271 19.177 19.676 20.836 20.098 20.005 19.409

data.table is faster only by a factor of three. This advantage could probably be even smaller, if we decide to change the data structure (as the data.table solution does) and keep it a matrix.

I think you can do this with regular old apply, which will call your cleanup function on each column (margin=2):

fxn = function(col) {
  col <- gsub("_", "", col)
  col <- gsub(" ", "", col)
  col <- ifelse(col=="n/a", NA, col)
  return(col)
}
data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, fxn)
data
#   col1 col2 col3 col4
# 1    1    1   aa   vv
# 2    1    2   bb   ww
# 3    1    3   cc   xx
# 4    1    4   dd   yy
# 5    2    1   ee   zz
# 6    2    2 <NA> <NA>
# 7    2    3 <NA> <NA>
# 8    2    4 <NA> <NA>

Edit: it sounds like you're requiring the use of the plyr package. I'm not an expert in plyr, but this seemed to work:

library(plyr)
data[,cols_to_fix] <- t(laply(data[,cols_to_fix], fxn))
marbel

Here's a benchmark of all the different answers:

First, all the answers as separate functions:

1) Arun's

arun <- function(data, cols_to_fix) {
    DT <- data.table(data)
    for (j in cols_to_fix) {
        set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
        set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
    }
    return(DT)
}

2) Martin's

martin <- function(data, cols) {
    DT <- data.table(data)    
    colfun = function(col) {
        col <- gsub("_", "", col)
        col <- gsub(" ", "", col)
        col <- ifelse(col=="n/a", NA, col)
    }
    DT[, (cols) := lapply(.SD, colfun), .SDcols = cols]
    return(DT)
}    

3) Roland's

roland <- function(data, cols_to_fix) {
    tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]))
    tmp[tmp=="n/a"] <- NA   
    data[,cols_to_fix] <- tmp
    return(data)
}

4) BrodieG's

brodieg <- function(data, cols_to_fix) {
    fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
    data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)
    return(data)
}

5) Josilber's

josilber <- function(data, cols_to_fix) {
    colfun2 <- function(col) {
        col <- gsub("_", "", col)
        col <- gsub(" ", "", col)
        col <- ifelse(col=="n/a", NA, col)
        return(col)
    }
    data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, colfun2)
    return(data)
}

2) benchmarking function:

We'll run this function 3 times and take the minimum of the run (removes cache effects) to be the runtime:

bench <- function(data, cols_to_fix) {
    ans <- c( 
        system.time(arun(data, cols_to_fix))["elapsed"], 
        system.time(martin(data, cols_to_fix))["elapsed"], 
        system.time(roland(data, cols_to_fix))["elapsed"], 
        system.time(brodieg(data, cols_to_fix))["elapsed"],
        system.time(josilber(data, cols_to_fix))["elapsed"]
    )
}

3) On (slightly) big data with just 2 cols to fix (like in OP's example here):

require(data.table)
set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, N, TRUE), 
                   id2=sample(5, N, TRUE), 
                   col3=sample(bar(K), N, TRUE), 
                   col4=sample(bar(K), N, TRUE)
        )

rown <- c("arun", "martin", "roland", "brodieg", "josilber")
coln <- paste("run", 1:3, sep="")
cols_to_fix <- c("col3","col4")
ans <- matrix(0L, nrow=5L, ncol=3L)
for (i in 1:3) {
    print(i)
    ans[, i] <- bench(data, cols_to_fix)
}
rownames(ans) <- rown
colnames(ans) <- coln

#           run1  run2  run3
# arun     0.149 0.140 0.142
# martin   0.643 0.629 0.621
# roland   1.741 1.708 1.761
# brodieg  1.926 1.919 1.899
# josilber 2.067 2.041 2.162

The apply version is the way to go. Looks like @josilber came up with the same answer, but this one is slightly different (note regexp).

fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)

More importantly, generally you want to use ddply and data.table when you want to do split-apply-combine analysis. In this case, all your data belongs to the same group (there aren't any subgroups you're doing anything different with), so you might as well use apply.

The 2 at the center of the apply statement means we want to subset the input by the 2nd dimension, and pass the result (in this case vectors, each representing a column from your data frame in cols_to_fix) to the function that does the work. apply then re-assembles the result, and we assign it back to the columns in cols_to_fix. If we had used 1 instead, apply would have passed the rows in our data frame to the function. Here is the result:

data
#   col1 col2 col3 col4
# 1    1    1   aa   vv
# 2    1    2   bb   ww
# 3    1    3   cc   xx
# 4    1    4   dd   yy
# 5    2    1   ee   zz
# 6    2    2 <NA> <NA>
# 7    2    3 <NA> <NA>
# 8    2    4 <NA> <NA>

If you do have sub-groups, then I recommend you use data.table. Once you get used to the syntax it's hard to beat for convenience and speed. It will also do efficient joins across data sets.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!