R: Replace multiple values in multiple columns of dataframes with NA

前端 未结 6 636
情书的邮戳
情书的邮戳 2020-12-11 01:31

I am trying to achieve something similar to this question, but with multiple values that must be replaced by NA, and in a large dataset.

df <- data.frame(nam         


        
相关标签:
6条回答
  • 2020-12-11 01:53

    I haven't timed this option, but I have written a function called makemeNA that is part of my GitHub-only "SOfun" package.

    With that function, the approach would be something like this:

    library(SOfun)

    # Positions of the columns whose names start with "var".
    Cols <- grep(pattern = "^var", x = names(df))
    # The NA targets are handed to makemeNA() as strings.
    df[Cols] <- makemeNA(df[Cols], NAStrings = c("3", "4"))
    df
    #   name foo var1 var2
    # 1    a   1    1   NA
    # 2    a   2    2   NA
    # 3    a   3   NA   NA
    # 4    b   4   NA   NA
    # 5    b   5    5   NA
    # 6    b   6    6   NA
    # 7    c   7    7    5
    # 8    c   8    8    5
    # 9    c   9    9    5
    

    The function uses the na.strings argument in type.convert to do the conversion to NA.


    Install the package with:

    library(devtools)
    # install_github() takes a single "user/repo" string; in the current
    # signature a second positional argument is read as the git ref, not the
    # account name, so the old ("SOfun", "mrdwab") form no longer works.
    install_github("mrdwab/SOfun")
    

    (or your favorite method of installing packages from GitHub).


    Here's some benchmarking. I've decided to make things interesting and replace both numeric and non-numeric values with NA to see how things compare.

    Here's the sample data:

    # Benchmark fixture: one million rows with three letter columns (a/b/c)
    # and three integer columns. The seed is fixed so the data is reproducible.
    n <- 1e6
    set.seed(1)
    df <- data.frame(
      name1 = sample(c("a", "b", "c"), size = n, replace = TRUE),
      name2 = sample(c("a", "b", "c"), size = n, replace = TRUE),
      name3 = sample(c("a", "b", "c"), size = n, replace = TRUE),
      var1 = sample.int(9, size = n, replace = TRUE),
      var2 = sample.int(5, size = n, replace = TRUE),
      var3 = sample.int(9, size = n, replace = TRUE)
    )
    

    Here are the functions to test:

    # Benchmark candidate 1: blank out every 3, 4 and "a" in the global `df`
    # using makemeNA() from the SOfun package, and return the modified copy.
    fun1 <- function() {
      na_strings <- as.character(c(3, 4, "a"))
      all_cols <- names(df)
      cleaned <- makemeNA(df[all_cols], NAStrings = na_strings)
      df[all_cols] <- cleaned
      df
    }
    
    # Benchmark candidate 2: convert the global `df` to a matrix, flag every
    # cell equal to 3, 4 or "a" as NA, and write the matrix back column by
    # column. Returns the modified copy of `df`.
    fun2 <- function() {
      na_values <- c(3, 4, "a")
      cells <- as.matrix(df)
      is.na(cells) <- cells %in% na_values
      df[names(df)] <- cells
      df
    }
    
    # Benchmark candidate 3: build a per-column logical mask of the cells
    # holding 3, 4 or "a" in the global `df`, then set those cells to NA.
    # Returns the modified copy of `df`.
    fun3 <- function() {
      na_values <- c(3, 4, "a")
      all_cols <- names(df)
      hits <- sapply(df[all_cols], function(column) column %in% na_values)
      is.na(df[all_cols]) <- hits
      df
    }
    
    # Benchmark candidate 4: walk the columns of the global `df` and set every
    # element equal to 3, 4 or "a" to NA. Returns the modified copy of `df`.
    fun4 <- function() {
      all_cols <- names(df)
      blank_out <- function(column) {
        column[column %in% c(3, 4, "a")] <- NA
        column
      }
      df[all_cols] <- lapply(df[all_cols], blank_out)
      df
    }
    

    I'm timing fun2 and fun3 separately. I'm not crazy about fun2 because it converts every column to the same type (as.matrix on mixed columns yields character). I also expect fun3 to be slower.

    # Single timed run of fun2 (the as.matrix approach).
    system.time(fun2())
    #    user  system elapsed 
    #    4.45    0.33    4.81 
    
    # Single timed run of fun3 (the sapply / is.na<- approach).
    system.time(fun3())
    #    user  system elapsed 
    #   34.31    0.38   34.74 
    

    So now it comes down to me and Thela...

    library(microbenchmark)
    # Evaluate fun1 and fun4 50 times each and summarise the timings.
    microbenchmark(fun1(), fun4(), times = 50)
    # Unit: seconds
    #    expr      min       lq   median       uq      max neval
    #  fun1() 2.934278 2.982292 3.070784 3.091579 3.617902    50
    #  fun4() 2.839901 2.964274 2.981248 3.128327 3.930542    50
    

    Dang you Thela!

    0 讨论(0)
  • 2020-12-11 01:54

    I think dplyr is very well-suited for this task.
    Using replace() as suggested by @thelatemail, you could do something like this:

    library("dplyr")
    # mutate_at() and funs() are deprecated; across() is the current way to
    # apply one function to a tidy-selected set of columns.
    df <- df %>%
      mutate(across(starts_with("var"),
                    function(x) replace(x, x %in% c(3, 4), NA)))
    
    df
    #   name foo var1 var2
    # 1    a   1    1   NA
    # 2    a   2    2   NA
    # 3    a   3   NA   NA
    # 4    b   4   NA   NA
    # 5    b   5    5   NA
    # 6    b   6    6   NA
    # 7    c   7    7    5
    # 8    c   8    8    5
    # 9    c   9    9    5
    
    0 讨论(0)
  • 2020-12-11 02:00

    You can also do this using replace:

    # Logical mask of the columns whose name contains "var".
    sel <- grepl("var", names(df))
    # Rewrite each selected column with its 3s and 4s set to NA.
    df[sel] <- lapply(df[sel], function(column) {
      column[column %in% 3:4] <- NA
      column
    })
    df
    
    #  name foo var1 var2
    #1    a   1    1   NA
    #2    a   2    2   NA
    #3    a   3   NA   NA
    #4    b   4   NA   NA
    #5    b   5    5   NA
    #6    b   6    6   NA
    #7    c   7    7    5
    #8    c   8    8    5
    #9    c   9    9    5
    

    Some quick benchmarking using a million row sample of data suggests this is quicker than the other answers.

    0 讨论(0)
  • 2020-12-11 02:02

    You could also do:

    # Columns to scan and the values that should become NA.
    col_idx <- grep("^var", names(df))
    values <- c(3, 4)
    # Work on a matrix copy so a single %in% covers every selected column.
    m1 <- as.matrix(df[, col_idx])
    is.na(m1) <- m1 %in% values
    # Write the cleaned matrix back into the same columns.
    df[col_idx] <- m1
    df
    #   name foo var1 var2
    #1    a   1    1   NA
    #2    a   2    2   NA
    #3    a   3   NA   NA
    #4    b   4   NA   NA
    #5    b   5    5   NA
    #6    b   6    6   NA
    #7    c   7    7    5
    #8    c   8    8    5
    #9    c   9    9    5
    
    0 讨论(0)
  • 2020-12-11 02:04

    Here's an approach:

    # values to turn into NA
    values <- c(3, 4)

    # positions of the columns to scan
    col_idx <- grep("^var", names(df))
    # [1] 3 4

    # logical matrix flagging, per column, the cells that hold one of `values`
    val_idx <- sapply(df[col_idx], function(column) column %in% values)
    #        var1  var2
    #  [1,] FALSE  TRUE
    #  [2,] FALSE  TRUE
    #  [3,]  TRUE  TRUE
    #  [4,]  TRUE  TRUE
    #  [5,] FALSE  TRUE
    #  [6,] FALSE  TRUE
    #  [7,] FALSE FALSE
    #  [8,] FALSE FALSE
    #  [9,] FALSE FALSE

    # set the flagged cells to NA
    df[col_idx][val_idx] <- NA
    
    df
    #   name foo var1 var2
    # 1    a   1    1   NA
    # 2    a   2    2   NA
    # 3    a   3   NA   NA
    # 4    b   4   NA   NA
    # 5    b   5    5   NA
    # 6    b   6    6   NA
    # 7    c   7    7    5
    # 8    c   8    8    5
    # 9    c   9    9    5
    
    0 讨论(0)
  • 2020-12-11 02:11

    Here is a dplyr solution:

    # Define replace function: return `x` with every 3 or 4 set to NA.
    # replace() is used instead of ifelse() because ifelse() drops attributes
    # (a factor comes back as integer codes) and returns a logical vector when
    # every element matches; replace() keeps the type and attributes of `x`.
    repl.f <- function(x) replace(x, x %in% c(3, 4), NA)
    
    library(dplyr)
    # mutate_each() and funs() are deprecated; across() applies repl.f to the
    # "var" columns directly. relocate() then moves those columns to the end,
    # matching the column order the earlier select()/cbind() form produced.
    df %>%
      mutate(across(starts_with("var"), repl.f)) %>%
      relocate(starts_with("var"), .after = last_col())
    
      name foo var1 var2
    1    a   1    1   NA
    2    a   2    2   NA
    3    a   3   NA   NA
    4    b   4   NA   NA
    5    b   5    5   NA
    6    b   6    6   NA
    7    c   7    7    5
    8    c   8    8    5
    9    c   9    9    5
    
    0 讨论(0)
提交回复
热议问题