R: Replace multiple values in multiple columns of dataframes with NA

我只是一个虾纸丫 提交于 2019-12-04 23:40:39

You can also do this using replace:

# Logical mask over the columns of df: TRUE where the column name contains "var"
sel <- grepl("var",names(df))
# For each selected column, overwrite the entries equal to 3 or 4 with NA;
# lapply() returns one cleaned column per input column, assigned back in place
df[sel] <- lapply(df[sel], function(x) replace(x,x %in% 3:4, NA) )
df

#  name foo var1 var2
#1    a   1    1   NA
#2    a   2    2   NA
#3    a   3   NA   NA
#4    b   4   NA   NA
#5    b   5    5   NA
#6    b   6    6   NA
#7    c   7    7    5
#8    c   8    8    5
#9    c   9    9    5

Some quick benchmarking on a million-row sample of data suggests this is quicker than the other answers.

You could also do:

# Positions of the columns whose names start with "var"
col_idx <- grep("^var", names(df))
# Values that should be turned into NA
values <- c(3, 4)
# Pull the selected columns out as one matrix so a single %in% lookup
# covers every cell at once
m1 <- as.matrix(df[,col_idx])
m1[m1 %in% values] <- NA
# Write the cleaned matrix back over the original columns
df[col_idx]  <- m1
df
#   name foo var1 var2
#1    a   1    1   NA
#2    a   2    2   NA
#3    a   3   NA   NA
#4    b   4   NA   NA
#5    b   5    5   NA
#6    b   6    6   NA
#7    c   7    7    5
#8    c   8    8    5
#9    c   9    9    5

I haven't timed this option, but I have written a function called makemeNA that is part of my GitHub-only "SOfun" package.

With that function, the approach would be something like this:

library(SOfun)

# Positions of the columns whose names start with "var"
Cols <- grep("^var", names(df))
# The values to blank out are handed to makemeNA() as character strings
# through its NAStrings argument
df[Cols] <- makemeNA(df[Cols], NAStrings = as.character(c(3, 4)))
df
#   name foo var1 var2
# 1    a   1    1   NA
# 2    a   2    2   NA
# 3    a   3   NA   NA
# 4    b   4   NA   NA
# 5    b   5    5   NA
# 6    b   6    6   NA
# 7    c   7    7    5
# 8    c   8    8    5
# 9    c   9    9    5

The function uses the na.strings argument in type.convert to do the conversion to NA.


Install the package with:

library(devtools)
# install_github() takes a single "owner/repo" specification. The old
# two-argument form (repo, username) no longer works: the second positional
# argument is now interpreted as `ref` (branch/tag/commit), not the account.
install_github("mrdwab/SOfun")

(or your favorite method of installing packages from GitHub).


Here's some benchmarking. I've decided to make things interesting and replace both numeric and non-numeric values with NA to see how things compare.

Here's the sample data:

# Number of rows in the benchmark data set
n <- 1000000
# Fix the RNG seed so the sampled data is reproducible
set.seed(1)
# Three columns sampled from the letters "a", "b", "c" and three columns
# sampled from small positive integers (1..9 or 1..5), all with replacement
df <- data.frame(
  name1 = sample(letters[1:3], n, TRUE), 
  name2 = sample(letters[1:3], n, TRUE),
  name3 = sample(letters[1:3], n, TRUE),
  var1 = sample(9, n, TRUE), 
  var2 = sample(5, n, TRUE),
  var3 = sample(9, n, TRUE))

Here are the functions to test:

# Benchmark candidate 1: blank out 3, 4 and "a" in every column of the
# global `df` via makemeNA(), and return the modified copy.
fun1 <- function() {
  every_col <- names(df)
  na_tokens <- as.character(c(3, 4, "a"))
  df[every_col] <- makemeNA(df[every_col], NAStrings = na_tokens)
  df
}

# Benchmark candidate 2: convert the whole global `df` to one matrix, blank
# out every cell matching 3, 4 or "a" with a single %in% lookup, and write the
# matrix back column by column. Returns the modified copy. Because the frame
# goes through as.matrix(), all columns come back with one common type.
fun2 <- function() {
  every_col <- names(df)
  targets <- c(3, 4, "a")
  cells <- as.matrix(df)
  hits <- cells %in% targets
  cells[hits] <- NA
  df[every_col]  <- cells
  df
}

# Benchmark candidate 3: build a logical mask (one column per data column)
# marking cells equal to 3, 4 or "a", then use `is.na<-` to set the marked
# cells of the global `df` to NA. Returns the modified copy.
fun3 <- function() {
  every_col <- names(df)
  targets <- c(3, 4, "a")
  hit_mask <- sapply(df[every_col], "%in%", table = targets)
  is.na(df[every_col]) <- hit_mask
  df
}

# Benchmark candidate 4: walk the columns of the global `df` with lapply()
# and use replace() to set entries equal to 3, 4 or "a" to NA, keeping each
# column's own type. Returns the modified copy.
fun4 <- function() {
  every_col <- names(df)
  blank_out <- function(column) {
    replace(column, column %in% c(3, 4, "a"), NA)
  }
  df[every_col] <- lapply(df[every_col], blank_out)
  df
}

I'm timing fun2 and fun3 separately. I'm not crazy about fun2 because it converts everything to the same type. I also expect fun3 to be slower.

# Single timed run of fun2 (the as.matrix-based approach)
system.time(fun2())
#    user  system elapsed 
#    4.45    0.33    4.81 

# Single timed run of fun3 (the sapply / is.na<- approach)
system.time(fun3())
#    user  system elapsed 
#   34.31    0.38   34.74 

So now it comes down to me and Thela...

library(microbenchmark)
# Run fun1 and fun4 50 times each and compare their timing distributions
microbenchmark(fun1(), fun4(), times = 50)
# Unit: seconds
#    expr      min       lq   median       uq      max neval
#  fun1() 2.934278 2.982292 3.070784 3.091579 3.617902    50
#  fun4() 2.839901 2.964274 2.981248 3.128327 3.930542    50

Dang you Thela!

Here's an approach:

# the values that should be replaced by NA
# the values that should be replaced by NA
values <- c(3, 4)

# positions of the columns whose names start with "var"
col_idx <- grep("^var", names(df))
# [1] 3 4

# logical matrix with one column per selected column of df:
# TRUE where the cell holds one of `values`
val_idx <- sapply(df[col_idx], "%in%", table = values)
#        var1  var2
#  [1,] FALSE  TRUE
#  [2,] FALSE  TRUE
#  [3,]  TRUE  TRUE
#  [4,]  TRUE  TRUE
#  [5,] FALSE  TRUE
#  [6,] FALSE  TRUE
#  [7,] FALSE FALSE
#  [8,] FALSE FALSE
#  [9,] FALSE FALSE

# replace with NA: `is.na<-` sets every cell flagged TRUE in val_idx to NA
is.na(df[col_idx]) <- val_idx

df
#   name foo var1 var2
# 1    a   1    1   NA
# 2    a   2    2   NA
# 3    a   3   NA   NA
# 4    b   4   NA   NA
# 5    b   5    5   NA
# 6    b   6    6   NA
# 7    c   7    7    5
# 8    c   8    8    5
# 9    c   9    9    5

I think dplyr is very well-suited for this task.
Using replace() as suggested by @thelatemail, you could do something like this:

library("dplyr")
# mutate_at() is superseded and funs() is deprecated; across() (dplyr >= 1.0.0)
# is the supported way to apply one function to a selection of columns.
# For every column whose name starts with "var", set entries equal to 3 or 4
# to NA and leave everything else untouched.
df <- df %>%
  mutate(across(starts_with("var"),
                function(x) replace(x, x %in% c(3, 4), NA)))

df
#   name foo var1 var2
# 1    a   1    1   NA
# 2    a   2    2   NA
# 3    a   3   NA   NA
# 4    b   4   NA   NA
# 5    b   5    5   NA
# 6    b   6    6   NA
# 7    c   7    7    5
# 8    c   8    8    5
# 9    c   9    9    5

Here is a dplyr solution:

# Define replace function
#
# Returns `x` with every element equal to 3 or 4 set to NA.
# replace() is used instead of ifelse() because ifelse() builds its result
# from the logical test vector and therefore drops the attributes of `x`:
# factors come back as integer codes, names are lost, and an input where
# every element matches degrades to a plain logical NA vector. replace()
# keeps the type, class and names of `x`.
repl.f <- function(x) replace(x, x %in% c(3, 4), NA)

library(dplyr)
# mutate_each() and funs() are deprecated; mutate(across(everything(), ...))
# (dplyr >= 1.0.0) applies repl.f to every column of the "var" subset instead.
# The non-"var" columns are selected separately and bound back on the left,
# so the result lists them first, followed by the cleaned "var" columns.
cbind(select(df, -starts_with("var")),
  mutate(select(df, starts_with("var")), across(everything(), repl.f)))

  name foo var1 var2
1    a   1    1   NA
2    a   2    2   NA
3    a   3   NA   NA
4    b   4   NA   NA
5    b   5    5   NA
6    b   6    6   NA
7    c   7    7    5
8    c   8    8    5
9    c   9    9    5
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!