Delete last two characters in string if they match criteria

断了今生、忘了曾经 提交于 2019-12-05 14:25:01

If you want good performance for millions of records, the stringi package is what you need. It even outperforms the base R functions:

# library() stops immediately if stringi is missing; require() would only
# warn and return FALSE, postponing the failure to the first stri_*() call.
library(stringi)

# Benchmark data: n random strings with lengths taken from 1:100 (recycled
# across the n strings).
n <- 10000
x <- stri_rand_strings(n, 1:100)
# Append the " A" suffix to a random 1% of the strings, so only a small
# fraction of the elements actually needs trimming.
ind <- sample(n, n / 100)
x[ind] <- stri_paste(x[ind], " A")

# Base R reference implementation: remove one trailing whitespace character
# followed by "A" from each element of x; non-matching elements are returned
# unchanged.
baseR <- function(x) {
  trailing_a <- "\\sA$"
  sub(pattern = trailing_a, replacement = "", x = x)
}

# stringi counterpart of the base R version: replace the last match of
# "whitespace + A at end of string" in each element with the empty string.
stri1 <- function(x) {
  trailing_a <- "\\sA$"
  stri_replace_last_regex(x, pattern = trailing_a, replacement = "")
}

# Detect the elements ending in "whitespace + A" with a regex, then cut the
# last two characters off those elements only. Missing values (NA) are left
# untouched.
stri2 <- function(x) {
  # which() drops the NA that stri_detect_regex() returns for missing
  # strings; a logical index containing NA makes `x[ind] <- ...` fail with
  # "NAs are not allowed in subscripted assignments".
  ind <- which(stri_detect_regex(x, "\\sA$"))
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}

# If the separator can only be a plain space (not an arbitrary whitespace
# character), a fixed-string suffix test can replace the regex;
# this is even faster (ca 200x).
stri3 <- function(x) {
  # which() drops the NA that stri_endswith_fixed() returns for missing
  # strings; a logical index containing NA makes `x[ind] <- ...` fail with
  # "NAs are not allowed in subscripted assignments".
  ind <- which(stri_endswith_fixed(x, " A"))
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}


# Sanity check: inspect the first 44 cleaned strings.
head(stri2(x), 44)
# library() fails loudly when microbenchmark is not installed; require()
# would only return FALSE and let the next call fail instead.
library(microbenchmark)
# Time all four implementations on the same input vector.
microbenchmark(baseR(x), stri1(x), stri2(x), stri3(x))
Unit: microseconds
     expr        min        lq        mean      median         uq        max neval
 baseR(x) 166044.032 172054.30 183919.6684 183112.1765 194586.231 219207.905   100
 stri1(x)  36704.180  39015.59  41836.8612  40164.9365  43773.034  60373.866   100
 stri2(x)  17736.535  18884.56  20575.3306  19818.2895  21759.489  31846.582   100
 stri3(x)    491.963    802.27    918.1626    868.9935   1008.776   2489.923   100

We can use sub to match a single whitespace character (\\s) followed by 'A' at the end ($) of the string and replace it with an empty string (""):

# Remove a trailing "whitespace + A" from every name, then show the result.
df$names <- sub(pattern = "\\sA$", replacement = "", x = df$names)
df$names
#[1] "A ADAM"           "S BEAN"           "A APPLE"          "A SCHWARZENEGGER"

The answer from @akrun is, of course, correct, but based on the comments I will add one more point for the case where the column is a factor.

Using the example of @vincentmajor in the comments:

# stringsAsFactors = TRUE is needed on R >= 4.0.0, where data.frame() keeps
# strings as character by default; without it `names` is not a factor and the
# levels()-based substitution further down has no levels to work on.
df <- df2 <- data.frame(
  names = rep(
    c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER"),
    length.out = 2000000
  ),
  stringsAsFactors = TRUE
)

# Option 1: substitute on the values and rebuild the factor, so that the
# column is still a factor afterwards.
system.time({
  df$names <- factor(sub(pattern = "\\sA$", replacement = "", x = df$names))
})
# user  system elapsed 
# 0.892   0.000   0.893 

# Option 2: when there are many duplicated values, as in this example,
# rewriting only the factor levels is much quicker.
system.time({
  levels(df2$names) <- sub(
    pattern = "\\sA$", replacement = "", x = levels(df2$names)
  )
})
# user  system elapsed 
# 0.052   0.000   0.053 

Maybe not the fastest solution, but this will work too:

# library() errors right away if stringi is missing; require() would only
# warn and return FALSE, deferring the failure to the first stri_*() call.
library(stringi)
# Ten random strings of lengths 1 to 10; five of them, picked at random, get
# " A" appended.
x <- stri_rand_strings(10, 1:10)
ind <- sample(10, 5)
x[ind] <- stri_paste(x[ind], " A")
x
# [1] "z A"          "hX"         "uv0 A"        "HQtD A"       "kTNZh"      "4SIVBh"     "v28UrqS A"    "uskxxNkl A"  
# [9] "dKxloBsA6"  "sRkCQp7sn4"
# Flag the strings whose last two characters are exactly " A", then keep
# everything except those two characters for the flagged strings.
y <- stri_sub(x, -2, -1) == " A"
x[y] <- stri_sub(x[y], 1, -3)
x
# [1] "z"          "hX"         "uv0"        "HQtD"       "kTNZh"      "4SIVBh"     "v28UrqS"    "uskxxNkl"  
# [9] "dKxloBsA6"  "sRkCQp7sn4"
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!