问题
I have column of long names and I would like to cut these to max 40 characters length.
Sample data:
x <- c("This is the longest sentence in world, so now just make it longer",
"No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
I would like to shorten the sentece length to about 40 (-/+ 3 nchar) so that I don't shorten the sentence in the middle of an word. (So the length is decised on empty space between words).
Also I would like to add 3 dots after the shortened sentece.
The desired output would be something like this:
c("This is the longest sentence...","No in fact, this is the longest...")
This function would just blindly shorten at 40 char.:
strtrim(x, 40)
回答1:
Ok, I have better solution now :)
x <- c("This is the longest sentence in world, so now just make it longer","No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
extract <- function(x){
result <- stri_extract_first_regex(x, "^.{0,40}( |$)")
longer <- stri_length(x) > 40
result[longer] <- stri_paste(result[longer], "...")
result
}
extract(x)
## [1] "This is the longest sentence in world, ..." "No in fact, this is the longest sentence ..."
Benchmarks new vs old (32 000 sentences):
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE), extract(x), times=5)
Unit: milliseconds
expr min lq median uq max neval
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3762.51134 3762.92163 3767.87134 3776.03706 3788.139 5
extract(x) 56.01727 57.18771 58.50321 79.55759 97.924 5
OLD VERSION
This solution requires stringi
package and ALWAYS adds three dots ...
to the end of string.
require(stringi)
sapply(x, function(x) stri_paste(stri_wrap(x, 40)[1],"..."),USE.NAMES = FALSE)
## [1] "This is the longest sentence in world..." "No in fact, this is the longest..."
This one adds the three dots only to sentences which are longer than 40 characters:
require(stringi)
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
sapply(x, cutAndAddDots, USE.NAMES = FALSE)
## [1] "This is the longest sentence in world" "No in fact, this is the longest..."
PERFORMANCE NOTE
Setting normalize=FALSE
in stri_wrap
may speed up this roughly 3 times (tested on 30 000 sentences)
Test data:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\\.) "))
head(x)
[1] "Lorem ipsum dolor sit amet, vel commodo in."
[2] "Ultricies mauris sapien lectus dignissim."
[3] "Id pellentesque semper turpis habitasse egestas rutrum ligula vulputate laoreet mollis id."
[4] "Curabitur volutpat efficitur parturient nibh sociosqu, faucibus tellus, eleifend pretium, quis."
[5] "Feugiat vel mollis ultricies ut auctor."
[6] "Massa neque auctor lacus ridiculus."
stri_length(head(x))
[1] 43 41 90 95 39 35
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40, normalize = FALSE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
cutAndAddDotsNormalize <- function(x){
w <- stri_wrap(x, 40, normalize = TRUE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
require(microbenchmark)
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE),sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE),times=3)
Unit: seconds
expr min lq median uq max
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3.917858 3.967411 4.016964 4.055571 4.094178
sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE) 13.493732 13.651451 13.809170 13.917854 14.026538
回答2:
Base R solution:
baseR <- function(x){
m <- regexpr("^.{0,40}( |$)", x)
result <- regmatches(x,m)
longer <- nchar(x)>40
result[longer] <- paste(result[longer],"...",sep = "")
result
}
baseR(x)==extract(x)
[1] TRUE TRUE
Works just like @bartektartanus extract
:) But it's slower... I tested this on data generated from his code. Still, if you don't want to use any external packages - this one works!
microbenchmark(baseR(x), extract(x))
Unit: milliseconds
expr min lq median uq max neval
baseR(x) 101.20905 107.0264 108.79086 111.03229 162.6375 100
extract(x) 52.83951 54.6931 55.46628 59.37808 103.0631 100
回答3:
Figured I'd post this one too. Definitely not stringi
speed, but it's not too shabby. I needed one to bypass the print methods for str
so I wrote this.
charTrunc <- function(x, width, end = " ...") {
ncw <- nchar(x) >= width
trm <- strtrim(x[ncw], width - nchar(end))
trimmed <- gsub("\\s+$", "", trm)
replace(x, ncw, paste0(trimmed, end))
}
Testing on the string from @bartektartanus answer:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\\.) "))
library(microbenchmark)
microbenchmark(charTrunc = {
out <- charTrunc(x, 40L)
},
times = 3
)
Unit: milliseconds
expr min lq mean median uq max neval
charTrunc 506.553 510.988 513.4603 515.423 516.9139 518.4049 3
head(out)
# [1] "Lorem ipsum dolor sit amet, venenati ..."
# [2] "Tincidunt at pellentesque id sociosq ..."
# [3] "At etiam quis et mauris non tincidun ..."
# [4] "In viverra aenean nisl ex aliquam du ..."
# [5] "Dui mi mauris ac lacus sit hac."
# [6] "Ultrices faucibus sed justo ridiculu ..."
来源:https://stackoverflow.com/questions/27757436/shorten-limit-the-length-of-a-sentence