问题
I have the following dataframe:
a <- seq(1:5)
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3",
"abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
df <- data.frame(a, b)
df$b <- as.character(df$b)
And I need to extract the numbers in df$b that come between the second and third underscores and assign to df$c.
I'm guessing there's a fairly simple solution, but haven't found it yet. The actual dataset is fairly large (3MM rows) so efficiency is a bit of a factor.
Thanks for the help!
回答1:
We can use sub
to match the zeor or more characters that are not a _
([^_]*
) from the start (^
) of the string followed by an underscore (_
), then another set of characters that are not an underscore followed by underscore, capture the one of more numbers that follows in a group ((\\d+)
) followed by underscore and other characters, then replace it with the backreference for that group and finally convert it to numeric
as.numeric(sub("^[^_]*_[^_]+_(\\d+)_.*", "\\1", df$b))
#[1] 123456 78912 345678912 34567 891234556778
回答2:
create a my_split
function that finds the start and end position of "_" using gregexpr
. Then extract the string between start and end position using substr
.
my_split <- function(x, start, end){
a1 <- gregexpr("_", x)
substr(x, a1[[1]][start]+1, a1[[1]][end]-1)
}
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3", "abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
sapply(b, my_split, start = 2, end = 3)
# abc_a_123456_defghij_1 abc_a_78912_abc_2
# "123456" "78912"
# abc_a_345678912_xyzabc_3 abc_b_34567_defgh_4
# "345678912" "34567"
# abc_c_891234556778_ijklmnop_5
# "891234556778"
using data.table library
library(data.table)
setDT(df)[, c := lapply(b, my_split, start = 2, end = 3)]
df
# a b c
# 1: 1 abc_a_123456_defghij_1 123456
# 2: 2 abc_a_78912_abc_2 78912
# 3: 3 abc_a_345678912_xyzabc_3 345678912
# 4: 4 abc_b_34567_defgh_4 34567
# 5: 5 abc_c_891234556778_ijklmnop_5 891234556778
data:
a <- seq(1:5)
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3", "abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
df <- data.frame(a, b, stringsAsFactors = FALSE)
来源:https://stackoverflow.com/questions/41641890/extracting-numbers-from-character-string-based-on-delimiters