I'd like to join two data frames if the seed
column in data frame y
is a partial match on the string
column in x
. This example should illustrate:
# What I have
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
x
idX string
1 1 Motorcycle
2 2 TractorTrailer
3 3 Sailboat
y
Source: local data frame [3 x 2]
idY seed
(chr) (chr)
1 a ractor
2 b otorcy
3 c irplan
# What I want
want <- data.frame(idX=c(1,2), idY=c("b", "a"), string=c("Motorcycle", "TractorTrailer"), seed=c("otorcy", "ractor"))
want
idX idY string seed
1 1 b Motorcycle otorcy
2 2 a TractorTrailer ractor
That is, something like
inner_join(x, y, by=stringr::str_detect(x$string, y$seed))
The fuzzyjoin
library has two functions regex_inner_join
and fuzzy_inner_join
that allow you to match partial strings:
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
x$string = as.character(x$string)
y$seed = as.character(y$seed)
library(fuzzyjoin)
x %>% regex_inner_join(y, by = c(string = "seed"))
idX string idY seed
1 1 Motorcycle b otorcy
2 2 TractorTrailer a ractor
library(stringr)
x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect)
idX string idY seed
1 1 Motorcycle b otorcy
2 2 TractorTrailer a ractor
You can also use base-r with this function (slightly adapted from this answer here: https://stackoverflow.com/a/34723496/3048453, it uses dplyr to bind the columns together, use cbind
if you don't want to use dplyr):
partial_join <- function(x, y, by_x, pattern_y)
idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))
df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
y[unlist(idx_y), , drop = F])
return(df)
}
With your example
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed")
df_merged
# # A tibble: 2 × 4
# idX string idY seed
# <int> <chr> <chr> <chr>
# 1 1 Motorcycle b otorcy
# 2 2 TractorTrailer a ractor
Speed Benchmarks:
Functions
library(dplyr)
x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
partial_join <- function(x, y, by_x, pattern_y) {
idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))
df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
y[unlist(idx_y), , drop = F])
return(df)
}
partial_join(x, y, by_x = "string", pattern_y = "seed")
#> # A tibble: 2 × 4
#> idX string idY seed
#> <int> <chr> <chr> <chr>
#> 1 1 Motorcycle b otorcy
#> 2 2 TractorTrailer a ractor
joran <- function(x, y, by_x, pattern_y) {
library(dplyr)
my_db <- src_sqlite(path = tempfile(), create= TRUE)
x_tbl <- copy_to(dest = my_db, df = x)
y_tbl <- copy_to(dest = my_db, df = y)
result <- tbl(my_db,
sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y)))
collect(result, n = Inf)
}
joran(x, y, "string", "seed")
#> # A tibble: 2 × 4
#> idX string idY seed
#> <int> <chr> <chr> <chr>
#> 1 1 Motorcycle b otorcy
#> 2 2 TractorTrailer a ractor
stephen <- function(x, y, by_x, pattern_y) {
library(dplyr)
d <- full_join(mutate(x, i=1),
mutate(y, i=1), by = "i")
# quoting issue here, defaulting to base-r
d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]])
d %>%
filter(take == T) %>%
select(-i, -take)
}
stephen(x, y, "string", "seed")
#> # A tibble: 2 × 4
#> idX string idY seed
#> <int> <chr> <chr> <chr>
#> 1 1 Motorcycle b otorcy
#> 2 2 TractorTrailer a ractor
feng <- function(x, y, by_x, pattern_y) {
library(fuzzyjoin)
by_string <- pattern_y
names(by_string) <- by_x
regex_inner_join(x, y, by = by_string)
}
feng(x, y, "string", "seed")
#> # A tibble: 2 × 4
#> idX string idY seed
#> <int> <chr> <chr> <chr>
#> 1 1 Motorcycle b otorcy
#> 2 2 TractorTrailer a ractor
Benchmark
library(microbenchmark)
res <- microbenchmark(
joran(x, y, "string", "seed"),
stephen(x, y, "string", "seed"),
feng(x, y, "string", "seed"),
partial_join(x, y, "string", "seed")
)
res
#> Unit: microseconds
#> expr min lq mean
#> joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646
#> stephen(x, y, "string", "seed") 1320.161 1456.9415 1704.9218
#> feng(x, y, "string", "seed") 5187.366 5625.8825 6926.2336
#> partial_join(x, y, "string", "seed") 190.264 222.0055 257.7906
#> median uq max neval cld
#> 20675.5855 21827.764 70707.324 100 c
#> 1579.8925 1670.719 9676.176 100 a
#> 5842.8150 6065.530 107961.805 100 b
#> 242.0735 283.870 523.649 100 a
set.seed(123123)
x_large <- x %>% sample_n(1000, replace = T)
y_large <- y %>% sample_n(1000, replace = T)
res_large <- microbenchmark(
joran(x_large, y_large, "string", "seed"),
# stephen(x_large, y_large, "string", "seed"),
feng(x_large, y_large, "string", "seed"),
partial_join(x_large, y_large, "string", "seed")
)
res_large
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153 10 c
#> feng(x_large, y_large, "string", "seed") 88.00369 89.85744 103.8686 93.84477 97.69121 200.0473 10 a
#> partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524 10 b
I don't know how this will perform for larger data, but it (or a variant of it) might be worth a try:
library(dplyr)
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
my_db <- src_sqlite(path = tempfile(),create= TRUE)
x_tbl <- copy_to(dest = my_db,df = x)
y_tbl <- copy_to(dest = my_db,df = y)
result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'"))
> collect(result)
Source: local data frame [2 x 4]
idX string idY seed
(int) (chr) (chr) (chr)
1 1 Motorcycle b otorcy
2 2 TractorTrailer a ractor
I also can't speak to how the performance of this might vary across DBs. postgres or mysql might be better or worse at this sort of query.
This works, but it's going to be incredibly slow on huge datasets.
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
library(dplyr)
full_join(mutate(x, i=1),
mutate(y, i=1)) %>%
select(-i) %>%
filter(str_detect(string, seed))
来源:https://stackoverflow.com/questions/32914357/dplyr-inner-join-with-a-partial-string-match