I'd like to join two data frames if the seed column in data frame y is a partial match on the string column in x. This example should illustrate:

# What I have
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))


x

  idX         string
1   1     Motorcycle
2   2 TractorTrailer
3   3       Sailboat

y

Source: local data frame [3 x 2]

    idY   seed
  (chr)  (chr)
1     a ractor
2     b otorcy
3     c irplan


# What I want
want <- data.frame(idX=c(1,2), idY=c("b", "a"), string=c("Motorcycle", "TractorTrailer"), seed=c("otorcy", "ractor"))

want

  idX idY         string   seed
1   1   b     Motorcycle otorcy
2   2   a TractorTrailer ractor

That is, something like

inner_join(x, y, by=stringr::str_detect(x$string, y$seed))

The fuzzyjoin library has two functions regex_inner_join and fuzzy_inner_join that allow you to match partial strings:

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
x$string = as.character(x$string)
y$seed = as.character(y$seed)


library(fuzzyjoin)
x %>% regex_inner_join(y, by = c(string = "seed"))

  idX         string idY   seed
1   1     Motorcycle   b otorcy
2   2 TractorTrailer   a ractor


library(stringr)
x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect)


  idX         string idY   seed
1   1     Motorcycle   b otorcy
2   2 TractorTrailer   a ractor

You can also use base-r with this function (slightly adapted from this answer here: https://stackoverflow.com/a/34723496/3048453, it uses dplyr to bind the columns together, use cbind if you don't want to use dplyr):

partial_join <- function(x, y, by_x, pattern_y)
 idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
 idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))

 df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
                        y[unlist(idx_y), , drop = F])
 return(df)
}

With your example

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed")
df_merged
# # A tibble: 2 × 4
#     idX         string   idY   seed
#   <int>          <chr> <chr>  <chr>
# 1     1     Motorcycle     b otorcy
# 2     2 TractorTrailer     a ractor

Speed Benchmarks:

Functions

library(dplyr)
x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

partial_join <- function(x, y, by_x, pattern_y) {
 idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
 idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))

 df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
                        y[unlist(idx_y), , drop = F])
 return(df)
}

partial_join(x, y, by_x = "string", pattern_y = "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

joran <- function(x, y, by_x, pattern_y) {
 library(dplyr)
 my_db <- src_sqlite(path = tempfile(), create= TRUE)
 x_tbl <- copy_to(dest = my_db, df = x)
 y_tbl <- copy_to(dest = my_db, df = y)

 result <- tbl(my_db, 
               sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y)))
 collect(result, n = Inf)
}

joran(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

stephen <- function(x, y, by_x, pattern_y) {
 library(dplyr)
 d <- full_join(mutate(x, i=1), 
                mutate(y, i=1), by = "i")
 # quoting issue here, defaulting to base-r
 d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]])
 d %>% 
  filter(take == T) %>% 
  select(-i, -take)
}

stephen(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor


feng <- function(x, y, by_x, pattern_y) {
 library(fuzzyjoin)

 by_string <- pattern_y
 names(by_string) <- by_x
 regex_inner_join(x, y, by = by_string)
}

feng(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

Benchmark

library(microbenchmark)
res <- microbenchmark(
 joran(x, y, "string", "seed"),
 stephen(x, y, "string", "seed"),
 feng(x, y, "string", "seed"),
 partial_join(x, y, "string", "seed")
)
res
#> Unit: microseconds
#>                                  expr       min         lq       mean
#>         joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646
#>       stephen(x, y, "string", "seed")  1320.161  1456.9415  1704.9218
#>          feng(x, y, "string", "seed")  5187.366  5625.8825  6926.2336
#>  partial_join(x, y, "string", "seed")   190.264   222.0055   257.7906
#>      median        uq        max neval cld
#>  20675.5855 21827.764  70707.324   100   c
#>   1579.8925  1670.719   9676.176   100 a  
#>   5842.8150  6065.530 107961.805   100  b 
#>    242.0735   283.870    523.649   100 a

set.seed(123123)
x_large <- x %>% sample_n(1000, replace = T)
y_large <- y %>% sample_n(1000, replace = T)


res_large <- microbenchmark(
 joran(x_large, y_large, "string", "seed"),
 # stephen(x_large, y_large, "string", "seed"),
 feng(x_large, y_large, "string", "seed"),
 partial_join(x_large, y_large, "string", "seed")
)
res_large
#> Unit: milliseconds
#>                                              expr       min        lq     mean    median        uq      max neval cld
#>         joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153    10   c
#>          feng(x_large, y_large, "string", "seed")  88.00369  89.85744 103.8686  93.84477  97.69121 200.0473    10 a  
#>  partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524    10  b

I don't know how this will perform for larger data, but it (or a variant of it) might be worth a try:

library(dplyr)

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

my_db <- src_sqlite(path = tempfile(),create= TRUE)
x_tbl <- copy_to(dest = my_db,df = x)
y_tbl <- copy_to(dest = my_db,df = y)

result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'"))
> collect(result)

Source: local data frame [2 x 4]

    idX         string   idY   seed
  (int)          (chr) (chr)  (chr)
1     1     Motorcycle     b otorcy
2     2 TractorTrailer     a ractor

I also can't speak to how the performance of this might vary across DBs. postgres or mysql might be better or worse at this sort of query.

This works, but it's going to be incredibly slow on huge datasets.

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

library(dplyr)
full_join(mutate(x, i=1), 
          mutate(y, i=1)) %>% 
  select(-i) %>% 
  filter(str_detect(string, seed))

来源：https://stackoverflow.com/questions/32914357/dplyr-inner-join-with-a-partial-string-match

标签