dplyr: inner_join with a partial string match

佐手、 提交于 2019-11-27 07:51:48

The fuzzyjoin library has two functions regex_inner_join and fuzzy_inner_join that allow you to match partial strings:

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))
x$string = as.character(x$string)
y$seed = as.character(y$seed)


library(fuzzyjoin)
x %>% regex_inner_join(y, by = c(string = "seed"))

  idX         string idY   seed
1   1     Motorcycle   b otorcy
2   2 TractorTrailer   a ractor


library(stringr)
x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect)


  idX         string idY   seed
1   1     Motorcycle   b otorcy
2   2 TractorTrailer   a ractor

You can also use base-r with this function (slightly adapted from this answer here: https://stackoverflow.com/a/34723496/3048453, it uses dplyr to bind the columns together, use cbind if you don't want to use dplyr):

partial_join <- function(x, y, by_x, pattern_y)
 idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
 idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))

 df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
                        y[unlist(idx_y), , drop = F])
 return(df)
}

With your example

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed")
df_merged
# # A tibble: 2 × 4
#     idX         string   idY   seed
#   <int>          <chr> <chr>  <chr>
# 1     1     Motorcycle     b otorcy
# 2     2 TractorTrailer     a ractor

Speed Benchmarks:

Functions


library(dplyr)
x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

partial_join <- function(x, y, by_x, pattern_y) {
 idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]])
 idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]])))

 df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],
                        y[unlist(idx_y), , drop = F])
 return(df)
}

partial_join(x, y, by_x = "string", pattern_y = "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

joran <- function(x, y, by_x, pattern_y) {
 library(dplyr)
 my_db <- src_sqlite(path = tempfile(), create= TRUE)
 x_tbl <- copy_to(dest = my_db, df = x)
 y_tbl <- copy_to(dest = my_db, df = y)

 result <- tbl(my_db, 
               sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y)))
 collect(result, n = Inf)
}

joran(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

stephen <- function(x, y, by_x, pattern_y) {
 library(dplyr)
 d <- full_join(mutate(x, i=1), 
                mutate(y, i=1), by = "i")
 # quoting issue here, defaulting to base-r
 d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]])
 d %>% 
  filter(take == T) %>% 
  select(-i, -take)
}

stephen(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor


feng <- function(x, y, by_x, pattern_y) {
 library(fuzzyjoin)

 by_string <- pattern_y
 names(by_string) <- by_x
 regex_inner_join(x, y, by = by_string)
}

feng(x, y, "string", "seed")
#> # A tibble: 2 × 4
#>     idX         string   idY   seed
#>   <int>          <chr> <chr>  <chr>
#> 1     1     Motorcycle     b otorcy
#> 2     2 TractorTrailer     a ractor

Benchmark

library(microbenchmark)
res <- microbenchmark(
 joran(x, y, "string", "seed"),
 stephen(x, y, "string", "seed"),
 feng(x, y, "string", "seed"),
 partial_join(x, y, "string", "seed")
)
res
#> Unit: microseconds
#>                                  expr       min         lq       mean
#>         joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646
#>       stephen(x, y, "string", "seed")  1320.161  1456.9415  1704.9218
#>          feng(x, y, "string", "seed")  5187.366  5625.8825  6926.2336
#>  partial_join(x, y, "string", "seed")   190.264   222.0055   257.7906
#>      median        uq        max neval cld
#>  20675.5855 21827.764  70707.324   100   c
#>   1579.8925  1670.719   9676.176   100 a  
#>   5842.8150  6065.530 107961.805   100  b 
#>    242.0735   283.870    523.649   100 a

set.seed(123123)
x_large <- x %>% sample_n(1000, replace = T)
y_large <- y %>% sample_n(1000, replace = T)


res_large <- microbenchmark(
 joran(x_large, y_large, "string", "seed"),
 # stephen(x_large, y_large, "string", "seed"),
 feng(x_large, y_large, "string", "seed"),
 partial_join(x_large, y_large, "string", "seed")
)
res_large
#> Unit: milliseconds
#>                                              expr       min        lq     mean    median        uq      max neval cld
#>         joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153    10   c
#>          feng(x_large, y_large, "string", "seed")  88.00369  89.85744 103.8686  93.84477  97.69121 200.0473    10 a  
#>  partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524    10  b 

I don't know how this will perform for larger data, but it (or a variant of it) might be worth a try:

library(dplyr)

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

my_db <- src_sqlite(path = tempfile(),create= TRUE)
x_tbl <- copy_to(dest = my_db,df = x)
y_tbl <- copy_to(dest = my_db,df = y)

result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'"))
> collect(result)

Source: local data frame [2 x 4]

    idX         string   idY   seed
  (int)          (chr) (chr)  (chr)
1     1     Motorcycle     b otorcy
2     2 TractorTrailer     a ractor

I also can't speak to how the performance of this might vary across DBs. postgres or mysql might be better or worse at this sort of query.

This works, but it's going to be incredibly slow on huge datasets.

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))
y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))

library(dplyr)
full_join(mutate(x, i=1), 
          mutate(y, i=1)) %>% 
  select(-i) %>% 
  filter(str_detect(string, seed))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!