R: merge two data frames when either of two criteria matches

隐身守侯 提交于 2019-12-04 20:52:39

A couple of problems: 1) you have built a couple of data frames with factors, which have a tendency to screw up matching and indexing, so I used stringsAsFactors = FALSE in the data.frame calls. 2) you have an ambiguous situation with no stated resolution when both s2 and b2 have matches in the s column (as does occur in your example):

> df2[c("s")] <- list( c( df$s[pmax( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff ff
4  6 ll dd dd
> df2[c("s")] <- list( c( df$s[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff cc
4  6 ll dd dd

Once you resolve the ambiguity to your satisfaction, just use the same method to extract and match the "b"s:

> df2[c("b")] <- list( c( df$b[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s b
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4

Modified df's:

> dput(df)
structure(list(n = c(2, 3, 5, 5, 6, 7), s = c("aa", "bb", "cc", 
"dd", "ee", "ff"), b = c(2, 4, 5, 4, 3, 2)), .Names = c("n", 
"s", "b"), row.names = c(NA, -6L), class = "data.frame")
> dput(df2)
structure(list(n2 = c(5, 6, 7, 6), s2 = c("aa", "bb", "cc", "ll"
), b2 = c("hh", "nn", "ff", "dd"), s = c("aa", "bb", "cc", "dd"
), b = c(2, 4, 5, 4)), row.names = c(NA, -4L), .Names = c("n2", 
"s2", "b2", "s", "b"), class = "data.frame")

One step solution:

> df2[c("s", "c")] <-  df[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE), c("s", "b")]
> df2
  n2 s2 b2  s c
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4

If you are familiar with SQL you could use that:

# Load sqldf, which is used below to run a SQL statement against the data
# frames df and df2 by name.
library(sqldf)
# Inner join keeping every (df, df2) row pair where df's s equals either
# df2's s2 or df2's b2. A df2 row that matches through both columns
# therefore appears twice in the result.
res <- sqldf("SELECT l.*, r.*
              FROM df as l
              INNER JOIN df2 as r
              on l.s = r.s2 OR l.s = r.b2")

res
  n  s b n2 s2 b2
1 2 aa 2  5 aa hh
2 3 bb 4  6 bb nn
3 5 cc 5  7 cc ff
4 5 dd 4  6 ll dd
5 7 ff 2  7 cc ff

Data:

# Lookup table: numeric columns n and b, plus a factor key column s whose
# six levels "aa".."ff" each occur exactly once, in level order.
df <- data.frame(
  n = c(2, 3, 5, 5, 6, 7),
  s = factor(
    c("aa", "bb", "cc", "dd", "ee", "ff"),
    levels = c("aa", "bb", "cc", "dd", "ee", "ff")
  ),
  b = c(2, 4, 5, 4, 3, 2)
)

# Table to be matched against df: numeric n2 and two factor key columns.
# Levels are given explicitly so the integer codes do not depend on the
# locale's sort order.
df2 <- data.frame(
  n2 = c(5, 6, 7, 6),
  s2 = factor(c("aa", "bb", "cc", "ll"), levels = c("aa", "bb", "cc", "ll")),
  b2 = factor(c("hh", "nn", "ff", "dd"), levels = c("dd", "ff", "hh", "nn"))
)

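One base approach is to rbind two merges. Because merge() drops the by.y key column, you need to re-create the corresponding join key (s2 or b2) in each result so that the two frames have the same columns and can be concatenated. Also note that row 5 of the result below does not appear in your desired results: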

# Join df to df2 once per candidate key column (s2, then b2) and stack the
# results. Each merge keeps the shared key under its x-side name "s", so the
# df2 key column it consumed is copied back from "s"; both pieces then carry
# the same set of columns and rbind() can align them by name.
t1 <- merge(x = df, y = df2, by.x = "s", by.y = "s2")
t1[["s2"]] <- t1[["s"]]

t2 <- merge(x = df, y = df2, by.x = "s", by.y = "b2")
t2[["b2"]] <- t2[["s"]]

finaldf <- rbind(t1, t2)

#    s n b n2 b2 s2
# 1 aa 2 2  5 hh aa
# 2 bb 3 4  6 nn bb
# 3 cc 5 5  7 ff cc
# 4 dd 5 4  6 dd ll
# 5 ff 7 2  7 ff cc

We could use a fuzzy join. It might not be very efficient with big data, but it's certainly readable. Using my package safejoin, which (in this case) wraps around fuzzyjoin:

# Optional one-time install of safejoin from GitHub (left commented out).
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Inner join on a formula condition: keep row pairs where df's "s" equals
# df2's "s2" or df2's "b2". X() names a column of the first table and Y() a
# column of the second (inferred from the column names used here; confirm
# against the safejoin documentation).
safe_inner_join(df, df2, ~ X("s") == Y("s2") | X("s") == Y("b2"))
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff

The fuzzyjoin syntax would be:

library(fuzzyjoin)
# Inner join on a condition that spans several columns at once, which is why
# match_fun is NULL and the multi_* arguments are used instead:
#   multi_by        - columns taken from each side ("s" from df; "s2" and
#                     "b2" from df2).
#   multi_match_fun - receives those columns as x and y (y indexed by column
#                     name) and returns TRUE where "s" equals either "s2"
#                     or "b2".
fuzzy_inner_join(df, df2, match_fun = NULL, 
                 multi_by = list(x = "s", y= c("s2","b2")), 
                 multi_match_fun = function(x,y) x == y[,"s2"] | x == y[,"b2"])
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!