R: merge two data frames when either of two criteria matches

问题

Say I have two dataframes like the following:

# Left-hand table: `s` holds the key that is looked up in df2.
n <- c(2, 3, 5, 5, 6, 7)
s <- c("aa", "bb", "cc", "dd", "ee", "ff")
b <- c(2, 4, 5, 4, 3, 2)
df <- data.frame(n = n, s = s, b = b)
#  n  s b
#1 2 aa 2
#2 3 bb 4
#3 5 cc 5  
#4 5 dd 4
#5 6 ee 3
#6 7 ff 2

# Right-hand table: a df row should join when its `s` equals `s2` or `b2`.
n2 <- c(5, 6, 7, 6)
s2 <- c("aa", "bb", "cc", "ll")
b2 <- c("hh", "nn", "ff", "dd")
df2 <- data.frame(n2 = n2, s2 = s2, b2 = b2)

 #   n2 s2 b2
 #1  5 aa hh
 #2  6 bb nn
 #3  7 cc ff
 #4  6 ll dd

I want to merge them to achieve the following result:

 #n s  b n2 s2 b2
 #2 aa 2 5  aa hh
 #3 bb 4 6  bb nn
 #5 cc 5 7  cc ff
 #5 dd 4 6  ll dd

Basically, I want to merge the two data frames whenever a value in the s column of df is found in either the s2 or the b2 column of df2.

I know that merge works when I specify the matching columns from each data frame, but I am not sure how to add an OR condition to the merge function, or how to achieve this using functions from other packages such as dplyr.

Also, to clarify, there will be a situation where s2 and b2 have matches with s column in the same row. If this is the case, then just merge them once.

回答1:

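A couple of problems: 1) you have built both data frames with factors, which has a tendency to screw up matching and indexing, so I used stringsAsFactors = FALSE in the data.frame calls. 2) you have an ambiguous situation with no stated resolution when both s2 and b2 have matches in the s column (as does occur in your example: in row 3 of df2, both "cc" and "ff" appear in df$s). The first call below resolves it with pmax (the match further down df wins), the second with pmin (the match nearer the top of df wins):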

> df2[c("s")] <- list( c( df$s[pmax( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff ff
4  6 ll dd dd
> df2[c("s")] <- list( c( df$s[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff cc
4  6 ll dd dd

Once you have resolved the ambiguity to your satisfaction, just use the same method to extract the matching "b" values:

> df2[c("b")] <- list( c( df$b[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s b
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4

Modified df's:

> dput(df)
structure(list(n = c(2, 3, 5, 5, 6, 7), s = c("aa", "bb", "cc", 
"dd", "ee", "ff"), b = c(2, 4, 5, 4, 3, 2)), .Names = c("n", 
"s", "b"), row.names = c(NA, -6L), class = "data.frame")
> dput(df2)
structure(list(n2 = c(5, 6, 7, 6), s2 = c("aa", "bb", "cc", "ll"
), b2 = c("hh", "nn", "ff", "dd"), s = c("aa", "bb", "cc", "dd"
), b = c(2, 4, 5, 4)), row.names = c(NA, -4L), .Names = c("n2", 
"s2", "b2", "s", "b"), class = "data.frame")

One step solution:

> df2[c("s", "c")] <-  df[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE), c("s", "b")]
> df2
  n2 s2 b2  s c
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4

回答2:

If you are familiar with SQL you could use that:

library(sqldf)
# Inner join with an OR in the ON clause: a df row is paired with a df2 row
# when df's `s` equals that row's `s2` or its `b2`. The condition is checked
# once per row pair, so a pair that satisfies both halves still yields a
# single joined row.
res <- sqldf("SELECT l.*, r.*
              FROM df as l
              INNER JOIN df2 as r
              on l.s = r.s2 OR l.s = r.b2")

res
  n  s b n2 s2 b2
1 2 aa 2  5 aa hh
2 3 bb 4  6 bb nn
3 5 cc 5  7 cc ff
4 5 dd 4  6 ll dd
5 7 ff 2  7 cc ff

Data:

# Left-hand sample table. `s` is a factor here, which is what data.frame()
# produced for strings by default before R 4.0.
df <- data.frame(
  n = c(2, 3, 5, 5, 6, 7),
  s = factor(
    c("aa", "bb", "cc", "dd", "ee", "ff"),
    levels = c("aa", "bb", "cc", "dd", "ee", "ff")
  ),
  b = c(2, 4, 5, 4, 3, 2)
)

# Right-hand sample table with both candidate key columns stored as factors.
# Levels are spelled out so the integer codes match the original dump.
df2 <- data.frame(
  n2 = c(5, 6, 7, 6),
  s2 = factor(
    c("aa", "bb", "cc", "ll"),
    levels = c("aa", "bb", "cc", "ll")
  ),
  b2 = factor(
    c("hh", "nn", "ff", "dd"),
    levels = c("dd", "ff", "hh", "nn")
  )
)

回答3:

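One base R approach is to rbind two merges, one per candidate key column in df2. After each merge you need to re-create the df2 key column that merge() dropped, so that both results have the same columns and can be stacked. Also note that row 5 of the result below (the "ff" match) does not appear in your desired output: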

# Join df to df2 once per candidate key column (s2, then b2) and stack the
# two results. Row ids are carried through both merges so that a
# (df row, df2 row) pair matched through BOTH s2 and b2 is kept only once
# instead of appearing in t1 and again in t2.
df_keyed <- cbind(df, .row1 = seq_len(nrow(df)))
df2_keyed <- cbind(df2, .row2 = seq_len(nrow(df2)))

# merge() keeps the key under its by.x name only, so the df2 key column is
# re-created from `s`; rbind() needs both pieces to have the same columns.
t1 <- merge(df_keyed, df2_keyed, by.x = "s", by.y = "s2")
t1$s2 <- t1$s

t2 <- merge(df_keyed, df2_keyed, by.x = "s", by.y = "b2")
t2$b2 <- t2$s

finaldf <- rbind(t1, t2)

# Keep the first occurrence of each row pair, then drop the helper ids.
finaldf <- finaldf[!duplicated(finaldf[c(".row1", ".row2")]), ]
finaldf[c(".row1", ".row2")] <- NULL
rownames(finaldf) <- NULL

#    s n b n2 b2 s2
# 1 aa 2 2  5 hh aa
# 2 bb 3 4  6 nn bb
# 3 cc 5 5  7 ff cc
# 4 dd 5 4  6 dd ll
# 5 ff 7 2  7 ff cc

回答4:

We could use a fuzzy join. It might not be very efficient in this case if you have big data, but it's certainly readable. Using my package safejoin, which (in this case) wraps fuzzyjoin:

# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Inner join on a formula condition: keep row pairs where `s` equals `s2`
# or `b2`. X() / Y() appear to address columns of the first / second table
# respectively -- confirm against the safejoin documentation.
safe_inner_join(df, df2, ~ X("s") == Y("s2") | X("s") == Y("b2"))
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff

The fuzzyjoin syntax would be:

library(fuzzyjoin)
# Same OR join written directly against fuzzyjoin. multi_by names the
# columns involved: `s` from df, `s2` and `b2` from df2. multi_match_fun
# flags a pair as matching when x equals either the s2 or the b2 column of y.
# NOTE(review): match_fun = NULL presumably disables per-column matching in
# favour of multi_match_fun -- confirm against the fuzzyjoin documentation.
fuzzy_inner_join(df, df2, match_fun = NULL, 
                 multi_by = list(x = "s", y= c("s2","b2")), 
                 multi_match_fun = function(x,y) x == y[,"s2"] | x == y[,"b2"])
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff

来源：https://stackoverflow.com/questions/38753092/r-merge-two-data-frames-when-either-of-two-criteria-matches

标签

merge

data-manipulation