R: merge two data frames when either of two criteria matches

我是研究僧i 提交于 2019-12-06 13:01:01

问题


Say I have two dataframes like the following:

# First (left-hand) table: numeric n, string key s, numeric b.
n <- c(2, 3, 5, 5, 6, 7)
s <- c("aa", "bb", "cc", "dd", "ee", "ff")
b <- c(2, 4, 5, 4, 3, 2)
df <- data.frame(n = n, s = s, b = b)
#  n  s b
#1 2 aa 2
#2 3 bb 4
#3 5 cc 5  
#4 5 dd 4
#5 6 ee 3
#6 7 ff 2

# Second (right-hand) table: numeric n2 plus two string columns, s2 and b2,
# either of which may hold a value that also occurs in df$s.
n2 <- c(5, 6, 7, 6)
s2 <- c("aa", "bb", "cc", "ll")
b2 <- c("hh", "nn", "ff", "dd")
df2 <- data.frame(n2 = n2, s2 = s2, b2 = b2)

 #   n2 s2 b2
 #1  5 aa hh
 #2  6 bb nn
 #3  7 cc ff
 #4  6 ll dd

I want to merge them to achieve the following result:

 #n s  b n2 s2 b2
 #2 aa 2 5  aa hh
 #3 bb 4 6  bb nn
 #5 cc 5 7  cc ff
 #5 dd 4 6  ll dd

Basically, what I want to achieve is to merge the two dataframes whenever a value in column s of the first dataframe is found in either the s2 or the b2 column of the second dataframe (df2).

I know that merge can work when I specify the two columns from each dataframe, but I am not sure how to add the OR condition in the merge function, or how to achieve this goal using other commands from packages such as dplyr.

Also, to clarify, there will be situations where both s2 and b2 in the same row of df2 match a value in the s column. If this is the case, then just merge that row once.


回答1:


A couple of problems: 1) you have built a couple of dataframes with factors, which has a tendency to screw up matching and indexing, so I used stringsAsFactors = FALSE in the data.frame calls. 2) you have an ambiguous situation with no stated resolution when both s2 and b2 have matches in the s column (as does occur in your example):

> df2[c("s")] <- list( c( df$s[pmax( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff ff
4  6 ll dd dd
> df2[c("s")] <- list( c( df$s[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s
1  5 aa hh aa
2  6 bb nn bb
3  7 cc ff cc
4  6 ll dd dd

Once you resolve the ambiguity to your satisfaction, just use the same method to extract and match the "b"s:

> df2[c("b")] <- list( c( df$b[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE)]))
> df2
  n2 s2 b2  s b
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4

Modified df's:

> dput(df)
structure(list(n = c(2, 3, 5, 5, 6, 7), s = c("aa", "bb", "cc", 
"dd", "ee", "ff"), b = c(2, 4, 5, 4, 3, 2)), .Names = c("n", 
"s", "b"), row.names = c(NA, -6L), class = "data.frame")
> dput(df2)
structure(list(n2 = c(5, 6, 7, 6), s2 = c("aa", "bb", "cc", "ll"
), b2 = c("hh", "nn", "ff", "dd"), s = c("aa", "bb", "cc", "dd"
), b = c(2, 4, 5, 4)), row.names = c(NA, -4L), .Names = c("n2", 
"s2", "b2", "s", "b"), class = "data.frame")

One step solution:

> df2[c("s", "c")] <-  df[pmin( match( df2$s2 , df$s), match(df2$b2, df$s),na.rm=TRUE), c("s", "b")]
> df2
  n2 s2 b2  s c
1  5 aa hh aa 2
2  6 bb nn bb 4
3  7 cc ff cc 5
4  6 ll dd dd 4



回答2:


If you are familiar with SQL you could use that:

# Express the OR condition directly as a SQL join predicate; the query refers
# to the data frames df and df2 by name in its FROM / JOIN clauses.
library(sqldf)
# Inner join: a df row is paired with a df2 row when s equals s2 or s equals b2.
res <- sqldf("SELECT l.*, r.*
              FROM df as l
              INNER JOIN df2 as r
              on l.s = r.s2 OR l.s = r.b2")

res
  n  s b n2 s2 b2
1 2 aa 2  5 aa hh
2 3 bb 4  6 bb nn
3 5 cc 5  7 cc ff
4 5 dd 4  6 ll dd
5 7 ff 2  7 cc ff

Data:

# dput() form of the question's first data frame; column s is stored as a
# factor (integer codes 1:6 with the labels listed in .Label).
# .Label / .Names are legacy attribute spellings that structure() renames to
# levels / names.
df<-structure(list(n = c(2, 3, 5, 5, 6, 7), s = structure(1:6, .Label = c("aa", 
"bb", "cc", "dd", "ee", "ff"), class = "factor"), b = c(2, 4, 
5, 4, 3, 2)), .Names = c("n", "s", "b"), row.names = c(NA, -6L
), class = "data.frame")

# dput() form of the second data frame; s2 and b2 are both factors, each with
# its own set of levels.
df2<-structure(list(n2 = c(5, 6, 7, 6), s2 = structure(1:4, .Label = c("aa", 
"bb", "cc", "ll"), class = "factor"), b2 = structure(c(3L, 4L, 
2L, 1L), .Label = c("dd", "ff", "hh", "nn"), class = "factor")), .Names = c("n2", 
"s2", "b2"), row.names = c(NA, -4L), class = "data.frame")



回答3:


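One base approach is to rbind two merges. Because merge() drops the by.y key column (s2 or b2), you need to re-create it in each merged result so that the two frames have the same columns and can be concatenated. Also note that row 5 of the output does not appear in the desired result: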

# Emulate an OR join in base R: join on s == s2, join on s == b2, then stack.
# merge() names the key column after by.x and drops the by.y column, so each
# branch restores the dropped column from the key `s` to give both results
# the same set of columns for rbind().
t1 <- merge(df, df2, by.x = "s", by.y = "s2")
t1$s2 <- t1$s

# A df2 row whose s2 and b2 hold the same value is already matched through s2
# above; leave it out of the b2 join so the same pair of rows is not emitted
# twice. as.character() is needed because s2 and b2 may be factors with
# different level sets, which cannot be compared with `==` directly.
same_key <- !is.na(df2$s2) & !is.na(df2$b2) &
  as.character(df2$s2) == as.character(df2$b2)
t2 <- merge(df, df2[!same_key, , drop = FALSE], by.x = "s", by.y = "b2")
t2$b2 <- t2$s

# rbind() on data frames aligns columns by name, so the differing column
# order of t1 (.., b2, s2) and t2 (.., s2, b2) is fine.
finaldf <- rbind(t1, t2)

#    s n b n2 b2 s2
# 1 aa 2 2  5 hh aa
# 2 bb 3 4  6 nn bb
# 3 cc 5 5  7 ff cc
# 4 dd 5 4  6 dd ll
# 5 ff 7 2  7 ff cc



回答4:


We could use a fuzzy join, it might not be very efficient in this case if you have big data but it's certainly readable. Using my package safejoin which wraps (in this case) around fuzzyjoin :

# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Inner join with the OR condition written as a one-sided formula: a pair of
# rows is kept when s equals s2 or s equals b2.
# NOTE(review): X() / Y() appear to refer to columns of the first (df) and
# second (df2) table respectively; confirm against the safejoin docs.
safe_inner_join(df, df2, ~ X("s") == Y("s2") | X("s") == Y("b2"))
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff

The fuzzyjoin syntax would be:

library(fuzzyjoin)
# multi_by lists the columns passed to multi_match_fun: "s" from df as x, and
# "s2" and "b2" from df2 as y (indexed by column name below). The function
# flags a row pair as a match when s equals either s2 or b2. match_fun is set
# to NULL because the matching is done by multi_match_fun instead.
fuzzy_inner_join(df, df2, match_fun = NULL, 
                 multi_by = list(x = "s", y= c("s2","b2")), 
                 multi_match_fun = function(x,y) x == y[,"s2"] | x == y[,"b2"])
#   n  s b n2 s2 b2
# 1 2 aa 2  5 aa hh
# 2 3 bb 4  6 bb nn
# 3 5 cc 5  7 cc ff
# 4 5 dd 4  6 ll dd
# 5 7 ff 2  7 cc ff


来源:https://stackoverflow.com/questions/38753092/r-merge-two-data-frames-when-either-of-two-criteria-matches

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!