merge two data frames on non overlapping intervals

徘徊边缘 提交于 2019-12-02 07:34:29

sqldf will work, but I tried a 'pure' R solution. It works, but it is a little sloppy. I haven't figured out how to 'vectorize' the solution (remove the two for loops in the split.interval, and remove the need to lapply over id.split).

First I create two functions that can take one id, and merge 'a' and 'b' together:

split.interval = function(sub.a, sub.b) {
    begs = c(sub.a$beg_a,sub.b$beg_b)  
    ends = c(sub.a$end_a,sub.b$end_b)
    dates=c(begs,ends)
    dates = dates[order(dates)]
    d = data.frame(overallBeg = dates[-length(dates)], overallEnd = dates[-1])
    date.match = function(x,y) {
            s = match(x, d$overallBeg )
            e = match(y, d$overallEnd )
            join=as.Date(rep(NA,length(d$overallBeg)))
            for (i in 1:length(x)) join [s[i]:e[i]]= x[i]
            join
    }

    d$a_join = date.match(sub.a$beg_a,sub.a$end_a)
    d$b_join = date.match(sub.b$beg_b,sub.b$end_b)

    d = merge(sub.a,d,by.x='beg_a',by.y='a_join',all.y=T)
    d = merge(sub.b,d,by.x='beg_b',by.y='b_join',all.y=T)

    d$id=pmax(d$id.x,d$id.y,na.rm=T)
    d = d [order(d$overallBeg),c('id','beg_a','end_a','prop_a','beg_b','end_b','prop_b','overallBeg','overallEnd')]
    # This next line will lead to a bug if overallBeg == overallEnd
    d$overallEnd [d$overallEnd == c(d$overallBeg[-1],F)] = d$overallEnd [d$overallEnd == c(d$overallBeg[-1],F)] - 1  
    d

}

id.split = function (ids) {
    sub.a=a[a$id==ids,]
    sub.b=b[b$id==ids,]

    split.interval ( sub.a , sub.b )
}

Then I run the function for every ID, and bind them all together.

l=lapply(unique(c(a$id,b$id)), id.split) 
res = do.call(rbind,l)
row.names(res) = NULL
res

You can do that in two steps: first, compute all the desired intervals and put them in an intermediary table, then join this table with the two initial data.frames.

# First build all the desired intervals
names(a) <- c( "id", "valid_from", "valid_until", "prop_a" )
names(b) <- c( "id", "valid_from", "valid_until", "prop_b" )

intervals <- rbind( 
  data.frame( id = a$id, date = a$valid_from ),
  data.frame( id = a$id, date = a$valid_until ),
  data.frame( id = b$id, date = b$valid_from ),
  data.frame( id = b$id, date = b$valid_until )
)
intervals <- unique( intervals )
intervals <- intervals[ order(intervals$id, intervals$date), ]
n <- dim(intervals)[1]
intervals <- data.frame(
  id = intervals$id[-n],
  id2 = intervals$id[-1],
  valid_from = intervals$date[-n],
  valid_until = intervals$date[-1]
)
intervals <- intervals[ 
  intervals$id == intervals$id2, 
  c("id", "valid_from", "valid_until") 
]

Since the condition on which we join the data is not a simple equality, let us use sqldf.

library(sqldf)
d <- sqldf( "
  SELECT intervals.id,
         intervals.valid_from, intervals.valid_until, 
         a.prop_a, b.prop_b
  FROM intervals
  LEFT JOIN a
  ON          a.valid_from  <= intervals.valid_from 
  AND intervals.valid_until <=         a.valid_until
  AND intervals.id = a.id
  LEFT JOIN b
  ON          b.valid_from  <= intervals.valid_from 
  AND intervals.valid_until <=         b.valid_until
  AND intervals.id = b.id
" )
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!