How to merge two data frames in r by a common column with mismatched date/time values

前端 未结 2 917
天命终不由人
天命终不由人 2020-12-10 21:44

I wish to merge two datasets using ‘date time’ columns that are present in both (POSIXct format: dd/mm/yyyy hh:mm). Below is example data from the two datasets:



        
相关标签:
2条回答
  • 2020-12-10 22:07

    The selector of data.table seems well suited for that. It is really efficient and allow to merge the nearest value (upper or lower or both).

    Find on this website: https://www.r-bloggers.com/understanding-data-table-rolling-joins/ example for left, right joins...etc

    website[, join_time:=session_start_time]
    paypal[, join_time:=purchase_time]
    setkey(website, name, join_time)
    setkey(paypal, name, join_time)
    website[paypal, roll = T]
    

    About DT: https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html

    0 讨论(0)
  • 2020-12-10 22:17

    After first converting your datetime character strings to POSIXt classes, some combination of rounding and truncating those times should get you something you can use as the basis of a merge.

    First read in your data, and create corresponding POSIXt datetimes:

    dts1 <- structure(list(datetime = structure(1:6,
       .Label = c("30/03/2011 02:32", "30/03/2011 02:42",
       "30/03/2011 02:52", "30/03/2011 03:02", "30/03/2011 03:12",
       "30/03/2011 03:22"), class = "factor"), count = c(27L, 3L,
       0L, 1L, 15L, 0L), period = c(561L, 600L, 574L, 550L, 600L,
       597L)), .Names = c("datetime", "count", "period"),
       class = "data.frame", row.names = c(NA, -6L))
    dts2 <- structure(list(datetime = structure(1:7,
        .Label = c("30/03/2011 01:59", "30/03/2011 02:58",
        "30/03/2011 03:55", "30/03/2011 04:53", "30/03/2011 05:52",
        "30/03/2011 06:48", "30/03/2011 07:48"), class = "factor"),
        dist = c(23.9, 14.7, 10.4, 35.4, 56.1, 12.3, 10.7), car =
        c(1L, 1L, 2L, 1L, 1L, 1L, 1L), satd = c(3L, 7L, 9L, 3L, 7L,
        4L, 5L), alt = c(1.76, 6.36, -0.34, 3.55, -0.91, 6.58,
        4.18)), .Names = c("datetime", "dist", "car", "satd",
        "alt"), class = "data.frame", row.names = c(NA, -7L))
    
    # create corresponding POSIXlt vector
    # (you could update the 'datetime' columns in-place if you prefer)
    datetime1 <- strptime(dts1$datetime, format="%d/%m/%Y %H:%M")
    datetime2 <- strptime(dts2$datetime, format="%d/%m/%Y %H:%M")
    

    The following code produces a merged table based on the nearest hour in all cases. Inside the merge it's just prepending a column with the rounded times to each of your data frames, merging based on that (i.e., column number 1), then using the -1 index to remove that column at the end:

    # merge based on nearest hour
    merge(
        cbind(round(datetime1, "hours"), dts1),
        cbind(round(datetime2, "hours"), dts2),
        by=1, suffixes=c("_dts1", "_dts2")
    )[-1]
    
         datetime_dts1 count period    datetime_dts2 dist car satd  alt
    1 30/03/2011 02:32    27    561 30/03/2011 02:58 14.7   1    7 6.36
    2 30/03/2011 02:42     3    600 30/03/2011 02:58 14.7   1    7 6.36
    3 30/03/2011 02:52     0    574 30/03/2011 02:58 14.7   1    7 6.36
    4 30/03/2011 03:02     1    550 30/03/2011 02:58 14.7   1    7 6.36
    5 30/03/2011 03:12    15    600 30/03/2011 02:58 14.7   1    7 6.36
    6 30/03/2011 03:22     0    597 30/03/2011 02:58 14.7   1    7 6.36
    

    As above, but this time just truncating on hour:

    merge(
        cbind(trunc(datetime1, "hours"), dts1),
        cbind(trunc(datetime2, "hours"), dts2),
        by=1, suffixes=c("_dts1", "_dts2")
    )[-1]
    
         datetime_dts1 count period    datetime_dts2 dist car satd   alt
    1 30/03/2011 02:32    27    561 30/03/2011 02:58 14.7   1    7  6.36
    2 30/03/2011 02:42     3    600 30/03/2011 02:58 14.7   1    7  6.36
    3 30/03/2011 02:52     0    574 30/03/2011 02:58 14.7   1    7  6.36
    4 30/03/2011 03:02     1    550 30/03/2011 03:55 10.4   2    9 -0.34
    5 30/03/2011 03:12    15    600 30/03/2011 03:55 10.4   2    9 -0.34
    6 30/03/2011 03:22     0    597 30/03/2011 03:55 10.4   2    9 -0.34
    

    As above, but for dts1 treat the record as belonging to previous hour until 10 minutes past the hour, by subtracting 10*60 seconds before truncating. This one produces the same output you specified, though without more information I'm not sure that it's the exact rule you want.

    merge(
        cbind(trunc(datetime1 - 10*60, "hours"), dts1),
        cbind(trunc(datetime2, "hours"), dts2),
        by=1, suffixes=c("_dts1", "_dts2")
    )[-1]
    
         datetime_dts1 count period    datetime_dts2 dist car satd   alt
    1 30/03/2011 02:32    27    561 30/03/2011 02:58 14.7   1    7  6.36
    2 30/03/2011 02:42     3    600 30/03/2011 02:58 14.7   1    7  6.36
    3 30/03/2011 02:52     0    574 30/03/2011 02:58 14.7   1    7  6.36
    4 30/03/2011 03:02     1    550 30/03/2011 02:58 14.7   1    7  6.36
    5 30/03/2011 03:12    15    600 30/03/2011 03:55 10.4   2    9 -0.34
    6 30/03/2011 03:22     0    597 30/03/2011 03:55 10.4   2    9 -0.34
    

    You could tweak the details of which ones you round, which ones you truncate, and whether you first subtract/add some time depending on your specific rule.

    Edit:

    Not the most elegant, but here is a different approach that accommodates the more complicated conditional rule you described in your comments. This leans heavily on na.locf from the zoo package to first determine which dts2 times come before and after each dts1 record. With those in hand, it's just a matter of applying the rule to select the desired dts2 time, matching back to the original dts1 table, then merging.

    library(zoo)
    
    # create ordered list of all datetimes, using names to keep
    # track of which ones come from each data frame
    alldts <- sort(c(
        setNames(datetime1, rep("dts1", length(datetime1))),
        setNames(datetime2, rep("dts2", length(datetime2)))))
    is.dts1 <- names(alldts)=="dts1"
    
    # for each dts1 record, get previous closest dts2 time
    dts2.prev <- alldts
    dts2.prev[is.dts1] <- NA
    dts2.prev <- na.locf(dts2.prev, na.rm=FALSE)[is.dts1]
    
    # for each dts1 record, get next closest dts2 time
    dts2.next <- alldts
    dts2.next[is.dts1] <- NA
    dts2.next <- na.locf(dts2.next, na.rm=FALSE, fromLast=TRUE)[is.dts1]
    
    # for each dts1 record, apply rule to choose dts2 time
    use.prev <- !is.na(dts2.prev) & (alldts[is.dts1] - dts2.prev < 5)
    dts2.to.use <- ifelse(use.prev, as.character(dts2.prev), 
        as.character(dts2.next))
    
    # merge based on chosen dts2 times, prepended as character vector
    # for the purpose of merging
    merge(
        cbind(.dt=dts2.to.use[match(datetime1, alldts[is.dts1])], dts1),
        cbind(.dt=as.character(datetime2), dts2),
        by=".dt", all.x=TRUE, suffixes=c("_dts1", "_dts2")
    )[-1]
    
    0 讨论(0)
提交回复
热议问题