Check if a date is within an interval in R

前端 未结 6 918
暗喜
暗喜 2020-11-30 12:48

I have these three intervals defined:

YEAR_1  <- interval(ymd(\'2002-09-01\'), ymd(\'2003-08-31\'))
YEAR_2  <- interval(ymd(\'2003-09-01\'), ymd(\'20         


        
相关标签:
6条回答
  • 2020-11-30 13:34

    With lubridate and mapply:

    library(lubridate)
    
    dates <- # your data here
    
    # no idea how you generated these, so let's just copy them
    YEAR_1 <- interval(ymd('2002-09-01'), ymd('2003-08-31'))
    YEAR_2 <- interval(ymd('2003-09-01'), ymd('2004-08-31')) 
    YEAR_3 <- interval(ymd('2004-09-01'), ymd('2005-08-31'))
    
    # this should scale nicely
    sapply(c(YEAR_1, YEAR_2, YEAR_3), function(x) { mapply(`%within%`, dates, x) })
    

    The result is a matrix with one column per interval:

            [,1]  [,2]  [,3]
      [1,]  TRUE FALSE FALSE
      [2,] FALSE  TRUE FALSE
      [3,] FALSE  TRUE FALSE
      [4,] FALSE  TRUE FALSE
      ... etc. (100 rows in your example data)
    

    There might be a nicer way to code that with purrr, but I am too novice to purrr to see it.

    0 讨论(0)
  • 2020-11-30 13:35

    Everybody has their favourite tool for this, mine happens to be data.table because of what it refers to as its dt[i, j, by] logic.

    library(data.table)
    
    dt <- data.table(date = as.IDate(pt))
    
    dt[, YR := 0.0 ]                        # I am using a numeric for year here...
    
    dt[ date >= as.IDate("2002-09-01") & date <= as.IDate("2003-08-31"), YR := 1 ]
    dt[ date >= as.IDate("2003-09-01") & date <= as.IDate("2004-08-31"), YR := 2 ]
    dt[ date >= as.IDate("2004-09-01") & date <= as.IDate("2005-08-31"), YR := 3 ]
    

    I create a data.table object, converting your times to date for later comparison. I then set up a new column, defaulting to one.

    We then execute three conditional statements: for each of the three intervals (which I just create by hand using the endpoints), we set the YR value to 1, 2 or 3.

    This does have the desired effect as we can see from

    R> print(dt, topn=5, nrows=10)
               date YR
      1: 2003-06-11  1
      2: 2004-08-11  2
      3: 2004-06-03  2
      4: 2004-01-20  2
      5: 2005-02-25  3
     ---              
     96: 2002-08-07  0
     97: 2004-02-04  2
     98: 2006-04-10  0
     99: 2005-03-21  3
    100: 2003-12-01  2
    R> table(dt[, YR])
    
     0  1  2  3 
    26 31 31 12 
    R> 
    

    One could have done this also simply by computing date differences and truncating down, but it is also nice to be a little explicit at times.

    Edit: A more generic form just uses arithmetic on the dates:

    R> dt[, YR2 := trunc(as.numeric(difftime(as.Date(date), 
    +                                        as.Date("2001-09-01"),
    +                                        unit="days"))/365.25)]
    R> table(dt[, YR2])
    
     0  1  2  3  4  5  6  7  9 
     7 31 31 12  9  5  1  2  1 
    R> 
    

    This does the job in one line.

    0 讨论(0)
  • 2020-11-30 13:35

    Here is my take on it all. I Like to keep things tidy ;)

    > ## load libraries
    > library(tidyverse)
    > library(lubridate)
    > 
    > ## define times
    > times <- c(1055289600, 1092182400, 1086220800, 1074556800, 1109289600, 
    +            1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600, 
    +            1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800, 
    +            1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800, 
    +            1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000, 
    +            1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800, 
    +            1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600, 
    +            1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800, 
    +            1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600, 
    +            1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600, 
    +            1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600, 
    +            1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600, 
    +            1181692800, 1310947200, 1125446400, 1057104000, NA, 1085529600, 
    +            1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600, 
    +            1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000, 
    +            1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400, 
    +            1028678400, 1075852800, 1144627200, 1111363200, 1070236800)
    > times <- tibble(time = as.POSIXct(times, origin = "1970-01-01", tz = "UTC")) %>% 
    +   mutate(time = as_date(time),
    +          duplicated = duplicated(time)) ## there are duplicated times!
    > 
    > 
    > ## define years
    > year <- c("YEAR_1", "YEAR_2", "YEAR_3")
    > interval <- c(interval(ymd("2002-09-01", tz = "UTC"), ymd("2003-08-31", tz = "UTC")),
    +               interval(ymd("2003-09-01", tz = "UTC"), ymd("2004-08-31", tz = "UTC")),
    +               interval(ymd("2004-09-01", tz = "UTC"), ymd("2005-08-31", tz = "UTC")))
    > years <- tibble(year, interval)
    > 
    > ## check data
    > times
    # A tibble: 100 x 2
       time       duplicated
       <date>     <lgl>     
     1 2003-06-11 FALSE     
     2 2004-08-11 FALSE     
     3 2004-06-03 FALSE     
     4 2004-01-20 FALSE     
     5 2005-02-25 FALSE     
     6 2003-01-07 FALSE     
     7 2003-11-19 FALSE     
     8 2003-03-12 FALSE     
     9 2003-12-29 FALSE     
    10 2003-03-26 FALSE     
    # ... with 90 more rows
    > years
    # A tibble: 3 x 2
      year   interval                      
      <chr>  <S4: Interval>                
    1 YEAR_1 2002-09-01 UTC--2003-08-31 UTC
    2 YEAR_2 2003-09-01 UTC--2004-08-31 UTC
    3 YEAR_3 2004-09-01 UTC--2005-08-31 UTC
    > 
    > ## create new indicator variavble
    > ##
    > ## join datasets (length = 3 x 100)
    > ## indicator for year
    > ## drop NAs
    > ## keep "time" and "active"
    > ## join with times to get back at full dataset
    > ## as duplications, keep only one of them
    > crossing(times, years) %>% 
    +   mutate(active = if_else(time %within% interval, year, NA_character_)) %>% 
    +   drop_na(active) %>% 
    +   select(time, active) %>% 
    +   right_join(times, by = "time") %>% 
    +   distinct() %>% 
    +   select(-duplicated)
    # A tibble: 100 x 2
       time       active
       <date>     <chr> 
     1 2003-06-11 YEAR_1
     2 2004-08-11 YEAR_2
     3 2004-06-03 YEAR_2
     4 2004-01-20 YEAR_2
     5 2005-02-25 YEAR_3
     6 2003-01-07 YEAR_1
     7 2003-11-19 YEAR_2
     8 2003-03-12 YEAR_1
     9 2003-12-29 YEAR_2
    10 2003-03-26 YEAR_1
    # ... with 90 more rows
    
    0 讨论(0)
  • 2020-11-30 13:37

    We can :

    1st: create a data.table contains all YEAR_N

    > interval.dt <- data.table(Interval = c(YEAR_1, YEAR_2, YEAR_3))
    > interval.dt
    #                         Interval
    #1: 2002-09-01 UTC--2003-08-31 UTC
    #2: 2003-09-01 UTC--2004-08-31 UTC
    #3: 2004-09-01 UTC--2005-08-31 UTC
    

    2nd: define a function to get row index of interval.dt when a specific year date fall in range interval.dt$Interval using int_start(interval.dt$Interval) < year < int_end(interval.dt$Interval)

    >  findYearIndex <- function(year) {
          interval.dt[,which(int_start(interval.dt$Interval) < year & year < int_end(interval.dt$Interval))]
          }
    

    3rd: sapply findYearIndex function to every element in year date data.table

    > dt <- data.table(year = df)
    > dt$YearIndex <- paste("YEAR", sapply(dt$year, findYearIndex), sep = "_")
    
    > dt
      #         year       YearIndex
      #1: 2003-06-11          YEAR_1
      #2: 2004-08-11          YEAR_2
      #3: 2004-06-03          YEAR_2
      #4: 2004-01-20          YEAR_2
      #5: 2005-02-25          YEAR_3
      #6: 2003-01-07          YEAR_1
      #7: 2003-11-19          YEAR_2
      #8: 2003-03-12          YEAR_1
      #9: 2003-12-29          YEAR_2
     #10: 2003-03-26          YEAR_1
     #11: 2004-08-19          YEAR_2
     #12: 2004-07-19          YEAR_2
     #13: 2003-04-29          YEAR_1
     #14: 2003-05-06          YEAR_1
     #15: 2005-10-27 YEAR_integer(0)
     #ignore the rest of dt   
    
    0 讨论(0)
  • 2020-11-30 13:38

    You can use walk from package purrr for this:

    purrr::walk(1:3, ~(df$Year[as.POSIXlt(df$DATE) %within% get(paste0("YEAR_", .))] <<- .))
    

    or maybe you should write a loop to improve readability (unless taboo for you):

    df$YR <- NA
    for(i in 1:3){
      interval <- get(paste0("YEAR_", i))
      index <-which(as.POSIXlt(df$DATE) %within% interval)
      df$YR[index] <- i
    }
    
    0 讨论(0)
  • 2020-11-30 13:41

    You could try something like this:

    df = as.data.frame(structure(c(1055289600, 1092182400, 1086220800, 1074556800, 1109289600, 
                1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600, 
                1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800, 
                1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800, 
                1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000, 
                1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800, 
                1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600, 
                1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800, 
                1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600, 
                1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600, 
                1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600, 
                1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600, 
                1181692800, 1310947200, 1125446400, 1057104000, NA, 1085529600, 
                1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600, 
                1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000, 
                1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400, 
                1028678400, 1075852800, 1144627200, 1111363200, 1070236800), class = c("POSIXct", 
                                                                                       "POSIXt"), tzone = "UTC"))
    
    colnames(df)[1] = "dates"
    
    YEAR_1_Start = as.Date('2002-09-01')
    YEAR_1_End = as.Date('2003-08-31')
    
    YEAR_2_Start = as.Date('2003-09-01')
    YEAR_2_End = as.Date('2004-08-31')
    
    YEAR_3_Start = as.Date('2004-09-01')
    YEAR_3_End = as.Date('2005-08-31')
    
    
    df$year = lapply(df$dates,FUN = function(x){
              x = as.Date(x)
              if(is.na(x)){
                return(NA)
              }else if(YEAR_1_Start <= x & x <= YEAR_1_End){
                return("YEAR_1")
              }else if(YEAR_2_Start <= x & x <= YEAR_2_End){
                return("YEAR_2")
              }else if(YEAR_3_Start <= x & x <= YEAR_3_End){
                return("YEAR_3")
              }else{
                return("Other")
              }
    })
    
    df
             dates   year
    1   2003-06-11 YEAR_1
    2   2004-08-11 YEAR_2
    3   2004-06-03 YEAR_2
    4   2004-01-20 YEAR_2
    5   2005-02-25 YEAR_3
    6   2003-01-07 YEAR_1
    7   2003-11-19 YEAR_2
    8   2003-03-12 YEAR_1
    9   2003-12-29 YEAR_2
    10  2003-03-26 YEAR_1
    11  2004-08-19 YEAR_2
    12  2004-07-19 YEAR_2
    13  2003-04-29 YEAR_1
    14  2003-05-06 YEAR_1
    15  2005-10-27  Other
    16  2009-05-20  Other
    17  2006-02-23  Other
    18  2006-01-23  Other
    19  2003-02-18 YEAR_1
    20  2005-03-18 YEAR_3
    21  2004-01-13 YEAR_2
    22  2003-05-14 YEAR_1
    23  2004-06-23 YEAR_2
    24  2003-05-27 YEAR_1
    25  2002-12-11 YEAR_1
    26  2006-03-06  Other
    27  2004-01-14 YEAR_2
    28  2005-01-10 YEAR_3
    29  2003-08-11 YEAR_1
    30  2003-12-22 YEAR_2
    31  2003-02-26 YEAR_1
    32  2005-01-31 YEAR_3
    33  2004-06-30 YEAR_2
    34  2003-12-17 YEAR_2
    35  2005-05-09 YEAR_3
    36  2003-04-15 YEAR_1
    37  2006-05-08  Other
    38  2004-05-04 YEAR_2
    39  2003-06-24 YEAR_1
    40  2006-10-02  Other
    41  2004-06-15 YEAR_2
    42  2008-01-28  Other
    43  2005-07-05 YEAR_3
    44  2003-10-15 YEAR_2
    45  2002-10-14 YEAR_1
    46  2003-07-08 YEAR_1
    47  2004-03-10 YEAR_2
    48  2002-01-13  Other
    49  2005-12-07  Other
    50  2004-10-20 YEAR_3
    51  2007-02-07  Other
    52  2002-11-15 YEAR_1
    53  2003-12-03 YEAR_2
    54  2004-08-04 YEAR_2
    55  2003-07-09 YEAR_1
    56  2007-06-23  Other
    57  2004-07-29 YEAR_2
    58  2003-07-15 YEAR_1
    59  2003-08-20 YEAR_1
    60  2002-10-09 YEAR_1
    61  2003-11-01 YEAR_2
    62  2002-12-10 YEAR_1
    63  2002-05-24  Other
    64  2003-09-09 YEAR_2
    65  2004-09-28 YEAR_3
    66  2003-04-08 YEAR_1
    67  2004-04-12 YEAR_2
    68  2002-03-15  Other
    69  2002-08-20  Other
    70  2003-07-28 YEAR_1
    71  2004-06-15 YEAR_2
    72  2007-06-13  Other
    73  2011-07-18  Other
    74  2005-08-31 YEAR_3
    75  2003-07-02 YEAR_1
    76        <NA>     NA
    77  2004-05-26 YEAR_2
    78  2002-11-19 YEAR_1
    79  2004-08-04 YEAR_2
    80  2004-03-29 YEAR_2
    81  2005-03-14 YEAR_3
    82  2004-08-18 YEAR_2
    83  2004-09-08 YEAR_3
    84  2007-01-22  Other
    85  2009-01-22  Other
    86  2003-07-23 YEAR_1
    87  2002-05-15  Other
    88  2005-11-28  Other
    89  2002-08-26  Other
    90  2003-08-13 YEAR_1
    91  2002-10-22 YEAR_1
    92  2004-07-26 YEAR_2
    93  2005-10-13  Other
    94  2003-06-17 YEAR_1
    95  2003-08-12 YEAR_1
    96  2002-08-07  Other
    97  2004-02-04 YEAR_2
    98  2006-04-10  Other
    99  2005-03-21 YEAR_3
    100 2003-12-01 YEAR_2
    

    Edit:

    If you can get your intervals into a data.frame or data.table we can change the lapply easily to account for this:

    df$year = lapply(df$dates,FUN = function(x){
      x = as.Date(x)
      if(is.na(x)){
        return(NA)
      }
      for(i in 1:nrow(intervals){
        if(df.intervals[i,"Start"]<=x & x<= df.intervals[i,"End"]){
                        return(paste0(YEAR_,i))}
    }})
    
    0 讨论(0)
提交回复
热议问题