Split date into several chunks ending by YYYY-12-31

爷,独闯天下 提交于 2019-12-04 10:54:19

A possible solution with :

library(data.table)
setDT(df)

df[df[, rep(.I, 1 + year(end) - year(start))]
   ][, `:=` (start = pmax(start[1], as.Date(paste0(year(start[1]) + 0:(.N-1), '-01-01'))),
             end = pmin(end[.N], as.Date(paste0(year(end[.N]) - (.N-1):0, '-12-31'))))
     , by = .(group, rleid(start))][]

which gives:

    group      start        end
 1:     a 2017-05-01 2017-12-31
 2:     a 2018-01-01 2018-09-01
 3:     a 2019-04-03 2019-12-31
 4:     a 2020-01-01 2020-04-03
 5:     b 2011-03-03 2011-12-31
 6:     b 2012-01-01 2012-05-03
 7:     b 2014-05-07 2014-12-31
 8:     b 2015-01-01 2015-12-31
 9:     b 2016-01-01 2016-04-02
10:     c 2017-02-01 2017-04-05

Two alternative solutions with :

# alternative 1:
df[, ri := rowid(group)
   ][df[, rep(.I, 1 + year(end) - year(start))]
     ][, `:=` (start = if (.N == 1) start else c(start[1], as.Date(paste0(year(start[1]) + 1:(.N-1), '-01-01') )),
               end = if (.N == 1) end else c(as.Date(paste0(year(end[.N]) - (.N-1):1, '-12-31') ), end[.N]))
       , by = .(group, ri)][, ri := NULL][]

# alternative 2:
df[, ri := rowid(group)
   ][df[, rep(.I, 1 + year(end) - year(start))]
     ][, `:=` (start = pmax(start[1], as.Date(paste0(year(start[1]) + 0:(.N-1), '-01-01'))),
               end = pmin(end[.N], as.Date(paste0(year(end[.N]) - (.N-1):0, '-12-31'))))
       , by = .(group, ri)][, ri := NULL][]

Used data:

df <- data.frame(group = c("a", "a", "b", "b", "c"),
                 start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07", "2017-02-01"),
                 end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02", "2017-04-05")) 
df[2:3] <- lapply(df[2:3], as.Date)
library(tidyverse)
library(lubridate)
df%>%
  mutate(end=as.Date(end),
     start=as.Date(start),
     diff=Map(":",0,1+year(end)-year(start)-1))%>%
  unnest()%>%
  mutate(end=pmin(end,as.Date(paste0(year(start)+diff,"-12-31"))),
         start=pmax(start,as.Date(paste0(year(start)+diff,"-1-1"))),
         diff=NULL)

A tibble: 9 x 3
  group start      end       
  <fct> <date>     <date>    
1 a     2017-05-02 2017-12-31
2 a     2018-01-01 2018-09-01
3 a     2019-04-03 2019-12-31
4 a     2020-01-01 2020-04-03
5 b     2020-01-01 2011-12-31
6 b     2012-01-01 2012-05-03
7 b     2014-05-07 2014-12-31
8 b     2015-01-01 2015-12-31
9 b     2015-01-01 2016-04-02

with the updated data just ran this exact function you will get:

 group      start        end
1      a 2017-05-01 2017-12-31
2      a 2018-01-01 2018-09-01
3      a 2019-04-03 2019-12-31
4      a 2020-01-01 2020-04-03
5      b 2011-03-03 2011-12-31
6      b 2012-01-01 2012-05-03
7      b 2014-05-07 2014-12-31
8      b 2015-01-01 2015-12-31
9      b 2016-01-01 2016-04-02
10     c 2017-02-01 2017-04-05

Here's a no-tidyverse/no-data.table version:

df <- data.frame(group = c("a", "a", "b", "b"),
                 start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07"),
                 end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02"), stringsAsFactors=FALSE)
# added stringsAsFactors =FALSE to your df for sanity
# reformatting start and end as Date  
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)

dfs <- split(df, rownames(df))
# split the data frame by rows

res <- do.call(rbind, lapply(dfs, function(.){
  s <- seq(from=.$start, to=.$end, by="day")
  # sequence form df$start to df$end, by days
  y <- format(s, "%Y")
  # years of that sequence
  s2 <- as.character(s)
  # formatting s as character -- otherwise sapply will get rid of the 
  # Date class and the result will look as numeric
  ys <- split(s2,y)
  # split the sequence by years
  data.frame(group=.$group, start=sapply(ys, head,1), end = sapply(ys, tail, 1), stringsAsFactors=FALSE)
  # take the first and last element from each "sub-vector" of the split sequence
}))
rownames(res) <- NULL # kill the nasty rownames
res

  group      start        end
1     a 2017-05-01 2017-12-31
2     a 2018-01-01 2018-09-01
3     a 2019-04-03 2019-12-31
4     a 2020-01-01 2020-04-03
5     b 2011-03-03 2011-12-31
6     b 2012-01-01 2012-05-03
7     b 2014-05-07 2014-12-31
8     b 2015-01-01 2015-12-31
9     b 2016-01-01 2016-04-02

Notice that the result has start and end columns as character as it was in your original data frame.

I'm sorry for the way base R treats Date (and POSIXct) objects - you never know when they may lose their class and become simple numbers. Here I avoided this "feature" by treating dates as character, except when date operations were needed e.g in creating the sequence of days.

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!