问题
Given a data.frame that contains a time series and one or ore grouping fields. So we have several time series - one for each grouping combination. But some dates are missing. So, what's the easiest (in terms of the most "tidyverse way") of adding these dates with the right grouping values?
Normally I would say I generate a data.frame with all dates and do a full_join with my time series. But now we have to do it for each combination of grouping values -- and fill in the grouping values.
Let's look at an example:
First I create a data.frame with missing values:
library(dplyr)
library(lubridate)
set.seed(1234)
# Time series should run vom 2017-01-01 til 2017-01-10
date <- data.frame(date = seq.Date(from=ymd("2017-01-01"), to=ymd("2017-01-10"), by="days"), v = 1)
# Two grouping dimensions
d1 <- data.frame(d1 = c("A", "B", "C", "D"), v = 1)
d2 <- data.frame(d2 = c(1, 2, 3, 4, 5), v = 1)
# Generate the data.frame
df <- full_join(date, full_join(d1, d2)) %>%
select(date, d1, d2)
# and ad to value columns
df$v1 <- runif(200)
df$v2 <- runif(200)
# group by the dimension columns
df <- df %>%
group_by(d1, d2)
# create missing dates
df.missing <- df %>%
filter(v1 <= 0.8)
# So now 2017-01-01 and 2017-01-10, A, 5 are missing now
df.missing %>%
filter(d1 == "A" & d2 == 5)
# A tibble: 8 x 5
# Groups: d1, d2 [1]
date d1 d2 v1 v2
<date> <fctr> <dbl> <dbl> <dbl>
1 2017-01-02 A 5 0.21879954 0.1335497
2 2017-01-03 A 5 0.32977018 0.9802127
3 2017-01-04 A 5 0.23902573 0.1206089
4 2017-01-05 A 5 0.19617465 0.7378315
5 2017-01-06 A 5 0.13373890 0.9493668
6 2017-01-07 A 5 0.48613541 0.3392834
7 2017-01-08 A 5 0.35698708 0.3696965
8 2017-01-09 A 5 0.08498474 0.8354756
So to add the missing dates I generate a data.frame with all dates:
start <- min(df.missing$date)
end <- max(df.missing$date)
all.dates <- data.frame(date=seq.Date(start, end, by="day"))
No I want to do something like (remember: df.missing is group_by(d1, d2))
df.missing %>%
do(my_join())
So let's define my_join():
my_join <- function(data) {
# get value of both dimensions
d1.set <- data$d1[[1]]
d2.set <- data$d2[[1]]
tmp <- full_join(data, all.dates) %>%
# First we need to ungroup. Otherwise we can't change d1 and d2 because they are grouping variables
ungroup() %>%
mutate(
d1 = d1.set,
d2 = d2.set
) %>%
group_by(d1, d2)
return(tmp)
}
Now we can call my_join() for each combination and have a look at "A/5"
df.missing %>%
do(my_join(.)) %>%
filter(d1 == "A" & d2 == 5)
# A tibble: 10 x 5
# Groups: d1, d2 [1]
date d1 d2 v1 v2
<date> <fctr> <dbl> <dbl> <dbl>
1 2017-01-02 A 5 0.21879954 0.1335497
2 2017-01-03 A 5 0.32977018 0.9802127
3 2017-01-04 A 5 0.23902573 0.1206089
4 2017-01-05 A 5 0.19617465 0.7378315
5 2017-01-06 A 5 0.13373890 0.9493668
6 2017-01-07 A 5 0.48613541 0.3392834
7 2017-01-08 A 5 0.35698708 0.3696965
8 2017-01-09 A 5 0.08498474 0.8354756
9 2017-01-01 A 5 NA NA
10 2017-01-10 A 5 NA NA
Great! That's what we were looking for. But we need to define d1 and d2 in my_join and it feels a little bit clumsy.
So, is there any tidyverse-way of this solution?
P.S.: I've put the code into a gist: https://gist.github.com/JerryWho/1bf919ef73792569eb38f6462c6d7a8e
回答1:
tidyr has some great tools for these sorts of problems. Take a look at complete.
library(dplyr)
library(tidyr)
library(lubridate)
want <- df.missing %>%
ungroup() %>%
complete(nesting(d1, d2), date = seq(min(date), max(date), by = "day"))
want %>% filter(d1 == "A" & d2 == 5)
#> # A tibble: 10 x 5
#> # Groups: d1 [1]
#> d1 d2 date v1 v2
#> <fctr> <dbl> <date> <dbl> <dbl>
#> 1 A 5 2017-01-01 NA NA
#> 2 A 5 2017-01-02 0.21879954 0.1335497
#> 3 A 5 2017-01-03 0.32977018 0.9802127
#> 4 A 5 2017-01-04 0.23902573 0.1206089
#> 5 A 5 2017-01-05 0.19617465 0.7378315
#> 6 A 5 2017-01-06 0.13373890 0.9493668
#> 7 A 5 2017-01-07 0.48613541 0.3392834
#> 8 A 5 2017-01-08 0.35698708 0.3696965
#> 9 A 5 2017-01-09 0.08498474 0.8354756
#> 10 A 5 2017-01-10 NA NA
回答2:
Here's a tidyverse way starting with df.missing
library(tidyverse)
ans <- df.missing %>%
nest(date) %>%
mutate(data = map(data, ~seq.Date(start, end, by="day"))) %>%
unnest(data) %>%
rename(date = data) %>%
left_join(., df.missing, by=c("date","d1","d2"))
ans %>% filter(d1 == "A" & d2 == 5)
Output
d1 d2 date v1 v2
<fctr> <dbl> <date> <dbl> <dbl>
1 A 5 2017-01-01 NA NA
2 A 5 2017-01-02 0.21879954 0.1335497
3 A 5 2017-01-03 0.32977018 0.9802127
4 A 5 2017-01-04 0.23902573 0.1206089
5 A 5 2017-01-05 0.19617465 0.7378315
6 A 5 2017-01-06 0.13373890 0.9493668
7 A 5 2017-01-07 0.48613541 0.3392834
8 A 5 2017-01-08 0.35698708 0.3696965
9 A 5 2017-01-09 0.08498474 0.8354756
10 A 5 2017-01-10 NA NA
-------------------------------------------------------------------------------------------------
Here's an alternative approach that uses expand.grid
and dplyr
verbs
with(df.missing, expand.grid(unique(date), unique(d1), unique(d2))) %>%
setNames(c("date", "d1", "d2")) %>%
left_join(., df.missing, by=c("date","d1","d2"))
output (head)
date d1 d2 v1 v2
1 2017-01-01 A 1 0.113703411 0.660754634
2 2017-01-02 A 1 0.316612455 0.422330675
3 2017-01-03 A 1 0.553333591 0.424109178
4 2017-01-04 A 1 NA NA
5 2017-01-05 A 1 NA NA
6 2017-01-06 A 1 0.035456727 0.352998502
回答3:
Here read.zoo
creates a wide form zoo object and to that we merge the dates. Then we convert that back to a long data frame using fortify.zoo
and spread out out v1
and v2
using spread
.
Note that:
if we can assume that each date appears in at least one combination of the split variables, i.e.
sort(unique(df.missing$date))
contains all the dates, then we could omit themerge
line and no joins would have to be done at all. The test datadf.missing
shown in the question does have this property:all(all.dates$date %in% df.missing$date) ## [1] TRUE
we could stop after the
merge
(or afterread.zoo
if each date is present at least once as in prior point) if a wide form zoo object can be used as that already has all the dates.
In the code below the line marked ### can be omitted with the development version of zoo (1.8.1):
library(dplyr)
library(tidyr)
library(zoo)
split.vars <- c("d1", "d2")
df.missing %>%
as.data.frame %>% ###
read.zoo(split = split.vars) %>%
merge(zoo(, seq(start(.), end(.), "day"))) %>%
fortify.zoo(melt = TRUE) %>%
separate(Series, c("v", split.vars)) %>%
spread(v, Value)
Update: Note simplification in zoo 1.8.1 .
回答4:
package tsibble
function fill_gaps
should do the job easily.
library(tsibble)
df.missing %>%
# tsibble format
as_tsibble(key = c(d1, d2), index = date) %>%
# fill gaps
fill_gaps(.full = TRUE)
来源:https://stackoverflow.com/questions/46130246/filling-missing-dates-in-a-grouped-time-series-a-tidyverse-way