Production of a BSTS Mean Absolute Percentage Error (MAPE) Plot from a Bayesian Time Series Analysis with MCMC using ggplot() and bsts() packages

问题

Problem:

I have a data frame called FID (see below) that contains two columns for Year & Month, and Sighting_Frequency (counts of birds).

The data frame contains 3 years of observations between 2015-2017, indicating I have 36 months of data. I have run a Bayesian time series analysis with MCMC using the bsts() function in the bsts package (see the R-code below) by following the tutorial below.

I want to produce a holdout Mean Absolute Percentage Error (MAPE) Plot as seen in the diagram below, which illustrates the actual vs predicted values with credible intervals for the holdout period using the package ggplot().

I am getting stuck when I am attempting to produce the d2 data frame (see the tutorial and R-code below) because I keep on experiencing this error message:-

Error in data.frame(c(10^as.numeric(-colMeans(bsts.model$one.step.prediction.errors[-(1:burn),  : 
  arguments imply differing number of rows: 48, 32

I have been struggling to figure out the problem. If anyone can help me solve this issue, I would be deeply appreciative.

Many thanks in advance.

Tutorial

https://multithreaded.stitchfix.com/blog/2016/04/21/forget-arima/?fbclid=IwAR1q6QD5j6AW21FY2_gqDEq-bwBKDJNtg9alKm3bDytzS51w-dVkDZMdbT4

Diagram

R-code:

################################################################################
##Time Series Model using the bsts() function
##################################################################################

##Open packages for the time series analysis

library(lubridate)
library(bsts)
library(dplyr)
library(ggplot2)

##Create a time series object
myts2 <- ts(BSTS_Dataframe$Sightings_Frequency, start=c(2015, 1), end=c(2017, 12), frequency=12)

##Upload the data into the windows() function
x <- window(myts2, start=c(2015, 01), end=c(2017, 12))
y <- log(x)

### Run the bsts model
ss <- AddLocalLinearTrend(list(), y)
ss <- AddSeasonal(ss, y, nseasons = 3)
# bsts.model <- bsts(y, state.specification = ss, family = "poisson", niter = 2, ping=0, seed=1234)
bsts.model <- bsts(y, state.specification = ss, family = "logit",  niter = 100, ping = 0, seed = 123)

##Open plotting window
dev.new()

##Plot the bsts.model
plot(bsts.model)

##Get a suggested number of burns
burn<-bsts::SuggestBurn(0.1, bsts.model)

##Predict

p<-predict.bsts(bsts.model, horizon = 12, burn=burn, quantiles=c(.25, .975))

##Actual vs predicted

d2 <- data.frame(
  # fitted values and predictions
  c(10^as.numeric(-colMeans(bsts.model$one.step.prediction.errors[-(1:burn),])+y),  
    10^as.numeric(p$mean)),
   # actual data and dates 
     as.numeric(BSTS_Dataframe$Sightings_Frequency),
     as.Date(time(BSTS_Dataframe$Sightings_Frequency)))

 ######################################
 Error message
 ######################################

 Error in data.frame(c(10^as.numeric(-colMeans(bsts.model$one.step.prediction.errors[-(1:burn),  : 
      arguments imply differing number of rows: 48, 32

names(d2) <- c("Fitted", "Actual", "Date")

### MAPE (mean absolute percentage error)
MAPE <- dplyr::filter(d2, year(Date)>2017) %>% dplyr::summarise(MAPE=mean(abs(Actual-Fitted)/Actual))

### 95% forecast credible interval
posterior.interval <- cbind.data.frame(
  10^as.numeric(p$interval[1,]),
  10^as.numeric(p$interval[2,]), 
  subset(d2, year(Date)>2017)$Date)
names(posterior.interval) <- c("LL", "UL", "Date")

### Join intervals to the forecast
d3 <- left_join(d2, posterior.interval, by="Date")

### Plot actual versus predicted with credible intervals for the holdout period
ggplot(data=d3, aes(x=Date)) +
  geom_line(aes(y=Actual, colour = "Actual"), size=1.2) +
  geom_line(aes(y=Fitted, colour = "Fitted"), size=1.2, linetype=2) +
  theme_bw() + theme(legend.title = element_blank()) + ylab("") + xlab("") +
  geom_vline(xintercept=as.numeric(as.Date("2017-12-01")), linetype=2) + 
  geom_ribbon(aes(ymin=LL, ymax=UL), fill="grey", alpha=0.5) +
  ggtitle(paste0("BSTS -- Holdout MAPE = ", round(100*MAPE,2), "%")) +
  theme(axis.text.x=element_text(angle = -90, hjust = 0))

FID Dataframe

structure(list(Year = structure(1:32, .Label = c("2015-01", "2015-02", 
"2015-03", "2015-04", "2015-05", "2015-08", "2015-09", "2015-10", 
"2015-11", "2015-12", "2016-01", "2016-02", "2016-03", "2016-04", 
"2016-05", "2016-07", "2016-08", "2016-09", "2016-10", "2016-11", 
"2016-12", "2017-01", "2017-02", "2017-03", "2017-04", "2017-05", 
"2017-07", "2017-08", "2017-09", "2017-10", "2017-11", "2017-12"
), class = "factor"), Sightings_Frequency = c(36L, 28L, 39L, 
46L, 5L, 22L, 10L, 15L, 8L, 33L, 33L, 29L, 31L, 23L, 8L, 9L, 
40L, 41L, 40L, 30L, 30L, 44L, 37L, 41L, 42L, 20L, 7L, 27L, 35L, 
27L, 43L, 38L)), class = "data.frame", row.names = c(NA, -32L
))

回答1:

#######################################################################################
##A Bayesian Structural Time Series Model with mcmc
#######################################################################################

##Open packages for the time series analysis

library(lubridate)
library(bsts)
library(dplyr)
library(ggplot2)
library(ggfortify)

###################################################################################
##Time Series Model using the bsts() function
##################################################################################

BSTS_Dataframe$Year <- lubridate::ymd(paste0(FID$Year,"-01"))

allDates <- seq.Date(
               min(FID$Year),
               max(FID$Year),
               "month")

FID <- FID %>% right_join(data.frame(Year = allDates), by = c("Year")) %>% dplyr::arrange(Year) %>%
                     tidyr::fill(Sightings_Frequency, .direction = "down")

##Create a time series object
myts2 <- ts(FID$Sightings_Frequency, start=c(2015, 1), end=c(2017, 12), frequency=12)

##Upload the data into the windows() function
x <- window(myts2, start=c(2015, 01), end=c(2016, 12))
y <- log(x)

##Produce a list for the object ss
ss <- list()

#ss <- AddLocalLinearTrend(list(), y)
ss <- AddSeasonal(ss, y, nseasons = 12)
ss <- AddLocalLevel(ss, y)
# bsts.model <- bsts(y, state.specification = ss, family = "poisson", niter = 2, ping=0, seed=1234)
# If these are poisson distributed, no need to use logit because it bounds reponse
# between 0-1
bsts.model <- bsts(y, state.specification = ss,  niter = 100, ping = 0, seed = 123)

##Open plotting window
dev.new()

##Plot the bsts.model
plot(bsts.model)

##Get a suggested number of burns
burn<-bsts::SuggestBurn(0.1, bsts.model)

##Predict

p<-predict.bsts(bsts.model, horizon = 12, burn=burn, quantiles=c(.25, .975))

p$mean

##Actual vs predicted

d2 <- data.frame(
  # fitted values and predictions
  c(exp(as.numeric(-colMeans(bsts.model$one.step.prediction.errors[-(1:burn),])+y)),  
    exp(as.numeric(p$mean))),
  # actual data and dates
  as.numeric(FID$Sightings_Frequency),
  as.Date(FID$Year))

names(d2) <- c("Fitted", "Actual", "Date")

### MAPE (mean absolute percentage error)
MAPE <- dplyr::filter(d2, lubridate::year(Date)>=2017) %>%
  dplyr::summarise(MAPE=mean(abs(Actual-Fitted)/Actual))

### 95% forecast credible interval
posterior.interval <- cbind.data.frame(
              exp(as.numeric(p$interval[1,])),
              exp(as.numeric(p$interval[2,])),
              tail(d2,12)$Date)

names(posterior.interval) <- c("LL", "UL", "Date")

### Join intervals to the forecast
d3 <- left_join(d2, posterior.interval, by="Date")

##Open plotting window
dev.new()

### Plot actual versus predicted with credible intervals for the holdout period
ggplot(data=d3, aes(x=Date)) +
  geom_line(aes(y=Actual, colour = "Actual"), size=1.2) +
  geom_line(aes(y=Fitted, colour = "Fitted"), size=1.2, linetype=2) +
  theme_bw() + theme(legend.title = element_blank()) + ylab("") + xlab("") +
  geom_vline(xintercept=as.numeric(as.Date("2017-12-01")), linetype=2) +
  geom_ribbon(aes(ymin=LL, ymax=UL), fill="grey", alpha=0.5) +
  ggtitle(paste0("BSTS -- Holdout MAPE = ", round(100*MAPE,2), "%")) +
  theme(axis.text.x=element_text(angle = -90, hjust = 0))

Plot

来源：https://stackoverflow.com/questions/64708201/production-of-a-bsts-mean-absolute-percentage-error-mape-plot-from-a-bayesian

标签

ggplot2

time-series

bayesian

predict