--- title: "Forecasting Swiss ILI counts using `forecast::auto.arima`" author: "Sebastian Meyer" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: fig_width: 6 fig_height: 4 toc: TRUE vignette: > %\VignetteIndexEntry{Forecasting Swiss ILI counts using `forecast::auto.arima`} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{ggplot2, surveillance, fanplot, forecast} --- ```{r setup_knitr, include = FALSE} knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, fig.align = "center", dev.args = list(pointsize = 10)) ``` ```{r setup} options(digits = 4) # for more compact numerical outputs library("HIDDA.forecasting") library("ggplot2") source("setup.R", local = TRUE) # define test periods (OWA, TEST) ``` In this vignette, we use forecasting methods provided by: ```{r} library("forecast") ``` The corresponding software reference is: ```{r, echo = FALSE, results = "asis"} cat("
") print(citation(package = "forecast", auto = TRUE), style = "html") cat("\n") ``` ## Modelling ARIMA models assume a Gaussian response, so we need to work with transformed counts: ```{r BoxCox.lambda} BoxCox.lambda(CHILI, method = "loglik") ``` => Box-Cox procedure suggests a log-transformation (`lambda = 0`). ```{r arimafit, eval = FALSE} arimafit <- auto.arima(CHILI, lambda = 0, stepwise = FALSE, approximation = FALSE) ``` The above standard approach cannot automatically account for seasonality because the data have no regular frequency (not a standard `"ts"`) ... But we can manually add sine/cosine covariates and a christmas indicator just like in the endemic part of the `hhh4` model (see `vignette("CHILI_hhh4")`). ```{r sarimaxfit, cache = TRUE} sarima_cov <- t(sapply(2*pi*seq_along(CHILI)/52.1775, function (x) c(sin = sin(x), cos = cos(x)))) sarimax_cov <- cbind(sarima_cov, christmas = as.integer(strftime(index(CHILI), "%V") == "52")) sarimaxfit <- auto.arima(CHILI, lambda = 0, stepwise = FALSE, approximation = FALSE, xreg = sarimax_cov) summary(sarimaxfit) ``` ```{r} CHILIdat <- fortify(CHILI) CHILIdat$sarimaxfitted <- fitted(sarimaxfit) CHILIdat <- cbind(CHILIdat, sapply(c(sarimaxlower=0.025, sarimaxupper=0.975), function (p) InvBoxCox(BoxCox(fitted(sarimaxfit), lambda = sarimaxfit$lambda) + qnorm(p, sd = sqrt(sarimaxfit$sigma2)), lambda = sarimaxfit$lambda))) ``` ```{r sarimaxfitted, fig.width = 7, fig.height = 4} ggplot(CHILIdat, aes(x=Index, ymin=sarimaxlower, y=sarimaxfitted, ymax=sarimaxupper)) + geom_ribbon(fill="orange") + geom_line(col="darkred") + geom_point(aes(y=CHILI), pch=20) + scale_y_sqrt(expand = c(0,0), limits = c(0,NA)) ``` ## One-week-ahead forecasts We compute `r length(OWA)` one-week-ahead forecasts from `r format_period(OWA)` (the `OWA` period). The model selected above is refitted at each time point, but we do not repeat `auto.arima()` model selection. This is similar to so-called time-series cross-validation as implemented in `forecast::tsCV()`. However, `tsCV()` only computes absolute errors of the point forecasts, whereas we are interested in assessing probabilistic forecasts so also need the forecast variance. For each time point, forecasting with `Arima` takes about 0.5 seconds, i.e., computing all one-week-ahead forecasts takes approx. `r sprintf("%.1f", length(OWA) * 0.5/60)` minutes ... 
```{r, include = FALSE, eval = FALSE}
## check update.Arima: we obtain the same fit if we don't change the subset
stopifnot(all.equal(
    sarimaxfit[setdiff(names(sarimaxfit), c("call", "series"))],
    update(sarimaxfit)[setdiff(names(sarimaxfit), c("call", "series"))]
))
```

```{r sarimaxowa, eval = !file.exists("sarimaxowa.RData"), results = "hide"}
sarimaxowa <- t(simplify2array(lapply(X = OWA, FUN = function (t) {
    sarimaxfit_t <- update(sarimaxfit, subset = 1:t)
    unlist(predict(sarimaxfit_t, n.ahead = 1,
                   newxreg = sarimax_cov[t+1,,drop=FALSE]))
})))
save(sarimaxowa, file = "sarimaxowa.RData")
```

```{r, include = FALSE}
load("sarimaxowa.RData")
```

ARIMA forecasts for the log-counts are normal with mean `pred` and variance
`se^2` => the forecast distributions for the original counts (back-transformation
via `exp()`) are log-normal.
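As a side note (a small sketch, not part of the original analysis), point
forecasts on the original count scale follow directly from these log-normal
distributions: the median is `exp(pred)` and the mean is `exp(pred + se^2/2)`.

```{r, eval = FALSE}
## point forecasts of the counts implied by the log-normal forecast distributions
sarimaxowa_median <- exp(sarimaxowa[, "pred"])                             # medians
sarimaxowa_mean   <- exp(sarimaxowa[, "pred"] + sarimaxowa[, "se"]^2 / 2)  # means
```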
```{r sarimaxowa_pit, fig.width = 3, fig.height = 3, echo = -1}
par(mar = c(5,5,1,1), las = 1)
.PIT <- plnorm(CHILI[OWA+1], meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"])
hist(.PIT, breaks = seq(0, 1, 0.1), freq = FALSE, main = "", xlab = "PIT")
abline(h = 1, lty = 2, col = "grey")
```

```{r sarimaxowa_scores}
sarimaxowa_scores <- scores_lnorm(
    x = CHILI[OWA+1],
    meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"],
    which = c("dss", "logs"))
summary(sarimaxowa_scores)
```

Note that discretized forecast distributions yield almost identical scores
(essentially due to the large counts):

```{r sarimaxowa_scores_discretized}
sarimaxowa_scores_discretized <- scores_lnorm_discrete(
    x = CHILI[OWA+1],
    meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"],
    which = c("dss", "logs"))
summary(sarimaxowa_scores_discretized)
```

```{r, include = FALSE}
stopifnot(
    all.equal(sarimaxowa_scores_discretized[,"dss"],
              sarimaxowa_scores[,"dss"], tolerance = 1e-5),
    all.equal(sarimaxowa_scores_discretized[,"logs"],
              sarimaxowa_scores[,"logs"], tolerance = 1e-5)
)
```

```{r sarimaxowa_plot, echo = -2}
sarimaxowa_quantiles <- sapply(X = 1:99/100, FUN = qlnorm,
                               meanlog = sarimaxowa[,"pred"],
                               sdlog = sarimaxowa[,"se"])
par(mar = c(5,5,1,1))
osaplot(
    quantiles = sarimaxowa_quantiles, probs = 1:99/100,
    observed = CHILI[OWA+1], scores = sarimaxowa_scores,
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)
```

## Long-term forecasts

```{r, include = FALSE, eval = FALSE}
## try update.Arima() with the first test period
TEST1 <- TEST[[1]]
fit1 <- update(sarimaxfit, subset = 1:(TEST1[1]-1))
sarimaxfor1 <- predict(fit1, n.ahead = length(TEST1),
                       newxreg = sarimax_cov[TEST1,,drop=FALSE])
## check that quantiles from predict() agree with quantiles from forecast()
q <- sapply(X = c(0.1,0.025,0.9,0.975), FUN = qlnorm,
            meanlog = sarimaxfor1$pred, sdlog = sarimaxfor1$se)
fc <- forecast(fit1, xreg = sarimax_cov[TEST1,,drop=FALSE], fan = FALSE)
stopifnot(all.equal(q, unclass(cbind(fc$lower, fc$upper)), check.attributes = FALSE))
## check against quantiles from simulate.Arima()
set.seed(3)
sarimaxsims1 <- replicate(1000, simulate(fit1, xreg = sarimax_cov[TEST1,,drop=FALSE]))
qsims <- t(apply(sarimaxsims1, 1, quantile, probs = c(0.1, 0.025, 0.9, 0.975)))
matplot(q); matlines(qsims)  # ok
stopifnot(all.equal(q, unname(qsims), tolerance = 0.05))
```

```{r sarimaxfor}
sarimaxfor <- lapply(TEST, function (testperiod) {
    t0 <- testperiod[1] - 1
    fit0 <- update(sarimaxfit, subset = 1:t0)
    fc <- predict(fit0, n.ahead = length(testperiod),
                  newxreg = sarimax_cov[testperiod,,drop=FALSE])
    list(testperiod = testperiod,
         observed = as.vector(CHILI[testperiod]),
         pred = fc$pred, se = fc$se)
})
```

```{r sarimaxfor_pit, echo = -1}
par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(sarimaxfor))), las = 1)
invisible(lapply(sarimaxfor, function (x) {
    PIT <- plnorm(x$observed, meanlog = x$pred, sdlog = x$se)
    hist(PIT, breaks = seq(0, 1, 0.1), freq = FALSE,
         main = format_period(x$testperiod, fmt = "%Y", collapse = "/"))
    abline(h = 1, lty = 2, col = "grey")
}))
```

```{r sarimaxfor_plot, echo = -1, fig.show = "hold"}
par(mar = c(5,5,1,1))
t(sapply(sarimaxfor, function (x) {
    quantiles <- sapply(X = 1:99/100, FUN = qlnorm,
                        meanlog = x$pred, sdlog = x$se)
    scores <- scores_lnorm(x = x$observed,
                           meanlog = x$pred, sdlog = x$se,
                           which = c("dss", "logs"))
    osaplot(quantiles = quantiles, probs = 1:99/100,
            observed = x$observed, scores = scores,
            start = x$testperiod[1], xlab = "Week", ylim = c(0,60000),
            fan.args = list(ln = c(0.1,0.9), rlab = NULL))
    colMeans(scores)
}))
```