--- title: "Forecasting Swiss ILI counts using `surveillance::hhh4`" author: "Sebastian Meyer" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: fig_width: 6 fig_height: 4 toc: TRUE vignette: > %\VignetteIndexEntry{Forecasting Swiss ILI counts using `surveillance::hhh4`} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{ggplot2, surveillance, fanplot} --- ```{r setup_knitr, include = FALSE} knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, fig.align = "center", dev.args = list(pointsize = 10)) ``` ```{r setup} options(digits = 4) # for more compact numerical outputs library("HIDDA.forecasting") library("ggplot2") source("setup.R", local = TRUE) # define test periods (OWA, TEST) ``` In this vignette, we use forecasting methods provided by: ```{r} library("surveillance") ``` The corresponding software reference is: ```{r, echo = FALSE, results = "asis"} cat("
") print(citation(package = "surveillance", auto = TRUE), style = "html") cat("\n") ``` ## Modelling ```{r} CHILI.sts <- sts(observed = CHILI, epoch = as.integer(index(CHILI)), epochAsDate = TRUE) ``` ```{r hhh4fit} (weeksInYear <- table(year(CHILI.sts))) mean(weeksInYear) ## long-term average is 52.1775 weeks per year f1 <- addSeason2formula(~ 1, period = 52.1775, timevar = "index") ## equivalent: f1 <- addSeason2formula(~ 1, period = 365.2425, timevar = "t") hhh4fit <- hhh4(stsObj = CHILI.sts, control = list( ar = list(f = update(f1, ~. + christmas)), end = list(f = f1), family = "NegBin1", data = list(index = 1:nrow(CHILI.sts), christmas = as.integer(epochInYear(CHILI.sts) %in% 52)) )) summary(hhh4fit, maxEV = TRUE) ``` Alternatively, use a yearly varying frequency of 52 or 53 weeks for the sinusoidal time effects: ```{r hhh4fit_varfreq} f1_varfreq <- ~ 1 + sin(2*pi*epochInYear/weeksInYear) + cos(2*pi*epochInYear/weeksInYear) hhh4fit_varfreq <- update(hhh4fit, ar = list(f = update(f1_varfreq, ~. + christmas)), end = list(f = f1_varfreq), data = list(epochInYear = epochInYear(CHILI.sts), weeksInYear = rep(weeksInYear, weeksInYear))) AIC(hhh4fit, hhh4fit_varfreq) ``` ```{r hhh4fit_maxEV, include = FALSE} plot(hhh4fit, hhh4fit_varfreq, type = "maxEV", matplot.args = list(ylab = expression(lambda[t]), xlim =c(2002,2007))) ``` ```{r hhh4fitted_components, fig.width = 7, fig.height = 4} plot(hhh4fit, pch = 20, names = "", ylab = "") ``` ```{r hhh4fitted_lambdat, fig.width = 3, fig.height = 2} qplot(x = 1:53, y = drop(predict(hhh4fit, newSubset = 1:53, type = "ar.exppred")), geom = "line", ylim = c(0,1.3), xlab = "week", ylab = expression(lambda[t])) + geom_hline(yintercept = 1, lty = 2) ``` ```{r} CHILIdat <- fortify(CHILI) ## add fitted mean and CI to CHILIdat CHILIdat$hhh4upper <- CHILIdat$hhh4lower <- CHILIdat$hhh4fitted <- NA_real_ CHILIdat[hhh4fit$control$subset,"hhh4fitted"] <- fitted(hhh4fit) CHILIdat[hhh4fit$control$subset,c("hhh4lower","hhh4upper")] <- sapply(c(0.025, 0.975), function (p) qnbinom(p, mu = fitted(hhh4fit), size = exp(hhh4fit$coefficients[["-log(overdisp)"]]))) ``` ```{r hhh4fitted, fig.width = 7, fig.height = 4} ggplot(CHILIdat, aes(x=Index, ymin=hhh4lower, y=hhh4fitted, ymax=hhh4upper)) + geom_ribbon(fill="orange") + geom_line(col="darkred") + geom_point(aes(y=CHILI), pch=20) + scale_y_sqrt(expand = c(0,0), limits = c(0,NA)) ``` ## One-week-ahead forecasts We compute `r length(OWA)` one-week-ahead forecasts from `r format_period(OWA)` (the `OWA` period), which takes roughly 4 seconds (we could parallelize using the `cores` argument of `oneStepAhead()`). 
```{r hhh4owa, eval = !file.exists("hhh4owa.RData"), results = "hide"}
hhh4owa <- oneStepAhead(hhh4fit, range(OWA), type = "rolling", verbose = FALSE)
save(hhh4owa, file = "hhh4owa.RData")
```

```{r, include = FALSE}
load("hhh4owa.RData")
```

```{r hhh4owa_pit, fig.width = 3, fig.height = 3, echo = -1}
par(mar = c(5,5,1,1), las = 1)
pit(hhh4owa, plot = list(ylab = "Density"))
```

```{r hhh4owa_caltest}
calibrationTest(hhh4owa, which = "dss")
## calibrationTest(hhh4owa, which = "logs")  # skipped for CRAN
```

```{r hhh4owa_scores}
hhh4owa_scores <- scores(hhh4owa, which = c("dss", "logs"), reverse = FALSE)
summary(hhh4owa_scores)
```

```{r hhh4owa_plot, echo = -1}
par(mar = c(5,5,1,1))
hhh4owa_quantiles <- quantile(hhh4owa, probs = 1:99/100)
osaplot(
    quantiles = hhh4owa_quantiles, probs = 1:99/100,
    observed = hhh4owa$observed, scores = hhh4owa_scores,
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)
```


## Long-term forecasts

### Example with the first test period

```{r hhh4sim1}
TEST1 <- TEST[[1]]
fit1 <- update(hhh4fit, subset.upper = TEST1[1]-1)
hhh4sim1 <- simulate(fit1, nsim = 1000, seed = 726, subset = TEST1,
                     y.start = observed(CHILI.sts)[TEST1[1]-1,])
```

We can use the plot method provided by **surveillance**:

```{r hhh4sim1_plots, R.options = list(scipen=1)}
par(mfrow = c(1,2))
plot(hhh4sim1, "fan", ylim = c(0,60000), xlab = "", ylab = "",
     xaxis = list(xaxis.tickFreq = list("%m"=atChange, "%Y"=atChange),
                  xaxis.labelFreq = list("%Y"=atMedian),
                  xaxis.labelFormat = "%Y"))
plot(hhh4sim1, "size", horizontal = FALSE, main = "size of the epidemic",
     ylab = "", observed = list(labels = NULL))
```

There is also an associated `scores` method:

```{r}
summary(scores(hhh4sim1, which = c("dss", "logs")))
```

Estimating the forecast distribution by the relative frequencies of the
simulated counts is problematic: the observed count will rarely coincide
exactly with one of the 1000 simulated counts, so the log-score would
typically be infinite. Instead, we can use kernel density estimation as
implemented in the package **scoringRules**, which the function
`scores_sample()` wraps:

```{r}
summary(scores_sample(x = observed(CHILI.sts)[TEST1], sims = drop(hhh4sim1)))
```

An even better approximation of the log-score at each time point can be
obtained from a mixture of the one-step-ahead negative binomial distributions
given the simulated counts from the previous time point.
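In the model used here, the one-week-ahead mean implied by a simulated path is
the autoregressive parameter times that path's count from the previous week
plus the endemic mean. As a rough illustration (not evaluated), the mixture
log-score at a single time point could be computed as in the following sketch,
where `lambda_t`, `nu_t`, `psi`, `y_t`, and `y_prev` are hypothetical
placeholders for the predicted epidemic and endemic means, the estimated
negative binomial size parameter, the observed count, and the vector of
simulated counts from the previous week:

```{r logs_mixture_sketch, eval = FALSE}
## conceptual sketch (not run): mixture log-score at one time point
mu_mix <- lambda_t * y_prev + nu_t   # one-step-ahead mean per simulated path
logs_t <- -log(mean(dnbinom(y_t, mu = mu_mix, size = psi)))
```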
Such mixture forecast distributions are available through the function
`dhhh4sims()`, which is used in `logs_hhh4sims()`:

```{r}
summary(logs_hhh4sims(sims = hhh4sim1, model = fit1))
```


### For all test periods

```{r hhh4sims}
hhh4sims <- lapply(TEST, function (testperiod) {
    t0 <- testperiod[1] - 1
    fit0 <- update(hhh4fit, subset.upper = t0)
    sims <- simulate(fit0, nsim = 1000, seed = t0, subset = testperiod,
                     y.start = observed(CHILI.sts)[t0,])
    list(testperiod = testperiod,
         observed = observed(CHILI.sts)[testperiod],
         fit0 = fit0, sims = sims)
})
```

PIT histograms, based on the pointwise ECDF of the simulated epidemic curves:

```{r hhh4sims_pit, echo = -1}
par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(hhh4sims))), las = 1)
invisible(lapply(hhh4sims, function (x) {
    pit(x = x$observed, pdistr = apply(x$sims, 1, ecdf),
        plot = list(main = format_period(x$testperiod, fmt = "%Y", collapse = "/"),
                    ylab = "Density"))
}))
```

Fan plots of the simulations for each test period, together with the pointwise
scores and their means:

```{r hhh4sims_plot, echo = -1, fig.show = "hold"}
par(mar = c(5,5,1,1))
t(sapply(hhh4sims, function (x) {
    quantiles <- t(apply(x$sims, 1, quantile, probs = 1:99/100))
    scores <- scores_sample(x$observed, drop(x$sims))
    ## improved estimate via mixture of one-step-ahead NegBin distributions
    ## NOTE: we skip this here for speed (for CRAN)
    ##scores <- cbind(scores, logs2 = logs_hhh4sims(sims = x$sims, model = x$fit0))
    osaplot(quantiles = quantiles, probs = 1:99/100,
            observed = x$observed, scores = scores,
            start = x$testperiod[1], xlab = "Week", ylim = c(0,60000),
            fan.args = list(ln = c(0.1,0.9), rlab = NULL))
    colMeans(scores)
}))
```
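For a single summary over all long-term forecast periods, the per-period mean
scores printed above could be averaged once more. A minimal sketch (not
evaluated), assuming the matrix returned by the previous chunk had been stored
in a hypothetical object `hhh4sims_scores`:

```{r hhh4sims_meanscores, eval = FALSE}
## sketch only (not run): overall mean DSS and log-score across test periods
colMeans(hhh4sims_scores)
```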