├── .gitattributes
├── nyt_vax_rate.png
├── .gitignore
├── model_plots
    ├── animated.gif
    ├── plot001.png
    ├── plot002.png
    ├── plot003.png
    ├── plot004.png
    ├── plot005.png
    ├── plot006.png
    ├── plot007.png
    ├── plot008.png
    ├── plot009.png
    └── plot010.png
├── vax_files
    └── figure-gfm
    │   ├── nyt_like-1.png
    │   ├── raw_data-1.png
    │   ├── diff_plot-1.png
    │   ├── vaxed_plot-1.png
    │   ├── log_diff_plot-1.png
    │   ├── model_breakdown-1.png
    │   ├── unnamed-chunk-2-1.png
    │   ├── unnamed-chunk-7-1.gif
    │   └── predictions_plot-1.png
├── vax.Rmd
└── vax.md


/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | 
3 | 


--------------------------------------------------------------------------------
/nyt_vax_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/nyt_vax_rate.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .Rhistory
3 | .RData
4 | *.Rproj*
5 | *_cache/
6 | .Rproj.user
7 | 


--------------------------------------------------------------------------------
/model_plots/animated.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/animated.gif


--------------------------------------------------------------------------------
/model_plots/plot001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot001.png


--------------------------------------------------------------------------------
/model_plots/plot002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot002.png


--------------------------------------------------------------------------------
/model_plots/plot003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot003.png


--------------------------------------------------------------------------------
/model_plots/plot004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot004.png


--------------------------------------------------------------------------------
/model_plots/plot005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot005.png


--------------------------------------------------------------------------------
/model_plots/plot006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot006.png


--------------------------------------------------------------------------------
/model_plots/plot007.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot007.png


--------------------------------------------------------------------------------
/model_plots/plot008.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot008.png


--------------------------------------------------------------------------------
/model_plots/plot009.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot009.png


--------------------------------------------------------------------------------
/model_plots/plot010.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot010.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/nyt_like-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/nyt_like-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/raw_data-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/raw_data-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/diff_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/diff_plot-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/vaxed_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/vaxed_plot-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/log_diff_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/log_diff_plot-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/model_breakdown-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/model_breakdown-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/vax_files/figure-gfm/unnamed-chunk-7-1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/unnamed-chunk-7-1.gif


--------------------------------------------------------------------------------
/vax_files/figure-gfm/predictions_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/predictions_plot-1.png


--------------------------------------------------------------------------------
/vax.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Vaccination rate in the US"
  3 | author: "Matthew Kay"
  4 | output: github_document
  5 | ---
  6 | 
  7 | ```{r chunk_options, include=FALSE}
  8 | knitr::opts_chunk$set(fig.retina = 2)
  9 | 
 10 | if (capabilities("cairo") && Sys.info()[['sysname']] != "Darwin") {
 11 |   knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))
 12 | }
 13 | ```
 14 | 
 15 | The [New York Time vaccine tracker](https://www.nytimes.com/interactive/2020/us/covid-19-vaccine-doses.html)
 16 | currently shows a predicted proportion of people vaccinated if vaccination continues
 17 | "at the current pace", using a linear projection. I thought it might be helpful to try to 
 18 | add some uncertainty to that line in a way that does not assume a linear trend.
 19 | 
 20 | ![NYT Vacciation rate](nyt_vax_rate.png)
 21 | 
 22 | ## Libraries needed for this analysis:
 23 | 
 24 | ```{r setup, message = FALSE, warning = FALSE}
 25 | library(lubridate)
 26 | library(ggdist)
 27 | library(tidybayes)
 28 | library(bsts)
 29 | library(patchwork)
 30 | library(tidyverse)
 31 | library(magrittr)
 32 | library(posterior)   # pak::pkg_install("stan-dev/posterior")
 33 | library(future)
 34 | library(furrr)
 35 | 
 36 | theme_set(theme_ggdist())
 37 | ```
 38 | 
 39 | ## The data
 40 | 
 41 | I downloaded the data on what share of the population has recieved at least one dose from
 42 | [ourworldindata.org](https://ourworldindata.org/coronavirus/country/united-states#what-share-of-the-population-has-received-at-least-one-dose-of-the-covid-19-vaccine):
 43 | 
 44 | ```{r fetch_data, eval = FALSE}
 45 | download.file(
 46 |   "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv", 
 47 |   "share-people-vaccinated-covid.csv"
 48 | )
 49 | ```
 50 | 
 51 | Filtering down just to the US:
 52 | 
 53 | ```{r df}
 54 | df = read.csv("share-people-vaccinated-covid.csv") %>%
 55 |   filter(location == "United States") %>%
 56 |   transmute(
 57 |     date = as_date(date),
 58 |     day = as.numeric(date - min(date)),
 59 |     vax = people_vaccinated_per_hundred / 100
 60 |   )
 61 | 
 62 | head(df)
 63 | ```
 64 | 
 65 | The data looks like this:
 66 | 
 67 | ```{r raw_data, fig.width = 7.11, fig.height = 4}
 68 | df %>%
 69 |   ggplot(aes(date, vax)) +
 70 |   geom_point(na.rm = TRUE)
 71 | ```
 72 | 
 73 | 
 74 | ## Data prep for time series model of daily changes
 75 | 
 76 | Let's fit a time series model to the data.
 77 | Analyzing the data on the raw scale is hard because (1) we
 78 | know that the percent of vaccinated people is bounded between 0 and 1 and (2)
 79 | we know that the percent of vaccinated people is always increasing. Combining these
 80 | two facts, we could instead look at the difference in the logit of the
 81 | percent of vaccinated people:
 82 | 
 83 | ```{r diff_plot, warning = FALSE, fig.width = 7.11, fig.height = 4}
 84 | df_diff = df %>%
 85 |   select(-date) %>%
 86 |   complete(day = min(day):max(day)) %>%
 87 |   mutate(date = min(df$date) + days(day))
 88 | 
 89 | df_diff %>%
 90 |   ggplot(aes(date, c(NA, diff(qlogis(vax))))) +
 91 |   geom_line() +
 92 |   geom_point() +
 93 |   labs(y = "daily increase in logit(Percent vaccinated)")
 94 | ```
 95 | 
 96 | Since we know that this difference must be positive (since the number of vaccinated
 97 | people cannot decrease), we might analyse this on a log scale. That
 98 | might also stabilize the variance:
 99 | 
100 | ```{r log_diff_plot, fig.width = 7.11, fig.height = 4}
101 | df_log_diff = df_diff %>%
102 |   mutate(log_diff_vax = c(NA, log(diff(qlogis(vax))))) %>%
103 |   slice(-1)
104 | 
105 | df_log_diff %>%
106 |   ggplot(aes(date, log_diff_vax)) +
107 |   geom_line() +
108 |   geom_point(na.rm = TRUE) +
109 |   labs(y = "log(daily increase in logit(Percent vaccinated))")
110 | ```
111 | 
112 | There's some missing data here, which we'll let the model impute (FWIW I also
113 | tried just using linear interpolation on the raw data prior to translating it
114 | into differences and got very similar results to the imputation approach).
115 | 
116 | ## Time series model
117 | 
118 | This seems a good point to fit a time series model. We'll fit a Bayesian Structural
119 | Time Series model with `bsts()`. We'll use a semi-local
120 | linear trend [intended for long-term forecasting](https://www.unofficialgoogledatascience.com/2017/07/fitting-bayesian-structural-time-series.html).
121 | There's a clear weekly trend in the numbers, which is to be expected given
122 | differences in vaccination rates on the weekend, so we'll also include a seasonal
123 | component with a 7-day period. Since we're on
124 | a log scale even changes by as much as 0.5 or 1 are large, we'll use some 
125 | tight-ish-looking priors here:
126 | 
127 | ```{r model}
128 | fit_model = function(max_date = max(df_log_diff$date)) {
129 |   with(filter(df_log_diff, date <= max_date), bsts(log_diff_vax, 
130 |     state.specification = list() %>%
131 |       AddSemilocalLinearTrend(log_diff_vax,
132 |         level.sigma.prior = SdPrior(0.5, 1),
133 |         slope.mean.prior = NormalPrior(0,0.5),
134 |         initial.level.prior = NormalPrior(0,0.5),
135 |         initial.slope.prior = NormalPrior(0,0.5),
136 |         slope.sigma.prior = SdPrior(0.5, 1),
137 |         slope.ar1.prior = Ar1CoefficientPrior(0, 0.5)
138 |       ) %>%
139 |       AddSeasonal(log_diff_vax, 7, 
140 |         sigma.prior = SdPrior(0.5, 1)
141 |       ),
142 |     prior = SdPrior(0.5, 1),
143 |     niter = 40000,
144 |     seed = 4272021 # for reproducibility
145 |   ))
146 | }
147 | 
148 | m = fit_model()
149 | ```
150 | 
151 | Some diagnostics: 
152 | 
153 | ```{r}
154 | draws = as_draws(do.call(cbind, m[startsWith(names(m), "trend") | startsWith(names(m), "sigma")]))
155 | summary(draws, median, mad, quantile2, default_convergence_measures())
156 | ```
157 | 
158 | The $\hat{R}$s and effective sample sizes look reasonable.
159 | 
160 | ```{r}
161 | bayesplot::mcmc_trace(draws)
162 | ```
163 | 
164 | Trace plots also look reasonable.
165 | 
166 | ## Predictions
167 | 
168 | Now we'll generate fits and predictions for a 180-day forecast:
169 | 
170 | ```{r predictions}
171 | forecast_days = 180
172 | 
173 | calc_fits = function(model = m, max_date = max(df_diff$date)) {
174 |   df_log_diff %>%
175 |     filter(date <= max_date) %>%
176 |     mutate(.value = rvar(colSums(aperm(model$state.contributions, c(2, 1, 3)))))
177 | }
178 | fits_all = calc_fits()
179 | 
180 | calc_pred_diff = function(model = m, max_date = max(df_diff$date)) {
181 |   tibble(date = max_date + days(1:forecast_days)) %>%
182 |     mutate(log_diff_vax = rvar(predict(model, horizon = forecast_days)$distribution))
183 | }
184 | pred_diff_all = calc_pred_diff()
185 | ```
186 | 
187 | Predictions from the model look like this (with imputed missing data in gray):
188 | 
189 | ```{r predictions_plot, fig.width = 7.11, fig.height = 4}
190 | x_dates = seq(make_date(2021, 1, 1), max(pred_diff_all$date), by = "month")
191 | x_scale = function(...) list(
192 |   scale_x_date(
193 |     limits = range(df_diff$date, pred_diff_all$date),
194 |     breaks = x_dates,
195 |     labels = months(x_dates, abbreviate = TRUE)
196 |   ),
197 |   coord_cartesian(expand = FALSE, ...) 
198 | )
199 | 
200 | summer_line = geom_vline(xintercept = make_date(2021, 9, 22), alpha = 0.25)
201 | 
202 | widths = c(.5, .8, .95)
203 | 
204 | plot_pred_diff = function(fits = fits_all, pred_diff = pred_diff_all) {
205 |   pred_diff %>%
206 |     ggplot(aes(date)) +
207 |     stat_dist_lineribbon(aes(dist = .value, fill_ramp = stat(level)), data = fits, fill = "gray75", color = "gray65") +
208 |     stat_dist_lineribbon(aes(dist = log_diff_vax), .width = widths, color = "#08519c") +
209 |     geom_line(aes(y = log_diff_vax), data = df_log_diff, size = 1) +
210 |     geom_vline(xintercept = min(pred_diff$date) - days(1), color = "gray75") +
211 |     # summer_line +
212 |     scale_fill_brewer() +
213 |     theme_ggdist() +
214 |     x_scale(ylim = c(-20, 5)) +
215 |     labs(
216 |       subtitle = "log(daily increase in logit(Percent vaccinated))",
217 |       y = NULL,
218 |       x = NULL
219 |     )  
220 | }
221 | plot_pred_diff()
222 | ```
223 | 
224 | There is quite a bit of uncertainty here, especially for far-out forecasts.
225 | 
226 | We can translate these predictions of differences into predictions of percent
227 | vaccinated by inverting the log, cumulatively summing differences in log odds,
228 | then inverting the logit transformation:
229 | 
230 | ```{r pred_vax}
231 | # need a logit and inverse logit functions in terms of elementary operations (plogis
232 | # does not work on rvars directly)
233 | logit = function(p) {log(p) - log1p(-p)}
234 | inv_logit = function(x) {1/(1 + exp(-x))}
235 | 
236 | calc_pred_vax = function(pred_diff = pred_diff_all) {
237 |   last_observed_vax = df_diff %>%
238 |     filter(date == min(pred_diff$date) - days(1)) %$%
239 |     vax
240 |   
241 |   pred_diff %>%
242 |     mutate(
243 |       logit_vax = cumsum(c(
244 |         logit(last_observed_vax) + exp(log_diff_vax[[1]]),
245 |         exp(log_diff_vax[-1])
246 |       )),
247 |       vax = inv_logit(logit_vax)
248 |     )
249 | }
250 | pred_vax_all = calc_pred_vax()
251 | ```
252 | 
253 | Now we can plot the latent model alongside predictions of vaccination rate and
254 | the predicted probability that the proportion of vaccinated people is above
255 | some threshold (say, 70%):
256 | 
257 | ```{r model_breakdown, fig.width = 5, fig.height = 7}
258 | plot_vaxed = function(pred_vax = pred_vax_all) {
259 |   pred_vax %>%
260 |     ggplot(aes(date)) +
261 |     stat_dist_lineribbon(aes(dist = vax), .width = widths, color = "#08519c") +
262 |     scale_fill_brewer() +
263 |     geom_line(aes(y = vax), data = df_diff, size = 1) +
264 |     geom_hline(yintercept = .7, alpha = 0.25) +
265 |     geom_vline(xintercept = min(pred_vax$date) - days(1), color = "gray75") +
266 |     # summer_line +
267 |     scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
268 |     x_scale() +
269 |     theme_ggdist() +
270 |     annotate("text", x = make_date(2020, 12, 28), y = 0.72, label = "70%", hjust = 0, vjust = 0, size = 3.25) +
271 |     # annotate("text", x = make_date(2021, 9, 19), y = 0.25, hjust = 1, vjust = 0.5, label = "Sept 22\n End of summer", size = 3.25, lineheight = 1.05) +
272 |     labs(
273 |       subtitle = "Percent vaccinated (at least one dose)",
274 |       y = NULL,
275 |       x = NULL
276 |     )
277 | }
278 | 
279 | vaxed = plot_vaxed()
280 | 
281 | diffs = plot_pred_diff()
282 | 
283 | pred_vax_all$prob_vax_gt_70 = Pr(pred_vax_all$vax > .70)
284 | 
285 | prob = pred_vax_all %>%
286 |   ggplot(aes(date, prob_vax_gt_70)) +
287 |   geom_line(color = "#08519c", size = 1) +
288 |   theme_ggdist() +
289 |   geom_hline(yintercept = 0.5, alpha = 0.25) +
290 |   # summer_line +
291 |   scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
292 |   x_scale() +
293 |   labs(
294 |     subtitle = "Pr(Percent vaccinated > 70%)",
295 |     y = NULL,
296 |     x = NULL
297 |   )
298 | 
299 | vaxed / diffs / prob
300 | ```
301 | 
302 | This gives us a final chart like this:
303 | 
304 | ```{r vaxed_plot, warning = FALSE, fig.width = 7.11, fig.height = 4}
305 | vaxed +
306 |   stat_smooth(aes(y = vax), formula = y ~ x, method = lm, se = FALSE,
307 |     data = df %>% filter(row_number() > n() - 8), 
308 |     fullrange = TRUE, color = scales::alpha("white", 0.5), size = 1
309 |   ) +
310 |   annotate("text", 
311 |     label = "white line=\nlinear model  ", 
312 |     fontface = "bold", x = make_date(2021, 9, 12), y = .89, color = "black", hjust = 1, size = 3.25, lineheight = 1.05
313 |   ) +
314 |   labs(
315 |     subtitle = "Forecasted % US with at least one dose, from time series model of daily increase in log odds"
316 |   )
317 | ```
318 | 
319 | Anyway, my conclusion from all of this is essentially that there is *a lot* of
320 | uncertainty in what the vaccination rate will be, at least if we just look at the raw numbers,
321 | and doubtless the model I've shown here is way oversimplified --- but I have 
322 | some trepidation about looking at even simpler models (like linear projections) 
323 | and ignoring their uncertainty, as this is probably going to be at least a little misleading.
324 | 
325 | 
326 | ## Model checking
327 | 
328 | For comparison purposes, let's fit the model on data up to a few different
329 | dates and see how it did on the data we've already observed.
330 | 
331 | ```{r eval = FALSE}
332 | plan(multisession)
333 | 
334 | models = tibble(
335 |   upto = seq(as.Date("2021-03-01"), max(df$date), by = 15),
336 |   model = future_map(upto, fit_model)
337 | ) 
338 | ```
339 | 
340 | Then build the charts for each model:
341 | 
342 | ```{r eval = FALSE}
343 | plots = map2(models$model, models$upto, function(m, max_date) {
344 |   fits = calc_fits(m, max_date)
345 |   pred_diff = calc_pred_diff(m, max_date)
346 |   pred_vax = calc_pred_vax(pred_diff)
347 |   list(
348 |     vaxed = plot_vaxed(pred_vax),
349 |     diffs = plot_pred_diff(fits, pred_diff)
350 |   )
351 | })
352 | 
353 | plots[[length(plots) + 1]] = list(
354 |   vaxed = vaxed,
355 |   diffs = diffs
356 | )
357 | ```
358 | 
359 | Save each chart to disk:
360 | 
361 | ```{r eval = FALSE}
362 | png_path <- file.path("model_plots", "plot%03d.png")
363 | png(png_path, type = "cairo", width = 1000, height = 1000, res = 200)
364 | for (p in plots) {
365 |   print(p$vaxed / p$diffs)
366 | }
367 | dev.off()
368 | ```
369 | 
370 | And put them together into an animation:
371 | 
372 | ```{r eval = FALSE}
373 | png_files <- sprintf(png_path, seq_along(plots))
374 | gifski::gifski(png_files, "model_plots/animated.gif", width = 1000, height = 1000)
375 | ```
376 | 
377 | 
378 | ```{r}
379 | gganimate::gif_file("model_plots/animated.gif")
380 | ```
381 | 
382 | 


--------------------------------------------------------------------------------
/vax.md:
--------------------------------------------------------------------------------
  1 | Vaccination rate in the US
  2 | ================
  3 | Matthew Kay
  4 | 
  5 | The [New York Time vaccine
  6 | tracker](https://www.nytimes.com/interactive/2020/us/covid-19-vaccine-doses.html)
  7 | currently shows a predicted proportion of people vaccinated if
  8 | vaccination continues “at the current pace”, using a linear projection.
  9 | I thought it might be helpful to try to add some uncertainty to that
 10 | line in a way that does not assume a linear trend.
 11 | 
 12 | ![NYT Vacciation rate](nyt_vax_rate.png)
 13 | 
 14 | ## Libraries needed for this analysis:
 15 | 
 16 | ``` r
 17 | library(lubridate)
 18 | library(ggdist)
 19 | library(tidybayes)
 20 | library(bsts)
 21 | library(patchwork)
 22 | library(tidyverse)
 23 | library(magrittr)
 24 | library(posterior)   # pak::pkg_install("stan-dev/posterior")
 25 | library(future)
 26 | library(furrr)
 27 | 
 28 | theme_set(theme_ggdist())
 29 | ```
 30 | 
 31 | ## The data
 32 | 
 33 | I downloaded the data on what share of the population has recieved at
 34 | least one dose from
 35 | [ourworldindata.org](https://ourworldindata.org/coronavirus/country/united-states#what-share-of-the-population-has-received-at-least-one-dose-of-the-covid-19-vaccine):
 36 | 
 37 | ``` r
 38 | download.file(
 39 |   "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv", 
 40 |   "share-people-vaccinated-covid.csv"
 41 | )
 42 | ```
 43 | 
 44 | Filtering down just to the US:
 45 | 
 46 | ``` r
 47 | df = read.csv("share-people-vaccinated-covid.csv") %>%
 48 |   filter(location == "United States") %>%
 49 |   transmute(
 50 |     date = as_date(date),
 51 |     day = as.numeric(date - min(date)),
 52 |     vax = people_vaccinated_per_hundred / 100
 53 |   )
 54 | 
 55 | head(df)
 56 | ```
 57 | 
 58 |     ##         date day    vax
 59 |     ## 1 2020-12-20   0 0.0017
 60 |     ## 2 2020-12-21   1 0.0018
 61 |     ## 3 2020-12-22   2     NA
 62 |     ## 4 2020-12-23   3 0.0030
 63 |     ## 5 2020-12-24   4     NA
 64 |     ## 6 2020-12-25   5     NA
 65 | 
 66 | The data looks like this:
 67 | 
 68 | ``` r
 69 | df %>%
 70 |   ggplot(aes(date, vax)) +
 71 |   geom_point(na.rm = TRUE)
 72 | ```
 73 | 
 74 | <img src="vax_files/figure-gfm/raw_data-1.png" width="682.56" />
 75 | 
 76 | ## NYT-like linear model
 77 | 
 78 | First off, we can approximate what NYT is doing using linear regression
 79 | based on the last couple of points (this doesn’t line up exactly with
 80 | their model but it seems close):
 81 | 
 82 | ``` r
 83 | df %>%
 84 |   ggplot(aes(date, vax)) +
 85 |   geom_point(na.rm = TRUE) +
 86 |   stat_smooth(formula = y ~ x, method = lm, se = FALSE, data = . %>% filter(row_number() > n() - 8), fullrange = TRUE) +
 87 |   scale_x_date(limits = c(min(df$date), make_date(2021, 10, 31))) +
 88 |   coord_cartesian(ylim = c(0, 1)) +
 89 |   geom_vline(xintercept = make_date(2021, 10, 12), alpha = 0.2) +
 90 |   geom_hline(yintercept = 0.9, alpha = 0.2) +
 91 |   annotate("text", x = make_date(2021, 5, 28), y = 0.92, label = "90%", hjust = 0, vjust = 0, size = 3.5) +
 92 |   annotate("text", x = make_date(2021, 10, 8), y = 0.5, hjust = 1, vjust = 0.5, label = "Oct 12", size = 3.5) 
 93 | ```
 94 | 
 95 | <img src="vax_files/figure-gfm/nyt_like-1.png" width="682.56" />
 96 | 
 97 | However, instead of doing this I am going to fit a time series model to
 98 | the data.
 99 | 
100 | ## Data prep for time series model of daily changes
101 | 
102 | Analyzing the data on the raw scale is hard because (1) we know that the
103 | percent of vaccinated people is bounded between 0 and 1 and (2) we know
104 | that the percent of vaccinated people is always increasing. Combining
105 | these two facts, we could instead look at the difference in the logit of
106 | the percent of vaccinated people:
107 | 
108 | ``` r
109 | df_diff = df %>%
110 |   select(-date) %>%
111 |   complete(day = min(day):max(day)) %>%
112 |   mutate(date = min(df$date) + days(day))
113 | 
114 | df_diff %>%
115 |   ggplot(aes(date, c(NA, diff(qlogis(vax))))) +
116 |   geom_line() +
117 |   geom_point() +
118 |   labs(y = "daily increase in logit(Percent vaccinated)")
119 | ```
120 | 
121 | <img src="vax_files/figure-gfm/diff_plot-1.png" width="682.56" />
122 | 
123 | Since we know that this difference must be positive (since the number of
124 | vaccinated people cannot decrease), we might analyse this on a log
125 | scale. That might also stabilize the variance:
126 | 
127 | ``` r
128 | df_log_diff = df_diff %>%
129 |   mutate(log_diff_vax = c(NA, log(diff(qlogis(vax))))) %>%
130 |   slice(-1)
131 | 
132 | df_log_diff %>%
133 |   ggplot(aes(date, log_diff_vax)) +
134 |   geom_line() +
135 |   geom_point(na.rm = TRUE) +
136 |   labs(y = "log(daily increase in logit(Percent vaccinated))")
137 | ```
138 | 
139 | <img src="vax_files/figure-gfm/log_diff_plot-1.png" width="682.56" />
140 | 
141 | There’s some missing data here, which we’ll let the model impute (FWIW I
142 | also tried just using linear interpolation on the raw data prior to
143 | translating it into differences and got very similar results to the
144 | imputation approach).
145 | 
146 | ## Time series model
147 | 
148 | This seems a good point to fit a time series model. We’ll fit a Bayesian
149 | Structural Time Series model with `bsts()`. We’ll use a semi-local
150 | linear trend [intended for long-term
151 | forecasting](https://www.unofficialgoogledatascience.com/2017/07/fitting-bayesian-structural-time-series.html).
152 | There’s a clear weekly trend in the numbers, which is to be expected
153 | given differences in vaccination rates on the weekend, so we’ll also
154 | include a seasonal component with a 7-day period. Since we’re on a log
155 | scale even changes by as much as 0.5 or 1 are large, we’ll use some
156 | tight-ish-looking priors here:
157 | 
158 | ``` r
159 | fit_model = function(max_date = max(df_log_diff$date)) {
160 |   with(filter(df_log_diff, date <= max_date), bsts(log_diff_vax, 
161 |     state.specification = list() %>%
162 |       AddSemilocalLinearTrend(log_diff_vax,
163 |         level.sigma.prior = SdPrior(0.5, 1),
164 |         slope.mean.prior = NormalPrior(0,0.5),
165 |         initial.level.prior = NormalPrior(0,0.5),
166 |         initial.slope.prior = NormalPrior(0,0.5),
167 |         slope.sigma.prior = SdPrior(0.5, 1),
168 |         slope.ar1.prior = Ar1CoefficientPrior(0, 0.5)
169 |       ) %>%
170 |       AddSeasonal(log_diff_vax, 7, 
171 |         sigma.prior = SdPrior(0.5, 1)
172 |       ),
173 |     prior = SdPrior(0.5, 1),
174 |     niter = 40000,
175 |     seed = 4272021 # for reproducibility
176 |   ))
177 | }
178 | 
179 | m = fit_model()
180 | ```
181 | 
182 |     ## =-=-=-=-= Iteration 0 Thu May 27 01:39:04 2021
183 |     ##  =-=-=-=-=
184 |     ## =-=-=-=-= Iteration 4000 Thu May 27 01:39:10 2021
185 |     ##  =-=-=-=-=
186 |     ## =-=-=-=-= Iteration 8000 Thu May 27 01:39:16 2021
187 |     ##  =-=-=-=-=
188 |     ## =-=-=-=-= Iteration 12000 Thu May 27 01:39:22 2021
189 |     ##  =-=-=-=-=
190 |     ## =-=-=-=-= Iteration 16000 Thu May 27 01:39:28 2021
191 |     ##  =-=-=-=-=
192 |     ## =-=-=-=-= Iteration 20000 Thu May 27 01:39:35 2021
193 |     ##  =-=-=-=-=
194 |     ## =-=-=-=-= Iteration 24000 Thu May 27 01:39:41 2021
195 |     ##  =-=-=-=-=
196 |     ## =-=-=-=-= Iteration 28000 Thu May 27 01:39:48 2021
197 |     ##  =-=-=-=-=
198 |     ## =-=-=-=-= Iteration 32000 Thu May 27 01:39:54 2021
199 |     ##  =-=-=-=-=
200 |     ## =-=-=-=-= Iteration 36000 Thu May 27 01:39:59 2021
201 |     ##  =-=-=-=-=
202 | 
203 | Some diagnostics:
204 | 
205 | ``` r
206 | draws = as_draws(do.call(cbind, m[startsWith(names(m), "trend") | startsWith(names(m), "sigma")]))
207 | summary(draws, median, mad, quantile2, default_convergence_measures())
208 | ```
209 | 
210 |     ## # A tibble: 6 x 8
211 |     ##   variable                median    mad      q5      q95  rhat ess_bulk ess_tail
212 |     ##   <chr>                    <dbl>  <dbl>   <dbl>    <dbl> <dbl>    <dbl>    <dbl>
213 |     ## 1 sigma.obs               0.125  0.0160  0.102   0.155    1.00    4805.   10768.
214 |     ## 2 trend.level.sd          0.122  0.0158  0.0992  0.151    1.00    4052.    8332.
215 |     ## 3 trend.slope.mean       -0.0180 0.0133 -0.0400  0.00392  1.00   10531.   19778.
216 |     ## 4 trend.slope.ar.coeffi~ -0.379  0.181  -0.643  -0.0669   1.00    3913.    8190.
217 |     ## 5 trend.slope.sd          0.146  0.0196  0.118   0.183    1.00    4881.   10225.
218 |     ## 6 sigma.seasonal.7        0.0979 0.0109  0.0819  0.119    1.00    5081.   10211.
219 | 
220 | The *R̂*s and effective sample sizes look reasonable.
221 | 
222 | ``` r
223 | bayesplot::mcmc_trace(draws)
224 | ```
225 | 
226 | <img src="vax_files/figure-gfm/unnamed-chunk-2-1.png" width="672" />
227 | 
228 | Trace plots also look reasonable.
229 | 
230 | ## Predictions
231 | 
232 | Now we’ll generate fits and predictions for a 180-day forecast:
233 | 
234 | ``` r
235 | forecast_days = 180
236 | 
237 | calc_fits = function(model = m, max_date = max(df_diff$date)) {
238 |   df_log_diff %>%
239 |     filter(date <= max_date) %>%
240 |     add_draws(colSums(aperm(model$state.contributions, c(2, 1, 3))))
241 | }
242 | fits_all = calc_fits()
243 | 
244 | calc_pred_diff = function(model = m, max_date = max(df_diff$date)) {
245 |   tibble(date = max_date + days(1:forecast_days)) %>%
246 |     add_draws(predict(model, horizon = forecast_days)$distribution, value = "log_diff_vax")
247 | }
248 | pred_diff_all = calc_pred_diff()
249 | ```
250 | 
251 | Predictions from the model look like this (with imputed missing data in
252 | gray):
253 | 
254 | ``` r
255 | x_dates = seq(make_date(2021, 1, 1), max(pred_diff_all$date), by = "month")
256 | x_scale = function(...) list(
257 |   scale_x_date(
258 |     limits = range(df_diff$date, pred_diff_all$date),
259 |     breaks = x_dates,
260 |     labels = months(x_dates, abbreviate = TRUE)
261 |   ),
262 |   coord_cartesian(expand = FALSE, ...) 
263 | )
264 | 
265 | summer_line = geom_vline(xintercept = make_date(2021, 9, 22), alpha = 0.25)
266 | 
267 | widths = c(.5, .8, .95)
268 | 
269 | plot_pred_diff = function(fits = fits_all, pred_diff = pred_diff_all) {
270 |   pred_diff %>%
271 |     ggplot(aes(date, log_diff_vax)) +
272 |     stat_lineribbon(aes(y = .value, fill_ramp = stat(level)), data = fits, fill = "gray75", color = "gray65") +
273 |     stat_lineribbon(.width = widths, color = "#08519c") +
274 |     geom_line(data = df_log_diff, size = 1) +
275 |     geom_vline(xintercept = min(pred_diff$date) - days(1), color = "gray75") +
276 |     # summer_line +
277 |     scale_fill_brewer() +
278 |     theme_ggdist() +
279 |     x_scale(ylim = c(-20, 5)) +
280 |     labs(
281 |       subtitle = "log(daily increase in logit(Percent vaccinated))",
282 |       y = NULL,
283 |       x = NULL
284 |     )  
285 | }
286 | plot_pred_diff()
287 | ```
288 | 
289 | <img src="vax_files/figure-gfm/predictions_plot-1.png" width="682.56" />
290 | 
291 | There is quite a bit of uncertainty here, especially for far-out
292 | forecasts.
293 | 
294 | We can translate these predictions of differences into predictions of
295 | percent vaccinated by inverting the log, cumulatively summing
296 | differences in log odds, then inverting the logit transformation:
297 | 
298 | ``` r
299 | calc_pred_vax = function(pred_diff = pred_diff_all) {
300 |   last_observed_vax = df_diff %>%
301 |     filter(date == min(pred_diff$date) - days(1)) %$%
302 |     vax
303 |   
304 |   pred_diff %>%
305 |     group_by(.draw) %>%
306 |     mutate(
307 |       vax = plogis(cumsum(c(
308 |         qlogis(last_observed_vax) + exp(log_diff_vax[[1]]),
309 |         exp(log_diff_vax[-1])
310 |       )))
311 |     )
312 | }
313 | pred_vax_all = calc_pred_vax()
314 | ```
315 | 
316 | Now we can plot the latent model alongside predictions of vaccination
317 | rate and the predicted probability that the proportion of vaccinated
318 | people is above some threshold (say, 70%):
319 | 
320 | ``` r
321 | plot_vaxed = function(pred_vax = pred_vax_all) {
322 |   pred_vax %>%
323 |     ggplot(aes(date, vax)) +
324 |     stat_lineribbon(.width = widths, color = "#08519c") +
325 |     scale_fill_brewer() +
326 |     geom_line(data = df_diff, size = 1) +
327 |     geom_hline(yintercept = .7, alpha = 0.25) +
328 |     geom_vline(xintercept = min(pred_vax$date) - days(1), color = "gray75") +
329 |     # summer_line +
330 |     scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
331 |     x_scale() +
332 |     theme_ggdist() +
333 |     annotate("text", x = make_date(2020, 12, 28), y = 0.72, label = "70%", hjust = 0, vjust = 0, size = 3.25) +
334 |     # annotate("text", x = make_date(2021, 9, 19), y = 0.25, hjust = 1, vjust = 0.5, label = "Sept 22\n End of summer", size = 3.25, lineheight = 1.05) +
335 |     labs(
336 |       subtitle = "Percent vaccinated (at least one dose)",
337 |       y = NULL,
338 |       x = NULL
339 |     )
340 | }
341 | 
342 | vaxed = plot_vaxed()
343 | 
344 | diffs = plot_pred_diff()
345 | 
346 | prob = pred_vax_all %>%
347 |   group_by(date) %>%
348 |   summarise(prob_vax_gt_70 = mean(vax > .70)) %>%
349 |   ggplot(aes(date, prob_vax_gt_70)) +
350 |   geom_line(color = "#08519c", size = 1) +
351 |   theme_ggdist() +
352 |   geom_hline(yintercept = 0.5, alpha = 0.25) +
353 |   # summer_line +
354 |   scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
355 |   x_scale() +
356 |   labs(
357 |     subtitle = "Pr(Percent vaccinated > 70%)",
358 |     y = NULL,
359 |     x = NULL
360 |   )
361 | 
362 | vaxed / diffs / prob
363 | ```
364 | 
365 | <img src="vax_files/figure-gfm/model_breakdown-1.png" width="480" />
366 | 
367 | This gives us a final chart like this:
368 | 
369 | ``` r
370 | vaxed +
371 |   stat_smooth(formula = y ~ x, method = lm, se = FALSE,
372 |     data = df %>% filter(row_number() > n() - 8), 
373 |     fullrange = TRUE, color = scales::alpha("white", 0.5), size = 1
374 |   ) +
375 |   annotate("text", 
376 |     label = "white line=\nlinear model  ", 
377 |     fontface = "bold", x = make_date(2021, 9, 12), y = .89, color = "black", hjust = 1, size = 3.25, lineheight = 1.05
378 |   ) +
379 |   labs(
380 |     subtitle = "Forecasted % US with at least one dose, from time series model of daily increase in log odds"
381 |   )
382 | ```
383 | 
384 | <img src="vax_files/figure-gfm/vaxed_plot-1.png" width="682.56" />
385 | 
386 | Anyway, my conclusion from all of this is essentially that there is *a
387 | lot* of uncertainty in what the vaccination rate will be, at least if we
388 | just look at the raw numbers, and doubtless the model I’ve shown here is
389 | way oversimplified — but I have some trepidation about looking at even
390 | simpler models (like linear projections) and ignoring their uncertainty,
391 | as this is probably going to be at least a little misleading.
392 | 
393 | ## Model checking
394 | 
395 | For comparison purposes, let’s fit the model on data up to a few
396 | different dates and see how it did on the data we’ve already observed.
397 | 
398 | ``` r
399 | plan(multisession)
400 | 
401 | models = tibble(
402 |   upto = seq(as.Date("2021-03-01"), max(df$date), by = 15),
403 |   model = future_map(upto, fit_model)
404 | ) 
405 | ```
406 | 
407 | Then build the charts for each model:
408 | 
409 | ``` r
410 | plots = map2(models$model, models$upto, function(m, max_date) {
411 |   fits = calc_fits(m, max_date)
412 |   pred_diff = calc_pred_diff(m, max_date)
413 |   pred_vax = calc_pred_vax(pred_diff)
414 |   list(
415 |     vaxed = plot_vaxed(pred_vax),
416 |     diffs = plot_pred_diff(fits, pred_diff)
417 |   )
418 | })
419 | 
420 | plots[[length(plots) + 1]] = list(
421 |   vaxed = vaxed,
422 |   diffs = diffs
423 | )
424 | ```
425 | 
426 | And put them together into an animation:
427 | 
428 | ``` r
429 | png_path <- file.path("model_plots", "plot%03d.png")
430 | png(png_path, type = "cairo", width = 1000, height = 1000, res = 200)
431 | for (p in plots) {
432 |   print(p$vaxed / p$diffs)
433 | }
434 | dev.off()
435 | ```
436 | 
437 | ``` r
438 | png_files <- sprintf(png_path, seq_along(plots))
439 | gifski::gifski(png_files, "model_plots/animated.gif", width = 1000, height = 1000)
440 | ```
441 | 
442 | ``` r
443 | gganimate::gif_file("model_plots/animated.gif")
444 | ```
445 | 
446 | ![](vax_files/figure-gfm/unnamed-chunk-7-1.gif)<!-- -->
447 | 


--------------------------------------------------------------------------------