├── .gitattributes
├── nyt_vax_rate.png
├── .gitignore
├── model_plots
├── animated.gif
├── plot001.png
├── plot002.png
├── plot003.png
├── plot004.png
├── plot005.png
├── plot006.png
├── plot007.png
├── plot008.png
├── plot009.png
└── plot010.png
├── vax_files
└── figure-gfm
│ ├── nyt_like-1.png
│ ├── raw_data-1.png
│ ├── diff_plot-1.png
│ ├── vaxed_plot-1.png
│ ├── log_diff_plot-1.png
│ ├── model_breakdown-1.png
│ ├── unnamed-chunk-2-1.png
│ ├── unnamed-chunk-7-1.gif
│ └── predictions_plot-1.png
├── vax.Rmd
└── vax.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 |
3 |
--------------------------------------------------------------------------------
/nyt_vax_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/nyt_vax_rate.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .Rhistory
3 | .RData
4 | *.Rproj*
5 | *_cache/
6 | .Rproj.user
7 |
--------------------------------------------------------------------------------
/model_plots/animated.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/animated.gif
--------------------------------------------------------------------------------
/model_plots/plot001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot001.png
--------------------------------------------------------------------------------
/model_plots/plot002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot002.png
--------------------------------------------------------------------------------
/model_plots/plot003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot003.png
--------------------------------------------------------------------------------
/model_plots/plot004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot004.png
--------------------------------------------------------------------------------
/model_plots/plot005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot005.png
--------------------------------------------------------------------------------
/model_plots/plot006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot006.png
--------------------------------------------------------------------------------
/model_plots/plot007.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot007.png
--------------------------------------------------------------------------------
/model_plots/plot008.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot008.png
--------------------------------------------------------------------------------
/model_plots/plot009.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot009.png
--------------------------------------------------------------------------------
/model_plots/plot010.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/model_plots/plot010.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/nyt_like-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/nyt_like-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/raw_data-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/raw_data-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/diff_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/diff_plot-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/vaxed_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/vaxed_plot-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/log_diff_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/log_diff_plot-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/model_breakdown-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/model_breakdown-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/vax_files/figure-gfm/unnamed-chunk-7-1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/unnamed-chunk-7-1.gif
--------------------------------------------------------------------------------
/vax_files/figure-gfm/predictions_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjskay/vax/master/vax_files/figure-gfm/predictions_plot-1.png
--------------------------------------------------------------------------------
/vax.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Vaccination rate in the US"
3 | author: "Matthew Kay"
4 | output: github_document
5 | ---
6 |
7 | ```{r chunk_options, include=FALSE}
8 | knitr::opts_chunk$set(fig.retina = 2)
9 |
10 | if (capabilities("cairo") && Sys.info()[['sysname']] != "Darwin") {
11 | knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))
12 | }
13 | ```
14 |
15 | The [New York Time vaccine tracker](https://www.nytimes.com/interactive/2020/us/covid-19-vaccine-doses.html)
16 | currently shows a predicted proportion of people vaccinated if vaccination continues
17 | "at the current pace", using a linear projection. I thought it might be helpful to try to
18 | add some uncertainty to that line in a way that does not assume a linear trend.
19 |
20 | 
21 |
22 | ## Libraries needed for this analysis:
23 |
24 | ```{r setup, message = FALSE, warning = FALSE}
25 | library(lubridate)
26 | library(ggdist)
27 | library(tidybayes)
28 | library(bsts)
29 | library(patchwork)
30 | library(tidyverse)
31 | library(magrittr)
32 | library(posterior) # pak::pkg_install("stan-dev/posterior")
33 | library(future)
34 | library(furrr)
35 |
36 | theme_set(theme_ggdist())
37 | ```
38 |
39 | ## The data
40 |
41 | I downloaded the data on what share of the population has recieved at least one dose from
42 | [ourworldindata.org](https://ourworldindata.org/coronavirus/country/united-states#what-share-of-the-population-has-received-at-least-one-dose-of-the-covid-19-vaccine):
43 |
44 | ```{r fetch_data, eval = FALSE}
45 | download.file(
46 | "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv",
47 | "share-people-vaccinated-covid.csv"
48 | )
49 | ```
50 |
51 | Filtering down just to the US:
52 |
53 | ```{r df}
54 | df = read.csv("share-people-vaccinated-covid.csv") %>%
55 | filter(location == "United States") %>%
56 | transmute(
57 | date = as_date(date),
58 | day = as.numeric(date - min(date)),
59 | vax = people_vaccinated_per_hundred / 100
60 | )
61 |
62 | head(df)
63 | ```
64 |
65 | The data looks like this:
66 |
67 | ```{r raw_data, fig.width = 7.11, fig.height = 4}
68 | df %>%
69 | ggplot(aes(date, vax)) +
70 | geom_point(na.rm = TRUE)
71 | ```
72 |
73 |
74 | ## Data prep for time series model of daily changes
75 |
76 | Let's fit a time series model to the data.
77 | Analyzing the data on the raw scale is hard because (1) we
78 | know that the percent of vaccinated people is bounded between 0 and 1 and (2)
79 | we know that the percent of vaccinated people is always increasing. Combining these
80 | two facts, we could instead look at the difference in the logit of the
81 | percent of vaccinated people:
82 |
83 | ```{r diff_plot, warning = FALSE, fig.width = 7.11, fig.height = 4}
84 | df_diff = df %>%
85 | select(-date) %>%
86 | complete(day = min(day):max(day)) %>%
87 | mutate(date = min(df$date) + days(day))
88 |
89 | df_diff %>%
90 | ggplot(aes(date, c(NA, diff(qlogis(vax))))) +
91 | geom_line() +
92 | geom_point() +
93 | labs(y = "daily increase in logit(Percent vaccinated)")
94 | ```
95 |
96 | Since we know that this difference must be positive (since the number of vaccinated
97 | people cannot decrease), we might analyse this on a log scale. That
98 | might also stabilize the variance:
99 |
100 | ```{r log_diff_plot, fig.width = 7.11, fig.height = 4}
101 | df_log_diff = df_diff %>%
102 | mutate(log_diff_vax = c(NA, log(diff(qlogis(vax))))) %>%
103 | slice(-1)
104 |
105 | df_log_diff %>%
106 | ggplot(aes(date, log_diff_vax)) +
107 | geom_line() +
108 | geom_point(na.rm = TRUE) +
109 | labs(y = "log(daily increase in logit(Percent vaccinated))")
110 | ```
111 |
112 | There's some missing data here, which we'll let the model impute (FWIW I also
113 | tried just using linear interpolation on the raw data prior to translating it
114 | into differences and got very similar results to the imputation approach).
115 |
116 | ## Time series model
117 |
118 | This seems a good point to fit a time series model. We'll fit a Bayesian Structural
119 | Time Series model with `bsts()`. We'll use a semi-local
120 | linear trend [intended for long-term forecasting](https://www.unofficialgoogledatascience.com/2017/07/fitting-bayesian-structural-time-series.html).
121 | There's a clear weekly trend in the numbers, which is to be expected given
122 | differences in vaccination rates on the weekend, so we'll also include a seasonal
123 | component with a 7-day period. Since we're on
124 | a log scale even changes by as much as 0.5 or 1 are large, we'll use some
125 | tight-ish-looking priors here:
126 |
127 | ```{r model}
128 | fit_model = function(max_date = max(df_log_diff$date)) {
129 | with(filter(df_log_diff, date <= max_date), bsts(log_diff_vax,
130 | state.specification = list() %>%
131 | AddSemilocalLinearTrend(log_diff_vax,
132 | level.sigma.prior = SdPrior(0.5, 1),
133 | slope.mean.prior = NormalPrior(0,0.5),
134 | initial.level.prior = NormalPrior(0,0.5),
135 | initial.slope.prior = NormalPrior(0,0.5),
136 | slope.sigma.prior = SdPrior(0.5, 1),
137 | slope.ar1.prior = Ar1CoefficientPrior(0, 0.5)
138 | ) %>%
139 | AddSeasonal(log_diff_vax, 7,
140 | sigma.prior = SdPrior(0.5, 1)
141 | ),
142 | prior = SdPrior(0.5, 1),
143 | niter = 40000,
144 | seed = 4272021 # for reproducibility
145 | ))
146 | }
147 |
148 | m = fit_model()
149 | ```
150 |
151 | Some diagnostics:
152 |
153 | ```{r}
154 | draws = as_draws(do.call(cbind, m[startsWith(names(m), "trend") | startsWith(names(m), "sigma")]))
155 | summary(draws, median, mad, quantile2, default_convergence_measures())
156 | ```
157 |
158 | The $\hat{R}$s and effective sample sizes look reasonable.
159 |
160 | ```{r}
161 | bayesplot::mcmc_trace(draws)
162 | ```
163 |
164 | Trace plots also look reasonable.
165 |
166 | ## Predictions
167 |
168 | Now we'll generate fits and predictions for a 180-day forecast:
169 |
170 | ```{r predictions}
171 | forecast_days = 180
172 |
173 | calc_fits = function(model = m, max_date = max(df_diff$date)) {
174 | df_log_diff %>%
175 | filter(date <= max_date) %>%
176 | mutate(.value = rvar(colSums(aperm(model$state.contributions, c(2, 1, 3)))))
177 | }
178 | fits_all = calc_fits()
179 |
180 | calc_pred_diff = function(model = m, max_date = max(df_diff$date)) {
181 | tibble(date = max_date + days(1:forecast_days)) %>%
182 | mutate(log_diff_vax = rvar(predict(model, horizon = forecast_days)$distribution))
183 | }
184 | pred_diff_all = calc_pred_diff()
185 | ```
186 |
187 | Predictions from the model look like this (with imputed missing data in gray):
188 |
189 | ```{r predictions_plot, fig.width = 7.11, fig.height = 4}
190 | x_dates = seq(make_date(2021, 1, 1), max(pred_diff_all$date), by = "month")
191 | x_scale = function(...) list(
192 | scale_x_date(
193 | limits = range(df_diff$date, pred_diff_all$date),
194 | breaks = x_dates,
195 | labels = months(x_dates, abbreviate = TRUE)
196 | ),
197 | coord_cartesian(expand = FALSE, ...)
198 | )
199 |
200 | summer_line = geom_vline(xintercept = make_date(2021, 9, 22), alpha = 0.25)
201 |
202 | widths = c(.5, .8, .95)
203 |
204 | plot_pred_diff = function(fits = fits_all, pred_diff = pred_diff_all) {
205 | pred_diff %>%
206 | ggplot(aes(date)) +
207 | stat_dist_lineribbon(aes(dist = .value, fill_ramp = stat(level)), data = fits, fill = "gray75", color = "gray65") +
208 | stat_dist_lineribbon(aes(dist = log_diff_vax), .width = widths, color = "#08519c") +
209 | geom_line(aes(y = log_diff_vax), data = df_log_diff, size = 1) +
210 | geom_vline(xintercept = min(pred_diff$date) - days(1), color = "gray75") +
211 | # summer_line +
212 | scale_fill_brewer() +
213 | theme_ggdist() +
214 | x_scale(ylim = c(-20, 5)) +
215 | labs(
216 | subtitle = "log(daily increase in logit(Percent vaccinated))",
217 | y = NULL,
218 | x = NULL
219 | )
220 | }
221 | plot_pred_diff()
222 | ```
223 |
224 | There is quite a bit of uncertainty here, especially for far-out forecasts.
225 |
226 | We can translate these predictions of differences into predictions of percent
227 | vaccinated by inverting the log, cumulatively summing differences in log odds,
228 | then inverting the logit transformation:
229 |
230 | ```{r pred_vax}
231 | # need a logit and inverse logit functions in terms of elementary operations (plogis
232 | # does not work on rvars directly)
233 | logit = function(p) {log(p) - log1p(-p)}
234 | inv_logit = function(x) {1/(1 + exp(-x))}
235 |
236 | calc_pred_vax = function(pred_diff = pred_diff_all) {
237 | last_observed_vax = df_diff %>%
238 | filter(date == min(pred_diff$date) - days(1)) %$%
239 | vax
240 |
241 | pred_diff %>%
242 | mutate(
243 | logit_vax = cumsum(c(
244 | logit(last_observed_vax) + exp(log_diff_vax[[1]]),
245 | exp(log_diff_vax[-1])
246 | )),
247 | vax = inv_logit(logit_vax)
248 | )
249 | }
250 | pred_vax_all = calc_pred_vax()
251 | ```
252 |
253 | Now we can plot the latent model alongside predictions of vaccination rate and
254 | the predicted probability that the proportion of vaccinated people is above
255 | some threshold (say, 70%):
256 |
257 | ```{r model_breakdown, fig.width = 5, fig.height = 7}
258 | plot_vaxed = function(pred_vax = pred_vax_all) {
259 | pred_vax %>%
260 | ggplot(aes(date)) +
261 | stat_dist_lineribbon(aes(dist = vax), .width = widths, color = "#08519c") +
262 | scale_fill_brewer() +
263 | geom_line(aes(y = vax), data = df_diff, size = 1) +
264 | geom_hline(yintercept = .7, alpha = 0.25) +
265 | geom_vline(xintercept = min(pred_vax$date) - days(1), color = "gray75") +
266 | # summer_line +
267 | scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
268 | x_scale() +
269 | theme_ggdist() +
270 | annotate("text", x = make_date(2020, 12, 28), y = 0.72, label = "70%", hjust = 0, vjust = 0, size = 3.25) +
271 | # annotate("text", x = make_date(2021, 9, 19), y = 0.25, hjust = 1, vjust = 0.5, label = "Sept 22\n End of summer", size = 3.25, lineheight = 1.05) +
272 | labs(
273 | subtitle = "Percent vaccinated (at least one dose)",
274 | y = NULL,
275 | x = NULL
276 | )
277 | }
278 |
279 | vaxed = plot_vaxed()
280 |
281 | diffs = plot_pred_diff()
282 |
283 | pred_vax_all$prob_vax_gt_70 = Pr(pred_vax_all$vax > .70)
284 |
285 | prob = pred_vax_all %>%
286 | ggplot(aes(date, prob_vax_gt_70)) +
287 | geom_line(color = "#08519c", size = 1) +
288 | theme_ggdist() +
289 | geom_hline(yintercept = 0.5, alpha = 0.25) +
290 | # summer_line +
291 | scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
292 | x_scale() +
293 | labs(
294 | subtitle = "Pr(Percent vaccinated > 70%)",
295 | y = NULL,
296 | x = NULL
297 | )
298 |
299 | vaxed / diffs / prob
300 | ```
301 |
302 | This gives us a final chart like this:
303 |
304 | ```{r vaxed_plot, warning = FALSE, fig.width = 7.11, fig.height = 4}
305 | vaxed +
306 | stat_smooth(aes(y = vax), formula = y ~ x, method = lm, se = FALSE,
307 | data = df %>% filter(row_number() > n() - 8),
308 | fullrange = TRUE, color = scales::alpha("white", 0.5), size = 1
309 | ) +
310 | annotate("text",
311 | label = "white line=\nlinear model ",
312 | fontface = "bold", x = make_date(2021, 9, 12), y = .89, color = "black", hjust = 1, size = 3.25, lineheight = 1.05
313 | ) +
314 | labs(
315 | subtitle = "Forecasted % US with at least one dose, from time series model of daily increase in log odds"
316 | )
317 | ```
318 |
319 | Anyway, my conclusion from all of this is essentially that there is *a lot* of
320 | uncertainty in what the vaccination rate will be, at least if we just look at the raw numbers,
321 | and doubtless the model I've shown here is way oversimplified --- but I have
322 | some trepidation about looking at even simpler models (like linear projections)
323 | and ignoring their uncertainty, as this is probably going to be at least a little misleading.
324 |
325 |
326 | ## Model checking
327 |
328 | For comparison purposes, let's fit the model on data up to a few different
329 | dates and see how it did on the data we've already observed.
330 |
331 | ```{r eval = FALSE}
332 | plan(multisession)
333 |
334 | models = tibble(
335 | upto = seq(as.Date("2021-03-01"), max(df$date), by = 15),
336 | model = future_map(upto, fit_model)
337 | )
338 | ```
339 |
340 | Then build the charts for each model:
341 |
342 | ```{r eval = FALSE}
343 | plots = map2(models$model, models$upto, function(m, max_date) {
344 | fits = calc_fits(m, max_date)
345 | pred_diff = calc_pred_diff(m, max_date)
346 | pred_vax = calc_pred_vax(pred_diff)
347 | list(
348 | vaxed = plot_vaxed(pred_vax),
349 | diffs = plot_pred_diff(fits, pred_diff)
350 | )
351 | })
352 |
353 | plots[[length(plots) + 1]] = list(
354 | vaxed = vaxed,
355 | diffs = diffs
356 | )
357 | ```
358 |
359 | Save each chart to disk:
360 |
361 | ```{r eval = FALSE}
362 | png_path <- file.path("model_plots", "plot%03d.png")
363 | png(png_path, type = "cairo", width = 1000, height = 1000, res = 200)
364 | for (p in plots) {
365 | print(p$vaxed / p$diffs)
366 | }
367 | dev.off()
368 | ```
369 |
370 | And put them together into an animation:
371 |
372 | ```{r eval = FALSE}
373 | png_files <- sprintf(png_path, seq_along(plots))
374 | gifski::gifski(png_files, "model_plots/animated.gif", width = 1000, height = 1000)
375 | ```
376 |
377 |
378 | ```{r}
379 | gganimate::gif_file("model_plots/animated.gif")
380 | ```
381 |
382 |
--------------------------------------------------------------------------------
/vax.md:
--------------------------------------------------------------------------------
1 | Vaccination rate in the US
2 | ================
3 | Matthew Kay
4 |
5 | The [New York Time vaccine
6 | tracker](https://www.nytimes.com/interactive/2020/us/covid-19-vaccine-doses.html)
7 | currently shows a predicted proportion of people vaccinated if
8 | vaccination continues “at the current pace”, using a linear projection.
9 | I thought it might be helpful to try to add some uncertainty to that
10 | line in a way that does not assume a linear trend.
11 |
12 | 
13 |
14 | ## Libraries needed for this analysis:
15 |
16 | ``` r
17 | library(lubridate)
18 | library(ggdist)
19 | library(tidybayes)
20 | library(bsts)
21 | library(patchwork)
22 | library(tidyverse)
23 | library(magrittr)
24 | library(posterior) # pak::pkg_install("stan-dev/posterior")
25 | library(future)
26 | library(furrr)
27 |
28 | theme_set(theme_ggdist())
29 | ```
30 |
31 | ## The data
32 |
33 | I downloaded the data on what share of the population has recieved at
34 | least one dose from
35 | [ourworldindata.org](https://ourworldindata.org/coronavirus/country/united-states#what-share-of-the-population-has-received-at-least-one-dose-of-the-covid-19-vaccine):
36 |
37 | ``` r
38 | download.file(
39 | "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv",
40 | "share-people-vaccinated-covid.csv"
41 | )
42 | ```
43 |
44 | Filtering down just to the US:
45 |
46 | ``` r
47 | df = read.csv("share-people-vaccinated-covid.csv") %>%
48 | filter(location == "United States") %>%
49 | transmute(
50 | date = as_date(date),
51 | day = as.numeric(date - min(date)),
52 | vax = people_vaccinated_per_hundred / 100
53 | )
54 |
55 | head(df)
56 | ```
57 |
58 | ## date day vax
59 | ## 1 2020-12-20 0 0.0017
60 | ## 2 2020-12-21 1 0.0018
61 | ## 3 2020-12-22 2 NA
62 | ## 4 2020-12-23 3 0.0030
63 | ## 5 2020-12-24 4 NA
64 | ## 6 2020-12-25 5 NA
65 |
66 | The data looks like this:
67 |
68 | ``` r
69 | df %>%
70 | ggplot(aes(date, vax)) +
71 | geom_point(na.rm = TRUE)
72 | ```
73 |
74 |
75 |
76 | ## NYT-like linear model
77 |
78 | First off, we can approximate what NYT is doing using linear regression
79 | based on the last couple of points (this doesn’t line up exactly with
80 | their model but it seems close):
81 |
82 | ``` r
83 | df %>%
84 | ggplot(aes(date, vax)) +
85 | geom_point(na.rm = TRUE) +
86 | stat_smooth(formula = y ~ x, method = lm, se = FALSE, data = . %>% filter(row_number() > n() - 8), fullrange = TRUE) +
87 | scale_x_date(limits = c(min(df$date), make_date(2021, 10, 31))) +
88 | coord_cartesian(ylim = c(0, 1)) +
89 | geom_vline(xintercept = make_date(2021, 10, 12), alpha = 0.2) +
90 | geom_hline(yintercept = 0.9, alpha = 0.2) +
91 | annotate("text", x = make_date(2021, 5, 28), y = 0.92, label = "90%", hjust = 0, vjust = 0, size = 3.5) +
92 | annotate("text", x = make_date(2021, 10, 8), y = 0.5, hjust = 1, vjust = 0.5, label = "Oct 12", size = 3.5)
93 | ```
94 |
95 |
96 |
97 | However, instead of doing this I am going to fit a time series model to
98 | the data.
99 |
100 | ## Data prep for time series model of daily changes
101 |
102 | Analyzing the data on the raw scale is hard because (1) we know that the
103 | percent of vaccinated people is bounded between 0 and 1 and (2) we know
104 | that the percent of vaccinated people is always increasing. Combining
105 | these two facts, we could instead look at the difference in the logit of
106 | the percent of vaccinated people:
107 |
108 | ``` r
109 | df_diff = df %>%
110 | select(-date) %>%
111 | complete(day = min(day):max(day)) %>%
112 | mutate(date = min(df$date) + days(day))
113 |
114 | df_diff %>%
115 | ggplot(aes(date, c(NA, diff(qlogis(vax))))) +
116 | geom_line() +
117 | geom_point() +
118 | labs(y = "daily increase in logit(Percent vaccinated)")
119 | ```
120 |
121 |
122 |
123 | Since we know that this difference must be positive (since the number of
124 | vaccinated people cannot decrease), we might analyse this on a log
125 | scale. That might also stabilize the variance:
126 |
127 | ``` r
128 | df_log_diff = df_diff %>%
129 | mutate(log_diff_vax = c(NA, log(diff(qlogis(vax))))) %>%
130 | slice(-1)
131 |
132 | df_log_diff %>%
133 | ggplot(aes(date, log_diff_vax)) +
134 | geom_line() +
135 | geom_point(na.rm = TRUE) +
136 | labs(y = "log(daily increase in logit(Percent vaccinated))")
137 | ```
138 |
139 |
140 |
141 | There’s some missing data here, which we’ll let the model impute (FWIW I
142 | also tried just using linear interpolation on the raw data prior to
143 | translating it into differences and got very similar results to the
144 | imputation approach).
145 |
146 | ## Time series model
147 |
148 | This seems a good point to fit a time series model. We’ll fit a Bayesian
149 | Structural Time Series model with `bsts()`. We’ll use a semi-local
150 | linear trend [intended for long-term
151 | forecasting](https://www.unofficialgoogledatascience.com/2017/07/fitting-bayesian-structural-time-series.html).
152 | There’s a clear weekly trend in the numbers, which is to be expected
153 | given differences in vaccination rates on the weekend, so we’ll also
154 | include a seasonal component with a 7-day period. Since we’re on a log
155 | scale even changes by as much as 0.5 or 1 are large, we’ll use some
156 | tight-ish-looking priors here:
157 |
158 | ``` r
159 | fit_model = function(max_date = max(df_log_diff$date)) {
160 | with(filter(df_log_diff, date <= max_date), bsts(log_diff_vax,
161 | state.specification = list() %>%
162 | AddSemilocalLinearTrend(log_diff_vax,
163 | level.sigma.prior = SdPrior(0.5, 1),
164 | slope.mean.prior = NormalPrior(0,0.5),
165 | initial.level.prior = NormalPrior(0,0.5),
166 | initial.slope.prior = NormalPrior(0,0.5),
167 | slope.sigma.prior = SdPrior(0.5, 1),
168 | slope.ar1.prior = Ar1CoefficientPrior(0, 0.5)
169 | ) %>%
170 | AddSeasonal(log_diff_vax, 7,
171 | sigma.prior = SdPrior(0.5, 1)
172 | ),
173 | prior = SdPrior(0.5, 1),
174 | niter = 40000,
175 | seed = 4272021 # for reproducibility
176 | ))
177 | }
178 |
179 | m = fit_model()
180 | ```
181 |
182 | ## =-=-=-=-= Iteration 0 Thu May 27 01:39:04 2021
183 | ## =-=-=-=-=
184 | ## =-=-=-=-= Iteration 4000 Thu May 27 01:39:10 2021
185 | ## =-=-=-=-=
186 | ## =-=-=-=-= Iteration 8000 Thu May 27 01:39:16 2021
187 | ## =-=-=-=-=
188 | ## =-=-=-=-= Iteration 12000 Thu May 27 01:39:22 2021
189 | ## =-=-=-=-=
190 | ## =-=-=-=-= Iteration 16000 Thu May 27 01:39:28 2021
191 | ## =-=-=-=-=
192 | ## =-=-=-=-= Iteration 20000 Thu May 27 01:39:35 2021
193 | ## =-=-=-=-=
194 | ## =-=-=-=-= Iteration 24000 Thu May 27 01:39:41 2021
195 | ## =-=-=-=-=
196 | ## =-=-=-=-= Iteration 28000 Thu May 27 01:39:48 2021
197 | ## =-=-=-=-=
198 | ## =-=-=-=-= Iteration 32000 Thu May 27 01:39:54 2021
199 | ## =-=-=-=-=
200 | ## =-=-=-=-= Iteration 36000 Thu May 27 01:39:59 2021
201 | ## =-=-=-=-=
202 |
203 | Some diagnostics:
204 |
205 | ``` r
206 | draws = as_draws(do.call(cbind, m[startsWith(names(m), "trend") | startsWith(names(m), "sigma")]))
207 | summary(draws, median, mad, quantile2, default_convergence_measures())
208 | ```
209 |
210 | ## # A tibble: 6 x 8
211 | ## variable median mad q5 q95 rhat ess_bulk ess_tail
212 | ##
213 | ## 1 sigma.obs 0.125 0.0160 0.102 0.155 1.00 4805. 10768.
214 | ## 2 trend.level.sd 0.122 0.0158 0.0992 0.151 1.00 4052. 8332.
215 | ## 3 trend.slope.mean -0.0180 0.0133 -0.0400 0.00392 1.00 10531. 19778.
216 | ## 4 trend.slope.ar.coeffi~ -0.379 0.181 -0.643 -0.0669 1.00 3913. 8190.
217 | ## 5 trend.slope.sd 0.146 0.0196 0.118 0.183 1.00 4881. 10225.
218 | ## 6 sigma.seasonal.7 0.0979 0.0109 0.0819 0.119 1.00 5081. 10211.
219 |
220 | The *R̂*s and effective sample sizes look reasonable.
221 |
222 | ``` r
223 | bayesplot::mcmc_trace(draws)
224 | ```
225 |
226 |
227 |
228 | Trace plots also look reasonable.
229 |
230 | ## Predictions
231 |
232 | Now we’ll generate fits and predictions for a 180-day forecast:
233 |
234 | ``` r
235 | forecast_days = 180
236 |
237 | calc_fits = function(model = m, max_date = max(df_diff$date)) {
238 | df_log_diff %>%
239 | filter(date <= max_date) %>%
240 | add_draws(colSums(aperm(model$state.contributions, c(2, 1, 3))))
241 | }
242 | fits_all = calc_fits()
243 |
244 | calc_pred_diff = function(model = m, max_date = max(df_diff$date)) {
245 | tibble(date = max_date + days(1:forecast_days)) %>%
246 | add_draws(predict(model, horizon = forecast_days)$distribution, value = "log_diff_vax")
247 | }
248 | pred_diff_all = calc_pred_diff()
249 | ```
250 |
251 | Predictions from the model look like this (with imputed missing data in
252 | gray):
253 |
254 | ``` r
255 | x_dates = seq(make_date(2021, 1, 1), max(pred_diff_all$date), by = "month")
256 | x_scale = function(...) list(
257 | scale_x_date(
258 | limits = range(df_diff$date, pred_diff_all$date),
259 | breaks = x_dates,
260 | labels = months(x_dates, abbreviate = TRUE)
261 | ),
262 | coord_cartesian(expand = FALSE, ...)
263 | )
264 |
265 | summer_line = geom_vline(xintercept = make_date(2021, 9, 22), alpha = 0.25)
266 |
267 | widths = c(.5, .8, .95)
268 |
269 | plot_pred_diff = function(fits = fits_all, pred_diff = pred_diff_all) {
270 | pred_diff %>%
271 | ggplot(aes(date, log_diff_vax)) +
272 | stat_lineribbon(aes(y = .value, fill_ramp = stat(level)), data = fits, fill = "gray75", color = "gray65") +
273 | stat_lineribbon(.width = widths, color = "#08519c") +
274 | geom_line(data = df_log_diff, size = 1) +
275 | geom_vline(xintercept = min(pred_diff$date) - days(1), color = "gray75") +
276 | # summer_line +
277 | scale_fill_brewer() +
278 | theme_ggdist() +
279 | x_scale(ylim = c(-20, 5)) +
280 | labs(
281 | subtitle = "log(daily increase in logit(Percent vaccinated))",
282 | y = NULL,
283 | x = NULL
284 | )
285 | }
286 | plot_pred_diff()
287 | ```
288 |
289 |
290 |
291 | There is quite a bit of uncertainty here, especially for far-out
292 | forecasts.
293 |
294 | We can translate these predictions of differences into predictions of
295 | percent vaccinated by inverting the log, cumulatively summing
296 | differences in log odds, then inverting the logit transformation:
297 |
298 | ``` r
299 | calc_pred_vax = function(pred_diff = pred_diff_all) {
300 | last_observed_vax = df_diff %>%
301 | filter(date == min(pred_diff$date) - days(1)) %$%
302 | vax
303 |
304 | pred_diff %>%
305 | group_by(.draw) %>%
306 | mutate(
307 | vax = plogis(cumsum(c(
308 | qlogis(last_observed_vax) + exp(log_diff_vax[[1]]),
309 | exp(log_diff_vax[-1])
310 | )))
311 | )
312 | }
313 | pred_vax_all = calc_pred_vax()
314 | ```
315 |
316 | Now we can plot the latent model alongside predictions of vaccination
317 | rate and the predicted probability that the proportion of vaccinated
318 | people is above some threshold (say, 70%):
319 |
320 | ``` r
321 | plot_vaxed = function(pred_vax = pred_vax_all) {
322 | pred_vax %>%
323 | ggplot(aes(date, vax)) +
324 | stat_lineribbon(.width = widths, color = "#08519c") +
325 | scale_fill_brewer() +
326 | geom_line(data = df_diff, size = 1) +
327 | geom_hline(yintercept = .7, alpha = 0.25) +
328 | geom_vline(xintercept = min(pred_vax$date) - days(1), color = "gray75") +
329 | # summer_line +
330 | scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
331 | x_scale() +
332 | theme_ggdist() +
333 | annotate("text", x = make_date(2020, 12, 28), y = 0.72, label = "70%", hjust = 0, vjust = 0, size = 3.25) +
334 | # annotate("text", x = make_date(2021, 9, 19), y = 0.25, hjust = 1, vjust = 0.5, label = "Sept 22\n End of summer", size = 3.25, lineheight = 1.05) +
335 | labs(
336 | subtitle = "Percent vaccinated (at least one dose)",
337 | y = NULL,
338 | x = NULL
339 | )
340 | }
341 |
342 | vaxed = plot_vaxed()
343 |
344 | diffs = plot_pred_diff()
345 |
346 | prob = pred_vax_all %>%
347 | group_by(date) %>%
348 | summarise(prob_vax_gt_70 = mean(vax > .70)) %>%
349 | ggplot(aes(date, prob_vax_gt_70)) +
350 | geom_line(color = "#08519c", size = 1) +
351 | theme_ggdist() +
352 | geom_hline(yintercept = 0.5, alpha = 0.25) +
353 | # summer_line +
354 | scale_y_continuous(limits = c(0,1), labels = scales::percent_format()) +
355 | x_scale() +
356 | labs(
357 | subtitle = "Pr(Percent vaccinated > 70%)",
358 | y = NULL,
359 | x = NULL
360 | )
361 |
362 | vaxed / diffs / prob
363 | ```
364 |
365 |
366 |
367 | This gives us a final chart like this:
368 |
369 | ``` r
370 | vaxed +
371 | stat_smooth(formula = y ~ x, method = lm, se = FALSE,
372 | data = df %>% filter(row_number() > n() - 8),
373 | fullrange = TRUE, color = scales::alpha("white", 0.5), size = 1
374 | ) +
375 | annotate("text",
376 | label = "white line=\nlinear model ",
377 | fontface = "bold", x = make_date(2021, 9, 12), y = .89, color = "black", hjust = 1, size = 3.25, lineheight = 1.05
378 | ) +
379 | labs(
380 | subtitle = "Forecasted % US with at least one dose, from time series model of daily increase in log odds"
381 | )
382 | ```
383 |
384 |
385 |
386 | Anyway, my conclusion from all of this is essentially that there is *a
387 | lot* of uncertainty in what the vaccination rate will be, at least if we
388 | just look at the raw numbers, and doubtless the model I’ve shown here is
389 | way oversimplified — but I have some trepidation about looking at even
390 | simpler models (like linear projections) and ignoring their uncertainty,
391 | as this is probably going to be at least a little misleading.
392 |
393 | ## Model checking
394 |
395 | For comparison purposes, let’s fit the model on data up to a few
396 | different dates and see how it did on the data we’ve already observed.
397 |
398 | ``` r
399 | plan(multisession)
400 |
401 | models = tibble(
402 | upto = seq(as.Date("2021-03-01"), max(df$date), by = 15),
403 | model = future_map(upto, fit_model)
404 | )
405 | ```
406 |
407 | Then build the charts for each model:
408 |
409 | ``` r
410 | plots = map2(models$model, models$upto, function(m, max_date) {
411 | fits = calc_fits(m, max_date)
412 | pred_diff = calc_pred_diff(m, max_date)
413 | pred_vax = calc_pred_vax(pred_diff)
414 | list(
415 | vaxed = plot_vaxed(pred_vax),
416 | diffs = plot_pred_diff(fits, pred_diff)
417 | )
418 | })
419 |
420 | plots[[length(plots) + 1]] = list(
421 | vaxed = vaxed,
422 | diffs = diffs
423 | )
424 | ```
425 |
426 | And put them together into an animation:
427 |
428 | ``` r
429 | png_path <- file.path("model_plots", "plot%03d.png")
430 | png(png_path, type = "cairo", width = 1000, height = 1000, res = 200)
431 | for (p in plots) {
432 | print(p$vaxed / p$diffs)
433 | }
434 | dev.off()
435 | ```
436 |
437 | ``` r
438 | png_files <- sprintf(png_path, seq_along(plots))
439 | gifski::gifski(png_files, "model_plots/animated.gif", width = 1000, height = 1000)
440 | ```
441 |
442 | ``` r
443 | gganimate::gif_file("model_plots/animated.gif")
444 | ```
445 |
446 | 
447 |
--------------------------------------------------------------------------------