├── .Rprofile
├── .gitignore
├── CONTRIBUTING.Rmd
├── CONTRIBUTING.md
├── LICENSE.md
├── Latent-Bayesian-IPW.Rproj
├── README.Rmd
├── README.md
├── assets
│   ├── apsa.csl
│   ├── references.bib
│   └── style.css
├── data
│   ├── .gitignore
│   └── sim_results.rds
├── functions
│   ├── ardl-sims.R
│   ├── dgp-sim.R
│   └── msm-sims.R
├── manuscript
│   ├── manuscript.qmd
│   └── output
│       └── Nafa and Heiss (2022)_Taking-Uncertainty-Seriously-APSA.pdf
├── models
│   └── stan
│       ├── ARDL_Simulation.exe
│       ├── ARDL_Simulation.stan
│       ├── IPTW_Outcome_Simulation.exe
│       └── IPTW_Outcome_Simulation.stan
├── presentation
│   ├── .gitignore
│   ├── Taking-Uncertainty-Seriously_Bayesian-Marginal-Structural-Models-for-Causal Inference.pdf
│   ├── apsa-presentation.html
│   ├── apsa-presentation.qmd
│   ├── math-colors.js
│   └── style.css
└── scripts
    └── 01-simulation-study.R
/.Rprofile:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # R stuff
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 |
6 | # Targets stuff
7 | _targets
8 |
9 | # Site output
10 | **/_site/*
11 | **/site_libs/*
12 |
13 | # knitr and caching stuff
14 | **/*_files/*
15 | **/*_cache/*
16 | **/model_cache*
17 | manuscript/*.html
18 |
19 | # Output
20 | # (ignore for now)
21 | analysis/output/
22 |
23 | # Miscellaneous
24 | *.~lock.*
25 | .DS_Store
26 | .dropbox
27 | Icon?
28 | admin/*
29 | desktop.ini
30 | sources/*
31 | .old-stuff
32 |
33 |
--------------------------------------------------------------------------------
/CONTRIBUTING.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | github_document
4 | ---
5 |
6 | # Contributing
7 |
8 | We appreciate and welcome pull requests from everyone. By participating in this project, you
9 | agree to abide by our [code of conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html).
10 |
11 | ## Getting Started
12 |
13 | * Make sure you have a [GitHub account](https://github.com/signup/free). If you are not familiar with git and GitHub, take a look at an introductory tutorial to get started.
14 | * [Submit a post for your issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/), assuming one does not already exist.
15 | * Clearly describe your issue, including steps to reproduce when it is a bug, or some justification for a proposed improvement.
16 | * Fork the repository on GitHub to make a copy of the repository on your account. Or use this line in your shell terminal:
17 |
18 | `git clone https://github.com/ajnafa/Latent-Bayesian-MSM.git`
19 |
20 | ## Making changes
21 |
22 | * Edit the files, save often, and make commits of logical units, where each commit indicates one concept.
23 | * Follow the tidyverse [style guide](https://style.tidyverse.org/index.html).
24 | * Make sure you write [good commit messages](https://cbea.ms/git-commit/).
25 | * Make sure you have added the necessary tests for your code changes.
26 | * Run _all_ the tests using `devtools::check()` to ensure nothing else was accidentally broken.
27 | * If you need help or are unsure about anything, post an update to [your issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/).
28 |
29 | ## Submitting your changes
30 |
31 | Push to your fork and [submit a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
32 |
33 | At this point you're waiting on us. We like to at least comment on pull requests
34 | within a few business days. We may suggest some changes or improvements or alternatives.
35 |
36 | Some things you can do that will increase the chance that your pull request is accepted:
37 |
38 | * Engage in discussion on [your issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/).
39 | * Be familiar with the background literature cited in the [README](README.md).
40 | * Write tests that pass.
41 | * Follow our [code style guide](https://style.tidyverse.org/index.html).
42 | * Write a [good commit message](https://cbea.ms/git-commit/).
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributing
3 |
4 | We appreciate and welcome pull requests from everyone. By participating
5 | in this project, you agree to abide by our [code of
6 | conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html).
7 |
8 | ## Getting Started
9 |
10 | - Make sure you have a [GitHub
11 | account](https://github.com/signup/free). If you are not familiar
12 |   with git and GitHub, take a look at an introductory tutorial to
13 |   get started.
14 |
15 | - [Submit a post for your
16 | issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/),
17 | assuming one does not already exist.
18 |
19 | - Clearly describe your issue, including steps to reproduce when
20 | it is a bug, or some justification for a proposed improvement.
21 |
22 | - Fork the repository on GitHub to make a copy of the repository on
23 | your account. Or use this line in your shell terminal:
24 |
25 | `git clone https://github.com/ajnafa/Latent-Bayesian-MSM.git`
26 |
27 | ## Making changes
28 |
29 | - Edit the files, save often, and make commits of logical units, where
30 |   each commit indicates one concept.
31 | - Follow the tidyverse [style
32 | guide](https://style.tidyverse.org/index.html).
33 | - Make sure you write [good commit
34 | messages](https://cbea.ms/git-commit/).
35 | - Make sure you have added the necessary tests for your code changes.
36 | - Run *all* the tests using `devtools::check()` to ensure nothing else
37 |   was accidentally broken.
38 | - If you need help or are unsure about anything, post an update to [your
39 |   issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/).
40 |
41 | ## Submitting your changes
42 |
43 | Push to your fork and [submit a pull
44 | request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
45 |
46 | At this point you’re waiting on us. We like to at least comment on pull
47 | requests within a few business days. We may suggest some changes or
48 | improvements or alternatives.
49 |
50 | Some things you can do that will increase the chance that your pull
51 | request is accepted:
52 |
53 | - Engage in discussion on [your
54 | issue](https://github.com/ajnafa/Latent-Bayesian-MSM/issues/).
55 | - Be familiar with the background literature cited in the
56 |   [README](README.md).
57 | - Write tests that pass.
58 | - Follow our [code style
59 | guide](https://style.tidyverse.org/index.html).
60 | - Write a [good commit message](https://cbea.ms/git-commit/).
61 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2022, Andrew Heiss and Adam Jordan Nafa
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/Latent-Bayesian-IPW.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: XeLaTeX
14 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | github_document
4 | bibliography: "assets/references.bib"
5 | csl: "assets/apsa.csl"
6 | ---
7 |
8 |
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(
12 | collapse = TRUE,
13 | comment = "#>",
14 | fig.path = "README-",
15 | fig.retina = 3
16 | )
17 | ```
18 |
19 | # Taking Uncertainty Seriously: Bayesian Marginal Structural Models for Causal Inference in Political Science
20 |
21 | ### Andrew Heiss and A. Jordan Nafa
22 |
23 |
24 |
25 |
26 | ### Abstract
27 |
28 | > The past two decades have been characterized by considerable progress in developing approaches to causal inference in situations where true experimental manipulation is either impractical or impossible. With few exceptions, however, commonly employed techniques in political science have developed largely within a frequentist framework [i.e., @Blackwell2018; @Imai2019; @Torres2020]. In this article, we argue that common approaches rest fundamentally upon assumptions that are difficult to defend in many areas of political research and highlight the benefits of quantifying uncertainty in the estimation of causal effects [@Gill1999; @Gill2020; @Schrodt2014; @Western1994]. Extending the approach to causal inference for cross-sectional time series and panel data under selection on observables introduced by @Blackwell2018, we develop a two-step pseudo-Bayesian method for estimating marginal structural models. We demonstrate our proposed procedure in the context of linear mixed effects models via a simulation study and two empirical examples. Finally, we provide flexible open-source software implementing the proposed method.
29 |
30 |
31 | ## Licenses
32 |
33 | **Text and figures:** All prose and images are licensed under Creative Commons ([CC-BY-4.0](http://creativecommons.org/licenses/by/4.0/)).
34 |
35 | **Code:** All code is licensed under the [BSD 2-Clause License](LICENSE.md).
36 |
37 |
38 | ## Contributions and Code of Conduct
39 |
40 | We welcome contributions from everyone. Before you get started, please see our [contributor guidelines](CONTRIBUTING.md). Please note that this project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms.
41 |
42 | ---
43 |
44 | ## References
45 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Taking Uncertainty Seriously: Bayesian Marginal Structural Models for Causal Inference in Political Science
5 |
6 | ### Andrew Heiss and A. Jordan Nafa
7 |
8 |
9 |
10 |
11 | ### Abstract
12 |
13 | > The past two decades have been characterized by considerable progress
14 | > in developing approaches to causal inference in situations where true
15 | > experimental manipulation is either impractical or impossible. With
16 | > few exceptions, however, commonly employed techniques in political
17 | > science have developed largely within a frequentist framework (i.e.,
18 | > Blackwell and Glynn 2018; Imai and Kim 2019; Torres 2020). In this
19 | > article, we argue that common approaches rest fundamentally upon
20 | > assumptions that are difficult to defend in many areas of political
21 | > research and highlight the benefits of quantifying uncertainty in the
22 | > estimation of causal effects (Gill 1999; Gill and Heuberger 2020;
23 | > Schrodt 2014; Western and Jackman 1994). Extending the approach to
24 | > causal inference for cross-sectional time series and panel data under
25 | > selection on observables introduced by Blackwell and Glynn (2018), we
26 | > develop a two-step pseudo-Bayesian method for estimating marginal
27 | > structural models. We demonstrate our proposed procedure in the
28 | > context of linear mixed effects models via a simulation study and two
29 | > empirical examples. Finally, we provide flexible open-source software
30 | > implementing the proposed method.
31 |
32 | ## Licenses
33 |
34 | **Text and figures:** All prose and images are licensed under Creative
35 | Commons ([CC-BY-4.0](http://creativecommons.org/licenses/by/4.0/)).
36 |
37 | **Code:** All code is licensed under the [BSD 2-Clause
38 | License](LICENSE.md).
39 |
40 | ## Contributions and Code of Conduct
41 |
42 | We welcome contributions from everyone. Before you get started, please
43 | see our [contributor guidelines](CONTRIBUTING.md). Please note that this
44 | project is released with a [Contributor Code of
45 | Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html).
46 | By contributing to this project, you agree to abide by its terms.
47 |
48 | ------------------------------------------------------------------------
49 |
50 | ## References
51 |
52 |
53 |
54 |
55 |
56 | Blackwell, Matthew, and Adam N. Glynn. 2018. “How to Make Causal
57 | Inferences with Time-Series Cross-Sectional Data Under Selection on
58 | Observables.” *American Political Science Review* 112: 1067–82.
59 |
60 |
61 |
62 |
63 |
64 | Gill, Jeff. 1999. “The Insignificance of Null Hypothesis Significance
65 | Testing.” *Political Research Quarterly* 52: 647–74.
66 |
67 |
68 |
69 |
70 |
71 | Gill, Jeff, and Simon Heuberger. 2020. “Bayesian Modeling and Inference:
72 | A Post-Modern Perspective.” In *The SAGE Handbook of Research Methods in
73 | Political Science and International Relations*, eds. Luigi Curini and
74 | Robert Franzese. London, UK: SAGE, 961–84.
75 |
76 |
77 |
78 |
79 |
80 | Imai, Kosuke, and In Song Kim. 2019. “[When Should We Use Unit Fixed
81 | Effects Regression Models for Causal Inference with Longitudinal
82 | Data?](https://doi.org/10.1111/ajps.12417)” *American Journal of
83 | Political Science* 63(2): 467–90.
84 |
85 |
86 |
87 |
88 |
89 | Schrodt, Philip A. 2014. “[Seven Deadly Sins of Contemporary
90 | Quantitative Political
91 | Analysis](https://doi.org/10.1177/0022343313499597).” *Journal of Peace
92 | Research* 51: 287–300.
93 |
94 |
95 |
96 |
97 |
98 | Torres, Michelle. 2020. “[Estimating Controlled Direct Effects Through
99 | Marginal Structural Models](https://doi.org/10.1017/psrm.2020.3).”
100 | *Political Science Research and Methods* 8(3): 391–408.
101 |
102 |
103 |
104 |
105 |
106 | Western, Bruce, and Simon Jackman. 1994. “Bayesian Inference for
107 | Comparative Research.” *American Political Science Review* 88: 412–23.
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/assets/apsa.csl:
--------------------------------------------------------------------------------
1 |
2 |
237 |
--------------------------------------------------------------------------------
/assets/style.css:
--------------------------------------------------------------------------------
1 | /* Loading Fonts */
2 | @import url('https://fonts.googleapis.com/css2?family=EB+Garamond:ital,wght@0,400;0,500;0,600;0,700;1,400;1,500;1,600;1,700&display=swap');
3 | @import url(https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic);
4 | @import url(https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700);
5 |
6 | /* Title Font Settings */
7 | h1 {
8 | font-family: 'EB Garamond', serif;
9 | color: #000000;
10 | font-weight: 700;
11 |   font-style: normal;
12 | font-size: 40px;
13 | text-align: left;
14 | width: 100%;
15 | }
16 |
17 | /* Author, Affiliation, and Subheader Font Settings */
18 | .author {
19 | font-family: 'EB Garamond', serif;
20 | font-weight: 600;
21 | font-style: normal;
22 | font-size: 30px;
23 | width: 100%;
24 | display: table;
25 | align-items: center;
26 | }
27 |
28 | /* Author, Affiliation, and Subheader Font Settings */
29 | h2, h3 {
30 | font-family: 'EB Garamond', serif;
31 | font-weight: 600;
32 | font-size: 30px;
33 | }
34 |
35 | h3 {font-style: italic;}
36 |
37 | /* Body Text Font Settings */
38 | body {
39 | font-family: 'EB Garamond', serif;
40 | font-weight: 500;
41 | font-style: normal;
42 | font-size: 18px;
43 | text-align: justify;
44 | }
45 |
46 | pre {
47 | display: block;
48 | padding: 10px;
49 | margin: 0 0 10.5px;
50 | font-size: 14px;
51 | line-height: 1.42857143;
52 | word-break: break-all;
53 | word-wrap: break-word;
54 | color: #000000;
55 | background-color: rgb(255 255 255 / 15%);
56 | border: 1px solid #dddddd;
57 | border-radius: 4px;
58 | }
59 |
60 | pre.sourceCode {
61 | color: rgb(207, 207, 194);
62 | background-color: #0c0c0c;
63 | }
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | design_stage_msm_sims.rds
2 | final_sims.rds
3 | outcome_stage_msm_sims.rds
4 |
--------------------------------------------------------------------------------
/data/sim_results.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajnafa/Latent-Bayesian-MSM/8d5596abbe49284feeb5f4f39e9ed0ee7abb24c5/data/sim_results.rds
--------------------------------------------------------------------------------
/functions/ardl-sims.R:
--------------------------------------------------------------------------------
1 | #' Function for Fitting the ARDL Model Simulations
2 | #'
3 | #' @param ardl.data A list object containing the data to be passed to the
4 | #' Stan model as returned by the `make_ardl_data` function.
5 | #'
6 | #' @param ardl.mod The compiled Stan model to be used for each simulated data
7 | #' set. Should be an environment returned by `cmdstanr::cmdstan_model`
8 | #'
9 | #' @param ... Additional arguments passed down to `cmdstanr::sample`
10 | #'
11 | #' @return Returns a tibble containing the summarized draws for each
12 | #' of the models in the simulation
13 | #'
14 | #' @export sim_ardl_bayes
15 | #'
16 | sim_ardl_bayes <- function(ardl.data, ardl.mod, ...) {
17 |
18 | # Fit the Outcome-Stage Model
19 | ardl_fit <- ardl.mod$sample(
20 | data = ardl.data,
21 | refresh = 0,
22 | sig_figs = 5,
23 | parallel_chains = 4,
24 | chains = 4,
25 | iter_warmup = 1000,
26 | iter_sampling = 1000,
27 | max_treedepth = 12,
28 | adapt_delta = 0.9,
29 | show_messages = FALSE,
30 | ...
31 | )
32 |
33 | # Calculate a summary of the draws
34 |   ardl_result <- posterior::summarise_draws(ardl_fit$draws())
35 |
36 | # Return the data frame of draws
37 | return(ardl_result)
38 | }
39 |
40 | #' Function for Building the Data to Pass to the ARDL Stan Model
41 | #'
42 | #' @param sim.data A simulated data set returned by the `dgp_sim` function
43 | #'
44 | #' @param ardl.form A formula for the ARDL model to be passed down to
45 | #' `brms::make_standata`
46 | #'
47 | #' @param prior.scale Scale factor to be used for auto-scaling of the priors
48 | #' on the coefficients. For details see `vignette("priors", package = "rstanarm")`
49 | #'
50 | #' @param ... Currently unused
51 | #'
52 | #' @return Returns a list object with the data to be passed to the Stan model
53 | #'
54 | #' @export make_ardl_data
55 | #'
56 | make_ardl_data <- function(sim.data, ardl.form, prior.scale, ...) {
57 | # Take advantage of brms functionality to get the initial data
58 | ardl_data <- brms::make_standata(
59 | ardl.form,
60 | family = brms::brmsfamily(family = "gaussian", link = "identity"),
61 | data = sim.data
62 | )
63 |
64 | # Priors on the coefficients
65 | beta_sds <- prior.scale * (sd(ardl_data$Y)/apply(ardl_data$X, 2, sd))
66 |
67 | # Prepare the data for use with Stan
68 | ardl_data <- list(
69 | N = ardl_data$N,
70 | K = ardl_data$K,
71 | Y = ardl_data$Y,
72 | X = ardl_data$X,
73 | beta_sds = beta_sds
74 | )
75 |
76 | # Return the list of data
77 | return(ardl_data)
78 | }
79 |
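# Example usage (a minimal sketch; the formula, prior scale, group/period
# counts, and Stan file path below are illustrative assumptions rather than
# the settings used in the actual simulation study). Wrapped in `if (FALSE)`
# so that sourcing this file does not trigger a model fit.
if (FALSE) {
  library(data.table)  # required by dgp_sim()
  library(cmdstanr)    # interface to Stan
  library(posterior)   # for working with posterior draws

  # Simulate a single panel data set (see functions/dgp-sim.R)
  sim_dat <- dgp_sim(.groups = 50, .periods = 20, .true_gamma = -0.1, .treat_conf = TRUE)

  # Build the data list for the ARDL Stan model
  ardl_data <- make_ardl_data(
    sim.data = sim_dat,
    ardl.form = Y ~ X + X_Lag + Y_Lag + Z,
    prior.scale = 2.5
  )

  # Compile the Stan model and summarise the posterior draws
  ardl_mod <- cmdstan_model("models/stan/ARDL_Simulation.stan")
  ardl_draws <- sim_ardl_bayes(ardl.data = ardl_data, ardl.mod = ardl_mod)
}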
--------------------------------------------------------------------------------
/functions/dgp-sim.R:
--------------------------------------------------------------------------------
1 | #' Function for Simulating the Datasets, Modified Version of Blackwell and Glynn (2018)
2 | #'
3 | #' @param .groups Number of groups in the cross-sectional dimension of the data
4 | #'
5 | #' @param .periods Number of periods in the temporal dimension of the data
6 | #'
7 | #' @param .true_gamma Numeric argument(s) indicating true value of the lagged treatment
8 | #' effect.
9 | #'
10 | #' @param .treat_conf Logical argument(s) indicating whether the unobserved confounder
11 | #' is correlated with the treatment
12 | #'
13 | #' @param .seed Integer value to use for the random number seed
14 | #'
15 | #' @param ... Additional arguments for the simulation, currently unused
16 | #'
17 | #' @export dgp_sim
18 | #'
19 | dgp_sim <- function(.groups, .periods, .true_gamma, .treat_conf, ...) {
20 |
21 | # Define the fixed parameter values
22 | mu = c(0, -0.1, -0.1) # Effect of X[t-1], X[t], and X[t] * X[t-1] on Y[t]
23 | alpha_t0 = c(-0.23, 2.5) # Values of X at time t = 1
24 | alpha_t1 = c(-1.3, 2.5, 1.5) # Values of X at time t = 2:T
25 | gamma = .true_gamma
26 |
27 |   # Create N x T Matrices
28 | Y = X = Z = matrix(NA_real_, nrow = .groups, ncol = .periods)
29 |
30 | # Country-Specific Time Invariant Confounding
31 | U = rnorm(.groups, sd = 0.1)
32 | upsilon = .9 * U
33 |
34 | # Simulate the first time period for each country
35 | Y_00 = 0.8 + upsilon + rnorm(.groups, sd = 0.1)
36 | Y_01 = 0.8 + mu[2] + upsilon + rnorm(.groups, sd = 0.1)
37 | Y_10 = Y_11 = Y_00
38 |
39 | # Z[t[1]] ~ 0.8 + upsilon + N(0.4, 0.1)
40 | Z[, 1] = 0.8 + upsilon + rnorm(.groups, 0.4, 0.1)
41 |
42 | # Prob of X[t[1]] = 1 with time-invariant confounding
43 | if (isTRUE(.treat_conf)) {
44 | X_prob = plogis(alpha_t0[1] + upsilon + alpha_t0[2] * Z)
45 | X[, 1] = rbinom(.groups, 1, prob = X_prob)
46 | }
47 |
48 | # Prob of X[t[1]] = 1 without time-invariant confounding
49 | else if (isFALSE(.treat_conf)) {
50 | X_prob = plogis(alpha_t0[1] + alpha_t0[2] * Z)
51 | X[, 1] = rbinom(.groups, 1, prob = X_prob)
52 | }
53 |
54 | # Actual Values of Y[t[1]]
55 | Y[, 1] = Y_01 * X[, 1] + Y_00 * (1 - X[, 1])
56 |
57 | # Recursively simulate the data for time t = 2:T
58 | for (i in 2:.periods) {
59 |
60 | # Potential Outcomes Y(a, a')
61 | Y_11 = 0.8 + upsilon + mu[1] + mu[3] + rnorm(.groups, sd = 0.1)
62 | Y_10 = 0.8 + upsilon + mu[1] + rnorm(.groups, sd = 0.1)
63 | Y_01 = 0.8 + upsilon + mu[2] + rnorm(.groups, sd = 0.1)
64 | Y_00 = 0.8 + upsilon + rnorm(.groups, sd = 0.1)
65 |
66 | # Potential Confounders Z(a, a')
67 | Z_1 = 1 * gamma + 0.5 + 0.7 * U + rnorm(.groups, sd = 0.1)
68 | Z_0 = 0 * gamma + 0.5 + 0.7 * U + rnorm(.groups, sd = 0.1)
69 | Z[, i] = X[, i - 1] * Z_1 + (1 - X[, i - 1]) * Z_0
70 |
71 | # Treatment X with time-invariant confounding
72 | if (isTRUE(.treat_conf)) {
73 | X_pr = alpha_t1[1] + alpha_t1[2] * Z[, i] + alpha_t1[3] * Y[, i - 1] + upsilon + rnorm(.groups, 0, 1)
74 | X[, i] = 1 * (X_pr > 0)
75 | }
76 |
77 | # Treatment X without time-invariant confounding
78 | else {
79 | X_pr = alpha_t1[1] + alpha_t1[2] * Z[, i] + alpha_t1[3] * Y[, i - 1] + rnorm(.groups, 0, 1)
80 | X[, i] = 1 * (X_pr > 0)
81 | }
82 |
83 | # The Response Vector Y
84 | Y[, i] = # Control
85 | Y_00 +
86 | # Effect of Lagged Treatment X
87 | X[, i - 1] * (Y_10 - Y_00) +
88 | # Effect of Contemporaneous X
89 | X[, i] * (Y_01 - Y_00) +
90 | # Effect of Contemporaneous X and Lagged X
91 | X[, i - 1] * X[, i] * ((Y_11 - Y_01) - (Y_10 - Y_00))
92 | }
93 |
94 | # Reshape and Combine the Matrices
95 | out = list(X, Y, Z)
96 |
97 | for (i in 1:3) {
98 | out[[i]] = as.data.table(out[[i]])
99 | out[[i]][, unit := 1:.N]
100 | out[[i]] = melt(out[[i]], id.vars = "unit", variable.name = "time")
101 | out[[i]] = out[[i]][, time := as.numeric(gsub("V", "", time))]
102 | }
103 |
104 | colnames(out[[1]])[3] = "X"
105 | colnames(out[[2]])[3] = "Y"
106 | colnames(out[[3]])[3] = "Z"
107 |
108 | out = Reduce(function(x, y) merge(x, y, by = c("unit", "time")), out)
109 |
110 | # Generate the Lags
111 | out[, `:=` (
112 | Y_Lag = shift(Y),
113 | Y_Lag_2 = shift(Y, 2),
114 | X_Lag = shift(X),
115 | Z_Lag = shift(Z))]
116 |
117 | # Exclude missing data due to lags
118 | out = out[!is.na(Y_Lag_2)]
119 |
120 | return(out)
121 | }
122 |
123 |
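# Example usage (a minimal sketch; the group/period counts, seed, and true
# lagged treatment effect below are illustrative, not the values used in the
# actual simulation study). dgp_sim() relies on data.table, so it must be
# attached before the function is called.
if (FALSE) {
  library(data.table)

  set.seed(1234)

  # One simulated panel with time-invariant confounding of the treatment
  sim_dat <- dgp_sim(
    .groups = 50,        # cross-sectional units
    .periods = 20,       # time periods per unit
    .true_gamma = -0.1,  # true value of the lagged treatment effect
    .treat_conf = TRUE   # unobserved confounder U is correlated with treatment
  )

  # Columns: unit, time, X, Y, Z, plus lags (X_Lag, Y_Lag, Y_Lag_2, Z_Lag)
  str(sim_dat)
}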
--------------------------------------------------------------------------------
/functions/msm-sims.R:
--------------------------------------------------------------------------------
1 | #' Function for Calculating the Stabilized Inverse Probability Weights
2 | #'
3 | #' @param sim.data The simulated data table returned by `dgp_sim`
4 | #'
5 | #' @param psnum The numerator model of class `brmsfit` for the stabilized
6 | #' weights
7 | #'
8 | #' @param psdenom The denominator model of class `brmsfit` for the stabilized
9 | #' weights
10 | #'
11 | #' @return Returns a data table with the location and scale of the stabilized
12 | #' inverse probability of treatment weights for the ATE of a binary treatment
13 | #'
14 | #' @export bayes_ipwt
15 | #'
16 | bayes_ipwt <- function(sim.data, psnum, psdenom) {
17 |
18 | # Generate posterior expectations for the numerator model
19 | preds_num <- t(brms::posterior_epred(psnum))
20 |
21 |   # Generate posterior expectations for the denominator model
22 | preds_denom <- t(brms::posterior_epred(psdenom))
23 |
24 | # Calculate the numerator of the stabilized weights
25 | num_scores <- preds_num * sim.data$X + (1 - preds_num) * (1 - sim.data$X)
26 |
27 | # Calculate the denominator of the stabilized weights
28 | denom_scores <- preds_denom * sim.data$X + (1 - preds_denom) * (1 - sim.data$X)
29 |
30 | # Calculate the weights
31 | wts <- num_scores/denom_scores
32 |
33 | ## Coerce the output to a tibble
34 | weights <- tibble::as_tibble(wts, .name_repair = "minimal")
35 |
36 | ## Assign column names to each draw
37 | colnames(weights) <- paste("draw_", 1:ncol(weights), sep = "")
38 |
39 | ## TODO: Figure out the data.table code for this
40 | weights <- weights |>
41 | ## Add group and time columns
42 | dplyr::mutate(
43 | unit = sim.data$unit,
44 | time = sim.data$time,
45 | .before = 1
46 | ) |>
47 | ## Group the data by identifier
48 | dplyr::group_by(unit) |>
49 | ## Calculate the cumulative product of the weights by unit
50 | dplyr::mutate(dplyr::across(
51 | dplyr::starts_with("draw"),
52 | ~ cumprod(tidyr::replace_na(.x, 1)))
53 | ) |>
54 | ## Ungroup the Data
55 | dplyr::ungroup()
56 |
57 |   # Add the location and scale of the weights to the simulated data
58 | sim.data[, `:=` (
59 | wts_mean = rowMeans(wts),
60 | wts_sd = apply(wts, 1, sd),
61 | cws_mean = rowMeans(weights[, 3:ncol(weights)]),
62 | cws_med = apply(weights[, 3:ncol(weights)], 1, median),
63 | cws_sd = apply(weights[, 3:ncol(weights)], 1, sd),
64 | num_prob = rowMeans(preds_num),
65 | denom_prob = rowMeans(preds_denom))
66 | ]
67 |
68 | # Return the data frame with the weights info
69 | return(sim.data)
70 | }
71 |
72 | #' Function for Fitting the Design Stage Models of the MSM
73 | #'
74 | #' @param sim.data The simulated data table returned by `dgp_sim`
75 | #'
76 | #' @param ... Additional arguments passed to `brms::brm`
77 | #'
78 | #' @return The original data table with the location and scale of the stabilized
79 | #' inverse probability of treatment weights for the ATE of a binary treatment
80 | #'
81 | #' @export sim_msm_bayes_design
82 | #'
83 | sim_msm_bayes_design <- function(sim.data, ...) {
84 |
85 | # Design stage model numerator
86 | ps_num_bayes <- brm(
87 | formula = bf(X ~ X_Lag),
88 | data = sim.data,
89 | family = bernoulli(link = "logit"),
90 | prior = prior(normal(0, 1), class = "b") +
91 | prior(normal(0, 2), class = "Intercept"),
92 | chains = 4,
93 | cores = 4L,
94 | iter = 4000,
95 | backend = "cmdstanr",
96 | save_pars = save_pars(all = TRUE),
97 | control = list(adapt_delta = 0.9, max_treedepth = 12),
98 | refresh = 0,
99 | ...
100 | )
101 |
102 | # Design stage model denominator
103 | ps_denom_bayes <- brm(
104 | formula = bf(X ~ Y_Lag + Z + X_Lag),
105 | data = sim.data,
106 | family = bernoulli(link = "logit"),
107 | prior = prior(normal(0, 1), class = "b") +
108 | prior(normal(0, 2), class = "Intercept"),
109 | chains = 4,
110 | cores = 4L,
111 | iter = 4000,
112 | backend = "cmdstanr",
113 | save_pars = save_pars(all = TRUE),
114 | control = list(adapt_delta = 0.9, max_treedepth = 12),
115 | refresh = 0,
116 | ...
117 | )
118 |
119 | # Calculating the stabilized weights
120 | out <- bayes_ipwt(
121 | sim.data,
122 | psnum = ps_num_bayes,
123 | psdenom = ps_denom_bayes
124 | )
125 |
126 | # Return the updated data
127 | return(out)
128 | }
129 |
130 | #' Function for Building the Data to Pass to the MSM Stan Model
131 | #'
132 | #' @param sim.data A data table object with the stabilized weight vectors
133 | #' returned by the `sim_msm_bayes_design` function
134 | #'
135 | #' @param shape_prior A numeric vector of length 2 containing the location and
136 | #' scale to use for the beta prior on the scale of the weights
137 | #'
138 | #' @param ... Currently unused
139 | #'
140 | #' @return Returns a list object with the data to be passed to the Stan model
141 | #'
142 | #' @export make_msm_data
143 | #'
144 | make_msm_data <- function(sim.data, shape_prior, ...) {
145 |
146 | # Take advantage of brms functionality because I'm lazy
147 | msm_data <- brms::make_standata(
148 | Y ~ X + X_Lag,
149 | family = gaussian(),
150 | data = sim.data
151 | )
152 |
153 | # Prepare the data for use with Stan
154 | msm_data <- list(
155 | N = nrow(sim.data),
156 | K = msm_data$K,
157 | Y = msm_data$Y,
158 | X = msm_data$X,
159 | ipw_mu = sim.data$cws_mean,
160 | ipw_sigma = sim.data$cws_sd,
161 | sd_prior_shape1 = shape_prior[1],
162 | sd_prior_shape2 = shape_prior[2]
163 | )
164 |
165 | return(msm_data)
166 | }
167 |
168 | #' Function for Fitting the MSM Outcome Model Simulations
169 | #'
170 | #' @param msm.data A list object containing the data to be passed to the
171 | #' Stan model as returned by the `make_msm_data` function.
172 | #'
173 | #' @param msm.stan.mod The compiled Stan model to be used for each simulated
174 | #' data set. Should be an environment returned by `cmdstanr::cmdstan_model`
175 | #'
176 | #' @param ... Additional arguments passed down to `cmdstanr::sample`
177 | #'
178 | #' @return Returns a tibble containing the summarized draws for each
179 | #' of the models in the simulation
180 | #'
181 | #' @export sim_msm_bayes_outcome
182 | #'
183 | sim_msm_bayes_outcome <- function(msm.data, msm.stan.mod, ...) {
184 |
185 | # Fit the Outcome-Stage Model
186 |   msm_sim_fit <- msm.stan.mod$sample(
187 |     data = msm.data,
188 | refresh = 0,
189 | sig_figs = 5,
190 | parallel_chains = 4,
191 | chains = 4,
192 | iter_warmup = 1000,
193 | iter_sampling = 1000,
194 | max_treedepth = 12,
195 | adapt_delta = 0.9,
196 | show_messages = FALSE,
197 | ...
198 | )
199 |
200 | # Calculate a summary of the draws
201 | msm_result <- posterior::summarise_draws(
202 | msm_sim_fit$draws(
203 | variables = c("lp__", "b_Intercept", "b", "w_tilde", "Intercept", "sigma")
204 | )
205 | )
206 |
207 | # Return the update data frame
208 | return(msm_result)
209 | }
210 |
211 |
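# Example pipeline (a minimal sketch; the group/period counts, shape prior,
# and Stan file path are illustrative assumptions rather than the exact
# settings of the simulation study). Wrapped in `if (FALSE)` so that sourcing
# this file does not fit any models.
if (FALSE) {
  library(data.table)
  library(brms)
  library(cmdstanr)

  # 1. Simulate a panel data set (see functions/dgp-sim.R)
  sim_dat <- dgp_sim(.groups = 50, .periods = 20, .true_gamma = -0.1, .treat_conf = TRUE)

  # 2. Design stage: fit the numerator and denominator treatment models and
  #    append the location and scale of the stabilized weights to the data
  sim_dat <- sim_msm_bayes_design(sim_dat)

  # 3. Build the data list for the weighted outcome model
  msm_data <- make_msm_data(sim_dat, shape_prior = c(2, 2))

  # 4. Outcome stage: fit the IPW outcome model in Stan and summarise the draws
  msm_mod <- cmdstan_model("models/stan/IPTW_Outcome_Simulation.stan")
  msm_summary <- sim_msm_bayes_outcome(msm.data = msm_data, msm.stan.mod = msm_mod)
}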
--------------------------------------------------------------------------------
/manuscript/manuscript.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Taking Uncertainty Seriously"
3 | subtitle: "Bayesian Marginal Structural Models for Causal Inference in Political Science"
4 | short-title: "Taking Uncertainty Seriously"
5 | author:
6 | - name: "A. Jordan Nafa"
7 | affiliation: "University of North Texas"
8 | orcid: "0000-0001-5574-3950"
9 | attributes:
10 | corresponding: true
11 | - name: "Andrew Heiss"
12 | affiliation: "Georgia State University"
13 | orcid: "0000-0002-3948-3914"
14 | abstract: |-
15 | The past two decades have been characterized by considerable progress in developing approaches to causal inference in situations where true experimental manipulation is either impractical or impossible. With few exceptions, however, commonly employed techniques in political science have developed largely within a frequentist framework. In this article, we argue that common approaches rest fundamentally upon assumptions that are difficult to defend in many areas of political research and highlight the benefits of quantifying uncertainty in the estimation of causal effects. Extending the approach to causal inference for cross-sectional time series and panel data under selection on observables introduced by Blackwell and Glynn (2018), we develop a two-step Bayesian pseudo-likelihood method for estimating marginal structural models. We demonstrate our proposed procedure via a simulation study and two empirical examples. Finally, we provide flexible open-source software implementing the proposed method.
16 | code-repo: "Replication materials for this manuscript are available at <https://github.com/ajnafa/Latent-Bayesian-MSM>"
17 | thanks: |-
18 | This manuscript was prepared for the American Political Science Association's Annual Meeting in Montreal, Quebec, September 15--18, 2022.
19 | date: September 9, 2022
20 | keywords:
21 | - Bayesian statistics
22 | - causal inference
23 | - panel data
24 | bibliography: "../assets/references.bib"
25 | format:
26 | hikmah-pdf:
27 | fontsize: 12pt
28 | # Custom fonts
29 | mainfont: "Linux Libertine O"
30 | mainfontoptions:
31 | - "Numbers=Proportional"
32 | - "Numbers=OldStyle"
33 | sansfont: "Linux Libertine O" # not actually sans but whatevs
34 | #monofont: "InconsolataGo"
35 | monofontoptions:
36 | - "Mapping=tex-ansi"
37 | - "Scale=MatchLowercase"
38 | mathfont: "Libertinus Math"
39 |
40 | # Colors
41 | include-in-header:
42 | text: |
43 | \definecolor{untgreen}{HTML}{00853E}
44 | linkcolor: untgreen
45 | urlcolor: untgreen
46 | citecolor: untgreen
47 |
48 | # Use biblatex-chicago
49 | biblatex-chicago: true
50 | biblio-style: authordate
51 | biblatexoptions:
52 | - backend=biber
53 | - autolang=hyphen
54 | - isbn=false
55 | - uniquename=false
56 | knitr:
57 | opts_chunk:
58 | fig.retina: 1
59 | html: default
60 | fig-cap-location: top
61 |
62 | knitr:
63 | opts_chunk:
64 | echo: false
65 | warning: false
66 | message: false
67 | fig.retina: 3
68 | fig.path: "figures/"
69 | cache.path: "_cache/"
70 | ---
71 |
72 | ```{r}
73 | pacman::p_load(
74 | "tidyverse",
75 | "patchwork",
76 | install = FALSE
77 | )
78 | ```
79 |
80 | Research in political science and public policy is often concerned with causal effects. Does a specific piece of legislation reduce intergenerational poverty? Does a US Supreme Court ruling cause increased polarization? Do gender-based legislative quotas cause improved gender-based economic outcomes? Do international economic sanctions prevent states from going to war? Quantitatively estimating these causal effects, however, is a difficult task fraught with methodological pitfalls. Across scientific disciplines, randomized controlled trials have long been held up as the gold standard in causal thinking. If treatment conditions are randomly assigned across comparable samples of a population, differences in those samples' outcomes can be attributed directly to the treatment and researchers can safely tell causal stories. For many, if not most, questions in political science, however, estimating causal effects through experiments can be difficult, unethical, or impossible. Researchers cannot randomly assign countries to go to war, randomly assign states to adopt specific policies, or randomly assign legislators to win or lose elections.
81 |
82 | In the absence of experimental data on most political phenomena, researchers must work with observational data. However, existing observational data reflects already-realized outcomes. For instance, a country’s level of democracy, legislative system, and other political choices are all influenced by decades of prior institutional, political, social, and economic choices, as well as broader geographic and historical trends. Similarly, the behavior of individuals such as residents, voters, legislators, and political leaders is influenced by a host of other external factors. Observations in a dataset—be they individuals, Census blocks, states, or countries—self-select into (or out of) treatment conditions. As a result, we cannot directly compare observations that chose specific treatments.
83 |
84 | A robust cross-disciplinary body of methods has emerged in the past decades to tackle the problem of observational causal inference. Econometricians have focused on quasi-experimental methods such as difference-in-difference analysis, regression discontinuity designs, instrumental variable approaches, and synthetic controls [@Angrist2009]. Epidemiologists and biostatisticians, on the other hand, have developed matching and weighting techniques based on causal models and *do*-calculus [@Pearl2016; @Pearl2018; @Robins2015]. Each approach is designed to account for endogenous self-selection bias and recover causal effects in non-experimental data.
85 |
86 | In parallel to developments in observational causal thinking, advances in computational power in the past decade have led to the broader adoption of Bayesian statistical methods. Bayesian analysis provides an alternative to more standard null hypothesis significance testing (NHST), where researchers test for the probability of observed data given a null hypothesis, or $P(\text{data} \mid H_0)$. In contrast, Bayesian analysis determines the probability of a specific hypothesis given the existing data, or $P(H \mid \text{data})$. When measuring the uncertainty of estimates under NHST, we calculate confidence intervals (often at a 95% level), which indicate that if we collected our data many more times, 95% of those confidence intervals would contain the true population parameter we are interested in. With Bayesian methods, uncertainty can be described with an entire posterior distribution of plausible values that can be summarized in various ways. We can use the posterior distribution to calculate a credible interval (at 95%, for instance) that would allow us to say that there is a 95% probability that the true population parameter falls in the range of the interval. This Bayesian estimand captures what researchers are most often interested in—the probability of a hypothesis being true, rather than the probability of seeing an effect of a certain size in a world where there is a null effect.
87 |
88 | This headway in both observational causal inference and Bayesian methods lays the foundation for analysis of causal effects that is more robust, deals more directly with uncertainty, and is more easily interpretable. Most statistical approaches for causal inference, such as difference-in-differences, regression discontinuity designs, and other quasi-experimental designs, map easily into a Bayesian paradigm. Unfortunately, this is not the case with research designs that rely on matching and weighting. Due to mathematical and philosophical incompatibilities, causal inference methods that rely on propensity scores and multiple stages of models (i.e., a design model that estimates the probability of an observation selecting into treatment, which we then use to generate propensity scores and weights that we then use in an outcome model to estimate the effect of treatment) cannot use a Bayesian approach. Because propensity scores and weights are not part of the formal data-generating process for the relationship between a treatment and an outcome, we cannot model them with Bayesian methods. In a critique of attempts in biostatistics at calculating propensity scores with Bayesian methods, Robins, Hernán, and Wasserman conclude that "Bayesian inference must ignore the propensity score" [-@Robins2015, p. 297].
89 |
90 | In this article, we seek to reconcile these parallel developments, merging Bayesian analysis with propensity score-based causal estimation methods. We introduce a method for working with propensity scores in a pseudo-Bayesian manner, allowing researchers to work with posterior distributions of causal effects and make causal inferences without the need for null hypotheses. In particular, we provide a general method for incorporating propensity score-based weights from a treatment model into an outcome model in the spirit of Bayes, propagating the uncertainty of these weights from the treatment model to the outcome model. The approach applies to situations where confounding is addressed through inverse probability weighting, both in a simple single-time-period setting and in a time-series cross-sectional panel data setting with marginal structural models [@Robins2000; @Blackwell2018]. We begin with a brief overview of confounding and endogeneity, introducing concepts from the language of causal graphs and *do*-calculus to help isolate causal effects [@Pearl2016; @Pearl2018] and describe how to statistically adjust for confounding with both inverse probability weighting and marginal structural models. We then explore the methodological reasons why Bayesian analysis is incompatible with these propensity score-based approaches to causal inference and propose an alternative (and compatible) pseudo-Bayesian estimator. Finally, we compare this estimator to existing approaches and demonstrate its results through simulation and by replicating previous research.
91 |
92 |
93 | # Dealing with confounding
94 |
95 | Recent developments in causal inference provide a standardized vocabulary and systematized calculus for discussing causal effects through directed acyclic graphs (DAGs) [@Pearl2016; @Pearl2018; @Robins2015]. In the left panel of @fig-simple-dag, we can see the relationship between some treatment or intervention $X$ (e.g. legislative quotas, foreign aid, economic sanctions, etc.) and an outcome $Y$ (e.g. improved minority representation in parliament, reduced poverty, decreased likelihood of war, etc.). The causal effect of $X$ on $Y$ is represented with an arrow connecting the two nodes. A third node $Z$ exerts a causal effect on both the treatment $X$ and the outcome $Y$. This variable confounds the $X \rightarrow Y$ relationship and opens up an alternative path between the treatment and outcome. For example, if we are interested in measuring the causal effect of foreign aid ($X$) on poverty ($Y$) (see the right panel of @fig-simple-dag) and we observe a positive correlation between the two in a dataset, that correlation could be caused by some other confounding factor, like a country's level of economic development ($Z_1$) or its overall level of democracy ($Z_2$).
96 |
97 | ```{r dag-simple, echo=FALSE}
98 | #| label: fig-simple-dag
99 | #| fig-width: 12
100 | #| fig-height: 4
101 | #| out-width: 0.5\\linewidth
102 | #| fig-cap: "(L) DAG showing the causal effect of $X$ on $Y$, confounded by $Z$; (R) DAG showing the causal effect of foreign aid on poverty, confounded by $Z_1$ (economic development) and $Z_2$ (democracy)"
103 | simple_dag <- readRDS("R/simple-dag.rds")
104 | labeled_dag <- readRDS("R/labeled-dag.rds")
105 | simple_dag | labeled_dag
106 | ```
107 |
108 | We can remove the effect of confounders like $Z$, $Z_1$, and $Z_2$ through statistical adjustment. For instance, if we compare countries with similar (or even identical) levels of GDP per capita and democracy, we mitigate the confounding effects of economic development and democratic development.
109 |
110 | We can use a variety of methods to deal with backdoor confounding. Quasi-experimental research designs like difference-in-differences, regression discontinuity, instrumental variables, and synthetic controls each use specific real-world situations to approximate treatment and control groups to remove the effect of confounders on the relationship between treatment and outcome. Alternatively, we can adjust for confounders through matching and weighting. One common approach—particularly in epidemiology and biostatistics—is to adjust for confounding using inverse probability of treatment weights (IPTWs).
111 |
112 | ## Inverse probability weighting
113 |
114 | Causal inference using inverse probability weighting involves a two-stage estimation process. In the first stage, often called the *treatment stage* or *design stage*, we create a model that predicts an observation’s choice to receive the treatment based on all confounders identified in a causal graph. We then use the design model to calculate a propensity score for each observation. Next, we calculate inverse probability of treatment weights for each observation based on its propensity of treatment. Weights are calculable for both binary and continuous treatments:
115 |
116 | $$
117 | w_{\text{binary}} = \frac{\overbrace{X_i}^{\text{\clap{Treatment}}}}{\underbrace{\pi_i}_{\text{\clap{Propensity}}}} + \frac{1 - X_i}{1 - \pi_i}
118 | $$ {#eq-iptw-binary}
119 |
120 | $$
121 | w_{\text{continuous}} = \frac{
122 | \overbrace{f_{X_i} (X_i; \mu, \sigma^2)}^{\text{Probability distribution of treatment } X}
123 | }{
124 | \underbrace{f_{X_i \mid Z_i} (X_i \mid Z_i; \mu, \sigma^2)}_{\substack{\text{Probability distribution of treatment } X \\ \text{given confounders } Z}}}
125 | $$ {#eq-iptw-continuous}
126 |
127 | The ultimate purpose of these IPTWs ($w$) is to create pseudo-populations of treated and untreated observations that are comparable across all levels of confounders. We can give less statistical weight to untreated observations that had a low probability of being treated and more weight to untreated observations that had a high probability of being treated. Conversely, we can give more weight to treated observations with a low probability of being treated and less weight to treated observations with a high probability of being treated. After scaling each observation by its IPTW, we can create comparable treated and untreated pseudo-populations.
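
To make these two stages concrete, the following minimal sketch (with simulated data and purely illustrative coefficients; none of these objects appear elsewhere in the paper) fits a logistic design-stage model and computes the binary-treatment weights from @eq-iptw-binary:

```{r}
#| eval: false
#| echo: true
set.seed(1977)

# Simulated confounder, binary treatment, and outcome
n <- 1000
Z <- rnorm(n)                       # confounder
X <- rbinom(n, 1, plogis(0.5 * Z))  # treatment depends on Z
Y <- 0.3 * X + 0.7 * Z + rnorm(n)   # outcome depends on X and Z

# Design stage: model treatment as a function of the confounder
design_model <- glm(X ~ Z, family = binomial(link = "logit"))
propensity <- predict(design_model, type = "response")

# Inverse probability of treatment weights for a binary treatment
iptw <- (X / propensity) + ((1 - X) / (1 - propensity))
```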
128 |
129 | ```{r echo=FALSE}
130 | #| label: fig-iptw-hist
131 | #| fig-width: 5
132 | #| fig-height: 5.5
133 | #| out-width: 0.6\\linewidth
134 | #| fig-cap: "Distribution of both original and weighted propensity scores; weighted scores represent comparable pseudo-populations of treated and untreated observations"
135 |
136 | library(patchwork)
137 |
138 | iptw1 <- readRDS("R/iptw-hist-1.rds")
139 | iptw2 <- readRDS("R/iptw-hist-2.rds")
140 |
141 | iptw1 / iptw2
142 | ```
143 |
144 | @fig-iptw-hist demonstrates the intuition visually using a simulated binary treatment. In the top panel of @fig-iptw-hist, fewer observations received the treatment than not, and those who did not had a lower average propensity of treatment, visible in the cluster in the lower range of propensities. This represents selection bias—those who did not receive the treatment already had a low probability of seeking out the treatment in the first place. As a result, the treated and untreated populations are not comparable. The bottom panel of @fig-iptw-hist shows the distribution of propensity scores after weighting by the IPTWs. The lighter distributions represent pseudo-populations of treated and untreated observations, and these two groups now mirror each other fairly well. Both low-propensity treated observations and high-propensity untreated observations are scaled up and receive more statistical weight to improve cross-group comparability.
145 |
146 | Having calculated IPTWs and created comparable pseudo-populations, the final stage in causal estimation is to create an outcome model that estimates the effect of the treatment on the outcome, weighted by $w$. The resulting effect of $X$ on $Y$ represents a causal effect, with all observable confounding accounted for and adjusted away.
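
Continuing the hypothetical sketch from above, the outcome stage simply passes those weights to a weighted regression of the outcome on the treatment:

```{r}
#| eval: false
#| echo: true
# Outcome stage: weight each observation by its IPTW; the coefficient on X
# is the estimate of the average treatment effect in this simulated example
outcome_model <- lm(Y ~ X, weights = iptw)
coef(outcome_model)["X"]
```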
147 |
148 |
149 | ## Marginal structural models
150 |
151 | The IPTW approach to adjusting for confounding can be extended to more complex data-generating processes where treatments, outcomes, and confounders vary over time. In these cases, marginal structural models allow us to adjust for time-invariant confounders, time-varying confounders, previous levels of the outcome, and treatment history [@Robins1997; @Robins2000; @Cole2008; @Imai2015; @Blackwell2018; @Saarela2015].
152 |
153 | ```{r dag-1, echo=FALSE}
154 | #| label: fig-msm-dag
155 | #| fig-width: 16
156 | #| fig-height: 9
157 | #| out-width: 1\\linewidth
158 | #| fig-cap: "DAG showing the contemporaneous effect of $X_t$ on $Y_t$, given contemporaneous confounders $Z_t$ and treatment history $X_{t-1}$"
159 | msm_dag <- readRDS("R/msm-dag.rds")
160 | msm_dag
161 | ```
162 |
163 | @fig-msm-dag illustrates one possible DAG showing the relationship between treatment $X$, outcome $Y$, and confounders $Z$ in three different time periods ($t$, $t-1$, and $t-2$). Following the logic of *do*-calculus, if we are interested in measuring and isolating the contemporaneous effect of $X_t$ on $Y_t$, it no longer suffices to adjust only for $Z_t$, as we did in the simpler IPTW example above. The previous value of $X$, or the treatment history $X_{t-1}$, is now also a confounder that needs to be accounted for statistically.
164 |
165 | When creating IPTWs for data-generating processes with repeated measures, as in @fig-msm-dag, the resulting weights need to account for the temporal nature of the data and incorporate weights from previous time periods. This can be done by taking the cumulative product of each observation's weights over time, for both binary and continuous treatments:
166 |
167 | $$
168 | w_{it; \text{ binary}} = \prod^{t}_{t = 1} \frac{\Pr[X_{it} \mid X_{it-1},~ C_{i}]}{\Pr[X_{it} \mid \underbrace{Z_{it}}_{\substack{\text{\clap{Time-varying}} \\ \text{\clap{confounders}}}}, ~ X_{it-1},~ Y_{it-1},~ \underbrace{C_{i}}_{\substack{\text{\clap{Time-invariant}} \\ \text{\clap{confounders}}}}]}
169 | $$ {#eq-msm-binary}
170 |
171 | $$
172 | w_{it; \text{ continuous}} = \prod^{t}_{t=1} \frac{
173 | \overbrace{f_{X_{it} \mid X_{it-1},C_{i}}[(X_{it} \mid X_{it-1},~ C_{i}); ~\mu, ~\sigma^{2}]}^{\substack{\text{Probability distribution of treatment } X \text{ given past } \\ \text{treatment } X_{t-1} \text{ and time-invariant confounders } C}}
174 | }{
175 | \underbrace{f_{X_{it} \mid Z_{it}, X_{it-1}, Y_{it-1}, C_{i}}[(X_{it} \mid Z_{it}, ~ X_{it-1},~ Y_{it-1},~ C_{i}); ~\mu, ~\sigma^{2}]}_{\substack{\text{Probability distribution of treatment } X \text{ given time-varying confounders } Z, \\ \text{past treatment } X_{t-1}, \text{ past outcome } Y_{t-1}, \text{ and time-invariant confounders } C}}
176 | }
177 | $$ {#eq-msm-continuous}
178 |
179 | The process for estimating a causal effect using a marginal structural model follows the same two-stage procedure as standard IPTW adjustment. We first use a design stage model to predict treatment status using all backdoor confounders identified in a causal graph, including past treatment status, past level of the outcome, time-invariant covariates, and time-varying covariates. We then generate IPTWs using either @eq-msm-binary or @eq-msm-continuous, depending on the nature of the treatment variable. We finally fit an outcome model weighted by $w$ to estimate the adjusted effect of the treatment on the outcome.
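
As a brief sketch of this procedure (with a hypothetical panel data frame `panel` containing columns `unit`, `time`, `X`, `X_lag`, `Y_lag`, and `Z`; time-invariant confounders are omitted for brevity), the stabilized weights in @eq-msm-binary can be computed and accumulated within units as follows:

```{r}
#| eval: false
#| echo: true
library(dplyr)

# Design stage: numerator (treatment history only) and denominator
# (treatment history, lagged outcome, and time-varying confounders)
num_model   <- glm(X ~ X_lag, family = binomial(), data = panel)
denom_model <- glm(X ~ X_lag + Y_lag + Z, family = binomial(), data = panel)

p_num   <- predict(num_model, type = "response")
p_denom <- predict(denom_model, type = "response")

panel_weighted <- panel |>
  mutate(
    # Stabilized weight for the observed treatment in each period
    sw_t = (p_num * X + (1 - p_num) * (1 - X)) /
      (p_denom * X + (1 - p_denom) * (1 - X))
  ) |>
  group_by(unit) |>
  arrange(time, .by_group = TRUE) |>
  # Cumulative product of the per-period weights over time within each unit
  mutate(sw = cumprod(sw_t)) |>
  ungroup()
```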
180 |
181 |
182 | # Bayesian propensity scores
183 |
184 | Adjustment through inverse probability weighting---both with single time periods and with more complex marginal structural models---is typically done with frequentist statistical methods. The design stage model is ordinarily fit using logistic regression, while the outcome model is fit using weighted least squares regression with standard errors adjusted post-estimation through bootstrapping [@Hernan2020, 152]. In the case of marginal structural models, the outcome stage typically uses generalized estimating equations (GEE) to account for the panel structure of the data [@Thoemmes2016; @Hernan2020, 147]. Why, then, bother developing a Bayesian procedure in the first place if these alternative approaches already exist?
185 |
186 | First, in practical terms there is no shortage of examples demonstrating that Bayesian estimators often have superior long-run properties and provide more accurate estimates than their frequentist alternatives in many common causal modeling applications, including, though not necessarily limited to, matching and propensity score methods [@Alvarez2021; @Capistrano2019; @Kaplan2012; @Liao2020; @Zigler2014], g-computation [@Keil2017], instrumental variable estimation [@Hollenbach2018], and in the presence of heterogeneous treatment effects and high dimensional applications more broadly [@Antonelli2020; @Pang2021; @Hahn2020]. If, as researchers, we aspire to be *less wrong*, it is worth thinking seriously about a proper approach to quantifying uncertainty in our causal estimands.
187 |
188 | Second, in cases where the data comprise an apparent population, for example all sovereign countries between 1945 and 2020, uncertainty estimates and test statistics derived from asymptotic assumptions of repeated sampling and exact long-run replications that form the foundation of the null hypothesis significance testing (NHST) framework and valid interpretations of confidence intervals are logically difficult to defend [@Berk1995; @Gill2020; @Western1994]. Under such circumstances a Bayesian framework provides a principled and logically consistent alternative that allows us to quantify the uncertainty in our parameter estimates conditional on the observed data and prior assumptions about the universe of effect sizes we believe to be theoretically possible [@Gelman2012; @Jackman2004].
189 |
190 | Finally, a Bayesian framework also provides us with the ability to acknowledge that we are virtually always uncertain about the set of confounders we need to adjust for in observational settings and employ model averaging or stacking based approaches to average across different specifications for the design-stage model [@Yao2018; @Hollenbach2020]. We can then derive a distribution of weights as illustrated below and propagate the uncertainty inherent in design stage estimation on to the outcome stage model. Where residual correlation within units is a concern, Bayesian hierarchical approaches provide an alternative to GEE models as it is possible to obtain a population-averaged estimate by integrating out the group-level effects.
191 |
192 | ## The impossibility of true two-stage Bayesian analysis
193 |
194 | While Bayesian approaches to causal inference provide a wealth of information about the uncertainty and distributions of estimates, Bayesian regression is not directly applicable to two-stage models involving propensity scores or inverse probability weights, for both statistical and philosophical reasons [@Robins2015]. To explore this incompatibility, we can define the average treatment effect (ATE) of a binary intervention with the estimand in @eq-ate. Here we take the difference between the average outcome $Y$ when treatment $T$ is 1 and when it is 0, following adjustment for confounders $X$.
195 |
196 | $$
197 | \Delta_{\text{ATE}} = E[ \overbrace{E \left( Y_i \mid T_i = 1, X_i \right)}^{\substack{\text{Average outcome } Y \text{ when} \\ \text{treated, given confounders }X}} - \overbrace{E \left( Y_i \mid T_i = 0, X_i \right)}^{\substack{\text{Average outcome } Y \text{ when} \\ \text{not treated, given confounders }X}} ]
198 | $$ {#eq-ate}
199 |
200 | Expressed more generally, the ATE can be calculated by some arbitrary function $f$ that incorporates information from $\symbf{T}$, $\symbf{X}$, and $\symbf{Y}$, as seen in @eq-f-delta.
201 |
202 | $$
203 | f(\Delta \mid \symbf{T}, \symbf{X}, \symbf{Y})
204 | $$ {#eq-f-delta}
205 |
206 | This function $f$ can represent any kind of estimation approach, including two-stage inverse probability weighting, matching, or design-based quasi-experimental strategies. This more general definition of the ATE also fits well in a Bayesian paradigm. Since $\Delta$ is unknown, we can build a Bayesian model to estimate an unknown $\theta$ parameter and set $\theta = \Delta$, conditional on a likelihood for $(\symbf{T}, \symbf{X}, \symbf{Y})$ and a prior distribution for $\theta$ [@Liao2020]. We can thus express @eq-f-delta using Bayes' formula, as seen in @eq-f-bayes. With observed data $\symbf{T}$, $\symbf{X}$, and $\symbf{Y}$, we can proceed with model fitting and sampling and obtain an estimate for $\theta$, or our ATE $\Delta$.
207 |
208 | $$
209 | \overbrace{P[\theta \mid (\symbf{T}, \symbf{X}, \symbf{Y})]}^{\substack{\text{Posterior estimate} \\ \text{of } \theta \text{, given data}}} \quad \propto \quad \overbrace{P[(\symbf{T}, \symbf{X}, \symbf{Y}) \mid \theta]}^{\substack{\text{Likelihood for existing} \\ \text{data, given unknown }\theta}} \quad \times \quad \overbrace{P[\theta]}^{\substack{\text{Prior} \\ \text{for }\theta}}
210 | $$ {#eq-f-bayes}
211 |
212 | Crucially, however, @eq-f-bayes does not include any propensity scores or weights. Inverse probability of treatment weights are not part of the data-generating process for $\theta$ and thus are not part of either the likelihood or the prior. Weights are a part of the estimation process, not reality. We use weights solely for approximating pseudo populations of treated and untreated observations—these weights do not determine observations' actual behavior or cause changes in the outcome, and therefore are not defined as part of the formal model for $\theta$. The absence of propensity scores or weights in the likelihood is the foundation for @Robins2015's critique of Bayesian attempts at causal inference with inverse probability weighting. They explain that "Bayesian logic is rigidly defined: given a likelihood and a prior, one turns the Bayesian crank to obtain a posterior" [@Robins2015, 297]. There is no place for weights in the Bayesian engine—since weights do not fit in either the likelihood or the prior, the posterior estimate of $\theta$ cannot reflect the pseudo-populations required for statistical adjustment of confounders. True Bayesians therefore cannot use inverse probability weights in causal inference.
213 |
214 | This is disappointing, given the advantages of Bayesian analysis noted earlier. The ability to make inferences with entire posterior distributions of causal effects rather than frequentist null hypotheses can provide us with richer details about those effects, and modeling the uncertainty of our estimates allows us to better explore the robustness of our findings.
215 |
216 | ## Incorporating propensity scores and weights into Bayesian models
217 |
218 | At the end of their critique, @Robins2015 state that although a fully Bayesian approach to two-stage causal inference models is not possible, it is possible to adopt a "Bayes-frequentist" compromise in order to better analyze and work with the uncertainty inherent in causal estimation. @Liao2020 propose one such compromise and outline a method of incorporating propensity scores into Bayesian estimation of causal effects. To do so, they represent the propensity score-based weights as a new parameter $\nu$ that enters a general function estimating the causal effect given $\symbf{T}$, $\symbf{X}$, $\symbf{Y}$, and $\nu$. This $\nu$ parameter is estimated with a design stage model that uses both treatment status $\symbf{T}$ and confounders $\symbf{X}$. By marginalizing over the distribution of the product of the outcome model and the design model, we can eliminate the $\nu$ term, resulting in an estimand in @eq-nu that is identical to @eq-f-delta.
219 |
220 | $$
221 | \overbrace{f(\Delta \mid \symbf{T}, \symbf{X}, \symbf{Y})}^{\substack{\text{Estimand for} \\ \text{the ATE, without } \nu}} = \int_\nu \overbrace{f(\Delta \mid \symbf{T}, \symbf{X}, \symbf{Y}, \nu)}^{\substack{\text{Outcome model} \\ \text{with } \nu}}\ \overbrace{f(\nu \mid \symbf{T}, \symbf{X})}^{\substack{\text{Design model} \\ \text{creating propensity} \\ \text{scores with } T \text{ and } X}}\ \mathrm{d} \nu
222 | $$ {#eq-nu}
223 |
224 | $\nu$ represents the posterior distribution of propensity scores or treatment weights, and it contains complete information about the uncertainty in these weights. By incorporating the posterior distribution of $\nu$ into the outcome stage of the model, we're able to propagate the variation in weights into the final estimation. We can thus overcome the main shortcoming of Bayesian approaches to two-stage estimation—weights are now a formal parameter in the model (see @eq-f-bayes-nu).
225 |
226 | $$
227 | \overbrace{P[\theta \mid (\symbf{T}, \symbf{X}, \symbf{Y}, \nu)]}^{\substack{\text{Posterior estimate} \\ \text{of } \theta \text{, given data and weights}}} \quad \propto \quad \overbrace{P[(\symbf{T}, \symbf{X}, \symbf{Y}, \nu) \mid \theta]}^{\substack{\text{Likelihood for existing} \\ \text{data, given unknown }\theta}} \quad \times \quad \overbrace{P[\theta]}^{\substack{\text{Prior} \\ \text{for }\theta}}
228 | $$ {#eq-f-bayes-nu}
229 |
230 | Instead of calculating a single value of $\nu$ (i.e., a single set of propensity scores or weights) from the design stage of the model, we can incorporate a range of values of $\nu$ from the posterior distribution of the design model. To do so, we first fit a Bayesian design-stage model to calculate the posterior probabilities of treatment. We then generate $K$ samples of propensity scores from the posterior distribution of the treatment. $K$ can vary substantially, though it is often the number of posterior chains from the Bayesian model. Next, for each of the $K$ samples of scores, we generate inverse probability weights and build an outcome model (either Bayesian or frequentist) using those weights. Finally, we combine and average the results from these many outcome models to calculate the final $\nu$-free causal effect. The procedure is similar to multiple imputation or bootstrapping: we run the same outcome model many times using different variations of weights and then combine and average the results.
231 |
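To make this procedure concrete, the sketch below fits one weighted least squares outcome model per posterior draw of the weights and pools the results across draws in the spirit of multiple imputation. It is illustrative only and is not evaluated; `dat` (a data frame with the outcome `y` and treatments `x` and `x_lag`) and `sw_draws` (a $K \times n$ matrix of weight draws from the design stage) are hypothetical objects.

```{r pseudo-bayes-sketch, eval=FALSE}
# Number of posterior draws of the weights to cycle through
K <- nrow(sw_draws)

# Fit one weighted outcome model per draw of the inverse probability weights
fits <- lapply(seq_len(K), function(k) {
  lm(y ~ x + x_lag, data = dat, weights = sw_draws[k, ])
})

# Collect the lagged treatment estimates and their squared standard errors
est  <- vapply(fits, function(m) coef(m)["x_lag"], numeric(1))
vars <- vapply(
  fits,
  function(m) summary(m)$coefficients["x_lag", "Std. Error"]^2,
  numeric(1)
)

# Pool across draws in the spirit of multiple imputation: average the point
# estimates and combine the within- and between-draw variances
pooled_est <- mean(est)
pooled_var <- mean(vars) + (1 + 1 / K) * var(est)
c(estimate = pooled_est, std_error = sqrt(pooled_var))
```
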
232 | Importantly, this approach is not fully Bayesian, but pseudo-Bayesian. The parameters for the analysis are estimated using separate independent models: $\nu$ in the design stage and $\theta$ in the outcome stage. To qualify as a truly Bayesian approach, $\nu$ and $\theta$ would need to be estimated jointly and simultaneously.
233 |
234 |
235 | ## Bayesian Pseudo-Likelihood Estimation
236 |
237 | There are several different approaches one might take to account for uncertainty in the design stage weights when estimating the outcome stage of a marginal structural model. It is, for example, possible to pass a different random draw from the distribution of weights directly to the model at each iteration of the MCMC sampler, though such an approach quickly becomes computationally intractable. In this section we outline an alternative, computationally efficient approach that requires passing only the location and scale of the design stage weights to the outcome model and propagates uncertainty by placing a prior on the scale component of the weights.
238 |
239 | To implement our pseudo-likelihood estimator, we take as our starting point recent developments in the application of Bayesian methods for the analysis of complex survey designs [@Savitsky2016; @Williams2020b; @Williams2020a]. Following @Savitsky2016, we can express the Bayesian pseudo-posterior as
240 |
241 | $$
242 | \hat{\pi}( \theta~|~y, \tilde{w}) ~\propto~ \left [\prod_{i = 1}^{n} \Pr(y_{i} ~|~ \theta)^{\tilde{w_{i}}}\right ]\pi(\theta)
243 | $$ {#eq-pseudo-likelihood}
244 |
245 | \noindent where $\prod_{i = 1}^{n} \Pr(y_{i} ~|~ \theta)^{\tilde{w_{i}}}$ represents the pseudo-likelihood of the observed data $y$ and $\pi(\theta)$ is a prior on the unconstrained parameter space.
246 |
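For a Gaussian outcome, the pseudo-likelihood raises each observation's likelihood contribution to the power of its weight, which on the log scale simply multiplies each log density by the weight. The sketch below, which is not evaluated and uses placeholder values, mirrors the weighted Gaussian likelihood we use in the Stan outcome model.

```{r pseudo-loglik-sketch, eval=FALSE}
# Gaussian pseudo-log-likelihood: each observation's log density is multiplied
# by its weight w, the log-scale analogue of exponentiating the likelihood
pseudo_loglik <- function(y, mu, sigma, w) {
  sum(w * dnorm(y, mean = mu, sd = sigma, log = TRUE))
}

# With unit weights this reduces to the ordinary Gaussian log-likelihood
y <- rnorm(10)
pseudo_loglik(y, mu = 0, sigma = 1, w = rep(1, length(y)))
```
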
247 | If we are interested in the average treatment effect of some binary treatment $X$ at times $t$ and $t-1$, the posterior predictive distribution of the stabilized inverse probability weights $\text{SW}$ from the design stage is
248 |
249 | $$
250 | \text{SW}_{it} = \prod^{t}_{t = 1} \frac{\int\Pr(X_{it}~ | ~X_{it-1},~ C_{i})\pi(\theta)d\theta}{\int\Pr(X_{it}~ |~Z_{it}, ~ X_{it-1},~ Y_{it-1},~ C_{i})\pi(\theta)d\theta}
251 | $$ {#eq-bayes-stabilized-weights}
252 |
253 |
254 |
255 | \noindent where $i$ and $t$ index groups and periods respectively. The observed treatment status and outcome for the $i^{th}$ group at each period $t$ are represented by $X$ and $Y$ respectively. $C$ is the observed value of the baseline time invariant confounders and $Z$ is a set of time varying covariates that satisfies sequential ignorability [@Blackwell2018]. While we focus mainly on the average treatment effect at times $t$ and $t-1$ here, it is possible to estimate longer lags, different estimands such as the average treatment effect in the treated, and continuous treatments.
256 |
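To make the construction of the weights concrete, the sketch below computes draws of the stabilized weights from posterior draws of the fitted treatment probabilities produced by the numerator and denominator models and then summarizes them into per-observation location and scale terms for the outcome stage. It is not evaluated; `p_num` and `p_den` (draws-by-observations matrices of posterior treatment probabilities), the observed treatment `x`, and the `group` index are hypothetical objects, and rows of the data are assumed to be sorted by group and period.

```{r stabilized-weights-sketch, eval=FALSE}
# Probability of the *observed* treatment status under each model, per draw
obs_prob <- function(p, x) {
  sweep(p, 2, x, function(pr, xx) ifelse(xx == 1, pr, 1 - pr))
}
obs_num <- obs_prob(p_num, x)
obs_den <- obs_prob(p_den, x)

# Stabilized weights: cumulative product of the ratio over time within groups
sw_draws <- t(apply(obs_num / obs_den, 1, function(r) {
  ave(r, group, FUN = cumprod)
}))

# Per-observation location and scale summaries passed to the outcome stage
ipw_mu    <- apply(sw_draws, 2, mean)  # or the posterior median
ipw_sigma <- apply(sw_draws, 2, sd)
```
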
257 | We parameterize the regularized weights for each observation, denoted $\tilde{w_{i}}$ in @eq-pseudo-likelihood, in the outcome model as
258 |
259 | $$
260 | \tilde{w}_{i} = \lambda_{i} + \delta_{i} \cdot \pi{(\delta)}
261 | $$
262 |
263 | \noindent where $\lambda$ and $\delta$ are vectors containing, respectively, the location of the weights--typically the posterior mean, though the posterior median may be preferable under certain circumstances--and the scale of the posterior distribution of the stabilized weights for each observation $i \in \{1, 2,\dots, n\}$. At each iteration of the MCMC algorithm, $\pi{(\delta)}$ is a vector of length $n$ sampled from a prior distribution on the scale of the weights that allows us both to model the uncertainty in the design stage weights and to regularize their variance, stabilizing computation by ruling out unrealistic or impossible values.
264 |
265 | This of course requires the researcher to specify a proper Bayesian prior on the scale component of the weights to avoid severe convergence issues. In practice, we recommend a weakly to moderately informative prior distribution that concentrates the bulk of the prior density between 0 and 1.5. The exponential distribution with rate $\lambda > 3.5$ or Beta distribution with shape parameters $\alpha = 2$ and $\beta \ge 2$ tend to perform well in simulations. Although we do not consider such an approach here, one might also consider regularizing the location of the weights in a similar fashion in cases where there are a large number of observations with excessively large inverse probability weights.
266 |
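The sketch below, which is not evaluated, shows a single realization of this parameterization given hypothetical location and scale vectors `lambda` and `delta`; this is what happens at each iteration of the sampler.

```{r weight-draw-sketch, eval=FALSE}
# lambda and delta are the per-observation location and scale of the
# design-stage weight distribution
n <- length(lambda)

# One draw from the regularizing prior on the scale component; an exponential
# prior with a rate greater than 3.5 is a reasonable alternative to the Beta
pi_delta <- rbeta(n, shape1 = 2, shape2 = 5)

# Realized weights for this iteration: w-tilde = lambda + delta * pi(delta)
w_tilde <- lambda + delta * pi_delta
summary(w_tilde)
```
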
267 | This Bayesian pseudo-likelihood approach lends itself to relatively straightforward extensions such as multilevel regression, as recently illustrated by @Savitsky2021 in the context of weighted survey designs. Perhaps more notably, it allows us to consider more than one possible specification for the design stage model via Bayesian weighting procedures such as model averaging or posterior stacking [@Yao2018; @Montgomery2010; @Hollenbach2020]. This may provide a way of reducing the degree to which the outcome stage model is sensitive to the design stage specification [@Kaplan2014; @Zigler2014], a common criticism of propensity score based weighting estimators.
268 |
269 | # Simulation Study
270 |
271 | To evaluate the performance of our proposed model in terms of its ability to recover the true parameter values, we employ a modified version of the simulation study in @Blackwell2018. As depicted in @fig-dag-simstudy, we assume that values of $X_{i}$ at time $t-1$ are independent of outcomes $Y_{i}$ at time $t$ and that $X_{i}$ has only a contemporaneous treatment effect on $Y_{i}$ at each time $t$--that is, the true lagged treatment effect of $X_{it-1}$ on $Y_{it}$ is 0. Furthermore, past values of $Y_{it-1}$ are independent of $Y_{it}$, conditional on the treatment $X_{it}$. To identify the causal effect of $X_{it-1}$ and $X_{it}$ on $Y_{it}$ we need to condition on the minimum adjustment set that blocks the unmeasured time invariant confounder $\upsilon_{i}$, which in this case is $\{Y_{it-1}, Z_{it}\}$ and satisfies sequential ignorability for the causal path $X_{it} \longrightarrow Y_{it}$ [@Blackwell2018, 1077].
272 |
273 | In our simulations we randomly vary whether the time varying covariate $Z_{it}$ depends on past values of the treatment--that is, whether $Z_{it}$ is endogenous to the treatment $X_{it-1}$--since under such circumstances $Z_{it}$ is post-treatment with respect to $X_{it-1}$ and conditioning on it results in bias of unknowable direction and magnitude [@Blackwell2018; @Montgomery2018]. For a detailed explanation of the data generation process for the simulations and a discussion of additional considerations, interested readers may consult the online appendix.
274 |
275 | ```{r fig-dag-simstudy, echo=FALSE, dpi=600, fig.height=9, fig.width=16, fig.align='center', cache=TRUE, fig.cap="DAG Depicting the Data Generation Process for the Simulations"}
276 | ## Define the coordinates for the complex DAG
277 | sim_dag_coords <- list(
278 | x = c(X_L2 = 0, X_L1 = 1, X = 2, X_T1 = 3, Y_L2 = 0.5,
279 | Y_L1 = 1, Y = 2.2, Z_L2 = 0, Z_L1 = 1, Z = 1.7,
280 | Z_T1 = 3, U = 1.2),
281 | y = c(X_L2 = 1, X_L1 = 1, X = 1, X_T1 = 1, Y_L2 = 0.5,
282 | Y_L1 = 0.5, Y = 0.5, Z_L2 = 0, Z_L1 = 0, Z = 0,
283 | Z_T1 = 0, U = 0.25)
284 | )
285 |
286 | ## Plotmath labels for the complex DAG
287 | sim_dag_labels <- list(
288 | X_L2 = "...", Z_L2 = "...",
289 | Y_L2 = "Y[i*t-2]", X_L1 = "X[i*t-1]",
290 | Z_L1 = "Z[i*t-1]", Y_L1 = "Y[i*t-1]",
291 | X = "X[i*t]", Z = "Z[i*t]",
292 | Y = "Y[i*t]", X_T1 = "...",
293 | Z_T1 = "...", U = "upsilon[i]"
294 | )
295 |
296 | ## Creating a More Complex DAG using ggdag syntax
297 | sim_dag <- dagify(
298 | Y_L1 ~ X_L1 + U,
299 | Z_L1 ~ X_L2 + U,
300 | X_L1 ~ Z_L2,
301 | Y ~ X + U,
302 | X ~ Y_L1 + Z,
303 | Z_T1 ~ X,
304 | X_T1 ~ Y,
305 | Z ~ X_L1 + U,
306 | coords = sim_dag_coords,
307 | labels = sim_dag_labels
308 | )
309 |
310 | # Modifications for the contemporaneous effect of X on Y
311 | sim_dag_tidy <- sim_dag %>%
312 | # Convert the DAG to a tibble
313 | tidy_dagitty() %>%
314 | # Create Path-Specific colors and transparencies
315 | mutate(
316 | # Color for the edges
317 | .edge_colour = case_when(
318 | name == "X_L1" & to == "Z" ~ "blue",
319 | name == "U" ~ "#FF3030",
320 | TRUE ~ "black"
321 | ),
322 | .edge_type = case_when(
323 | name == "X_L1" & to == "Z" ~ "solid",
324 | TRUE ~ "dashed"
325 | ))
326 |
327 | # Generate the DAG for the contemporaneous effect of X on Y
328 | ggplot(data = sim_dag_tidy,
329 | aes(x = x, y = y, xend = xend, yend = yend)
330 | ) +
331 | # Add the graph edges
332 | geom_dag_edges(aes(edge_color = .edge_colour, edge_linetype = .edge_type),
333 | edge_width = 1.5
334 | ) +
335 | # Add the graph nodes
336 | geom_dag_node(alpha = 0) +
337 | # Add the graph text
338 | geom_dag_text(
339 | aes(label = label),
340 | parse = TRUE,
341 | size = 11,
342 | color = "black",
343 | family = "serif",
344 | show.legend = FALSE
345 | ) +
346 | # Apply theme settings
347 | theme_dag(
348 | base_size = 24,
349 | base_family = "serif",
350 | strip.background = element_blank(),
351 | plot.caption.position = "plot",
352 | legend.position = "top"
353 | ) +
354 | # Add a legend for the edge colors
355 | scale_edge_color_identity() +
356 | # Tweak the legend aesthetics
357 | guides(edge_alpha = "none", edge_color = "none", edge_linetype = "none")
358 | ```
359 |
360 | ## Design
361 |
362 | To assess how the model performs under different conditions and evaluate the asymptotic properties of our Bayesian pseudo-likelihood procedure, we vary both the number of groups--we consider 25, 45, 65, 85, and 100--and the number of periods per group--20 and 50--which, together with the two exogeneity conditions for $Z_{it}$, results in $5 \times 2 \times 2 = 20$ unique period-group-condition combinations. For each combination, we repeat the simulation 100 times, giving us 2,000 simulated data sets in total on which to evaluate the model, covering dimensions that are approximately representative of most cross-sectional time series applications in political science.
363 |
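The resulting factorial design is easy to enumerate directly. The sketch below, which is not evaluated and uses column names of our own choosing, simply spells out where the 2,000 simulated data sets come from.

```{r sim-grid-sketch, eval=FALSE}
# Factorial design: 5 group sizes x 2 period lengths x 2 exogeneity
# conditions for Z, each replicated 100 times
sim_grid <- expand.grid(
  groups       = c(25, 45, 65, 85, 100),
  periods      = c(20, 50),
  z_endogenous = c(TRUE, FALSE),
  replicate    = 1:100
)

nrow(sim_grid)  # 2,000 simulated data sets
```
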
364 | For each data set, we estimate the design stage models for the numerator and denominator of the weights via Bayesian logistic regression models of the form
365 |
366 | ```{=tex}
367 | \begin{align*}
368 | \Pr(X_{it} = 1 ~|~ \theta_{it}) &\sim \textit{Bernoulli}(\theta_{it})\\
369 | &\theta_{it} = \text{logit}^{-1}(\alpha + X_{n}\beta_{k})\\
370 | \text{with priors}\\
371 | \alpha &\sim \textit{Normal}(0, ~2) \quad \quad \beta_{k} \sim \textit{Normal}(0,~ 1)\\
372 | \end{align*}
373 | ```
374 |
375 | \noindent where $\alpha$ represents the global intercept, $\beta$ is a vector of coefficients of length $k$, and $X_{n}$ is an $n \times k$ matrix of predictors for the numerator and denominator of the weights. After estimating the location and scale of the distribution of the weights as discussed in the preceding section, we fit an outcome stage model of the form
376 |
377 | ```{=tex}
378 | \begin{align*}
379 | y_{it} &\sim \textit{Normal}(\mu_{it}, \epsilon^{2})^{\tilde{w}_{it}}\\
380 | &\mu_{it} = \alpha + \beta_{1}X_{it} + \beta_{2}X_{it-1} + \epsilon & \\
381 | \text{where}\\
382 | \tilde{w}_{it} &\sim \lambda_{it} + \delta_{it} \cdot \pi{(\delta)}\\
383 | \text{with priors}\\
384 | \alpha &\sim \textit{Normal}(\bar{y}, ~ 2 \cdot \sigma_{y}) & \beta_{k} &\sim \textit{Normal}\left(0, ~ 1.5 \cdot \frac{\sigma_{y}}{\sigma_{x}}\right)\\
385 | \epsilon &\sim \textit{Exponential}\left(\frac{1}{\sigma_{y}}\right) & \delta_{it} &\sim \textit{Beta}(2, ~ 5)\\
386 | \end{align*}
387 | ```
388 |
389 | The response $y$ is assumed Gaussian with mean $\mu$ and variance $\epsilon^{2}$, with the pseudo-likelihood contribution of each observation being its likelihood raised to the power of the sampled weight $\tilde{w}_{it}$. The coefficients are assigned independent normal priors with mean 0 and standard deviation $1.5 \cdot\frac{\sigma_{y}}{\sigma_{x}}$, where $\sigma_{x}$ and $\sigma_{y}$ are the standard deviations of the predictor and response respectively. We place a slightly more diffuse prior on the global intercept $\alpha$, which is assumed normal with mean $\bar{y}$ and standard deviation $2 \cdot \sigma_{y}$. For the dispersion parameter $\epsilon$ we assign an exponential prior with rate $\frac{1}{\sigma_{y}}$. This approach automatically adjusts the scale of the priors to the data and can be regarded as weakly to moderately informative [@Gelman2020, 124-126]. At each iteration of the sampler, the prior on the scale of the weights is drawn from a regularizing Beta distribution with shape parameters $\alpha = 2$ and $\beta = 5$, and the weights are calculated for each observation as described in the preceding section.
390 |
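As a sketch of how these pieces are passed to the sampler, the list below maps the quantities described above onto the data block of the weighted outcome model in `models/stan/IPTW_Outcome_Simulation.stan`. It is not evaluated; `dat`, `ipw_mu`, and `ipw_sigma` are hypothetical objects carried over from the design stage, and the auto-scaled prior parameters themselves are computed inside the Stan program's transformed data block.

```{r stan-data-sketch, eval=FALSE}
# Design matrix with a leading column of ones plus the contemporaneous and
# lagged treatment indicators
X <- model.matrix(~ x + x_lag, data = dat)

stan_data <- list(
  N = nrow(dat),
  Y = dat$y,
  K = ncol(X),
  X = X,
  ipw_mu = ipw_mu,             # location of the weight draws
  ipw_sigma = ipw_sigma,       # scale of the weight draws
  sd_prior_shape1 = 2,         # Beta(2, 5) prior on the scale of the weights
  sd_prior_shape2 = 5
)
```
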
391 | In addition to our proposed marginal structural model, we also fit an auto-regressive distributed lag specification to each of the simulated data sets of the form
392 |
393 | ```{=tex}
394 | \begin{align*}
395 | y_{it} &\sim \textit{Normal}(\mu_{it}, \epsilon^{2})\\
396 | &\mu_{it} = \alpha + \beta_{1}X_{it} + \beta_{2}X_{it-1} + \beta_{3}Y_{it-1} + \beta_{4}Y_{it-2} +\\
397 | & \quad \beta_{5}Z_{it} + \beta_{6}Z_{it-1} + \epsilon\\
398 | \text{with priors}\\
399 | \alpha &\sim \textit{Normal}(\bar{y}, ~ 2 \cdot \sigma_{y}) \quad\quad\quad \beta_{k} \sim \textit{Normal}\left(0, ~ 1.5 \cdot \frac{\sigma_{y}}{\sigma_{x}}\right)\\
400 | \epsilon &\sim \textit{Exponential}\left(\frac{1}{\sigma_{y}}\right) &\\
401 | \end{align*}
402 | ```
403 | \noindent where the priors on the intercept, coefficients, and dispersion parameter are based on the same auto-scaling procedure discussed above.
404 |
405 | Estimation is performed using version 2.30 of the probabilistic programming language Stan, which implements the No-U-Turn Sampler variant of Hamiltonian Monte Carlo [@Carpenter2017; @Hoffman2014]. For each of the models, we run four Markov chains in parallel for 2,000 iterations each, discarding the first 1,000 iterations used for the initial warm-up adaptation stage. This proved sufficient for convergence and leaves us with 4,000 posterior samples per model for subsequent analysis. Stan code for each of these outcome models is provided in the appendix.
406 |
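A minimal sketch of the estimation call, assuming the models are compiled and sampled through the cmdstanr interface (an assumption on our part; any Stan interface with equivalent settings would work), is shown below using the `stan_data` list sketched above.

```{r stan-sampling-sketch, eval=FALSE}
library(cmdstanr)

# Compile the weighted outcome model included in the replication materials
iptw_mod <- cmdstan_model("models/stan/IPTW_Outcome_Simulation.stan")

# Four parallel chains of 2,000 iterations each, the first 1,000 of which are
# discarded as warm-up, leaving 4,000 post-warm-up draws in total
iptw_fit <- iptw_mod$sample(
  data = stan_data,
  chains = 4,
  parallel_chains = 4,
  iter_warmup = 1000,
  iter_sampling = 1000,
  seed = 1234
)

iptw_fit$summary(variables = c("b", "b_Intercept", "sigma"))
```
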
407 | ## Results
408 |
409 | {#fig-rmse-sims}
410 |
411 | The results of the simulations for each model are shown in @fig-rmse-sims, which depicts the root mean square error (RMSE) of the estimates of the lagged treatment effect of $X_{it-1}$ by model and data dimensions under each condition. We see that our Bayesian pseudo-likelihood estimator performs quite well overall and, as expected, tends to exhibit less bias than the ARDL when $Z_{it}$ is endogenous to the treatment history $X_{it-1}$. Moreover, as the number of observed periods increases, the bias of the ARDL model grows while our MSM provides an approximately unbiased estimate of the average lagged treatment effect.
412 |
413 | {#fig-sim-dists fig-align="center"}
414 |
415 | @fig-sim-dists shows the distribution of posterior means for each model by condition and further illustrates that our Bayesian MSM approach performs reasonably well in terms of parameter recovery under both conditions while exhibiting substantially less bias than the ARDL approach in cases where time varying covariates are a function of past treatments. Overall, we see that our Bayesian pseudo-likelihood procedure performs quite well across a range of different scenarios that are roughly typical of data in political science and international relations.
416 |
417 |
418 | # Applied example
419 |
420 | TODO: After APSA, apply this by replicating an existing paper
421 |
422 |
423 | # Conclusion
424 |
425 | Computational and methodological advances in the past decade have laid the groundwork for substantial progress in quantitative political science and policy research. Developments in observational causal inference—in both quasi-experimental and propensity score-based approaches—have enhanced the credibility and robustness of research findings, while increased use of Bayesian analysis has led to results that are more interpretable and deal more directly with the uncertainty of estimates.
426 |
427 | As we have demonstrated, however, Bayesian methods are incompatible with approaches to causal inference that rely on propensity scores. To overcome this incompatibility, we propose a pseudo-Bayesian approach to the calculation and use of inverse probability weights that allows researchers to work with posterior distributions that correctly capture and reflect uncertainty.
428 |
429 |
--------------------------------------------------------------------------------
/manuscript/output/Nafa and Heiss (2022)_Taking-Uncertainty-Seriously-APSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajnafa/Latent-Bayesian-MSM/8d5596abbe49284feeb5f4f39e9ed0ee7abb24c5/manuscript/output/Nafa and Heiss (2022)_Taking-Uncertainty-Seriously-APSA.pdf
--------------------------------------------------------------------------------
/models/stan/ARDL_Simulation.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajnafa/Latent-Bayesian-MSM/8d5596abbe49284feeb5f4f39e9ed0ee7abb24c5/models/stan/ARDL_Simulation.exe
--------------------------------------------------------------------------------
/models/stan/ARDL_Simulation.stan:
--------------------------------------------------------------------------------
1 | // ARDL (t-1) Model with a Gaussian Likelihood for Simulation Comparison
2 | data {
3 | int N; // Observations
4 | vector[N] Y; // Outcome Stage Response
5 | int K; // Number of Population-Level Effects and Intercept
6 | matrix[N, K] X; // Design Matrix for the Population-Level Effects
7 |
8 | // Coefficient Prior Scales
9 | vector[K] beta_sds;
10 | }
11 |
12 | transformed data {
13 | // Priors on the intercept and error term
14 | real alpha_prior_mu = mean(Y);
15 | real alpha_prior_sd = 2 * sd(Y);
16 | real sigma_prior = 1/sd(Y);
17 |
18 | int Kc = K - 1;
19 | matrix[N, Kc] Xc; // Centered version of X without an Intercept
20 | vector[Kc] means_X; // Column Means of the Uncentered Design Matrix
21 |
22 | // Centering the design matrix
23 | for (i in 2:K) {
24 | means_X[i - 1] = mean(X[, i]);
25 | Xc[, i - 1] = X[, i] - means_X[i - 1];
26 | }
27 | }
28 |
29 | parameters {
30 | vector[Kc] b; // Population-Level Effects
31 | real Intercept; // Intercept for the centered predictors
32 |   real<lower=0> sigma;  // Dispersion parameter, constrained positive
33 | }
34 |
35 | model {
36 | // likelihood including constants
37 | target += normal_id_glm_lpdf(Y | Xc, Intercept, b, sigma);
38 |
39 | // Priors for the Model Parameters
40 | target += normal_lpdf(Intercept | alpha_prior_mu, alpha_prior_sd);
41 | target += normal_lpdf(b[1] | 0, beta_sds[2]);
42 | target += normal_lpdf(b[2] | 0, beta_sds[3]);
43 | target += normal_lpdf(b[3] | 0, beta_sds[4]);
44 | target += normal_lpdf(b[4] | 0, beta_sds[5]);
45 | target += normal_lpdf(b[5] | 0, beta_sds[6]);
46 | target += normal_lpdf(b[6] | 0, beta_sds[7]);
47 | target += exponential_lpdf(sigma | sigma_prior);
48 | }
49 |
50 | generated quantities {
51 | // actual population-level intercept
52 | real b_Intercept = Intercept - dot_product(means_X, b);
53 | }
54 |
55 |
--------------------------------------------------------------------------------
/models/stan/IPTW_Outcome_Simulation.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajnafa/Latent-Bayesian-MSM/8d5596abbe49284feeb5f4f39e9ed0ee7abb24c5/models/stan/IPTW_Outcome_Simulation.exe
--------------------------------------------------------------------------------
/models/stan/IPTW_Outcome_Simulation.stan:
--------------------------------------------------------------------------------
1 | /* Pseudo Bayesian Inverse Probability of Treatment Weighted Estimator
2 | * Author: A. Jordan Nafa; Stan Version 2.30.1; Last Revised 08-29-2022 */
3 | functions {
4 | // Weighted Log PDF of the Gaussian Pseudo-Likelihood
5 | real normal_ipw_lpdf(vector y, vector mu, real sigma, vector w_tilde, int N) {
6 | real weighted_term;
7 | weighted_term = 0.00;
8 | for (n in 1:N) {
9 | weighted_term = weighted_term + w_tilde[n] * (normal_lpdf(y[n] | mu[n], sigma));
10 | }
11 | return weighted_term;
12 | }
13 | }
14 |
15 | data {
16 | int N; // Observations
17 | vector[N] Y; // Outcome Stage Response
18 | int K; // Number of Population-Level Effects and Intercept
19 | matrix[N, K] X; // Design Matrix for the Population-Level Effects
20 |
21 | // Statistics from the Design Stage Model
22 | vector[N] ipw_mu; // Mean of the Population-Level Weights
23 | vector[N] ipw_sigma; // Scale of the Population-Level Weights
24 |
25 | // Prior on the scale of the weights
26 | real sd_prior_shape1;
27 | real sd_prior_shape2;
28 | }
29 |
30 | transformed data {
31 | // Priors on the coefficients and intercept
32 | real b_prior_sd = 1.5 * (sd(Y)/sd(X[, 2]));
33 | real alpha_prior_mu = mean(Y);
34 | real alpha_prior_sd = 2 * sd(Y);
35 | real sigma_prior = 1/sd(Y);
36 |
37 | int Kc = K - 1;
38 | matrix[N, Kc] Xc; // Centered version of X without an Intercept
39 | vector[Kc] means_X; // Column Means of the Uncentered Design Matrix
40 |
41 | // Centering the design matrix
42 | for (i in 2:K) {
43 | means_X[i - 1] = mean(X[, i]);
44 | Xc[, i - 1] = X[, i] - means_X[i - 1];
45 | }
46 | }
47 |
48 | parameters {
49 | vector[Kc] b; // Population-Level Effects
50 | real Intercept; // Population-Level Intercept for the Centered Predictors
51 |   real<lower=0> sigma;  // Dispersion Parameter, constrained positive
52 |   vector<lower=0, upper=1>[N] weights_z;  // Scale draws for the IPT Weights (Beta prior support)
53 | }
54 |
55 | transformed parameters {
56 | // Compute the IPT Weights
57 | vector[N] w_tilde; // IPT Weights
58 |   w_tilde = ipw_mu + ipw_sigma .* weights_z;
59 | }
60 |
61 | model {
62 | // Initialize the Linear Predictor
63 | vector[N] mu = Intercept + Xc * b;
64 |
65 | // Sampling the Weights
66 | weights_z ~ beta(sd_prior_shape1, sd_prior_shape2);
67 |
68 | // Priors for the Model Parameters
69 | target += normal_lpdf(Intercept | alpha_prior_mu, alpha_prior_sd);
70 | target += normal_lpdf(b | 0, b_prior_sd);
71 | target += exponential_lpdf(sigma | sigma_prior);
72 |
73 | // Weighted Likelihood
74 | target += normal_ipw_lpdf(Y | mu, sigma, w_tilde, N);
75 | }
76 |
77 | generated quantities {
78 | // Population-level Intercept on the Original Scale
79 | real b_Intercept = Intercept - dot_product(means_X, b);
80 | }
81 |
82 |
83 |
--------------------------------------------------------------------------------
/presentation/.gitignore:
--------------------------------------------------------------------------------
1 | apsa-presentation_cache
2 | apsa-presentation_files
--------------------------------------------------------------------------------
/presentation/Taking-Uncertainty-Seriously_Bayesian-Marginal-Structural-Models-for-Causal Inference.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajnafa/Latent-Bayesian-MSM/8d5596abbe49284feeb5f4f39e9ed0ee7abb24c5/presentation/Taking-Uncertainty-Seriously_Bayesian-Marginal-Structural-Models-for-Causal Inference.pdf
--------------------------------------------------------------------------------
/presentation/apsa-presentation.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Taking Uncertainty Seriously: Bayesian Marginal Structural Models for Causal Inference"
3 | author: "A. Jordan Nafa and Andrew Heiss"
4 | date: "September 16th, 2022"
5 | institute: "University of North Texas and Georgia State University"
6 | format:
7 | revealjs:
8 | theme: blood
9 | highlight-style: monokai
10 | toc-depth: 2
11 | self-contained: true
12 | citations-hover: true
13 | code-link: true
14 | code-block-bg: "#272822"
15 | css: "style.css"
16 | include-in-header: "math-colors.js"
17 | pdf-separate-fragments: true
18 | editor: visual
19 | width: 1360
20 | height: 800
21 | bibliography: "../assets/references.bib"
22 | csl: "../assets/apsa.csl"
23 | link-citations: yes
24 | ---
25 |
26 | ```{r, include=FALSE}
27 | # Load the required libraries
28 | pacman::p_load(
29 | "tidyverse",
30 | "data.table",
31 | "dtplyr",
32 | "dagitty",
33 | "ggraph",
34 | "ggdag",
35 | "tidybayes",
36 | "ggblend",
37 | install = FALSE
38 | )
39 |
40 | ## Base theme for the figures
41 | fig_theme <- theme_light(base_size = 30, base_family = "serif") +
42 |   ## Settings specific to the reveal.js theme
43 | theme(
44 | strip.background = element_blank(),
45 | panel.background = element_rect(fill = "transparent", colour = NA),
46 | plot.background = element_rect(fill = "transparent", colour = NA),
47 | legend.background = element_rect(fill = "transparent", colour = NA),
48 | legend.key = element_rect(fill = "transparent", colour = NA),
49 | plot.title = element_text(color = "white"),
50 | plot.subtitle = element_text(color = "white"),
51 | plot.caption = element_text(color = "white", face = "italic"),
52 | axis.text = element_text(color = "white"),
53 | axis.title = element_text(color = "white", face = "bold"),
54 | legend.title = element_text(color = "white", face = "bold"),
55 | legend.text = element_text(color = "white", face = "bold"),
56 | strip.text = element_text(face = "bold"),
57 | plot.caption.position = "plot",
58 | legend.position = "top"
59 | )
60 |
61 | shorten_dag_arrows <- function(tidy_dag, proportion){
62 | # Update underlying ggdag object
63 | tidy_dag$data <- dplyr::mutate(tidy_dag$data,
64 | xend = (1-proportion/2)*(xend - x) + x,
65 | yend = (1-proportion/2)*(yend - y) + y,
66 | xstart = (1-proportion/2)*(x - xend) + xend,
67 | ystart = (1-proportion/2)*(y-yend) + yend)
68 | return(tidy_dag)
69 | }
70 |
71 | # Read in the simulation results
72 | sim_results <- read_rds("../data/sim_results.rds")
73 | ```
74 |
75 | ## Introduction
76 |
77 | ::: incremental
78 | - In many fields of political science baseline random assignment and experimental design are impossible
79 |
80 | - For many research questions, observational data is our only option but this makes causal inference difficult
81 |
82 | - Assumption of *strict exogeneity* is usually unrealistic and valid instruments are a rare beast [@Liu2022; @Swamy2015; @Mogstad2018]
83 |
84 | - Need to rely on weaker assumptions for causal identification [@Blackwell2018; @Acharya2016; @Forastiere2018]
85 |
86 | - How can we estimate causal effects and describe their uncertainty using observational data?
87 | :::
88 |
89 | ## Causal Inference in Political Science
90 |
91 | ::: incremental
92 | - Cross-sectional time series data and causal inference
93 |
94 | - Causal inference literature in political science focuses largely on frequentist econometrics [i.e., @Imai2019; @Imai2020]
95 |
96 | - Still relies on strict exogeneity assumption
97 |
98 | - Recent works drawing on approaches developed in biostatistics outline a framework for estimating causal effects under the relatively weaker assumption of *sequential ignorability* [@Acharya2016; @Blackwell2018]
99 |
100 | - Our goal in this paper is to extend the approach to causal inference under selection on observables introduced by @Blackwell2018 to a Bayesian framework
101 | :::
102 |
103 | ## Marginal Structural Models
104 |
105 | ::: incremental
106 | - Marginal structural models (MSMs) are a multi-stage approach to estimating causal effects where baseline random assignment is not possible [@Robins1997; @Robins2000]
107 |
108 | - Relies on inverse probability of treatment weighting to achieve covariate balance by constructing pseudo-populations [@Imai2015; @Cole2008; @Saarela2015]
109 |
110 | - Adjusting for biasing paths in the propensity model allows for identification of causal effects that are impossible to estimate in a single model due to post-treatment bias
111 |
112 | - Possible to estimate lagged effects and "treatment histories" in cross-sectional time series data under complex temporal dependence [@Blackwell2018]
113 | :::
114 |
115 | ------------------------------------------------------------------------
116 |
117 | ```{r dag-1, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE}
118 | ## Define the coordinates for the complex DAG
119 | complex_coords <- list(
120 | x = c(X_L2 = 0, X_L1 = 1, X = 2, X_T1 = 3, Y_L2 = 0.5,
121 | Y_L1 = 1.5, Y = 2.5, Z_L2 = 0, Z_L1 = 1, Z = 2, Z_T1 = 3),
122 | y = c(X_L2 = 1, X_L1 = 1, X = 1, X_T1 = 1, Y_L2 = 0.5,
123 | Y_L1 = 0.5, Y = 0.5, Z_L2 = 0, Z_L1 = 0, Z = 0, Z_T1 = 0)
124 | )
125 |
126 | ## Plotmath labels for the complex DAG
127 | complex_labels <- list(
128 | X_L2 = "...", Z_L2 = "...",
129 | Y_L2 = "Y[i*t-2]", X_L1 = "X[i*t-1]",
130 | Z_L1 = "Z[i*t-1]", Y_L1 = "Y[i*t-1]",
131 | X = "X[i*t]", Z = "Z[i*t]",
132 | Y = "Y[i*t]", X_T1 = "...", Z_T1 = "..."
133 | )
134 |
135 | ## Creating a More Complex DAG using ggdag syntax
136 | complex_dag <- dagify(
137 | Y_L2 ~ X_L2 + Z_L2,
138 | X_L1 ~ X_L2 + Z_L1 + Y_L2,
139 | Y_L1 ~ X_L1 + X_L2 + Z_L1,
140 | Z_L1 ~ Y_L2 + Z_L2,
141 | X ~ X_L1 + Z + Y_L1,
142 | Y ~ X_L1 + Z + X,
143 | Z ~ Y_L1 + Z_L1,
144 | X_T1 ~ X + Y,
145 | Z_T1 ~ Z + Y,
146 | coords = complex_coords,
147 | labels = complex_labels
148 | )
149 |
150 | # Modifications for the contemporaneous effect of X on Y
151 | complex_dag_contemp <- complex_dag %>%
152 | # Convert the DAG to a tibble
153 | tidy_dagitty() %>%
154 | # Get the adjustment Set
155 | dag_adjustment_sets(exposure = "X", outcome = "Y") %>%
156 | # Create Path-Specific colors and transparencies
157 | mutate(
158 | # Transparency for the edges
159 | .edge_alpha = case_when(
160 | name %in% c("X_L1", "Z", "X") & to %in% c("Y", "X") ~ 1,
161 | TRUE ~ 0.25
162 | ),
163 | # Color for the edges
164 | .edge_colour = case_when(
165 | name == "X" & to == "Y" ~ "#00FFFF",
166 | TRUE ~ "white"
167 | )
168 | )
169 |
170 | # Data for the plot annotations
171 | period_labels <- tribble(
172 | ~ x, ~ y, ~ .label,
173 | 0.5, 1.1, "bolditalic('Time '*t - 2)",
174 | 1.5, 1.1, "bolditalic('Time '*t - 1)",
175 | 2.5, 1.1, "bolditalic('Time '*t)"
176 | )
177 |
178 | # Adjust the length of the edges
179 | complex_dag_contemp <- shorten_dag_arrows(
180 | complex_dag_contemp,
181 | proportion = 0.06
182 | )
183 |
184 | # Generate the DAG for the contemporaneous effect of X on Y
185 | ggplot(
186 | data = complex_dag_contemp,
187 | aes(x = x, y = y, xend = xend, yend = yend)
188 | ) +
189 | # Add the graph edges
190 | geom_dag_edges(
191 | aes(
192 | x = xstart,
193 | y = ystart,
194 | edge_color = .edge_colour
195 | ),
196 | edge_width = 1.5
197 | ) +
198 | # Add the graph nodes
199 | geom_dag_node(alpha = 0) +
200 | # Add the graph text
201 | geom_dag_text(
202 | aes(label = label),
203 | parse = TRUE,
204 | size = 11,
205 | color = "white",
206 | family = "serif",
207 | show.legend = FALSE
208 | ) +
209 | # Apply theme settings
210 | theme_dag(
211 | base_size = 24,
212 | base_family = "serif",
213 | strip.background = element_blank(),
214 | panel.background = element_rect(fill = "transparent", colour = NA),
215 | plot.background = element_rect(fill = "transparent", colour = NA),
216 | legend.background = element_rect(fill = "transparent", colour = NA),
217 | legend.key = element_rect(fill = "transparent", colour = NA),
218 | plot.title = element_text(color = "white"),
219 | plot.subtitle = element_text(color = "white"),
220 | plot.caption = element_text(color = "white", face = "italic"),
221 | legend.title = element_text(color = "white", face = "bold"),
222 | legend.text = element_text(color = "white", face = "bold"),
223 | plot.caption.position = "plot",
224 | legend.position = "top"
225 | ) +
226 | # Add a legend for the edge colors
227 | scale_edge_color_identity(
228 | guide = "legend",
229 | name = NULL,
230 | labels = c(
231 | "Treatment Path",
232 | "Biasing Paths",
233 | "Other Paths"
234 | )
235 | ) +
236 | # Tweak the legend aesthetics
237 | guides(
238 | edge_alpha = "none",
239 | edge_color = "none") +
240 | # Plot Annotations
241 | annotate(
242 | "text",
243 | x = period_labels$x,
244 | y = period_labels$y,
245 | label = period_labels$.label,
246 | parse = TRUE,
247 | colour = "white",
248 | size = 12,
249 | family = "serif"
250 | )
251 | ```
252 |
253 | ------------------------------------------------------------------------
254 |
255 | ```{r dag-2, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE}
256 | ## Define the coordinates for the complex DAG
257 | complex_coords <- list(
258 | x = c(X_L2 = 0, X_L1 = 1, X = 2, X_T1 = 3, Y_L2 = 0.2,
259 | Y_L1 = 1.2, Y = 2.2, Z_L2 = 0, Z_L1 = 1, Z = 2, Z_T1 = 3),
260 | y = c(X_L2 = 1, X_L1 = 1, X = 1, X_T1 = 1, Y_L2 = 0.5,
261 | Y_L1 = 0.5, Y = 0.5, Z_L2 = 0, Z_L1 = 0, Z = 0, Z_T1 = 0)
262 | )
263 |
264 | ## Plotmath labels for the complex DAG
265 | complex_labels <- list(
266 | X_L2 = "...", Z_L2 = "...",
267 | Y_L2 = "Y[i*t-2]", X_L1 = "X[i*t-1]",
268 | Z_L1 = "Z[i*t-1]", Y_L1 = "Y[i*t-1]",
269 | X = "X[i*t]", Z = "Z[i*t]",
270 | Y = "Y[i*t]", X_T1 = "...", Z_T1 = "..."
271 | )
272 |
273 | ## Creating a More Complex DAG using ggdag syntax
274 | complex_dag <- dagify(
275 | Y_L2 ~ X_L2 + Z_L2,
276 | X_L1 ~ X_L2 + Z_L1 + Y_L2,
277 | Y_L1 ~ X_L1 + X_L2 + Z_L1,
278 | Z_L1 ~ Z_L2 + Y_L2 + X_L2,
279 | X ~ X_L1 + Z + Y_L1,
280 | Y ~ X_L1 + Z + X,
281 | Z ~ Z_L1 + Y_L1 + X_L1,
282 | X_T1 ~ X + Y,
283 | Z_T1 ~ Z + Y + X,
284 | coords = complex_coords,
285 | labels = complex_labels
286 | )
287 |
288 | # Modifications for the contemporaneous effect of X on Y
289 | complex_dag_contemp <- complex_dag %>%
290 | # Convert the DAG to a tibble
291 | tidy_dagitty() %>%
292 | # Get the adjustment Set
293 | dag_adjustment_sets(exposure = "X", outcome = "Y") %>%
294 | # Create Path-Specific colors and transparencies
295 | mutate(
296 | # Color for the edges
297 | .edge_colour = case_when(
298 | name == "X" & to == "Y" ~ "#00FFFF",
299 | TRUE ~ "white"
300 | )
301 | )
302 |
303 | # Adjust the length of the edges
304 | complex_dag_contemp <- shorten_dag_arrows(
305 | complex_dag_contemp,
306 | proportion = 0.06
307 | )
308 |
309 | # Generate the DAG for the contemporaneous effect of X on Y
310 | ggplot(
311 | data = complex_dag_contemp,
312 | aes(x = x, y = y, xend = xend, yend = yend)
313 | ) +
314 | # Add the graph edges
315 | geom_dag_edges(
316 | aes(
317 | x = xstart,
318 | y = ystart,
319 | edge_color = .edge_colour
320 | ),
321 | edge_width = 1.5
322 | ) +
323 | # Add the graph nodes
324 | geom_dag_node(alpha = 0) +
325 | # Add the graph text
326 | geom_dag_text(
327 | aes(label = label),
328 | parse = TRUE,
329 | size = 11,
330 | color = "white",
331 | family = "serif",
332 | show.legend = FALSE
333 | ) +
334 | # Apply theme settings
335 | theme_dag(
336 | base_size = 24,
337 | base_family = "serif",
338 | strip.background = element_blank(),
339 | panel.background = element_rect(fill = "transparent", colour = NA),
340 | plot.background = element_rect(fill = "transparent", colour = NA),
341 | legend.background = element_rect(fill = "transparent", colour = NA),
342 | legend.key = element_rect(fill = "transparent", colour = NA),
343 | plot.title = element_text(color = "white"),
344 | plot.subtitle = element_text(color = "white"),
345 | plot.caption = element_text(color = "white", face = "italic"),
346 | legend.title = element_text(color = "white", face = "bold"),
347 | legend.text = element_text(color = "white", face = "bold"),
348 | plot.caption.position = "plot",
349 | legend.position = "top"
350 | ) +
351 | # Add a legend for the edge colors
352 | scale_edge_color_identity(
353 | guide = "legend",
354 | name = NULL,
355 | labels = c(
356 | "Treatment Path",
357 | "Biasing Paths",
358 | "Other Paths"
359 | )
360 | ) +
361 | # Tweak the legend aesthetics
362 | guides(
363 | edge_alpha = "none",
364 | edge_color = "none") +
365 | # Plot Annotations
366 | annotate(
367 | "text",
368 | x = period_labels$x,
369 | y = period_labels$y,
370 | label = period_labels$.label,
371 | parse = TRUE,
372 | colour = "white",
373 | size = 12,
374 | family = "serif"
375 | )
376 | ```
377 |
378 | ## Why Bayesian Estimation?
379 |
380 | ::: incremental
381 | - Frequentist uncertainty estimates are based on assumptions about *sampling distributions*
382 |
383 | - Yet, in many areas of political science our data comprise an *apparent population* rather than a random sample from a larger population of interest [@Berk1995; @Gill2001]
384 |
385 |     - For example, all sovereign countries between 1945 and 2020 or all states in the U.S. over some time period
386 |
387 | - It doesn't make sense to think in terms of random samples from a population if your observed data *is the population* [@Western1994; @Gill2020]
388 | :::
389 |
390 | ## Why Bayesian Estimation?
391 |
392 | ::: incremental
393 | - A Bayesian framework provides a straightforward approach to accounting for and propagating uncertainty in the specification of the propensity model
394 |
395 | - Bayesian Model Averaging (BMA) and cross-validation based stacking approaches allow us to avoid choosing a single specification for the propensity model [@Zigler2014; @Kaplan2014]
396 |
397 | - Acknowledges that we are virtually always uncertain about what the true set of confounders is
398 |
399 | - May help reduce the degree to which our results depend on the propensity model being correctly specified [@Hahn2020]
400 | :::
401 |
402 | ## Why Bayesian Estimation?
403 |
404 | ::: incremental
405 | - Inverse probability of treatment weights are an estimated quantity with associated uncertainty [@Liao2020]
406 |
407 | - Most applications of IPTW methods in political science assume the weights are deterministic [i.e., @Blackwell2018; @Ladam2018; @Kurtz2021]
408 |
409 | - Need to propagate uncertainty in the design stage weights to outcome stage [@Liao2020]
410 |
411 | - Highlights a problem with fully Bayesian estimation of MSMs, which requires that the models be estimated jointly [@Zigler2013; @Robins2015]
412 | :::
413 |
414 | # A Bayesian Pseudo-Likelihood Approach
415 |
416 | $$
417 | \definecolor{treat}{RGB}{27,208,213}
418 | \definecolor{outcome}{RGB}{98,252,107}
419 | \definecolor{baseconf}{RGB}{244,199,58}
420 | \definecolor{covariates}{RGB}{178,26,1}
421 | \definecolor{index}{RGB}{37,236,167}
422 | \definecolor{timeid}{RGB}{244,101,22}
423 | \definecolor{mu}{RGB}{71,119,239}
424 | \definecolor{sigma}{RGB}{219,58,7}
425 | \newcommand{normalcolor}{\color{white}}
426 | \newcommand{treat}[1]{\color{treat} #1 \normalcolor}
427 | \newcommand{resp}[1]{\color{outcome} #1 \normalcolor}
428 | \newcommand{conf}[1]{\color{baseconf} #1 \normalcolor}
429 | \newcommand{covar}[1]{\color{covariates} #1 \normalcolor}
430 | \newcommand{obs}[1]{\color{index} #1 \normalcolor}
431 | \newcommand{tim}[1]{\color{timeid} #1 \normalcolor}
432 | \newcommand{mean}[1]{\color{mu} #1 \normalcolor}
433 | \newcommand{vari}[1]{\color{sigma} #1 \normalcolor}
434 | $$
435 |
436 | ## Bayesian Design Stage Estimation
437 |
438 | For some binary treatment $\treat{X}_{\obs{i}\tim{t}}$, the posterior expectation of the stabilized inverse probability of treatment weights for each unit $\obs{i}$ at time $\tim{t}$ is
439 |
440 | $$
441 | \text{IPW}_{\obs{i}\tim{t}} = \prod^{\tim{t}}_{\tim{t} = \tim{1}} \frac{\int\Pr[\treat{X}_{\obs{i}\tim{t}}~ | ~\treat{X}_{\obs{i}\tim{t-1}},~ \conf{C}_{\obs{i}}]\pi(\theta)d\theta}{\int\Pr[\treat{X}_{\obs{i}\tim{t}}~ |~\covar{Z}_{\obs{i}\tim{t}}, ~ \treat{X}_{\obs{i}\tim{t-1}},~ \resp{Y}_{\obs{i}\tim{t-1}},~ \conf{C}_{\obs{i}}]\pi(\theta)d\theta}
442 | $$
443 |
444 | ::: incremental
445 | - $\treat{X}_{\obs{i}\tim{t-1}}$ and $\resp{Y}_{\obs{i}\tim{t-1}}$ denote the treatment status and outcome for the $\obs{i^{th}}$ unit in the previous period respectively
446 |
447 | - $\conf{C}_{\obs{i}}$ is a set of time-invariant baseline covariates
448 |
449 | - $\covar{Z}_{\obs{i}\tim{t}}$ is a set of time-varying covariates that satisfies sequential ignorability
450 |
451 | - Although we focus mainly on the average treatment effect at times $\tim{t}$ and $\tim{t-1}$, it is possible to estimate longer lags and other estimands as well.
452 | :::
453 |
454 | ## Bayesian Design Stage Estimation
455 |
456 | It is also possible to extend IPTW to cases in which $\treat{X}_{\obs{i}\tim{t}}$ is continuous, in which case the stabilized weights are
457 |
458 | $$\text{IPW}_{\obs{i}\tim{t}} = \prod^{\tim{t}}_{\tim{t} = \tim{1}} \frac{f_{\treat{X}_{\obs{i}\tim{t}} | \treat{X}_{\obs{i}\tim{t-1}},\conf{C}_{\obs{i}}}[(\treat{X}_{\obs{i}\tim{t}}~ | ~\treat{X}_{\obs{i}\tim{t-1}},~ \conf{C}_{\obs{i}}); ~\mean{\mu}, ~\vari{\sigma^{2}}]}{f_{\treat{X}_{\obs{i}\tim{t}} |\covar{Z}_{\obs{i}\tim{t}}, \treat{X}_{\obs{i}\tim{t-1}}, \resp{Y}_{\obs{i}\tim{t-1}}, \conf{C}_{\obs{i}}}[(\treat{X}_{\obs{i}\tim{t}}~ |~\covar{Z}_{\obs{i}\tim{t}}, ~ \treat{X}_{\obs{i}\tim{t-1}},~ \resp{Y}_{\obs{i}\tim{t-1}},~ \conf{C}_{\obs{i}}); ~\mean{\mu}, ~\vari{\sigma^{2}}]}
459 | $$
460 |
461 | ::: incremental
462 | - Each of the parameters $\treat{X}$, $\resp{Y}$, $\covar{Z}$, and $\conf{C}$ in the numerator and denominator are the same as in the binary version
463 |
464 | - The $f_{\dots}(\cdot)$ expressions represent a probability density function with mean $\mean{\mu}$ and variance $\vari{\sigma^{2}}$
465 |
466 | - We'll focus mainly on binary treatment regimes, though this particular method tends to behave better for a continuous $\treat{X}$ in some cases
467 | :::
468 |
469 | ## The Bayesian Pseudo-Likelihood
470 |
471 | To propagate uncertainty in the distribution of weights from the design stage while avoiding the problem of feedback inherent in joint estimation, we develop a Bayesian Pseudo-Likelihood estimator [@Savitsky2016; @Williams2020b; @Williams2020a]
472 |
473 | ```{=tex}
474 | \begin{align}
475 | \hat{\pi}( \theta~|~y, \tilde{w}) ~\propto~ \left [\prod_{i = 1}^{n} \Pr(y_{i} ~|~ \theta)^{\tilde{w_{i}}}\right ]\pi(\theta)
476 | \end{align}
477 | ```
478 | ::: incremental
479 | - $\tilde{w_{i}}$ is the realized IPT weight for the $i^{th}$ observation
480 |
481 | - $\prod_{i = 1}^{n} \Pr(y_{i} ~|~ \theta)^{\tilde{w_{i}}}$ is the pseudo-likelihood and $\pi$ denotes the prior probability for a parameter $\theta$
482 |
483 | - $\hat{\pi}( \theta~|~y, \tilde{w})$ represents the Bayesian pseudo-posterior for $\theta$
484 | :::
485 |
486 | ## Parameterization of the Weights
487 |
488 | ::: incremental
489 | - We decompose the matrix of weights from the design stage into a location component $\lambda$ and a scale component $\delta$
490 |
491 | - The weight for each observation is sampled as $$\tilde{w}_{\obs{i}\tim{t}} \sim \lambda_{\obs{i}\tim{t}} + \delta_{\obs{i}\tim{t}} \cdot \pi(\delta_{\obs{i}\tim{t}})$$ where $\pi(\delta_{\obs{i}\tim{t}})$ is a regularizing prior on the scale of the weights such as an exponential distribution with rate $\lambda > 3.5$ or Beta distribution with shape parameters $\alpha = 2$ and $\beta \ge 2$
492 |
493 | - Provides computational stability and shuts down extreme values when the IPT weights have high variance
494 |
495 | - Straightforward extensions for nested data structures via double-weighted estimation [@Savitsky2021]
496 | :::
497 |
498 | # Simulation Study Design
499 |
500 | ## Simulation Study Overview
501 |
502 | ::: incremental
503 | - To assess parameter recovery and bias, we adapt the original simulation design from @Blackwell2018
504 |
505 | - We simulate 2000 data sets of varying dimensions, manipulating the path $\treat{X}_{\obs{i}\tim{t-1}} \longrightarrow \covar{Z}_{\obs{i}\tim{t}}$
506 |
507 | - Periods $\in \{20, 50\}$
508 |
509 | - Groups $\in \{25, 45, 65, 85, 100\}$
510 |
511 | - Objectives
512 |
513 | - Identify both $\treat{X}_{\obs{i}\tim{t}} \longrightarrow \resp{Y}_{\obs{i}\tim{t}}$ and $\treat{X}_{\obs{i}\tim{t-1}} \longrightarrow \resp{Y}_{\obs{i}\tim{t}}$
514 |
515 |     - Compare our Bayesian Pseudo-Likelihood approach against the more common auto-regressive distributed lag (ARDL) specification
516 | :::
517 |
518 | ## DAG for the Simulated Data
519 |
520 | ```{r dag-simstudy, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE}
521 | ## Define the coordinates for the complex DAG
522 | sim_dag_coords <- list(
523 | x = c(X_L2 = 0, X_L1 = 1, X = 2, X_T1 = 3, Y_L2 = 0.5,
524 | Y_L1 = 1, Y = 2.2, Z_L2 = 0, Z_L1 = 1, Z = 1.7,
525 | Z_T1 = 3, U = 1.2),
526 | y = c(X_L2 = 1, X_L1 = 1, X = 1, X_T1 = 1, Y_L2 = 0.5,
527 | Y_L1 = 0.5, Y = 0.5, Z_L2 = 0, Z_L1 = 0, Z = 0,
528 | Z_T1 = 0, U = 0.25)
529 | )
530 |
531 | ## Plotmath labels for the complex DAG
532 | sim_dag_labels <- list(
533 | X_L2 = "...", Z_L2 = "...",
534 | Y_L2 = "Y[i*t-2]", X_L1 = "X[i*t-1]",
535 | Z_L1 = "Z[i*t-1]", Y_L1 = "Y[i*t-1]",
536 | X = "X[i*t]", Z = "Z[i*t]",
537 | Y = "Y[i*t]", X_T1 = "...",
538 | Z_T1 = "...", U = "upsilon[i]"
539 | )
540 |
541 | ## Creating a More Complex DAG using ggdag syntax
542 | sim_dag <- dagify(
543 | Y_L1 ~ X_L1 + U,
544 | Z_L1 ~ X_L2 + U,
545 | X_L1 ~ Z_L2,
546 | Y ~ X + U,
547 | X ~ Y_L1 + Z,
548 | Z_T1 ~ X,
549 | X_T1 ~ Y,
550 | Z ~ X_L1 + U,
551 | coords = sim_dag_coords,
552 | labels = sim_dag_labels
553 | )
554 |
555 | # Modifications for the contemporaneous effect of X on Y
556 | sim_dag_tidy <- sim_dag %>%
557 | # Convert the DAG to a tibble
558 | tidy_dagitty() %>%
559 | # Create Path-Specific colors and transparencies
560 | mutate(
561 | # Color for the edges
562 | .edge_colour = case_when(
563 | name == "X_L1" & to == "Z" ~ "#7CFC00",
564 | name == "U" ~ "#FF3030",
565 | TRUE ~ "white"
566 | ),
567 | .edge_type = case_when(
568 | name == "X_L1" & to == "Z" ~ "solid",
569 | TRUE ~ "dashed"
570 | ))
571 |
572 | # Generate the DAG for the contemporaneous effect of X on Y
573 | ggplot(data = sim_dag_tidy,
574 | aes(x = x, y = y, xend = xend, yend = yend)
575 | ) +
576 | # Add the graph edges
577 | geom_dag_edges(aes(edge_color = .edge_colour, edge_linetype = .edge_type),
578 | edge_width = 1.5
579 | ) +
580 | # Add the graph nodes
581 | geom_dag_node(alpha = 0) +
582 | # Add the graph text
583 | geom_dag_text(
584 | aes(label = label),
585 | parse = TRUE,
586 | size = 11,
587 | color = "white",
588 | family = "serif",
589 | show.legend = FALSE
590 | ) +
591 | # Apply theme settings
592 | theme_dag(
593 | base_size = 24,
594 | base_family = "serif",
595 | strip.background = element_blank(),
596 | panel.background = element_rect(fill = "transparent", colour = NA),
597 | plot.background = element_rect(fill = "transparent", colour = NA),
598 | legend.background = element_rect(fill = "transparent", colour = NA),
599 | legend.key = element_rect(fill = "transparent", colour = NA),
600 | plot.title = element_text(color = "white"),
601 | plot.subtitle = element_text(color = "white"),
602 | plot.caption = element_text(color = "white", face = "italic"),
603 | legend.title = element_text(color = "white", face = "bold"),
604 | legend.text = element_text(color = "white", face = "bold"),
605 | plot.caption.position = "plot",
606 | legend.position = "top"
607 | ) +
608 | # Add a legend for the edge colors
609 | scale_edge_color_identity() +
610 | # Tweak the legend aesthetics
611 | guides(edge_alpha = "none", edge_color = "none", edge_linetype = "none")
612 | ```
613 |
614 | ## ARDL Model Specification
615 |
616 | ```{=tex}
617 | \begin{align}
618 | \resp{y}_{\obs{i}\tim{t}} &\sim \textit{Normal}(\mu_{\obs{i}\tim{t}}, \epsilon^{2})\\
619 | &\mu_{\obs{i}\tim{t}} = \alpha + \beta_{1}\treat{X}_{\obs{i}\tim{t}} + \beta_{2}\treat{X}_{\obs{i}\tim{t-1}} + \beta_{3}\resp{Y}_{\obs{i}\tim{t-1}} + \beta_{4}\resp{Y}_{\obs{i}\tim{t-2}} +\\
620 | & \quad \beta_{5}\covar{Z}_{\obs{i}\tim{t}} + \beta_{6}\covar{Z}_{\obs{i}\tim{t-1}} + \epsilon\\
621 | \text{with priors}\\
622 | \alpha &\sim \textit{Normal}(\bar{y}, ~ 2 \cdot \sigma_{y}) \quad\quad\quad \beta_{k} \sim \textit{Normal}\left(0, ~ 1.5 \cdot \frac{\sigma_{y}}{\sigma_{x}}\right)\\
623 | \epsilon &\sim \textit{Exponential}\left(\frac{1}{\sigma_{y}}\right) &\\
624 | \end{align}
625 | ```
626 | ## MSM Design Stage Specification
627 |
628 | As illustrated in the equation for the stabilized weights, we specify two separate models for the numerator and denominator with weakly informative independent normal priors on $\alpha$ and $\beta$
629 |
630 | ```{=tex}
631 | \begin{align}
632 | \Pr(\treat{X}_{\obs{i}\tim{t}} = 1 ~|~ \theta_{\obs{i}\tim{t}}) &\sim \textit{Bernoulli}(\theta_{\obs{i}\tim{t}})\\
633 | &\theta_{\obs{i}\tim{t}} = \text{logit}^{-1}(\alpha + X_{n}\beta_{k})\\
634 | \text{with priors}\\
635 | \alpha &\sim \textit{Normal}(0, ~2) \quad \quad \beta_{k} \sim \textit{Normal}(0,~ 1)\\
636 | \end{align}
637 | ```
638 | ::: incremental
639 | - For the numerator model, the matrix $X_{n}$ is simply $\treat{X}_{\obs{i}\tim{t-1}}$
640 |
641 | - For the denominator model, $X_{n} = \{\covar{Z}_{\obs{i}\tim{t}}, ~ \treat{X}_{\obs{i}\tim{t-1}},~ \resp{Y}_{\obs{i}\tim{t-1}}\}$
642 | :::
643 |
644 | ## MSM Outcome Model Specification
645 |
646 | ```{=tex}
647 | \begin{align}
648 | \resp{y}_{\obs{i}\tim{t}} &\sim \textit{Normal}(\mu_{\obs{i}\tim{t}}, \epsilon^{2})^{\tilde{w}_{\obs{i}\tim{t}}}\\
649 | &\mu_{\obs{i}\tim{t}} = \alpha + \beta_{1}\treat{X}_{\obs{i}\tim{t}} + \beta_{2}\treat{X}_{\obs{i}\tim{t-1}} + \epsilon & \\
650 | \text{where}\\
651 | \tilde{w}_{\obs{i}\tim{t}} &\sim \lambda_{\obs{i}\tim{t}} + \delta_{\obs{i}\tim{t}} \cdot \pi{(\delta)}\\
652 | \text{with priors}\\
653 | \alpha &\sim \textit{Normal}(\bar{y}, ~ 2 \cdot \sigma_{y}) \quad \quad \beta_{k} \sim \textit{Normal}\left(0, ~ 1.5 \cdot \frac{\sigma_{y}}{\sigma_{x}}\right)\\
654 | \epsilon &\sim \textit{Exponential}\left(\frac{1}{\sigma_{y}}\right) \quad \quad \delta_{\obs{i}\tim{t}} \sim \textit{Beta}(2, ~ 5)\\
655 | \end{align}
656 | ```
657 | # Simulation Results
658 |
659 | ## Simulation Results
660 |
661 | ```{r sim-results-1, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE, dev='svg'}
662 | # Calculate the loss function
663 | error_by_groups <- sim_results %>%
664 | # Group by dimensions
665 | group_by(id, groups, cond) %>%
666 | # Calculate the mean
667 | summarise(across(
668 | c(MSM_X_Lag, ARDL_Estimate),
669 | ~ sqrt(mean((truth - .x)^2)),
670 | .names = "{.col}_rmse"
671 | )) %>%
672 | # Pivot the result to long form
673 | pivot_longer(cols = ends_with("rmse")) %>%
674 | ## Set the names for the facets
675 | mutate(name = if_else(name == "MSM_X_Lag_rmse", "MSM", "ARDL"))
676 |
677 | # Plot RMSE by number of groups
678 | ggplot(error_by_groups, aes(x = groups, y = value)) +
679 | # Facet the plot by condition
680 | facet_wrap(~ cond) +
681 | geom_point(
682 | aes(fill = name, shape = name),
683 | size = 3,
684 | position = "jitter",
685 | ) * blend("multiply") +
686 | scale_fill_manual(values = c("#00EE76", "#FF4040")) +
687 | scale_shape_manual(values = c(22, 23)) +
688 | scale_x_continuous(breaks = seq(20, 100, 20)) +
689 | scale_y_continuous(limits = c(0, 0.10)) +
690 | labs(
691 | y = latex2exp::TeX(r'($\sqrt{E(\hat{\theta} - \theta)^{2}}$)'),
692 | x = "Number of Groups",
693 | fill = "Model",
694 | shape = "Model",
695 | ) +
696 | fig_theme +
697 | theme(legend.position = "top")
698 | ```
699 |
700 | ## Simulation Results
701 |
702 | ```{r sim-results-2, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE, dev='svg'}
703 | # Calculate the loss function
704 | error_by_periods <- sim_results %>%
705 | # Group by dimensions
706 | group_by(id, periods, cond) %>%
707 |   # Calculate the RMSE for each estimator
708 | summarise(across(
709 | c(MSM_X_Lag, ARDL_Estimate),
710 | ~ sqrt(mean((truth - .x)^2)),
711 | .names = "{.col}_rmse"
712 | )) %>%
713 | # Pivot the result to long form
714 | pivot_longer(cols = ends_with("rmse")) %>%
715 |   ## Set the names for the legend
716 | mutate(name = if_else(name == "MSM_X_Lag_rmse", "MSM", "ARDL"))
717 |
718 | # Plot RMSE by number of periods
719 | ggplot(error_by_periods, aes(x = periods, y = value)) +
720 | # Facet the plot by condition
721 | facet_wrap(~ cond) +
722 | geom_point(
723 | aes(fill = name, shape = name),
724 | size = 3,
725 | position = "jitter"
726 | ) * blend("multiply") +
727 | scale_fill_manual(values = c("#00EE76", "#FF4040")) +
728 | scale_shape_manual(values = c(22, 23)) +
729 | scale_y_continuous(limits = c(0, 0.10)) +
730 | scale_x_continuous(breaks = c(20, 50)) +
731 | labs(
732 | y = latex2exp::TeX(r'($\sqrt{E(\hat{\theta} - \theta)^{2}}$)'),
733 | x = "Number of Periods",
734 | fill = "Model",
735 | shape = "Model",
736 | ) +
737 | fig_theme +
738 | theme(legend.position = "top")
739 | ```
740 |
741 | ## Simulation Results
742 |
743 | ```{r sim-results-3, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE, dev='svg'}
744 | # Calculate the loss function
745 | error_by_cond <- sim_results %>%
746 | # Group by dimensions
747 | group_by(id, cond) %>%
748 |   # Calculate the RMSE for each estimator
749 | summarise(across(
750 | c(MSM_X_Lag, ARDL_Estimate),
751 | ~ sqrt(mean((truth - .x)^2)),
752 | .names = "{.col}_rmse"
753 | )) %>%
754 | # Pivot the result to long form
755 | pivot_longer(cols = ends_with("rmse")) %>%
756 |   ## Set the names for the legend
757 | mutate(name = if_else(name == "MSM_X_Lag_rmse", "MSM", "ARDL"))
758 |
759 | # Plot RMSE by Z exogeneity condition
760 | ggplot(error_by_cond, aes(x = cond, y = value)) +
761 | geom_point(
762 | aes(fill = name, shape = name),
763 | size = 3,
764 | position = "jitter"
765 | ) * blend("multiply") +
766 | scale_fill_manual(values = c("#00EE76", "#FF4040")) +
767 | scale_shape_manual(values = c(22, 23)) +
768 | scale_y_continuous(limits = c(0, 0.10)) +
769 | labs(
770 | y = latex2exp::TeX(r'($\sqrt{E(\hat{\theta} - \theta)^{2}}$)'),
771 | x = "Z Exogeneity",
772 | fill = "Model",
773 | shape = "Model",
774 | ) +
775 | fig_theme +
776 | theme(legend.position = "right")
777 | ```
778 |
779 | ## Simulation Results
780 |
781 | ```{r sim-results-4, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE, dev='svg'}
782 | # Data for plotting the distributions of the estimates
783 | param_results <- sim_results %>%
784 | # Pivot estimates to long form
785 | pivot_longer(
786 | cols = c(MSM_X_Lag:MSM_X, ARDL_X, ARDL_Estimate),
787 | names_to = "param",
788 | values_to = "estimate"
789 | ) %>%
790 | # Make facet labels
791 | mutate(
792 | parameter = case_when(
793 | param == "MSM_X_Lag" ~ latex2exp::TeX(r'(Distribution of MSM Estimates: $X_{it-1}$)', output = "character"),
794 | param == "ARDL_Estimate" ~ latex2exp::TeX(r'(Distribution of ARDL Estimates: $X_{it-1}$)', output = "character"),
795 | param == "MSM_X" ~ latex2exp::TeX(r'(Distribution of MSM Estimates: $X_{it}$)', output = "character"),
796 | param == "ARDL_X" ~ latex2exp::TeX(r'(Distribution of ARDL Estimates: $X_{it}$)', output = "character")
797 | ),
798 | model = case_when(
799 | str_detect(param, "MSM") ~ "MSM",
800 | str_detect(param, "ARDL") ~ "ARDL",
801 | ),
802 | truth = case_when(
803 | str_detect(param, "X_Lag|Estimate") ~ 0,
804 | str_detect(param, "MSM_X|ARDL_X") ~ -0.1
805 | )
806 | )
807 |
808 | # Distribution of the means for the lag of x
809 | param_results %>%
810 | filter(truth == 0) %>%
811 | ggplot(., aes(x = estimate, y = cond)) +
812 | facet_wrap(~ parameter, labeller = label_parsed) +
813 | stat_slabinterval(
814 | aes(slab_alpha = stat(pdf), fill = model, shape = model),
815 | fill_type = "gradient",
816 | point_interval = "mean_qi",
817 | show.legend = FALSE
818 | ) +
819 | geom_vline(xintercept = 0, lty = "dashed", color = "white") +
820 | scale_shape_manual(values = c(22, 23)) +
821 | scale_fill_manual(values = c("#00EE76", "#FF4040")) +
822 | labs(
823 | y = "",
824 | x = "Posterior Mean Estimates",
825 | subtitle = "True Parameter Value is 0"
826 | ) +
827 | fig_theme +
828 | theme(legend.position = "top")
829 | ```
830 |
831 | ## Simulation Results
832 |
833 | ```{r sim-results-5, echo=FALSE, dpi=600, dev.args = list(bg = 'transparent'), fig.height=9, fig.width=16, fig.align='center', cache=TRUE, dev='svg'}
834 | # Distribution of the means for x
835 | param_results %>%
836 | filter(truth == -0.1) %>%
837 | ggplot(., aes(x = estimate, y = cond)) +
838 | facet_wrap(~ parameter, labeller = label_parsed) +
839 | stat_slabinterval(
840 | aes(slab_alpha = stat(pdf), fill = model, shape = model),
841 | fill_type = "gradient",
842 | point_interval = "mean_qi",
843 | show.legend = FALSE
844 | ) +
845 | geom_vline(xintercept = -0.1, lty = "dashed") +
846 | scale_fill_manual(values = c("#00EE76", "#FF4040")) +
847 | scale_shape_manual(values = c(22, 23)) +
848 | labs(
849 | y = "",
850 | x = "Posterior Mean Estimates",
851 | subtitle = "True Parameter Value is -0.10"
852 | ) +
853 | fig_theme +
854 | theme(legend.position = "top")
855 | ```
856 |
857 | # Conclusions
858 |
859 | ## Conclusion
860 |
861 | ::: incremental
862 | - Overall, our proposed procedure performs well in terms of parameter recovery under fairly general conditions
863 |
864 | - Going forward, we need to apply this procedure to real-world political science examples
865 |
866 | - Planned R package implementing our procedure by building on the `{brms}` package as a back end
867 |
868 |   - Makes the procedure accessible to anyone who knows standard R model syntax
869 | :::
870 |
871 | ## References
872 |
--------------------------------------------------------------------------------
/presentation/math-colors.js:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/presentation/style.css:
--------------------------------------------------------------------------------
1 | h1.title {
2 | font-size: 80px;
3 | }
4 |
5 | .csl-entry {
6 | padding-bottom: 0.5em;
7 | }
8 |
9 | ol.aside-footnotes {
10 | padding-top: 3em;
11 | font-size: 14pt;
12 | }
13 |
14 | .reveal a {
15 | color: #eee;
16 | }
--------------------------------------------------------------------------------
/scripts/01-simulation-study.R:
--------------------------------------------------------------------------------
1 | #--------------------------Bayesian MSM Simulation------------------------------
2 | #-Author: A. Jordan Nafa------------------------------Created: August 28, 2022-#
3 | #-R Version: 4.1.2----------------------------------Revised: September 4, 2022-#
4 |
5 | # Set Project Options----
6 | options(
7 | digits = 4, # Significant figures output
8 | scipen = 999, # Disable scientific notation
9 | repos = getOption("repos")["CRAN"],
10 | mc.cores = 12L
11 | )
12 |
13 | # Load the necessary libraries----
14 | pacman::p_load(
15 | "tidyverse",
16 | "data.table",
17 | "dtplyr",
18 | "brms",
19 | "cmdstanr",
20 | "posterior",
21 | "bayesplot",
22 | "furrr",
23 | "ggblend",
24 | "tidybayes",
25 | "patchwork",
26 | install = FALSE
27 | )
28 |
29 | # Load the functions for the simulations
30 | .helpers <- map(
31 | .x = list.files(
32 | path = "functions/",
33 |     pattern = "\\.R$",
34 | full.names = TRUE
35 | ),
36 | ~ source(.x)
37 | )
38 |
39 | #------------------------------------------------------------------------------#
40 | #------------------------------Data Pre-Processing------------------------------
41 | #------------------------------------------------------------------------------#
42 |
43 | # Set the rng seed
44 | set.seed(1234567)
45 |
46 | # Simulate 2,000 datasets of varying dimensions
47 | reference_df <- expand.grid(
48 | groups = c(25, 45, 65, 85, 100),
49 | periods = c(20, 50),
50 | gamma = c(0, -0.5),
51 | treat_conf = FALSE,
52 | id = 1:100
53 | ) %>%
54 | mutate(
55 | # Calculate the sample size
56 | N = groups*periods,
57 | # Condition labels
58 | cond = if_else(gamma == 0, "Z Exogenous", "Z Endogenous")
59 | ) %>%
60 | # Nest the data by columns
61 | nest(sim_pars = c(groups, periods, treat_conf, gamma)) %>%
62 | # Simulate the datasets
63 | mutate(sim_data = map(
64 | .x = sim_pars,
65 | ~ dgp_sim(
66 | .groups = .x$groups,
67 | .periods = .x$periods,
68 | .true_gamma = .x$gamma,
69 | .treat_conf = .x$treat_conf
70 | )
71 | )) %>%
72 | # Unnest the data dimensions
73 | unnest(cols = sim_pars)
74 |
75 | #------------------------------------------------------------------------------#
76 | #-------------------------------MSM Simulations---------------------------------
77 | #------------------------------------------------------------------------------#
78 |
79 | # Compile the Stan model
80 | msm_sim_mod <- cmdstan_model("models/stan/IPTW_Outcome_Simulation.stan")
81 |
82 | # Fit models in parallel across 4 worker processes
83 | plan(tweak(multisession, workers = 4))
84 |
85 | # Estimate the weights and the outcome model for each simulated dataset
86 | msm_estimates <- reference_df %>%
87 | mutate(
88 | # Calculate the design stage msm for each dataset
89 | sim_data = map(
90 | .x = sim_data,
91 | ~ sim_msm_bayes_design(.x)
92 | ),
93 | # Build the data lists for the outcome model
94 | stan_data = future_map(
95 | .x = sim_data,
96 | .options = furrr_options(
97 | scheduling = 1,
98 | seed = TRUE,
99 | prefix = "prefix"
100 | ),
101 | .f = ~ make_msm_data(.x, shape_prior = c(2, 5)),
102 | .progress = TRUE
103 | ),
104 |     # Fit the outcome stage MSM for each dataset
105 | outcome_estimates = future_map(
106 | .x = stan_data,
107 | .options = furrr_options(
108 | scheduling = 1,
109 | seed = TRUE,
110 | prefix = "prefix"
111 | ),
112 | .f = ~ sim_msm_bayes_outcome(.x, msm_sim_mod),
113 | .progress = TRUE
114 | ))
115 |
116 | # Write the data frames with weights to a file because this takes
117 | # a long time to run
118 | write_rds(msm_estimates, "data/msm_sims.rds")
119 |
120 | #------------------------------------------------------------------------------#
121 | #-------------------------------ARDL Simulations--------------------------------
122 | #------------------------------------------------------------------------------#
123 |
124 | # Compile the Stan model
125 | ardl_sim_mod <- cmdstan_model("models/stan/ARDL_Simulation.stan")
126 |
127 | # Fit models in parallel across 4 worker processes
128 | plan(tweak(multisession, workers = 4))
129 |
130 | # Estimate the naive ARDL model for each simulated dataset
131 | ardl_estimates <- msm_estimates %>%
132 | mutate(
133 | # Build the data lists for the outcome model
134 | ardl_stan_data = future_map(
135 | .x = sim_data,
136 | .options = furrr_options(
137 | scheduling = 1,
138 | seed = TRUE,
139 | prefix = "prefix"
140 | ),
141 | .f = ~ make_ardl_data(
142 | .x,
143 | ardl.form = Y ~ X + X_Lag + Y_Lag + Y_Lag_2 + Z + Z_Lag,
144 | prior.scale = 1.5
145 | ),
146 | .progress = TRUE
147 | ),
148 |     # Fit the naive ARDL model for each dataset
149 | ardl_estimates = future_map(
150 | .x = ardl_stan_data,
151 | .options = furrr_options(
152 | scheduling = 1,
153 | seed = TRUE,
154 | prefix = "prefix"
155 | ),
156 | .f = ~ sim_ardl_bayes(.x, ardl_sim_mod),
157 | .progress = TRUE
158 | ))
159 |
160 | ## Write the updated version to a file
161 | write_rds(ardl_estimates, "data/final_sims.rds")
162 |
163 | #------------------------------------------------------------------------------#
164 | #------------------------Build the data frame of Results------------------------
165 | #------------------------------------------------------------------------------#
166 |
167 | # Create a new tibble with the information for the graphs
168 | sim_results <- ardl_estimates %>%
169 | transmute(
170 | across(c(id:periods, gamma)),
171 | MSM_X_Lag = NA_real_,
172 | MSM_X = NA_real_,
173 | ARDL_X_Lag = NA_real_,
174 | ARDL_Y_Lag = NA_real_,
175 | ARDL_X = NA_real_,
176 | truth = 0
177 | )
178 |
179 | # Pull the posterior mean estimates for each model
180 | for (i in seq_along(ardl_estimates$sim_data)) {
181 | # MSM Estimate for X[t-1]
182 | sim_results[i, "MSM_X_Lag"] <- ardl_estimates$outcome_estimates[[i]] %>%
183 | .[4, "mean"]
184 |   # MSM Estimate for X[t]
185 | sim_results[i, "MSM_X"] <- ardl_estimates$outcome_estimates[[i]] %>%
186 | .[3, "mean"]
187 | # ARDL Estimate for X[t-1]
188 | sim_results[i, "ARDL_X_Lag"] <- ardl_estimates$ardl_estimates[[i]] %>%
189 | .[.$variable == "b[2]", "mean"]
190 | # ARDL Estimate for Y[t-1]
191 | sim_results[i, "ARDL_Y_Lag"] <- ardl_estimates$ardl_estimates[[i]] %>%
192 | .[.$variable == "b[4]", "mean"]
193 | # ARDL Estimate for X
194 | sim_results[i, "ARDL_X"] <- ardl_estimates$ardl_estimates[[i]] %>%
195 | .[.$variable == "b[1]", "mean"]
196 | }
197 |
198 | # Calculate the implied ARDL estimate of the effect of X[t-1]: beta[X_Lag] + beta[Y_Lag] * beta[X]
199 | sim_results <- sim_results %>%
200 | mutate(ARDL_Estimate = ARDL_X_Lag + ARDL_Y_Lag * ARDL_X)
201 |
202 | ## Write the updated version to a file
203 | write_rds(sim_results, "data/sim_results.rds")
204 |
205 | #------------------------------------------------------------------------------#
206 | #--------------------------Verify Parameter Recovery----------------------------
207 | #------------------------------------------------------------------------------#
208 |
209 | # Calculate the loss function
210 | error_by_groups <- sim_results %>%
211 | # Group by dimensions
212 | group_by(id, groups, cond) %>%
213 |   # Calculate the RMSE for each estimator
214 | summarise(across(
215 | c(MSM_X_Lag, ARDL_Estimate),
216 | ~ sqrt(mean((truth - .x)^2)),
217 | .names = "{.col}_rmse"
218 | )) %>%
219 | # Pivot the result to long form
220 | pivot_longer(cols = ends_with("rmse")) %>%
221 |   ## Set the names for the legend
222 |   mutate(name = if_else(name == "MSM_X_Lag_rmse", "MSM", "ARDL"))
223 |
224 | # Plot RMSE by number of groups
225 | bias_groups_plot <- ggplot(error_by_groups, aes(x = groups, y = value)) +
226 | # Facet the plot by condition
227 | facet_wrap(~ cond) +
228 | geom_point(
229 | aes(fill = name, shape = name),
230 | size = 3,
231 | position = "jitter"
232 | ) * blend("multiply") +
233 | scale_fill_manual(values = c("#9400D3", "#00CD00")) +
234 | scale_shape_manual(values = c(22, 23)) +
235 | scale_x_continuous(breaks = seq(20, 100, 20)) +
236 | scale_y_continuous(limits = c(0, 0.10)) +
237 | labs(
238 | y = latex2exp::TeX(r'($\sqrt{E(\hat{\theta} - \theta)^{2}}$)'),
239 | x = "Number of Groups",
240 |     fill = "Model",
241 |     shape = "Model",
242 | ) +
243 | theme_bw(base_family = "serif", base_size = 30) +
244 | theme(legend.position = "top")
245 |
246 | # Calculate the loss function
247 | error_by_periods <- sim_results %>%
248 | # Group by dimensions
249 | group_by(id, periods, cond) %>%
250 |   # Calculate the RMSE for each estimator
251 | summarise(across(
252 | c(MSM_X_Lag, ARDL_Estimate),
253 | ~ sqrt(mean((truth - .x)^2)),
254 | .names = "{.col}_rmse"
255 | )) %>%
256 | # Pivot the result to long form
257 | pivot_longer(cols = ends_with("rmse")) %>%
258 |   ## Set the names for the legend
259 |   mutate(name = if_else(name == "MSM_X_Lag_rmse", "MSM", "ARDL"))
260 |
261 | # Plot RMSE by number of periods
262 | bias_periods_plot <- ggplot(error_by_periods, aes(x = periods, y = value)) +
263 | # Facet the plot by condition
264 | facet_wrap(~ cond) +
265 | geom_point(
266 | aes(fill = name, shape = name),
267 | size = 3,
268 | position = "jitter"
269 | ) * blend("multiply") +
270 | scale_fill_manual(values = c("#9400D3", "#00CD00")) +
271 | scale_shape_manual(values = c(22, 23)) +
272 | scale_y_continuous(limits = c(0, 0.10)) +
273 | scale_x_continuous(breaks = c(20, 50)) +
274 | labs(
275 | y = latex2exp::TeX(r'($\sqrt{E(\hat{\theta} - \theta)^{2}}$)'),
276 | x = "Number of Periods",
277 |     fill = "Model",
278 |     shape = "Model",
279 | ) +
280 | theme_bw(base_family = "serif", base_size = 30) +
281 | theme(legend.position = "top")
282 |
283 | # Patch the two plots into one
284 | rmse_plots <- bias_periods_plot/bias_groups_plot +
285 | plot_layout(guides = "collect") &
286 | theme(
287 | legend.position = "top",
288 | strip.background = element_blank(),
289 | strip.text = element_text(face = "bold")
290 | )
291 |
292 | # Write the plot to a file
293 | ggsave(
294 | "rmse_lagx_sim_results.jpeg",
295 | plot = rmse_plots,
296 | device = "jpeg",
297 | path = "figures",
298 | width = 16,
299 | height = 16,
300 | units = "in",
301 | dpi = "retina",
302 | type = "cairo"
303 | )
304 |
305 | # Data for plotting the distributions of the estimates
306 | param_results <- sim_results %>%
307 | # Pivot estimates to long form
308 | pivot_longer(
309 | cols = c(MSM_X_Lag:MSM_X, ARDL_X, ARDL_Estimate),
310 | names_to = "param",
311 | values_to = "estimate"
312 | ) %>%
313 | # Make facet labels
314 | mutate(
315 | parameter = case_when(
316 | param == "MSM_X_Lag" ~ latex2exp::TeX(r'(Distribution of MSM Estimates: $X_{it-1}$)', output = "character"),
317 | param == "ARDL_Estimate" ~ latex2exp::TeX(r'(Distribution of ARDL Estimates: $X_{it-1}$)', output = "character"),
318 | param == "MSM_X" ~ latex2exp::TeX(r'(Distribution of MSM Estimates: $X_{it}$)', output = "character"),
319 | param == "ARDL_X" ~ latex2exp::TeX(r'(Distribution of ARDL Estimates: $X_{it}$)', output = "character")
320 | ),
321 | model = case_when(
322 | str_detect(param, "MSM") ~ "MSM",
323 | str_detect(param, "ARDL") ~ "ARDL",
324 | ),
325 | truth = case_when(
326 | str_detect(param, "X_Lag|Estimate") ~ 0,
327 | str_detect(param, "MSM_X|ARDL_X") ~ -0.1
328 | )
329 | )
330 |
331 | # Distribution of the means for the lag of x
332 | x_lag_estimates <- param_results %>%
333 | filter(truth == 0) %>%
334 | ggplot(., aes(x = estimate, y = cond)) +
335 | facet_wrap(~ parameter, labeller = label_parsed) +
336 | stat_slabinterval(
337 | aes(slab_alpha = stat(pdf), fill = model, shape = model),
338 | fill_type = "gradient",
339 | point_interval = "mean_qi",
340 | show.legend = FALSE
341 | ) +
342 | geom_vline(xintercept = 0, lty = "dashed") +
343 | scale_shape_manual(values = c(22, 23)) +
344 | scale_fill_manual(values = palettetown::pokepal(2, 4)) +
345 | labs(
346 | y = "",
347 | x = "Posterior Mean Estimates"
348 | ) +
349 | theme_bw(base_family = "serif", base_size = 30) +
350 | theme(
351 | legend.position = "top",
352 | strip.background = element_blank(),
353 | strip.text = element_text(face = "bold")
354 | )
355 |
356 | # Distribution of the means for x
357 | x_contemp_estimates <- param_results %>%
358 | filter(truth == -0.1) %>%
359 | ggplot(., aes(x = estimate, y = cond)) +
360 | facet_wrap(~ parameter, labeller = label_parsed) +
361 | stat_slabinterval(
362 | aes(slab_alpha = stat(pdf), fill = model, shape = model),
363 | fill_type = "gradient",
364 | point_interval = "mean_qi",
365 | show.legend = FALSE
366 | ) +
367 | geom_vline(xintercept = -0.1, lty = "dashed") +
368 | scale_fill_manual(values = palettetown::pokepal(2, 4)) +
369 | scale_shape_manual(values = c(22, 23)) +
370 | labs(
371 | y = "",
372 | x = "Posterior Mean Estimates"
373 | ) +
374 | theme_bw(base_family = "serif", base_size = 30) +
375 | theme(
376 | legend.position = "top",
377 | strip.background = element_blank(),
378 | strip.text = element_text(face = "bold")
379 | )
380 |
381 | # Combine the two plots in to one
382 | dist_plots <- x_lag_estimates/x_contemp_estimates +
383 | plot_layout(guides = "collect") &
384 | theme(
385 | legend.position = "top",
386 | strip.background = element_blank(),
387 | strip.text = element_text(face = "bold")
388 | )
389 |
390 | # Write the plot to a file
391 | ggsave(
392 | "sim_results_distributions.jpeg",
393 | plot = dist_plots,
394 | device = "jpeg",
395 | path = "figures",
396 | width = 16,
397 | height = 16,
398 | units = "in",
399 | dpi = "retina",
400 | type = "cairo"
401 | )
402 |
--------------------------------------------------------------------------------