├── .gitattributes ├── .gitignore ├── BibTexDatabase.bib ├── README.Rmd ├── README.md ├── beta_logit.stan ├── beta_logit_infl.stan ├── beta_logit_infl_simple.stan ├── beta_logit_infl_v2.stan ├── beta_logit_infl_v3.stan ├── beta_logit_infl_v4.stan ├── beta_logit_phireg.stan ├── dag_example.R ├── data ├── W28_Aug17 │ ├── ATP W28 methodology.doc │ ├── ATP W28 que.docx │ ├── ATP W28 readme.txt │ ├── ATP W28 topline.docx │ ├── ATP W28.sav │ └── ~$P W28 que.docx ├── all_vars_Beta_trans.rds ├── all_vars_frac.rds ├── all_vars_ord.rds ├── all_vars_zoib.rds ├── compare_prob.rds ├── sample_all.rds ├── sim_cont_X.RData ├── sim_cont_X.rds ├── sim_cont_X_fixed.RData ├── sim_cont_X_fixed.rds └── suffrage_paper_replicationfiles │ ├── EER-D-13-00718R2_Event_extended.do │ ├── EER-D-13-00718R2_Event_main.do │ ├── EER-D-13-00718R2_Event_study_extended_sample_reform.dta │ ├── EER-D-13-00718R2_Event_study_extended_sample_reversals.dta │ ├── EER-D-13-00718R2_Event_study_main_sample_reform.dta │ ├── EER-D-13-00718R2_Event_study_main_sample_reversals.dta │ ├── EER-D-13-00718R2_READ_ME.pdf │ ├── EER-D-13-00718R2_maindata_suffrage.dta │ ├── EER-D-13-00718R2_reversals_suffrage.dta │ └── EER-D-13-00718R2_suffrage.do ├── define_ord_betareg.R ├── estimate_with_brms.Rmd ├── estimate_with_brms.html ├── figures ├── figure_1.pdf ├── figure_2.pdf ├── figure_3.pdf ├── figure_4.pdf ├── figure_5.pdf ├── figure_6.pdf └── figure_7.pdf ├── frac_logit.stan ├── helper_func.R ├── install.R ├── kubinec_ord_betareg.html ├── kubinec_ord_betareg_accepted.Rmd ├── kubinec_ord_betareg_accepted_dataverse.Rmd ├── kubinec_ord_betareg_appendix.Rmd ├── kubinec_ord_betareg_appendix_anon.Rmd ├── limited_dv_modeling.Rmd ├── master.R ├── old_code.R ├── ordbetareg.Rproj ├── ordered_beta_reg.R ├── ordered_beta_reg_sim.R ├── ordered_beta_reg_sim_fixed.R ├── preamble.tex ├── zoib.stan └── zoib_nophireg.stan /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-vendored 2 | preamble.tex linguist-vendored 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | author: "Robert Kubinec" 4 | date: "March 1st, 2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | This repository contains data and code for the paper, "Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Survey Sliders and Visual Analog Scales." The repository includes the following files: 13 | 14 | - `kubinec_ord_betareg.Rmd` A reproducible Rmarkdown document that can be run in Rstudio to re-produce the results. Note that the `data` folder in this repository contains necessary data to reproduce results. 15 | - `estimate_with_brms.Rmd` and `estimate_with_brms.html` These files show how to run ordered beta regression models using the R package [`brms`](https://cran.r-project.org/web/packages/brms/vignettes/brms_overview.pdf). 16 | - `define_ord_betareg.R` This R script contains all the auxiliary code needed to fit the model with R package `brms` (see vignette above for more info). 
17 | - `ordered_beta_reg_sim.R` This R script will run a simulation comparing the ordered beta regression model to alternatives, including the zero-one-inflated Beta regression model (ZOIB). The output of a 10,000-iteration run of this simulation is in `data/`. 18 | - `beta_logit.stan` This file contains the Stan code used to fit an ordered beta regression model in Stan. 19 | - `zoib.stan` This file contains Stan code used to fit the zero-one-inflated beta regression model (ZOIB). 20 | - `beta_logit_phireg.stan` This file contains Stan code to fit an ordered beta regression model with additional predictors for phi, the scale parameter in the distribution. These additional parameters allow for estimating the effect of covariates on whether respondents give clustered or more dispersed (extreme) responses. 21 | - `beta_logit_infl*.stan` These additional Stan files are various ways of parameterizing the midpoint of the scale when the midpoint is considered missing data. None of them appears to predict the outcome better than versions that treat the midpoint as observed data. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | README 2 | ================ 3 | Robert Kubinec 4 | January 11th, 2022 5 | 6 | **Please note: This is the repository for the paper files. To access the R package `ordbetareg`, please go to this repository: www.github.com/saudiwin/ordbetareg_pack.** 7 | 8 | This repository contains data and code for the paper, “Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Continuous Data with Lower and Upper Bounds”, which is now forthcoming at the journal *Political Analysis*. An ungated preprint can be found here: https://osf.io/preprints/socarxiv/2sx6y/. Replication files can be found on both Dataverse and GitHub. 9 | 10 | To replicate the paper, please first run the `install.R` script to make sure all relevant packages are installed. The script will also install `cmdstanr` and a version of `cmdstan`, the underlying MCMC sampling library from the Stan project. Installing `cmdstan` requires a working C++ toolchain; if you have any trouble or are unsure, see the `cmdstanr` package installation instructions: https://mc-stan.org/cmdstanr/articles/cmdstanr.html. 11 | 12 | The file `master.R` will then run all the necessary scripts to compile the paper and supplementary information (compilation requires a working LaTeX installation). Note that `master.R` by default loads the existing simulation data in the `data` folder. To fully reproduce the simulation, set the `run_sim` variable in `master.R` to `TRUE`. Note that running the full simulation can take up to a few days on a machine with ~40 cores. 
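In practice, replication comes down to two `source()` calls from the repository root. A minimal sketch of the workflow just described (assuming the working directory is set to the repository root):

``` r
# install all required packages, plus cmdstanr and cmdstan
source("install.R")

# master.R loads the saved simulation data in data/ by default;
# to re-run the full simulation (up to a few days on ~40 cores),
# first set run_sim <- TRUE inside master.R
source("master.R")
```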
13 | 14 | The version of R used for these results was 4.1.2, and the R package versions are as follows: 15 | 16 | - dplyr: 1.0.7 17 | - rstanarm: 2.21.1 18 | - tidyr: 1.1.4 19 | - lubridate: 1.8.0 20 | - loo: 2.4.1 21 | - kableExtra: 1.3.4 22 | - bayesplot: 1.8.1 23 | - patchwork: 1.1.1 24 | - stringr: 1.4.0 25 | - grDevices: 4.1.2 26 | - emojifont: 0.5.5 27 | - latex2exp: 0.5.0 28 | - haven: 2.4.3 29 | - ggplot2: 3.3.5 30 | - posterior: 1.2.0 31 | - brms: 2.16.3 32 | - remotes: 2.4.2 33 | - future.apply: 1.8.1 34 | - faux: 1.1.0 35 | - rmarkdown: 2.11 36 | - bookdown: 0.24 37 | - tinytex: 0.36 38 | - extrafont: 0.17 39 | 40 | The repository includes the following files: 41 | 42 | - `kubinec_ord_betareg_accepted.Rmd` The accepted version of the reproducible Rmarkdown document that can 43 | be run in RStudio to reproduce the results. Note that the `data` 44 | folder in this repository contains the data necessary to reproduce the 45 | results. 46 | - `estimate_with_brms.Rmd` and `estimate_with_brms.html` These files 47 | show how to run ordered beta regression models using the R package 48 | [`brms`](https://cran.r-project.org/web/packages/brms/vignettes/brms_overview.pdf). 49 | - `define_ord_betareg.R` This R script contains all the auxiliary code 50 | needed to fit the model with the R package `brms` (see the vignette above 51 | for more info). 52 | - `*_fit.rds` Fitted model object files to reproduce the paper results much faster. 53 | - `data/sim_cont_X*.RData` Simulation results to reproduce the paper results much faster. 54 | - `data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta` Data from Toke and Aidt (2012). 55 | - `ordered_beta_reg_sim.R` This R script will run a simulation 56 | comparing the ordered beta regression model to alternatives, 57 | including the zero-one-inflated Beta regression model (ZOIB). The 58 | output of a 10,000-iteration run of this simulation is saved in `data/` as `sim_cont_X.RData`. 59 | - `ordered_beta_reg_sim_fixed.R` This R script will run a simulation 60 | comparing the ordered beta regression model to alternatives, but with fixed rather than random draws 61 | of the relevant parameters (results are in the SI, not the main paper). The 62 | output of a 4,000-iteration run of this simulation is saved in `data/` as `sim_cont_X_fixed.RData`. 63 | - `beta_logit.stan` This file contains the Stan code used to fit an 64 | ordered beta regression model in Stan. 65 | - `zoib_nophireg.stan` This file contains Stan code used to fit the 66 | zero-one-inflated beta regression model (ZOIB). 67 | - `beta_logit_phireg.stan` This file contains Stan code to fit an 68 | ordered beta regression model with additional predictors for phi, 69 | the scale parameter in the distribution. These additional parameters 70 | allow for estimating the effect of covariates on whether respondents give 71 | clustered or more dispersed (extreme) responses. 72 | - `frac_logit.stan` This file contains a Stan parameterization of the 73 | fractional logit model. 74 | - `beta_logit_infl*.stan` These additional Stan files are various ways 75 | of parameterizing the midpoint of the scale when the midpoint is 76 | considered missing data. None of them appears to predict the 77 | outcome better than versions that treat the midpoint as 78 | observed data. 79 | - `BibTexDatabase.bib` References necessary to compile the paper. 
80 | - `preamble.tex` LaTeX packages for the paper 81 | -------------------------------------------------------------------------------- /beta_logit.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analyzing experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | 8 | functions { 9 | 10 | // prior from Michael Betancourt for ordered cutpoints 11 | real induced_dirichlet_lpdf(vector c, vector alpha, real phi) { 12 | int K = num_elements(c) + 1; 13 | vector[K - 1] sigma = inv_logit(phi - c); 14 | vector[K] p; 15 | matrix[K, K] J = rep_matrix(0, K, K); 16 | 17 | // Induced ordinal probabilities 18 | p[1] = 1 - sigma[1]; 19 | for (k in 2:(K - 1)) 20 | p[k] = sigma[k - 1] - sigma[k]; 21 | p[K] = sigma[K - 1]; 22 | 23 | // Baseline column of Jacobian 24 | for (k in 1:K) J[k, 1] = 1; 25 | 26 | // Diagonal entries of Jacobian 27 | for (k in 2:K) { 28 | real rho = sigma[k - 1] * (1 - sigma[k - 1]); 29 | J[k, k] = - rho; 30 | J[k - 1, k] = rho; 31 | } 32 | 33 | return dirichlet_lpdf(p | alpha) 34 | + log_determinant(J); 35 | } 36 | } 37 | data { 38 | int N_prop; // number of proportion observations (0,1) 39 | int N_degen; // number of 0/1 observations 40 | int X; // number of predictors 41 | vector[N_prop] outcome_prop; // Y in (0,1) 42 | int outcome_degen[N_degen]; // Y in {0,1} 43 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 44 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 45 | int N_pred_degen; // number of posterior predictive samples for 0/1 46 | int N_pred_prop; // number of posterior predictive samples for (0,1) 47 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 48 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 49 | int run_gen; // whether to use generated quantities 50 | } 51 | parameters { 52 | vector[X] X_beta; // common predictor 53 | real alpha; // common intercept 54 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 55 | real<lower=0> kappa; // scale parameter for beta regression (positive; lower bound restored, as the exponential prior implies) 56 | } 57 | transformed parameters { 58 | // store matrix calculations 59 | 60 | vector[N_degen] calc_degen; 61 | vector[N_prop] calc_prop; 62 | 63 | // drop the intercepts so everything is relative to the cutpoints 64 | if(N_degen>0) { 65 | calc_degen = alpha + covar_degen*X_beta; 66 | } 67 | 68 | //print(calc_degen[1:10]); 69 | 70 | calc_prop = alpha + covar_prop*X_beta; 71 | 72 | //print(calc_prop[1:10]); 73 | 74 | } 75 | model { 76 | 77 | // vague priors 78 | X_beta ~ normal(0,5); 79 | alpha ~ normal(0,5); 80 | kappa ~ exponential(.1); 81 | //cutpoints[2] - cutpoints[1] ~ normal(0,3); 82 | // induced dirichlet prior on cutpoints: 83 | 84 | target += induced_dirichlet_lpdf(cutpoints | rep_vector(1, 3), 0); 85 | 86 | // need separate counters for logit (0/1) and beta regression 87 | if(N_degen>0) { 88 | for(n in 1:N_degen) { 89 | if(outcome_degen[n]==0) { 90 | // Pr(Y==0) 91 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 92 | } else { 93 | //Pr(Y==1) 94 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 95 | } 96 | } 97 | } 98 | 99 | 100 | for(n in 1:N_prop) { 101 | // Pr(Y in (0,1)) 102 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - 
inv_logit(calc_prop[n] - cutpoints[2])); 103 | // Pr(Y==x where x in (0,1)) 104 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),kappa); 105 | } 106 | 107 | } 108 | 109 | generated quantities { 110 | 111 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 112 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 113 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_epred; // expected value of the outcome (sum over all_pr below) 114 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 115 | 116 | if(run_gen==1) { 117 | 118 | row_vector[2] all_pr[N_pred_degen+N_pred_prop]; 119 | 120 | if(N_pred_degen>0) { 121 | // first do degenerate outcomes 122 | // note: these could be *re-generated* as beta/proportions 123 | for(i in 1:num_elements(indices_degen)) { 124 | 125 | // draw an outcome 0 / prop / 1 126 | regen_degen[i] = ordered_logistic_rng(alpha + covar_degen[indices_degen[i],]*X_beta,cutpoints); 127 | 128 | if(outcome_degen[indices_degen[i]]==0) { 129 | ord_log[i] = log1m_inv_logit(calc_degen[indices_degen[i]] - cutpoints[1]); 130 | } else { 131 | ord_log[i] = log_inv_logit(calc_degen[indices_degen[i]] - cutpoints[2]); 132 | } 133 | 134 | // don't need zero 135 | 136 | all_pr[i] = [log(inv_logit(calc_degen[indices_degen[i]] - cutpoints[1]) - inv_logit(calc_degen[indices_degen[i]] - cutpoints[2])) + log_inv_logit(calc_degen[indices_degen[i]]), 137 | log_inv_logit(calc_degen[indices_degen[i]] - cutpoints[2])]; 138 | 139 | if(regen_degen[i]==1) { 140 | regen_all[i] = 0; 141 | } else if(regen_degen[i]==3) { 142 | regen_all[i] = 1; 143 | } else { 144 | // did not occur in original data but could re-occur probabilistically 145 | regen_all[i] = beta_proportion_rng(inv_logit(alpha + covar_degen[indices_degen[i],]*X_beta),kappa); 146 | } 147 | } 148 | } 149 | if(N_pred_prop>0) { 150 | // now do originally proportional outcomes 151 | // can be re-generated as 0s or 1s 152 | 153 | int skip = num_elements(indices_degen); 154 | 155 | for(i in 1:num_elements(indices_prop)) { 156 | 157 | all_pr[i+skip] = [log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + log_inv_logit(calc_prop[indices_prop[i]]), 158 | log_inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])]; 159 | 160 | // draw an outcome 0 / prop / 1 161 | regen_degen[i+skip] = ordered_logistic_rng(alpha + covar_prop[indices_prop[i],]*X_beta,cutpoints); 162 | 163 | ord_log[i+skip] = log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + 164 | beta_proportion_lpdf(outcome_prop[indices_prop[i]]|inv_logit(calc_prop[indices_prop[i]]),kappa); 165 | 166 | if(regen_degen[i+skip]==1) { 167 | regen_all[i+skip] = 0; 168 | } else if(regen_degen[i+skip]==3) { 169 | regen_all[i+skip] = 1; 170 | } else { 171 | // did not occur in original data but could re-occur probabilistically 172 | regen_all[i+skip] = beta_proportion_rng(inv_logit(alpha + covar_prop[indices_prop[i],]*X_beta),kappa); 173 | } 174 | 175 | } 176 | } 177 | 178 | for(i in 1:(N_pred_degen + N_pred_prop)) { 179 | 180 | regen_epred[i] = sum(exp(all_pr[i])); 181 | 182 | } 183 | } 184 | 185 | } 186 | 187 | -------------------------------------------------------------------------------- /beta_logit_infl.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analyzing experimental outcomes 3 | // 
with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[3] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? 
N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 68 | 69 | // need separate counters for logit (0/1) and beta regression 70 | 71 | for(n in 1:N_degen) { 72 | 73 | if(outcome_degen[n]==0) { 74 | // Pr(Y==0) 75 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 76 | } else { 77 | //Pr(Y==1) 78 | target += log_inv_logit(calc_degen[n] - cutpoints[2]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 79 | } 80 | } 81 | if(infl_value<0) { 82 | for(n in 1:N_prop) { 83 | // Pr(Y in (0,1)) 84 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 85 | // Pr(Y==x where x in (0,1)) 86 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 87 | } 88 | } else { 89 | for(n in 1:N_prop) { 90 | 91 | // Pr(Y in (0,1)) 92 | 93 | real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 94 | 95 | 96 | // inflate the outcome 97 | if(infl_this[n]==1) { 98 | //target += bernoulli_logit_lpmf(1|calc_miss[n]); 99 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 100 | bernoulli_logit_lpmf(0|calc_miss[n]) + 101 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 102 | } else { 103 | 104 | target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 105 | target += pry01; 106 | // Pr(Y==x where x in (0,1)) 107 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 108 | } 109 | 110 | } 111 | } 112 | 113 | 114 | } 115 | 116 | generated quantities { 117 | 118 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 119 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 120 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 121 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 122 | 123 | 124 | if(run_gen==1) { 125 | 126 | if(N_pred_degen>0) { 127 | 128 | // first do degenerate outcomes 129 | // note: these could be *re-generated* as beta/propotions 130 | for(i in 1:num_elements(indices_degen)) { 131 | 132 | // draw an outcome 0 / prop / 1 133 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 134 | 135 | if(outcome_degen[i]==0) { 136 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 137 | } else { 138 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 139 | } 140 | 141 | 142 | if(regen_degen[i]==1) { 143 | regen_all[i] = 0; 144 | } else if(regen_degen[i]==3) { 145 | regen_all[i] = 1; 146 | } else { 147 | 148 | if(!(infl_value<0)) { 149 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 150 | 151 | if(infl_gen[i]==1) { 152 | regen_all[i] = infl_value; 153 | } else { 154 | 155 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 156 | 157 | } 158 | } else { 159 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 160 | } 161 | } 162 | 163 | } 164 | 165 | if(N_pred_prop>0) { 166 | // now do originally proportional outcomes 167 | // can be re-generated as 0s or 1s 168 | 169 | int skip = num_elements(indices_degen); 170 | 171 | for(i in 1:num_elements(indices_prop)) { 172 | 173 | // draw an outcome 0 / prop / 1 174 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 175 | 176 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 177 | 178 | if(!(infl_value<0)) { 179 | if(infl_this[i]==1) { 180 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 181 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 182 | bernoulli_logit_lpmf(0|calc_miss[i]) + 183 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 184 | 185 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 186 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 187 | } else { 188 | 189 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 190 | // Pr(Y==x where x in (0,1)) 191 | 192 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 193 | 194 | } 195 | 196 | 197 | } else { 198 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 199 | } 200 | 201 | 202 | if(regen_degen[i+skip]==1) { 203 | regen_all[i+skip] = 0; 204 | } else if(regen_degen[i+skip]==3) { 205 | regen_all[i+skip] = 1; 206 | } else { 207 | // did not occur in original data but could re-occur probabilistically 208 | // check for inflation first 209 | 210 | if(!(infl_value<0)) { 211 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 212 | 213 | if(infl_gen[i]==1) { 214 | regen_all[i + skip] = infl_value; 215 | } else { 216 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 217 | } 218 | } else { 219 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 220 | } 221 | 222 | } 223 | 224 | } 225 | } 226 | } 227 | } 228 | 229 | } 230 | 231 | -------------------------------------------------------------------------------- /beta_logit_infl_simple.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 
0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | //int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | //matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | //matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | int miss_id[N_prop]; // index for imputed outcome 28 | int miss_counter = 1; 29 | 30 | for(i in 1:N_prop) { 31 | if(outcome_prop[i]==infl_value) { 32 | infl_this[i] = 1; 33 | miss_id[i] = miss_counter; 34 | miss_counter += 1; 35 | } else { 36 | infl_this[i] = 0; 37 | miss_id[i] = miss_counter; 38 | } 39 | } 40 | } 41 | parameters { 42 | vector[X] X_beta; // common predictor 43 | vector[miss_counter] prop_miss; 44 | //vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 45 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 46 | real kappa; // scale parameter for beta regression 47 | real theta; // probability of an observation being missing versus non-missing 48 | } 49 | transformed parameters { 50 | // store matrix calculations 51 | 52 | vector[N_degen] calc_degen; 53 | vector[N_prop] calc_prop; 54 | //vector[!(infl_value<0) ? N_prop : 0] calc_miss; 55 | //vector[!(infl_value<0) ? 
N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 56 | 57 | // drop the intercepts so everything is relative to the cutpoints 58 | calc_degen = covar_degen*X_beta; 59 | calc_prop = covar_prop*X_beta; 60 | 61 | 62 | if(!(infl_value<0)) { 63 | // calc_miss = covar_prop_infl*X_beta_miss; 64 | // calc_degen_miss = covar_degen_infl*X_beta_miss; 65 | } 66 | 67 | } 68 | model { 69 | 70 | // vague priors 71 | X_beta ~ normal(0,5); 72 | //X_beta_miss ~ normal(0,5); 73 | theta ~ normal(0,2); // tight prior around the number of possible missings 74 | kappa ~ exponential(1); 75 | prop_miss ~ normal(0,1); // put the majority of prior miss around 0.5 76 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 77 | // for(c in 1:3) 78 | // cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 79 | 80 | // need separate counters for logit (0/1) and beta regression 81 | 82 | for(n in 1:N_degen) { 83 | 84 | if(outcome_degen[n]==0) { 85 | // Pr(Y==0) 86 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 87 | } else { 88 | //Pr(Y==1) 89 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 90 | } 91 | } 92 | 93 | //target += beta_proportion_lpdf(outcome_prop|inv_logit(calc_prop),kappa); 94 | 95 | for(n in 1:N_prop) { 96 | // - beta_proportion_lcdf(0.49|inv_logit(calc_prop[n]),kappa) 97 | //- beta_proportion_lccdf(0.51|inv_logit(calc_prop[n]),kappa) 98 | if(infl_this[n]==infl_value) { 99 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])) + 100 | log_mix(theta, 101 | beta_proportion_lpdf(prop_miss[miss_id[n]]|inv_logit(calc_prop[n]),kappa), 102 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 103 | } else { 104 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 105 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),kappa); 106 | } 107 | 108 | } 109 | 110 | 111 | } 112 | 113 | generated quantities { 114 | 115 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 116 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 117 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 118 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_prop]; // whether observation belongs to inflated value or not 119 | 120 | 121 | if(run_gen==1) { 122 | 123 | if(N_pred_degen>0) { 124 | 125 | // first do degenerate outcomes 126 | // note: these could be *re-generated* as beta/propotions 127 | for(i in 1:num_elements(indices_degen)) { 128 | 129 | // draw an outcome 0 / prop / 1 130 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 131 | 132 | if(outcome_degen[i]==1) { 133 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 134 | } else { 135 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 136 | } 137 | 138 | if(regen_degen[i]==1) { 139 | regen_all[i] = 0; 140 | } else if(regen_degen[i]==3) { 141 | regen_all[i] = 1; 142 | } else { 143 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 144 | } 145 | 146 | } 147 | } 148 | 149 | if(N_pred_prop>0) { 150 | // now do originally proportional outcomes 151 | // can be re-generated as 0s or 1s 152 | 153 | int skip = num_elements(indices_degen); 154 | 155 | for(i in 1:num_elements(indices_prop)) { 156 | 157 | // draw an outcome 0 / prop / 1 158 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 159 | infl_gen[i] = bernoulli_rng(theta); 160 | 161 | //ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 162 | 163 | if(infl_this[i]==0) { 164 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 165 | ord_log[i+skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 166 | } else { 167 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 168 | ord_log[i+skip] += log_mix(theta, 169 | beta_proportion_lpdf(prop_miss[miss_id[i]]|inv_logit(calc_prop[i]),kappa), 170 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 171 | } 172 | 173 | if(regen_degen[i+skip]==1) { 174 | regen_all[i+skip] = 0; 175 | } else if(regen_degen[i+skip]==3) { 176 | regen_all[i+skip] = 1; 177 | } else { 178 | // did not occur in original data but could re-occur probabilistically 179 | // check for inflation first 180 | if(infl_this[i]==infl_value) { 181 | if(infl_gen[i]==1) { 182 | regen_all[i+skip] = prop_miss[miss_id[i]]; 183 | } else { 184 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 185 | } 186 | } else { 187 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 188 | } 189 | 190 | 191 | } 192 | } 193 | 194 | 195 | } 196 | 197 | } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /beta_logit_infl_v2.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. 
If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[4] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | for(c in 1:3) 68 | cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 69 | 70 | // need separate counters for logit (0/1) and beta regression 71 | 72 | for(n in 1:N_degen) { 73 | 74 | if(outcome_degen[n]==0) { 75 | // Pr(Y==0) 76 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 77 | } else { 78 | //Pr(Y==1) 79 | target += log_inv_logit(calc_degen[n] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 80 | } 81 | } 82 | 83 | //target += beta_proportion_lpdf(outcome_prop|inv_logit(calc_prop),kappa); 84 | 85 | for(n in 1:N_prop) { 86 | // - beta_proportion_lcdf(0.49|inv_logit(calc_prop[n]),kappa) 87 | //- beta_proportion_lccdf(0.51|inv_logit(calc_prop[n]),kappa) 88 | if(outcome_prop[n]<0.5) { 89 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 90 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + bernoulli_logit_lpmf(0|calc_miss[n]); 91 | } else if(outcome_prop[n]==0.5) { 92 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 93 | bernoulli_logit_lpmf(0|calc_miss[n]) + log(inv_logit(calc_prop[n] - cutpoints[2]) - inv_logit(calc_prop[n] - cutpoints[3])) + 94 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 95 | } else if(outcome_prop[n]>0.5) { 96 | target += log(inv_logit(calc_prop[n] - cutpoints[3]) - inv_logit(calc_prop[n] - cutpoints[4])); 97 | target += 
beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + bernoulli_logit_lpmf(0|calc_miss[n]); 98 | } 99 | // Pr(Y in (0,1)) 100 | 101 | // Pr(Y==x where x in (0,1)) 102 | 103 | } 104 | // } else { 105 | // for(n in 1:N_prop) { 106 | // 107 | // // Pr(Y in (0,1)) 108 | // 109 | // if() 110 | // 111 | // real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 112 | // 113 | // 114 | // // inflate the outcome 115 | // if(infl_this[n]==1) { 116 | // //target += bernoulli_logit_lpmf(1|calc_miss[n]); 117 | // target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 118 | // bernoulli_logit_lpmf(0|calc_miss[n]) + 119 | // beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 120 | // } else { 121 | // 122 | // target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 123 | // target += pry01; 124 | // // Pr(Y==x where x in (0,1)) 125 | // target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 126 | // } 127 | // 128 | // } 129 | // } 130 | 131 | 132 | } 133 | 134 | generated quantities { 135 | 136 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 137 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 138 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 139 | int infl_gen[(run_gen==0 || infl_value<0) ? 0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 140 | 141 | 142 | if(run_gen==1) { 143 | 144 | if(N_pred_degen>0) { 145 | 146 | // first do degenerate outcomes 147 | // note: these could be *re-generated* as beta/propotions 148 | for(i in 1:num_elements(indices_degen)) { 149 | 150 | // draw an outcome 0 / prop / 1 151 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 152 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 153 | 154 | if(outcome_degen[i]==0) { 155 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[i]); 156 | } else { 157 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[i]); 158 | } 159 | 160 | if(infl_gen[i]==1) { 161 | regen_all[i] = 0.5; 162 | } else { 163 | if(regen_degen[i]==1) { 164 | regen_all[i] = 0; 165 | } else if(regen_degen[i]==5) { 166 | regen_all[i] = 1; 167 | } else { 168 | 169 | if(regen_degen[i]==2||regen_degen[i]==4) { 170 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 171 | } else { 172 | regen_all[i] = infl_value; 173 | } 174 | } 175 | } 176 | 177 | } 178 | 179 | if(N_pred_prop>0) { 180 | // now do originally proportional outcomes 181 | // can be re-generated as 0s or 1s 182 | 183 | int skip = num_elements(indices_degen); 184 | 185 | for(i in 1:num_elements(indices_prop)) { 186 | 187 | // draw an outcome 0 / prop / 1 188 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 189 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 190 | 191 | //ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 192 | 193 | if(outcome_prop[i]<0.5) { 194 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 195 | ord_log[i+skip] = beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa) + bernoulli_logit_lpmf(0|calc_miss[i]); 196 | } else if(outcome_prop[i]==0.5) { 197 | 
ord_log[i+skip] = log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 198 | bernoulli_logit_lpmf(0|calc_miss[i]) + log(inv_logit(calc_prop[i] - cutpoints[2]) - inv_logit(calc_prop[i] - cutpoints[3])) + 199 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 200 | } else if(outcome_prop[i]>0.5) { 201 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[3]) - inv_logit(calc_prop[i] - cutpoints[4])); 202 | ord_log[i+skip] = beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa) + bernoulli_logit_lpmf(0|calc_miss[i]); 203 | } 204 | 205 | if(infl_gen[i+skip] == 1) { 206 | regen_all[i+skip] = infl_value; 207 | } else { 208 | if(regen_degen[i+skip]==1) { 209 | regen_all[i+skip] = 0; 210 | } else if(regen_degen[i+skip]==5) { 211 | regen_all[i+skip] = 1; 212 | } else { 213 | // did not occur in original data but could re-occur probabilistically 214 | // check for inflation first 215 | 216 | if(regen_degen[i+skip]==1) { 217 | regen_all[i+skip] = 0; 218 | } else if(regen_degen[i+skip]==5) { 219 | regen_all[i+skip] = 1; 220 | } else { 221 | 222 | if(regen_degen[i+skip]==2||regen_degen[i+skip]==4) { 223 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 224 | } else { 225 | regen_all[i+skip] = infl_value; 226 | } 227 | } 228 | 229 | } 230 | } 231 | 232 | 233 | } 234 | } 235 | } 236 | } 237 | 238 | } 239 | 240 | -------------------------------------------------------------------------------- /beta_logit_infl_v3.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? 
X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[4] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | for(c in 1:3) 68 | cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 69 | 70 | // need separate counters for logit (0/1) and beta regression 71 | 72 | for(n in 1:N_degen) { 73 | 74 | if(outcome_degen[n]==0) { 75 | // Pr(Y==0) 76 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 77 | } else { 78 | //Pr(Y==1) 79 | target += log_inv_logit(calc_degen[n] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 80 | } 81 | } 82 | for(n in 1:N_prop) { 83 | 84 | if(outcome_prop[n]<0.5) { 85 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 86 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + 87 | bernoulli_logit_lpmf(0|calc_miss[n]); 88 | } else if(outcome_prop[n]==0.5) { 89 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 90 | bernoulli_logit_lpmf(0|calc_miss[n]) + 91 | log(inv_logit(calc_prop[n] - cutpoints[2]) - inv_logit(calc_prop[n] - cutpoints[3])) + 92 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 93 | } else if(outcome_prop[n]>0.5) { 94 | target += log(inv_logit(calc_prop[n] - cutpoints[3]) - inv_logit(calc_prop[n] - cutpoints[4])); 95 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + 96 | bernoulli_logit_lpmf(0|calc_miss[n]); 97 | } 98 | // Pr(Y in (0,1)) 99 | 100 | // Pr(Y==x where x in (0,1)) 101 | 102 | } 103 | // } else { 104 | // for(n in 1:N_prop) { 105 | // 106 | // // Pr(Y in (0,1)) 107 | // 108 | // if() 109 | // 110 | // real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 111 | // 112 | // 113 | // // inflate the outcome 114 | // if(infl_this[n]==1) { 115 | // //target += bernoulli_logit_lpmf(1|calc_miss[n]); 116 | // target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 117 | // bernoulli_logit_lpmf(0|calc_miss[n]) + 118 | // beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 119 | // } else { 120 | // 121 | // target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 122 | // target += pry01; 123 | // // Pr(Y==x where x in (0,1)) 124 | // target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 125 | // } 126 | // 127 | // } 128 | // } 129 | 130 | 131 | } 132 | 133 | generated quantities { 134 | 135 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 136 | vector[run_gen==0 ? 
0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 137 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 138 | int infl_gen[(run_gen==0 || infl_value<0) ? 0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 139 | 140 | 141 | if(run_gen==1) { 142 | 143 | if(N_pred_degen>0) { 144 | 145 | // first do degenerate outcomes 146 | // note: these could be *re-generated* as beta/propotions 147 | for(i in 1:num_elements(indices_degen)) { 148 | 149 | // draw an outcome 0 / prop / 1 150 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 151 | 152 | if(outcome_degen[i]==0) { 153 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 154 | } else { 155 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 156 | } 157 | 158 | 159 | if(regen_degen[i]==1) { 160 | regen_all[i] = 0; 161 | } else if(regen_degen[i]==3) { 162 | regen_all[i] = 1; 163 | } else { 164 | 165 | if(!(infl_value<0)) { 166 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 167 | 168 | if(infl_gen[i]==1) { 169 | regen_all[i] = infl_value; 170 | } else { 171 | 172 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 173 | 174 | } 175 | } else { 176 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 177 | } 178 | } 179 | 180 | } 181 | 182 | if(N_pred_prop>0) { 183 | // now do originally proportional outcomes 184 | // can be re-generated as 0s or 1s 185 | 186 | int skip = num_elements(indices_degen); 187 | 188 | for(i in 1:num_elements(indices_prop)) { 189 | 190 | // draw an outcome 0 / prop / 1 191 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 192 | 193 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 194 | 195 | if(!(infl_value<0)) { 196 | if(infl_this[i]==1) { 197 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 198 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 199 | bernoulli_logit_lpmf(0|calc_miss[i]) + 200 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 201 | 202 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 203 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 204 | } else { 205 | 206 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 207 | // Pr(Y==x where x in (0,1)) 208 | 209 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 210 | 211 | } 212 | 213 | 214 | } else { 215 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 216 | } 217 | 218 | 219 | if(regen_degen[i+skip]==1) { 220 | regen_all[i+skip] = 0; 221 | } else if(regen_degen[i+skip]==3) { 222 | regen_all[i+skip] = 1; 223 | } else { 224 | // did not occur in original data but could re-occur probabilistically 225 | // check for inflation first 226 | 227 | if(!(infl_value<0)) { 228 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 229 | 230 | if(infl_gen[i]==1) { 231 | regen_all[i + skip] = infl_value; 232 | } else { 233 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 234 | } 235 | } else { 236 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 237 | } 238 | 239 | } 240 | 241 | } 242 | } 243 | } 244 | } 245 | 246 | } 247 | 248 | -------------------------------------------------------------------------------- 
/beta_logit_infl_v4.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? 
N_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 68 | 69 | // need separate counters for logit (0/1) and beta regression 70 | 71 | target += bernoulli_logit_lpmf(0|calc_degen_miss); 72 | 73 | for(n in 1:N_degen) { 74 | 75 | if(outcome_degen[n]==0) { 76 | // Pr(Y==0) 77 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 78 | } else { 79 | //Pr(Y==1) 80 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 81 | } 82 | } 83 | if(infl_value<0) { 84 | for(n in 1:N_prop) { 85 | // Pr(Y in (0,1)) 86 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 87 | // Pr(Y==x where x in (0,1)) 88 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 89 | } 90 | } else { 91 | for(n in 1:N_prop) { 92 | 93 | // Pr(Y in (0,1)) 94 | 95 | real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 96 | 97 | 98 | // inflate the outcome 99 | if(infl_this[n]==1) { 100 | //target += bernoulli_logit_lpmf(1|calc_miss[n]); 101 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 102 | bernoulli_logit_lpmf(0|calc_miss[n]) + 103 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 104 | } else { 105 | 106 | target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 107 | target += pry01; 108 | // Pr(Y==x where x in (0,1)) 109 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 110 | } 111 | 112 | } 113 | } 114 | 115 | 116 | } 117 | 118 | generated quantities { 119 | 120 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 121 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 122 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 123 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 124 | 125 | 126 | if(run_gen==1) { 127 | 128 | if(N_pred_degen>0) { 129 | 130 | // first do degenerate outcomes 131 | // note: these could be *re-generated* as beta/propotions 132 | for(i in 1:num_elements(indices_degen)) { 133 | 134 | // draw an outcome 0 / prop / 1 135 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 136 | 137 | if(outcome_degen[i]==0) { 138 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 139 | } else { 140 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 141 | } 142 | 143 | 144 | if(regen_degen[i]==1) { 145 | regen_all[i] = 0; 146 | } else if(regen_degen[i]==3) { 147 | regen_all[i] = 1; 148 | } else { 149 | 150 | if(!(infl_value<0)) { 151 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 152 | 153 | if(infl_gen[i]==1) { 154 | regen_all[i] = infl_value; 155 | } else { 156 | 157 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 158 | 159 | } 160 | } else { 161 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 162 | } 163 | } 164 | 165 | } 166 | 167 | if(N_pred_prop>0) { 168 | // now do originally proportional outcomes 169 | // can be re-generated as 0s or 1s 170 | 171 | int skip = num_elements(indices_degen); 172 | 173 | for(i in 1:num_elements(indices_prop)) { 174 | 175 | // draw an outcome 0 / prop / 1 176 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 177 | 178 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 179 | 180 | if(!(infl_value<0)) { 181 | if(infl_this[i]==1) { 182 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 183 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 184 | bernoulli_logit_lpmf(0|calc_miss[i]) + 185 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 186 | 187 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 188 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 189 | } else { 190 | 191 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 192 | // Pr(Y==x where x in (0,1)) 193 | 194 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 195 | 196 | } 197 | 198 | 199 | } else { 200 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 201 | } 202 | 203 | 204 | if(regen_degen[i+skip]==1) { 205 | regen_all[i+skip] = 0; 206 | } else if(regen_degen[i+skip]==3) { 207 | regen_all[i+skip] = 1; 208 | } else { 209 | // did not occur in original data but could re-occur probabilistically 210 | // check for inflation first 211 | 212 | if(!(infl_value<0)) { 213 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 214 | 215 | if(infl_gen[i]==1) { 216 | regen_all[i + skip] = infl_value; 217 | } else { 218 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 219 | } 220 | } else { 221 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 222 | } 223 | 224 | } 225 | 226 | } 227 | } 228 | } 229 | } 230 | 231 | } 232 | 233 | -------------------------------------------------------------------------------- /beta_logit_phireg.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 
0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_phi; // number of predictors for phi (scale of beta regression) 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | int outcome_degen[N_degen]; // Y in {0,1} 14 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 15 | matrix[N_prop,X_phi] covar_prop_phi; // covariate X_phi to predict phi for beta regression 16 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 17 | matrix[N_degen,X_phi] covar_degen_phi; // covariate X_phi to predict phi for (unobserved) beta regression 18 | int N_pred_degen; // number of posterior predictive samples for 0/1 19 | int N_pred_prop; // number of posterior predictive samples for (0,1) 20 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 21 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 22 | int run_gen; // whether to use generated quantities 23 | } 24 | parameters { 25 | vector[X] X_beta; // common predictor 26 | vector[X_phi] X_beta_phi; // predictors for phi in beta regression 27 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 28 | real alpha_phi; // intercept for phi 29 | } 30 | transformed parameters { 31 | // store matrix calculations 32 | 33 | vector[N_degen] calc_degen; 34 | vector[N_prop] calc_prop; 35 | vector[N_prop] calc_prop_phi; 36 | 37 | // drop the intercepts so everything is relative to the cutpoints 38 | calc_degen = covar_degen*X_beta; 39 | calc_prop = covar_prop*X_beta; 40 | calc_prop_phi = exp(alpha_phi + covar_prop_phi*X_beta_phi); 41 | } 42 | model { 43 | 44 | // vague priors 45 | X_beta ~ normal(0,5); 46 | X_beta_phi ~ normal(0,5); 47 | alpha_phi ~ normal(0,5); 48 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 49 | 50 | // need separate counters for logit (0/1) and beta regression 51 | 52 | for(n in 1:N_degen) { 53 | if(outcome_degen[n]==0) { 54 | // Pr(Y==0) 55 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 56 | } else { 57 | //Pr(Y==1) 58 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 59 | } 60 | } 61 | 62 | for(n in 1:N_prop) { 63 | // Pr(Y in (0,1)) 64 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 65 | // Pr(Y==x where x in (0,1)) 66 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),calc_prop_phi[n]); 67 | } 68 | 69 | } 70 | 71 | generated quantities { 72 | 73 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 74 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 75 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 76 | vector[run_gen==0 ? 
0 : N_pred_degen] calc_degen_phi = exp(alpha_phi + covar_degen_phi*X_beta_phi); // need phi for degenerate outcomes for which we don't use beta distribution 77 | 78 | if(run_gen==1) { 79 | if(N_pred_degen>0) { 80 | // first do degenerate outcomes 81 | // note: these could be *re-generated* as beta/propotions 82 | for(i in 1:num_elements(indices_degen)) { 83 | 84 | // draw an outcome 0 / prop / 1 85 | regen_degen[i] = ordered_logistic_rng(covar_degen[indices_degen[i],]*X_beta,cutpoints); 86 | 87 | if(outcome_degen[i]==0) { 88 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 89 | } else { 90 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 91 | } 92 | 93 | 94 | if(regen_degen[i]==1) { 95 | regen_all[i] = 0; 96 | } else if(regen_degen[i]==3) { 97 | regen_all[i] = 1; 98 | } else { 99 | // did not occur in original data but could re-occur probabilistically 100 | regen_all[i] = beta_proportion_rng(inv_logit(covar_degen[indices_degen[i],]*X_beta),calc_degen_phi[indices_degen[i]]); 101 | } 102 | 103 | } 104 | 105 | if(N_pred_prop>0) { 106 | // now do originally proportional outcomes 107 | // can be re-generated as 0s or 1s 108 | 109 | int skip = num_elements(indices_degen); 110 | 111 | for(i in 1:num_elements(indices_prop)) { 112 | 113 | // draw an outcome 0 / prop / 1 114 | regen_degen[i+skip] = ordered_logistic_rng(covar_prop[indices_prop[i],]*X_beta,cutpoints); 115 | 116 | ord_log[i+skip] = log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + 117 | beta_proportion_lpdf(outcome_prop[indices_prop[i]]|inv_logit(calc_prop[indices_prop[i]]),calc_prop_phi[indices_prop[i]]); 118 | 119 | if(regen_degen[i+skip]==1) { 120 | regen_all[i+skip] = 0; 121 | } else if(regen_degen[i+skip]==3) { 122 | regen_all[i+skip] = 1; 123 | } else { 124 | // did not occur in original data but could re-occur probabilistically 125 | regen_all[i+skip] = beta_proportion_rng(inv_logit(covar_prop[indices_prop[i],]*X_beta),calc_prop_phi[indices_prop[i]]); 126 | } 127 | 128 | } 129 | } 130 | } 131 | } 132 | 133 | } 134 | 135 | -------------------------------------------------------------------------------- /dag_example.R: -------------------------------------------------------------------------------- 1 | library(dagitty) 2 | library(ggdag) 3 | 4 | # roommate causal graph 5 | 6 | roommate <- dagitty("dag{ 7 | d -> r; 8 | p -> r; 9 | s -> r; 10 | n -> r; 11 | t -> r; 12 | v -> r; 13 | l -> r; 14 | d -> l; 15 | p -> l; 16 | p -> n; 17 | v -> s; 18 | v -> n; 19 | l -> v; 20 | }") 21 | 22 | ggdag(roommate) 23 | 24 | # find variables that aren't related 25 | impliedConditionalIndependencies(roommate) 26 | 27 | # find the adjustment set 28 | adjustmentSets(roommate, exposure="l", outcome = "r") 29 | 30 | # ggdag version 31 | ggdag_adjustment_set(roommate, exposure="l",outcome="r") 32 | -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 methodology.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 methodology.doc -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 que.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 que.docx 
-------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 readme.txt -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 topline.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 topline.docx -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28.sav -------------------------------------------------------------------------------- /data/W28_Aug17/~$P W28 que.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/~$P W28 que.docx -------------------------------------------------------------------------------- /data/all_vars_Beta_trans.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_Beta_trans.rds -------------------------------------------------------------------------------- /data/all_vars_frac.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_frac.rds -------------------------------------------------------------------------------- /data/all_vars_ord.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_ord.rds -------------------------------------------------------------------------------- /data/all_vars_zoib.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_zoib.rds -------------------------------------------------------------------------------- /data/compare_prob.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/compare_prob.rds -------------------------------------------------------------------------------- /data/sample_all.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sample_all.rds -------------------------------------------------------------------------------- /data/sim_cont_X.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X.RData -------------------------------------------------------------------------------- /data/sim_cont_X.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X.rds -------------------------------------------------------------------------------- /data/sim_cont_X_fixed.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X_fixed.RData -------------------------------------------------------------------------------- /data/sim_cont_X_fixed.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X_fixed.rds -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_extended.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | *The next line reads in the data for the baseline reform indicator. Note that the path should be corrected. 4 | 5 | *use "......\EER-D-13-00718R2_Event_study_extended_sample_reform.dta" 6 | 7 | xi: logit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, cluster(ccode) 8 | 9 | xi: relogit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, cluster(ccode) 10 | 11 | xi: xtlogit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, fe 12 | 13 | xi: logit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, cluster(ccode) 14 | 15 | xi: relogit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, cluster(ccode) 16 | 17 | xi: xtlogit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, fe 18 | 19 | *Equation (16) -- Linear probability models with year effects 20 | 21 | xi: reg reform1 Revolution4b war i.ccode i.year, cluster(ccode) 22 | 23 | xi: reg reform1 Revolution4b laggdp lagn war i.ccode i.year, cluster(ccode) 24 | 25 | 26 | * Models with reversals (columns 4 and 8 in Table 9) 27 | 28 | clear 29 | 30 | *The next line reads in the data for the reform indicator which allows for reversals. Note that the path should be corrected. 31 | 32 | *use "......\EER-D-13-00718R2_Event_study_extended_sample_reversals.dta" 33 | 34 | xi: logit reform_reversal Revolution4b war ww1 timesincereformreversal _splinereversal*, cluster(ccode) 35 | 36 | xi: logit reform_reversal Revolution4b laggdp lagn war ww1 timesincereformreversal _splinereversal*, cluster(ccode) 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_main.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | *The next line reads in the data for the baseline reform measure. Note that the path should be corrected. 
4 | 5 | *use "........\EER-D-13-00718R2_Event_study_main_sample_reform.dta" 6 | 7 | *Table 8 except for regression 7 which estimated below 8 | 9 | xi: logit Reform1 Revolution2_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 10 | 11 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 12 | 13 | xi: logit Reform1 Revolution_ling_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 14 | 15 | xi: logit Reform1 lagrevolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 16 | 17 | xi: logit Reform1 revolution4b New_learn_ling laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 18 | 19 | xi: relogit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 20 | 21 | xi: xtlogit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, fe 22 | 23 | * Table 10 event study part 24 | 25 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtrade timesincereform _spline*, cluster(ccode) 26 | 27 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtc timesincereform _spline*, cluster(ccode) 28 | 29 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagagricultural timesincereform _spline*, cluster(ccode) 30 | 31 | xi: logit Reform1 revolution4b laggdp lagn lagurban warintensity2 ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 32 | 33 | 34 | * Table D1 in the supplementary appendix 35 | 36 | 37 | xi: logit Reform1 revolution5b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 38 | 39 | xi: logit Reform1 revolution4b trend_hp cycle_hp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 40 | 41 | xi: logit Reform1 revolution4b g0_rev laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 42 | 43 | xi: logit Reform1 revolution4b g2_3rev laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 44 | 45 | *Table 8, regression 7 46 | 47 | clear 48 | 49 | *The next line reads in the data for the reform measure which allows for reversals. Note that the path should be corrected. 
50 | 51 | *use "........\EER-D-13-00718R2_Event_study_main_sample_reversals.dta" 52 | 53 | 54 | xi: logit Reformreversal revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 55 | -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reform.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reform.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reversals.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reversals.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reform.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reform.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reversals.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reversals.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_READ_ME.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_READ_ME.pdf -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_reversals_suffrage.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_reversals_suffrage.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_suffrage.do: -------------------------------------------------------------------------------- 1 | 2 | clear all 3 | 4 | *The next line reads in the data. 
Note that the path should be corrected. 5 | 6 | *use "........\EER-D-13-00718R2_maindata_suffrage.dta" 7 | 8 | *Table 2 9 | 10 | xi: xtpcse e2c lage2c Revolution2_2012 yy* i.ccode, pairwise correlation(psar1) 11 | 12 | xi: xtpcse e2c lage2c revolution4b yy* i.ccode, pairwise correlation(psar1) 13 | 14 | xi: xtpcse e2c lage2c Revolution_ling_2012 yy* i.ccode, pairwise correlation(psar1) 15 | 16 | *part with controls 17 | 18 | xi: xtpcse e2c lage2c Revolution2_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 19 | 20 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 21 | 22 | xi: xtpcse e2c lage2c Revolution_ling_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 23 | 24 | 25 | *Table 3 26 | 27 | *Conley type standard errors are obtained by using x_ols available from Conley's website (see documentation file). 28 | 29 | gen cutoff5001=500 /*more than +-500? km away -> assumed to be independent*/ 30 | gen cutoff5002=500 31 | gen const = 1 32 | 33 | xi: x_ols LatKm LongKm cutoff5001 cutoff5001 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 34 | matrix bb = e(b) 35 | matrix v_dep = vecdiag(cov_dep) 36 | scalar coef_e2 = bb[1,2] 37 | scalar se_e2 = sqrt(v_dep[1,2]) 38 | display coef_e2 39 | display coef_e2/se_e2 40 | 41 | drop epsilon 42 | drop dis1 43 | drop dis2 44 | drop window 45 | 46 | gen cutoff8001=800 /*more than +-800? km away -> assumed to be independent*/ 47 | gen cutoff8002=800 48 | 49 | xi: x_ols LatKm LongKm cutoff8001 cutoff8002 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 50 | matrix bb = e(b) 51 | matrix v_dep = vecdiag(cov_dep) 52 | scalar coef_e2 = bb[1,2] 53 | scalar se_e2 = sqrt(v_dep[1,2]) 54 | display coef_e2 55 | display coef_e2/se_e2 56 | 57 | drop epsilon 58 | drop dis1 59 | drop dis2 60 | drop window 61 | 62 | gen cutoff14001=1400 /*more than +-1400?
km away -> assumed to be independent*/ 63 | gen cutoff14002=1400 64 | 65 | xi: x_ols LatKm LongKm cutoff14001 cutoff14002 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 66 | matrix bb = e(b) 67 | matrix v_dep = vecdiag(cov_dep) 68 | scalar coef_e2 = bb[1,2] 69 | scalar se_e2 = sqrt(v_dep[1,2]) 70 | display coef_e2 71 | display coef_e2/se_e2 72 | 73 | drop epsilon 74 | drop dis1 75 | drop dis2 76 | drop window 77 | 78 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 lagrevolution4b Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 79 | 80 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 revolution4b ownrev Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 81 | 82 | xi: xtpcse de2 laggdp lagn lagurban war ww1 lagrevolution4b Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 83 | 84 | *Table 5 85 | 86 | xi: xtpcse e2c lage2c revolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 87 | 88 | xi: xtpcse e2c lage2c lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 89 | 90 | xi: xtpcse de2 lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 91 | 92 | xi: xtpcse e2c lage2c revolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 93 | 94 | xi: xtpcse e2c lage2c lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 95 | 96 | xi: xtpcse de2 lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 97 | 98 | *Table 6 99 | 100 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard g0_rev yy* i.ccode, pairwise correlation(psar1) 101 | 102 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard g2_3rev yy* i.ccode, pairwise correlation(psar1) 103 | 104 | xi: xtpcse e2c lage2c revolution4b cycle_hp trend_hp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 105 | 106 | xi: xtpcse e2c lage2c year lograin laglograin revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 107 | 108 | xi: xtpcse e2c lage2c year revolution4b raingrowth lagraingrowth laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 109 | 110 | xi: xtpcse e2c lage2c revolution5b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 111 | 112 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 113 | 114 | *regression 8 with reversals can be found at the end of this do file 115 | 116 | *Table 7 117 | 118 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise hetonly 119 | 120 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise 121 | 122 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise hetonly correlation(psar1) 123 | 124 | *Bruno correction 125 | xi: xtlsdvc e2c laggdp lagn lagurban war ww1 
revolution4b Gold_standard lagenrollment yy*, initial(ah) vcov(50) 126 | 127 | *System GMM 128 | 129 | xi: xtdpdsys e2c laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy*, maxldep(3) maxlags(3) 130 | 131 | *Tobit 132 | 133 | xi: tobit e2c lage2c laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy* i.ccode, ll(0) ul(100) 134 | 135 | *Fractional logit. Use the rescalled Suffrage variables. 136 | 137 | xi: glm e2c_wp lage2c_wp laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy* i.ccode if e2c_wp<=1, fam(bin) link(logit) robust cluster(ccode) 138 | 139 | *Table 10 - suffrage part 140 | 141 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtrade yy* i.ccode, pairwise correlation(psar1) 142 | 143 | 144 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtc yy* i.ccode, pairwise correlation(psar1) 145 | 146 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagagricultural yy* i.ccode, pairwise correlation(psar1) 147 | 148 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban warintensity2 ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 149 | 150 | 151 | *Other estimations reported or discussed in the text 152 | 153 | *Equation (12) -- with authors of liberty 154 | 155 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard authors_2 authors_3 yy* i.ccode, pairwise correlation(psar1) 156 | 157 | 158 | *Equation (14) -- Distance interaction 159 | 160 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 Revolution2_2012 Revtdist Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 161 | 162 | *Distance interaction with year effects (discussed after Equation (14)) 163 | 164 | xi: xtpcse e2c lage2c laggdp lagn lagurban war Revtdist Gold_standard lagenrollment i.year i.ccode, het correlation(psar1) 165 | 166 | 167 | *Equation (15) - inequality interaction with year effects 168 | 169 | xi: xtpcse e2c lage2c laggdp lagn lagurban war revolution4b lagGini Ginirev2 Gold_standard lagenrollment i.year i.ccode, hetonly correlation(psar1) 170 | 171 | *Inequality interaction with two-year fixed effects and spatial correlation 172 | 173 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 revolution4b lagGini Ginirev2 Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 174 | 175 | 176 | *Equation 3 in Supplementary Appendix D and discussed in Section 5.2.4 177 | 178 | xi: reg e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, cluster(year) 179 | 180 | *Error Correction Model estimates reported in Supplementary Appendix D and discussed in Section 5.5 181 | 182 | xi: xtpcse d.e2c lage2c l.laggdp l.lagn l.lagurban l.war l.ww1 l.revolution4b l.Gold_standard l.lagenrollment d.laggdp d.lagn d.lagurban d.war d.ww1 d.revolution4b d.Gold_standard d.lagenrollment yy*, pairwise correlation(psar1) 183 | 184 | clear 185 | 186 | *use "........\EER-D-13-00718R2_reversals_suffrage.dta" 187 | 188 | *Table 6, regression 8 which allows for democratic reversals 189 | 190 | 191 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 192 | 193 | -------------------------------------------------------------------------------- /define_ord_betareg.R: 
-------------------------------------------------------------------------------- 1 | # File with code needed to fit ordered beta regression 2 | # As custom family in R package brms 3 | # See vignette at 4 | 5 | # custom family 6 | 7 | ord_beta_reg <- custom_family("ord_beta_reg", 8 | dpars=c("mu","phi","cutzero","cutone"), 9 | links=c("logit","log",NA,NA), 10 | lb=c(NA,0,NA,NA), 11 | type="real") 12 | 13 | # stan code for density of the model 14 | 15 | stan_funs <- "real ord_beta_reg_lpdf(real y, real mu, real phi, real cutzero, real cutone) { 16 | 17 | //auxiliary variables 18 | real mu_logit = logit(mu); 19 | vector[2] thresh; 20 | thresh[1] = cutzero; 21 | thresh[2] = cutzero + exp(cutone); 22 | 23 | if(y==0) { 24 | return log1m_inv_logit(mu_logit - thresh[1]); 25 | } else if(y==1) { 26 | return log_inv_logit(mu_logit - thresh[2]); 27 | } else { 28 | return log(inv_logit(mu_logit - thresh[1]) - inv_logit(mu_logit - thresh[2])) + 29 | beta_proportion_lpdf(y|mu,phi); 30 | } 31 | }" 32 | 33 | stanvars <- stanvar(scode=stan_funs,block="functions") 34 | 35 | # For pulling posterior predictions 36 | 37 | posterior_predict_ord_beta_reg <- function(i, draws, ...) { 38 | mu <- draws$dpars$mu[, i] 39 | phi <- draws$dpars$phi 40 | cutzero <- draws$dpars$cutzero 41 | cutone <- draws$dpars$cutone 42 | N <- length(phi) 43 | 44 | thresh1 <- cutzero 45 | thresh2 <- cutzero + exp(cutone) 46 | 47 | pr_y0 <- 1 - plogis(qlogis(mu) - thresh1) 48 | pr_y1 <- plogis(qlogis(mu) - thresh2) 49 | pr_cont <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 50 | out_beta <- rbeta(n=N,mu*phi,(1-mu)*phi) 51 | 52 | # now determine which one we get for each observation 53 | outcomes <- sapply(1:N, function(i) { 54 | sample(1:3,size=1,prob=c(pr_y0[i],pr_cont[i],pr_y1[i])) 55 | }) 56 | 57 | final_out <- sapply(1:length(outcomes),function(i) { 58 | if(outcomes[i]==1) { 59 | return(0) 60 | } else if(outcomes[i]==2) { 61 | return(out_beta[i]) 62 | } else { 63 | return(1) 64 | } 65 | }) 66 | 67 | final_out 68 | 69 | } 70 | 71 | # for calculating marginal effects/conditional expectations 72 | 73 | posterior_epred_ord_beta_reg<- function(draws) { 74 | 75 | cutzero <- draws$dpars$cutzero 76 | cutone <- draws$dpars$cutone 77 | 78 | mu <- draws$dpars$mu 79 | 80 | thresh1 <- cutzero 81 | thresh2 <- cutzero + exp(cutone) 82 | 83 | low <- 1 - plogis(qlogis(mu) - thresh1) 84 | middle <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 85 | high <- plogis(qlogis(mu) - thresh2) 86 | 87 | low*0 + middle*mu + high 88 | } 89 | 90 | # for calcuating LOO and Bayes Factors 91 | 92 | log_lik_ord_beta_reg <- function(i, draws) { 93 | 94 | mu <- draws$dpars$mu[,i] 95 | phi <- draws$dpars$phi 96 | y <- draws$data$Y[i] 97 | cutzero <- draws$dpars$cutzero 98 | cutone <- draws$dpars$cutone 99 | 100 | thresh1 <- cutzero 101 | thresh2 <- cutzero + exp(cutone) 102 | 103 | if(y==0) { 104 | out <- log(1 - plogis(qlogis(mu) - thresh1)) 105 | } else if(y==1) { 106 | out <- log(plogis(qlogis(mu) - thresh2)) 107 | } else { 108 | out <- log(plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2)) + dbeta(y,mu*phi,(1-mu)*phi,log=T) 109 | } 110 | 111 | out 112 | 113 | } 114 | 115 | ###### Code declaring induced dirichlet prior #### 116 | # code from Michael Betancourt/Staffan Betner 117 | # discussion here: https://discourse.mc-stan.org/t/dirichlet-prior-on-ordinal-regression-cutpoints-in-brms/20640 118 | dirichlet_prior <- " 119 | real induced_dirichlet_lpdf(vector c, vector alpha, real phi) { 120 | int K = num_elements(c) + 1; 121 | 
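// c holds the K-1 ordered cutpoints, alpha the Dirichlet concentration
// parameters, and phi an anchor point (set to 0 where this prior is used below);
// the function returns the Dirichlet log density of the ordinal probabilities
// induced by the cutpoints, plus the log Jacobian of that transformation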
vector[K - 1] sigma = inv_logit(phi - c); 122 | vector[K] p; 123 | matrix[K, K] J = rep_matrix(0, K, K); 124 | 125 | // Induced ordinal probabilities 126 | p[1] = 1 - sigma[1]; 127 | for (k in 2:(K - 1)) 128 | p[k] = sigma[k - 1] - sigma[k]; 129 | p[K] = sigma[K - 1]; 130 | 131 | // Baseline column of Jacobian 132 | for (k in 1:K) J[k, 1] = 1; 133 | 134 | // Diagonal entries of Jacobian 135 | for (k in 2:K) { 136 | real rho = sigma[k - 1] * (1 - sigma[k - 1]); 137 | J[k, k] = - rho; 138 | J[k - 1, k] = rho; 139 | } 140 | 141 | return dirichlet_lpdf(p | alpha) 142 | + log_determinant(J); 143 | } 144 | " 145 | dirichlet_prior_stanvar <- stanvar(scode = dirichlet_prior, block = "functions") 146 | 147 | stanvar(scode = "ordered[2] thresh; 148 | thresh[1] = cutzero; 149 | thresh[2] = cutzero+exp(cutone);", 150 | block = "tparameters") -> # there might be a better way to specify this 151 | dirichlet_prior_ordbeta_stanvar 152 | 153 | stanvars <- stanvars + dirichlet_prior_stanvar + dirichlet_prior_ordbeta_stanvar 154 | 155 | # Feel free to add any other priors / change the priors on b, 156 | # which represent regression coefficients on the logit 157 | # scale 158 | 159 | priors <- set_prior("target += induced_dirichlet_lpdf(thresh | rep_vector(1, 3), 0)", check=FALSE) + 160 | set_prior("normal(0,5)",class="b") + 161 | set_prior("exponential(.1)",class="phi") 162 | 163 | # priors <- set_prior("normal(0,5)",class="b") + 164 | # prior(constant(0),class="b",coef="Intercept") + 165 | # prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) + 166 | # set_prior("exponential(.1)",class="phi") 167 | 168 | priors_phireg <- set_prior("normal(0,5)",class="b") + 169 | set_prior("target += induced_dirichlet_lpdf(thresh | rep_vector(1, 3), 0)", check=FALSE) 170 | -------------------------------------------------------------------------------- /estimate_with_brms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to Run Ordered Beta Regression with `brms`" 3 | author: Robert Kubinec 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=F} 8 | 9 | require(brms) 10 | require(dplyr) 11 | require(tidyr) 12 | require(ggplot2) 13 | require(haven) 14 | 15 | knitr::opts_chunk$set(echo=T) 16 | 17 | set.seed(71520177) 18 | 19 | ``` 20 | 21 | 22 | # Overview 23 | 24 | This notebook contains instructions for running the ordered beta regression model in the R package `brms`, a front-end to the Stan Hamiltonian Markov Chain Monte Carlo sampler. The ordered beta regression model is designed explicitly for slider scale/visual analog scale data of the type you will often find in online surveys among other areas. I refer you to a paper on the model if you are not familiar with it: https://osf.io/preprints/socarxiv/2sx6y/. 25 | 26 | The ordered beta regression model is not natively supported in `brms` and so instead I define it here using the [custom response option](https://cran.r-project.org/web/packages/brms/vignettes/brms_customfamilies.html) of `brms`. 27 | 28 | # Data Preparation 29 | 30 | First, I load data from the Pew Forum that asks a question about respondents' views towards college professors (for a more complete explication, see the paper referenced above). 
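A nontrivial share of the responses fall exactly at the bounds of the scale, which is what motivates using the ordered beta model here. As a quick illustrative check (shown unevaluated, since it assumes the `pew` object created in the chunk below), you can tabulate that share directly:

```r
# share of thermometer responses exactly at the bounds (0 or 100);
# assumes `pew` as created in the following chunk
mean(pew$therm %in% c(0, 100))
```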
31 | 32 | ```{r load_data} 33 | 34 | pew <- read_sav("data/W28_Aug17/ATP W28.sav") %>% 35 | mutate(therm=na_if(THERMO_THERMBC_W28,999)) %>% 36 | filter(!is.na(therm)) 37 | 38 | pew %>% 39 | ggplot(aes(x=therm)) + 40 | geom_histogram(bins=100) + 41 | theme_minimal() + 42 | theme(panel.grid=element_blank()) + 43 | scale_x_continuous(breaks=c(0,25,50,75,100), 44 | labels=c("0","Colder","50","Warmer","100")) + 45 | ylab("") + 46 | xlab("") + 47 | labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) 48 | 49 | ``` 50 | 51 | The distribution of feelings towards college professors contains both degenerate (0 and 100) and continuous responses between 0 and 100. To model it, we first need to rescale the outcome so that it is bounded between 0 and 1 instead of 0 and 100. This is done very easily by subtracting the minimum value, in this case 0, and then dividing by the difference between the minimum and maximum (i.e., 100). I also do some other data processing tasks: 52 | 53 | ```{r munge_data} 54 | 55 | model_data <- select(pew,therm,age="F_AGECAT_FINAL", 56 | sex="F_SEX_FINAL", 57 | income="F_INCOME_FINAL", 58 | ideology="F_IDEO_FINAL", 59 | race="F_RACETHN_RECRUITMENT", 60 | education="F_EDUCCAT2_FINAL", 61 | region="F_CREGION_FINAL", 62 | approval="POL1DT_W28", 63 | born_again="F_BORN_FINAL", 64 | relig="F_RELIG_FINAL", 65 | news="NEWS_PLATFORMA_W28") %>% 66 | mutate_all(zap_missing) %>% 67 | drop_na %>% 68 | mutate(therm=(therm - min(therm,na.rm = T))/(max(therm,na.rm=T) - 69 | min(therm,na.rm = T)), 70 | news=as_factor(news,levels="labels"), 71 | age=c(scale(age)), 72 | race=as_factor(race,levels="labels"), 73 | ideology=as_factor(ideology,levels="labels"), 74 | income=as_factor(income,levels="labels"), 75 | approval=as_factor(approval,levels="labels"), 76 | sex=as_factor(sex,levels="labels"), 77 | education=as_factor(education,levels="labels"), 78 | born_again=as_factor(born_again,levels="labels"), 79 | relig=as_factor(relig,levels="labels")) %>% 80 | mutate_at(c("race","ideology","income","approval","sex","education","born_again","relig"), function(c) { 81 | factor(c, exclude=levels(c)[length(levels(c))]) 82 | }) %>% 83 | # need to make these ordered factors for BRMS 84 | mutate(education=ordered(education), 85 | income=ordered(income)) 86 | 87 | ``` 88 | 89 | The completed dataset has `r nrow(model_data)` observations. 90 | 91 | # Define Custom Family 92 | 93 | To model this data in `brms`, I define a new distribution, `ord_beta_reg`, using the `custom_family` function. You need to run the following code in R before trying to use the custom family as it defines the log-likelihood and the priors (you can of course add additional priors of your own). You can access this code as an R script `define_ord_betareg.R` in the [Github repository containing this Rmarkdown file](https://github.com/saudiwin/ordbetareg).
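Written out, the density that the Stan code below implements is, with $\eta = \mathrm{logit}(\mu)$ and ordered cutpoints $k_1 < k_2$ (in the code, $k_1$ is `cutzero` and $k_2$ is `cutzero + exp(cutone)`):

$$P(y = 0) = 1 - \mathrm{logit}^{-1}(\eta - k_1), \qquad P(y = 1) = \mathrm{logit}^{-1}(\eta - k_2),$$

$$f(y) = \left[\mathrm{logit}^{-1}(\eta - k_1) - \mathrm{logit}^{-1}(\eta - k_2)\right] \mathrm{Beta}\left(y \mid \mu\phi,\, (1 - \mu)\phi\right) \quad \text{for } y \in (0,1),$$

so the three components sum (and integrate) to one over $\{0\} \cup (0,1) \cup \{1\}$.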
94 | 95 | ```{r customfam} 96 | 97 | # custom family 98 | 99 | ord_beta_reg <- custom_family("ord_beta_reg", 100 | dpars=c("mu","phi","cutzero","cutone"), 101 | links=c("logit","log",NA,NA), 102 | lb=c(NA,0,NA,NA), 103 | type="real") 104 | 105 | # stan code for density of the model 106 | 107 | stan_funs <- "real ord_beta_reg_lpdf(real y, real mu, real phi, real cutzero, real cutone) { 108 | 109 | //auxiliary variables 110 | real mu_logit = logit(mu); 111 | vector[2] thresh; 112 | thresh[1] = cutzero; 113 | thresh[2] = cutzero + exp(cutone); 114 | 115 | if(y==0) { 116 | return log1m_inv_logit(mu_logit - thresh[1]); 117 | } else if(y==1) { 118 | return log_inv_logit(mu_logit - thresh[2]); 119 | } else { 120 | return log(inv_logit(mu_logit - thresh[1]) - inv_logit(mu_logit - thresh[2])) + 121 | beta_proportion_lpdf(y|mu,phi); 122 | } 123 | }" 124 | 125 | stanvars <- stanvar(scode=stan_funs,block="functions") 126 | 127 | # For pulling posterior predictions 128 | 129 | posterior_predict_ord_beta_reg <- function(i, draws, ...) { 130 | mu <- draws$dpars$mu[, i] 131 | phi <- draws$dpars$phi 132 | cutzero <- draws$dpars$cutzero 133 | cutone <- draws$dpars$cutone 134 | N <- length(phi) 135 | 136 | thresh1 <- cutzero 137 | thresh2 <- cutzero + exp(cutone) 138 | 139 | pr_y0 <- 1 - plogis(qlogis(mu) - thresh1) 140 | pr_y1 <- plogis(qlogis(mu) - thresh2) 141 | pr_cont <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 142 | out_beta <- rbeta(n=N,mu*phi,(1-mu)*phi) 143 | 144 | # now determine which one we get for each observation 145 | outcomes <- sapply(1:N, function(i) { 146 | sample(1:3,size=1,prob=c(pr_y0[i],pr_cont[i],pr_y1[i])) 147 | }) 148 | 149 | final_out <- sapply(1:length(outcomes),function(i) { 150 | if(outcomes[i]==1) { 151 | return(0) 152 | } else if(outcomes[i]==2) { 153 | return(out_beta[i]) 154 | } else { 155 | return(1) 156 | } 157 | }) 158 | 159 | final_out 160 | 161 | } 162 | 163 | # for calculating marginal effects/conditional expectations 164 | 165 | posterior_epred_ord_beta_reg <- function(draws) { 166 | 167 | cutzero <- draws$dpars$cutzero 168 | cutone <- draws$dpars$cutone 169 | 170 | mu <- draws$dpars$mu 171 | 172 | thresh1 <- cutzero 173 | thresh2 <- cutzero + exp(cutone) 174 | 175 | low <- 1 - plogis(qlogis(mu) - thresh1) 176 | middle <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 177 | high <- plogis(qlogis(mu) - thresh2) 178 | 179 | low*0 + middle*mu + high 180 | } 181 | 182 | # for calcuating LOO and Bayes Factors 183 | 184 | log_lik_ord_beta_reg <- function(i, draws) { 185 | 186 | mu <- draws$dpars$mu[,i] 187 | phi <- draws$dpars$phi 188 | y <- draws$data$Y[i] 189 | cutzero <- draws$dpars$cutzero 190 | cutone <- draws$dpars$cutone 191 | 192 | thresh1 <- cutzero 193 | thresh2 <- cutzero + exp(cutone) 194 | 195 | if(y==0) { 196 | out <- log(1 - plogis(qlogis(mu) - thresh1)) 197 | } else if(y==1) { 198 | out <- log(plogis(qlogis(mu) - thresh2)) 199 | } else { 200 | out <- log(plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2)) + dbeta(y,mu*phi,(1-mu)*phi,log=T) 201 | } 202 | 203 | out 204 | 205 | } 206 | 207 | # Feel free to add any other priors / change the priors on b, 208 | # which represent regression coefficients on the logit 209 | # scale 210 | 211 | priors <- set_prior("normal(0,5)",class="b") + 212 | prior(constant(0),class="b",coef="Intercept") + 213 | prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) + 214 | set_prior("exponential(.1)",class="phi") 215 | 216 | priors_phireg <- 
set_prior("normal(0,5)",class="b") + 217 | prior(constant(0),class="b",coef="Intercept") + 218 | prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) 219 | 220 | ``` 221 | 222 | # Run In BRMS 223 | 224 | Given these new functions, we can then run a `brms` model as usual. The one catch is that we need to include the `priors` object in the `prior` argument and the `stanvars` object in the `stanvars` argument of the function, both of which come from the code block above. The second and *very important* item is that the model formula must start with `0 + Intercept` for the independent (right-hand side) variables. You can add anything you want after that first term. This is to ensure there is no multicollinearity between the ordinal cutpoints in the model and the intercept. 225 | 226 | Other than that, everything is the same and you can use any cool `brms` features. To demonstrate some of these, I will model education and income as ordinal predictors by using the `mo()` function. By doing so, we can get a single effect for education and income instead of having to use dummies for separate education/income categories. As a result, I can include an interaction between the two variables to see whether the effect of education on views towards college professors varies with income. Finally, I include varying (random) census region intercepts. 227 | 228 | One note is that I use the `cmdstanr` backend for `brms` to use the latest version of Stan, but you can remove that option to use the Stan that comes with `brms` (`rstan`). 229 | 230 | ```{r run_brms} 231 | 232 | brms_fit <- brm(therm ~ 0 + Intercept + mo(education)*mo(income) + (1|region), data=model_data, 233 | family=ord_beta_reg, 234 | cores=2,chains=2, 235 | prior = priors, 236 | refresh=0, 237 | backend="cmdstanr", 238 | stanvars=stanvars) 239 | 240 | ``` 241 | 242 | The running time for this model, which has pretty complicated predictors, is about 7 minutes, so the model is currently robust enough to handle datasets of reasonable size. Performance will improve if I can get the model into `brms` proper. The one divergent transition referenced above is due to the well-known funnel problem of the variance of the random intercepts, and I will ignore it for the purposes of this vignette. 243 | 244 | # Post-Estimation 245 | 246 | The first thing we can do is extract the model cutpoints and overlay them on the empirical distribution to see how the model is dividing the outcome into discrete-ish categories.
We have to transform the cutpoints using the inverse logit function in R (`plogis`) to get back values on the scale of the response; the second cutpoint is recovered by exponentiating its raw parameter and adding it to the first cutpoint: 247 | 248 | ```{r plot_cut} 249 | 250 | all_draws <- extract_draws(brms_fit) 251 | 252 | cutzero <- plogis(all_draws$dpars$cutzero) 253 | cutone <- plogis(all_draws$dpars$cutzero + exp(all_draws$dpars$cutone)) 254 | 255 | pew %>% 256 | ggplot(aes(x=therm)) + 257 | geom_histogram(bins=100) + 258 | theme_minimal() + 259 | theme(panel.grid=element_blank()) + 260 | scale_x_continuous(breaks=c(0,25,50,75,100), 261 | labels=c("0","Colder","50","Warmer","100")) + 262 | geom_vline(xintercept = mean(cutzero)*100,linetype=2) + 263 | geom_vline(xintercept = mean(cutone)*100,linetype=2) + 264 | ylab("") + 265 | xlab("") + 266 | labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) 267 | 268 | 269 | ``` 270 | 271 | We can see in the plot above that the model does a good job isolating values that are very close to 0 and 1 from values that are more continuous in nature. 272 | 273 | We can plot the full predictive distribution relative to the original outcome: 274 | 275 | ```{r post_predict} 276 | 277 | pp_check(brms_fit) + theme_minimal() 278 | 279 | ``` 280 | 281 | We can see the coefficients from the model with the following command (these are on the logit scale with the exception of phi, the scale/dispersion parameter): 282 | 283 | ```{r coef_plot} 284 | plot(brms_fit,ask=F,theme=theme(panel.grid=element_blank(), 285 | panel.background = element_blank())) 286 | ``` 287 | 288 | # Marginal Effects 289 | 290 | We can also look at marginal effects, or the average change in the outcome given a unit change in one of the variables, by using the `conditional_effects` function in `brms`. This function plots the effect of `income` and `education` separately and then the interaction of the two. 291 | 292 | ```{r marg_effect} 293 | 294 | plot(conditional_effects(brms_fit),theme=theme(axis.text.x = element_text(angle=90), 295 | panel.grid=element_blank(), 296 | panel.background = element_blank()),ask=F) 297 | 298 | ``` 299 | 300 | Broadly speaking, these plots show that richer people and less educated people both have less favorable views towards college professors, and the effect is even stronger if we consider the interaction. Wealthier, less educated people are dramatically more likely to have less favorable views towards college professors than poor people with a postgraduate education. The difference is equivalent to 0.4, or 40 points on the original 0 to 100 scale. 301 | 302 | # Understanding Clustering/Polarization of Respondents 303 | 304 | As I explain in the paper, one of the main advantages of using a Beta regression model is its ability to model the dispersion among respondents not just in terms of variance (i.e. heteroskedasticity) but also the shape of dispersion, whether it is U or inverted-U shaped. Conceptually, a U shape would imply that respondents are polarized, moving towards the extremes. An inverted-U shape would imply that respondents tend to cluster around a central value. We can predict these responses conditionally in the sample by adding predictors for `phi`, the scale/dispersion parameter in the Beta distribution. Higher values of `phi` imply a uni-modal distribution clustered around a central value, with increasing `phi` implying more clustering.
Lower values of `phi` imply a bi-modal distribution with values at the extremes. Notably, these effects are calculated independently of the expected value, or mean, of the distribution, so values of `phi` will produce different shapes depending on the average value. 305 | 306 | The one change we need to make to fit this model is to add a formula predicting `phi` in the code below (this formula does not need the `0 + Intercept` syntax of the main model). We wrap both formulas in the `bf` function to indicate they are both distributional parameters. We also need to change the prior argument to `priors_phireg` as the priors of the model have changed as a result of adding predictors. For this model, we will put in an interaction between age and sex to see if younger/older men/women tend to have clustered or more heterogeneous views on college professors. To make the model run a bit faster, we drop the `mo` terms around `education` and `income` so that they are evaluated as dummy variables. 307 | 308 | 309 | ```{r run_brms_phi} 310 | 311 | brms_fit_phireg <- brm(bf(therm ~ 0 + Intercept + education + income, 312 | phi ~ age*sex), 313 | data=model_data, 314 | family=ord_beta_reg, 315 | cores=2,chains=2, 316 | prior = priors_phireg, 317 | refresh=100, 318 | stanvars=stanvars, 319 | backend="cmdstanr") 320 | 321 | ``` 322 | 323 | We cannot use the `conditional_effects` option to plot because the dispersion parameter `phi` by definition does not affect the expected outcome (i.e., the average). Instead, we can use the regular `plot` function to visualize the parameters: 324 | 325 | ```{r plot_phi} 326 | 327 | plot(brms_fit_phireg,pars = "phi",fixed = F,ask=F,combo=c("intervals","hist")) 328 | 329 | ``` 330 | 331 | There is weak to moderate evidence in this plot that women have more homogeneous views and that older women have more homogeneous views than younger women. However, there is substantial uncertainty in this estimate and the estimate itself is not very large. We are using the log link for `phi`, so to get the value of the effect on `phi`, we can exponentiate the coefficient, i.e. `exp(0.1)=``r round(exp(0.1),2)`. We can compare what the distributions look like by plotting densities of simulated outcomes using the Beta distribution and the average response in our data, which is `r round(mean(model_data$therm,na.rm=T),2)`. We will examine a 1-unit change in `phi` (much larger than our estimated effects) from 2 to 3 by using the average response for `mu` and plotting the density of both distributions: 332 | 333 | ```{r plot_phi_sim} 334 | 335 | # parameterization of beta distribution in terms of mu/phi (mean/dispersion) 336 | rbeta_mean <- function(N,mu,phi) { 337 | rbeta(N,mu*phi,(1-mu)*phi) 338 | } 339 | 340 | tibble(phi_small=rbeta_mean(10000,mean(model_data$therm,na.rm=T),2), 341 | phi_big=rbeta_mean(10000,mean(model_data$therm,na.rm=T),3)) %>% 342 | gather("phi_type","simulated_value") %>% 343 | ggplot(aes(x=simulated_value)) + 344 | geom_density(alpha=0.5,aes(fill=phi_type)) + 345 | theme(panel.background = element_blank(), 346 | panel.grid=element_blank()) 347 | 348 | ``` 349 | 350 | We can see that the `phi_big` distribution is more clustered around a central value while `phi_small` shows more movement towards the extremes of the distribution. However, the movement is modest, as the value of the coefficient suggests.
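One way to translate effects on `phi` into the scale of the outcome is through the variance of this mean/dispersion parameterization, which is $\mu(1-\mu)/(1+\phi)$. A small helper along these lines (not part of the vignette code above) converts values of `phi` into implied standard deviations:

```r
# implied standard deviation of a Beta(mu*phi, (1-mu)*phi) outcome
beta_sd <- function(mu, phi) {
  sqrt(mu * (1 - mu) / (1 + phi))
}

beta_sd(0.5, 2) # ~0.29
beta_sd(0.5, 3) # ~0.25: higher phi, tighter clustering
```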
351 | -------------------------------------------------------------------------------- /figures/figure_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_1.pdf -------------------------------------------------------------------------------- /figures/figure_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_2.pdf -------------------------------------------------------------------------------- /figures/figure_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_3.pdf -------------------------------------------------------------------------------- /figures/figure_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_4.pdf -------------------------------------------------------------------------------- /figures/figure_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_5.pdf -------------------------------------------------------------------------------- /figures/figure_6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_6.pdf -------------------------------------------------------------------------------- /figures/figure_7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_7.pdf -------------------------------------------------------------------------------- /frac_logit.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | // need a custom cdf 3 | 4 | real frac_logit_lcdf(real y, real p) { 5 | 6 | return log(1 - 2*p) - (log(2) + log(atan(1 - 2*p))); 7 | 8 | } 9 | 10 | 11 | } 12 | 13 | data { 14 | int n; 15 | int k; // number of columns 16 | matrix[n,k] x; 17 | vector[n] y; 18 | int run_gen; // whether to use generated quantities 19 | } 20 | parameters { 21 | vector[k] X_beta; 22 | real alpha; 23 | } 24 | model { 25 | 26 | X_beta ~ normal(0,5); 27 | alpha ~ normal(0,5); 28 | 29 | // custom "quasi" likelihood 30 | 31 | target += y .* log_inv_logit(alpha + x * X_beta) + (1 - y) .* log1m_inv_logit(alpha + x*X_beta); 32 | 33 | } 34 | generated quantities { 35 | vector[run_gen==1 ? n: 0] frac_log; 36 | vector[run_gen==1 ? 
n: 0] frac_rep; 37 | 38 | if(run_gen==1) { 39 | 40 | frac_log = y .* log_inv_logit(alpha + x * X_beta) + (1 - y) .* log1m_inv_logit(alpha + x*X_beta); 41 | frac_rep = inv_logit(alpha + x * X_beta); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /helper_func.R: -------------------------------------------------------------------------------- 1 | # helper functions for calculating marginal effects 2 | 3 | rbeta_mean <- function(N,mu,phi) { 4 | rbeta(N, mu * phi, (1 - mu) * phi) 5 | } 6 | 7 | # get marginal effects 8 | 9 | eps <- 1e-7 10 | setstep <- function(x) { 11 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 12 | } 13 | 14 | # functions for doing marginal effects 15 | 16 | predict_ordbeta <- function(cutpoints=NULL,X=NULL, 17 | X_miss=NULL,X_beta=NULL,X_beta_miss=NULL, 18 | alpha=NULL, 19 | combined_out=T) { 20 | 21 | # we'll assume the same eta was used to generate outcomes 22 | eta <- X %*% as.matrix(X_beta) + alpha 23 | 24 | # probabilities for three possible categories (0, proportion, 1) 25 | 26 | 27 | low <- 1-plogis(eta - cutpoints[1]) 28 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 29 | high <- plogis(eta - cutpoints[2]) 30 | 31 | 32 | # check for whether combined outcome or single outcome 33 | 34 | if(combined_out) { 35 | 36 | if(!is.null(X_beta_miss)) { 37 | pr_miss <- X_miss %*% as.matrix(X_beta_miss) 38 | low*0 + middle*(0.5*pr_miss + (1-pr_miss)*plogis(eta)) + high*1 39 | } else { 40 | low*0 + middle*plogis(eta) + high*1 41 | } 42 | 43 | } else { 44 | 45 | out_list <- list(pr_zero=low, 46 | pr_proportion=middle, 47 | proportion_value=plogis(eta), 48 | pr_one=high) 49 | 50 | if(!is.null(X_beta_miss)) { 51 | pr_miss <- X_miss %*% as.matrix(X_beta_miss) 52 | out_list$pr_miss <- pr_miss 53 | } 54 | 55 | out_list 56 | 57 | } 58 | 59 | } 60 | 61 | predict_beta <- function(X=NULL,X_beta=NULL) { 62 | eta <- X %*% as.matrix(X_beta) 63 | 64 | prob <- plogis(eta) 65 | 66 | prob 67 | } 68 | 69 | predict_zoib <- function(coef_g=NULL,coef_a=NULL,coef_m=NULL, 70 | alpha1=NULL, 71 | alpha2=NULL, 72 | alpha3=NULL, 73 | X=NULL, 74 | combined_out=T) { 75 | 76 | # we'll assume the same eta was used to generate outcomes 77 | psi <- plogis(as.numeric(alpha1) + X %*% coef_a) 78 | gamma <- plogis(as.numeric(alpha2) + X %*% coef_g) 79 | eta <- as.numeric(alpha3) + X %*% coef_m 80 | 81 | # probabilities for three possible categories (0, proportion, 1) 82 | low <- psi * (1-gamma) 83 | middle <- (1-psi) 84 | high <- psi * gamma 85 | 86 | # check for whether combined outcome or single outcome 87 | 88 | if(combined_out) { 89 | low*0 + middle*plogis(eta) + high*1 90 | } else { 91 | list(pr_zero=low, 92 | pr_proportion=middle, 93 | proportion_value=eta, 94 | pr_one=high) 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /install.R: -------------------------------------------------------------------------------- 1 | # This file will install all packages and also run cmdstanr's install_cmdstan function if it cannot 2 | # detect an existing cmdstan installation 3 | 4 | # code borrowed from Vikram Baliga https://vbaliga.github.io/verify-that-r-packages-are-installed-and-loaded/ 5 | 6 | ## First specify the packages of interest 7 | packages = c("dplyr","rstanarm","tidyr", 8 | "lubridate","loo","kableExtra", 9 | "bayesplot","patchwork","stringr","grDevices","emojifont", 10 | "latex2exp","haven","ggplot2","moments", 11 | "posterior","brms","remotes","future.apply", 12 | 
"faux","rmarkdown","bookdown","tinytex","extrafont","binom","Hmisc", 13 | "ggthemes","ggtext","boot") 14 | 15 | print("Checking and installing packages.") 16 | 17 | # create directory for data 18 | 19 | dir.create("data") 20 | 21 | ## Now load or install&load all 22 | package.check <- lapply( 23 | packages, 24 | FUN = function(x) { 25 | print(paste0("Now checking ", x)) 26 | if (!require(x, character.only = TRUE)) { 27 | print(paste0("Package ",x," not installed, installing from CRAN.")) 28 | install.packages(x, dependencies = TRUE) 29 | } 30 | } 31 | ) 32 | 33 | # install cmdstanr 34 | 35 | if (!require("cmdstanr", character.only = TRUE)) { 36 | install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) 37 | } 38 | 39 | # check for cmdstan installation 40 | 41 | library(cmdstanr) 42 | 43 | # checking if path to cmdstan exists 44 | check_path <- try(cmdstan_path()) 45 | 46 | if('try-error' %in% class(check_path)) { 47 | 48 | user_input <- menu(c("Yes", "No"), title="Cmdstanr cannot find a cmdstan installation (if it does exist, you can also pass it as an option to set_cmdstan_path()). Would you like to install cmdstan from source? This is required to replicate the paper results.") 49 | 50 | } 51 | 52 | if(user_input==1L) { 53 | 54 | print("Installing cmsdstan.") 55 | 56 | print("If this fails, see package documentation at https://mc-stan.org/cmdstanr/.") 57 | 58 | install_cmdstan() 59 | } 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /kubinec_ord_betareg_appendix.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Appendix to Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Survey Sliders and Visual Analog Scales" 3 | bibliography: BibTexDatabase.bib 4 | date: "March 7th, 2020" 5 | output: 6 | bookdown::pdf_document2: 7 | keep_tex: true 8 | toc: false 9 | includes: 10 | in_header: preamble.tex 11 | author: | 12 | | Robert Kubinec 13 | | New York University Abu Dhabi 14 | --- 15 | 16 | ```{r setup, include=F} 17 | 18 | require(rstan) 19 | require(dplyr) 20 | require(brms) 21 | require(cmdstanr) 22 | require(posterior) 23 | require(tidyr) 24 | require(lubridate) 25 | require(loo) 26 | require(kableExtra) 27 | require(bayesplot) 28 | require(patchwork) 29 | require(latex2exp) 30 | require(haven) 31 | 32 | knitr::opts_chunk$set(warning=F,message=F) 33 | 34 | source("helper_func.R") 35 | 36 | ord_beta_mod <- cmdstan_model("beta_logit.stan") 37 | 38 | # to reproduce results, change if you want fresh sampling 39 | 40 | random_seed <- 77123101 41 | 42 | ``` 43 | 44 | # Models Without Degenerate Responses 45 | 46 | Not only is it possible to fit the ordered beta regression mdoel to data without observations at the bounds, but it is advisable to do so if there is even a remote chance that such observations could be observed. For example, it may well be that a certain realization of the data contains observations that just so happened to not reach the bounds and are in the (0.01,0.99) interval. We could imagine this arising in a feeling thermometer/VAS scale where respondents' preferences tend to fairly clustered around the midpoint of the scale. However, a future sample of this same data could end up with observations at the bounds. It would be problematic in this case to fit only a Beta regression to the current data as the estimates would later be incomparable to estimates of future data with observations at the bounds. 
47 | 48 | While this scenario does not necessarily need to happen, it is enough of a motivation to fit the ordered beta regression model even in situations where there are no observations at the bounds (or perhaps only at one bound). The costs of doing so, both in terms of inference and computation, are quite low. Because the cutpoints were assigned a weakly informative prior, *they are identified without any data*. As a result, if a model is fit without any observations on the bounds, the cutpoints will end up in the far corners of the distribution, say at 0.001 and 0.999, but they will still exist and the posterior predictive distribution can produce them with some small probability. If future data was added to the sample incorporating observations at the bounds, the combined estimates would be interpretable and the cutpoints would adjust to handle the new data. 49 | 50 | To demonstrate this, I simulate data from a model with widely spaced cutpoints where I remove any of the few observations that end up at the bounds: 51 | 52 | ```{r sim_bounds} 53 | 54 | N <- 1000 55 | 56 | X <- rnorm(N,runif(1,-2,2),1) 57 | 58 | X_beta <- -1 59 | eta <- X*X_beta 60 | 61 | # ancillary parameter of beta distribution 62 | # high clustering 63 | phi <- 70 64 | 65 | # predictor for ordered model 66 | mu1 <- eta 67 | # predictor for beta regression 68 | mu2 <- eta 69 | 70 | # wide cutpoints on logit scale 71 | cutpoints <- c(-8,8) 72 | 73 | # probabilities for three possible categories (0, proportion, 1) 74 | low <- 1-plogis(mu2 - cutpoints[1]) 75 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 76 | high <- plogis(mu2 - cutpoints[2]) 77 | 78 | # we'll assume the same eta was used to generate outcomes 79 | 80 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 81 | 82 | # now determine which one we get for each observation 83 | outcomes <- sapply(1:N, function(i) { 84 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 85 | }) 86 | 87 | # now combine binary (0/1) with proportion (beta) 88 | 89 | final_out <- sapply(1:length(outcomes),function(i) { 90 | if(outcomes[i]==1) { 91 | return(0) 92 | } else if(outcomes[i]==2) { 93 | return(out_beta[i]) 94 | } else { 95 | return(1) 96 | } 97 | }) 98 | 99 | # remove residual 1/0s 100 | remove_degen <- final_out>0 & final_out<1 101 | final_out <- final_out[remove_degen] 102 | X <- X[remove_degen] 103 | 104 | tibble(x=final_out) %>% 105 | ggplot(aes(x=final_out)) + 106 | geom_histogram(bins=100) + 107 | geom_vline(xintercept = 0,linetype=2) + 108 | geom_vline(xintercept = 1,linetype=2) + 109 | theme(panel.grid=element_blank(), 110 | panel.background=element_blank()) + 111 | ylab("Count") + 112 | xlab("Simulated Outcome") 113 | 114 | ``` 115 | 116 | We can then model this distribution as follows: 117 | 118 | ```{r model_degen} 119 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 120 | N_prop=sum(final_out>0 & final_out<1), 121 | X=1, 122 | outcome_prop=final_out[final_out>0 & final_out<1], 123 | outcome_degen=final_out[final_out %in% c(0,1)], 124 | covar_prop=as.matrix(X), 125 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 126 | N_pred_degen=sum(final_out %in% c(0,1)), 127 | N_pred_prop=sum(final_out>0 & final_out<1), 128 | indices_degen=array(dim=0), 129 | indices_prop=1:(sum(final_out>0 & final_out<1)), 130 | run_gen=1) 131 | 132 | fit_model <- ord_beta_mod$sample(data=to_bl,seed=random_seed, 133 | refresh=0, 134 | chains=1,cores=1,iter_sampling=1000) 135 | 136 | cutpoints <- fit_model$draws("cutpoints") %>% as_draws_matrix 137 | 
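# note (illustration): the cutpoints are sampled on the logit scale; mapping
# their posterior means through the inverse logit, e.g. plogis(mean(cutpoints[,1])),
# gives the implied boundary locations on the (0,1) outcome scale quoted below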
138 | print(fit_model,c("X_beta","cutpoints"))
139 | ```
140 | 
141 | We can see from the model results that our coefficient `X_beta` was estimated without bias (equal to -1). The cutpoints were estimated with a small amount of bias due to the censoring we applied to the outcome variable, but they are still quite close to the original values. Furthermore, they are estimated at the extremes -- the lower cutpoint is `r round(plogis(mean(cutpoints[,1])),4)` and the upper cutpoint is `r round(plogis(mean(cutpoints[,2])),4)`. As this example indicates, there is no reason not to fit the model to data with no observations at the bounds. The cutpoints are still identified and the model converges without a problem. Furthermore, we can then still simulate observations at the bounds from the posterior predictive distribution:
142 | 
143 | ```{r y_rep}
144 | ppc_dens_overlay(final_out,as_draws_matrix(fit_model$draws("regen_all")))
145 | ```
146 | 
147 | 
148 | 
-------------------------------------------------------------------------------- /kubinec_ord_betareg_appendix_anon.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Appendix to Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Continuous Data with Lower and Upper Bounds"
3 | bibliography: BibTexDatabase.bib
4 | name: Robert Kubinec
5 | date: "March 7th, 2020"
6 | geometry: margin=1in
7 | output: 
8 |   bookdown::pdf_document2: 
9 |     keep_tex: true
10 |     toc: true
11 |     includes: 
12 |       in_header: preamble.tex
13 | ---
14 | 
15 | ```{r setup, include=F}
16 | 
17 | require(rstan)
18 | require(dplyr)
19 | require(brms)
20 | require(cmdstanr)
21 | require(posterior)
22 | require(tidyr)
23 | require(lubridate)
24 | require(loo)
25 | require(kableExtra)
26 | require(bayesplot)
27 | require(patchwork)
28 | require(latex2exp)
29 | require(haven)
30 | require(ggthemes)
31 | require(forcats)
32 | require(ggplot2)
33 | 
34 | knitr::opts_chunk$set(warning=F,message=F)
35 | 
36 | source("helper_func.R")
37 | 
38 | ord_beta_mod <- cmdstan_model("beta_logit.stan")
39 | 
40 | ord_Beta_mod_infl <- cmdstan_model("beta_logit_infl_simple.stan")
41 | ord_Beta_mod_phi <- cmdstan_model("beta_logit_phireg.stan")
42 | 
43 | # to reproduce results, change if you want fresh sampling
44 | 
45 | random_seed <- 77123101
46 | 
47 | ```
48 | 
49 | # Joint Posterior and Log-likelihood Definition
50 | 
51 | I can express the model as a log-likelihood for a given observation $y_i$ as follows:
52 | 
53 | ```{=tex}
54 | \begin{equation}
55 | ll(y_i|K,\beta,\phi) = \left\{\begin{array}{lr}
56 |         \text{log } \left[1 - g(X'\beta - k_1)\right] & \text{if } y_i=0\\
57 |         \text{log }\left[g(X'\beta - k_1) - g(X'\beta - k_2) \right ] + \text{log }\text{Beta}(y_i|g(X'\beta),\phi) & \text{if } y_i \in (0,1)\\
58 |         \text{log }g(X'\beta - k_2) & \text{if } y_i=1\\
59 |         \end{array}\right\}
60 |         (\#eq:ll)
61 | \end{equation}
62 | ```
63 | 
64 | Given this likelihood, I can define a joint log posterior distribution over $y$ given the log-likelihood function and the set of parameters:
65 | 
66 | ```{=tex}
67 | \begin{equation}
68 | \text{log } p(k_1, k_2, \beta,\phi|y) \propto \text{ log }p(K) + \text{ log }p(\beta) + \text{ log }p(\phi) + \sum_{i=1}^N ll(y_i|K,\beta,\phi)
69 | (\#eq:logp)
70 | \end{equation}
71 | ```
72 | where $\propto$ indicates that the posterior is calculated proportional to the normalizing constant, i.e., the denominator in Bayes' formula. 
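For readers who prefer code to notation, the following is a minimal R sketch of this log-likelihood (illustrative only and not used elsewhere in this appendix; the function name `ord_beta_ll` is hypothetical, and $g(\cdot)$, the inverse logit, corresponds to `plogis()` in R):

```{r ll_sketch, eval=FALSE}
# per-observation log-likelihood of the ordered beta regression model,
# following Equation \@ref(eq:ll); k1 < k2 are the cutpoints on the logit scale
ord_beta_ll <- function(y, eta, k1, k2, phi) {
  if (y == 0) {
    log(1 - plogis(eta - k1))                    # log Pr(y = 0)
  } else if (y == 1) {
    log(plogis(eta - k2))                        # log Pr(y = 1)
  } else {
    mu <- plogis(eta)                            # mean of the beta component
    log(plogis(eta - k1) - plogis(eta - k2)) +   # log Pr(y in (0,1))
      dbeta(y, mu * phi, (1 - mu) * phi, log = TRUE)
  }
}

# the joint log-likelihood is then the sum of ord_beta_ll over all observations
```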
73 | 
74 | # Simulation with Fixed Parameter Values
75 | 
76 | Relative to the more thorough simulation presented in the main text, the fixed simulation shows that, for this particular set of parameter draws (five covariates with a $\rho_x$ of 0.5, $\phi$ of 2, $k_1$ of -3 and $k_2$ of 2), the ZOIB shows somewhat less variance than ordered beta regression, but at the cost of very high M-errors. The average ZOIB coefficient magnitude is less than one-half that of ordered beta regression, a worrying amount of bias to admit in exchange for a small reduction in variance. This simulation also shows that fractional logit regression has relatively high variance, as is also seen in the empirical example. The Beta regressions on transformed values and on only the continuous responses show high M-errors and very low variance, suggesting again that these data-driven fixes can cause severe distortions in estimating marginal effects.
77 | 
78 | ```{r fixsim,echo=F}
79 | 
80 | #all_sim_fixed <- readRDS("data/sim_cont_X_fixed.rds")
81 | load("data/sim_cont_X_fixed.RData")
82 | checkc <- sapply(all_sim_fixed, class)
83 | all_sim_fixed <- all_sim_fixed[checkc!='try-error']
84 | all_sim_fixed <- bind_rows(all_sim_fixed) %>% 
85 |   unchop(c("med_est","X_beta","marg_eff","high","low","var_calc",
86 |            'marg_eff_est','high_marg','low_marg','var_marg'))
87 | 
88 | my_conf_fun <- function(x) {
89 |   if(all(x>=0 & x<=1)) {
90 |     # use binomial confidence intervals
91 |     bi_ci <- binom::binom.bayes(sum(x,na.rm=T),length(x))
92 |     return(list(y=bi_ci$mean,
93 |                 ymin=bi_ci$lower,
94 |                 ymax=bi_ci$upper))
95 |   } else {
96 |     boot_ci <- Hmisc::smean.cl.boot(x)
97 |     return(list(y=boot_ci["Mean"],
98 |                 ymin=boot_ci["Lower"],
99 |                 ymax=boot_ci["Upper"]))
100 |   }
101 | }
102 | 
103 | 
104 | all_sim_fixed %>% 
105 |   mutate(s_err=sign(marg_eff)!=sign(marg_eff_est),
106 |          m_err=abs(marg_eff_est)/abs(marg_eff)) %>% 
107 |   mutate(Power=as.numeric(ifelse(sign(marg_eff)==sign(high) & sign(marg_eff)==sign(low),
108 |                       1,
109 |                       0))) %>% 
110 |   select(`Proportion S Errors`="s_err",N,Power,
111 |          `M Errors`="m_err",Variance="var_marg",model) %>% 
112 |   gather(key = "type",value="estimate",-model,-N) %>% 
113 |   mutate(N=ifelse(N<100, 100, N)) %>% 
114 |   filter(!is.na(model)) %>% 
115 |   # filter(model %in% c("OLS","Ordinal Beta Regression",'ZOIB',"Fractional")) %>% 
116 |   # group_by(N,model,type) %>% 
117 |   # summarize(med_est=mean(estimate),
118 |   #           low_est=quantile(estimate,.05),
119 |   #           high_est=quantile(estimate,.95)) %>% 
120 |   ggplot(aes(y=estimate,x=N)) +
121 |   #geom_point(aes(colour=model),alpha=0.1) +
122 |   stat_summary(fun.data="mean_cl_boot",aes(colour=model,
123 |                                            shape=model),
124 |                size=.5,fatten=1.5,
125 |                position=position_dodge(width=0.5)) +
126 |   ylab("") +
127 |   xlab("N") +
128 |   facet_wrap(~type,scales="free_y",ncol = 2) +
129 |   scale_color_viridis_d() +
130 |   #scale_y_log10() +
131 |   labs(caption=stringr::str_wrap("Summary statistics for each value of N calculated via bootstrapping. M Errors and S Errors are the magnitude of bias and the incorrect sign of the estimated marginal effect, respectively. 
Variance refers to the estimated posterior variance (uncertainty) of the marginal effect.",width=80)) +
132 |   guides(color=guide_legend(title=""),
133 |          shape=guide_legend(title="")) +
134 |   theme_tufte()
135 | 
136 | ```
137 | 
138 | 
139 | # Models Without Degenerate Responses
140 | 
141 | Not only is it possible to fit the ordered beta regression model to data without observations at the bounds, but it is advisable to do so if there is even a remote chance that such observations could occur. For example, it may well be that a certain realization of the data contains observations that just so happened not to reach the bounds and fall in the (0.01,0.99) interval. We could imagine this arising in a feeling thermometer/VAS scale where respondents' preferences tend to be fairly clustered around the midpoint of the scale. However, a future sample of the same data could end up with observations at the bounds. It would be problematic in this case to fit only a Beta regression to the current data, as the estimates would later be incomparable to estimates from future data with observations at the bounds.
142 | 
143 | While this scenario need not happen, it is enough of a motivation to fit the ordered beta regression model even in situations where there are no observations at the bounds (or perhaps only at one bound). The costs of doing so, both in terms of inference and computation, are quite low. Because the cutpoints were assigned a weakly informative prior, *they are identified without any data*. As a result, if a model is fit without any observations at the bounds, the cutpoints will end up in the far corners of the distribution, say at 0.001 and 0.999, but they will still exist and the posterior predictive distribution can produce them with some small probability. If future data were added to the sample incorporating observations at the bounds, the combined estimates would be interpretable and the cutpoints would adjust to handle the new data. 
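As a quick numerical illustration of this point (a sketch with arbitrary values, not part of the replication), cutpoints far out on the logit scale imply small but nonzero probabilities of responses at the bounds:

```{r cutpoint_sketch, eval=FALSE}
# cutpoints near the "far corners" of the distribution
cutpoints <- qlogis(c(0.001, 0.999))   # roughly -6.9 and 6.9 on the logit scale
eta <- 0                               # linear predictor at its center

1 - plogis(eta - cutpoints[1])         # Pr(y = 0), approximately 0.001
plogis(eta - cutpoints[2])             # Pr(y = 1), approximately 0.001
```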
144 | 145 | To demonstrate this, I simulate data from a model with widely spaced cutpoints where I remove any of the few observations that end up at the bounds: 146 | 147 | ```{r sim_bounds} 148 | 149 | N <- 1000 150 | 151 | X <- rnorm(N,runif(1,-2,2),1) 152 | 153 | X_beta <- -1 154 | eta <- X*X_beta 155 | 156 | # ancillary parameter of beta distribution 157 | # high clustering 158 | phi <- 70 159 | 160 | # predictor for ordered model 161 | mu1 <- eta 162 | # predictor for beta regression 163 | mu2 <- eta 164 | 165 | # wide cutpoints on logit scale 166 | cutpoints <- c(-8,8) 167 | 168 | # probabilities for three possible categories (0, proportion, 1) 169 | low <- 1-plogis(mu2 - cutpoints[1]) 170 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 171 | high <- plogis(mu2 - cutpoints[2]) 172 | 173 | # we'll assume the same eta was used to generate outcomes 174 | 175 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 176 | 177 | # now determine which one we get for each observation 178 | outcomes <- sapply(1:N, function(i) { 179 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 180 | }) 181 | 182 | # now combine binary (0/1) with proportion (beta) 183 | 184 | final_out <- sapply(1:length(outcomes),function(i) { 185 | if(outcomes[i]==1) { 186 | return(0) 187 | } else if(outcomes[i]==2) { 188 | return(out_beta[i]) 189 | } else { 190 | return(1) 191 | } 192 | }) 193 | 194 | # remove residual 1/0s 195 | remove_degen <- final_out>0 & final_out<1 196 | final_out <- final_out[remove_degen] 197 | X <- X[remove_degen] 198 | 199 | tibble(x=final_out) %>% 200 | ggplot(aes(x=final_out)) + 201 | geom_histogram(bins=100) + 202 | geom_vline(xintercept = 0,linetype=2) + 203 | geom_vline(xintercept = 1,linetype=2) + 204 | theme(panel.grid=element_blank(), 205 | panel.background=element_blank()) + 206 | ylab("Count") + 207 | xlab("Simulated Outcome") 208 | 209 | ``` 210 | 211 | We can then model this distribution as follows: 212 | 213 | ```{r model_degen} 214 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 215 | N_prop=sum(final_out>0 & final_out<1), 216 | X=1, 217 | outcome_prop=final_out[final_out>0 & final_out<1], 218 | outcome_degen=final_out[final_out %in% c(0,1)], 219 | covar_prop=as.matrix(X), 220 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 221 | N_pred_degen=sum(final_out %in% c(0,1)), 222 | N_pred_prop=sum(final_out>0 & final_out<1), 223 | indices_degen=array(dim=0), 224 | indices_prop=1:(sum(final_out>0 & final_out<1)), 225 | run_gen=1) 226 | 227 | fit_model <- ord_beta_mod$sample(data=to_bl,seed=random_seed, 228 | refresh=0, 229 | chains=1,cores=1,iter_sampling=1000) 230 | 231 | cutpoints <- fit_model$draws("cutpoints") %>% as_draws_matrix 232 | 233 | print(fit_model,c("X_beta","cutpoints")) 234 | ``` 235 | 236 | We can see from the model results that our coefficient `X_beta` was estimated without bias (equal to -1). The cutpoints were estimated with a little bit of bias due to the censoring we did on the outcome variable, but are still quite close to the original values. Furthermore, they are estimated at extremes -- the lower cutpoint is `r round(plogis(mean(cutpoints[,1])),4)` and the upper cutpoint is `r round(plogis(mean(cutpoints[,2])),4)`. As this example indicates, there is no reason not to fit a model with no observations at the bounds. The cutpoints are still identified and the model converges without a problem. 
Furthermore, we can then still simulate observations at the bounds from the posterior predictive distribution:
237 | 
238 | ```{r y_rep}
239 | ppc_dens_overlay(final_out,as_draws_matrix(fit_model$draws("regen_all")))
240 | ```
241 | 
242 | 
243 | 
517 | ```{r phireg,echo=F,fig.cap="Effects of Covariates on Thermometer Dispersion (Extreme Responses)",eval=F}
518 | 
519 | recode_marg_phi %>% 
520 |   group_by(variable,group_list) %>% 
521 |   mutate(marg=marg*100) %>% 
522 |   summarize(med_est=median(marg),
523 |             high=quantile(marg,.95),
524 |             low=quantile(marg,.05)) %>% 
525 |   ggplot(aes(y=med_est,x=forcats::fct_rev(variable))) +
526 |   geom_pointrange(aes(ymin=low,ymax=high),position=position_dodge(width=.5),alpha=0.5) +
527 |   scale_color_viridis_d() +
528 |   theme_minimal() +
529 |   geom_hline(yintercept = 0,linetype=2) +
530 |   ylab(TeX("Value of Scale/Dispersion Parameter $\\phi$")) +
531 |   theme(legend.position = "top") +
532 |   guides(color=guide_legend(title=""),shape=guide_legend(title="")) +
533 |   facet_wrap(~group_list,scales="free",ncol=2) +
534 |   xlab("") +
535 |   coord_flip()
536 | 
537 | 
538 | ```
539 | 
540 | 
-------------------------------------------------------------------------------- /master.R: --------------------------------------------------------------------------------
1 | # run all scripts necessary to replicate & compile paper and SI
2 | 
3 | # whether to run the simulations from scratch
4 | 
5 | run_sim <- F
6 | 
7 | if(run_sim) {
8 | 
9 |   print("Now on random simulation")
10 | 
11 |   source("ordered_beta_reg_sim.R")
12 | 
13 |   print("Now on fixed simulation")
14 | 
15 |   source("ordered_beta_reg_sim_fixed.R")
16 | 
17 | }
18 | 
19 | print("Compiling paper")
20 | 
21 | rmarkdown::render("kubinec_ord_betareg_accepted_dataverse.Rmd")
22 | 
23 | print("Compiling supplementary information")
24 | 
25 | rmarkdown::render("kubinec_ord_betareg_appendix_anon.Rmd")
-------------------------------------------------------------------------------- /old_code.R: -------------------------------------------------------------------------------- 1 | "Biden Therm" = "b_change_biden_therm", 2 | "Voted" = "b_post_did_voteVoted", 3 | "Age" = "b_pre_age", 4 | "BA" = "b_pre_educBachelors", 5 | "HS" = "b_pre_educHighSchool", 6 | "MA" = "b_pre_educMasters", 7 | "Other Ed" = "b_pre_educOther", 8 | "PhD/MBA/JD" = "b_pre_educProfessionalDDoctoral", 9 | "Some College" = "b_pre_educSomeCollege", 10 | "COVID About Right" = "b_pre_fed_response_covidAboutright", 11 | "COVID Too Slow" = "b_pre_fed_response_covidMuchtooslow", 12 | "COVID Somewhat Quick" = "b_pre_fed_response_covidSomewhattooquick", 13 | "COVID Somewhat Slow" = "b_pre_fed_response_covidSomewhattooslow", 14 | "Disabled" = "b_pre_ocupDisabled", 15 | "Homemaker" = "b_pre_ocupHomemaker", 16 | "Retired" = "b_pre_ocupRetired", 17 | "Student" = "b_pre_ocupStudent", 18 | "Unemployed" = "b_pre_ocupUnemployed", 19 | "Asian" = "b_pre_raceAsian", 20 | "Black" = "b_pre_raceBlack", 21 | "Hispanic" = "b_pre_raceHispanic", 22 | "Multiracial" = "b_pre_raceMultiMracial", 23 | "Native American" = "b_pre_raceNativeAmerican", 24 | "Female" = "b_pre_sexFemale", 25 | "Biden Therm" = "change_biden_therm", 26 | "Biden Therm^2" = "b_Ichange_biden_thermE2", 27 | "Voted" = "post_did_voteVoted", 28 | "Age" = "pre_age", 29 | "BA" = "pre_educBachelor's", 30 | "HS" = "pre_educHigh School", 31 | "MA" = "pre_educMaster's", 32 | "Other Ed" = "pre_educOther", 33 | "PhD/MBA/JD" = "pre_educProfessional/Doctoral", 34 | "Some College" = "pre_educSome College", 35 | "COVID About Right" = "pre_fed_response_covidAbout right", 36 | "COVID Too Slow" = "pre_fed_response_covidMuch too slow", 37 | "COVID Somewhat Quick" = "pre_fed_response_covidSomewhat too quick", 38 | "COVID Somewhat Slow" = "pre_fed_response_covidSomewhat too slow", 39 | "Disabled" = "pre_ocupDisabled", 40 | "Homemaker" = "pre_ocupHomemaker", 41 | "Retired" = "pre_ocupRetired", 42 | "Student" = "pre_ocupStudent", 43 | "Unemployed" = "pre_ocupUnemployed", 44 | "Asian" = "pre_raceAsian", 45 | "Black" = "pre_raceBlack", 46 | "Biden Therm^2"="I(change_biden_therm^2)", 47 | "Hispanic" = "pre_raceHispanic", 48 | "Multiracial" = "pre_raceMulti-racial", 49 | "Native American" = "pre_raceNative American", 50 | "Female" = "pre_sexFemale") 51 | 52 | group_list=forcats::fct_collapse(variable, 53 | `Biden Therm`=c("Biden Therm","Biden Therm^2"), 54 | Demographics=c("Age", 55 | "Female", 56 | "Black", 57 | "Hispanic","Asian", 58 | "Multiracial", 59 | "Native American"), 60 | `Education\nBaseline=Less Than H.S.`=c("HS", 61 | "Some College", 62 | "BA","MA", 63 | "PhD/MBA/JD", 64 | "Other Ed"), 65 | `Federal Response\nBaseline= Too Quick`=c("COVID About Right","COVID Somewhat Slow","COVID Somewhat Quick","COVID Too Slow"), 66 | `Employment\nBaseline=Employed`=c("Disabled", 67 | "Homemaker", "Retired","Student","Unemployed"))) 68 | 69 | # anes <- read_csv("data/anes_timeseries_2020_csv_20210719/anes_timeseries_2020_csv_20210719.csv") %>% 70 | # select(pre_vote_choice_2020="V201029", 71 | # pre_pref_strong="V201030", 72 | # pre_intend_vote_2020="V201032", 73 | # pre_intend_vote_choice_2020="V201033", 74 | # pre_did_vote_2016="V201101", 75 | # pre_country_on_track="V201114", 76 | # pre_afraid="V201116", 77 | # pre_angry="V201118", 78 | # pre_worry="V201120", 79 | # pre_sex="V201600", 80 | # pre_pres_approval="V201129x", 81 | # handle_covid="V201144x", 82 | # pre_biden_therm="V201151", 83 | # 
pre_trump_therm="V201152", 84 | # pre_ideology="V201200", 85 | # pre_party_id="V201231x", 86 | # pre_trust_gov="V201233", 87 | # pre_party_covid="V201244", 88 | # pre_econ_worry="V201335", 89 | # pre_trust_news="V201377", 90 | # pre_fed_response_covid="V201392x", 91 | # pre_age="V201507x", 92 | # pre_marital_status="V201508", 93 | # pre_educ="V201510", 94 | # pre_ocup="V201534x", 95 | # pre_race="V201549x", 96 | # post_party_contact="V202005", 97 | # post_give_money="V202017", 98 | # post_discuss_pol="V202023", 99 | # post_discuss_pol_online="V202029", 100 | # post_asian_therm="V202477", 101 | # post_did_vote="V202109x", 102 | # post_biden_therm="V202143", 103 | # post_who_voted="V202110x", 104 | # pre_date="V203053") %>% 105 | # mutate_all(~ifelse(. %in% c(998,-1:-10),NA,.)) 106 | # 107 | # # clean data 108 | # 109 | # model_data_long <- mutate(anes, 110 | # change_biden_therm=scale(post_biden_therm - pre_biden_therm)[,1], 111 | # asian_scale=post_asian_therm/100, 112 | # therm=floor(post_asian_therm)/100, 113 | # therm_rescale=(therm * (sum(!is.na(therm))-1) + 0.5)/sum(!is.na(therm)), 114 | # #therm=floor(post_biden_therm)/100, 115 | # #therm_rescale=(therm * (sum(!is.na(therm))-1) + 0.5)/sum(!is.na(therm)), 116 | # pre_sex=factor(pre_sex,labels=c("Male","Female")), 117 | # pre_age=pre_age/10, 118 | # pre_educ=factor(pre_educ, 119 | # labels=c("0 & therm<1 ~ "inside",therm==0|therm==1~"outside",TRUE~'other')) -------------------------------------------------------------------------------- /ordbetareg.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /ordered_beta_reg.R: -------------------------------------------------------------------------------- 1 | # Robert Kubinec 2 | # New York University Abu Dhabi 3 | # January 23, 2020 4 | # let's try to simulate a new distribution for ordered beta regression 5 | 6 | require(rstan) 7 | require(bayesplot) 8 | require(dplyr) 9 | 10 | N <- 1000 11 | 12 | X <- runif(N,-2,2) 13 | 14 | # on the logit scale, determines threshold above which we get degenerate (0/1) outcomes 15 | cutpoints <- c(-2,2) 16 | 17 | X_beta <- 2.5 18 | 19 | 20 | sample_ordbeta <- function(cutpoints=NULL,phi=NULL,X=NULL,X_beta=NULL) { 21 | 22 | # we'll assume the same eta was used to generate outcomes 23 | eta <- X*X_beta 24 | 25 | # non-inflated outcome 26 | out_beta <- rbeta(N,plogis(eta) * phi, (1 - plogis(eta)) * phi) 27 | 28 | # probabilities for three possible categories (0, proportion, 1) 29 | low <- 1-plogis(eta - cutpoints[1]) 30 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 31 | high <- plogis(eta - cutpoints[2]) 32 | 33 | # now determine which one we get for each observation 34 | 35 | outcomes <- sapply(1:N, function(i) { 36 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 37 | }) 38 | 39 | # combine and sample from full distribution 40 | 41 | sapply(1:length(outcomes),function(i) { 42 | if(outcomes[i]==1) { 43 | return(0) 44 | } else if(outcomes[i]==2) { 45 | return(out_beta[i]) 46 | } else { 47 | return(1) 48 | } 49 | }) 50 | 51 | } 52 | 53 | predict_ordbeta <- function(cutpoints=NULL,phi=NULL,X=NULL,X_beta=NULL, 54 | combined_out=T) { 55 | 56 | # we'll assume the same 
eta was used to generate outcomes 57 | eta <- X*X_beta 58 | 59 | # probabilities for three possible categories (0, proportion, 1) 60 | low <- 1-plogis(eta - cutpoints[1]) 61 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 62 | high <- plogis(eta - cutpoints[2]) 63 | 64 | # check for whether combined outcome or single outcome 65 | 66 | if(combined_out) { 67 | low*0 + middle*plogis(eta) + high*1 68 | } else { 69 | list(pr_zero=low, 70 | pr_proportion=middle, 71 | proportion_value=out_beta, 72 | pr_one=high) 73 | } 74 | 75 | } 76 | 77 | 78 | 79 | # use this function to sample data 80 | 81 | final_out <- sample_ordbeta(cutpoints=cutpoints, 82 | phi=phi, 83 | X=X, 84 | X_beta=X_beta) 85 | 86 | # we can also use it to calculate "true" marginal effect of X on Y using code from margins package 87 | # i.e., numerical differentiation 88 | # set value of `h` based on `eps` to deal with machine precision 89 | 90 | eps <- 1e-7 91 | setstep <- function(x) { 92 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 93 | } 94 | 95 | y0 <- predict_ordbeta(cutpoints=cutpoints, 96 | phi=phi, 97 | X=X - setstep(X), 98 | X_beta=X_beta) 99 | 100 | y1 <- predict_ordbeta(cutpoints=cutpoints, 101 | phi=phi, 102 | X=X + setstep(X), 103 | X_beta=X_beta) 104 | 105 | marg_eff <- (y1-y0)/((X + setstep(X))-(X - setstep(X))) 106 | 107 | mean(marg_eff) 108 | 109 | # check for pr0 and pr1 110 | 111 | y0 <- predict_ordbeta(cutpoints=cutpoints, 112 | phi=phi, 113 | X=X - setstep(X), 114 | X_beta=X_beta,combined_out = F) 115 | 116 | y1 <- predict_ordbeta(cutpoints=cutpoints, 117 | phi=phi, 118 | X=X + setstep(X), 119 | X_beta=X_beta,combined_out = F) 120 | 121 | marg_eff_0 <- (y1$pr_zero-y0$pr_zero)/((X + setstep(X))-(X - setstep(X))) 122 | 123 | marg_eff_1 <- (y1$pr_one-y0$pr_one)/((X + setstep(X))-(X - setstep(X))) 124 | 125 | mean(marg_eff_0) 126 | mean(marg_eff_1) 127 | 128 | # check what OLS does 129 | 130 | ols_fit <- lm(final_out~X) 131 | 132 | summary(ols_fit) 133 | 134 | # now we need a Stan file 135 | 136 | beta_logit <- stan_model("beta_logit.stan") 137 | 138 | to_sample <- 400 139 | 140 | indices_degen <- sample(1:length(X[final_out %in% c(0,1)]),size=to_sample/2) 141 | indices_prop <- sample(1:length(X[final_out>0 & final_out<1]),size=to_sample/2) 142 | 143 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 144 | N_prop=sum(final_out>0 & final_out<1), 145 | X=1, 146 | outcome_prop=final_out[final_out>0 & final_out<1], 147 | outcome_degen=final_out[final_out %in% c(0,1)], 148 | covar_prop=as.matrix(X[final_out>0 & final_out<1]), 149 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 150 | N_pred_degen=sum(final_out %in% c(0,1)), 151 | N_pred_prop=sum(final_out>0 & final_out<1), 152 | indices_degen=1:(sum(final_out %in% c(0,1))), 153 | indices_prop=1:(sum(final_out>0 & final_out<1)), 154 | run_gen=1) 155 | 156 | fit_model <- sampling(beta_logit,data=to_bl,chains=2,cores=2,iter=1000) 157 | 158 | # regenerate data and see how we do capturing it 159 | 160 | yrep_ordbeta <- extract(fit_model,"regen_all")[[1]] 161 | 162 | # use only a sample of draws 163 | 164 | # final_out_prop <- final_out[final_out>0 & final_out<1][indices_prop] 165 | # final_out_degen <- final_out[final_out %in% c(0,1)][indices_degen] 166 | 167 | final_out_prop <- final_out[final_out>0 & final_out<1] 168 | final_out_degen <- final_out[final_out %in% c(0,1)] 169 | 170 | ppc_ecdf_overlay(y=c(final_out_degen,final_out_prop),yrep=yrep_ordbeta) + 171 | ggtitle("Empirical Posterior Predictive Distribution for Ordinal Beta 
Regression",subtitle="N=1000") 172 | 173 | # ppc_dens_overlay(y=c(final_out_degen,final_out_prop),yrep=yrep) + 174 | # ggtitle("Posterior Predictive Distribution for Ordinal Beta Regression",subtitle="N=1000") 175 | 176 | # looks pretty good 177 | 178 | # try with rstanarm betareg where we pre-transform outcome 179 | 180 | require(rstanarm) 181 | 182 | final_out_scale <- (final_out * (length(final_out) - 1) + .5) / length(final_out) 183 | 184 | betareg_fit <- stan_betareg(formula = outcome~X,data=tibble(outcome=final_out_scale, 185 | X=X),chains=2,cores=2,iter=1000) 186 | 187 | summary(betareg_fit) 188 | 189 | # X_beta has lost 1/3 its value from original (i.e., effect has been compressed) 190 | 191 | # see the distortion re: the original values 192 | 193 | yrep_betareg <- posterior_predict(betareg_fit,draws=100) 194 | 195 | ppc_dens_overlay(y=final_out,yrep=yrep_betareg) + ggtitle("Posterior Predictive Distribution for Rstanarm Beta Regression",subtitle="N=1000, Transformed to (0,1) Scale") 196 | 197 | ppc_dens_overlay(y=final_out_scale,yrep=yrep_betareg) + ggtitle("Posterior Predictive Distribution for Rstanarm Beta Regression",subtitle="N=1000, Transformed to (0,1) Scale") 198 | 199 | # Tends to over/under predict middle and outliers (i.e. 0/1) 200 | 201 | # now try the ZOIB 202 | 203 | zoib_model <- stan_model("zoib.stan") 204 | x <- as.matrix(X) 205 | zoib_fit <- sampling(zoib_model,data=list(n=length(final_out), 206 | y=final_out, 207 | k=ncol(x), 208 | x=x, 209 | run_gen=1),chains=2,cores=2,iter=1000) 210 | 211 | # this is X_beta 212 | print(zoib_fit,pars="coef_m") 213 | yrep_zoib <- extract(zoib_fit,"zoib_regen")[[1]] 214 | ppc_dens_overlay(y=final_out,yrep=yrep_zoib) + 215 | ggtitle("Empirical Posterior Predictive Distribution for Ordinal Beta Regression",subtitle="N=1000") 216 | 217 | # Compare Distributions --------------------------------------------------- 218 | 219 | require(loo) 220 | 221 | loo_ordbeta <- loo(fit_model,"ord_log") 222 | 223 | loo_zoib <- loo(zoib_fit,"zoib_log") 224 | 225 | loo_compare(loo_ordbeta,loo_zoib) 226 | 227 | 228 | # Marginal Effects -------------------------------------------------------- 229 | 230 | # we can calculate predicted values and marginal effects for a single X 231 | 232 | X_beta <- as.matrix(fit_model,"X_beta") 233 | cuts_est <- as.matrix(fit_model,"cutpoints") 234 | 235 | # fascinating. 
we can do numerical differentiation -- cool 236 | 237 | pr_y0 <- sapply(1:nrow(X_beta),function(i) { 238 | b <- X_beta[i,] 239 | c <- cuts_est[i,1] 240 | mean(1-plogis(b*X - c)) 241 | }) 242 | 243 | pr_y1 <- sapply(1:nrow(X_beta),function(i) { 244 | b <- X_beta[i,] 245 | c <- cuts_est[i,2] 246 | mean(plogis(b*X - c)) 247 | }) 248 | 249 | mu_est <- sapply(1:nrow(X_beta),function(i) { 250 | b <- X_beta[i,] 251 | plogis(mean(b*X)) 252 | }) 253 | 254 | mean(plogis(mu1)) 255 | 256 | -------------------------------------------------------------------------------- /ordered_beta_reg_sim_fixed.R: -------------------------------------------------------------------------------- 1 | # Robert Kubinec 2 | # New York University Abu Dhabi 3 | # October 12, 2021 4 | # Simulation of 0 - 1 bounded dependent variables 5 | # Note simulation will take some time, approx ~2 days with 3 cores 6 | 7 | require(cmdstanr) 8 | require(bayesplot) 9 | require(dplyr) 10 | require(brms) 11 | require(loo) 12 | require(posterior) 13 | require(future.apply) 14 | require(faux) 15 | 16 | RNGkind("L'Ecuyer-CMRG") 17 | 18 | beta_logit <- cmdstanr::cmdstan_model("beta_logit.stan",compile = T) 19 | zoib_model <- cmdstanr::cmdstan_model("zoib_nophireg.stan") 20 | frac_mod <- cmdstanr::cmdstan_model("frac_logit.stan") 21 | 22 | set.seed(772235) 23 | 24 | # number of draws per sample size 25 | draws <- 100 26 | 27 | # let's do some simulations 28 | 29 | simul_data <- tibble(N=rep(c(25,50,75,seq(100,3000,by=200)),each=draws)) %>% 30 | mutate(k=5, 31 | rho=.7, 32 | phi=2, 33 | cutpoints1=-3) %>% 34 | mutate(cutpoints2=2, 35 | X_beta=list(c(-1,1,-1,1,-1))) 36 | 37 | # we can also use it to calculate "true" marginal effect of X on Y using code from margins package 38 | # i.e., numerical differentiation 39 | # set value of `h` based on `eps` to deal with machine precision 40 | 41 | eps <- 1e-7 42 | setstep <- function(x) { 43 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 44 | } 45 | 46 | # functions for doing marginal effects 47 | 48 | predict_ordbeta <- function(cutpoints=NULL,X=NULL,X_beta=NULL, 49 | combined_out=T) { 50 | 51 | # we'll assume the same eta was used to generate outcomes 52 | eta <- X%*%matrix(X_beta)[,1] 53 | 54 | # probabilities for three possible categories (0, proportion, 1) 55 | low <- 1-plogis(eta - cutpoints[1]) 56 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 57 | high <- plogis(eta - cutpoints[2]) 58 | 59 | # check for whether combined outcome or single outcome 60 | 61 | if(combined_out) { 62 | low*0 + middle*plogis(eta) + high*1 63 | } else { 64 | list(pr_zero=low, 65 | pr_proportion=middle, 66 | proportion_value=plogis(eta), 67 | pr_one=high) 68 | } 69 | 70 | } 71 | 72 | predict_zoib <- function(coef_g=NULL,coef_a=NULL,coef_m=NULL, 73 | alpha1=NULL, 74 | alpha2=NULL, 75 | alpha3=NULL, 76 | X=NULL, 77 | combined_out=T) { 78 | 79 | # we'll assume the same eta was used to generate outcomes 80 | psi <- plogis(alpha1 + X %*% t(coef_a)) 81 | gamma <- plogis(alpha2 + X %*% t(coef_g)) 82 | eta <- alpha3 + X %*% t(coef_m) 83 | 84 | # probabilities for three possible categories (0, proportion, 1) 85 | low <- psi * (1-gamma) 86 | middle <- (1-psi) 87 | high <- psi * gamma 88 | 89 | # check for whether combined outcome or single outcome 90 | 91 | if(combined_out) { 92 | low*0 + middle*plogis(eta) + high*1 93 | } else { 94 | list(pr_zero=low, 95 | pr_proportion=middle, 96 | proportion_value=eta, 97 | pr_one=high) 98 | } 99 | 100 | } 101 | 102 | gen_x <- function(k,rho,N_rep) { 103 | 104 | # generate 
true coefs 105 | 106 | true_coef <- runif(k, -5, 5) 107 | 108 | # generate matrix of correlated Xs 109 | 110 | out_x <- sapply(1:length(k), function(i) { 111 | 112 | this_x <- 1 113 | 114 | }) 115 | 116 | } 117 | 118 | # one for each model type 119 | 120 | r_seeds <- c(6635,2216,8845,9936,3321,63914) 121 | 122 | plan(multicore) 123 | 124 | all_simul_data <- future_lapply(1:nrow(simul_data), function(i,simul_data=NULL,r_seeds=NULL) { 125 | #all_simul_data <- lapply(1:nrow(simul_data), function(i,simul_data=NULL,r_seeds=NULL) { 126 | 127 | this_data <- slice(simul_data,i) 128 | cat(file = "simul_status.txt",paste0("Now on row ",i),append = T) 129 | 130 | # Draw from ordered beta regression --------------------------------------- 131 | 132 | 133 | 134 | N <- this_data$N 135 | 136 | #X <- rnorm(N,runif(1,-2,2),1) 137 | 138 | X_beta <- this_data$X_beta[[1]] 139 | 140 | # need to create X 141 | 142 | X <- rnorm_multi(n=N,vars=this_data$k,r=this_data$rho,as.matrix=T) 143 | 144 | eta <- -2 + X%*%matrix(X_beta) 145 | 146 | # ancillary parameter of beta distribution 147 | phi <- this_data$phi 148 | 149 | # predictor for ordered model 150 | mu1 <- eta[,1] 151 | # predictor for beta regression 152 | mu2 <- eta[,1] 153 | 154 | cutpoints <- c(this_data$cutpoints1,this_data$cutpoints2) 155 | 156 | # probabilities for three possible categories (0, proportion, 1) 157 | low <- 1-plogis(mu2 - cutpoints[1]) 158 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 159 | high <- plogis(mu2 - cutpoints[2]) 160 | 161 | # we'll assume the same eta was used to generate outcomes 162 | 163 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 164 | 165 | print(i) 166 | 167 | # now determine which one we get for each observation 168 | outcomes <- sapply(1:N, function(i) { 169 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 170 | }) 171 | 172 | # now combine binary (0/1) with proportion (beta) 173 | 174 | final_out <- sapply(1:length(outcomes),function(i) { 175 | if(outcomes[i]==1) { 176 | return(0) 177 | } else if(outcomes[i]==2) { 178 | return(out_beta[i]) 179 | } else { 180 | return(1) 181 | } 182 | }) 183 | 184 | # check for floating point errors 185 | 186 | final_out <- ifelse(final_out>(1 - 1e-10) & final_out<1,final_out - 1e-10, 187 | final_out) 188 | 189 | final_out <- ifelse(final_out<(0 + 1e-10) & final_out>0,final_out + 1e-10, 190 | final_out) 191 | 192 | counter <- environment() 193 | counter$i <- 0 194 | 195 | # get rid of very rare outcomes, can be difficult for ZOIB 196 | 197 | while(((sum(final_out>0 & final_out<1)<5) || (sum(final_out==1)<5) || (sum(final_out==0)<5)) && (counter$i<20)) { 198 | 199 | final_out <- sapply(1:length(outcomes),function(i) { 200 | if(outcomes[i]==1) { 201 | return(0) 202 | } else if(outcomes[i]==2) { 203 | return(out_beta[i]) 204 | } else { 205 | return(1) 206 | } 207 | }) 208 | cat(file = "simul_status.txt","Resampling",append=T) 209 | counter$i <- counter$i + 1 210 | } 211 | 212 | if((sum(final_out>0 & final_out<1)<5) || (sum(final_out==1)<5) || (sum(final_out==0)<5)) { 213 | 214 | print(paste0("DGP failed for row ",i,"\n")) 215 | 216 | this_data$status <- "dgp_failure" 217 | 218 | return(this_data) 219 | } 220 | 221 | # calculate "true" marginal effects 222 | 223 | # loop over k 224 | 225 | X_low <- X 226 | X_high <- X 227 | 228 | marg_eff <- sapply(1:this_data$k, function(tk) { 229 | 230 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 231 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 232 | 233 | y0 <- predict_ordbeta(cutpoints=cutpoints, 234 | X=X_low, 
235 | X_beta=X_beta) 236 | 237 | y1 <- predict_ordbeta(cutpoints=cutpoints, 238 | X=X_high, 239 | X_beta=X_beta) 240 | 241 | mean((y1-y0)/(X_high[,tk]-X_low[,tk])) 242 | 243 | 244 | }) 245 | 246 | 247 | 248 | 249 | # Fit models -------------------------------------------------------------- 250 | 251 | # now fit ordinal beta 252 | 253 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 254 | N_prop=sum(final_out>0 & final_out<1), 255 | X=this_data$k, 256 | outcome_prop=final_out[final_out>0 & final_out<1], 257 | outcome_degen=final_out[final_out %in% c(0,1)], 258 | covar_prop=X[final_out>0 & final_out<1,,drop=F], 259 | covar_degen=X[final_out %in% c(0,1),,drop=F], 260 | N_pred_degen=sum(final_out %in% c(0,1)), 261 | N_pred_prop=sum(final_out>0 & final_out<1), 262 | indices_degen=1:(sum(final_out %in% c(0,1))), 263 | indices_prop=1:(sum(final_out>0 & final_out<1)), 264 | run_gen=1) 265 | 266 | x <- X 267 | 268 | final_out_scale <- (final_out * (length(final_out) - 1) + .5) / length(final_out) 269 | 270 | frac_data <- list(y=final_out, 271 | x=x, 272 | k=ncol(x), 273 | run_gen=1, 274 | n=nrow(x)) 275 | # fit all models 276 | 277 | 278 | fit_model <-try(beta_logit$sample(data=to_bl,seed=r_seeds[1], 279 | chains=1,parallel_chains=1,iter_warmup=500, 280 | refresh=0, 281 | iter_sampling=500)) 282 | 283 | 284 | 285 | zoib_fit <- try(zoib_model$sample(data=list(n=length(final_out), 286 | s=length(final_out), 287 | sample_all=1:length(final_out), 288 | y=final_out, 289 | k=ncol(x), 290 | x=x, 291 | run_gen=1), 292 | seed=r_seeds[2], 293 | refresh=0,chains=1,parallel_chains=1,iter_warmup=500, 294 | iter_sampling=500)) 295 | 296 | X_brms <- X 297 | colnames(X_brms) <- paste0(rep("Var",ncol(X)),1:ncol(X)) 298 | X_brms <- as_tibble(X_brms) 299 | 300 | betareg_fit <- try(brm(formula = outcome~.,data=mutate(X_brms, 301 | outcome=final_out_scale), 302 | chains=1,cores=1,iter=1000, 303 | seed=r_seeds[3], 304 | silent=0,refresh=0, 305 | family="beta", 306 | prior=set_prior("normal(0,5)", class = "b"), 307 | backend='cmdstanr')) 308 | 309 | X_brms_small <- filter(X_brms, final_out>0 & final_out<1) 310 | 311 | X_brms_small$outcome <- final_out[final_out>0 & final_out<1] 312 | 313 | betareg_fit2 <- try(update(betareg_fit,newdata=X_brms_small, 314 | chains=1,cores=1,iter=1000, 315 | seed=r_seeds[4], 316 | silent=0,refresh=0, 317 | family="beta", 318 | backend="cmdstanr")) 319 | 320 | frac_fit <- try(frac_mod$sample(data=frac_data,seed=r_seeds[6], 321 | chains=1,parallel_chains = 1,refresh=0, 322 | iter_warmup=500,iter_sampling = 500)) 323 | 324 | lm_fit <- try(brm(formula = outcome~X,data=tibble(outcome=final_out, 325 | r_seeds[5], 326 | X=X),chains=1,cores=1,iter=1000, 327 | silent=0,refresh=0, 328 | backend="cmdstanr", 329 | prior=set_prior("normal(0,5)", class = "b"))) 330 | 331 | if('try-error' %in% c(class(fit_model), 332 | class(zoib_model), 333 | class(betareg_fit), 334 | class(betareg_fit2), 335 | class(frac_fit), 336 | class(lm_fit))) { 337 | 338 | print(paste0("Estimation failed for row ",i,"\n")) 339 | 340 | this_data$status <- "estimation_failure" 341 | 342 | return(this_data) 343 | 344 | } 345 | 346 | yrep_ord <- try(as_draws_matrix(fit_model$draws("regen_epred"))) 347 | 348 | yrep_zoib <- try(as_draws_matrix(zoib_fit$draws("zoib_epred"))) 349 | 350 | yrep_betareg <- try(posterior_epred(betareg_fit,draws=100)) 351 | 352 | yrep_betareg2 <- try(posterior_epred(betareg_fit2,draws=100)) 353 | 354 | yrep_lm <- try(posterior_epred(lm_fit,draws=100)) 355 | 356 | yrep_frac <- 
try(as_draws_matrix(frac_fit$draws("frac_rep"))) 357 | 358 | if('try-error' %in% c(class(yrep_ord), 359 | class(yrep_zoib), 360 | class(yrep_betareg), 361 | class(yrep_betareg2), 362 | class(yrep_frac), 363 | class(yrep_lm))) { 364 | 365 | print(paste0("Estimation failed for row ",i,"\n")) 366 | 367 | this_data$status <- "estimation_failure" 368 | 369 | return(this_data) 370 | 371 | } 372 | 373 | this_data$status <- "success" 374 | 375 | 376 | # Calculate estimands ----------------------------------------------------- 377 | 378 | 379 | # now return the full data frame 380 | 381 | X_beta_ord <- as_draws_matrix(fit_model$draws("X_beta")) 382 | X_beta_zoib <- as_draws_matrix(zoib_fit$draws("coef_m")) 383 | X_beta_reg <- as.matrix(betareg_fit,pars=paste0("Var",1:this_data$k)) 384 | X_beta_reg2 <- as.matrix(betareg_fit2,pars=paste0("Var",1:this_data$k)) 385 | X_beta_lm <- as.matrix(lm_fit,pars="X") 386 | X_beta_frac <- as_draws_matrix(frac_fit$draws(variables="X_beta")) 387 | 388 | # calculate rmse 389 | 390 | rmse_ord <- sqrt(mean(apply(yrep_ord,1,function(c) { (c - c(final_out[final_out %in% c(0,1)],final_out[final_out>0 & final_out<1]))^2 }))) 391 | rmse_zoib <-sqrt( mean(apply(yrep_zoib,1,function(c) { (c - final_out)^2 }))) 392 | rmse_betareg <- sqrt(mean(apply(yrep_betareg,1,function(c) { (c - final_out)^2 }))) 393 | rmse_betareg2 <- sqrt(mean(apply(yrep_betareg2,1,function(c) { (c - final_out[final_out>0 & final_out<1])^2 }))) 394 | rmse_lm <- sqrt(mean(apply(yrep_lm,1,function(c) { (c - final_out)^2 }))) 395 | rmse_frac <- sqrt(mean(apply(yrep_frac,1,function(c) { (c - final_out)^2 }))) 396 | 397 | # calculate loo 398 | 399 | loo_ordbeta <-try(fit_model$loo("ord_log")) 400 | loo_betareg <- loo(betareg_fit) 401 | loo_betareg2 <- loo(betareg_fit2) 402 | loo_zoib <- try(zoib_fit$loo("zoib_log")) 403 | loo_lm <- loo(lm_fit) 404 | loo_frac <- try(frac_fit$loo("frac_log")) 405 | 406 | if(any('try-error' %in% c(class(loo_ordbeta), 407 | class(loo_zoib), 408 | class(loo_frac)))) { 409 | comp_loo <- matrix(rep(NA,6),ncol=2) 410 | row.names(comp_loo) <- c("ord","zoib","frac") 411 | loo_ordbeta <- list(estimates=matrix(c(NA,NA),ncol=2)) 412 | loo_zoib <- list(estimates=matrix(c(NA,NA),ncol=2)) 413 | loo_frac <- list(estimates=matrix(c(NA,NA),ncol=2)) 414 | } else { 415 | comp_loo <- loo_compare(list(ord=loo_ordbeta,zoib=loo_zoib, 416 | lm=loo_lm,frac=loo_frac)) 417 | } 418 | 419 | 420 | 421 | # calculate marginal effects 422 | 423 | cutpoints_est <- as_draws_matrix(fit_model$draws("cutpoints")) 424 | 425 | margin_ord <- lapply(1:ncol(X), function(tk) { 426 | 427 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 428 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 429 | 430 | tibble(marg_eff=sapply(1:nrow(X_beta_ord), function(i) { 431 | y0 <- predict_ordbeta(cutpoints=cutpoints_est[i,], 432 | X=X_low, 433 | X_beta=c(X_beta_ord[i,])) 434 | 435 | y1 <- predict_ordbeta(cutpoints=cutpoints_est[i,], 436 | X=X_high, 437 | X_beta=c(X_beta_ord[i,])) 438 | 439 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 440 | 441 | mean(marg_eff) 442 | }), 443 | x_col=tk) 444 | 445 | }) %>% bind_rows 446 | 447 | # now for the ZOIB 448 | 449 | coef_a <- as_draws_matrix(zoib_fit$draws("coef_a")) 450 | coef_g <- as_draws_matrix(zoib_fit$draws("coef_g")) 451 | alpha <- as_draws_matrix(zoib_fit$draws("alpha")) 452 | 453 | margin_zoib <- lapply(1:ncol(X), function(tk) { 454 | 455 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 456 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 457 | 458 | tibble(marg_eff= sapply(1:nrow(X_beta_zoib), function(i) { 459 | 
y0 <- predict_zoib(coef_g=coef_g[i,], 460 | coef_a=coef_a[i,], 461 | alpha1=c(alpha[i,1]), 462 | alpha2=c(alpha[i,2]), 463 | alpha3=c(alpha[i,3]), 464 | X=X_low, 465 | coef_m=X_beta_zoib[i,]) 466 | 467 | y1 <- predict_zoib(coef_g=coef_g[i,], 468 | coef_a=coef_a[i,], 469 | alpha1=c(alpha[i,1]), 470 | alpha2=c(alpha[i,2]), 471 | alpha3=c(alpha[i,3]), 472 | X=X_high, 473 | coef_m=X_beta_zoib[i,]) 474 | 475 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 476 | 477 | mean(marg_eff) 478 | }), 479 | x_col=tk) 480 | 481 | }) %>% bind_rows 482 | 483 | # now betareg 484 | 485 | betareg_int <- as.matrix(betareg_fit,pars="(Intercept)") 486 | 487 | margin_betareg <- lapply(1:ncol(X), function(tk) { 488 | 489 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 490 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 491 | 492 | tibble(marg_eff= sapply(1:nrow(X_beta_reg), function(i) { 493 | y0 <- plogis(betareg_int[i,"b_Intercept"] + X_low%*%X_beta_reg[i,]) 494 | y1 <- plogis(betareg_int[i,"b_Intercept"] + X_high%*%X_beta_reg[i,]) 495 | 496 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 497 | 498 | mean(marg_eff) 499 | }), 500 | x_col=tk) 501 | 502 | }) %>% bind_rows 503 | 504 | betareg2_int <- as.matrix(betareg_fit2,pars="(Intercept)") 505 | 506 | margin_betareg2 <- lapply(1:ncol(X), function(tk) { 507 | 508 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 509 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 510 | 511 | tibble(marg_eff= sapply(1:nrow(X_beta_reg), function(i) { 512 | y0 <- plogis(betareg2_int[i,"b_Intercept"] + X_low%*%X_beta_reg2[i,]) 513 | y1 <- plogis(betareg2_int[i,"b_Intercept"] + X_high%*%X_beta_reg2[i,]) 514 | 515 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 516 | 517 | mean(marg_eff) 518 | }), 519 | x_col=tk) 520 | 521 | }) %>% bind_rows 522 | 523 | 524 | # Fractional logit marg effects ------------------------------------------- 525 | 526 | frac_int <- as_draws_matrix(frac_fit$draws(variables="alpha")) 527 | 528 | margin_frac <- lapply(1:ncol(X), function(tk) { 529 | 530 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 531 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 532 | 533 | tibble(marg_eff= sapply(1:nrow(X_beta_frac), function(i) { 534 | y0 <- plogis(c(frac_int[i,]) + X_low%*%X_beta_frac[i,,drop=T]) 535 | y1 <- plogis(c(frac_int[i,]) + X_high%*%X_beta_frac[i,,drop=T]) 536 | 537 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 538 | 539 | mean(marg_eff) 540 | }), 541 | x_col=tk) 542 | 543 | 544 | }) %>% bind_rows 545 | 546 | 547 | 548 | this_data$marg_eff <- list(marg_eff) 549 | this_data$true_kurt <- moments::kurtosis(final_out) 550 | 551 | 552 | 553 | # Combine estimates ------------------------------------------------------- 554 | 555 | sum_marg <- function(d,func,...) 
{ 556 | 557 | ret_vec <- arrange(d,x_col) %>% 558 | group_by(x_col) %>% 559 | summarize(sum_stat=func(marg_eff,...)) %>% 560 | pull(sum_stat) 561 | 562 | list(ret_vec) 563 | 564 | } 565 | 566 | try(bind_cols(purrr::map_dfr(seq_len(6), ~this_data),bind_rows(tibble(model="Ordinal Beta Regression", 567 | med_est=list(apply(X_beta_ord,2,mean)), 568 | high=list(apply(X_beta_ord,2,quantile,.95)), 569 | low=list(apply(X_beta_ord,2,quantile,.05)), 570 | var_calc=list(apply(X_beta_ord,2,var)), 571 | loo_val=loo_ordbeta$estimates[1,1], 572 | p_loo=loo_ordbeta$estimates[2,1], 573 | bad_k=sum(loo_ordbeta$diagnostics$pareto_k>0.5), 574 | win_loo=which(row.names(comp_loo)=="ord"), 575 | win_loo_se=comp_loo[which(row.names(comp_loo)=="ord"),2], 576 | rmse=rmse_ord, 577 | kurt_est=mean(apply(yrep_ord,1,moments::kurtosis)), 578 | marg_eff_est=sum_marg(margin_ord,mean), 579 | high_marg=sum_marg(margin_ord,quantile,.95), 580 | low_marg=sum_marg(margin_ord,quantile,.05), 581 | var_marg=sum_marg(margin_ord,var)), 582 | tibble(model="ZOIB", 583 | med_est=list(apply(X_beta_zoib,2,mean)), 584 | high=list(apply(X_beta_zoib,2,quantile,.95)), 585 | low=list(apply(X_beta_zoib,2,quantile,.05)), 586 | var_calc=list(apply(X_beta_zoib,2,var)), 587 | loo_val=loo_zoib$estimates[1,1], 588 | p_loo=loo_zoib$estimates[2,1], 589 | bad_k=sum(loo_zoib$diagnostics$pareto_k>0.5), 590 | kurt_est=mean(apply(yrep_zoib,1,moments::kurtosis)), 591 | win_loo=which(row.names(comp_loo)=="zoib"), 592 | win_loo_se=comp_loo[which(row.names(comp_loo)=="zoib"),2], 593 | rmse=rmse_zoib, 594 | marg_eff_est=sum_marg(margin_zoib,mean), 595 | high_marg=sum_marg(margin_zoib,quantile,.95), 596 | low_marg=sum_marg(margin_zoib,quantile,.05), 597 | var_marg=sum_marg(margin_zoib,var)), 598 | tibble(model="Beta Regression - Transformed", 599 | med_est=list(apply(X_beta_reg,2,mean)), 600 | high=list(apply(X_beta_reg,2,quantile,.95)), 601 | low=list(apply(X_beta_reg,2,quantile,.05)), 602 | var_calc=list(apply(X_beta_reg,2,var)), 603 | loo_val=loo_betareg$estimates[1,1], 604 | p_loo=loo_betareg$estimates[2,1], 605 | bad_k=sum(loo_betareg$diagnostics$pareto_k>0.5), 606 | rmse=rmse_betareg, 607 | kurt_est=mean(apply(yrep_betareg,1,moments::kurtosis)), 608 | marg_eff_est=sum_marg(margin_betareg,mean), 609 | high_marg=sum_marg(margin_betareg,quantile,.95), 610 | low_marg=sum_marg(margin_betareg,quantile,.05), 611 | var_marg=sum_marg(margin_betareg,var)), 612 | tibble(model="Beta Regression - (0,1)", 613 | med_est=list(apply(X_beta_reg2,2,mean)), 614 | high=list(apply(X_beta_reg2,2,quantile,.95)), 615 | low=list(apply(X_beta_reg2,2,quantile,.05)), 616 | var_calc=list(apply(X_beta_reg2,2,var)), 617 | loo_val=loo_betareg2$estimates[1,1], 618 | p_loo=loo_betareg2$estimates[2,1], 619 | bad_k=sum(loo_betareg2$diagnostics$pareto_k>0.5), 620 | kurt_est=mean(apply(yrep_betareg2,1,moments::kurtosis)), 621 | rmse=rmse_betareg2, 622 | marg_eff_est=sum_marg(margin_betareg2,mean), 623 | high_marg=sum_marg(margin_betareg2,quantile,.95), 624 | low_marg=sum_marg(margin_betareg2,quantile,.05), 625 | var_marg=sum_marg(margin_betareg2,var)), 626 | tibble(model="OLS", 627 | med_est=list(apply(X_beta_lm,2,mean)), 628 | high=list(apply(X_beta_lm,2,quantile,.95)), 629 | low=list(apply(X_beta_lm,2,quantile,.05)), 630 | var_calc=list(apply(X_beta_lm,2,var)), 631 | kurt_est=mean(apply(yrep_lm,1,moments::kurtosis)), 632 | loo_val=loo_lm$estimates[1,1], 633 | p_loo=loo_lm$estimates[2,1], 634 | bad_k=sum(loo_lm$diagnostics$pareto_k>0.5), 635 | win_loo=which(row.names(comp_loo)=="lm"), 636 | 
win_loo_se=comp_loo[which(row.names(comp_loo)=="lm"),2], 637 | rmse=rmse_lm, 638 | marg_eff_est=list(apply(X_beta_lm,2,mean)), 639 | high_marg=list(apply(X_beta_lm,2,quantile,.95)), 640 | low_marg=list(apply(X_beta_lm,2,quantile,.05)), 641 | var_marg=list(apply(X_beta_lm,2,var))), 642 | tibble(model="Fractional", 643 | med_est=list(apply(X_beta_frac,2,mean)), 644 | high=list(apply(X_beta_frac,2,quantile,.95)), 645 | low=list(apply(X_beta_frac,2,quantile,.05)), 646 | var_calc=list(apply(X_beta_frac,2,var)), 647 | loo_val=loo_frac$estimates[1,1], 648 | p_loo=loo_frac$estimates[2,1], 649 | bad_k=sum(loo_frac$diagnostics$pareto_k>0.5), 650 | win_loo=which(row.names(comp_loo)=="frac"), 651 | win_loo_se=comp_loo[which(row.names(comp_loo)=="frac"),2], 652 | kurt_est=mean(apply(yrep_frac,1,moments::kurtosis)), 653 | rmse=rmse_frac, 654 | marg_eff_est=sum_marg(margin_frac,mean), 655 | high_marg=sum_marg(margin_frac,quantile,.95), 656 | low_marg=sum_marg(margin_frac,quantile,.05), 657 | var_marg=sum_marg(margin_frac,var))))) 658 | 659 | 660 | #},simul_data=simul_data,r_seeds=r_seeds) 661 | },simul_data=simul_data,r_seeds=r_seeds,future.seed=TRUE) 662 | 663 | #simul_data_final <- bind_rows(all_simul_data) 664 | 665 | saveRDS(all_simul_data,"sim_cont_X_fixed.rds") 666 | 667 | all_sim_fixed <- all_simul_data 668 | 669 | save(all_sim_fixed, file="sim_cont_X_fixed.RData") 670 | -------------------------------------------------------------------------------- /preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{booktabs} 2 | \usepackage{longtable} 3 | \usepackage{array} 4 | \usepackage{multirow} 5 | \usepackage{wrapfig} 6 | \usepackage{float} 7 | \usepackage{colortbl} 8 | \usepackage{pdflscape} 9 | \usepackage{tabu} 10 | \usepackage{threeparttable} 11 | \usepackage{threeparttablex} 12 | \usepackage[normalem]{ulem} 13 | \usepackage{makecell} 14 | \usepackage{xcolor} 15 | \usepackage{xstring} 16 | \usepackage{nameref} 17 | \linespread{2} 18 | 19 | -------------------------------------------------------------------------------- /zoib.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; 3 | int k; // number of columns 4 | matrix[n,k] x; 5 | vector[n] y; 6 | int run_gen; // whether to use generated quantities 7 | } 8 | transformed data { 9 | int is_discrete[n]; 10 | int y_discrete[n]; 11 | 12 | // create indicator for whether y is discrete 13 | // and an integer value to pass to bernoulli_lpmf for discrete y 14 | for (i in 1:n) { 15 | if (y[i] == 0) { 16 | is_discrete[i] = 1; 17 | y_discrete[i] = 0; 18 | } else if (y[i] == 1) { 19 | is_discrete[i] = 1; 20 | y_discrete[i] = 1; 21 | } else { 22 | is_discrete[i] = 0; 23 | // hack to ensure that throws error if passed to bernoulli_lpmf 24 | y_discrete[i] = -1; 25 | } 26 | } 27 | } 28 | parameters { 29 | vector[k] coef_a; 30 | vector[k] coef_g; 31 | vector[k] coef_m; 32 | //vector[k] coef_p; 33 | real phi; 34 | vector[3] alpha; 35 | } 36 | transformed parameters { 37 | vector[n] psi; 38 | vector[n] gamma; 39 | vector[n] mu; 40 | //vector[n] phi; 41 | 42 | psi = alpha[1] + x*coef_a; 43 | gamma = alpha[2] + x*coef_g; 44 | mu = inv_logit(alpha[3] + x*coef_m); 45 | //phi = exp(alpha[4] + x*coef_p); 46 | 47 | } 48 | model { 49 | coef_a ~ normal(0, 3); 50 | coef_g ~ normal(0, 3); 51 | coef_m ~ normal(0, 3); 52 | //coef_p ~ normal(0, 3); 53 | phi ~ exponential(1); 54 | alpha ~ normal(0,3); 55 | 56 | is_discrete ~ bernoulli_logit(psi); 57 | for (i in 1:n) { 58 
58 |     if (is_discrete[i] == 1) {
59 |       y_discrete[i] ~ bernoulli_logit(gamma[i]); // second stage: 0 vs. 1
60 |     } else {
61 |       y[i] ~ beta_proportion(mu[i], phi); // continuous responses in (0,1)
62 |     }
63 |   }
64 | 
65 | }
66 | generated quantities {
67 |   vector[run_gen==1 ? n: 0] zoib_log; // pointwise log-likelihood (used for loo)
68 |   vector[run_gen==1 ? n: 0] zoib_regen; // posterior predictive replicates
69 |   vector[run_gen==1 ? n: 0] is_discrete_regen; // replicated discrete/continuous indicator
70 | 
71 |   if(run_gen==1) {
72 |     for (i in 1:n) {
73 |       real psit = inv_logit(psi[i]);
74 |       real gammat = inv_logit(gamma[i]);
75 |       if (y[i] == 0) {
76 |         zoib_log[i] = log(psit) + log1m(gammat);
77 |       } else if (y[i] == 1) {
78 |         zoib_log[i] = log(psit) + log(gammat);
79 |       } else {
80 |         zoib_log[i] = log1m(psit) + beta_proportion_lpdf(y[i] | mu[i], phi);
81 |       }
82 | 
83 |       is_discrete_regen[i] = bernoulli_rng(psit);
84 | 
85 |       if(is_discrete_regen[i]==0) {
86 |         zoib_regen[i] = beta_proportion_rng(mu[i], phi);
87 |       } else {
88 |         zoib_regen[i] = bernoulli_rng(gammat);
89 |       }
90 |     }
91 | 
92 |   }
93 | }
94 | 
--------------------------------------------------------------------------------
/zoib_nophireg.stan:
--------------------------------------------------------------------------------
1 | data {
2 |   int n; // number of observations
3 |   int k; // number of columns
4 |   int s; // number of samples (if not estimating predictions on all outcomes)
5 |   int sample_all[s]; // indices of observations used in generated quantities
6 |   matrix[n,k] x;
7 |   vector[n] y;
8 |   int run_gen; // whether to use generated quantities
9 | }
10 | transformed data {
11 |   int is_discrete[n];
12 |   int y_discrete[n];
13 | 
14 |   // create indicator for whether y is discrete
15 |   // and an integer value to pass to bernoulli_lpmf for discrete y
16 |   for (i in 1:n) {
17 |     if (y[i] == 0) {
18 |       is_discrete[i] = 1;
19 |       y_discrete[i] = 0;
20 |     } else if (y[i] == 1) {
21 |       is_discrete[i] = 1;
22 |       y_discrete[i] = 1;
23 |     } else {
24 |       is_discrete[i] = 0;
25 |       // hack to ensure an error is thrown if this value is ever passed to bernoulli_lpmf
26 |       y_discrete[i] = -1;
27 |     }
28 |   }
29 | }
30 | parameters {
31 |   vector[k] coef_a;
32 |   vector[k] coef_g;
33 |   vector[k] coef_m;
34 |   //vector[k] coef_p;
35 |   real<lower=0> phi; // beta precision; must be positive for beta_proportion and the exponential prior
36 |   vector[3] alpha;
37 | }
38 | transformed parameters {
39 |   vector[n] psi;
40 |   vector[n] gamma;
41 |   vector[n] mu;
42 |   //vector[n] phi;
43 | 
44 |   psi = alpha[1] + x*coef_a; // logit-scale probability that y is discrete (0 or 1)
45 |   gamma = alpha[2] + x*coef_g; // logit-scale probability that a discrete y equals 1
46 |   mu = inv_logit(alpha[3] + x*coef_m); // mean of the beta distribution for continuous y
47 |   //phi = exp(alpha[4] + x*coef_p);
48 | 
49 | }
50 | model {
51 |   coef_a ~ normal(0, 5);
52 |   coef_g ~ normal(0, 5);
53 |   coef_m ~ normal(0, 5);
54 |   //coef_p ~ normal(0, 3);
55 |   phi ~ exponential(.1);
56 |   alpha ~ normal(0,5);
57 | 
58 |   is_discrete ~ bernoulli_logit(psi); // first stage: discrete vs. continuous
59 |   for (i in 1:n) {
60 |     if (is_discrete[i] == 1) {
61 |       y_discrete[i] ~ bernoulli_logit(gamma[i]); // second stage: 0 vs. 1
62 |     } else {
63 |       y[i] ~ beta_proportion(mu[i], phi); // continuous responses in (0,1)
64 |     }
65 |   }
66 | 
67 | }
68 | generated quantities {
69 |   vector[run_gen==1 ? s: 0] zoib_log; // pointwise log-likelihood (used for loo)
70 |   vector[run_gen==1 ? s: 0] zoib_regen; // posterior predictive replicates
71 |   vector[run_gen==1 ? s: 0] zoib_epred; // posterior expectation of y
72 |   vector[run_gen==1 ? s: 0] is_discrete_regen; // replicated discrete/continuous indicator
73 | 
74 |   if(run_gen==1) {
75 |     for (i in 1:s) {
76 | 
77 |       real psit = inv_logit(psi[sample_all[i]]);
78 |       real gammat = inv_logit(gamma[sample_all[i]]);
79 | 
80 |       zoib_epred[i] = psit * gammat + (1-psit) * mu[sample_all[i]]; // E[y] = P(discrete)*P(y=1|discrete) + P(continuous)*mu
81 | 
82 |       if (y[sample_all[i]] == 0) {
83 |         zoib_log[i] = log(psit) + log1m(gammat);
84 |       } else if (y[sample_all[i]] == 1) {
85 |         zoib_log[i] = log(psit) + log(gammat);
86 |       } else {
87 |         zoib_log[i] = log1m(psit) + beta_proportion_lpdf(y[sample_all[i]] | mu[sample_all[i]], phi);
88 |       }
89 | 
90 | 
91 | 
92 |       is_discrete_regen[i] = bernoulli_rng(psit);
93 | 
94 |       if(is_discrete_regen[i]==0) {
95 |         zoib_regen[i] = beta_proportion_rng(mu[sample_all[i]], phi);
96 |       } else {
97 |         zoib_regen[i] = bernoulli_rng(gammat);
98 |       }
99 |     }
100 | 
101 |   }
102 | }
103 | 
--------------------------------------------------------------------------------
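A note on the ZOIB parameterization above: writing p = inv_logit(psi) and q = inv_logit(gamma), the generated quantities encode the likelihood P(y=0) = p(1-q), P(y=1) = pq, and (1-p) * BetaProportion(y | mu, phi) for 0 < y < 1, so `zoib_epred` in `zoib_nophireg.stan` is the implied posterior mean pq + (1-p)mu. The R sketch below is not part of the replication files; it shows one way these Stan models can be fit directly with `cmdstanr`. The data-list names match the `data` block of `zoib.stan` exactly (n, k, x, y, run_gen), but the simulated `x` and `y` are purely illustrative.

# Hedged sketch, assuming cmdstanr and CmdStan are installed (see install.R);
# the simulated data below are illustrative, not from the paper.
library(cmdstanr)

n <- 500
k <- 2
x <- matrix(rnorm(n * k), n, k)

# outcome in [0,1] with some exact zeroes and ones, as the ZOIB model expects
y <- runif(n)
y[sample(n, 25)] <- 0
y[sample(n, 25)] <- 1

# compile the model and sample; run_gen = 1 turns on the generated quantities
mod <- cmdstan_model("zoib.stan")
fit <- mod$sample(
  data = list(n = n, k = k, x = x, y = y, run_gen = 1L),
  chains = 4, parallel_chains = 4
)

fit$summary(variables = c("alpha", "coef_m", "phi"))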