├── .gitattributes ├── .gitignore ├── BibTexDatabase.bib ├── README.Rmd ├── README.md ├── beta_logit.stan ├── beta_logit_infl.stan ├── beta_logit_infl_simple.stan ├── beta_logit_infl_v2.stan ├── beta_logit_infl_v3.stan ├── beta_logit_infl_v4.stan ├── beta_logit_phireg.stan ├── dag_example.R ├── data ├── W28_Aug17 │ ├── ATP W28 methodology.doc │ ├── ATP W28 que.docx │ ├── ATP W28 readme.txt │ ├── ATP W28 topline.docx │ ├── ATP W28.sav │ └── ~$P W28 que.docx ├── all_vars_Beta_trans.rds ├── all_vars_frac.rds ├── all_vars_ord.rds ├── all_vars_zoib.rds ├── compare_prob.rds ├── sample_all.rds ├── sim_cont_X.RData ├── sim_cont_X.rds ├── sim_cont_X_fixed.RData ├── sim_cont_X_fixed.rds └── suffrage_paper_replicationfiles │ ├── EER-D-13-00718R2_Event_extended.do │ ├── EER-D-13-00718R2_Event_main.do │ ├── EER-D-13-00718R2_Event_study_extended_sample_reform.dta │ ├── EER-D-13-00718R2_Event_study_extended_sample_reversals.dta │ ├── EER-D-13-00718R2_Event_study_main_sample_reform.dta │ ├── EER-D-13-00718R2_Event_study_main_sample_reversals.dta │ ├── EER-D-13-00718R2_READ_ME.pdf │ ├── EER-D-13-00718R2_maindata_suffrage.dta │ ├── EER-D-13-00718R2_reversals_suffrage.dta │ └── EER-D-13-00718R2_suffrage.do ├── define_ord_betareg.R ├── estimate_with_brms.Rmd ├── estimate_with_brms.html ├── figures ├── figure_1.pdf ├── figure_2.pdf ├── figure_3.pdf ├── figure_4.pdf ├── figure_5.pdf ├── figure_6.pdf └── figure_7.pdf ├── frac_logit.stan ├── helper_func.R ├── install.R ├── kubinec_ord_betareg.html ├── kubinec_ord_betareg_accepted.Rmd ├── kubinec_ord_betareg_accepted_dataverse.Rmd ├── kubinec_ord_betareg_appendix.Rmd ├── kubinec_ord_betareg_appendix_anon.Rmd ├── limited_dv_modeling.Rmd ├── master.R ├── old_code.R ├── ordbetareg.Rproj ├── ordered_beta_reg.R ├── ordered_beta_reg_sim.R ├── ordered_beta_reg_sim_fixed.R ├── preamble.tex ├── zoib.stan └── zoib_nophireg.stan /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-vendored 2 | preamble.tex linguist-vendored 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | author: "Robert Kubinec" 4 | date: "March 1st, 2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | This repository contains data and code for the paper, "Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Survey Sliders and Visual Analog Scales." The repository includes the following files: 13 | 14 | - `kubinec_ord_betareg.Rmd` A reproducible Rmarkdown document that can be run in Rstudio to re-produce the results. Note that the `data` folder in this repository contains necessary data to reproduce results. 15 | - `estimate_with_brms.Rmd` and `estimate_with_brms.html` These files show how to run ordered beta regression models using the R package [`brms`](https://cran.r-project.org/web/packages/brms/vignettes/brms_overview.pdf). 16 | - `define_ord_betareg.R` This R script contains all the auxiliary code needed to fit the model with R package `brms` (see vignette above for more info). 
17 | - `ordered_beta_reg_sim.R` This R script will run a simulation comparing the ordered beta regression model to alternatives, including the zero-one-inflated Beta regression model (ZOIB). The output of a 10,000-iteration run of this simulation is in `data/`. 18 | - `beta_logit.stan` This file contains the Stan code used to fit an ordered beta regression model in Stan. 19 | - `zoib.stan` This file contains Stan code used to fit the zero-one-inflated beta regression model (ZOIB). 20 | - `beta_logit_phireg.stan` This file contains Stan code to fit an ordered beta regression model with additional predictors for phi, the scale parameter in the distribution. These additional parameters allow for estimating the effect of covariates on whether respondents give clustered or more dispersed (extreme) responses. 21 | - `beta_logit_infl*.stan` These additional Stan files are various ways of parameterizing the midpoint of the scale when the midpoint is considered missing data. None of them appears to predict the outcome better than versions that treat the midpoint as observed data. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | README 2 | ================ 3 | Robert Kubinec 4 | January 11th, 2022 5 | 6 | **Please note: This is the repository for the paper files. To access the R package `ordbetareg`, please go to this repository: www.github.com/saudiwin/ordbetareg_pack.** 7 | 8 | This repository contains data and code for the paper, “Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Continuous Data with Lower and Upper Bounds”, which is now forthcoming at the journal *Political Analysis*. An ungated preprint can be found here: https://osf.io/preprints/socarxiv/2sx6y/. Replication files can be found on both Dataverse and GitHub. 9 | 10 | To replicate the paper, please first run the `install.R` script to make sure all relevant packages are installed. The script will also install `cmdstanr` and a version of `cmdstan`, the underlying MCMC sampling library from the Stan project. Installing `cmdstan` requires a working C++ toolchain; if you have any trouble or are unsure, see the `cmdstanr` package installation instructions: https://mc-stan.org/cmdstanr/articles/cmdstanr.html. 11 | 12 | The file `master.R` will then run all the necessary scripts to compile the paper and supplementary information (compilation requires a working LaTeX installation). Note that `master.R` by default loads the existing simulation data in the `data` folder. To fully reproduce the simulation, set the `run_sim` variable in `master.R` to `TRUE`. Note that running the full simulation can take up to a few days on a machine with ~40 cores. 
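In practice, replication comes down to two `source()` calls from the repository root. A minimal sketch of the workflow just described (assuming the working directory is set to the repository root):

``` r
# install all required packages, plus cmdstanr and cmdstan
source("install.R")

# master.R loads the saved simulation data in data/ by default;
# to re-run the full simulation (up to a few days on ~40 cores),
# first set run_sim <- TRUE inside master.R
source("master.R")
```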
13 | 14 | The version of R used for these results was 4.1.2, and the R package versions are as follows: 15 | 16 | - dplyr: 1.0.7 17 | - rstanarm: 2.21.1 18 | - tidyr: 1.1.4 19 | - lubridate: 1.8.0 20 | - loo: 2.4.1 21 | - kableExtra: 1.3.4 22 | - bayesplot: 1.8.1 23 | - patchwork: 1.1.1 24 | - stringr: 1.4.0 25 | - grDevices: 4.1.2 26 | - emojifont: 0.5.5 27 | - latex2exp: 0.5.0 28 | - haven: 2.4.3 29 | - ggplot2: 3.3.5 30 | - posterior: 1.2.0 31 | - brms: 2.16.3 32 | - remotes: 2.4.2 33 | - future.apply: 1.8.1 34 | - faux: 1.1.0 35 | - rmarkdown: 2.11 36 | - bookdown: 0.24 37 | - tinytex: 0.36 38 | - extrafont: 0.17 39 | 40 | The repository includes the following files: 41 | 42 | - `kubinec_ord_betareg_accepted.Rmd` The accepted version of the reproducible Rmarkdown document that can 43 | be run in RStudio to reproduce the results. Note that the `data` 44 | folder in this repository contains the data necessary to reproduce the 45 | results. 46 | - `estimate_with_brms.Rmd` and `estimate_with_brms.html` These files 47 | show how to run ordered beta regression models using the R package 48 | [`brms`](https://cran.r-project.org/web/packages/brms/vignettes/brms_overview.pdf). 49 | - `define_ord_betareg.R` This R script contains all the auxiliary code 50 | needed to fit the model with the R package `brms` (see the vignette above 51 | for more info). 52 | - `*_fit.rds` Fitted model object files to reproduce the paper results much faster. 53 | - `data/sim_cont_X*.RData` Simulation results to reproduce the paper results much faster. 54 | - `data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta` Data from Toke and Aidt (2012). 55 | - `ordered_beta_reg_sim.R` This R script will run a simulation 56 | comparing the ordered beta regression model to alternatives, 57 | including the zero-one-inflated Beta regression model (ZOIB). The 58 | output of a 10,000-iteration run of this simulation is saved in `data/` as `sim_cont_X.RData`. 59 | - `ordered_beta_reg_sim_fixed.R` This R script will run a simulation 60 | comparing the ordered beta regression model to alternatives, but with fixed rather than random draws 61 | of the relevant parameters (results are in the SI, not the main paper). The 62 | output of a 4,000-iteration run of this simulation is saved in `data/` as `sim_cont_X_fixed.RData`. 63 | - `beta_logit.stan` This file contains the Stan code used to fit an 64 | ordered beta regression model in Stan. 65 | - `zoib_nophireg.stan` This file contains Stan code used to fit the 66 | zero-one-inflated beta regression model (ZOIB). 67 | - `beta_logit_phireg.stan` This file contains Stan code to fit an 68 | ordered beta regression model with additional predictors for phi, 69 | the scale parameter in the distribution. These additional parameters 70 | allow for estimating the effect of covariates on whether respondents give 71 | clustered or more dispersed (extreme) responses. 72 | - `frac_logit.stan` This file contains a Stan parameterization of the 73 | fractional logit model. 74 | - `beta_logit_infl*.stan` These additional Stan files are various ways 75 | of parameterizing the midpoint of the scale when the midpoint is 76 | considered missing data. None of them appears to predict the 77 | outcome better than versions that treat the midpoint as 78 | observed data. 79 | - `BibTexDatabase.bib` References necessary to compile the paper. 
80 | - `preamble.tex` LaTeX packages for the paper 81 | -------------------------------------------------------------------------------- /beta_logit.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analyzing experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | 8 | functions { 9 | 10 | // prior from Michael Betancourt for ordered cutpoints 11 | real induced_dirichlet_lpdf(vector c, vector alpha, real phi) { 12 | int K = num_elements(c) + 1; 13 | vector[K - 1] sigma = inv_logit(phi - c); 14 | vector[K] p; 15 | matrix[K, K] J = rep_matrix(0, K, K); 16 | 17 | // Induced ordinal probabilities 18 | p[1] = 1 - sigma[1]; 19 | for (k in 2:(K - 1)) 20 | p[k] = sigma[k - 1] - sigma[k]; 21 | p[K] = sigma[K - 1]; 22 | 23 | // Baseline column of Jacobian 24 | for (k in 1:K) J[k, 1] = 1; 25 | 26 | // Diagonal entries of Jacobian 27 | for (k in 2:K) { 28 | real rho = sigma[k - 1] * (1 - sigma[k - 1]); 29 | J[k, k] = - rho; 30 | J[k - 1, k] = rho; 31 | } 32 | 33 | return dirichlet_lpdf(p | alpha) 34 | + log_determinant(J); 35 | } 36 | } 37 | data { 38 | int N_prop; // number of proportion observations (0,1) 39 | int N_degen; // number of 0/1 observations 40 | int X; // number of predictors 41 | vector[N_prop] outcome_prop; // Y in (0,1) 42 | int outcome_degen[N_degen]; // Y in {0,1} 43 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 44 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 45 | int N_pred_degen; // number of posterior predictive samples for 0/1 46 | int N_pred_prop; // number of posterior predictive samples for (0,1) 47 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 48 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 49 | int run_gen; // whether to use generated quantities 50 | } 51 | parameters { 52 | vector[X] X_beta; // common predictor 53 | real alpha; // common intercept 54 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 55 | real<lower=0> kappa; // scale parameter for beta regression (positive; lower bound restored, as the exponential prior implies) 56 | } 57 | transformed parameters { 58 | // store matrix calculations 59 | 60 | vector[N_degen] calc_degen; 61 | vector[N_prop] calc_prop; 62 | 63 | // drop the intercepts so everything is relative to the cutpoints 64 | if(N_degen>0) { 65 | calc_degen = alpha + covar_degen*X_beta; 66 | } 67 | 68 | //print(calc_degen[1:10]); 69 | 70 | calc_prop = alpha + covar_prop*X_beta; 71 | 72 | //print(calc_prop[1:10]); 73 | 74 | } 75 | model { 76 | 77 | // vague priors 78 | X_beta ~ normal(0,5); 79 | alpha ~ normal(0,5); 80 | kappa ~ exponential(.1); 81 | //cutpoints[2] - cutpoints[1] ~ normal(0,3); 82 | // induced dirichlet prior on cutpoints: 83 | 84 | target += induced_dirichlet_lpdf(cutpoints | rep_vector(1, 3), 0); 85 | 86 | // need separate counters for logit (0/1) and beta regression 87 | if(N_degen>0) { 88 | for(n in 1:N_degen) { 89 | if(outcome_degen[n]==0) { 90 | // Pr(Y==0) 91 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 92 | } else { 93 | //Pr(Y==1) 94 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 95 | } 96 | } 97 | } 98 | 99 | 100 | for(n in 1:N_prop) { 101 | // Pr(Y in (0,1)) 102 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - 
inv_logit(calc_prop[n] - cutpoints[2])); 103 | // Pr(Y==x where x in (0,1)) 104 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),kappa); 105 | } 106 | 107 | } 108 | 109 | generated quantities { 110 | 111 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 112 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 113 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_epred; // expected value of the outcome (sum over all_pr below) 114 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 115 | 116 | if(run_gen==1) { 117 | 118 | row_vector[2] all_pr[N_pred_degen+N_pred_prop]; 119 | 120 | if(N_pred_degen>0) { 121 | // first do degenerate outcomes 122 | // note: these could be *re-generated* as beta/proportions 123 | for(i in 1:num_elements(indices_degen)) { 124 | 125 | // draw an outcome 0 / prop / 1 126 | regen_degen[i] = ordered_logistic_rng(alpha + covar_degen[indices_degen[i],]*X_beta,cutpoints); 127 | 128 | if(outcome_degen[indices_degen[i]]==0) { 129 | ord_log[i] = log1m_inv_logit(calc_degen[indices_degen[i]] - cutpoints[1]); 130 | } else { 131 | ord_log[i] = log_inv_logit(calc_degen[indices_degen[i]] - cutpoints[2]); 132 | } 133 | 134 | // don't need zero 135 | 136 | all_pr[i] = [log(inv_logit(calc_degen[indices_degen[i]] - cutpoints[1]) - inv_logit(calc_degen[indices_degen[i]] - cutpoints[2])) + log_inv_logit(calc_degen[indices_degen[i]]), 137 | log_inv_logit(calc_degen[indices_degen[i]] - cutpoints[2])]; 138 | 139 | if(regen_degen[i]==1) { 140 | regen_all[i] = 0; 141 | } else if(regen_degen[i]==3) { 142 | regen_all[i] = 1; 143 | } else { 144 | // did not occur in original data but could re-occur probabilistically 145 | regen_all[i] = beta_proportion_rng(inv_logit(alpha + covar_degen[indices_degen[i],]*X_beta),kappa); 146 | } 147 | } 148 | } 149 | if(N_pred_prop>0) { 150 | // now do originally proportional outcomes 151 | // can be re-generated as 0s or 1s 152 | 153 | int skip = num_elements(indices_degen); 154 | 155 | for(i in 1:num_elements(indices_prop)) { 156 | 157 | all_pr[i+skip] = [log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + log_inv_logit(calc_prop[indices_prop[i]]), 158 | log_inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])]; 159 | 160 | // draw an outcome 0 / prop / 1 161 | regen_degen[i+skip] = ordered_logistic_rng(alpha + covar_prop[indices_prop[i],]*X_beta,cutpoints); 162 | 163 | ord_log[i+skip] = log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + 164 | beta_proportion_lpdf(outcome_prop[indices_prop[i]]|inv_logit(calc_prop[indices_prop[i]]),kappa); 165 | 166 | if(regen_degen[i+skip]==1) { 167 | regen_all[i+skip] = 0; 168 | } else if(regen_degen[i+skip]==3) { 169 | regen_all[i+skip] = 1; 170 | } else { 171 | // did not occur in original data but could re-occur probabilistically 172 | regen_all[i+skip] = beta_proportion_rng(inv_logit(alpha + covar_prop[indices_prop[i],]*X_beta),kappa); 173 | } 174 | 175 | } 176 | } 177 | 178 | for(i in 1:(N_pred_degen + N_pred_prop)) { 179 | 180 | regen_epred[i] = sum(exp(all_pr[i])); 181 | 182 | } 183 | } 184 | 185 | } 186 | 187 | -------------------------------------------------------------------------------- /beta_logit_infl.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analyzing experimental outcomes 3 | // 
with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[3] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? 
N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 68 | 69 | // need separate counters for logit (0/1) and beta regression 70 | 71 | for(n in 1:N_degen) { 72 | 73 | if(outcome_degen[n]==0) { 74 | // Pr(Y==0) 75 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 76 | } else { 77 | //Pr(Y==1) 78 | target += log_inv_logit(calc_degen[n] - cutpoints[2]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 79 | } 80 | } 81 | if(infl_value<0) { 82 | for(n in 1:N_prop) { 83 | // Pr(Y in (0,1)) 84 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 85 | // Pr(Y==x where x in (0,1)) 86 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 87 | } 88 | } else { 89 | for(n in 1:N_prop) { 90 | 91 | // Pr(Y in (0,1)) 92 | 93 | real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 94 | 95 | 96 | // inflate the outcome 97 | if(infl_this[n]==1) { 98 | //target += bernoulli_logit_lpmf(1|calc_miss[n]); 99 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 100 | bernoulli_logit_lpmf(0|calc_miss[n]) + 101 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 102 | } else { 103 | 104 | target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 105 | target += pry01; 106 | // Pr(Y==x where x in (0,1)) 107 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 108 | } 109 | 110 | } 111 | } 112 | 113 | 114 | } 115 | 116 | generated quantities { 117 | 118 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 119 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 120 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 121 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 122 | 123 | 124 | if(run_gen==1) { 125 | 126 | if(N_pred_degen>0) { 127 | 128 | // first do degenerate outcomes 129 | // note: these could be *re-generated* as beta/propotions 130 | for(i in 1:num_elements(indices_degen)) { 131 | 132 | // draw an outcome 0 / prop / 1 133 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 134 | 135 | if(outcome_degen[i]==0) { 136 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 137 | } else { 138 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 139 | } 140 | 141 | 142 | if(regen_degen[i]==1) { 143 | regen_all[i] = 0; 144 | } else if(regen_degen[i]==3) { 145 | regen_all[i] = 1; 146 | } else { 147 | 148 | if(!(infl_value<0)) { 149 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 150 | 151 | if(infl_gen[i]==1) { 152 | regen_all[i] = infl_value; 153 | } else { 154 | 155 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 156 | 157 | } 158 | } else { 159 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 160 | } 161 | } 162 | 163 | } 164 | 165 | if(N_pred_prop>0) { 166 | // now do originally proportional outcomes 167 | // can be re-generated as 0s or 1s 168 | 169 | int skip = num_elements(indices_degen); 170 | 171 | for(i in 1:num_elements(indices_prop)) { 172 | 173 | // draw an outcome 0 / prop / 1 174 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 175 | 176 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 177 | 178 | if(!(infl_value<0)) { 179 | if(infl_this[i]==1) { 180 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 181 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 182 | bernoulli_logit_lpmf(0|calc_miss[i]) + 183 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 184 | 185 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 186 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 187 | } else { 188 | 189 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 190 | // Pr(Y==x where x in (0,1)) 191 | 192 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 193 | 194 | } 195 | 196 | 197 | } else { 198 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 199 | } 200 | 201 | 202 | if(regen_degen[i+skip]==1) { 203 | regen_all[i+skip] = 0; 204 | } else if(regen_degen[i+skip]==3) { 205 | regen_all[i+skip] = 1; 206 | } else { 207 | // did not occur in original data but could re-occur probabilistically 208 | // check for inflation first 209 | 210 | if(!(infl_value<0)) { 211 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 212 | 213 | if(infl_gen[i]==1) { 214 | regen_all[i + skip] = infl_value; 215 | } else { 216 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 217 | } 218 | } else { 219 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 220 | } 221 | 222 | } 223 | 224 | } 225 | } 226 | } 227 | } 228 | 229 | } 230 | 231 | -------------------------------------------------------------------------------- /beta_logit_infl_simple.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 
0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | //int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | //matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | //matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | int miss_id[N_prop]; // index for imputed outcome 28 | int miss_counter = 1; 29 | 30 | for(i in 1:N_prop) { 31 | if(outcome_prop[i]==infl_value) { 32 | infl_this[i] = 1; 33 | miss_id[i] = miss_counter; 34 | miss_counter += 1; 35 | } else { 36 | infl_this[i] = 0; 37 | miss_id[i] = miss_counter; 38 | } 39 | } 40 | } 41 | parameters { 42 | vector[X] X_beta; // common predictor 43 | vector[miss_counter] prop_miss; 44 | //vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 45 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 46 | real kappa; // scale parameter for beta regression 47 | real theta; // probability of an observation being missing versus non-missing 48 | } 49 | transformed parameters { 50 | // store matrix calculations 51 | 52 | vector[N_degen] calc_degen; 53 | vector[N_prop] calc_prop; 54 | //vector[!(infl_value<0) ? N_prop : 0] calc_miss; 55 | //vector[!(infl_value<0) ? 
N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 56 | 57 | // drop the intercepts so everything is relative to the cutpoints 58 | calc_degen = covar_degen*X_beta; 59 | calc_prop = covar_prop*X_beta; 60 | 61 | 62 | if(!(infl_value<0)) { 63 | // calc_miss = covar_prop_infl*X_beta_miss; 64 | // calc_degen_miss = covar_degen_infl*X_beta_miss; 65 | } 66 | 67 | } 68 | model { 69 | 70 | // vague priors 71 | X_beta ~ normal(0,5); 72 | //X_beta_miss ~ normal(0,5); 73 | theta ~ normal(0,2); // tight prior around the number of possible missings 74 | kappa ~ exponential(1); 75 | prop_miss ~ normal(0,1); // put the majority of prior miss around 0.5 76 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 77 | // for(c in 1:3) 78 | // cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 79 | 80 | // need separate counters for logit (0/1) and beta regression 81 | 82 | for(n in 1:N_degen) { 83 | 84 | if(outcome_degen[n]==0) { 85 | // Pr(Y==0) 86 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 87 | } else { 88 | //Pr(Y==1) 89 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 90 | } 91 | } 92 | 93 | //target += beta_proportion_lpdf(outcome_prop|inv_logit(calc_prop),kappa); 94 | 95 | for(n in 1:N_prop) { 96 | // - beta_proportion_lcdf(0.49|inv_logit(calc_prop[n]),kappa) 97 | //- beta_proportion_lccdf(0.51|inv_logit(calc_prop[n]),kappa) 98 | if(infl_this[n]==infl_value) { 99 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])) + 100 | log_mix(theta, 101 | beta_proportion_lpdf(prop_miss[miss_id[n]]|inv_logit(calc_prop[n]),kappa), 102 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 103 | } else { 104 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 105 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),kappa); 106 | } 107 | 108 | } 109 | 110 | 111 | } 112 | 113 | generated quantities { 114 | 115 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 116 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 117 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 118 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_prop]; // whether observation belongs to inflated value or not 119 | 120 | 121 | if(run_gen==1) { 122 | 123 | if(N_pred_degen>0) { 124 | 125 | // first do degenerate outcomes 126 | // note: these could be *re-generated* as beta/propotions 127 | for(i in 1:num_elements(indices_degen)) { 128 | 129 | // draw an outcome 0 / prop / 1 130 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 131 | 132 | if(outcome_degen[i]==1) { 133 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 134 | } else { 135 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 136 | } 137 | 138 | if(regen_degen[i]==1) { 139 | regen_all[i] = 0; 140 | } else if(regen_degen[i]==3) { 141 | regen_all[i] = 1; 142 | } else { 143 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 144 | } 145 | 146 | } 147 | } 148 | 149 | if(N_pred_prop>0) { 150 | // now do originally proportional outcomes 151 | // can be re-generated as 0s or 1s 152 | 153 | int skip = num_elements(indices_degen); 154 | 155 | for(i in 1:num_elements(indices_prop)) { 156 | 157 | // draw an outcome 0 / prop / 1 158 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 159 | infl_gen[i] = bernoulli_rng(theta); 160 | 161 | //ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 162 | 163 | if(infl_this[i]==0) { 164 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 165 | ord_log[i+skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 166 | } else { 167 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 168 | ord_log[i+skip] += log_mix(theta, 169 | beta_proportion_lpdf(prop_miss[miss_id[i]]|inv_logit(calc_prop[i]),kappa), 170 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 171 | } 172 | 173 | if(regen_degen[i+skip]==1) { 174 | regen_all[i+skip] = 0; 175 | } else if(regen_degen[i+skip]==3) { 176 | regen_all[i+skip] = 1; 177 | } else { 178 | // did not occur in original data but could re-occur probabilistically 179 | // check for inflation first 180 | if(infl_this[i]==infl_value) { 181 | if(infl_gen[i]==1) { 182 | regen_all[i+skip] = prop_miss[miss_id[i]]; 183 | } else { 184 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 185 | } 186 | } else { 187 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 188 | } 189 | 190 | 191 | } 192 | } 193 | 194 | 195 | } 196 | 197 | } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /beta_logit_infl_v2.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. 
If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[4] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | for(c in 1:3) 68 | cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 69 | 70 | // need separate counters for logit (0/1) and beta regression 71 | 72 | for(n in 1:N_degen) { 73 | 74 | if(outcome_degen[n]==0) { 75 | // Pr(Y==0) 76 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 77 | } else { 78 | //Pr(Y==1) 79 | target += log_inv_logit(calc_degen[n] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 80 | } 81 | } 82 | 83 | //target += beta_proportion_lpdf(outcome_prop|inv_logit(calc_prop),kappa); 84 | 85 | for(n in 1:N_prop) { 86 | // - beta_proportion_lcdf(0.49|inv_logit(calc_prop[n]),kappa) 87 | //- beta_proportion_lccdf(0.51|inv_logit(calc_prop[n]),kappa) 88 | if(outcome_prop[n]<0.5) { 89 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 90 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + bernoulli_logit_lpmf(0|calc_miss[n]); 91 | } else if(outcome_prop[n]==0.5) { 92 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 93 | bernoulli_logit_lpmf(0|calc_miss[n]) + log(inv_logit(calc_prop[n] - cutpoints[2]) - inv_logit(calc_prop[n] - cutpoints[3])) + 94 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 95 | } else if(outcome_prop[n]>0.5) { 96 | target += log(inv_logit(calc_prop[n] - cutpoints[3]) - inv_logit(calc_prop[n] - cutpoints[4])); 97 | target += 
beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + bernoulli_logit_lpmf(0|calc_miss[n]); 98 | } 99 | // Pr(Y in (0,1)) 100 | 101 | // Pr(Y==x where x in (0,1)) 102 | 103 | } 104 | // } else { 105 | // for(n in 1:N_prop) { 106 | // 107 | // // Pr(Y in (0,1)) 108 | // 109 | // if() 110 | // 111 | // real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 112 | // 113 | // 114 | // // inflate the outcome 115 | // if(infl_this[n]==1) { 116 | // //target += bernoulli_logit_lpmf(1|calc_miss[n]); 117 | // target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 118 | // bernoulli_logit_lpmf(0|calc_miss[n]) + 119 | // beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 120 | // } else { 121 | // 122 | // target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 123 | // target += pry01; 124 | // // Pr(Y==x where x in (0,1)) 125 | // target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 126 | // } 127 | // 128 | // } 129 | // } 130 | 131 | 132 | } 133 | 134 | generated quantities { 135 | 136 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 137 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 138 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 139 | int infl_gen[(run_gen==0 || infl_value<0) ? 0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 140 | 141 | 142 | if(run_gen==1) { 143 | 144 | if(N_pred_degen>0) { 145 | 146 | // first do degenerate outcomes 147 | // note: these could be *re-generated* as beta/propotions 148 | for(i in 1:num_elements(indices_degen)) { 149 | 150 | // draw an outcome 0 / prop / 1 151 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 152 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 153 | 154 | if(outcome_degen[i]==0) { 155 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[i]); 156 | } else { 157 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[i]); 158 | } 159 | 160 | if(infl_gen[i]==1) { 161 | regen_all[i] = 0.5; 162 | } else { 163 | if(regen_degen[i]==1) { 164 | regen_all[i] = 0; 165 | } else if(regen_degen[i]==5) { 166 | regen_all[i] = 1; 167 | } else { 168 | 169 | if(regen_degen[i]==2||regen_degen[i]==4) { 170 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 171 | } else { 172 | regen_all[i] = infl_value; 173 | } 174 | } 175 | } 176 | 177 | } 178 | 179 | if(N_pred_prop>0) { 180 | // now do originally proportional outcomes 181 | // can be re-generated as 0s or 1s 182 | 183 | int skip = num_elements(indices_degen); 184 | 185 | for(i in 1:num_elements(indices_prop)) { 186 | 187 | // draw an outcome 0 / prop / 1 188 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 189 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 190 | 191 | //ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 192 | 193 | if(outcome_prop[i]<0.5) { 194 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 195 | ord_log[i+skip] = beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa) + bernoulli_logit_lpmf(0|calc_miss[i]); 196 | } else if(outcome_prop[i]==0.5) { 197 | 
ord_log[i+skip] = log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 198 | bernoulli_logit_lpmf(0|calc_miss[i]) + log(inv_logit(calc_prop[i] - cutpoints[2]) - inv_logit(calc_prop[i] - cutpoints[3])) + 199 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 200 | } else if(outcome_prop[i]>0.5) { 201 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[3]) - inv_logit(calc_prop[i] - cutpoints[4])); 202 | ord_log[i+skip] = beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa) + bernoulli_logit_lpmf(0|calc_miss[i]); 203 | } 204 | 205 | if(infl_gen[i+skip] == 1) { 206 | regen_all[i+skip] = infl_value; 207 | } else { 208 | if(regen_degen[i+skip]==1) { 209 | regen_all[i+skip] = 0; 210 | } else if(regen_degen[i+skip]==5) { 211 | regen_all[i+skip] = 1; 212 | } else { 213 | // did not occur in original data but could re-occur probabilistically 214 | // check for inflation first 215 | 216 | if(regen_degen[i+skip]==1) { 217 | regen_all[i+skip] = 0; 218 | } else if(regen_degen[i+skip]==5) { 219 | regen_all[i+skip] = 1; 220 | } else { 221 | 222 | if(regen_degen[i+skip]==2||regen_degen[i+skip]==4) { 223 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 224 | } else { 225 | regen_all[i+skip] = infl_value; 226 | } 227 | } 228 | 229 | } 230 | } 231 | 232 | 233 | } 234 | } 235 | } 236 | } 237 | 238 | } 239 | 240 | -------------------------------------------------------------------------------- /beta_logit_infl_v3.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? 
X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[4] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? N_pred_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | for(c in 1:3) 68 | cutpoints[c+1] - cutpoints[c] ~ normal(0,3); 69 | 70 | // need separate counters for logit (0/1) and beta regression 71 | 72 | for(n in 1:N_degen) { 73 | 74 | if(outcome_degen[n]==0) { 75 | // Pr(Y==0) 76 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 77 | } else { 78 | //Pr(Y==1) 79 | target += log_inv_logit(calc_degen[n] - cutpoints[4]) + bernoulli_logit_lpmf(0|calc_degen_miss[n]); 80 | } 81 | } 82 | for(n in 1:N_prop) { 83 | 84 | if(outcome_prop[n]<0.5) { 85 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 86 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + 87 | bernoulli_logit_lpmf(0|calc_miss[n]); 88 | } else if(outcome_prop[n]==0.5) { 89 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 90 | bernoulli_logit_lpmf(0|calc_miss[n]) + 91 | log(inv_logit(calc_prop[n] - cutpoints[2]) - inv_logit(calc_prop[n] - cutpoints[3])) + 92 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 93 | } else if(outcome_prop[n]>0.5) { 94 | target += log(inv_logit(calc_prop[n] - cutpoints[3]) - inv_logit(calc_prop[n] - cutpoints[4])); 95 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa) + 96 | bernoulli_logit_lpmf(0|calc_miss[n]); 97 | } 98 | // Pr(Y in (0,1)) 99 | 100 | // Pr(Y==x where x in (0,1)) 101 | 102 | } 103 | // } else { 104 | // for(n in 1:N_prop) { 105 | // 106 | // // Pr(Y in (0,1)) 107 | // 108 | // if() 109 | // 110 | // real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 111 | // 112 | // 113 | // // inflate the outcome 114 | // if(infl_this[n]==1) { 115 | // //target += bernoulli_logit_lpmf(1|calc_miss[n]); 116 | // target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 117 | // bernoulli_logit_lpmf(0|calc_miss[n]) + 118 | // beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 119 | // } else { 120 | // 121 | // target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 122 | // target += pry01; 123 | // // Pr(Y==x where x in (0,1)) 124 | // target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 125 | // } 126 | // 127 | // } 128 | // } 129 | 130 | 131 | } 132 | 133 | generated quantities { 134 | 135 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 136 | vector[run_gen==0 ? 
0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 137 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 138 | int infl_gen[(run_gen==0 || infl_value<0) ? 0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 139 | 140 | 141 | if(run_gen==1) { 142 | 143 | if(N_pred_degen>0) { 144 | 145 | // first do degenerate outcomes 146 | // note: these could be *re-generated* as beta/propotions 147 | for(i in 1:num_elements(indices_degen)) { 148 | 149 | // draw an outcome 0 / prop / 1 150 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 151 | 152 | if(outcome_degen[i]==0) { 153 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 154 | } else { 155 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 156 | } 157 | 158 | 159 | if(regen_degen[i]==1) { 160 | regen_all[i] = 0; 161 | } else if(regen_degen[i]==3) { 162 | regen_all[i] = 1; 163 | } else { 164 | 165 | if(!(infl_value<0)) { 166 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 167 | 168 | if(infl_gen[i]==1) { 169 | regen_all[i] = infl_value; 170 | } else { 171 | 172 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 173 | 174 | } 175 | } else { 176 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 177 | } 178 | } 179 | 180 | } 181 | 182 | if(N_pred_prop>0) { 183 | // now do originally proportional outcomes 184 | // can be re-generated as 0s or 1s 185 | 186 | int skip = num_elements(indices_degen); 187 | 188 | for(i in 1:num_elements(indices_prop)) { 189 | 190 | // draw an outcome 0 / prop / 1 191 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 192 | 193 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 194 | 195 | if(!(infl_value<0)) { 196 | if(infl_this[i]==1) { 197 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 198 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 199 | bernoulli_logit_lpmf(0|calc_miss[i]) + 200 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 201 | 202 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 203 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 204 | } else { 205 | 206 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 207 | // Pr(Y==x where x in (0,1)) 208 | 209 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 210 | 211 | } 212 | 213 | 214 | } else { 215 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 216 | } 217 | 218 | 219 | if(regen_degen[i+skip]==1) { 220 | regen_all[i+skip] = 0; 221 | } else if(regen_degen[i+skip]==3) { 222 | regen_all[i+skip] = 1; 223 | } else { 224 | // did not occur in original data but could re-occur probabilistically 225 | // check for inflation first 226 | 227 | if(!(infl_value<0)) { 228 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 229 | 230 | if(infl_gen[i]==1) { 231 | regen_all[i + skip] = infl_value; 232 | } else { 233 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 234 | } 235 | } else { 236 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 237 | } 238 | 239 | } 240 | 241 | } 242 | } 243 | } 244 | } 245 | 246 | } 247 | 248 | -------------------------------------------------------------------------------- 
/beta_logit_infl_v4.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_miss; // number of predictors for inflated model 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | real infl_value; // set to value between 0 and 1. If negative, inflation is not used 14 | int outcome_degen[N_degen]; // Y in {0,1} 15 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 16 | matrix[N_prop,X_miss] covar_prop_infl; // covariate X for inflated values 17 | matrix[N_degen,X_miss] covar_degen_infl; // covariate X for inflated values 18 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 19 | int N_pred_degen; // number of posterior predictive samples for 0/1 20 | int N_pred_prop; // number of posterior predictive samples for (0,1) 21 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 22 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 23 | int run_gen; // whether to use generated quantities 24 | } 25 | transformed data { 26 | int infl_this[N_prop]; 27 | 28 | for(i in 1:N_prop) { 29 | if(outcome_prop[i]==infl_value) { 30 | infl_this[i] = 1; 31 | } else { 32 | infl_this[i] = 0; 33 | } 34 | } 35 | } 36 | parameters { 37 | vector[X] X_beta; // common predictor 38 | vector[!(infl_value<0) ? X_miss : 0] X_beta_miss; // predictor for inflated values 39 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 40 | real kappa; // scale parameter for beta regression 41 | } 42 | transformed parameters { 43 | // store matrix calculations 44 | 45 | vector[N_degen] calc_degen; 46 | vector[N_prop] calc_prop; 47 | vector[!(infl_value<0) ? N_prop : 0] calc_miss; 48 | vector[!(infl_value<0) ? 
N_degen : 0] calc_degen_miss; // must be defined over both distributionss 49 | 50 | // drop the intercepts so everything is relative to the cutpoints 51 | calc_degen = covar_degen*X_beta; 52 | calc_prop = covar_prop*X_beta; 53 | 54 | 55 | if(!(infl_value<0)) { 56 | calc_miss = covar_prop_infl*X_beta_miss; 57 | calc_degen_miss = covar_degen_infl*X_beta_miss; 58 | } 59 | 60 | } 61 | model { 62 | 63 | // vague priors 64 | X_beta ~ normal(0,5); 65 | X_beta_miss ~ normal(0,5); 66 | kappa ~ exponential(1); 67 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 68 | 69 | // need separate counters for logit (0/1) and beta regression 70 | 71 | target += bernoulli_logit_lpmf(0|calc_degen_miss); 72 | 73 | for(n in 1:N_degen) { 74 | 75 | if(outcome_degen[n]==0) { 76 | // Pr(Y==0) 77 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 78 | } else { 79 | //Pr(Y==1) 80 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 81 | } 82 | } 83 | if(infl_value<0) { 84 | for(n in 1:N_prop) { 85 | // Pr(Y in (0,1)) 86 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 87 | // Pr(Y==x where x in (0,1)) 88 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 89 | } 90 | } else { 91 | for(n in 1:N_prop) { 92 | 93 | // Pr(Y in (0,1)) 94 | 95 | real pry01 = log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 96 | 97 | 98 | // inflate the outcome 99 | if(infl_this[n]==1) { 100 | //target += bernoulli_logit_lpmf(1|calc_miss[n]); 101 | target += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[n]), 102 | bernoulli_logit_lpmf(0|calc_miss[n]) + 103 | beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa)); 104 | } else { 105 | 106 | target += bernoulli_logit_lpmf(0|calc_miss[n]); // "true" observed value 107 | target += pry01; 108 | // Pr(Y==x where x in (0,1)) 109 | target += beta_proportion_lpdf(outcome_prop[n]|inv_logit(calc_prop[n]),kappa); 110 | } 111 | 112 | } 113 | } 114 | 115 | 116 | } 117 | 118 | generated quantities { 119 | 120 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 121 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 122 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 123 | int infl_gen[(run_gen==0 || infl_value<0) ? 
0 : N_pred_degen+N_pred_prop]; // whether observation belongs to inflated value or not 124 | 125 | 126 | if(run_gen==1) { 127 | 128 | if(N_pred_degen>0) { 129 | 130 | // first do degenerate outcomes 131 | // note: these could be *re-generated* as beta/propotions 132 | for(i in 1:num_elements(indices_degen)) { 133 | 134 | // draw an outcome 0 / prop / 1 135 | regen_degen[i] = ordered_logistic_rng(calc_degen[i],cutpoints); 136 | 137 | if(outcome_degen[i]==0) { 138 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 139 | } else { 140 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 141 | } 142 | 143 | 144 | if(regen_degen[i]==1) { 145 | regen_all[i] = 0; 146 | } else if(regen_degen[i]==3) { 147 | regen_all[i] = 1; 148 | } else { 149 | 150 | if(!(infl_value<0)) { 151 | infl_gen[i] = bernoulli_logit_rng(calc_degen_miss[i]); 152 | 153 | if(infl_gen[i]==1) { 154 | regen_all[i] = infl_value; 155 | } else { 156 | 157 | regen_all[i] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 158 | 159 | } 160 | } else { 161 | regen_all[i] = beta_proportion_rng(inv_logit(calc_degen[i]),kappa); 162 | } 163 | } 164 | 165 | } 166 | 167 | if(N_pred_prop>0) { 168 | // now do originally proportional outcomes 169 | // can be re-generated as 0s or 1s 170 | 171 | int skip = num_elements(indices_degen); 172 | 173 | for(i in 1:num_elements(indices_prop)) { 174 | 175 | // draw an outcome 0 / prop / 1 176 | regen_degen[i+skip] = ordered_logistic_rng(calc_prop[i],cutpoints); 177 | 178 | ord_log[i+skip] = log(inv_logit(calc_prop[i] - cutpoints[1]) - inv_logit(calc_prop[i] - cutpoints[2])); 179 | 180 | if(!(infl_value<0)) { 181 | if(infl_this[i]==1) { 182 | //ord_log[i + skip] += bernoulli_logit_lpmf(1|calc_miss[i]); 183 | ord_log[i + skip] += log_sum_exp(bernoulli_logit_lpmf(1|calc_miss[i]), 184 | bernoulli_logit_lpmf(0|calc_miss[i]) + 185 | beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa)); 186 | 187 | // log_sum_exp(beta_proportion_lccdf(infl_value+0.1|inv_logit(calc_prop[i]),kappa), 188 | // beta_proportion_lcdf(infl_value-0.1|inv_logit(calc_prop[i]),kappa)); 189 | } else { 190 | 191 | ord_log[i + skip] += bernoulli_logit_lpmf(0|calc_miss[i]); // "true" observed value 192 | // Pr(Y==x where x in (0,1)) 193 | 194 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 195 | 196 | } 197 | 198 | 199 | } else { 200 | ord_log[i + skip] += beta_proportion_lpdf(outcome_prop[i]|inv_logit(calc_prop[i]),kappa); 201 | } 202 | 203 | 204 | if(regen_degen[i+skip]==1) { 205 | regen_all[i+skip] = 0; 206 | } else if(regen_degen[i+skip]==3) { 207 | regen_all[i+skip] = 1; 208 | } else { 209 | // did not occur in original data but could re-occur probabilistically 210 | // check for inflation first 211 | 212 | if(!(infl_value<0)) { 213 | infl_gen[i] = bernoulli_logit_rng(calc_miss[i]); 214 | 215 | if(infl_gen[i]==1) { 216 | regen_all[i + skip] = infl_value; 217 | } else { 218 | regen_all[i + skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 219 | } 220 | } else { 221 | regen_all[i+skip] = beta_proportion_rng(inv_logit(calc_prop[i]),kappa); 222 | } 223 | 224 | } 225 | 226 | } 227 | } 228 | } 229 | } 230 | 231 | } 232 | 233 | -------------------------------------------------------------------------------- /beta_logit_phireg.stan: -------------------------------------------------------------------------------- 1 | // 2 | // Ordinal beta regression model for analying experimental outcomes 3 | // with proportion and degenerate responses (i.e. 
0 and 1) 4 | // Models 0/1 as ordered categories above/below (0,1) 5 | // Robert Kubinec 6 | // New York University Abu Dhabi 7 | data { 8 | int N_prop; // number of proportion observations (0,1) 9 | int N_degen; // number of 0/1 observations 10 | int X; // number predictors 11 | int X_phi; // number of predictors for phi (scale of beta regression) 12 | vector[N_prop] outcome_prop; // Y in (0,1) 13 | int outcome_degen[N_degen]; // Y in {0,1} 14 | matrix[N_prop,X] covar_prop; // covariate X for proportion outcome 15 | matrix[N_prop,X_phi] covar_prop_phi; // covariate X_phi to predict phi for beta regression 16 | matrix[N_degen,X] covar_degen; // covariate X for degenerate (0,1) outcome 17 | matrix[N_degen,X_phi] covar_degen_phi; // covariate X_phi to predict phi for (unobserved) beta regression 18 | int N_pred_degen; // number of posterior predictive samples for 0/1 19 | int N_pred_prop; // number of posterior predictive samples for (0,1) 20 | int indices_degen[N_pred_degen]; // random row indices to use for posterior predictive calculation of 0/1 21 | int indices_prop[N_pred_prop]; // random row indices to use for posterior predictive calculation of (0,1) 22 | int run_gen; // whether to use generated quantities 23 | } 24 | parameters { 25 | vector[X] X_beta; // common predictor 26 | vector[X_phi] X_beta_phi; // predictors for phi in beta regression 27 | ordered[2] cutpoints; // cutpoints on ordered (latent) variable (also stand in as intercepts) 28 | real alpha_phi; // intercept for phi 29 | } 30 | transformed parameters { 31 | // store matrix calculations 32 | 33 | vector[N_degen] calc_degen; 34 | vector[N_prop] calc_prop; 35 | vector[N_prop] calc_prop_phi; 36 | 37 | // drop the intercepts so everything is relative to the cutpoints 38 | calc_degen = covar_degen*X_beta; 39 | calc_prop = covar_prop*X_beta; 40 | calc_prop_phi = exp(alpha_phi + covar_prop_phi*X_beta_phi); 41 | } 42 | model { 43 | 44 | // vague priors 45 | X_beta ~ normal(0,5); 46 | X_beta_phi ~ normal(0,5); 47 | alpha_phi ~ normal(0,5); 48 | cutpoints[2] - cutpoints[1] ~ normal(0,3); 49 | 50 | // need separate counters for logit (0/1) and beta regression 51 | 52 | for(n in 1:N_degen) { 53 | if(outcome_degen[n]==0) { 54 | // Pr(Y==0) 55 | target += log1m_inv_logit(calc_degen[n] - cutpoints[1]); 56 | } else { 57 | //Pr(Y==1) 58 | target += log_inv_logit(calc_degen[n] - cutpoints[2]); 59 | } 60 | } 61 | 62 | for(n in 1:N_prop) { 63 | // Pr(Y in (0,1)) 64 | target += log(inv_logit(calc_prop[n] - cutpoints[1]) - inv_logit(calc_prop[n] - cutpoints[2])); 65 | // Pr(Y==x where x in (0,1)) 66 | outcome_prop[n] ~ beta_proportion(inv_logit(calc_prop[n]),calc_prop_phi[n]); 67 | } 68 | 69 | } 70 | 71 | generated quantities { 72 | 73 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_degen; // which model is selected (degenerate or proportional) 74 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] regen_all; // final (combined) outcome -- defined as random subset of rows 75 | vector[run_gen==0 ? 0 : N_pred_degen+N_pred_prop] ord_log; // store log calculation for loo 76 | vector[run_gen==0 ? 
0 : N_pred_degen] calc_degen_phi = exp(alpha_phi + covar_degen_phi*X_beta_phi); // need phi for degenerate outcomes for which we don't use beta distribution 77 | 78 | if(run_gen==1) { 79 | if(N_pred_degen>0) { 80 | // first do degenerate outcomes 81 | // note: these could be *re-generated* as beta/propotions 82 | for(i in 1:num_elements(indices_degen)) { 83 | 84 | // draw an outcome 0 / prop / 1 85 | regen_degen[i] = ordered_logistic_rng(covar_degen[indices_degen[i],]*X_beta,cutpoints); 86 | 87 | if(outcome_degen[i]==0) { 88 | ord_log[i] = log1m_inv_logit(calc_degen[i] - cutpoints[1]); 89 | } else { 90 | ord_log[i] = log_inv_logit(calc_degen[i] - cutpoints[2]); 91 | } 92 | 93 | 94 | if(regen_degen[i]==1) { 95 | regen_all[i] = 0; 96 | } else if(regen_degen[i]==3) { 97 | regen_all[i] = 1; 98 | } else { 99 | // did not occur in original data but could re-occur probabilistically 100 | regen_all[i] = beta_proportion_rng(inv_logit(covar_degen[indices_degen[i],]*X_beta),calc_degen_phi[indices_degen[i]]); 101 | } 102 | 103 | } 104 | 105 | if(N_pred_prop>0) { 106 | // now do originally proportional outcomes 107 | // can be re-generated as 0s or 1s 108 | 109 | int skip = num_elements(indices_degen); 110 | 111 | for(i in 1:num_elements(indices_prop)) { 112 | 113 | // draw an outcome 0 / prop / 1 114 | regen_degen[i+skip] = ordered_logistic_rng(covar_prop[indices_prop[i],]*X_beta,cutpoints); 115 | 116 | ord_log[i+skip] = log(inv_logit(calc_prop[indices_prop[i]] - cutpoints[1]) - inv_logit(calc_prop[indices_prop[i]] - cutpoints[2])) + 117 | beta_proportion_lpdf(outcome_prop[indices_prop[i]]|inv_logit(calc_prop[indices_prop[i]]),calc_prop_phi[indices_prop[i]]); 118 | 119 | if(regen_degen[i+skip]==1) { 120 | regen_all[i+skip] = 0; 121 | } else if(regen_degen[i+skip]==3) { 122 | regen_all[i+skip] = 1; 123 | } else { 124 | // did not occur in original data but could re-occur probabilistically 125 | regen_all[i+skip] = beta_proportion_rng(inv_logit(covar_prop[indices_prop[i],]*X_beta),calc_prop_phi[indices_prop[i]]); 126 | } 127 | 128 | } 129 | } 130 | } 131 | } 132 | 133 | } 134 | 135 | -------------------------------------------------------------------------------- /dag_example.R: -------------------------------------------------------------------------------- 1 | library(dagitty) 2 | library(ggdag) 3 | 4 | # roommate causal graph 5 | 6 | roommate <- dagitty("dag{ 7 | d -> r; 8 | p -> r; 9 | s -> r; 10 | n -> r; 11 | t -> r; 12 | v -> r; 13 | l -> r; 14 | d -> l; 15 | p -> l; 16 | p -> n; 17 | v -> s; 18 | v -> n; 19 | l -> v; 20 | }") 21 | 22 | ggdag(roommate) 23 | 24 | # find variables that aren't related 25 | impliedConditionalIndependencies(roommate) 26 | 27 | # find the adjustment set 28 | adjustmentSets(roommate, exposure="l", outcome = "r") 29 | 30 | # ggdag version 31 | ggdag_adjustment_set(roommate, exposure="l",outcome="r") 32 | -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 methodology.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 methodology.doc -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 que.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 que.docx 
-------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 readme.txt -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28 topline.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28 topline.docx -------------------------------------------------------------------------------- /data/W28_Aug17/ATP W28.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/ATP W28.sav -------------------------------------------------------------------------------- /data/W28_Aug17/~$P W28 que.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/W28_Aug17/~$P W28 que.docx -------------------------------------------------------------------------------- /data/all_vars_Beta_trans.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_Beta_trans.rds -------------------------------------------------------------------------------- /data/all_vars_frac.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_frac.rds -------------------------------------------------------------------------------- /data/all_vars_ord.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_ord.rds -------------------------------------------------------------------------------- /data/all_vars_zoib.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/all_vars_zoib.rds -------------------------------------------------------------------------------- /data/compare_prob.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/compare_prob.rds -------------------------------------------------------------------------------- /data/sample_all.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sample_all.rds -------------------------------------------------------------------------------- /data/sim_cont_X.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X.RData -------------------------------------------------------------------------------- /data/sim_cont_X.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X.rds -------------------------------------------------------------------------------- /data/sim_cont_X_fixed.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X_fixed.RData -------------------------------------------------------------------------------- /data/sim_cont_X_fixed.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/sim_cont_X_fixed.rds -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_extended.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | *The next line reads in the data for the baseline reform indicator. Note that the path should be corrected. 4 | 5 | *use "......\EER-D-13-00718R2_Event_study_extended_sample_reform.dta" 6 | 7 | xi: logit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, cluster(ccode) 8 | 9 | xi: relogit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, cluster(ccode) 10 | 11 | xi: xtlogit reform1 Revolution4b war ww1 timesincereformbase _splinebase*, fe 12 | 13 | xi: logit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, cluster(ccode) 14 | 15 | xi: relogit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, cluster(ccode) 16 | 17 | xi: xtlogit reform1 Revolution4b laggdp lagn war ww1 timesincereformbase _splinebase*, fe 18 | 19 | *Equation (16) -- Linear probability models with year effects 20 | 21 | xi: reg reform1 Revolution4b war i.ccode i.year, cluster(ccode) 22 | 23 | xi: reg reform1 Revolution4b laggdp lagn war i.ccode i.year, cluster(ccode) 24 | 25 | 26 | * Models with reversals (columns 4 and 8 in Table 9) 27 | 28 | clear 29 | 30 | *The next line reads in the data for the reform indicator which allows for reversals. Note that the path should be corrected. 31 | 32 | *use "......\EER-D-13-00718R2_Event_study_extended_sample_reversals.dta" 33 | 34 | xi: logit reform_reversal Revolution4b war ww1 timesincereformreversal _splinereversal*, cluster(ccode) 35 | 36 | xi: logit reform_reversal Revolution4b laggdp lagn war ww1 timesincereformreversal _splinereversal*, cluster(ccode) 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_main.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | *The next line reads in the data for the baseline reform measure. Note that the path should be corrected. 
4 | 5 | *use "........\EER-D-13-00718R2_Event_study_main_sample_reform.dta" 6 | 7 | *Table 8 except for regression 7 which estimated below 8 | 9 | xi: logit Reform1 Revolution2_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 10 | 11 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 12 | 13 | xi: logit Reform1 Revolution_ling_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 14 | 15 | xi: logit Reform1 lagrevolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 16 | 17 | xi: logit Reform1 revolution4b New_learn_ling laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 18 | 19 | xi: relogit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 20 | 21 | xi: xtlogit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, fe 22 | 23 | * Table 10 event study part 24 | 25 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtrade timesincereform _spline*, cluster(ccode) 26 | 27 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtc timesincereform _spline*, cluster(ccode) 28 | 29 | xi: logit Reform1 revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagagricultural timesincereform _spline*, cluster(ccode) 30 | 31 | xi: logit Reform1 revolution4b laggdp lagn lagurban warintensity2 ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 32 | 33 | 34 | * Table D1 in the supplementary appendix 35 | 36 | 37 | xi: logit Reform1 revolution5b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 38 | 39 | xi: logit Reform1 revolution4b trend_hp cycle_hp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 40 | 41 | xi: logit Reform1 revolution4b g0_rev laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 42 | 43 | xi: logit Reform1 revolution4b g2_3rev laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 44 | 45 | *Table 8, regression 7 46 | 47 | clear 48 | 49 | *The next line reads in the data for the reform measure which allows for reversals. Note that the path should be corrected. 
50 | 51 | *use "........\EER-D-13-00718R2_Event_study_main_sample_reversals.dta" 52 | 53 | 54 | xi: logit Reformreversal revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard timesincereform _spline*, cluster(ccode) 55 | -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reform.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reform.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reversals.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_extended_sample_reversals.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reform.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reform.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reversals.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_Event_study_main_sample_reversals.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_READ_ME.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_READ_ME.pdf -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_maindata_suffrage.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_reversals_suffrage.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/data/suffrage_paper_replicationfiles/EER-D-13-00718R2_reversals_suffrage.dta -------------------------------------------------------------------------------- /data/suffrage_paper_replicationfiles/EER-D-13-00718R2_suffrage.do: -------------------------------------------------------------------------------- 1 | 2 | clear all 3 | 4 | *The next line reads in the data. 
Note that the path should be corrected. 5 | 6 | *use "........\EER-D-13-00718R2_maindata_suffrage.dta" 7 | 8 | *Table 2 9 | 10 | xi: xtpcse e2c lage2c Revolution2_2012 yy* i.ccode, pairwise correlation(psar1) 11 | 12 | xi: xtpcse e2c lage2c revolution4b yy* i.ccode, pairwise correlation(psar1) 13 | 14 | xi: xtpcse e2c lage2c Revolution_ling_2012 yy* i.ccode, pairwise correlation(psar1) 15 | 16 | *part with controls 17 | 18 | xi: xtpcse e2c lage2c Revolution2_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 19 | 20 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 21 | 22 | xi: xtpcse e2c lage2c Revolution_ling_2012 laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 23 | 24 | 25 | *Table 3 26 | 27 | *Conley type standard errors are obtained by using x_ols available from Conley's website (see documentation file). 28 | 29 | gen cutoff5001=500 /*more than +-500? km away -> assumed to be independent*/ 30 | gen cutoff5002=500 31 | gen const = 1 32 | 33 | xi: x_ols LatKm LongKm cutoff5001 cutoff5001 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 34 | matrix bb = e(b) 35 | matrix v_dep = vecdiag(cov_dep) 36 | scalar coef_e2 = bb[1,2] 37 | scalar se_e2 = sqrt(v_dep[1,2]) 38 | display coef_e2 39 | display coef_e2/se_e2 40 | 41 | drop epsilon 42 | drop dis1 43 | drop dis2 44 | drop window 45 | 46 | gen cutoff8001=800 /*more than +-800? km away -> assumed to be independent*/ 47 | gen cutoff8002=800 48 | 49 | xi: x_ols LatKm LongKm cutoff8001 cutoff8002 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 50 | matrix bb = e(b) 51 | matrix v_dep = vecdiag(cov_dep) 52 | scalar coef_e2 = bb[1,2] 53 | scalar se_e2 = sqrt(v_dep[1,2]) 54 | display coef_e2 55 | display coef_e2/se_e2 56 | 57 | drop epsilon 58 | drop dis1 59 | drop dis2 60 | drop window 61 | 62 | gen cutoff14001=1400 /*more than +-1400?
km away -> assumed to be independent*/ 63 | gen cutoff14002=1400 64 | 65 | xi: x_ols LatKm LongKm cutoff14001 cutoff14002 e2c lage2c revolution4b laggdp lagn lagurban lagenrollment war ww1 Gold_standard i.ccode yy*, xreg(72) coord(2) 66 | matrix bb = e(b) 67 | matrix v_dep = vecdiag(cov_dep) 68 | scalar coef_e2 = bb[1,2] 69 | scalar se_e2 = sqrt(v_dep[1,2]) 70 | display coef_e2 71 | display coef_e2/se_e2 72 | 73 | drop epsilon 74 | drop dis1 75 | drop dis2 76 | drop window 77 | 78 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 lagrevolution4b Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 79 | 80 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 revolution4b ownrev Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 81 | 82 | xi: xtpcse de2 laggdp lagn lagurban war ww1 lagrevolution4b Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 83 | 84 | *Table 5 85 | 86 | xi: xtpcse e2c lage2c revolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 87 | 88 | xi: xtpcse e2c lage2c lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 89 | 90 | xi: xtpcse de2 lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard yy* i.ccode, correlation(psar1) pairwise 91 | 92 | xi: xtpcse e2c lage2c revolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 93 | 94 | xi: xtpcse e2c lage2c lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 95 | 96 | xi: xtpcse de2 lagrevolution4b ownrev laggdp lagn lagurban war lagenrollment Gold_standard i.year i.ccode, het correlation(psar1) 97 | 98 | *Table 6 99 | 100 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard g0_rev yy* i.ccode, pairwise correlation(psar1) 101 | 102 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard g2_3rev yy* i.ccode, pairwise correlation(psar1) 103 | 104 | xi: xtpcse e2c lage2c revolution4b cycle_hp trend_hp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 105 | 106 | xi: xtpcse e2c lage2c year lograin laglograin revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 107 | 108 | xi: xtpcse e2c lage2c year revolution4b raingrowth lagraingrowth laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 109 | 110 | xi: xtpcse e2c lage2c revolution5b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 111 | 112 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode if year<1914, pairwise correlation(psar1) 113 | 114 | *regression 8 with reversals can be found at the end of this do file 115 | 116 | *Table 7 117 | 118 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise hetonly 119 | 120 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise 121 | 122 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise hetonly correlation(psar1) 123 | 124 | *Bruno correction 125 | xi: xtlsdvc e2c laggdp lagn lagurban war ww1 
revolution4b Gold_standard lagenrollment yy*, initial(ah) vcov(50) 126 | 127 | *System GMM 128 | 129 | xi: xtdpdsys e2c laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy*, maxldep(3) maxlags(3) 130 | 131 | *Tobit 132 | 133 | xi: tobit e2c lage2c laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy* i.ccode, ll(0) ul(100) 134 | 135 | *Fractional logit. Use the rescalled Suffrage variables. 136 | 137 | xi: glm e2c_wp lage2c_wp laggdp lagn lagurban war ww1 revolution4b Gold_standard lagenrollment yy* i.ccode if e2c_wp<=1, fam(bin) link(logit) robust cluster(ccode) 138 | 139 | *Table 10 - suffrage part 140 | 141 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtrade yy* i.ccode, pairwise correlation(psar1) 142 | 143 | 144 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagtc yy* i.ccode, pairwise correlation(psar1) 145 | 146 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard lagagricultural yy* i.ccode, pairwise correlation(psar1) 147 | 148 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban warintensity2 ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 149 | 150 | 151 | *Other estimations reported or discussed in the text 152 | 153 | *Equation (12) -- with authors of liberty 154 | 155 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard authors_2 authors_3 yy* i.ccode, pairwise correlation(psar1) 156 | 157 | 158 | *Equation (14) -- Distance interaction 159 | 160 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 Revolution2_2012 Revtdist Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 161 | 162 | *Distance interaction with year effects (discussed after Equation (14)) 163 | 164 | xi: xtpcse e2c lage2c laggdp lagn lagurban war Revtdist Gold_standard lagenrollment i.year i.ccode, het correlation(psar1) 165 | 166 | 167 | *Equation (15) - inequality interaction with year effects 168 | 169 | xi: xtpcse e2c lage2c laggdp lagn lagurban war revolution4b lagGini Ginirev2 Gold_standard lagenrollment i.year i.ccode, hetonly correlation(psar1) 170 | 171 | *Inequality interaction with two-year fixed effects and spatial correlation 172 | 173 | xi: xtpcse e2c lage2c laggdp lagn lagurban war ww1 revolution4b lagGini Ginirev2 Gold_standard lagenrollment yy* i.ccode, pairwise correlation(psar1) 174 | 175 | 176 | *Equation 3 in Supplementary Appendix D and discussed in Section 5.2.4 177 | 178 | xi: reg e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, cluster(year) 179 | 180 | *Error Correction Model estimates reported in Supplementary Appendix D and discussed in Section 5.5 181 | 182 | xi: xtpcse d.e2c lage2c l.laggdp l.lagn l.lagurban l.war l.ww1 l.revolution4b l.Gold_standard l.lagenrollment d.laggdp d.lagn d.lagurban d.war d.ww1 d.revolution4b d.Gold_standard d.lagenrollment yy*, pairwise correlation(psar1) 183 | 184 | clear 185 | 186 | *use "........\EER-D-13-00718R2_reversals_suffrage.dta" 187 | 188 | *Table 6, regression 8 which allows for democratic reversals 189 | 190 | 191 | xi: xtpcse e2c lage2c revolution4b laggdp lagn lagurban war ww1 lagenrollment Gold_standard yy* i.ccode, pairwise correlation(psar1) 192 | 193 | -------------------------------------------------------------------------------- /define_ord_betareg.R: 
-------------------------------------------------------------------------------- 1 | # File with code needed to fit ordered beta regression 2 | # As custom family in R package brms 3 | # See vignette at 4 | 5 | # custom family 6 | 7 | ord_beta_reg <- custom_family("ord_beta_reg", 8 | dpars=c("mu","phi","cutzero","cutone"), 9 | links=c("logit","log",NA,NA), 10 | lb=c(NA,0,NA,NA), 11 | type="real") 12 | 13 | # stan code for density of the model 14 | 15 | stan_funs <- "real ord_beta_reg_lpdf(real y, real mu, real phi, real cutzero, real cutone) { 16 | 17 | //auxiliary variables 18 | real mu_logit = logit(mu); 19 | vector[2] thresh; 20 | thresh[1] = cutzero; 21 | thresh[2] = cutzero + exp(cutone); 22 | 23 | if(y==0) { 24 | return log1m_inv_logit(mu_logit - thresh[1]); 25 | } else if(y==1) { 26 | return log_inv_logit(mu_logit - thresh[2]); 27 | } else { 28 | return log(inv_logit(mu_logit - thresh[1]) - inv_logit(mu_logit - thresh[2])) + 29 | beta_proportion_lpdf(y|mu,phi); 30 | } 31 | }" 32 | 33 | stanvars <- stanvar(scode=stan_funs,block="functions") 34 | 35 | # For pulling posterior predictions 36 | 37 | posterior_predict_ord_beta_reg <- function(i, draws, ...) { 38 | mu <- draws$dpars$mu[, i] 39 | phi <- draws$dpars$phi 40 | cutzero <- draws$dpars$cutzero 41 | cutone <- draws$dpars$cutone 42 | N <- length(phi) 43 | 44 | thresh1 <- cutzero 45 | thresh2 <- cutzero + exp(cutone) 46 | 47 | pr_y0 <- 1 - plogis(qlogis(mu) - thresh1) 48 | pr_y1 <- plogis(qlogis(mu) - thresh2) 49 | pr_cont <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 50 | out_beta <- rbeta(n=N,mu*phi,(1-mu)*phi) 51 | 52 | # now determine which one we get for each observation 53 | outcomes <- sapply(1:N, function(i) { 54 | sample(1:3,size=1,prob=c(pr_y0[i],pr_cont[i],pr_y1[i])) 55 | }) 56 | 57 | final_out <- sapply(1:length(outcomes),function(i) { 58 | if(outcomes[i]==1) { 59 | return(0) 60 | } else if(outcomes[i]==2) { 61 | return(out_beta[i]) 62 | } else { 63 | return(1) 64 | } 65 | }) 66 | 67 | final_out 68 | 69 | } 70 | 71 | # for calculating marginal effects/conditional expectations 72 | 73 | posterior_epred_ord_beta_reg<- function(draws) { 74 | 75 | cutzero <- draws$dpars$cutzero 76 | cutone <- draws$dpars$cutone 77 | 78 | mu <- draws$dpars$mu 79 | 80 | thresh1 <- cutzero 81 | thresh2 <- cutzero + exp(cutone) 82 | 83 | low <- 1 - plogis(qlogis(mu) - thresh1) 84 | middle <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 85 | high <- plogis(qlogis(mu) - thresh2) 86 | 87 | low*0 + middle*mu + high 88 | } 89 | 90 | # for calcuating LOO and Bayes Factors 91 | 92 | log_lik_ord_beta_reg <- function(i, draws) { 93 | 94 | mu <- draws$dpars$mu[,i] 95 | phi <- draws$dpars$phi 96 | y <- draws$data$Y[i] 97 | cutzero <- draws$dpars$cutzero 98 | cutone <- draws$dpars$cutone 99 | 100 | thresh1 <- cutzero 101 | thresh2 <- cutzero + exp(cutone) 102 | 103 | if(y==0) { 104 | out <- log(1 - plogis(qlogis(mu) - thresh1)) 105 | } else if(y==1) { 106 | out <- log(plogis(qlogis(mu) - thresh2)) 107 | } else { 108 | out <- log(plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2)) + dbeta(y,mu*phi,(1-mu)*phi,log=T) 109 | } 110 | 111 | out 112 | 113 | } 114 | 115 | ###### Code declaring induced dirichlet prior #### 116 | # code from Michael Betancourt/Staffan Betner 117 | # discussion here: https://discourse.mc-stan.org/t/dirichlet-prior-on-ordinal-regression-cutpoints-in-brms/20640 118 | dirichlet_prior <- " 119 | real induced_dirichlet_lpdf(vector c, vector alpha, real phi) { 120 | int K = num_elements(c) + 1; 121 | 
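// c holds the K-1 ordered cutpoints, alpha the Dirichlet concentration
// parameters, and phi an anchor point (set to 0 where this prior is used below);
// the function returns the Dirichlet log density of the ordinal probabilities
// induced by the cutpoints, plus the log Jacobian of that transformation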
vector[K - 1] sigma = inv_logit(phi - c); 122 | vector[K] p; 123 | matrix[K, K] J = rep_matrix(0, K, K); 124 | 125 | // Induced ordinal probabilities 126 | p[1] = 1 - sigma[1]; 127 | for (k in 2:(K - 1)) 128 | p[k] = sigma[k - 1] - sigma[k]; 129 | p[K] = sigma[K - 1]; 130 | 131 | // Baseline column of Jacobian 132 | for (k in 1:K) J[k, 1] = 1; 133 | 134 | // Diagonal entries of Jacobian 135 | for (k in 2:K) { 136 | real rho = sigma[k - 1] * (1 - sigma[k - 1]); 137 | J[k, k] = - rho; 138 | J[k - 1, k] = rho; 139 | } 140 | 141 | return dirichlet_lpdf(p | alpha) 142 | + log_determinant(J); 143 | } 144 | " 145 | dirichlet_prior_stanvar <- stanvar(scode = dirichlet_prior, block = "functions") 146 | 147 | stanvar(scode = "ordered[2] thresh; 148 | thresh[1] = cutzero; 149 | thresh[2] = cutzero+exp(cutone);", 150 | block = "tparameters") -> # there might be a better way to specify this 151 | dirichlet_prior_ordbeta_stanvar 152 | 153 | stanvars <- stanvars + dirichlet_prior_stanvar + dirichlet_prior_ordbeta_stanvar 154 | 155 | # Feel free to add any other priors / change the priors on b, 156 | # which represent regression coefficients on the logit 157 | # scale 158 | 159 | priors <- set_prior("target += induced_dirichlet_lpdf(thresh | rep_vector(1, 3), 0)", check=FALSE) + 160 | set_prior("normal(0,5)",class="b") + 161 | set_prior("exponential(.1)",class="phi") 162 | 163 | # priors <- set_prior("normal(0,5)",class="b") + 164 | # prior(constant(0),class="b",coef="Intercept") + 165 | # prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) + 166 | # set_prior("exponential(.1)",class="phi") 167 | 168 | priors_phireg <- set_prior("normal(0,5)",class="b") + 169 | set_prior("target += induced_dirichlet_lpdf(thresh | rep_vector(1, 3), 0)", check=FALSE) 170 | -------------------------------------------------------------------------------- /estimate_with_brms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to Run Ordered Beta Regression with `brms`" 3 | author: Robert Kubinec 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=F} 8 | 9 | require(brms) 10 | require(dplyr) 11 | require(tidyr) 12 | require(ggplot2) 13 | require(haven) 14 | 15 | knitr::opts_chunk$set(echo=T) 16 | 17 | set.seed(71520177) 18 | 19 | ``` 20 | 21 | 22 | # Overview 23 | 24 | This notebook contains instructions for running the ordered beta regression model in the R package `brms`, a front-end to the Stan Hamiltonian Markov Chain Monte Carlo sampler. The ordered beta regression model is designed explicitly for slider scale/visual analog scale data of the type you will often find in online surveys among other areas. I refer you to a paper on the model if you are not familiar with it: https://osf.io/preprints/socarxiv/2sx6y/. 25 | 26 | The ordered beta regression model is not natively supported in `brms` and so instead I define it here using the [custom response option](https://cran.r-project.org/web/packages/brms/vignettes/brms_customfamilies.html) of `brms`. 27 | 28 | # Data Preparation 29 | 30 | First, I load data from the Pew Forum that asks a question about respondents' views towards college professors (for a more complete explication, see the paper referenced above). 
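A nontrivial share of the responses fall exactly at the bounds of the scale, which is what motivates using the ordered beta model here. As a quick illustrative check (shown unevaluated, since it assumes the `pew` object created in the chunk below), you can tabulate that share directly:

```r
# share of thermometer responses exactly at the bounds (0 or 100);
# assumes `pew` as created in the following chunk
mean(pew$therm %in% c(0, 100))
```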
31 | 32 | ```{r load_data} 33 | 34 | pew <- read_sav("data/W28_Aug17/ATP W28.sav") %>% 35 | mutate(therm=na_if(THERMO_THERMBC_W28,999)) %>% 36 | filter(!is.na(therm)) 37 | 38 | pew %>% 39 | ggplot(aes(x=therm)) + 40 | geom_histogram(bins=100) + 41 | theme_minimal() + 42 | theme(panel.grid=element_blank()) + 43 | scale_x_continuous(breaks=c(0,25,50,75,100), 44 | labels=c("0","Colder","50","Warmer","100")) + 45 | ylab("") + 46 | xlab("") + 47 | labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) 48 | 49 | ``` 50 | 51 | The distribution of feelings towards college professors contains both degenerate (0 and 100) and continuous responses between 0 and 100. To model it, we first need to rescale the outcome so that it is bounded between 0 and 1 instead of 0 and 100. This is done very easily by subtracting the minimum value, in this case 0, and then dividing by the difference between the minimum and maximum (i.e., 100). I also do some other data processing tasks: 52 | 53 | ```{r munge_data} 54 | 55 | model_data <- select(pew,therm,age="F_AGECAT_FINAL", 56 | sex="F_SEX_FINAL", 57 | income="F_INCOME_FINAL", 58 | ideology="F_IDEO_FINAL", 59 | race="F_RACETHN_RECRUITMENT", 60 | education="F_EDUCCAT2_FINAL", 61 | region="F_CREGION_FINAL", 62 | approval="POL1DT_W28", 63 | born_again="F_BORN_FINAL", 64 | relig="F_RELIG_FINAL", 65 | news="NEWS_PLATFORMA_W28") %>% 66 | mutate_all(zap_missing) %>% 67 | drop_na %>% 68 | mutate(therm=(therm - min(therm,na.rm = T))/(max(therm,na.rm=T) - 69 | min(therm,na.rm = T)), 70 | news=as_factor(news,levels="labels"), 71 | age=c(scale(age)), 72 | race=as_factor(race,levels="labels"), 73 | ideology=as_factor(ideology,levels="labels"), 74 | income=as_factor(income,levels="labels"), 75 | approval=as_factor(approval,levels="labels"), 76 | sex=as_factor(sex,levels="labels"), 77 | education=as_factor(education,levels="labels"), 78 | born_again=as_factor(born_again,levels="labels"), 79 | relig=as_factor(relig,levels="labels")) %>% 80 | mutate_at(c("race","ideology","income","approval","sex","education","born_again","relig"), function(c) { 81 | factor(c, exclude=levels(c)[length(levels(c))]) 82 | }) %>% 83 | # need to make these ordered factors for BRMS 84 | mutate(education=ordered(education), 85 | income=ordered(income)) 86 | 87 | ``` 88 | 89 | The completed dataset has `r nrow(model_data)` observations. 90 | 91 | # Define Custom Family 92 | 93 | To model this data in `brms`, I define a new distribution, `ord_beta_reg`, using the `custom_family` function. You need to run the following code in R before trying to use the custom family as it defines the log-likelihood and the priors (you can of course add additional priors of your own). You can access this code as an R script `define_ord_betareg.R` in the [Github repository containing this Rmarkdown file](https://github.com/saudiwin/ordbetareg).
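Written out, the density that the Stan code below implements is, with $\eta = \mathrm{logit}(\mu)$ and ordered cutpoints $k_1 < k_2$ (in the code, $k_1$ is `cutzero` and $k_2$ is `cutzero + exp(cutone)`):

$$P(y = 0) = 1 - \mathrm{logit}^{-1}(\eta - k_1), \qquad P(y = 1) = \mathrm{logit}^{-1}(\eta - k_2),$$

$$f(y) = \left[\mathrm{logit}^{-1}(\eta - k_1) - \mathrm{logit}^{-1}(\eta - k_2)\right] \mathrm{Beta}\left(y \mid \mu\phi,\, (1 - \mu)\phi\right) \quad \text{for } y \in (0,1),$$

so the three components sum (and integrate) to one over $\{0\} \cup (0,1) \cup \{1\}$.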
94 | 95 | ```{r customfam} 96 | 97 | # custom family 98 | 99 | ord_beta_reg <- custom_family("ord_beta_reg", 100 | dpars=c("mu","phi","cutzero","cutone"), 101 | links=c("logit","log",NA,NA), 102 | lb=c(NA,0,NA,NA), 103 | type="real") 104 | 105 | # stan code for density of the model 106 | 107 | stan_funs <- "real ord_beta_reg_lpdf(real y, real mu, real phi, real cutzero, real cutone) { 108 | 109 | //auxiliary variables 110 | real mu_logit = logit(mu); 111 | vector[2] thresh; 112 | thresh[1] = cutzero; 113 | thresh[2] = cutzero + exp(cutone); 114 | 115 | if(y==0) { 116 | return log1m_inv_logit(mu_logit - thresh[1]); 117 | } else if(y==1) { 118 | return log_inv_logit(mu_logit - thresh[2]); 119 | } else { 120 | return log(inv_logit(mu_logit - thresh[1]) - inv_logit(mu_logit - thresh[2])) + 121 | beta_proportion_lpdf(y|mu,phi); 122 | } 123 | }" 124 | 125 | stanvars <- stanvar(scode=stan_funs,block="functions") 126 | 127 | # For pulling posterior predictions 128 | 129 | posterior_predict_ord_beta_reg <- function(i, draws, ...) { 130 | mu <- draws$dpars$mu[, i] 131 | phi <- draws$dpars$phi 132 | cutzero <- draws$dpars$cutzero 133 | cutone <- draws$dpars$cutone 134 | N <- length(phi) 135 | 136 | thresh1 <- cutzero 137 | thresh2 <- cutzero + exp(cutone) 138 | 139 | pr_y0 <- 1 - plogis(qlogis(mu) - thresh1) 140 | pr_y1 <- plogis(qlogis(mu) - thresh2) 141 | pr_cont <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 142 | out_beta <- rbeta(n=N,mu*phi,(1-mu)*phi) 143 | 144 | # now determine which one we get for each observation 145 | outcomes <- sapply(1:N, function(i) { 146 | sample(1:3,size=1,prob=c(pr_y0[i],pr_cont[i],pr_y1[i])) 147 | }) 148 | 149 | final_out <- sapply(1:length(outcomes),function(i) { 150 | if(outcomes[i]==1) { 151 | return(0) 152 | } else if(outcomes[i]==2) { 153 | return(out_beta[i]) 154 | } else { 155 | return(1) 156 | } 157 | }) 158 | 159 | final_out 160 | 161 | } 162 | 163 | # for calculating marginal effects/conditional expectations 164 | 165 | posterior_epred_ord_beta_reg <- function(draws) { 166 | 167 | cutzero <- draws$dpars$cutzero 168 | cutone <- draws$dpars$cutone 169 | 170 | mu <- draws$dpars$mu 171 | 172 | thresh1 <- cutzero 173 | thresh2 <- cutzero + exp(cutone) 174 | 175 | low <- 1 - plogis(qlogis(mu) - thresh1) 176 | middle <- plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2) 177 | high <- plogis(qlogis(mu) - thresh2) 178 | 179 | low*0 + middle*mu + high 180 | } 181 | 182 | # for calcuating LOO and Bayes Factors 183 | 184 | log_lik_ord_beta_reg <- function(i, draws) { 185 | 186 | mu <- draws$dpars$mu[,i] 187 | phi <- draws$dpars$phi 188 | y <- draws$data$Y[i] 189 | cutzero <- draws$dpars$cutzero 190 | cutone <- draws$dpars$cutone 191 | 192 | thresh1 <- cutzero 193 | thresh2 <- cutzero + exp(cutone) 194 | 195 | if(y==0) { 196 | out <- log(1 - plogis(qlogis(mu) - thresh1)) 197 | } else if(y==1) { 198 | out <- log(plogis(qlogis(mu) - thresh2)) 199 | } else { 200 | out <- log(plogis(qlogis(mu)-thresh1) - plogis(qlogis(mu) - thresh2)) + dbeta(y,mu*phi,(1-mu)*phi,log=T) 201 | } 202 | 203 | out 204 | 205 | } 206 | 207 | # Feel free to add any other priors / change the priors on b, 208 | # which represent regression coefficients on the logit 209 | # scale 210 | 211 | priors <- set_prior("normal(0,5)",class="b") + 212 | prior(constant(0),class="b",coef="Intercept") + 213 | prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) + 214 | set_prior("exponential(.1)",class="phi") 215 | 216 | priors_phireg <- 
set_prior("normal(0,5)",class="b") + 217 | prior(constant(0),class="b",coef="Intercept") + 218 | prior_string("target += normal_lpdf((cutzero + exp(cutone)) - cutzero|0,3) + cutone",check=F) 219 | 220 | ``` 221 | 222 | # Run In BRMS 223 | 224 | Given these new functions, we can then run a `brms` model as usual. The one catch is that we need to include the `priors` object in the `prior` argument and the `stanvars` object in the `stanvars` argument of the function, both of which come from the code block above. The second and *very important* item is that the model formula must start with `0 + Intercept` for the independent (right-hand side) variables. You can add anything you want after that first term. This is to ensure there is no multicollinearity between the ordinal cutpoints in the model and the intercept. 225 | 226 | Other than that, everything is the same and you can use any cool `brms` features. To demonstrate some of these, I will model education and income as ordinal predictors by using the `mo()` function. By doing so, we can get a single effect for education and income instead of having to use dummies for separate education/income categories. As a result, I can include an interaction between the two variables to see whether the effect of education on views towards college professors varies with income. Finally, I include varying (random) census region intercepts. 227 | 228 | One note is that I use the `cmdstanr` backend for `brms` to use the latest version of Stan, but you can remove that option to use the Stan that comes with `brms` (`rstan`). 229 | 230 | ```{r run_brms} 231 | 232 | brms_fit <- brm(therm ~ 0 + Intercept + mo(education)*mo(income) + (1|region), data=model_data, 233 | family=ord_beta_reg, 234 | cores=2,chains=2, 235 | prior = priors, 236 | refresh=0, 237 | backend="cmdstanr", 238 | stanvars=stanvars) 239 | 240 | ``` 241 | 242 | The running time for this model, which has pretty complicated predictors, is about 7 minutes, so the model is currently robust enough to handle datasets of reasonable size. Performance will improve if I can get the model into `brms` proper. The one divergent transition referenced above is due to the well-known funnel problem of the variance of the random intercepts, and I will ignore it for the purposes of this vignette. 243 | 244 | # Post-Estimation 245 | 246 | The first thing we can do is extract the model cutpoints and overlay them on the empirical distribution to see how the model is dividing the outcome into discrete-ish categories.
We have to transform the cutpoints using the inverse logit function in R (`plogis`) to get back values on the scale of the response; the second cutpoint is recovered by exponentiating its raw parameter and adding it to the first cutpoint: 247 | 248 | ```{r plot_cut} 249 | 250 | all_draws <- extract_draws(brms_fit) 251 | 252 | cutzero <- plogis(all_draws$dpars$cutzero) 253 | cutone <- plogis(all_draws$dpars$cutzero + exp(all_draws$dpars$cutone)) 254 | 255 | pew %>% 256 | ggplot(aes(x=therm)) + 257 | geom_histogram(bins=100) + 258 | theme_minimal() + 259 | theme(panel.grid=element_blank()) + 260 | scale_x_continuous(breaks=c(0,25,50,75,100), 261 | labels=c("0","Colder","50","Warmer","100")) + 262 | geom_vline(xintercept = mean(cutzero)*100,linetype=2) + 263 | geom_vline(xintercept = mean(cutone)*100,linetype=2) + 264 | ylab("") + 265 | xlab("") + 266 | labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) 267 | 268 | 269 | ``` 270 | 271 | We can see in the plot above that the model does a good job isolating values that are very close to 0 and 1 from values that are more continuous in nature. 272 | 273 | We can plot the full predictive distribution relative to the original outcome: 274 | 275 | ```{r post_predict} 276 | 277 | pp_check(brms_fit) + theme_minimal() 278 | 279 | ``` 280 | 281 | We can see the coefficients from the model with the following command (these are on the logit scale with the exception of phi, the scale/dispersion parameter): 282 | 283 | ```{r coef_plot} 284 | plot(brms_fit,ask=F,theme=theme(panel.grid=element_blank(), 285 | panel.background = element_blank())) 286 | ``` 287 | 288 | # Marginal Effects 289 | 290 | We can also look at marginal effects, or the average change in the outcome given a unit change in one of the variables, by using the `conditional_effects` function in `brms`. This function plots the effect of `income` and `education` separately and then the interaction of the two. 291 | 292 | ```{r marg_effect} 293 | 294 | plot(conditional_effects(brms_fit),theme=theme(axis.text.x = element_text(angle=90), 295 | panel.grid=element_blank(), 296 | panel.background = element_blank()),ask=F) 297 | 298 | ``` 299 | 300 | Broadly speaking, these plots show that richer people and less educated people both have less favorable views towards college professors, and the effect is even stronger if we consider the interaction. Wealthier, less educated people are dramatically more likely to have less favorable views towards college professors than poor people with a postgraduate education. The difference is equivalent to 0.4, or 40 points on the original 0 to 100 scale. 301 | 302 | # Understanding Clustering/Polarization of Respondents 303 | 304 | As I explain in the paper, one of the main advantages of using a Beta regression model is its ability to model the dispersion among respondents not just in terms of variance (i.e. heteroskedasticity) but also the shape of dispersion, whether it is U or inverted-U shaped. Conceptually, a U shape would imply that respondents are polarized, moving towards the extremes. An inverted-U shape would imply that respondents tend to cluster around a central value. We can predict these responses conditionally in the sample by adding predictors for `phi`, the scale/dispersion parameter in the Beta distribution. Higher values of `phi` imply a uni-modal distribution clustered around a central value, with increasing `phi` implying more clustering.
Lower values of `phi` imply a bi-modal distribution with values at the extremes. Notably, these effects are calculated independently of the expected value, or mean, of the distribution, so values of `phi` will produce different shapes depending on the average value. 305 | 306 | The one change we need to make to fit this model is to add a formula predicting `phi` in the code below (this formula does not need the `0 + Intercept` syntax of the main model). We wrap both formulas in the `bf` function to indicate they are both distributional parameters. We also need to change the prior argument to `priors_phireg` as the priors of the model have changed as a result of adding predictors. For this model, we will put in an interaction between age and sex to see if younger/older men/women tend to have clustered or more heterogeneous views on college professors. To make the model run a bit faster, we drop the `mo` terms around `education` and `income` so that they are evaluated as dummy variables. 307 | 308 | 309 | ```{r run_brms_phi} 310 | 311 | brms_fit_phireg <- brm(bf(therm ~ 0 + Intercept + education + income, 312 | phi ~ age*sex), 313 | data=model_data, 314 | family=ord_beta_reg, 315 | cores=2,chains=2, 316 | prior = priors_phireg, 317 | refresh=100, 318 | stanvars=stanvars, 319 | backend="cmdstanr") 320 | 321 | ``` 322 | 323 | We cannot use the `conditional_effects` option to plot because the dispersion parameter `phi` by definition does not affect the expected outcome (i.e., the average). Instead, we can use the regular `plot` function to visualize the parameters: 324 | 325 | ```{r plot_phi} 326 | 327 | plot(brms_fit_phireg,pars = "phi",fixed = F,ask=F,combo=c("intervals","hist")) 328 | 329 | ``` 330 | 331 | There is weak to moderate evidence in this plot that women have more homogeneous views and that older women have more homogeneous views than younger women. However, there is substantial uncertainty in this estimate and the estimate itself is not very large. We are using the log link for `phi`, so to get the value of the effect on `phi`, we can exponentiate the coefficient, i.e. `exp(0.1)=``r round(exp(0.1),2)`. We can compare what the distributions look like by plotting densities of simulated outcomes using the Beta distribution and the average response in our data, which is `r round(mean(model_data$therm,na.rm=T),2)`. We will examine a 1-unit change in `phi` (much larger than our estimated effects) from 2 to 3 by using the average response for `mu` and plotting the density of both distributions: 332 | 333 | ```{r plot_phi_sim} 334 | 335 | # parameterization of beta distribution in terms of mu/phi (mean/dispersion) 336 | rbeta_mean <- function(N,mu,phi) { 337 | rbeta(N,mu*phi,(1-mu)*phi) 338 | } 339 | 340 | tibble(phi_small=rbeta_mean(10000,mean(model_data$therm,na.rm=T),2), 341 | phi_big=rbeta_mean(10000,mean(model_data$therm,na.rm=T),3)) %>% 342 | gather("phi_type","simulated_value") %>% 343 | ggplot(aes(x=simulated_value)) + 344 | geom_density(alpha=0.5,aes(fill=phi_type)) + 345 | theme(panel.background = element_blank(), 346 | panel.grid=element_blank()) 347 | 348 | ``` 349 | 350 | We can see that the `phi_big` distribution is more clustered around a central value while `phi_small` shows more movement towards the extremes of the distribution. However, the movement is modest, as the value of the coefficient suggests.
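One way to translate effects on `phi` into the scale of the outcome is through the variance of this mean/dispersion parameterization, which is $\mu(1-\mu)/(1+\phi)$. A small helper along these lines (not part of the vignette code above) converts values of `phi` into implied standard deviations:

```r
# implied standard deviation of a Beta(mu*phi, (1-mu)*phi) outcome
beta_sd <- function(mu, phi) {
  sqrt(mu * (1 - mu) / (1 + phi))
}

beta_sd(0.5, 2) # ~0.29
beta_sd(0.5, 3) # ~0.25: higher phi, tighter clustering
```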
351 | -------------------------------------------------------------------------------- /figures/figure_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_1.pdf -------------------------------------------------------------------------------- /figures/figure_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_2.pdf -------------------------------------------------------------------------------- /figures/figure_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_3.pdf -------------------------------------------------------------------------------- /figures/figure_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_4.pdf -------------------------------------------------------------------------------- /figures/figure_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_5.pdf -------------------------------------------------------------------------------- /figures/figure_6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_6.pdf -------------------------------------------------------------------------------- /figures/figure_7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saudiwin/ordbetareg/348a22a51d63563b721dcdcf356b0892c884ace3/figures/figure_7.pdf -------------------------------------------------------------------------------- /frac_logit.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | // need a custom cdf 3 | 4 | real frac_logit_lcdf(real y, real p) { 5 | 6 | return log(1 - 2*p) - (log(2) + log(atan(1 - 2*p))); 7 | 8 | } 9 | 10 | 11 | } 12 | 13 | data { 14 | int n; 15 | int k; // number of columns 16 | matrix[n,k] x; 17 | vector[n] y; 18 | int run_gen; // whether to use generated quantities 19 | } 20 | parameters { 21 | vector[k] X_beta; 22 | real alpha; 23 | } 24 | model { 25 | 26 | X_beta ~ normal(0,5); 27 | alpha ~ normal(0,5); 28 | 29 | // custom "quasi" likelihood 30 | 31 | target += y .* log_inv_logit(alpha + x * X_beta) + (1 - y) .* log1m_inv_logit(alpha + x*X_beta); 32 | 33 | } 34 | generated quantities { 35 | vector[run_gen==1 ? n: 0] frac_log; 36 | vector[run_gen==1 ? 
n: 0] frac_rep; 37 | 38 | if(run_gen==1) { 39 | 40 | frac_log = y .* log_inv_logit(alpha + x * X_beta) + (1 - y) .* log1m_inv_logit(alpha + x*X_beta); 41 | frac_rep = inv_logit(alpha + x * X_beta); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /helper_func.R: -------------------------------------------------------------------------------- 1 | # helper functions for calculating marginal effects 2 | 3 | rbeta_mean <- function(N,mu,phi) { 4 | rbeta(N, mu * phi, (1 - mu) * phi) 5 | } 6 | 7 | # get marginal effects 8 | 9 | eps <- 1e-7 10 | setstep <- function(x) { 11 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 12 | } 13 | 14 | # functions for doing marginal effects 15 | 16 | predict_ordbeta <- function(cutpoints=NULL,X=NULL, 17 | X_miss=NULL,X_beta=NULL,X_beta_miss=NULL, 18 | alpha=NULL, 19 | combined_out=T) { 20 | 21 | # we'll assume the same eta was used to generate outcomes 22 | eta <- X %*% as.matrix(X_beta) + alpha 23 | 24 | # probabilities for three possible categories (0, proportion, 1) 25 | 26 | 27 | low <- 1-plogis(eta - cutpoints[1]) 28 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 29 | high <- plogis(eta - cutpoints[2]) 30 | 31 | 32 | # check for whether combined outcome or single outcome 33 | 34 | if(combined_out) { 35 | 36 | if(!is.null(X_beta_miss)) { 37 | pr_miss <- X_miss %*% as.matrix(X_beta_miss) 38 | low*0 + middle*(0.5*pr_miss + (1-pr_miss)*plogis(eta)) + high*1 39 | } else { 40 | low*0 + middle*plogis(eta) + high*1 41 | } 42 | 43 | } else { 44 | 45 | out_list <- list(pr_zero=low, 46 | pr_proportion=middle, 47 | proportion_value=plogis(eta), 48 | pr_one=high) 49 | 50 | if(!is.null(X_beta_miss)) { 51 | pr_miss <- X_miss %*% as.matrix(X_beta_miss) 52 | out_list$pr_miss <- pr_miss 53 | } 54 | 55 | out_list 56 | 57 | } 58 | 59 | } 60 | 61 | predict_beta <- function(X=NULL,X_beta=NULL) { 62 | eta <- X %*% as.matrix(X_beta) 63 | 64 | prob <- plogis(eta) 65 | 66 | prob 67 | } 68 | 69 | predict_zoib <- function(coef_g=NULL,coef_a=NULL,coef_m=NULL, 70 | alpha1=NULL, 71 | alpha2=NULL, 72 | alpha3=NULL, 73 | X=NULL, 74 | combined_out=T) { 75 | 76 | # we'll assume the same eta was used to generate outcomes 77 | psi <- plogis(as.numeric(alpha1) + X %*% coef_a) 78 | gamma <- plogis(as.numeric(alpha2) + X %*% coef_g) 79 | eta <- as.numeric(alpha3) + X %*% coef_m 80 | 81 | # probabilities for three possible categories (0, proportion, 1) 82 | low <- psi * (1-gamma) 83 | middle <- (1-psi) 84 | high <- psi * gamma 85 | 86 | # check for whether combined outcome or single outcome 87 | 88 | if(combined_out) { 89 | low*0 + middle*plogis(eta) + high*1 90 | } else { 91 | list(pr_zero=low, 92 | pr_proportion=middle, 93 | proportion_value=eta, 94 | pr_one=high) 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /install.R: -------------------------------------------------------------------------------- 1 | # This file will install all packages and also run cmdstanr's install_cmdstan function if it cannot 2 | # detect an existing cmdstan installation 3 | 4 | # code borrowed from Vikram Baliga https://vbaliga.github.io/verify-that-r-packages-are-installed-and-loaded/ 5 | 6 | ## First specify the packages of interest 7 | packages = c("dplyr","rstanarm","tidyr", 8 | "lubridate","loo","kableExtra", 9 | "bayesplot","patchwork","stringr","grDevices","emojifont", 10 | "latex2exp","haven","ggplot2","moments", 11 | "posterior","brms","remotes","future.apply", 12 | 
"faux","rmarkdown","bookdown","tinytex","extrafont","binom","Hmisc", 13 | "ggthemes","ggtext","boot") 14 | 15 | print("Checking and installing packages.") 16 | 17 | # create directory for data 18 | 19 | dir.create("data") 20 | 21 | ## Now load or install&load all 22 | package.check <- lapply( 23 | packages, 24 | FUN = function(x) { 25 | print(paste0("Now checking ", x)) 26 | if (!require(x, character.only = TRUE)) { 27 | print(paste0("Package ",x," not installed, installing from CRAN.")) 28 | install.packages(x, dependencies = TRUE) 29 | } 30 | } 31 | ) 32 | 33 | # install cmdstanr 34 | 35 | if (!require("cmdstanr", character.only = TRUE)) { 36 | install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) 37 | } 38 | 39 | # check for cmdstan installation 40 | 41 | library(cmdstanr) 42 | 43 | # checking if path to cmdstan exists 44 | check_path <- try(cmdstan_path()) 45 | 46 | if('try-error' %in% class(check_path)) { 47 | 48 | user_input <- menu(c("Yes", "No"), title="Cmdstanr cannot find a cmdstan installation (if it does exist, you can also pass it as an option to set_cmdstan_path()). Would you like to install cmdstan from source? This is required to replicate the paper results.") 49 | 50 | } 51 | 52 | if(user_input==1L) { 53 | 54 | print("Installing cmsdstan.") 55 | 56 | print("If this fails, see package documentation at https://mc-stan.org/cmdstanr/.") 57 | 58 | install_cmdstan() 59 | } 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /kubinec_ord_betareg_appendix.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Appendix to Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Survey Sliders and Visual Analog Scales" 3 | bibliography: BibTexDatabase.bib 4 | date: "March 7th, 2020" 5 | output: 6 | bookdown::pdf_document2: 7 | keep_tex: true 8 | toc: false 9 | includes: 10 | in_header: preamble.tex 11 | author: | 12 | | Robert Kubinec 13 | | New York University Abu Dhabi 14 | --- 15 | 16 | ```{r setup, include=F} 17 | 18 | require(rstan) 19 | require(dplyr) 20 | require(brms) 21 | require(cmdstanr) 22 | require(posterior) 23 | require(tidyr) 24 | require(lubridate) 25 | require(loo) 26 | require(kableExtra) 27 | require(bayesplot) 28 | require(patchwork) 29 | require(latex2exp) 30 | require(haven) 31 | 32 | knitr::opts_chunk$set(warning=F,message=F) 33 | 34 | source("helper_func.R") 35 | 36 | ord_beta_mod <- cmdstan_model("beta_logit.stan") 37 | 38 | # to reproduce results, change if you want fresh sampling 39 | 40 | random_seed <- 77123101 41 | 42 | ``` 43 | 44 | # Models Without Degenerate Responses 45 | 46 | Not only is it possible to fit the ordered beta regression mdoel to data without observations at the bounds, but it is advisable to do so if there is even a remote chance that such observations could be observed. For example, it may well be that a certain realization of the data contains observations that just so happened to not reach the bounds and are in the (0.01,0.99) interval. We could imagine this arising in a feeling thermometer/VAS scale where respondents' preferences tend to fairly clustered around the midpoint of the scale. However, a future sample of this same data could end up with observations at the bounds. It would be problematic in this case to fit only a Beta regression to the current data as the estimates would later be incomparable to estimates of future data with observations at the bounds. 
47 | 48 | While this scenario does not necessarily need to happen, it is enough of a motivation to fit the ordered beta regression model even in situations where there are no observations at the bounds (or perhaps only at one bound). The costs of doing so, both in terms of inference and computation, are quite low. Because the cutpoints were assigned a weakly informative prior, *they are identified without any data*. As a result, if a model is fit without any observations on the bounds, the cutpoints will end up in the far corners of the distribution, say at 0.001 and 0.999, but they will still exist and the posterior predictive distribution can produce them with some small probability. If future data was added to the sample incorporating observations at the bounds, the combined estimates would be interpretable and the cutpoints would adjust to handle the new data. 49 | 50 | To demonstrate this, I simulate data from a model with widely spaced cutpoints where I remove any of the few observations that end up at the bounds: 51 | 52 | ```{r sim_bounds} 53 | 54 | N <- 1000 55 | 56 | X <- rnorm(N,runif(1,-2,2),1) 57 | 58 | X_beta <- -1 59 | eta <- X*X_beta 60 | 61 | # ancillary parameter of beta distribution 62 | # high clustering 63 | phi <- 70 64 | 65 | # predictor for ordered model 66 | mu1 <- eta 67 | # predictor for beta regression 68 | mu2 <- eta 69 | 70 | # wide cutpoints on logit scale 71 | cutpoints <- c(-8,8) 72 | 73 | # probabilities for three possible categories (0, proportion, 1) 74 | low <- 1-plogis(mu2 - cutpoints[1]) 75 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 76 | high <- plogis(mu2 - cutpoints[2]) 77 | 78 | # we'll assume the same eta was used to generate outcomes 79 | 80 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 81 | 82 | # now determine which one we get for each observation 83 | outcomes <- sapply(1:N, function(i) { 84 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 85 | }) 86 | 87 | # now combine binary (0/1) with proportion (beta) 88 | 89 | final_out <- sapply(1:length(outcomes),function(i) { 90 | if(outcomes[i]==1) { 91 | return(0) 92 | } else if(outcomes[i]==2) { 93 | return(out_beta[i]) 94 | } else { 95 | return(1) 96 | } 97 | }) 98 | 99 | # remove residual 1/0s 100 | remove_degen <- final_out>0 & final_out<1 101 | final_out <- final_out[remove_degen] 102 | X <- X[remove_degen] 103 | 104 | tibble(x=final_out) %>% 105 | ggplot(aes(x=final_out)) + 106 | geom_histogram(bins=100) + 107 | geom_vline(xintercept = 0,linetype=2) + 108 | geom_vline(xintercept = 1,linetype=2) + 109 | theme(panel.grid=element_blank(), 110 | panel.background=element_blank()) + 111 | ylab("Count") + 112 | xlab("Simulated Outcome") 113 | 114 | ``` 115 | 116 | We can then model this distribution as follows: 117 | 118 | ```{r model_degen} 119 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 120 | N_prop=sum(final_out>0 & final_out<1), 121 | X=1, 122 | outcome_prop=final_out[final_out>0 & final_out<1], 123 | outcome_degen=final_out[final_out %in% c(0,1)], 124 | covar_prop=as.matrix(X), 125 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 126 | N_pred_degen=sum(final_out %in% c(0,1)), 127 | N_pred_prop=sum(final_out>0 & final_out<1), 128 | indices_degen=array(dim=0), 129 | indices_prop=1:(sum(final_out>0 & final_out<1)), 130 | run_gen=1) 131 | 132 | fit_model <- ord_beta_mod$sample(data=to_bl,seed=random_seed, 133 | refresh=0, 134 | chains=1,cores=1,iter_sampling=1000) 135 | 136 | cutpoints <- fit_model$draws("cutpoints") %>% as_draws_matrix 137 | 
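# note (illustration): the cutpoints are sampled on the logit scale; mapping
# their posterior means through the inverse logit, e.g. plogis(mean(cutpoints[,1])),
# gives the implied boundary locations on the (0,1) outcome scale quoted below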
138 | print(fit_model,c("X_beta","cutpoints"))
139 | ```
140 | 
141 | We can see from the model results that our coefficient `X_beta` was estimated without bias (equal to -1). The cutpoints were estimated with a small amount of bias due to the censoring we applied to the outcome variable, but they are still quite close to the original values. Furthermore, they are estimated at the extremes -- the lower cutpoint is `r round(plogis(mean(cutpoints[,1])),4)` and the upper cutpoint is `r round(plogis(mean(cutpoints[,2])),4)`. As this example indicates, there is no reason not to fit the model to data with no observations at the bounds. The cutpoints are still identified and the model converges without a problem. Furthermore, we can then still simulate observations at the bounds from the posterior predictive distribution:
142 | 
143 | ```{r y_rep}
144 | ppc_dens_overlay(final_out,as_draws_matrix(fit_model$draws("regen_all")))
145 | ```
146 | 
147 | 
148 | 
-------------------------------------------------------------------------------- /kubinec_ord_betareg_appendix_anon.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Appendix to Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Continuous Data with Lower and Upper Bounds"
3 | bibliography: BibTexDatabase.bib
4 | name: Robert Kubinec
5 | date: "March 7th, 2020"
6 | geometry: margin=1in
7 | output: 
8 |   bookdown::pdf_document2: 
9 |     keep_tex: true
10 |     toc: true
11 |     includes: 
12 |       in_header: preamble.tex
13 | ---
14 | 
15 | ```{r setup, include=F}
16 | 
17 | require(rstan)
18 | require(dplyr)
19 | require(brms)
20 | require(cmdstanr)
21 | require(posterior)
22 | require(tidyr)
23 | require(lubridate)
24 | require(loo)
25 | require(kableExtra)
26 | require(bayesplot)
27 | require(patchwork)
28 | require(latex2exp)
29 | require(haven)
30 | require(ggthemes)
31 | require(forcats)
32 | require(ggplot2)
33 | 
34 | knitr::opts_chunk$set(warning=F,message=F)
35 | 
36 | source("helper_func.R")
37 | 
38 | ord_beta_mod <- cmdstan_model("beta_logit.stan")
39 | 
40 | ord_Beta_mod_infl <- cmdstan_model("beta_logit_infl_simple.stan")
41 | ord_Beta_mod_phi <- cmdstan_model("beta_logit_phireg.stan")
42 | 
43 | # to reproduce results, change if you want fresh sampling
44 | 
45 | random_seed <- 77123101
46 | 
47 | ```
48 | 
49 | # Joint Posterior and Log-likelihood Definition
50 | 
51 | I can express the model as a log-likelihood for a given observation $y_i$ as follows:
52 | 
53 | ```{=tex}
54 | \begin{equation}
55 | ll(y_i|K,\beta,\phi) = \left\{\begin{array}{lr}
56 |         \text{log } \left[1 - g(X'\beta - k_1)\right] & \text{if } y_i=0\\
57 |         \text{log }\left[g(X'\beta - k_1) - g(X'\beta - k_2) \right ] + \text{log }\text{Beta}(y_i|g(X'\beta),\phi) & \text{if } y_i \in (0,1)\\
58 |         \text{log }g(X'\beta - k_2) & \text{if } y_i=1\\
59 |         \end{array}\right\}
60 |         (\#eq:ll)
61 | \end{equation}
62 | ```
63 | 
64 | Given this likelihood, I can define a joint log posterior distribution over $y$ given the log-likelihood function and the set of parameters:
65 | 
66 | ```{=tex}
67 | \begin{equation}
68 | \text{log } p(k_1, k_2, \beta,\phi|y) \propto \text{ log }p(K) + \text{ log }p(\beta) + \text{ log }p(\phi) + \sum_{i=1}^N ll(y_i|K,\beta,\phi)
69 | (\#eq:logp)
70 | \end{equation}
71 | ```
72 | where $\propto$ indicates that the posterior is calculated proportional to the normalizing constant, i.e., the denominator in Bayes' formula. 
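For readers who prefer code to notation, the following is a minimal R sketch of this log-likelihood (illustrative only and not used elsewhere in this appendix; the function name `ord_beta_ll` is hypothetical, and $g(\cdot)$, the inverse logit, corresponds to `plogis()` in R):

```{r ll_sketch, eval=FALSE}
# per-observation log-likelihood of the ordered beta regression model,
# following Equation \@ref(eq:ll); k1 < k2 are the cutpoints on the logit scale
ord_beta_ll <- function(y, eta, k1, k2, phi) {
  if (y == 0) {
    log(1 - plogis(eta - k1))                    # log Pr(y = 0)
  } else if (y == 1) {
    log(plogis(eta - k2))                        # log Pr(y = 1)
  } else {
    mu <- plogis(eta)                            # mean of the beta component
    log(plogis(eta - k1) - plogis(eta - k2)) +   # log Pr(y in (0,1))
      dbeta(y, mu * phi, (1 - mu) * phi, log = TRUE)
  }
}

# the joint log-likelihood is then the sum of ord_beta_ll over all observations
```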
73 | 
74 | # Simulation with Fixed Parameter Values
75 | 
76 | Relative to the more thorough simulation presented in the main text, the fixed simulation shows that, for this particular set of parameter draws (five covariates with a $\rho_x$ of 0.5, $\phi$ of 2, $k_1$ of -3 and $k_2$ of 2), the ZOIB shows somewhat less variance than ordered beta regression, but at the cost of very high M-errors. The average ZOIB coefficient magnitude is less than one-half that of ordered beta regression, a worrying amount of bias to admit in exchange for a small reduction in variance. This simulation also shows that fractional logit regression has relatively high variance, as is also seen in the empirical example. The Beta regressions on transformed values and on only the continuous responses show high M-errors and very low variance, suggesting again that these data-driven fixes can cause severe distortions in estimating marginal effects.
77 | 
78 | ```{r fixsim,echo=F}
79 | 
80 | #all_sim_fixed <- readRDS("data/sim_cont_X_fixed.rds")
81 | load("data/sim_cont_X_fixed.RData")
82 | checkc <- sapply(all_sim_fixed, class)
83 | all_sim_fixed <- all_sim_fixed[checkc!='try-error']
84 | all_sim_fixed <- bind_rows(all_sim_fixed) %>% 
85 |   unchop(c("med_est","X_beta","marg_eff","high","low","var_calc",
86 |            'marg_eff_est','high_marg','low_marg','var_marg'))
87 | 
88 | my_conf_fun <- function(x) {
89 |   if(all(x>=0 & x<=1)) {
90 |     # use binomial confidence intervals
91 |     bi_ci <- binom::binom.bayes(sum(x,na.rm=T),length(x))
92 |     return(list(y=bi_ci$mean,
93 |                 ymin=bi_ci$lower,
94 |                 ymax=bi_ci$upper))
95 |   } else {
96 |     boot_ci <- Hmisc::smean.cl.boot(x)
97 |     return(list(y=boot_ci["Mean"],
98 |                 ymin=boot_ci["Lower"],
99 |                 ymax=boot_ci["Upper"]))
100 |   }
101 | }
102 | 
103 | 
104 | all_sim_fixed %>% 
105 |   mutate(s_err=sign(marg_eff)!=sign(marg_eff_est),
106 |          m_err=abs(marg_eff_est)/abs(marg_eff)) %>% 
107 |   mutate(Power=as.numeric(ifelse(sign(marg_eff)==sign(high) & sign(marg_eff)==sign(low),
108 |                       1,
109 |                       0))) %>% 
110 |   select(`Proportion S Errors`="s_err",N,Power,
111 |          `M Errors`="m_err",Variance="var_marg",model) %>% 
112 |   gather(key = "type",value="estimate",-model,-N) %>% 
113 |   mutate(N=ifelse(N<100, 100, N)) %>% 
114 |   filter(!is.na(model)) %>% 
115 |   # filter(model %in% c("OLS","Ordinal Beta Regression",'ZOIB',"Fractional")) %>% 
116 |   # group_by(N,model,type) %>% 
117 |   # summarize(med_est=mean(estimate),
118 |   #           low_est=quantile(estimate,.05),
119 |   #           high_est=quantile(estimate,.95)) %>% 
120 |   ggplot(aes(y=estimate,x=N)) +
121 |   #geom_point(aes(colour=model),alpha=0.1) +
122 |   stat_summary(fun.data="mean_cl_boot",aes(colour=model,
123 |                                            shape=model),
124 |                size=.5,fatten=1.5,
125 |                position=position_dodge(width=0.5)) +
126 |   ylab("") +
127 |   xlab("N") +
128 |   facet_wrap(~type,scales="free_y",ncol = 2) +
129 |   scale_color_viridis_d() +
130 |   #scale_y_log10() +
131 |   labs(caption=stringr::str_wrap("Summary statistics for each value of N calculated via bootstrapping. M Errors and S Errors are the magnitude of bias and the incorrect sign of the estimated marginal effect, respectively. 
Variance refers to the estimated posterior variance (uncertainty) of the marginal effect.",width=80)) +
132 |   guides(color=guide_legend(title=""),
133 |          shape=guide_legend(title="")) +
134 |   theme_tufte()
135 | 
136 | ```
137 | 
138 | 
139 | # Models Without Degenerate Responses
140 | 
141 | Not only is it possible to fit the ordered beta regression model to data without observations at the bounds, but it is advisable to do so if there is even a remote chance that such observations could occur. For example, it may well be that a certain realization of the data contains observations that just so happened not to reach the bounds and fall in the (0.01,0.99) interval. We could imagine this arising in a feeling thermometer/VAS scale where respondents' preferences tend to be fairly clustered around the midpoint of the scale. However, a future sample of the same data could end up with observations at the bounds. It would be problematic in this case to fit only a Beta regression to the current data, as the estimates would later be incomparable to estimates from future data with observations at the bounds.
142 | 
143 | While this scenario need not happen, it is enough of a motivation to fit the ordered beta regression model even in situations where there are no observations at the bounds (or perhaps only at one bound). The costs of doing so, both in terms of inference and computation, are quite low. Because the cutpoints were assigned a weakly informative prior, *they are identified without any data*. As a result, if a model is fit without any observations at the bounds, the cutpoints will end up in the far corners of the distribution, say at 0.001 and 0.999, but they will still exist and the posterior predictive distribution can produce them with some small probability. If future data were added to the sample incorporating observations at the bounds, the combined estimates would be interpretable and the cutpoints would adjust to handle the new data. 
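As a quick numerical illustration of this point (a sketch with arbitrary values, not part of the replication), cutpoints far out on the logit scale imply small but nonzero probabilities of responses at the bounds:

```{r cutpoint_sketch, eval=FALSE}
# cutpoints near the "far corners" of the distribution
cutpoints <- qlogis(c(0.001, 0.999))   # roughly -6.9 and 6.9 on the logit scale
eta <- 0                               # linear predictor at its center

1 - plogis(eta - cutpoints[1])         # Pr(y = 0), approximately 0.001
plogis(eta - cutpoints[2])             # Pr(y = 1), approximately 0.001
```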
144 | 145 | To demonstrate this, I simulate data from a model with widely spaced cutpoints where I remove any of the few observations that end up at the bounds: 146 | 147 | ```{r sim_bounds} 148 | 149 | N <- 1000 150 | 151 | X <- rnorm(N,runif(1,-2,2),1) 152 | 153 | X_beta <- -1 154 | eta <- X*X_beta 155 | 156 | # ancillary parameter of beta distribution 157 | # high clustering 158 | phi <- 70 159 | 160 | # predictor for ordered model 161 | mu1 <- eta 162 | # predictor for beta regression 163 | mu2 <- eta 164 | 165 | # wide cutpoints on logit scale 166 | cutpoints <- c(-8,8) 167 | 168 | # probabilities for three possible categories (0, proportion, 1) 169 | low <- 1-plogis(mu2 - cutpoints[1]) 170 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 171 | high <- plogis(mu2 - cutpoints[2]) 172 | 173 | # we'll assume the same eta was used to generate outcomes 174 | 175 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 176 | 177 | # now determine which one we get for each observation 178 | outcomes <- sapply(1:N, function(i) { 179 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 180 | }) 181 | 182 | # now combine binary (0/1) with proportion (beta) 183 | 184 | final_out <- sapply(1:length(outcomes),function(i) { 185 | if(outcomes[i]==1) { 186 | return(0) 187 | } else if(outcomes[i]==2) { 188 | return(out_beta[i]) 189 | } else { 190 | return(1) 191 | } 192 | }) 193 | 194 | # remove residual 1/0s 195 | remove_degen <- final_out>0 & final_out<1 196 | final_out <- final_out[remove_degen] 197 | X <- X[remove_degen] 198 | 199 | tibble(x=final_out) %>% 200 | ggplot(aes(x=final_out)) + 201 | geom_histogram(bins=100) + 202 | geom_vline(xintercept = 0,linetype=2) + 203 | geom_vline(xintercept = 1,linetype=2) + 204 | theme(panel.grid=element_blank(), 205 | panel.background=element_blank()) + 206 | ylab("Count") + 207 | xlab("Simulated Outcome") 208 | 209 | ``` 210 | 211 | We can then model this distribution as follows: 212 | 213 | ```{r model_degen} 214 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 215 | N_prop=sum(final_out>0 & final_out<1), 216 | X=1, 217 | outcome_prop=final_out[final_out>0 & final_out<1], 218 | outcome_degen=final_out[final_out %in% c(0,1)], 219 | covar_prop=as.matrix(X), 220 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 221 | N_pred_degen=sum(final_out %in% c(0,1)), 222 | N_pred_prop=sum(final_out>0 & final_out<1), 223 | indices_degen=array(dim=0), 224 | indices_prop=1:(sum(final_out>0 & final_out<1)), 225 | run_gen=1) 226 | 227 | fit_model <- ord_beta_mod$sample(data=to_bl,seed=random_seed, 228 | refresh=0, 229 | chains=1,cores=1,iter_sampling=1000) 230 | 231 | cutpoints <- fit_model$draws("cutpoints") %>% as_draws_matrix 232 | 233 | print(fit_model,c("X_beta","cutpoints")) 234 | ``` 235 | 236 | We can see from the model results that our coefficient `X_beta` was estimated without bias (equal to -1). The cutpoints were estimated with a little bit of bias due to the censoring we did on the outcome variable, but are still quite close to the original values. Furthermore, they are estimated at extremes -- the lower cutpoint is `r round(plogis(mean(cutpoints[,1])),4)` and the upper cutpoint is `r round(plogis(mean(cutpoints[,2])),4)`. As this example indicates, there is no reason not to fit a model with no observations at the bounds. The cutpoints are still identified and the model converges without a problem. 
Furthermore, we can then still simulate observations at the bounds from the posterior predictive distribution:
237 | 
238 | ```{r y_rep}
239 | ppc_dens_overlay(final_out,as_draws_matrix(fit_model$draws("regen_all")))
240 | ```
241 | 
242 | 
243 | 
517 | ```{r phireg,echo=F,fig.cap="Effects of Covariates on Thermometer Dispersion (Extreme Responses)",eval=F}
518 | 
519 | recode_marg_phi %>% 
520 |   group_by(variable,group_list) %>% 
521 |   mutate(marg=marg*100) %>% 
522 |   summarize(med_est=median(marg),
523 |             high=quantile(marg,.95),
524 |             low=quantile(marg,.05)) %>% 
525 |   ggplot(aes(y=med_est,x=forcats::fct_rev(variable))) +
526 |   geom_pointrange(aes(ymin=low,ymax=high),position=position_dodge(width=.5),alpha=0.5) +
527 |   scale_color_viridis_d() +
528 |   theme_minimal() +
529 |   geom_hline(yintercept = 0,linetype=2) +
530 |   ylab(TeX("Value of Scale/Dispersion Parameter $\\phi$")) +
531 |   theme(legend.position = "top") +
532 |   guides(color=guide_legend(title=""),shape=guide_legend(title="")) +
533 |   facet_wrap(~group_list,scales="free",ncol=2) +
534 |   xlab("") +
535 |   coord_flip()
536 | 
537 | 
538 | ```
539 | 
540 | 
-------------------------------------------------------------------------------- /master.R: --------------------------------------------------------------------------------
1 | # run all scripts necessary to replicate & compile paper and SI
2 | 
3 | # whether to run the simulations from scratch
4 | 
5 | run_sim <- F
6 | 
7 | if(run_sim) {
8 | 
9 |   print("Now on random simulation")
10 | 
11 |   source("ordered_beta_reg_sim.R")
12 | 
13 |   print("Now on fixed simulation")
14 | 
15 |   source("ordered_beta_reg_sim_fixed.R")
16 | 
17 | }
18 | 
19 | print("Compiling paper")
20 | 
21 | rmarkdown::render("kubinec_ord_betareg_accepted_dataverse.Rmd")
22 | 
23 | print("Compiling supplementary information")
24 | 
25 | rmarkdown::render("kubinec_ord_betareg_appendix_anon.Rmd")
-------------------------------------------------------------------------------- /old_code.R: -------------------------------------------------------------------------------- 1 | "Biden Therm" = "b_change_biden_therm", 2 | "Voted" = "b_post_did_voteVoted", 3 | "Age" = "b_pre_age", 4 | "BA" = "b_pre_educBachelors", 5 | "HS" = "b_pre_educHighSchool", 6 | "MA" = "b_pre_educMasters", 7 | "Other Ed" = "b_pre_educOther", 8 | "PhD/MBA/JD" = "b_pre_educProfessionalDDoctoral", 9 | "Some College" = "b_pre_educSomeCollege", 10 | "COVID About Right" = "b_pre_fed_response_covidAboutright", 11 | "COVID Too Slow" = "b_pre_fed_response_covidMuchtooslow", 12 | "COVID Somewhat Quick" = "b_pre_fed_response_covidSomewhattooquick", 13 | "COVID Somewhat Slow" = "b_pre_fed_response_covidSomewhattooslow", 14 | "Disabled" = "b_pre_ocupDisabled", 15 | "Homemaker" = "b_pre_ocupHomemaker", 16 | "Retired" = "b_pre_ocupRetired", 17 | "Student" = "b_pre_ocupStudent", 18 | "Unemployed" = "b_pre_ocupUnemployed", 19 | "Asian" = "b_pre_raceAsian", 20 | "Black" = "b_pre_raceBlack", 21 | "Hispanic" = "b_pre_raceHispanic", 22 | "Multiracial" = "b_pre_raceMultiMracial", 23 | "Native American" = "b_pre_raceNativeAmerican", 24 | "Female" = "b_pre_sexFemale", 25 | "Biden Therm" = "change_biden_therm", 26 | "Biden Therm^2" = "b_Ichange_biden_thermE2", 27 | "Voted" = "post_did_voteVoted", 28 | "Age" = "pre_age", 29 | "BA" = "pre_educBachelor's", 30 | "HS" = "pre_educHigh School", 31 | "MA" = "pre_educMaster's", 32 | "Other Ed" = "pre_educOther", 33 | "PhD/MBA/JD" = "pre_educProfessional/Doctoral", 34 | "Some College" = "pre_educSome College", 35 | "COVID About Right" = "pre_fed_response_covidAbout right", 36 | "COVID Too Slow" = "pre_fed_response_covidMuch too slow", 37 | "COVID Somewhat Quick" = "pre_fed_response_covidSomewhat too quick", 38 | "COVID Somewhat Slow" = "pre_fed_response_covidSomewhat too slow", 39 | "Disabled" = "pre_ocupDisabled", 40 | "Homemaker" = "pre_ocupHomemaker", 41 | "Retired" = "pre_ocupRetired", 42 | "Student" = "pre_ocupStudent", 43 | "Unemployed" = "pre_ocupUnemployed", 44 | "Asian" = "pre_raceAsian", 45 | "Black" = "pre_raceBlack", 46 | "Biden Therm^2"="I(change_biden_therm^2)", 47 | "Hispanic" = "pre_raceHispanic", 48 | "Multiracial" = "pre_raceMulti-racial", 49 | "Native American" = "pre_raceNative American", 50 | "Female" = "pre_sexFemale") 51 | 52 | group_list=forcats::fct_collapse(variable, 53 | `Biden Therm`=c("Biden Therm","Biden Therm^2"), 54 | Demographics=c("Age", 55 | "Female", 56 | "Black", 57 | "Hispanic","Asian", 58 | "Multiracial", 59 | "Native American"), 60 | `Education\nBaseline=Less Than H.S.`=c("HS", 61 | "Some College", 62 | "BA","MA", 63 | "PhD/MBA/JD", 64 | "Other Ed"), 65 | `Federal Response\nBaseline= Too Quick`=c("COVID About Right","COVID Somewhat Slow","COVID Somewhat Quick","COVID Too Slow"), 66 | `Employment\nBaseline=Employed`=c("Disabled", 67 | "Homemaker", "Retired","Student","Unemployed"))) 68 | 69 | # anes <- read_csv("data/anes_timeseries_2020_csv_20210719/anes_timeseries_2020_csv_20210719.csv") %>% 70 | # select(pre_vote_choice_2020="V201029", 71 | # pre_pref_strong="V201030", 72 | # pre_intend_vote_2020="V201032", 73 | # pre_intend_vote_choice_2020="V201033", 74 | # pre_did_vote_2016="V201101", 75 | # pre_country_on_track="V201114", 76 | # pre_afraid="V201116", 77 | # pre_angry="V201118", 78 | # pre_worry="V201120", 79 | # pre_sex="V201600", 80 | # pre_pres_approval="V201129x", 81 | # handle_covid="V201144x", 82 | # pre_biden_therm="V201151", 83 | # 
pre_trump_therm="V201152", 84 | # pre_ideology="V201200", 85 | # pre_party_id="V201231x", 86 | # pre_trust_gov="V201233", 87 | # pre_party_covid="V201244", 88 | # pre_econ_worry="V201335", 89 | # pre_trust_news="V201377", 90 | # pre_fed_response_covid="V201392x", 91 | # pre_age="V201507x", 92 | # pre_marital_status="V201508", 93 | # pre_educ="V201510", 94 | # pre_ocup="V201534x", 95 | # pre_race="V201549x", 96 | # post_party_contact="V202005", 97 | # post_give_money="V202017", 98 | # post_discuss_pol="V202023", 99 | # post_discuss_pol_online="V202029", 100 | # post_asian_therm="V202477", 101 | # post_did_vote="V202109x", 102 | # post_biden_therm="V202143", 103 | # post_who_voted="V202110x", 104 | # pre_date="V203053") %>% 105 | # mutate_all(~ifelse(. %in% c(998,-1:-10),NA,.)) 106 | # 107 | # # clean data 108 | # 109 | # model_data_long <- mutate(anes, 110 | # change_biden_therm=scale(post_biden_therm - pre_biden_therm)[,1], 111 | # asian_scale=post_asian_therm/100, 112 | # therm=floor(post_asian_therm)/100, 113 | # therm_rescale=(therm * (sum(!is.na(therm))-1) + 0.5)/sum(!is.na(therm)), 114 | # #therm=floor(post_biden_therm)/100, 115 | # #therm_rescale=(therm * (sum(!is.na(therm))-1) + 0.5)/sum(!is.na(therm)), 116 | # pre_sex=factor(pre_sex,labels=c("Male","Female")), 117 | # pre_age=pre_age/10, 118 | # pre_educ=factor(pre_educ, 119 | # labels=c("0 & therm<1 ~ "inside",therm==0|therm==1~"outside",TRUE~'other')) -------------------------------------------------------------------------------- /ordbetareg.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /ordered_beta_reg.R: -------------------------------------------------------------------------------- 1 | # Robert Kubinec 2 | # New York University Abu Dhabi 3 | # January 23, 2020 4 | # let's try to simulate a new distribution for ordered beta regression 5 | 6 | require(rstan) 7 | require(bayesplot) 8 | require(dplyr) 9 | 10 | N <- 1000 11 | 12 | X <- runif(N,-2,2) 13 | 14 | # on the logit scale, determines threshold above which we get degenerate (0/1) outcomes 15 | cutpoints <- c(-2,2) 16 | 17 | X_beta <- 2.5 18 | 19 | 20 | sample_ordbeta <- function(cutpoints=NULL,phi=NULL,X=NULL,X_beta=NULL) { 21 | 22 | # we'll assume the same eta was used to generate outcomes 23 | eta <- X*X_beta 24 | 25 | # non-inflated outcome 26 | out_beta <- rbeta(N,plogis(eta) * phi, (1 - plogis(eta)) * phi) 27 | 28 | # probabilities for three possible categories (0, proportion, 1) 29 | low <- 1-plogis(eta - cutpoints[1]) 30 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 31 | high <- plogis(eta - cutpoints[2]) 32 | 33 | # now determine which one we get for each observation 34 | 35 | outcomes <- sapply(1:N, function(i) { 36 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 37 | }) 38 | 39 | # combine and sample from full distribution 40 | 41 | sapply(1:length(outcomes),function(i) { 42 | if(outcomes[i]==1) { 43 | return(0) 44 | } else if(outcomes[i]==2) { 45 | return(out_beta[i]) 46 | } else { 47 | return(1) 48 | } 49 | }) 50 | 51 | } 52 | 53 | predict_ordbeta <- function(cutpoints=NULL,phi=NULL,X=NULL,X_beta=NULL, 54 | combined_out=T) { 55 | 56 | # we'll assume the same 
eta was used to generate outcomes 57 | eta <- X*X_beta 58 | 59 | # probabilities for three possible categories (0, proportion, 1) 60 | low <- 1-plogis(eta - cutpoints[1]) 61 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 62 | high <- plogis(eta - cutpoints[2]) 63 | 64 | # check for whether combined outcome or single outcome 65 | 66 | if(combined_out) { 67 | low*0 + middle*plogis(eta) + high*1 68 | } else { 69 | list(pr_zero=low, 70 | pr_proportion=middle, 71 | proportion_value=out_beta, 72 | pr_one=high) 73 | } 74 | 75 | } 76 | 77 | 78 | 79 | # use this function to sample data 80 | 81 | final_out <- sample_ordbeta(cutpoints=cutpoints, 82 | phi=phi, 83 | X=X, 84 | X_beta=X_beta) 85 | 86 | # we can also use it to calculate "true" marginal effect of X on Y using code from margins package 87 | # i.e., numerical differentiation 88 | # set value of `h` based on `eps` to deal with machine precision 89 | 90 | eps <- 1e-7 91 | setstep <- function(x) { 92 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 93 | } 94 | 95 | y0 <- predict_ordbeta(cutpoints=cutpoints, 96 | phi=phi, 97 | X=X - setstep(X), 98 | X_beta=X_beta) 99 | 100 | y1 <- predict_ordbeta(cutpoints=cutpoints, 101 | phi=phi, 102 | X=X + setstep(X), 103 | X_beta=X_beta) 104 | 105 | marg_eff <- (y1-y0)/((X + setstep(X))-(X - setstep(X))) 106 | 107 | mean(marg_eff) 108 | 109 | # check for pr0 and pr1 110 | 111 | y0 <- predict_ordbeta(cutpoints=cutpoints, 112 | phi=phi, 113 | X=X - setstep(X), 114 | X_beta=X_beta,combined_out = F) 115 | 116 | y1 <- predict_ordbeta(cutpoints=cutpoints, 117 | phi=phi, 118 | X=X + setstep(X), 119 | X_beta=X_beta,combined_out = F) 120 | 121 | marg_eff_0 <- (y1$pr_zero-y0$pr_zero)/((X + setstep(X))-(X - setstep(X))) 122 | 123 | marg_eff_1 <- (y1$pr_one-y0$pr_one)/((X + setstep(X))-(X - setstep(X))) 124 | 125 | mean(marg_eff_0) 126 | mean(marg_eff_1) 127 | 128 | # check what OLS does 129 | 130 | ols_fit <- lm(final_out~X) 131 | 132 | summary(ols_fit) 133 | 134 | # now we need a Stan file 135 | 136 | beta_logit <- stan_model("beta_logit.stan") 137 | 138 | to_sample <- 400 139 | 140 | indices_degen <- sample(1:length(X[final_out %in% c(0,1)]),size=to_sample/2) 141 | indices_prop <- sample(1:length(X[final_out>0 & final_out<1]),size=to_sample/2) 142 | 143 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 144 | N_prop=sum(final_out>0 & final_out<1), 145 | X=1, 146 | outcome_prop=final_out[final_out>0 & final_out<1], 147 | outcome_degen=final_out[final_out %in% c(0,1)], 148 | covar_prop=as.matrix(X[final_out>0 & final_out<1]), 149 | covar_degen=as.matrix(X[final_out %in% c(0,1)]), 150 | N_pred_degen=sum(final_out %in% c(0,1)), 151 | N_pred_prop=sum(final_out>0 & final_out<1), 152 | indices_degen=1:(sum(final_out %in% c(0,1))), 153 | indices_prop=1:(sum(final_out>0 & final_out<1)), 154 | run_gen=1) 155 | 156 | fit_model <- sampling(beta_logit,data=to_bl,chains=2,cores=2,iter=1000) 157 | 158 | # regenerate data and see how we do capturing it 159 | 160 | yrep_ordbeta <- extract(fit_model,"regen_all")[[1]] 161 | 162 | # use only a sample of draws 163 | 164 | # final_out_prop <- final_out[final_out>0 & final_out<1][indices_prop] 165 | # final_out_degen <- final_out[final_out %in% c(0,1)][indices_degen] 166 | 167 | final_out_prop <- final_out[final_out>0 & final_out<1] 168 | final_out_degen <- final_out[final_out %in% c(0,1)] 169 | 170 | ppc_ecdf_overlay(y=c(final_out_degen,final_out_prop),yrep=yrep_ordbeta) + 171 | ggtitle("Empirical Posterior Predictive Distribution for Ordinal Beta 
Regression",subtitle="N=1000") 172 | 173 | # ppc_dens_overlay(y=c(final_out_degen,final_out_prop),yrep=yrep) + 174 | # ggtitle("Posterior Predictive Distribution for Ordinal Beta Regression",subtitle="N=1000") 175 | 176 | # looks pretty good 177 | 178 | # try with rstanarm betareg where we pre-transform outcome 179 | 180 | require(rstanarm) 181 | 182 | final_out_scale <- (final_out * (length(final_out) - 1) + .5) / length(final_out) 183 | 184 | betareg_fit <- stan_betareg(formula = outcome~X,data=tibble(outcome=final_out_scale, 185 | X=X),chains=2,cores=2,iter=1000) 186 | 187 | summary(betareg_fit) 188 | 189 | # X_beta has lost 1/3 its value from original (i.e., effect has been compressed) 190 | 191 | # see the distortion re: the original values 192 | 193 | yrep_betareg <- posterior_predict(betareg_fit,draws=100) 194 | 195 | ppc_dens_overlay(y=final_out,yrep=yrep_betareg) + ggtitle("Posterior Predictive Distribution for Rstanarm Beta Regression",subtitle="N=1000, Transformed to (0,1) Scale") 196 | 197 | ppc_dens_overlay(y=final_out_scale,yrep=yrep_betareg) + ggtitle("Posterior Predictive Distribution for Rstanarm Beta Regression",subtitle="N=1000, Transformed to (0,1) Scale") 198 | 199 | # Tends to over/under predict middle and outliers (i.e. 0/1) 200 | 201 | # now try the ZOIB 202 | 203 | zoib_model <- stan_model("zoib.stan") 204 | x <- as.matrix(X) 205 | zoib_fit <- sampling(zoib_model,data=list(n=length(final_out), 206 | y=final_out, 207 | k=ncol(x), 208 | x=x, 209 | run_gen=1),chains=2,cores=2,iter=1000) 210 | 211 | # this is X_beta 212 | print(zoib_fit,pars="coef_m") 213 | yrep_zoib <- extract(zoib_fit,"zoib_regen")[[1]] 214 | ppc_dens_overlay(y=final_out,yrep=yrep_zoib) + 215 | ggtitle("Empirical Posterior Predictive Distribution for Ordinal Beta Regression",subtitle="N=1000") 216 | 217 | # Compare Distributions --------------------------------------------------- 218 | 219 | require(loo) 220 | 221 | loo_ordbeta <- loo(fit_model,"ord_log") 222 | 223 | loo_zoib <- loo(zoib_fit,"zoib_log") 224 | 225 | loo_compare(loo_ordbeta,loo_zoib) 226 | 227 | 228 | # Marginal Effects -------------------------------------------------------- 229 | 230 | # we can calculate predicted values and marginal effects for a single X 231 | 232 | X_beta <- as.matrix(fit_model,"X_beta") 233 | cuts_est <- as.matrix(fit_model,"cutpoints") 234 | 235 | # fascinating. 
we can do numerical differentiation -- cool 236 | 237 | pr_y0 <- sapply(1:nrow(X_beta),function(i) { 238 | b <- X_beta[i,] 239 | c <- cuts_est[i,1] 240 | mean(1-plogis(b*X - c)) 241 | }) 242 | 243 | pr_y1 <- sapply(1:nrow(X_beta),function(i) { 244 | b <- X_beta[i,] 245 | c <- cuts_est[i,2] 246 | mean(plogis(b*X - c)) 247 | }) 248 | 249 | mu_est <- sapply(1:nrow(X_beta),function(i) { 250 | b <- X_beta[i,] 251 | plogis(mean(b*X)) 252 | }) 253 | 254 | mean(plogis(mu1)) 255 | 256 | -------------------------------------------------------------------------------- /ordered_beta_reg_sim_fixed.R: -------------------------------------------------------------------------------- 1 | # Robert Kubinec 2 | # New York University Abu Dhabi 3 | # October 12, 2021 4 | # Simulation of 0 - 1 bounded dependent variables 5 | # Note simulation will take some time, approx ~2 days with 3 cores 6 | 7 | require(cmdstanr) 8 | require(bayesplot) 9 | require(dplyr) 10 | require(brms) 11 | require(loo) 12 | require(posterior) 13 | require(future.apply) 14 | require(faux) 15 | 16 | RNGkind("L'Ecuyer-CMRG") 17 | 18 | beta_logit <- cmdstanr::cmdstan_model("beta_logit.stan",compile = T) 19 | zoib_model <- cmdstanr::cmdstan_model("zoib_nophireg.stan") 20 | frac_mod <- cmdstanr::cmdstan_model("frac_logit.stan") 21 | 22 | set.seed(772235) 23 | 24 | # number of draws per sample size 25 | draws <- 100 26 | 27 | # let's do some simulations 28 | 29 | simul_data <- tibble(N=rep(c(25,50,75,seq(100,3000,by=200)),each=draws)) %>% 30 | mutate(k=5, 31 | rho=.7, 32 | phi=2, 33 | cutpoints1=-3) %>% 34 | mutate(cutpoints2=2, 35 | X_beta=list(c(-1,1,-1,1,-1))) 36 | 37 | # we can also use it to calculate "true" marginal effect of X on Y using code from margins package 38 | # i.e., numerical differentiation 39 | # set value of `h` based on `eps` to deal with machine precision 40 | 41 | eps <- 1e-7 42 | setstep <- function(x) { 43 | x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x 44 | } 45 | 46 | # functions for doing marginal effects 47 | 48 | predict_ordbeta <- function(cutpoints=NULL,X=NULL,X_beta=NULL, 49 | combined_out=T) { 50 | 51 | # we'll assume the same eta was used to generate outcomes 52 | eta <- X%*%matrix(X_beta)[,1] 53 | 54 | # probabilities for three possible categories (0, proportion, 1) 55 | low <- 1-plogis(eta - cutpoints[1]) 56 | middle <- plogis(eta-cutpoints[1]) - plogis(eta-cutpoints[2]) 57 | high <- plogis(eta - cutpoints[2]) 58 | 59 | # check for whether combined outcome or single outcome 60 | 61 | if(combined_out) { 62 | low*0 + middle*plogis(eta) + high*1 63 | } else { 64 | list(pr_zero=low, 65 | pr_proportion=middle, 66 | proportion_value=plogis(eta), 67 | pr_one=high) 68 | } 69 | 70 | } 71 | 72 | predict_zoib <- function(coef_g=NULL,coef_a=NULL,coef_m=NULL, 73 | alpha1=NULL, 74 | alpha2=NULL, 75 | alpha3=NULL, 76 | X=NULL, 77 | combined_out=T) { 78 | 79 | # we'll assume the same eta was used to generate outcomes 80 | psi <- plogis(alpha1 + X %*% t(coef_a)) 81 | gamma <- plogis(alpha2 + X %*% t(coef_g)) 82 | eta <- alpha3 + X %*% t(coef_m) 83 | 84 | # probabilities for three possible categories (0, proportion, 1) 85 | low <- psi * (1-gamma) 86 | middle <- (1-psi) 87 | high <- psi * gamma 88 | 89 | # check for whether combined outcome or single outcome 90 | 91 | if(combined_out) { 92 | low*0 + middle*plogis(eta) + high*1 93 | } else { 94 | list(pr_zero=low, 95 | pr_proportion=middle, 96 | proportion_value=eta, 97 | pr_one=high) 98 | } 99 | 100 | } 101 | 102 | gen_x <- function(k,rho,N_rep) { 103 | 104 | # generate 
true coefs 105 | 106 | true_coef <- runif(k, -5, 5) 107 | 108 | # generate matrix of correlated Xs 109 | 110 | out_x <- sapply(1:length(k), function(i) { 111 | 112 | this_x <- 1 113 | 114 | }) 115 | 116 | } 117 | 118 | # one for each model type 119 | 120 | r_seeds <- c(6635,2216,8845,9936,3321,63914) 121 | 122 | plan(multicore) 123 | 124 | all_simul_data <- future_lapply(1:nrow(simul_data), function(i,simul_data=NULL,r_seeds=NULL) { 125 | #all_simul_data <- lapply(1:nrow(simul_data), function(i,simul_data=NULL,r_seeds=NULL) { 126 | 127 | this_data <- slice(simul_data,i) 128 | cat(file = "simul_status.txt",paste0("Now on row ",i),append = T) 129 | 130 | # Draw from ordered beta regression --------------------------------------- 131 | 132 | 133 | 134 | N <- this_data$N 135 | 136 | #X <- rnorm(N,runif(1,-2,2),1) 137 | 138 | X_beta <- this_data$X_beta[[1]] 139 | 140 | # need to create X 141 | 142 | X <- rnorm_multi(n=N,vars=this_data$k,r=this_data$rho,as.matrix=T) 143 | 144 | eta <- -2 + X%*%matrix(X_beta) 145 | 146 | # ancillary parameter of beta distribution 147 | phi <- this_data$phi 148 | 149 | # predictor for ordered model 150 | mu1 <- eta[,1] 151 | # predictor for beta regression 152 | mu2 <- eta[,1] 153 | 154 | cutpoints <- c(this_data$cutpoints1,this_data$cutpoints2) 155 | 156 | # probabilities for three possible categories (0, proportion, 1) 157 | low <- 1-plogis(mu2 - cutpoints[1]) 158 | middle <- plogis(mu2-cutpoints[1]) - plogis(mu2-cutpoints[2]) 159 | high <- plogis(mu2 - cutpoints[2]) 160 | 161 | # we'll assume the same eta was used to generate outcomes 162 | 163 | out_beta <- rbeta(N,plogis(mu1) * phi, (1 - plogis(mu1)) * phi) 164 | 165 | print(i) 166 | 167 | # now determine which one we get for each observation 168 | outcomes <- sapply(1:N, function(i) { 169 | sample(1:3,size=1,prob=c(low[i],middle[i],high[i])) 170 | }) 171 | 172 | # now combine binary (0/1) with proportion (beta) 173 | 174 | final_out <- sapply(1:length(outcomes),function(i) { 175 | if(outcomes[i]==1) { 176 | return(0) 177 | } else if(outcomes[i]==2) { 178 | return(out_beta[i]) 179 | } else { 180 | return(1) 181 | } 182 | }) 183 | 184 | # check for floating point errors 185 | 186 | final_out <- ifelse(final_out>(1 - 1e-10) & final_out<1,final_out - 1e-10, 187 | final_out) 188 | 189 | final_out <- ifelse(final_out<(0 + 1e-10) & final_out>0,final_out + 1e-10, 190 | final_out) 191 | 192 | counter <- environment() 193 | counter$i <- 0 194 | 195 | # get rid of very rare outcomes, can be difficult for ZOIB 196 | 197 | while(((sum(final_out>0 & final_out<1)<5) || (sum(final_out==1)<5) || (sum(final_out==0)<5)) && (counter$i<20)) { 198 | 199 | final_out <- sapply(1:length(outcomes),function(i) { 200 | if(outcomes[i]==1) { 201 | return(0) 202 | } else if(outcomes[i]==2) { 203 | return(out_beta[i]) 204 | } else { 205 | return(1) 206 | } 207 | }) 208 | cat(file = "simul_status.txt","Resampling",append=T) 209 | counter$i <- counter$i + 1 210 | } 211 | 212 | if((sum(final_out>0 & final_out<1)<5) || (sum(final_out==1)<5) || (sum(final_out==0)<5)) { 213 | 214 | print(paste0("DGP failed for row ",i,"\n")) 215 | 216 | this_data$status <- "dgp_failure" 217 | 218 | return(this_data) 219 | } 220 | 221 | # calculate "true" marginal effects 222 | 223 | # loop over k 224 | 225 | X_low <- X 226 | X_high <- X 227 | 228 | marg_eff <- sapply(1:this_data$k, function(tk) { 229 | 230 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 231 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 232 | 233 | y0 <- predict_ordbeta(cutpoints=cutpoints, 234 | X=X_low, 
235 | X_beta=X_beta) 236 | 237 | y1 <- predict_ordbeta(cutpoints=cutpoints, 238 | X=X_high, 239 | X_beta=X_beta) 240 | 241 | mean((y1-y0)/(X_high[,tk]-X_low[,tk])) 242 | 243 | 244 | }) 245 | 246 | 247 | 248 | 249 | # Fit models -------------------------------------------------------------- 250 | 251 | # now fit ordinal beta 252 | 253 | to_bl <- list(N_degen=sum(final_out %in% c(0,1)), 254 | N_prop=sum(final_out>0 & final_out<1), 255 | X=this_data$k, 256 | outcome_prop=final_out[final_out>0 & final_out<1], 257 | outcome_degen=final_out[final_out %in% c(0,1)], 258 | covar_prop=X[final_out>0 & final_out<1,,drop=F], 259 | covar_degen=X[final_out %in% c(0,1),,drop=F], 260 | N_pred_degen=sum(final_out %in% c(0,1)), 261 | N_pred_prop=sum(final_out>0 & final_out<1), 262 | indices_degen=1:(sum(final_out %in% c(0,1))), 263 | indices_prop=1:(sum(final_out>0 & final_out<1)), 264 | run_gen=1) 265 | 266 | x <- X 267 | 268 | final_out_scale <- (final_out * (length(final_out) - 1) + .5) / length(final_out) 269 | 270 | frac_data <- list(y=final_out, 271 | x=x, 272 | k=ncol(x), 273 | run_gen=1, 274 | n=nrow(x)) 275 | # fit all models 276 | 277 | 278 | fit_model <-try(beta_logit$sample(data=to_bl,seed=r_seeds[1], 279 | chains=1,parallel_chains=1,iter_warmup=500, 280 | refresh=0, 281 | iter_sampling=500)) 282 | 283 | 284 | 285 | zoib_fit <- try(zoib_model$sample(data=list(n=length(final_out), 286 | s=length(final_out), 287 | sample_all=1:length(final_out), 288 | y=final_out, 289 | k=ncol(x), 290 | x=x, 291 | run_gen=1), 292 | seed=r_seeds[2], 293 | refresh=0,chains=1,parallel_chains=1,iter_warmup=500, 294 | iter_sampling=500)) 295 | 296 | X_brms <- X 297 | colnames(X_brms) <- paste0(rep("Var",ncol(X)),1:ncol(X)) 298 | X_brms <- as_tibble(X_brms) 299 | 300 | betareg_fit <- try(brm(formula = outcome~.,data=mutate(X_brms, 301 | outcome=final_out_scale), 302 | chains=1,cores=1,iter=1000, 303 | seed=r_seeds[3], 304 | silent=0,refresh=0, 305 | family="beta", 306 | prior=set_prior("normal(0,5)", class = "b"), 307 | backend='cmdstanr')) 308 | 309 | X_brms_small <- filter(X_brms, final_out>0 & final_out<1) 310 | 311 | X_brms_small$outcome <- final_out[final_out>0 & final_out<1] 312 | 313 | betareg_fit2 <- try(update(betareg_fit,newdata=X_brms_small, 314 | chains=1,cores=1,iter=1000, 315 | seed=r_seeds[4], 316 | silent=0,refresh=0, 317 | family="beta", 318 | backend="cmdstanr")) 319 | 320 | frac_fit <- try(frac_mod$sample(data=frac_data,seed=r_seeds[6], 321 | chains=1,parallel_chains = 1,refresh=0, 322 | iter_warmup=500,iter_sampling = 500)) 323 | 324 | lm_fit <- try(brm(formula = outcome~X,data=tibble(outcome=final_out, 325 | r_seeds[5], 326 | X=X),chains=1,cores=1,iter=1000, 327 | silent=0,refresh=0, 328 | backend="cmdstanr", 329 | prior=set_prior("normal(0,5)", class = "b"))) 330 | 331 | if('try-error' %in% c(class(fit_model), 332 | class(zoib_model), 333 | class(betareg_fit), 334 | class(betareg_fit2), 335 | class(frac_fit), 336 | class(lm_fit))) { 337 | 338 | print(paste0("Estimation failed for row ",i,"\n")) 339 | 340 | this_data$status <- "estimation_failure" 341 | 342 | return(this_data) 343 | 344 | } 345 | 346 | yrep_ord <- try(as_draws_matrix(fit_model$draws("regen_epred"))) 347 | 348 | yrep_zoib <- try(as_draws_matrix(zoib_fit$draws("zoib_epred"))) 349 | 350 | yrep_betareg <- try(posterior_epred(betareg_fit,draws=100)) 351 | 352 | yrep_betareg2 <- try(posterior_epred(betareg_fit2,draws=100)) 353 | 354 | yrep_lm <- try(posterior_epred(lm_fit,draws=100)) 355 | 356 | yrep_frac <- 
try(as_draws_matrix(frac_fit$draws("frac_rep"))) 357 | 358 | if('try-error' %in% c(class(yrep_ord), 359 | class(yrep_zoib), 360 | class(yrep_betareg), 361 | class(yrep_betareg2), 362 | class(yrep_frac), 363 | class(yrep_lm))) { 364 | 365 | print(paste0("Estimation failed for row ",i,"\n")) 366 | 367 | this_data$status <- "estimation_failure" 368 | 369 | return(this_data) 370 | 371 | } 372 | 373 | this_data$status <- "success" 374 | 375 | 376 | # Calculate estimands ----------------------------------------------------- 377 | 378 | 379 | # now return the full data frame 380 | 381 | X_beta_ord <- as_draws_matrix(fit_model$draws("X_beta")) 382 | X_beta_zoib <- as_draws_matrix(zoib_fit$draws("coef_m")) 383 | X_beta_reg <- as.matrix(betareg_fit,pars=paste0("Var",1:this_data$k)) 384 | X_beta_reg2 <- as.matrix(betareg_fit2,pars=paste0("Var",1:this_data$k)) 385 | X_beta_lm <- as.matrix(lm_fit,pars="X") 386 | X_beta_frac <- as_draws_matrix(frac_fit$draws(variables="X_beta")) 387 | 388 | # calculate rmse 389 | 390 | rmse_ord <- sqrt(mean(apply(yrep_ord,1,function(c) { (c - c(final_out[final_out %in% c(0,1)],final_out[final_out>0 & final_out<1]))^2 }))) 391 | rmse_zoib <-sqrt( mean(apply(yrep_zoib,1,function(c) { (c - final_out)^2 }))) 392 | rmse_betareg <- sqrt(mean(apply(yrep_betareg,1,function(c) { (c - final_out)^2 }))) 393 | rmse_betareg2 <- sqrt(mean(apply(yrep_betareg2,1,function(c) { (c - final_out[final_out>0 & final_out<1])^2 }))) 394 | rmse_lm <- sqrt(mean(apply(yrep_lm,1,function(c) { (c - final_out)^2 }))) 395 | rmse_frac <- sqrt(mean(apply(yrep_frac,1,function(c) { (c - final_out)^2 }))) 396 | 397 | # calculate loo 398 | 399 | loo_ordbeta <-try(fit_model$loo("ord_log")) 400 | loo_betareg <- loo(betareg_fit) 401 | loo_betareg2 <- loo(betareg_fit2) 402 | loo_zoib <- try(zoib_fit$loo("zoib_log")) 403 | loo_lm <- loo(lm_fit) 404 | loo_frac <- try(frac_fit$loo("frac_log")) 405 | 406 | if(any('try-error' %in% c(class(loo_ordbeta), 407 | class(loo_zoib), 408 | class(loo_frac)))) { 409 | comp_loo <- matrix(rep(NA,6),ncol=2) 410 | row.names(comp_loo) <- c("ord","zoib","frac") 411 | loo_ordbeta <- list(estimates=matrix(c(NA,NA),ncol=2)) 412 | loo_zoib <- list(estimates=matrix(c(NA,NA),ncol=2)) 413 | loo_frac <- list(estimates=matrix(c(NA,NA),ncol=2)) 414 | } else { 415 | comp_loo <- loo_compare(list(ord=loo_ordbeta,zoib=loo_zoib, 416 | lm=loo_lm,frac=loo_frac)) 417 | } 418 | 419 | 420 | 421 | # calculate marginal effects 422 | 423 | cutpoints_est <- as_draws_matrix(fit_model$draws("cutpoints")) 424 | 425 | margin_ord <- lapply(1:ncol(X), function(tk) { 426 | 427 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 428 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 429 | 430 | tibble(marg_eff=sapply(1:nrow(X_beta_ord), function(i) { 431 | y0 <- predict_ordbeta(cutpoints=cutpoints_est[i,], 432 | X=X_low, 433 | X_beta=c(X_beta_ord[i,])) 434 | 435 | y1 <- predict_ordbeta(cutpoints=cutpoints_est[i,], 436 | X=X_high, 437 | X_beta=c(X_beta_ord[i,])) 438 | 439 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 440 | 441 | mean(marg_eff) 442 | }), 443 | x_col=tk) 444 | 445 | }) %>% bind_rows 446 | 447 | # now for the ZOIB 448 | 449 | coef_a <- as_draws_matrix(zoib_fit$draws("coef_a")) 450 | coef_g <- as_draws_matrix(zoib_fit$draws("coef_g")) 451 | alpha <- as_draws_matrix(zoib_fit$draws("alpha")) 452 | 453 | margin_zoib <- lapply(1:ncol(X), function(tk) { 454 | 455 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 456 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 457 | 458 | tibble(marg_eff= sapply(1:nrow(X_beta_zoib), function(i) { 459 | 
y0 <- predict_zoib(coef_g=coef_g[i,], 460 | coef_a=coef_a[i,], 461 | alpha1=c(alpha[i,1]), 462 | alpha2=c(alpha[i,2]), 463 | alpha3=c(alpha[i,3]), 464 | X=X_low, 465 | coef_m=X_beta_zoib[i,]) 466 | 467 | y1 <- predict_zoib(coef_g=coef_g[i,], 468 | coef_a=coef_a[i,], 469 | alpha1=c(alpha[i,1]), 470 | alpha2=c(alpha[i,2]), 471 | alpha3=c(alpha[i,3]), 472 | X=X_high, 473 | coef_m=X_beta_zoib[i,]) 474 | 475 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 476 | 477 | mean(marg_eff) 478 | }), 479 | x_col=tk) 480 | 481 | }) %>% bind_rows 482 | 483 | # now betareg 484 | 485 | betareg_int <- as.matrix(betareg_fit,pars="(Intercept)") 486 | 487 | margin_betareg <- lapply(1:ncol(X), function(tk) { 488 | 489 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 490 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 491 | 492 | tibble(marg_eff= sapply(1:nrow(X_beta_reg), function(i) { 493 | y0 <- plogis(betareg_int[i,"b_Intercept"] + X_low%*%X_beta_reg[i,]) 494 | y1 <- plogis(betareg_int[i,"b_Intercept"] + X_high%*%X_beta_reg[i,]) 495 | 496 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 497 | 498 | mean(marg_eff) 499 | }), 500 | x_col=tk) 501 | 502 | }) %>% bind_rows 503 | 504 | betareg2_int <- as.matrix(betareg_fit2,pars="(Intercept)") 505 | 506 | margin_betareg2 <- lapply(1:ncol(X), function(tk) { 507 | 508 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 509 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 510 | 511 | tibble(marg_eff= sapply(1:nrow(X_beta_reg), function(i) { 512 | y0 <- plogis(betareg2_int[i,"b_Intercept"] + X_low%*%X_beta_reg2[i,]) 513 | y1 <- plogis(betareg2_int[i,"b_Intercept"] + X_high%*%X_beta_reg2[i,]) 514 | 515 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 516 | 517 | mean(marg_eff) 518 | }), 519 | x_col=tk) 520 | 521 | }) %>% bind_rows 522 | 523 | 524 | # Fractional logit marg effects ------------------------------------------- 525 | 526 | frac_int <- as_draws_matrix(frac_fit$draws(variables="alpha")) 527 | 528 | margin_frac <- lapply(1:ncol(X), function(tk) { 529 | 530 | X_low[,tk] <- X[,tk] - setstep(X[,tk]) 531 | X_high[,tk] <- X[,tk] + setstep(X[,tk]) 532 | 533 | tibble(marg_eff= sapply(1:nrow(X_beta_frac), function(i) { 534 | y0 <- plogis(c(frac_int[i,]) + X_low%*%X_beta_frac[i,,drop=T]) 535 | y1 <- plogis(c(frac_int[i,]) + X_high%*%X_beta_frac[i,,drop=T]) 536 | 537 | marg_eff <- (y1-y0)/(X_high[,tk]-X_low[,tk]) 538 | 539 | mean(marg_eff) 540 | }), 541 | x_col=tk) 542 | 543 | 544 | }) %>% bind_rows 545 | 546 | 547 | 548 | this_data$marg_eff <- list(marg_eff) 549 | this_data$true_kurt <- moments::kurtosis(final_out) 550 | 551 | 552 | 553 | # Combine estimates ------------------------------------------------------- 554 | 555 | sum_marg <- function(d,func,...) 
{ 556 | 557 | ret_vec <- arrange(d,x_col) %>% 558 | group_by(x_col) %>% 559 | summarize(sum_stat=func(marg_eff,...)) %>% 560 | pull(sum_stat) 561 | 562 | list(ret_vec) 563 | 564 | } 565 | 566 | try(bind_cols(purrr::map_dfr(seq_len(6), ~this_data),bind_rows(tibble(model="Ordinal Beta Regression", 567 | med_est=list(apply(X_beta_ord,2,mean)), 568 | high=list(apply(X_beta_ord,2,quantile,.95)), 569 | low=list(apply(X_beta_ord,2,quantile,.05)), 570 | var_calc=list(apply(X_beta_ord,2,var)), 571 | loo_val=loo_ordbeta$estimates[1,1], 572 | p_loo=loo_ordbeta$estimates[2,1], 573 | bad_k=sum(loo_ordbeta$diagnostics$pareto_k>0.5), 574 | win_loo=which(row.names(comp_loo)=="ord"), 575 | win_loo_se=comp_loo[which(row.names(comp_loo)=="ord"),2], 576 | rmse=rmse_ord, 577 | kurt_est=mean(apply(yrep_ord,1,moments::kurtosis)), 578 | marg_eff_est=sum_marg(margin_ord,mean), 579 | high_marg=sum_marg(margin_ord,quantile,.95), 580 | low_marg=sum_marg(margin_ord,quantile,.05), 581 | var_marg=sum_marg(margin_ord,var)), 582 | tibble(model="ZOIB", 583 | med_est=list(apply(X_beta_zoib,2,mean)), 584 | high=list(apply(X_beta_zoib,2,quantile,.95)), 585 | low=list(apply(X_beta_zoib,2,quantile,.05)), 586 | var_calc=list(apply(X_beta_zoib,2,var)), 587 | loo_val=loo_zoib$estimates[1,1], 588 | p_loo=loo_zoib$estimates[2,1], 589 | bad_k=sum(loo_zoib$diagnostics$pareto_k>0.5), 590 | kurt_est=mean(apply(yrep_zoib,1,moments::kurtosis)), 591 | win_loo=which(row.names(comp_loo)=="zoib"), 592 | win_loo_se=comp_loo[which(row.names(comp_loo)=="zoib"),2], 593 | rmse=rmse_zoib, 594 | marg_eff_est=sum_marg(margin_zoib,mean), 595 | high_marg=sum_marg(margin_zoib,quantile,.95), 596 | low_marg=sum_marg(margin_zoib,quantile,.05), 597 | var_marg=sum_marg(margin_zoib,var)), 598 | tibble(model="Beta Regression - Transformed", 599 | med_est=list(apply(X_beta_reg,2,mean)), 600 | high=list(apply(X_beta_reg,2,quantile,.95)), 601 | low=list(apply(X_beta_reg,2,quantile,.05)), 602 | var_calc=list(apply(X_beta_reg,2,var)), 603 | loo_val=loo_betareg$estimates[1,1], 604 | p_loo=loo_betareg$estimates[2,1], 605 | bad_k=sum(loo_betareg$diagnostics$pareto_k>0.5), 606 | rmse=rmse_betareg, 607 | kurt_est=mean(apply(yrep_betareg,1,moments::kurtosis)), 608 | marg_eff_est=sum_marg(margin_betareg,mean), 609 | high_marg=sum_marg(margin_betareg,quantile,.95), 610 | low_marg=sum_marg(margin_betareg,quantile,.05), 611 | var_marg=sum_marg(margin_betareg,var)), 612 | tibble(model="Beta Regression - (0,1)", 613 | med_est=list(apply(X_beta_reg2,2,mean)), 614 | high=list(apply(X_beta_reg2,2,quantile,.95)), 615 | low=list(apply(X_beta_reg2,2,quantile,.05)), 616 | var_calc=list(apply(X_beta_reg2,2,var)), 617 | loo_val=loo_betareg2$estimates[1,1], 618 | p_loo=loo_betareg2$estimates[2,1], 619 | bad_k=sum(loo_betareg2$diagnostics$pareto_k>0.5), 620 | kurt_est=mean(apply(yrep_betareg2,1,moments::kurtosis)), 621 | rmse=rmse_betareg2, 622 | marg_eff_est=sum_marg(margin_betareg2,mean), 623 | high_marg=sum_marg(margin_betareg2,quantile,.95), 624 | low_marg=sum_marg(margin_betareg2,quantile,.05), 625 | var_marg=sum_marg(margin_betareg2,var)), 626 | tibble(model="OLS", 627 | med_est=list(apply(X_beta_lm,2,mean)), 628 | high=list(apply(X_beta_lm,2,quantile,.95)), 629 | low=list(apply(X_beta_lm,2,quantile,.05)), 630 | var_calc=list(apply(X_beta_lm,2,var)), 631 | kurt_est=mean(apply(yrep_lm,1,moments::kurtosis)), 632 | loo_val=loo_lm$estimates[1,1], 633 | p_loo=loo_lm$estimates[2,1], 634 | bad_k=sum(loo_lm$diagnostics$pareto_k>0.5), 635 | win_loo=which(row.names(comp_loo)=="lm"), 636 | 
win_loo_se=comp_loo[which(row.names(comp_loo)=="lm"),2], 637 | rmse=rmse_lm, 638 | marg_eff_est=list(apply(X_beta_lm,2,mean)), 639 | high_marg=list(apply(X_beta_lm,2,quantile,.95)), 640 | low_marg=list(apply(X_beta_lm,2,quantile,.05)), 641 | var_marg=list(apply(X_beta_lm,2,var))), 642 | tibble(model="Fractional", 643 | med_est=list(apply(X_beta_frac,2,mean)), 644 | high=list(apply(X_beta_frac,2,quantile,.95)), 645 | low=list(apply(X_beta_frac,2,quantile,.05)), 646 | var_calc=list(apply(X_beta_frac,2,var)), 647 | loo_val=loo_frac$estimates[1,1], 648 | p_loo=loo_frac$estimates[2,1], 649 | bad_k=sum(loo_frac$diagnostics$pareto_k>0.5), 650 | win_loo=which(row.names(comp_loo)=="frac"), 651 | win_loo_se=comp_loo[which(row.names(comp_loo)=="frac"),2], 652 | kurt_est=mean(apply(yrep_frac,1,moments::kurtosis)), 653 | rmse=rmse_frac, 654 | marg_eff_est=sum_marg(margin_frac,mean), 655 | high_marg=sum_marg(margin_frac,quantile,.95), 656 | low_marg=sum_marg(margin_frac,quantile,.05), 657 | var_marg=sum_marg(margin_frac,var))))) 658 | 659 | 660 | #},simul_data=simul_data,r_seeds=r_seeds) 661 | },simul_data=simul_data,r_seeds=r_seeds,future.seed=TRUE) 662 | 663 | #simul_data_final <- bind_rows(all_simul_data) 664 | 665 | saveRDS(all_simul_data,"sim_cont_X_fixed.rds") 666 | 667 | all_sim_fixed <- all_simul_data 668 | 669 | save(all_sim_fixed, file="sim_cont_X_fixed.RData") 670 | -------------------------------------------------------------------------------- /preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{booktabs} 2 | \usepackage{longtable} 3 | \usepackage{array} 4 | \usepackage{multirow} 5 | \usepackage{wrapfig} 6 | \usepackage{float} 7 | \usepackage{colortbl} 8 | \usepackage{pdflscape} 9 | \usepackage{tabu} 10 | \usepackage{threeparttable} 11 | \usepackage{threeparttablex} 12 | \usepackage[normalem]{ulem} 13 | \usepackage{makecell} 14 | \usepackage{xcolor} 15 | \usepackage{xstring} 16 | \usepackage{nameref} 17 | \linespread{2} 18 | 19 | -------------------------------------------------------------------------------- /zoib.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; 3 | int k; // number of columns 4 | matrix[n,k] x; 5 | vector[n] y; 6 | int run_gen; // whether to use generated quantities 7 | } 8 | transformed data { 9 | int is_discrete[n]; 10 | int y_discrete[n]; 11 | 12 | // create indicator for whether y is discrete 13 | // and an integer value to pass to bernoulli_lpmf for discrete y 14 | for (i in 1:n) { 15 | if (y[i] == 0) { 16 | is_discrete[i] = 1; 17 | y_discrete[i] = 0; 18 | } else if (y[i] == 1) { 19 | is_discrete[i] = 1; 20 | y_discrete[i] = 1; 21 | } else { 22 | is_discrete[i] = 0; 23 | // hack to ensure that throws error if passed to bernoulli_lpmf 24 | y_discrete[i] = -1; 25 | } 26 | } 27 | } 28 | parameters { 29 | vector[k] coef_a; 30 | vector[k] coef_g; 31 | vector[k] coef_m; 32 | //vector[k] coef_p; 33 | real phi; 34 | vector[3] alpha; 35 | } 36 | transformed parameters { 37 | vector[n] psi; 38 | vector[n] gamma; 39 | vector[n] mu; 40 | //vector[n] phi; 41 | 42 | psi = alpha[1] + x*coef_a; 43 | gamma = alpha[2] + x*coef_g; 44 | mu = inv_logit(alpha[3] + x*coef_m); 45 | //phi = exp(alpha[4] + x*coef_p); 46 | 47 | } 48 | model { 49 | coef_a ~ normal(0, 3); 50 | coef_g ~ normal(0, 3); 51 | coef_m ~ normal(0, 3); 52 | //coef_p ~ normal(0, 3); 53 | phi ~ exponential(1); 54 | alpha ~ normal(0,3); 55 | 56 | is_discrete ~ bernoulli_logit(psi); 57 | for (i in 1:n) { 58 
58 |     if (is_discrete[i] == 1) {
59 |       y_discrete[i] ~ bernoulli_logit(gamma[i]); // second stage: 0 vs. 1
60 |     } else {
61 |       y[i] ~ beta_proportion(mu[i], phi); // continuous responses in (0,1)
62 |     }
63 |   }
64 | 
65 | }
66 | generated quantities {
67 |   vector[run_gen==1 ? n: 0] zoib_log; // pointwise log-likelihood (used for loo)
68 |   vector[run_gen==1 ? n: 0] zoib_regen; // posterior predictive replicates
69 |   vector[run_gen==1 ? n: 0] is_discrete_regen; // replicated discrete/continuous indicator
70 | 
71 |   if(run_gen==1) {
72 |     for (i in 1:n) {
73 |       real psit = inv_logit(psi[i]);
74 |       real gammat = inv_logit(gamma[i]);
75 |       if (y[i] == 0) {
76 |         zoib_log[i] = log(psit) + log1m(gammat);
77 |       } else if (y[i] == 1) {
78 |         zoib_log[i] = log(psit) + log(gammat);
79 |       } else {
80 |         zoib_log[i] = log1m(psit) + beta_proportion_lpdf(y[i] | mu[i], phi);
81 |       }
82 | 
83 |       is_discrete_regen[i] = bernoulli_rng(psit);
84 | 
85 |       if(is_discrete_regen[i]==0) {
86 |         zoib_regen[i] = beta_proportion_rng(mu[i], phi);
87 |       } else {
88 |         zoib_regen[i] = bernoulli_rng(gammat);
89 |       }
90 |     }
91 | 
92 |   }
93 | }
94 | 
--------------------------------------------------------------------------------
/zoib_nophireg.stan:
--------------------------------------------------------------------------------
1 | data {
2 |   int n; // number of observations
3 |   int k; // number of columns
4 |   int s; // number of samples (if not estimating predictions on all outcomes)
5 |   int sample_all[s]; // indices of observations used in generated quantities
6 |   matrix[n,k] x;
7 |   vector[n] y;
8 |   int run_gen; // whether to use generated quantities
9 | }
10 | transformed data {
11 |   int is_discrete[n];
12 |   int y_discrete[n];
13 | 
14 |   // create indicator for whether y is discrete
15 |   // and an integer value to pass to bernoulli_lpmf for discrete y
16 |   for (i in 1:n) {
17 |     if (y[i] == 0) {
18 |       is_discrete[i] = 1;
19 |       y_discrete[i] = 0;
20 |     } else if (y[i] == 1) {
21 |       is_discrete[i] = 1;
22 |       y_discrete[i] = 1;
23 |     } else {
24 |       is_discrete[i] = 0;
25 |       // hack to ensure an error is thrown if this value is ever passed to bernoulli_lpmf
26 |       y_discrete[i] = -1;
27 |     }
28 |   }
29 | }
30 | parameters {
31 |   vector[k] coef_a;
32 |   vector[k] coef_g;
33 |   vector[k] coef_m;
34 |   //vector[k] coef_p;
35 |   real<lower=0> phi; // beta precision; must be positive for beta_proportion and the exponential prior
36 |   vector[3] alpha;
37 | }
38 | transformed parameters {
39 |   vector[n] psi;
40 |   vector[n] gamma;
41 |   vector[n] mu;
42 |   //vector[n] phi;
43 | 
44 |   psi = alpha[1] + x*coef_a; // logit-scale probability that y is discrete (0 or 1)
45 |   gamma = alpha[2] + x*coef_g; // logit-scale probability that a discrete y equals 1
46 |   mu = inv_logit(alpha[3] + x*coef_m); // mean of the beta distribution for continuous y
47 |   //phi = exp(alpha[4] + x*coef_p);
48 | 
49 | }
50 | model {
51 |   coef_a ~ normal(0, 5);
52 |   coef_g ~ normal(0, 5);
53 |   coef_m ~ normal(0, 5);
54 |   //coef_p ~ normal(0, 3);
55 |   phi ~ exponential(.1);
56 |   alpha ~ normal(0,5);
57 | 
58 |   is_discrete ~ bernoulli_logit(psi); // first stage: discrete vs. continuous
59 |   for (i in 1:n) {
60 |     if (is_discrete[i] == 1) {
61 |       y_discrete[i] ~ bernoulli_logit(gamma[i]); // second stage: 0 vs. 1
62 |     } else {
63 |       y[i] ~ beta_proportion(mu[i], phi); // continuous responses in (0,1)
64 |     }
65 |   }
66 | 
67 | }
68 | generated quantities {
69 |   vector[run_gen==1 ? s: 0] zoib_log; // pointwise log-likelihood (used for loo)
70 |   vector[run_gen==1 ? s: 0] zoib_regen; // posterior predictive replicates
71 |   vector[run_gen==1 ? s: 0] zoib_epred; // posterior expectation of y
72 |   vector[run_gen==1 ? s: 0] is_discrete_regen; // replicated discrete/continuous indicator
73 | 
74 |   if(run_gen==1) {
75 |     for (i in 1:s) {
76 | 
77 |       real psit = inv_logit(psi[sample_all[i]]);
78 |       real gammat = inv_logit(gamma[sample_all[i]]);
79 | 
80 |       zoib_epred[i] = psit * gammat + (1-psit) * mu[sample_all[i]]; // E[y] = P(discrete)*P(y=1|discrete) + P(continuous)*mu
81 | 
82 |       if (y[sample_all[i]] == 0) {
83 |         zoib_log[i] = log(psit) + log1m(gammat);
84 |       } else if (y[sample_all[i]] == 1) {
85 |         zoib_log[i] = log(psit) + log(gammat);
86 |       } else {
87 |         zoib_log[i] = log1m(psit) + beta_proportion_lpdf(y[sample_all[i]] | mu[sample_all[i]], phi);
88 |       }
89 | 
90 | 
91 | 
92 |       is_discrete_regen[i] = bernoulli_rng(psit);
93 | 
94 |       if(is_discrete_regen[i]==0) {
95 |         zoib_regen[i] = beta_proportion_rng(mu[sample_all[i]], phi);
96 |       } else {
97 |         zoib_regen[i] = bernoulli_rng(gammat);
98 |       }
99 |     }
100 | 
101 |   }
102 | }
103 | 
--------------------------------------------------------------------------------
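A note on the ZOIB parameterization above: writing p = inv_logit(psi) and q = inv_logit(gamma), the generated quantities encode the likelihood P(y=0) = p(1-q), P(y=1) = pq, and (1-p) * BetaProportion(y | mu, phi) for 0 < y < 1, so `zoib_epred` in `zoib_nophireg.stan` is the implied posterior mean pq + (1-p)mu. The R sketch below is not part of the replication files; it shows one way these Stan models can be fit directly with `cmdstanr`. The data-list names match the `data` block of `zoib.stan` exactly (n, k, x, y, run_gen), but the simulated `x` and `y` are purely illustrative.

# Hedged sketch, assuming cmdstanr and CmdStan are installed (see install.R);
# the simulated data below are illustrative, not from the paper.
library(cmdstanr)

n <- 500
k <- 2
x <- matrix(rnorm(n * k), n, k)

# outcome in [0,1] with some exact zeroes and ones, as the ZOIB model expects
y <- runif(n)
y[sample(n, 25)] <- 0
y[sample(n, 25)] <- 1

# compile the model and sample; run_gen = 1 turns on the generated quantities
mod <- cmdstan_model("zoib.stan")
fit <- mod$sample(
  data = list(n = n, k = k, x = x, y = y, run_gen = 1L),
  chains = 4, parallel_chains = 4
)

fit$summary(variables = c("alpha", "coef_m", "phi"))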