├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       └── pkgdown.yaml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
│   ├── checks.R
│   ├── clarify-package.R
│   ├── clarify_est_methods.R
│   ├── clarify_predict.R
│   ├── get_model_components.R
│   ├── misim.R
│   ├── plot.clarify_adrf.R
│   ├── plot.clarify_est.R
│   ├── plot.clarify_setx.R
│   ├── sim.R
│   ├── sim_adrf.R
│   ├── sim_ame.R
│   ├── sim_apply.R
│   ├── sim_setx.R
│   ├── summary.clarify_est.R
│   ├── transform.clarify_est.R
│   ├── utils.R
│   └── zzz.R
├── README.Rmd
├── README.md
├── _dev
│   └── sim_chain.R
├── _pkgdown.yml
├── clarify.Rproj
├── clarify
│   ├── Submission 1
│   │   ├── RJournal.sty
│   │   ├── RJreferences.bib
│   │   ├── RJwrapper.log
│   │   ├── RJwrapper.tex
│   │   ├── clarify.R
│   │   ├── clarify.Rmd
│   │   ├── clarify.html
│   │   ├── clarify.log
│   │   ├── clarify.pdf
│   │   ├── clarify.tex
│   │   ├── figures
│   │   │   ├── unnamed-chunk-10-1.pdf
│   │   │   ├── unnamed-chunk-10-1.png
│   │   │   ├── unnamed-chunk-14-1.pdf
│   │   │   ├── unnamed-chunk-14-1.png
│   │   │   ├── unnamed-chunk-16-1.pdf
│   │   │   ├── unnamed-chunk-16-1.png
│   │   │   ├── unnamed-chunk-25-1.pdf
│   │   │   ├── unnamed-chunk-25-1.png
│   │   │   ├── unnamed-chunk-28-1.pdf
│   │   │   ├── unnamed-chunk-28-1.png
│   │   │   ├── unnamed-chunk-8-1.pdf
│   │   │   └── unnamed-chunk-8-1.png
│   │   ├── initial_checks.log
│   │   └── motivation-letter.md
│   └── Submission 2
│       ├── 1-review-1.txt
│       ├── 1-review-2.txt
│       ├── RJournal.sty
│       ├── RJreferences.bib
│       ├── RJwrapper.log
│       ├── RJwrapper.tex
│       ├── clarify.R
│       ├── clarify.Rmd
│       ├── clarify.html
│       ├── clarify.log
│       ├── clarify.pdf
│       ├── clarify.tex
│       ├── figures
│       │   ├── plot1-1.png
│       │   ├── plot2-1.png
│       │   ├── plot3-1.png
│       │   ├── plot4-1.png
│       │   ├── plot8-1.png
│       │   ├── plot9-1.png
│       │   ├── unnamed-chunk-10-1.pdf
│       │   ├── unnamed-chunk-10-1.png
│       │   ├── unnamed-chunk-14-1.pdf
│       │   ├── unnamed-chunk-14-1.png
│       │   ├── unnamed-chunk-16-1.pdf
│       │   ├── unnamed-chunk-16-1.png
│       │   ├── unnamed-chunk-25-1.pdf
│       │   ├── unnamed-chunk-25-1.png
│       │   ├── unnamed-chunk-28-1.pdf
│       │   ├── unnamed-chunk-28-1.png
│       │   ├── unnamed-chunk-8-1.pdf
│       │   └── unnamed-chunk-8-1.png
│       ├── initial_checks.log
│       ├── motivation-letter.md
│       ├── response_to_reviewers.Rmd
│       └── response_to_reviewers.html
├── man
│   ├── clarify-package.Rd
│   ├── figures
│   │   ├── README-example-1.png
│   │   ├── README-example2-1.png
│   │   ├── README-unnamed-chunk-6-1.png
│   │   └── README-unnamed-chunk-7-1.png
│   ├── misim.Rd
│   ├── plot.clarify_adrf.Rd
│   ├── plot.clarify_setx.Rd
│   ├── sim.Rd
│   ├── sim_adrf.Rd
│   ├── sim_ame.Rd
│   ├── sim_apply.Rd
│   ├── sim_setx.Rd
│   ├── summary.clarify_est.Rd
│   └── transform.clarify_est.Rd
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── fixtures
│       │   ├── make_mdata.R
│       │   ├── make_mira.R
│       │   ├── mdata.rds
│       │   ├── mimira.rds
│       │   ├── mira.rds
│       │   └── model_list.rds
│       ├── helper.R
│       ├── test-misim.R
│       ├── test-sim.R
│       ├── test-sim_ame.R
│       └── test-transform.R
└── vignettes
    ├── .gitignore
    ├── Zelig.Rmd
    ├── clarify.Rmd
    └── references.bib

/.Rbuildignore:
--------------------------------------------------------------------------------
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^_pkgdown\.yml$
^docs$
^pkgdown$
^\.github$
^\_dev$
^CRAN-SUBMISSION$
^clarify$
^tests/testthat/fixtures/mimira\.rds$
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
*.html
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]
  release:
    types: [published]
  workflow_dispatch:

name: pkgdown

jobs:
  pkgdown:
    runs-on: ubuntu-latest
    # Only restrict concurrency for non-PR jobs
    concurrency:
      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - uses: actions/checkout@v2

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::pkgdown, local::.
          needs: website

      - name: Build site
        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
        shell: Rscript {0}

      - name: Deploy to GitHub pages 🚀
        if: github.event_name != 'pull_request'
        uses: JamesIves/github-pages-deploy-action@4.1.4
        with:
          clean: false
          branch: gh-pages
          folder: docs
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.RData
.Ruserdata
docs
inst/doc
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
Package: clarify
Type: Package
Title: Simulation-Based Inference for Regression Models
Version: 0.2.1
Authors@R: c(
    person("Noah", "Greifer", role = c("aut", "cre"),
           email = "ngreifer@iq.harvard.edu",
           comment = c(ORCID="0000-0003-3067-7154")),
    person("Steven", "Worthington", role = c("aut"),
           email = "sworthington@iq.harvard.edu",
           comment = c(ORCID="0000-0001-9550-5797")),
    person("Stefano", "Iacus", role = c("aut"),
           email = "siacus@iq.harvard.edu",
           comment = c(ORCID="0000-0002-4884-0047")),
    person("Gary", "King", role = c("aut"),
           email = "king@harvard.edu",
           comment = c(ORCID="0000-0002-5327-7631"))
    )
Description: Performs simulation-based inference as an alternative to the delta method
    for obtaining valid confidence intervals and p-values for regression post-estimation
    quantities, such as average marginal effects and predictions at representative values.
    This framework for simulation-based inference is especially useful when the resulting
    quantity is not normally distributed and the delta method approximation fails. The
    methodology is described in King, Tomz, and Wittenberg (2000) <doi:10.2307/2669316>.
    'clarify' is meant to replace some of the functionality of the archived package
    'Zelig'; see the vignette "Translating Zelig to clarify" for replicating this
    functionality.
20 | License: GPL (>= 3) 21 | Encoding: UTF-8 22 | Depends: R (>= 3.5.0) 23 | Imports: 24 | ggplot2 (>= 3.4.0), 25 | pbapply (>= 1.7-0), 26 | chk (>= 0.9.0), 27 | rlang (>= 1.0.6), 28 | insight (>= 0.19.11), 29 | marginaleffects (>= 0.20.0), 30 | mvnfast (>= 0.2.6) 31 | Suggests: 32 | testthat (>= 3.0.0), 33 | MatchIt (>= 4.0.0), 34 | parallel, 35 | knitr, 36 | rmarkdown, 37 | Amelia, 38 | MASS, betareg, survey, estimatr, fixest, logistf, geepack, rms, 39 | robustbase, robust, AER, ivreg, mgcv, sandwich 40 | Config/testthat/edition: 3 41 | RoxygenNote: 7.3.1 42 | Roxygen: list(markdown = TRUE) 43 | URL: https://github.com/iqss/clarify, 44 | https://iqss.github.io/clarify/ 45 | BugReports: https://github.com/iqss/clarify/issues 46 | VignetteBuilder: knitr 47 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(Ops,clarify_est) 4 | S3method(`[`,clarify_est) 5 | S3method(`dimnames<-`,clarify_est) 6 | S3method(`names<-`,clarify_est) 7 | S3method(as.data.frame,clarify_est) 8 | S3method(as.matrix,clarify_est) 9 | S3method(cbind,clarify_est) 10 | S3method(coef,clarify_est) 11 | S3method(confint,clarify_est) 12 | S3method(dimnames,clarify_est) 13 | S3method(names,clarify_est) 14 | S3method(plot,clarify_adrf) 15 | S3method(plot,clarify_est) 16 | S3method(plot,clarify_setx) 17 | S3method(print,clarify_adrf) 18 | S3method(print,clarify_ame) 19 | S3method(print,clarify_est) 20 | S3method(print,clarify_misim) 21 | S3method(print,clarify_setx) 22 | S3method(print,clarify_sim) 23 | S3method(print,summary.clarify_est) 24 | S3method(str,clarify_est) 25 | S3method(summary,clarify_est) 26 | S3method(transform,clarify_est) 27 | S3method(vcov,clarify_est) 28 | export(misim) 29 | export(sim) 30 | export(sim_adrf) 31 | export(sim_ame) 32 | export(sim_apply) 33 | export(sim_setx) 34 | import(ggplot2) 35 | import(stats) 36 | importFrom(utils,str) 37 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # `clarify` 0.2.1 2 | 3 | * In `sim_ame()` and `sim_adrf()`, unit-level weights are no longer used to compute means, consistent with advice in [Gabriel et al. (2023)](https://doi.org/10.1002/sim.9969). For those using these functions after matching or weighting for the ATT or ATC, this will not change results. For matching or weighting for the ATE, this improves robustness against misspecified weights. 4 | 5 | * In `sim_ame()`, more than one variable can be supplied to `var` to generate average adjusted predictions or compute average marginal effects with other variables set to supplied values. The help page for `sim_ame()` has been retooled to reflect this. 6 | 7 | * In `transform()`, values can now be indicated by positional shortcuts of the form `.b{#}`, e.g., `.b1 - .b2`, to facilitate specifying transformations of the desired quantities without using the names of the quantities, which can be frustrating to use. 8 | 9 | * When `reference = TRUE` with `plot()`, a blue line at the median of the simulated estimates is also included on the plot; when this value does not align with the estimate, quantile confidence intervals may be invalid. 
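As a minimal sketch of the new positional shortcut and the new `reference` behavior (assuming `est` is a `clarify_est` object holding two estimated quantities; the name `RD` is hypothetical):

```r
# .b1 and .b2 refer to the first and second estimated quantities by
# position, so names containing brackets don't need to be typed out
est <- transform(est, RD = .b2 - .b1)

# With reference = TRUE, the plot now also marks the median of the
# simulated estimates with a blue line
plot(est, reference = TRUE)
```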

# `clarify` 0.2.0

* `sim_ame()` and `sim_adrf()` now have a `by` argument, which can be used to estimate quantities of interest within subsets of one or more variables.

* `sim_setx()` can now receive a data frame for its `x` and `x1` arguments.

* `sim_ame()` can accept new options for `contrast`: `"sr"` for the survival ratio and `"srr"` for the switch relative risk.

* Slight speed improvements in `sim_ame()` with continuous `var` and `sim_adrf()` with `contrast = "amef"`.

* Typo fixes in vignettes.

# `clarify` 0.1.3

* Documentation updates incorporating the work of Rainey (2023). `clarify` already implemented the recommendations in Rainey (2023), so no functionality has changed.

# `clarify` 0.1.2

* Added the argument `reference` to `plot.clarify_est()`, which adds a reference normal distribution to the density of the estimates.

* Fixed an error in the `sim()` documentation about how degrees of freedom are computed. Thanks to @wviechtb. (#8)

* Fixed a warning that can occur about recovering model data, from `insight`.

# `clarify` 0.1.1

* In `summary.clarify_est()`, `null` can now be supplied as a named vector to specify the quantities for which p-values should be computed.

* Fixes in anticipation of breaking changes from `marginaleffects` to ensure compatibility (including with older versions).

* Updates to the README and vignettes.

# `clarify` 0.1.0

* First release!
--------------------------------------------------------------------------------
/R/clarify-package.R:
--------------------------------------------------------------------------------
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
#' @import stats
#' @import ggplot2
#' @importFrom utils str
## usethis namespace: end
NULL
--------------------------------------------------------------------------------
/R/clarify_est_methods.R:
--------------------------------------------------------------------------------
#' @exportS3Method names clarify_est
names.clarify_est <- function(x) {
  names(attr(x, "original"))
}

#' @exportS3Method `names<-` clarify_est
`names<-.clarify_est` <- function(x, value) {
  original_names <- names(x)
  original_class <- class(x)
  x <- drop_sim_class(x)
  colnames(x) <- value
  names(attr(x, "original")) <- value
  for (i in names(attributes(x))) {
    if (identical(names(attr(x, i)), original_names)) {
      names(attr(x, i)) <- value
    }
    if (identical(rownames(attr(x, i)), original_names)) {
      rownames(attr(x, i)) <- value
    }
    if (identical(colnames(attr(x, i)), original_names)) {
      colnames(attr(x, i)) <- value
    }
  }
  class(x) <- original_class
  x
}

#' @export
Ops.clarify_est <- function(e1, e2 = NULL) {
  unary <- nargs() == 1L
  FUN <- get(.Generic, envir = parent.frame(), mode = "function")

  if (!.Generic %in% c("+", "-", "*", "^", "%%", "%/%", "/")) {
    .err("only mathematical operations can be applied to `clarify_est` objects")
  }

  if (unary) {
    f <- quote(FUN(left))
    left <- drop_sim_class(e1)
    e1[] <- eval(f)

    left <- attr(e1, "original")
    attr(e1, "original")[] <- eval(f)
    return(e1)
  }

  f <- quote(FUN(left, right))

  e1_clarify_est <- inherits(e1, "clarify_est")
  e2_clarify_est <-
    inherits(e2, "clarify_est")

  if (e1_clarify_est && e2_clarify_est) {
    if (!identical(class(e1), class(e2))) {
      .wrn(sprintf("`%s` should only be used on `clarify_est` objects produced from the same function",
                   .Generic))
    }

    if (!identical(attr(e1, "hash"), attr(e2, "hash"))) {
      .err(sprintf("`%s` can only be used on `clarify_est` objects originating from calls applied to the same `clarify_sim` object",
                   .Generic))
    }

    if (any(dim(e2) != dim(e1))) {
      .err(sprintf("`%s` can only be used on `clarify_est` objects with an equal number of estimated quantities",
                   .Generic))
    }

    if (!identical(attr(e1, "at"), attr(e2, "at"))) {
      .err(sprintf("`%s` can only be used on `clarify_adrf` objects with the same values of `at`",
                   .Generic))
    }
  }

  left <- drop_sim_class(e1)
  right <- drop_sim_class(e2)

  if (e1_clarify_est)
    e1[] <- eval(f)
  else
    e2[] <- eval(f)

  if (e1_clarify_est)
    left <- attr(e1, "original")
  if (e2_clarify_est)
    right <- attr(e2, "original")

  if (e1_clarify_est) {
    attr(e1, "original")[] <- eval(f)
    attr(e1, "contrast") <- NULL
    # class(e1) <- "clarify_est"
    return(e1)
  }

  attr(e2, "original")[] <- eval(f)
  attr(e2, "contrast") <- NULL
  # class(e2) <- "clarify_est"
  e2
}

#' @exportS3Method `[` clarify_est
`[.clarify_est` <- function(x, i, ...) {

  Narg <- nargs()

  if (Narg == 1) return(x)
  if (Narg > 2) {
    .err("`clarify_est` objects can only be subset as obj[.], not obj[., .]")
  }

  attrs <- attributes(x)
  cl <- class(x)

  x <- as.matrix(x)[, i, drop = FALSE]

  for (z in setdiff(names(attrs), c("names", "dimnames", "dim"))) {
    attr(x, z) <- attrs[[z]]
  }
  attr(x, "original") <- attr(x, "original")[i]

  if ("at" %in% names(attrs)) {
    attr(x, "at") <- unname(setNames(attrs[["at"]], names(attrs[["original"]]))[i])
  }
  if ("setx" %in% names(attrs)) {
    attr(x, "setx") <- attrs[["setx"]][i, , drop = FALSE]
  }

  class(x) <- cl
  x
}

#' @exportS3Method as.matrix clarify_est
as.matrix.clarify_est <- function(x, ...) {
  x <- drop_sim_class(x)
  for (i in setdiff(names(attributes(x)), c("dimnames", "dim"))) {
    attr(x, i) <- NULL
  }
  x
}

#' @exportS3Method as.data.frame clarify_est
as.data.frame.clarify_est <- function(x, ...) {
  as.data.frame(as.matrix(x), ...)
}

#' @exportS3Method dimnames clarify_est
dimnames.clarify_est <- function(x) {
  .err("do not use `colnames()`, `rownames()`, or `dimnames()` with a `clarify_est` object. Use `names()` instead")
}

#' @exportS3Method `dimnames<-` clarify_est
`dimnames<-.clarify_est` <- function(x, value) {
  .err("do not use `colnames()`, `rownames()`, or `dimnames()` with a `clarify_est` object.
Use `names()` instead") 153 | } 154 | 155 | #' @exportS3Method str clarify_est 156 | str.clarify_est <- function(object, 157 | max.level = NA, vec.len = getOption("str")$vec.len, digits.d = getOption("str")$digits.d, 158 | nchar.max = 128, give.attr = TRUE, drop.deparse.attr = getOption("str")$drop.deparse.attr, 159 | give.head = TRUE, give.length = give.head, width = getOption("width"), 160 | nest.lev = 0, indent.str = paste(rep.int(" ", max(0, nest.lev + 1)), collapse = ".."), 161 | comp.str = "$ ", no.list = FALSE, 162 | envir = baseenv(), strict.width = getOption("str")$strict.width, formatNum = getOption("str")$formatNum, 163 | list.len = getOption("str")$list.len, deparse.lines = getOption("str")$deparse.lines, 164 | ...) { 165 | 166 | oDefs <- c("vec.len", "digits.d", "strict.width", "formatNum", 167 | "drop.deparse.attr", "list.len", "deparse.lines") 168 | strO <- getOption("str") 169 | if (!is.list(strO)) { 170 | warning("invalid options(\"str\") -- using defaults instead") 171 | strO <- utils::strOptions() 172 | } 173 | else { 174 | if (!all(names(strO) %in% oDefs)) 175 | warning(gettextf("invalid components in options(\"str\"): %s", 176 | paste(setdiff(names(strO), oDefs), collapse = ", ")), 177 | domain = NA) 178 | strO <- utils::modifyList(utils::strOptions(), strO) 179 | } 180 | 181 | oo <- options(digits = digits.d) 182 | on.exit(options(oo)) 183 | le <- length(object) 184 | 185 | nchar.w <- function(x) nchar(x, type = "w", allowNA = TRUE) 186 | 187 | maybe_truncate <- function(x, nx = nchar.w(x), S = "\"", 188 | ch = "| __truncated__") { 189 | ok <- { 190 | if (anyNA(nx)) !is.na(nx) 191 | else TRUE 192 | } 193 | 194 | if (any(lrg <- ok & nx > nchar.max)) { 195 | nc <- nchar(ch <- paste0(S, ch)) 196 | if (nchar.max <= nc) 197 | stop(gettextf("'nchar.max = %d' is too small", 198 | nchar.max), domain = NA) 199 | x.lrg <- x[lrg] 200 | tr.x <- strtrim(x.lrg, nchar.max - nc) 201 | if (any(ii <- tr.x != x.lrg & paste0(tr.x, S) != 202 | x.lrg)) { 203 | x[lrg][ii] <- paste0(tr.x[ii], ch) 204 | } 205 | } 206 | x 207 | } 208 | 209 | nfS <- names(fStr <- formals()) 210 | strSub <- function(obj, ...) { 211 | nf <- setdiff(nfS, c("object", "give.length", "comp.str", 212 | "no.list", names(match.call())[-(1:2)], "...")) 213 | aList <- as.list(fStr)[nf] 214 | aList[] <- lapply(nf, function(n) eval(as.name(n))) 215 | do.call(function(...) str(obj, ...), c(aList, list(...)), 216 | quote = TRUE) 217 | } 218 | 219 | le.str <- { 220 | if (give.length) paste0("[1:", paste(le), "]") 221 | else "" 222 | } 223 | 224 | v.len <- vec.len 225 | std.attr <- "names" 226 | cl <- oldClass(object) 227 | 228 | if (give.attr) 229 | a <- attributes(object) 230 | dCtrl <- eval(formals(deparse)$control) 231 | 232 | if (drop.deparse.attr) 233 | dCtrl <- dCtrl[dCtrl != "showAttributes"] 234 | 235 | arrLenstr <- function(obj) { 236 | rnk <- length(di. <- dim(obj)) 237 | di <- paste0(ifelse(di. > 1, "1:", ""), di., ifelse(di. > 238 | 0, "", " ")) 239 | pDi <- function(...) 
paste(c("[", ..., "]"), collapse = "") 240 | if (rnk == 1) 241 | pDi(di[1L], "(1d)") 242 | else pDi(paste0(di[-rnk], ", "), di[rnk]) 243 | } 244 | 245 | mod <- "num" 246 | 247 | le.str <- arrLenstr(object) 248 | if (m <- match("AsIs", cl, 0L)) 249 | oldClass(object) <- cl[-m] 250 | std.attr <- "dim" 251 | 252 | cl <- cl[1L] 253 | if (cl != mod && substr(cl, 1L, nchar(mod)) != mod) 254 | mod <- paste0("'", cl, "' ", mod) 255 | std.attr <- c(std.attr, "class") 256 | 257 | str1 <- paste0(" ", mod, " ", le.str) 258 | 259 | iv.len <- round(2.5 * v.len) 260 | 261 | ob <- { 262 | if (le > iv.len) 263 | as.matrix(object)[seq_len(iv.len)] 264 | else as.matrix(object) 265 | } 266 | 267 | ao <- abs(ob <- unclass(ob[!is.na(ob)])) 268 | 269 | v.len <- { 270 | if ((all(ao > 1e-10 | ao == 0) && all(ao < 1e+10 | ao == 0) && all(abs(ob - signif(ob, digits.d)) <= 9e-16 * ao))) 271 | iv.len 272 | else 273 | round(1.25 * v.len) 274 | } 275 | 276 | format.fun <- formatNum 277 | 278 | if (!exists("format.fun")) { 279 | format.fun <- format 280 | } 281 | 282 | ile <- min(v.len, le) 283 | formObj <- function(x) maybe_truncate(paste(format.fun(x), collapse = " "), S = "") 284 | 285 | cat(if (give.head) paste0(str1, " "), 286 | formObj( 287 | if (ile >= 1 && mod != "...") as.matrix(object)[seq_len(ile)] 288 | else if (v.len > 0) object), 289 | if (le > v.len) " ...", "\n", sep = "") 290 | 291 | if (give.attr) { 292 | nam <- names(a) 293 | give.L <- give.length || identical(attr(give.length, "from"), "data.frame") 294 | for (i in seq_along(a)) if (all(nam[i] != std.attr)) { 295 | cat(indent.str, paste0("- attr(*, \"", nam[i], "\")="), 296 | sep = "") 297 | strSub(a[[i]], give.length = give.L, 298 | indent.str = paste(indent.str, ".."), 299 | nest.lev = nest.lev + 1) 300 | } 301 | } 302 | 303 | invisible() 304 | } 305 | 306 | -------------------------------------------------------------------------------- /R/clarify_predict.R: -------------------------------------------------------------------------------- 1 | clarify_predict <- function(x, newdata = NULL, group = NULL, type = NULL) { 2 | ord_mean <- identical(type, "mean") && isTRUE(insight::model_info(x)$is_ordinal) 3 | 4 | if (ord_mean) { 5 | type <- NULL 6 | group <- NULL 7 | } 8 | 9 | args <- list(model = x, newdata = newdata, vcov = FALSE) 10 | args$type <- type 11 | 12 | p <- try(do.call(marginaleffects::get_predict, args), silent = TRUE) 13 | 14 | if (length(p) == 0L || is_error(p)) { 15 | .err("predicted values could not be extracted from the model") 16 | } 17 | 18 | if (ord_mean) { 19 | p <- .get_ordinal_mean_preds(p) 20 | } 21 | else if (!is.null(group) && "group" %in% names(p)) { 22 | p <- .subset_group(p, group) 23 | } 24 | 25 | p 26 | } 27 | 28 | .subset_group <- function(pred, group = NULL) { 29 | if (is.null(group)) pred 30 | else pred[pred$group == group, , drop = FALSE] 31 | } 32 | 33 | .get_p <- function(pred) { 34 | if ("estimate" %in% names(pred)) pred[["estimate"]] 35 | else pred[["predicted"]] 36 | } 37 | 38 | .get_ordinal_mean_preds <- function(p) { 39 | ids <- unique(p$rowid) 40 | groups <- unique(p$group) 41 | m <- matrix(p$estimate, nrow = length(ids), ncol = length(groups)) 42 | 43 | if (anyNA(groups)) { 44 | nas <- is.na(groups) 45 | gn <- rep(NA_real_, length(groups)) 46 | 47 | if (!anyNA(suppressWarnings(g <- as.numeric(groups[!nas])))) { 48 | gn[!nas] <- g 49 | } 50 | else { 51 | gn[!nas] <- seq_along(g) 52 | } 53 | } 54 | else { 55 | if (!anyNA(suppressWarnings(g <- as.numeric(groups)))) { 56 | groups <- g 57 | } 58 | else { 59 | 
groups <- seq_along(g) 60 | } 61 | } 62 | 63 | data.frame(rowid = ids, 64 | estimate = drop(m %*% groups)) 65 | } 66 | -------------------------------------------------------------------------------- /R/get_model_components.R: -------------------------------------------------------------------------------- 1 | #Functions for extracting information from models 2 | 3 | # Get the coefficients from a model as a vector 4 | get_coefs <- function(fit) { 5 | 6 | b <- try(marginaleffects::get_coef(fit), silent = TRUE) 7 | 8 | if (!check_valid_coef(b)) { 9 | .err("`sim()` was unable to extract a valid set of coefficients from the model fit; please supply coefficients to the `coefs` argument and a covariance matrix to the `vcov` argument") 10 | } 11 | 12 | b 13 | } 14 | 15 | # Get the covariance from a model 16 | get_vcov <- function(fit, vcov = NULL) { 17 | v <- try(marginaleffects::get_vcov(fit, vcov), silent = TRUE) 18 | 19 | if (!check_valid_vcov(v)) { 20 | .err("`sim()` was unable to extract a valid covariance matrix from the model fit; please supply a covariance matrix to the `vcov` argument") 21 | } 22 | 23 | v 24 | } 25 | 26 | # Get the model degrees of freedom 27 | ## Assesses whether the model is linear and fit with OLS; if not, 28 | ## returns Inf. Linear models fit with MLE get Inf. 29 | get_df <- function(fit) { 30 | 31 | if (!insight::is_model_supported(fit)) { 32 | return(Inf) 33 | } 34 | 35 | statistic <- insight::find_statistic(fit) 36 | 37 | if (identical(statistic, "chi-squared statistic")) { 38 | return(Inf) 39 | } 40 | 41 | insight::get_df(fit, type = "wald", statistic = statistic) 42 | } 43 | -------------------------------------------------------------------------------- /R/misim.R: -------------------------------------------------------------------------------- 1 | #' @title Simulate model coefficients after multiple imputation 2 | #' 3 | #' @description `misim()` simulates model parameters from multivariate normal or t distributions after multiple imputation that are then used by [sim_apply()] to calculate quantities of interest. 4 | #' 5 | #' @param fitlist a list of model fits, one for each imputed dataset, or a `mira` object (the output of a call to `with()` applied to a `mids` object in `mice`). 6 | #' @param n the number of simulations to run for each imputed dataset; default is 1000. More is always better but resulting calculations will take longer. 7 | #' @param vcov a square covariance matrix of the coefficient covariance estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::vcov()] or [insight::get_varcov()] if that doesn't work. 8 | #' @param coefs a vector of coefficient estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::coef()] or [insight::get_parameters()] if that doesn't work. 9 | #' @param dist a character vector containing the name of the multivariate distribution(s) to use to draw simulated coefficients. Should be one of `"normal"` (multivariate normal distribution) or `"t_{#}"` (multivariate t distribution), where `{#}` corresponds to the desired degrees of freedom (e.g., `"t_100"`). If `NULL`, the right distributions to use will be determined based on heuristics; see [sim()] for details. 
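#'   As a sketch of valid values (assuming two imputed datasets), `dist = "t_100"`
#'   requests a multivariate t distribution with 100 degrees of freedom for every
#'   imputation, while `dist = c("normal", "t_100")` requests a different
#'   distribution for each one.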
#'
#' @return
#' A `clarify_misim` object, which inherits from `clarify_sim` and has the following components:
#' \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation for each imputation}
#' \item{coefs}{a matrix containing the original coefficients extracted from `fitlist` or supplied to `coefs`, with a row per imputation.}
#' \item{fit}{the list of model fits supplied to `fitlist`}
#' \item{imp}{an identifier of which imputed dataset each set of simulated coefficients corresponds to.}
#' The `"dist"` attribute contains `"normal"` if the coefficients were sampled from a multivariate normal distribution and `"t({df})"` if sampled from a multivariate t distribution. The `"clarify_hash"` attribute contains a unique hash generated by [rlang::hash()].
#'
#' @details
#' `misim()` essentially combines multiple `sim()` calls applied to a list of model fits, each fit in an imputed dataset, into a single combined pool of simulated coefficients. When simulation-based inference is to be used with multiply imputed data, many imputations are required; see Zhou and Reiter (2010).
#'
#' @references
#'
#' Zhou, X., & Reiter, J. P. (2010). A Note on Bayesian Inference After Multiple Imputation. *The American Statistician*, 64(2), 159–163. \doi{10.1198/tast.2010.09109}
#'
#' @examplesIf requireNamespace("Amelia", quietly = TRUE)
#' data("africa", package = "Amelia")
#'
#' # Multiple imputation using Amelia
#' a.out <- Amelia::amelia(x = africa, m = 10,
#'                         cs = "country",
#'                         ts = "year", logs = "gdp_pc",
#'                         p2s = 0)
#'
#' fits <- with(a.out, lm(gdp_pc ~ infl * trade))
#'
#' # Simulate coefficients
#' s <- misim(fits)
#' s
#'
#' @seealso
#' * [sim()] for simulating model coefficients for a single dataset
#' * [sim_apply()] for applying a function to each set of simulated coefficients
#' * [sim_ame()] for computing average marginal effects in each simulation draw
#' * [sim_setx()] for computing marginal predictions and first differences at typical values in each simulation draw
#' @export
#'
misim <- function(fitlist,
                  n = 1e3,
                  vcov = NULL,
                  coefs = NULL,
                  dist = NULL) {

  if (missing(fitlist)) fitlist <- NULL

  if (inherits(fitlist, "mira")) {
    fitlist <- fitlist$analyses
  }

  if (is.null(fitlist)) {
    if (is.null(coefs) || is.null(vcov)) {
      .err("when `fitlist` is not supplied, arguments must be supplied to both `coefs` and `vcov`")
    }
    if (!is.list(coefs) && !is.list(vcov)) {
      .err("when `fitlist` is not supplied, at least one of `coefs` or `vcov` must be a list")
    }
    nimp <- if (!is.list(coefs)) length(vcov) else length(coefs)
  }
  else {
    check_fitlist(fitlist)
    nimp <- length(fitlist)
  }

  chk::chk_count(n)

  if (!is.list(coefs)) {
    coefs <- lapply(seq_len(nimp), function(i) coefs)
  }
  else if (length(coefs) != nimp) {
    if (is.null(fitlist)) {
      .err("when `fitlist` is not supplied and `coefs` is supplied as a list, `coefs` must have as many entries as there are entries in `vcov`")
    }
    else {
      .err("when supplied as a list, `coefs` must have as many entries as there are models in `fitlist`")
    }
  }

  coef_supplied <- {
    if (all(vapply(coefs, is.null, logical(1L)))) "null"
    else
      if (all(vapply(coefs, is.function, logical(1L)))) "fun"
    else if (all(vapply(coefs, check_valid_coef, logical(1L)))) "num"
    else {
      .err("`coefs` must be a vector of coefficients, a function that extracts one from each model in `fitlist`, or a list thereof")
    }
  }

  if (!is.list(vcov)) {
    vcov <- lapply(seq_len(nimp), function(i) vcov)
  }
  else if (length(vcov) != nimp) {
    if (is.null(fitlist)) {
      .err("when `fitlist` is not supplied and `vcov` is supplied as a list, `vcov` must have as many entries as there are entries in `coefs`")
    }
    else {
      .err("when supplied as a list, `vcov` must have as many entries as there are models in `fitlist`")
    }
  }

  vcov_supplied <- {
    if (all(vapply(vcov, is.null, logical(1L)))) "null"
    else if (all(vapply(vcov, is.matrix, logical(1L)))) "num"
    else "marginaleffects_code"
  }

  for (i in seq_len(nimp)) {
    coefs[[i]] <- process_coefs(coefs[[i]], fitlist[[i]], coef_supplied)
  }

  for (i in seq_len(nimp)) {
    vcov[[i]] <- process_vcov(vcov[[i]], fitlist[[i]], vcov_supplied)
  }

  check_coefs_vcov_length_mi(vcov, coefs, vcov_supplied, coef_supplied)

  if (!is.null(dist)) {
    if (length(dist) == 1) {
      dist <- lapply(seq_len(nimp), function(i) dist)
    }
    else if (length(dist) != nimp) {
      .err("when supplied as a vector, `dist` must have as many values as there are imputations")
    }
    else {
      dist <- as.list(dist)
    }
  }

  samplers <- lapply(seq_len(nimp), function(i) {
    get_sampling_dist(fitlist[[i]], dist[[i]])
  })

  sim.coefs <- do.call("rbind", lapply(seq_len(nimp), function(i) {
    samplers[[i]](n, coefs[[i]], vcov[[i]])
  }))

  out <- list(sim.coefs = sim.coefs,
              coefs = do.call("rbind", coefs),
              fit = fitlist,
              imp = rep(seq_len(nimp), each = n))

  dists <- unlist(lapply(samplers, attr, "dist"))
  if (all_the_same(dists)) dists <- dists[1]

  attr(out, "dist") <- dists
  attr(out, "use_fit") <- !is.null(fitlist)
  attr(out, "sim_hash") <- rlang::hash(out$sim.coefs)
  class(out) <- c("clarify_misim", "clarify_sim")

  out
}

#' @export
print.clarify_misim <- function(x, ...) {
  obj <- deparse1(substitute(x))
  cat("A `clarify_misim` object\n")
  cat(sprintf(" - %s coefficients, %s imputations with %s simulated values each\n",
              ncol(x$sim.coefs), nrow(x$coefs), nrow(x$sim.coefs) / nrow(x$coefs)))
  cat(" - sampled distributions: ")
  if (length(attr(x, "dist")) == 1) {
    cat(sprintf("multivariate %s\n", attr(x, "dist")))
  }
  else {
    cat("multiple different multivariate distributions")
    if (exists(obj)) {
      cat(sprintf(" (use `attr(%s, \"dist\")` to view them)\n", obj))
    }
    else {
      cat("\n")
    }
  }

  invisible(x)
}
--------------------------------------------------------------------------------
/R/plot.clarify_adrf.R:
--------------------------------------------------------------------------------
#' Plot marginal predictions from `sim_adrf()`
#'
#' `plot.clarify_adrf()` plots the output of [sim_adrf()].
For the average dose-response function (ADRF, requested with `contrast = "adrf"` in `sim_adrf()`), this is a plot of the average marginal mean of the outcome against the requested values of the focal predictor; for the average marginal effects function (AMEF, requested with `contrast = "amef"` in `sim_adrf()`), this is a plot of the instantaneous average marginal effect of the focal predictor on the outcome against the requested values of the focal predictor. 4 | #' 5 | #' @inheritParams plot.clarify_est 6 | #' @param x a `clarify_adrf` object resulting from a call to [sim_adrf()]. 7 | #' @param ci `logical`; whether to display confidence bands for the estimates. Default is `TRUE`. 8 | #' @param method the method used to compute confidence bands. Can be `"wald"` to use a Normal approximation or `"quantile"` to use the simulated sampling distribution (default). See [summary.clarify_est()] for details. Abbreviations allowed. 9 | #' @param baseline `logical`; whether to include a horizontal line at `y = 0` on the plot. Default is `FALSE` for the ADRF (since 0 might not be in the range of the outcome) and `TRUE` for the AMEF. 10 | #' @param color the color of the line and confidence band in the plot. 11 | #' 12 | #' @return A `ggplot` object. 13 | #' 14 | #' @details These plots are produced using [ggplot2::geom_line()] and [ggplot2::geom_ribbon()]. The confidence bands should be interpreted pointwise (i.e., they do not account for simultaneous inference). 15 | #' 16 | #' @seealso [summary.clarify_est()] for computing p-values and confidence intervals for the estimated quantities. 17 | #' 18 | #' @examples 19 | #' ## See help("sim_adrf") for examples 20 | #' 21 | #' @exportS3Method plot clarify_adrf 22 | plot.clarify_adrf <- function(x, 23 | ci = TRUE, 24 | level = .95, 25 | method = "quantile", 26 | baseline, 27 | color = "black", 28 | ...) 
{

  at <- attr(x, "at")
  var <- attr(x, "var")
  contrast <- attr(x, "contrast")
  by <- attr(x, "by")

  if (missing(baseline)) {
    baseline <- !is.null(contrast) && contrast == "amef"
  }
  else {
    chk::chk_flag(baseline)
  }

  s <- {
    if (ci)
      as.data.frame(summary.clarify_est(x, level = level, method = method))
    else
      data.frame(Estimate = coef(x))
  }

  if (!is.null(by)) {
    s$by_var <- factor(.extract_by_values(x))
    if (nlevels(s$by_var) == 1)
      by <- NULL
  }

  p <- ggplot(mapping = aes(x = at))

  if (baseline) {
    p <- p + geom_hline(yintercept = 0)
  }

  if (is.null(by)) {
    p <- p + geom_line(aes(y = s$Estimate),
                       color = color) +
      labs(x = var, y = "E[Y|X]")
  }
  else {
    p <- p + geom_line(aes(y = s$Estimate, color = s$by_var)) +
      labs(x = var, y = "E[Y|X]", color = paste(by, collapse = ", "))
  }

  if (ci) {
    if (is.null(by)) {
      p <- p +
        geom_ribbon(aes(ymin = s[[2]], ymax = s[[3]]),
                    alpha = .3, fill = color)
    }
    else {
      p <- p +
        geom_ribbon(aes(ymin = s[[2]], ymax = s[[3]],
                        fill = s$by_var),
                    alpha = .3) +
        labs(fill = paste(by, collapse = ", "))
    }
  }

  p + labs(x = var,
           y = if (!is.null(attr(x, "contrast")))
             switch(attr(x, "contrast"),
                    "adrf" = sprintf("E[Y(%s)]", var),
                    "amef" = sprintf("E[dY/d(%s)]", var))) +
    theme_bw()
}

.extract_by_values <- function(obj) {
  x <- names(obj)

  if (identical(attr(obj, "contrast"), "amef"))
    pattern <- "\\,([^]]+)\\]"
  else
    pattern <- "\\|([^]]+)\\]"

  matches <- regexpr(pattern, x, perl = TRUE)
  out <- regmatches(x, matches)

  substr(out, 2, nchar(out) - 1)
}
--------------------------------------------------------------------------------
/R/plot.clarify_est.R:
--------------------------------------------------------------------------------
#' @exportS3Method plot clarify_est
#' @rdname summary.clarify_est
plot.clarify_est <- function(x,
                             parm,
                             ci = TRUE,
                             level = .95,
                             method = "quantile",
                             reference = FALSE,
                             ncol = 3,
                             ...) {

  chk::chk_flag(ci)
  chk::chk_flag(reference)

  original_est <- coef(x)
  est_names <- names(x)

  parm <- process_parm(x, parm)
  if (anyNA(parm)) {
    .err("`parm` must be a numeric or character vector identifying the estimates to plot")
  }

  est_names <- est_names[parm]

  est_long <- setNames(utils::stack(as.data.frame(as.matrix(x))[est_names]),
                       c("val", "est"))
  original_est_long <- setNames(utils::stack(original_est[est_names]),
                                c("val", "est"))

  p <- ggplot() +
    geom_density(data = est_long, mapping = aes(x = .data$val),
                 color = "black", fill = "gray90",
                 ...)
+ 34 | geom_hline(yintercept = 0) + 35 | geom_vline(data = original_est_long, mapping = aes(xintercept = .data$val)) + 36 | facet_wrap(vars(.data$est), scales = "free", ncol = min(ncol, nlevels(original_est_long$est))) 37 | 38 | if (ci) { 39 | ci <- confint(x, parm = parm, level = level, 40 | method = method) 41 | 42 | ci_long <- setNames(utils::stack(as.data.frame(t(ci))), c("val", "est")) 43 | p <- p + geom_vline(data = ci_long, mapping = aes(xintercept = .data$val), 44 | linetype = 2) 45 | } 46 | 47 | if (reference) { 48 | #Add normal density and mean line 49 | ref_means_and_medians <- data.frame( 50 | est = factor(levels(est_long$est), levels = levels(est_long$est)), 51 | mean = tapply(est_long$val, est_long$est, mean), 52 | height = dnorm(0, 0, tapply(est_long$val, est_long$est, sd)), 53 | median = tapply(est_long$val, est_long$est, median)) 54 | 55 | p <- p + geom_density(data = est_long, mapping = aes(x = .data$val), 56 | stat = StatNormal, color = "red") + 57 | geom_segment(aes(x = .data$mean, xend = .data$mean, 58 | y = 0, yend = .data$height), 59 | data = ref_means_and_medians, color = "red") + 60 | geom_segment(aes(x = .data$median, xend = .data$median, 61 | y = 0, yend = .2 * .data$height), 62 | data = ref_means_and_medians, color = "blue") 63 | } 64 | 65 | p + 66 | labs(x = "Estimate", y = "Density") + 67 | theme_bw() + 68 | theme(panel.grid = element_blank()) 69 | } 70 | 71 | #Stat for normal reference density 72 | StatNormal <- ggplot2::ggproto("StatNormal", ggplot2::Stat, 73 | required_aes = "x|y", 74 | default_aes = aes(x = ggplot2::after_stat(density), 75 | y = ggplot2::after_stat(density), 76 | fill = NA, weight = NULL), 77 | setup_params = function(data, params) { 78 | params$flipped_aes <- ggplot2::has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE) 79 | 80 | has_x <- !(is.null(data$x) && is.null(params$x)) 81 | has_y <- !(is.null(data$y) && is.null(params$y)) 82 | if (!has_x && !has_y) { 83 | rlang::abort("stat_normal() requires an x or y aesthetic.") 84 | } 85 | 86 | params 87 | }, 88 | extra_params = c("na.rm", "orientation"), 89 | compute_group = function(data, scales, n = 512, trim = FALSE, 90 | na.rm = FALSE, flipped_aes = FALSE) { 91 | data <- ggplot2::flip_data(data, flipped_aes) 92 | if (trim) { 93 | range <- range(data$x, na.rm = TRUE) 94 | } else { 95 | range <- scales[[flipped_names(flipped_aes)$x]]$dimension() 96 | } 97 | 98 | density <- compute_norm_dens(data$x, w = data$weight, from = range[1], 99 | to = range[2], n = n) 100 | density$flipped_aes <- flipped_aes 101 | ggplot2::flip_data(density, flipped_aes) 102 | } 103 | ) 104 | 105 | compute_norm_dens <- function(x, w, from, to, n = 512) { 106 | nx <- length(x) 107 | if (is.null(w)) { 108 | w <- rep(1, nx) 109 | } 110 | 111 | nax <- is.na(x) 112 | naw <- is.na(w) 113 | 114 | x <- x[!nax & !naw] 115 | w <- w[!nax & !naw] 116 | 117 | # if less than 2 points return data frame of NAs and a warning 118 | if (nx < 2) { 119 | rlang::warn("Groups with fewer than two data points have been dropped.") 120 | return(data.frame( 121 | x = NA_real_, 122 | density = NA_real_, 123 | scaled = NA_real_, 124 | ndensity = NA_real_, 125 | count = NA_real_, 126 | n = NA_integer_ 127 | )) 128 | } 129 | 130 | covw <- cov.wt(as.matrix(x), w) 131 | s <- sqrt(covw$cov) 132 | m <- covw$center 133 | 134 | x <- seq(from, to, length.out = n) 135 | y <- dnorm(x, m, s) 136 | 137 | data.frame( 138 | x = x, 139 | density = y, 140 | scaled = y / max(y, na.rm = TRUE), 141 | ndensity = y / max(y, na.rm = 
TRUE),
    count = y * nx,
    n = nx
  )
}
--------------------------------------------------------------------------------
/R/plot.clarify_setx.R:
--------------------------------------------------------------------------------
#' Plot marginal predictions from `sim_setx()`
#'
#' `plot.clarify_setx()` plots the output of [sim_setx()], providing graphics similar to those of [plot.clarify_est()] but with features specifically for plotting marginal predictions. For continuous predictors, this is a plot of the marginal predictions and their confidence bands across levels of the predictor. Otherwise, this is a plot of the simulated sampling distribution of the marginal predictions.
#'
#' @inheritParams plot.clarify_est
#' @param x a `clarify_est` object resulting from a call to [sim_setx()].
#' @param var the name of the focal varying predictor, i.e., the variable to be on the x-axis of the plot. All other variables with varying set values will be used to color the resulting plot. See Details. Ignored if no predictors vary, if only one predictor varies in the reference grid, or if `x1` was specified in `sim_setx()`. If not set, will use the predictor with the greatest number of unique values specified in the reference grid.
#' @param ci `logical`; whether to display confidence intervals or bands for the estimates. Default is `TRUE`.
#' @param method the method used to compute confidence intervals or bands. Can be `"wald"` to use a Normal approximation or `"quantile"` to use the simulated sampling distribution (default). See [summary.clarify_est()] for details. Abbreviations allowed.
#' @param reference `logical`; whether to overlay a normal density reference distribution over the plots. Default is `FALSE`. Ignored when variables other than the focal varying predictor vary.
#'
#' @return A `ggplot` object.
#'
#' @details `plot()` creates one of two kinds of plots depending on how the reference grid was specified in the call to `sim_setx()` and what `var` is set to. When the focal varying predictor (i.e., the one set in `var`) is numeric and takes on three or more unique values in the reference grid, the produced plot is a line graph displaying the value of the marginal prediction (denoted as `E[Y|X]`) across values of the focal varying predictor, with confidence bands displayed when `ci = TRUE`. If other predictors also vary, lines for different values will be displayed in different colors. These plots are produced using [ggplot2::geom_line()] and [ggplot2::geom_ribbon()].
#'
#' When the focal varying predictor is a factor or character or only takes on two or fewer values in the reference grid, the produced plot is a density plot of the simulated predictions, similar to the plot resulting from [plot.clarify_est()]. When other variables vary, densities for different values will be displayed in different colors. These plots are produced using [ggplot2::geom_density()].
#'
#' Marginal predictions are identified by the corresponding levels of the predictors that vary. The user should keep track of whether the non-varying predictors are set at specified or automatically set "typical" levels.
#'
#' @seealso [summary.clarify_est()] for computing p-values and confidence intervals for the estimated quantities.
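#'
#' A minimal usage sketch (the model, data, and variable names here are hypothetical):
#' \preformatted{
#' fit <- lm(y ~ x + g, data = dat)
#' s <- sim(fit)
#' est <- sim_setx(s, x = list(x = 0:10, g = c("a", "b")))
#'
#' # Line plot of the marginal predictions across `x`, colored by `g`
#' plot(est, var = "x", ci = TRUE)
#' }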
21 | #' 22 | #' @examples 23 | #' ## See help("sim_setx") for examples 24 | #' 25 | #' @export 26 | plot.clarify_setx <- function(x, 27 | var = NULL, 28 | ci = TRUE, 29 | level = .95, 30 | method = "quantile", 31 | reference = FALSE, 32 | ...) { 33 | 34 | newdata <- attr(x, "setx") 35 | 36 | if (nrow(newdata) == 1) { 37 | if (!is.null(var)) { 38 | .wrn("ignoring `var` because no variables vary over predictions") 39 | } 40 | return(plot.clarify_est(x, parm = 1, ci = ci, level = level, 41 | method = method, reference = reference, ...)) 42 | } 43 | 44 | if (isTRUE(attr(x, "fd"))) { 45 | if (!is.null(var)) { 46 | .wrn("ignoring `var`") 47 | } 48 | return(plot.clarify_est(x, parm = 1:3, ci = ci, level = level, 49 | method = method, reference = reference, ...)) 50 | } 51 | 52 | len_unique_newdata <- vapply(newdata, function(v) length(unique(v)), integer(1L)) 53 | varying <- names(newdata)[len_unique_newdata > 1] 54 | 55 | if (length(varying) == 1) { 56 | if (!is.null(var) && !identical(var, varying)) { 57 | .wrn("ignoring `var` because only one variable varies over predictions") 58 | } 59 | var <- varying 60 | } 61 | else if (is.null(var)) { 62 | if (any(len_unique_newdata[varying] > 2)) { 63 | var <- attr(newdata, "set_preds")[which.max(len_unique_newdata[attr(newdata, "set_preds")])] 64 | } 65 | else { 66 | var <- attr(newdata, "set_preds")[attr(newdata, "set_preds") %in% varying][1] 67 | } 68 | } 69 | else { 70 | chk::chk_string(var) 71 | if (!var %in% varying) { 72 | .err("`var` must be the name of a predictor set to be varying. Allowable options include ", word_list(varying, quotes = TRUE)) 73 | } 74 | } 75 | 76 | non_var_varying <- setdiff(varying, var) 77 | 78 | p <- { 79 | if (len_unique_newdata[var] == 2 || chk::vld_character_or_factor(newdata[[var]])) 80 | setx_sim_plot(x, var, non_var_varying, ci = ci, 81 | level = level, method = method, ...) 82 | else 83 | setx_reg_plot(x, var, non_var_varying, ci = ci, 84 | level = level, method = method) 85 | } 86 | 87 | p + theme_bw() + scale_fill_brewer(palette = "Set1") 88 | } 89 | 90 | #sim_plot, but with grouping by non_var_varying if present 91 | setx_sim_plot <- function(x, var, non_var_varying = NULL, ci = TRUE, level = .95, 92 | method = "quantile", ...) 
{ 93 | 94 | chk::chk_flag(ci) 95 | 96 | newdata <- attr(x, "setx") 97 | original_est <- coef(x) 98 | est_names <- rownames(newdata) 99 | 100 | est_long <- setNames(utils::stack(as.data.frame(x)[est_names]), 101 | c("val", "est")) 102 | est_long <- merge(est_long, 103 | newdata[c(var, non_var_varying)], 104 | by.x = "est", by.y = 0) 105 | est_long[[var]] <- paste0(var, " = ", add_quotes(est_long[[var]], chk::vld_character_or_factor(est_long[[var]]))) 106 | 107 | original_est_long <- setNames(utils::stack(original_est[est_names]), 108 | c("val", "est")) 109 | original_est_long <- merge(original_est_long, 110 | newdata[c(var, non_var_varying)], 111 | by.x = "est", by.y = 0) 112 | original_est_long[[var]] <- paste0(var, " = ", add_quotes(original_est_long[[var]], chk::vld_character_or_factor(original_est_long[[var]]))) 113 | 114 | if (length(non_var_varying) > 0) { 115 | non_var_varying_f <- do.call("paste", c(lapply(non_var_varying, function(i) { 116 | paste0(i, " = ", add_quotes(est_long[[i]], chk::vld_character_or_factor(est_long[[i]]))) 117 | }), list(sep = ", "))) 118 | non_var_varying_f <- factor(non_var_varying_f, levels = unique(non_var_varying_f)) 119 | 120 | non_var_varying_f_o <- do.call("paste", c(lapply(non_var_varying, function(i) { 121 | paste0(i, " = ", add_quotes(original_est_long[[i]], chk::vld_character_or_factor(original_est_long[[i]]))) 122 | }), list(sep = ", "))) 123 | non_var_varying_f_o <- factor(non_var_varying_f_o, levels = unique(non_var_varying_f_o)) 124 | } 125 | else { 126 | non_var_varying_f <- non_var_varying_f_o <- NULL 127 | } 128 | 129 | p <- ggplot() + 130 | geom_density(data = est_long, mapping = aes(x = .data$val, color = non_var_varying_f, 131 | fill = non_var_varying_f), 132 | alpha = .3, ...) + 133 | geom_hline(yintercept = 0) + 134 | geom_vline(data = original_est_long, mapping = aes(xintercept = .data$val, 135 | color = non_var_varying_f_o)) + 136 | facet_wrap(vars(.data[[var]]), scales = "free") 137 | 138 | 139 | if (ci) { 140 | ci <- confint(x, level = level, method = method) 141 | ci_long <- setNames(utils::stack(as.data.frame(t(ci))), c("val", "est")) 142 | 143 | ci_long <- merge(ci_long, 144 | newdata[c(var, non_var_varying)], 145 | by.x = "est", by.y = 0) 146 | ci_long[[var]] <- paste0(var, " = ", add_quotes(ci_long[[var]], chk::vld_character_or_factor(ci_long[[var]]))) 147 | 148 | if (length(non_var_varying) > 0) { 149 | non_var_varying_f_ci <- do.call("paste", c(lapply(non_var_varying, function(i) { 150 | paste0(i, " = ", add_quotes(ci_long[[i]], chk::vld_character_or_factor(ci_long[[i]]))) 151 | }), list(sep = ", "))) 152 | non_var_varying_f_ci <- factor(non_var_varying_f_ci, levels = unique(non_var_varying_f_ci)) 153 | } 154 | else { 155 | non_var_varying_f_ci <- NULL 156 | } 157 | 158 | p <- p + geom_vline(data = ci_long, mapping = aes(xintercept = .data$val, 159 | color = non_var_varying_f_ci), 160 | linetype = 2) 161 | } 162 | 163 | p + 164 | scale_color_brewer(palette = "Set1") + 165 | labs(x = "Estimate", y = "Density", color = NULL, fill = NULL) + 166 | theme(panel.background = element_rect(fill = "white", color = "black"), 167 | panel.border = element_rect(color = "black", fill = NA)) 168 | } 169 | 170 | #Line plot with confidence bands 171 | setx_reg_plot <- function(x, var, non_var_varying = NULL, ci = TRUE, level = .95, method = "quantile") { 172 | 173 | newdata <- attr(x, "setx") 174 | 175 | if (length(non_var_varying)) { 176 | non_var_varying_f <- do.call("paste", c(lapply(non_var_varying, function(i) { 177 | paste0(i, " = ", 
add_quotes(newdata[[i]], chk::vld_character_or_factor(newdata[[i]]))) 178 | }), list(sep = ", "))) 179 | non_var_varying_f <- factor(non_var_varying_f, levels = unique(non_var_varying_f)) 180 | } 181 | else { 182 | non_var_varying_f <- NULL 183 | } 184 | 185 | s <- { 186 | if (ci) 187 | summary.clarify_est(x, level = level, method = method)[rownames(newdata), , drop = FALSE] 188 | else 189 | matrix(coef(x)[rownames(newdata)], ncol = 1, 190 | dimnames = list(rownames(newdata), "Estimate")) 191 | } 192 | 193 | s <- cbind(s, newdata) 194 | 195 | p <- ggplot(s, aes(x = .data[[var]], color = non_var_varying_f, 196 | fill = non_var_varying_f)) + 197 | geom_line(aes(y = .data$Estimate)) + 198 | scale_color_brewer(palette = "Set1") + 199 | labs(x = var, y = sprintf("E[Y|%s]", var), color = NULL, fill = NULL) 200 | 201 | if (ci) { 202 | p <- p + 203 | geom_ribbon(aes(ymin = .data[[colnames(s)[2]]], ymax = .data[[colnames(s)[3]]], 204 | color = NULL), 205 | alpha = .3) 206 | } 207 | 208 | p 209 | } 210 | -------------------------------------------------------------------------------- /R/sim.R: -------------------------------------------------------------------------------- 1 | #' Simulate model parameters 2 | #' 3 | #' @description `sim()` simulates model parameters from a multivariate normal or t distribution that are then used by [sim_apply()] to calculate quantities of interest. 4 | #' 5 | #' @param fit a model fit, such as the output of a call to [lm()] or [glm()]. Can be left unspecified if `coefs` and `vcov` are not functions. 6 | #' @param n the number of simulations to run; default is 1000. More is always better but resulting calculations will take longer. 7 | #' @param vcov either a square covariance matrix of the coefficient covariance estimates or a function to use to extract it from `fit`. By default, uses [stats::vcov()] or [insight::get_varcov()] if that doesn't work. 8 | #' @param coefs either a vector of coefficient estimates or a function to use to extract it from `fit`. By default, uses [stats::coef()] or [insight::get_parameters()] if that doesn't work. 9 | #' @param dist a string containing the name of the multivariate distribution to use to draw simulated coefficients. Should be one of `"normal"` (multivariate normal distribution) or `"t({#})"` (multivariate t distribution), where `{#}` corresponds to the desired degrees of freedom (e.g., `"t(100)"`). If `NULL`, the right distribution to use will be determined based on heuristics; see Details. 10 | #' 11 | #' @return 12 | #' A `clarify_sim` object, which has the following components: 13 | #' \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation} 14 | #' \item{coefs}{the original coefficients extracted from `fit` or supplied to `coefs`.} 15 | #' \item{vcov}{the covariance matrix of the coefficients extracted from `fit` or supplied to `vcov`} 16 | #' \item{fit}{the original model fit supplied to `fit`} 17 | #' The `"dist"` attribute contains `"normal"` if the coefficients were sampled from a multivariate normal distribution and `"t(df)"` if sampled from a multivariate t distribution. The `"clarify_hash"` attribute contains a unique hash generated by [rlang::hash()]. 18 | #' 19 | #' @details When `dist` is `NULL`, `sim()` samples from a multivariate normal or t distribution depending on the degrees of freedom extracted from `insight::get_df(., type = "wald")`. 
If `Inf`, a normal distribution will be used; otherwise, a t-distribution with the returned degrees of freedom will be used. Models not supported by `insight` will use a normal distribution. 20 | #' 21 | #' When a multivariate normal is used, it is sampled from with means equal to the estimated coefficients and the parameter covariance matrix as the covariance matrix using [mvnfast::rmvn()]. When a multivariate t distribution is used, it is sampled from with means equal to the estimated coefficients and scaling matrix equal to `cov*(df - 2)/df`, where `cov` is the parameter covariance matrix and `df` is the residual degrees of freedom for the model, using [mvnfast::rmvt()]. 22 | #' 23 | #' @seealso 24 | #' * [misim()] for simulating model coefficients after multiple imputation 25 | #' * [sim_apply()] for applying a function to each set of simulated coefficients 26 | #' * [sim_ame()] for computing average marginal effects in each simulation draw 27 | #' * [sim_setx()] for computing marginal predictions and first differences at typical values in each simulation draw 28 | #' * [sim_adrf()] for computing average dose-response functions in each simulation draw 29 | #' 30 | #' @examples 31 | #' 32 | #' data("lalonde", package = "MatchIt") 33 | #' fit <- lm(re78 ~ treat * (age + race + nodegree + re74), data = lalonde) 34 | #' 35 | #' # Simulate coefficients 36 | #' s <- sim(fit) 37 | #' s 38 | #' 39 | #' ## Could also use a robust covariance matrix, e.g., 40 | #' s <- sim(fit, vcov = "HC3") 41 | #' 42 | #' # Simulated coefficients assuming a normal distribution 43 | #' # for coefficients; default for `lm` objects is a t- 44 | #' # distribution 45 | #' s <- sim(fit, dist = "normal") 46 | #' s 47 | #' 48 | #' @export 49 | sim <- function(fit, 50 | n = 1e3, 51 | vcov = NULL, 52 | coefs = NULL, 53 | dist = NULL) { 54 | 55 | if (missing(fit)) fit <- NULL 56 | 57 | if (!is.null(fit)) { 58 | if (!insight::is_regression_model(fit)) { 59 | .wrn("`fit` was not detected to be a regression model; proceed with caution") 60 | } 61 | # if (insight::is_mixed_model(fit)) { 62 | # .wrn("`sim()` may not fully support models with random effects; proceed with caution") 63 | # } 64 | } 65 | 66 | chk::chk_count(n) 67 | 68 | coef_supplied <- { 69 | if (is.null(coefs)) "null" 70 | else if (is.function(coefs)) "fun" 71 | else if (check_valid_coef(coefs)) "num" 72 | else { 73 | .err("`coefs` must be a vector of coefficients or a function that extracts one from `fit`") 74 | } 75 | } 76 | 77 | vcov_supplied <- { 78 | if (is.null(vcov)) "null" 79 | else if (is.matrix(vcov)) "num" 80 | else "marginaleffects_code" 81 | } 82 | 83 | coefs <- process_coefs(coefs, fit, coef_supplied) 84 | 85 | vcov <- process_vcov(vcov, fit, vcov_supplied) 86 | 87 | check_coefs_vcov_length(vcov, coefs, vcov_supplied, coef_supplied) 88 | 89 | sampler <- get_sampling_dist(fit, dist) 90 | 91 | out <- list(sim.coefs = sampler(n, coefs, vcov), 92 | coefs = coefs, 93 | vcov = vcov, 94 | fit = fit) 95 | 96 | attr(out, "dist") <- attr(sampler, "dist") 97 | attr(out, "use_fit") <- !is.null(fit) 98 | attr(out, "sim_hash") <- rlang::hash(out$sim.coefs) 99 | class(out) <- "clarify_sim" 100 | 101 | out 102 | } 103 | 104 | #' @export 105 | print.clarify_sim <- function(x, ...) 
{ 106 | cat("A `clarify_sim` object\n") 107 | cat(sprintf(" - %s coefficients, %s simulated values\n", ncol(x$sim.coefs), nrow(x$sim.coefs))) 108 | cat(sprintf(" - sampled distribution: multivariate %s\n", attr(x, "dist"))) 109 | if (!is.null(x$fit)) { 110 | cat(" - original fitting function call:\n\n") 111 | print(insight::get_call(x$fit)) 112 | } 113 | 114 | invisible(x) 115 | } 116 | 117 | #Returns a function that generates random variates, with arguments 118 | #`n`, `mu`, and `cov`; name of distribution is stored in attr(., "dist") 119 | get_sampling_dist <- function(fit = NULL, dist = NULL) { 120 | 121 | if (!is.null(dist)) { 122 | chk::chk_string(dist) 123 | dist <- tolower(dist) 124 | if (startsWith(dist, "t(") && endsWith(dist, ")")) { 125 | df <- substr(dist, 3, nchar(dist) - 1) 126 | if (nchar(df) == 0 || anyNA(suppressWarnings(df <- as.numeric(df))) || !chk::vld_number(df)) { 127 | .err("when `dist` is supplied as t({#}), `{#}` must be a number") 128 | } 129 | df <- as.numeric(df) 130 | dist <- "t" 131 | } 132 | else if (!anyNA(pmatch(dist, "normal"))) { 133 | dist <- "normal" 134 | } 135 | else { 136 | .err("`dist` must be \"normal\" or \"t({#})\", where `{#}` corresponds to the desired degrees of freedom") 137 | } 138 | } 139 | else if (is.null(fit)) { 140 | dist <- "normal" 141 | } 142 | else { 143 | df <- get_df(fit) 144 | 145 | if (any(is.finite(df)) && all(df > 0)) dist <- "t" 146 | else dist <- "normal" 147 | } 148 | 149 | f <- { 150 | if (dist == "t") 151 | function(n, mu, cov) { 152 | sigma <- cov * (df - 2) / df 153 | #Need pivoted cholesky for when cov isn't PSD (sometimes true for fixed effects models) 154 | ch <- suppressWarnings(chol(sigma, pivot = TRUE)) 155 | piv <- attr(ch, "pivot") 156 | x <- mvnfast::rmvt(n, mu = mu[piv], sigma = ch, isChol = TRUE, df = df, kpnames = TRUE) 157 | x[, order(piv), drop = FALSE] 158 | } 159 | else 160 | function(n, mu, cov) { 161 | #Need pivoted cholesky for when cov isn't PSD (sometimes true for fixed effects models) 162 | ch <- suppressWarnings(chol(cov, pivot = TRUE)) 163 | piv <- attr(ch, "pivot") 164 | x <- mvnfast::rmvn(n, mu = mu[piv], sigma = ch, isChol = TRUE, kpnames = TRUE) 165 | x[, order(piv), drop = FALSE] 166 | } 167 | } 168 | 169 | attr(f, "dist") <- if (dist == "t") sprintf("t(%s)", df) else dist 170 | 171 | f 172 | } 173 | 174 | #Extracts coefs based on given inputs 175 | process_coefs <- function(coefs, fit = NULL, coef_supplied) { 176 | if (coef_supplied == "null") { 177 | if (is.null(fit)) { 178 | .err("`coefs` must be supplied when `fit` is not specified") 179 | } 180 | coefs <- marginaleffects::get_coef(fit) 181 | if (!check_valid_coef(coefs)) { 182 | .err("a valid set of coefficients could not be extracted automatically; please supply coefficients to the `coefs` argument and a covariance matrix to the `vcov` argument") 183 | } 184 | } 185 | if (coef_supplied == "fun") { 186 | if (is.null(fit)) { 187 | .err("`fit` must be supplied when `coefs` is a function") 188 | } 189 | 190 | coefs <- try_chk(coefs(fit)) 191 | if (!check_valid_coef(coefs)) { 192 | .err("the output of the function supplied to `coefs` must be a numeric vector") 193 | } 194 | } 195 | else if (coef_supplied == "num") { 196 | #do nothing 197 | } 198 | 199 | if (anyNA(coefs) || any(!is.finite(coefs))) { 200 | .err("the coefficients cannot contain `NA` or non-finite values. 
This can occur with rank-deficient fits")
201 | }
202 |
203 | coefs
204 | }
205 |
206 | #Extracts vcov based on given inputs
207 | process_vcov <- function(vcov, fit = NULL, vcov_supplied) {
208 | if (vcov_supplied == "null") {
209 | if (is.null(fit)) {
210 | .err("`vcov` must be supplied when `fit` is not specified")
211 | }
212 | vcov <- marginaleffects::get_vcov(fit)
213 | if (!check_valid_vcov(vcov)) {
214 | .err("a valid covariance matrix could not be extracted automatically; please supply an argument to `vcov`")
215 | }
216 | }
217 | else if (vcov_supplied == "num") {
218 | if (!check_valid_vcov(vcov)) {
219 | .err("when supplied as a matrix, `vcov` must be a square, symmetric, numeric matrix")
220 | }
221 | }
222 | else {
223 | if (is.null(fit)) {
224 | .err("`fit` must be supplied when `vcov` is not supplied as a matrix")
225 | }
226 |
227 | vcov <- marginaleffects::get_vcov(fit, vcov)
228 | if (!check_valid_vcov(vcov)) {
229 | .err("a valid covariance matrix could not be extracted using the argument supplied to `vcov`")
230 | }
231 | }
232 |
233 | if (anyNA(vcov) || any(!is.finite(vcov))) {
234 | .err("the covariance matrix cannot contain `NA` or non-finite values. This can occur with rank-deficient fits")
235 | }
236 |
237 | vcov
238 | }
239 |
--------------------------------------------------------------------------------
/R/transform.clarify_est.R:
--------------------------------------------------------------------------------
1 | #' Transform and combine `clarify_est` objects
2 | #'
3 | #' @description
4 | #' `transform()` modifies a `clarify_est` object by allowing for the calculation of new quantities from the existing quantities without re-simulating them. `cbind()` binds two or more `clarify_est` objects together.
5 | #'
6 | #' @param _data the `clarify_est` object to be transformed.
7 | #' @param ... for `transform()`, arguments in the form `name = value`, where `name` is the name of a new quantity to be computed and `value` is an expression that computes the new quantity as a function of the existing quantities. See Details. For `cbind()`, `clarify_est` objects to be combined.
8 | #' @param deparse.level ignored.
9 | #'
10 | #' @details
11 | #' For `transform()`, the expression on the right side of the `=` should use the names of the existing quantities (e.g., `` `E[Y(1)]` - `E[Y(0)]` ``), with `` ` `` appropriately included when a quantity name includes parentheses or brackets. Alternatively, it can use indexes prefixed by `.b`, e.g., `.b2 - .b1`, to refer to the corresponding quantity by position. This can aid in computing derived quantities from quantities with complicated names. (Note that if a quantity is named something like `.b1`, it will need to be referred to by position rather than name, as the position-based label takes precedence.) See examples. Setting an existing quantity to `NULL` will remove that quantity from the object.
12 | #'
13 | #' `cbind()` does not rename the quantities or check for uniqueness of the names, so it is important to rename them yourself prior to combining the objects.
14 | #'
15 | #' @return
16 | #' A `clarify_est` object, either with new columns added (when using `transform()`) or with the supplied `clarify_est` objects combined (when using `cbind()`). Note that any type attributes corresponding to the `sim_apply()` wrapper used (e.g., `sim_ame()`) are lost when using either function. This can affect any helper functions (e.g., `plot()`) designed to work with the output of specific wrappers.
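#'
#' As a minimal illustration of the positional syntax described above (the object `est` and its quantity names here are hypothetical): if `est` contains quantities named `E[Y(0)]`, `E[Y(1)]`, and `RR`, in that order, then `transform(est, RD = .b2 - .b1)` computes the same new quantity as ``transform(est, RD = `E[Y(1)]` - `E[Y(0)]`)``, without the need to backtick-quote the bracketed names.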
17 | #' 18 | #' @seealso [transform()], [cbind()], [sim()] 19 | #' 20 | #' @examples 21 | #' data("lalonde", package = "MatchIt") 22 | #' 23 | #' # Fit the model 24 | #' fit <- lm(re78 ~ treat * (age + educ + race + 25 | #' married + re74 + re75), 26 | #' data = lalonde) 27 | #' 28 | #' # Simulate coefficients 29 | #' set.seed(123) 30 | #' s <- sim(fit, n = 100) 31 | #' 32 | #' # Average adjusted predictions for `treat` within 33 | #' # subsets of `race` 34 | #' est_b <- sim_ame(s, var = "treat", verbose = FALSE, 35 | #' subset = race == "black") 36 | #' est_b 37 | #' 38 | #' est_h <- sim_ame(s, var = "treat", verbose = FALSE, 39 | #' subset = race == "hispan") 40 | #' est_h 41 | #' 42 | #' # Compute differences between adjusted predictions 43 | #' est_b <- transform(est_b, 44 | #' diff = `E[Y(1)]` - `E[Y(0)]`) 45 | #' est_b 46 | #' 47 | #' est_h <- transform(est_h, 48 | #' diff = `E[Y(1)]` - `E[Y(0)]`) 49 | #' est_h 50 | #' 51 | #' # Bind estimates together after renaming 52 | #' names(est_b) <- paste0(names(est_b), "_b") 53 | #' names(est_h) <- paste0(names(est_h), "_h") 54 | #' 55 | #' est <- cbind(est_b, est_h) 56 | #' est 57 | #' 58 | #' # Compute difference in race-specific differences 59 | #' est <- transform(est, 60 | #' `diff-diff` = .b6 - .b3) 61 | #' 62 | #' summary(est, 63 | #' parm = c("diff_b", "diff_h", "diff-diff")) 64 | #' 65 | #' # Remove last quantity by using `NULL` 66 | #' transform(est, `diff-diff` = NULL) 67 | 68 | #' @exportS3Method transform clarify_est 69 | #' @name transform.clarify_est 70 | transform.clarify_est <- function(`_data`, ...) { 71 | 72 | # Process dots to substitute .b{#} for corresponding value 73 | dots <- substitute(list(...)) 74 | 75 | available_b <- sprintf(".b%s", seq_along(names(`_data`))) 76 | 77 | names_list <- setNames(lapply(add_quotes(names(`_data`), "`"), str2lang), 78 | available_b) 79 | 80 | for (i in seq_along(dots)[-1]) { 81 | if (!is.null(dots[[i]])) 82 | dots[[i]] <- do.call("substitute", list(dots[[i]], names_list)) 83 | } 84 | 85 | e <- try(eval(dots, as.data.frame(`_data`), parent.frame()), silent = TRUE) 86 | 87 | if (is_error(e)) .err(conditionMessage(attr(e, "condition")), tidy = FALSE) 88 | 89 | n <- nrow(`_data`) 90 | if (!all(vapply(e, function(e.) length(e.) == 0 || (length(e.) 
) == n && is.numeric(e.)), logical(1L)))) {
91 | .err("all transformations must be vectorized operations on the quantities in the original `clarify_est` object")
92 | }
93 |
94 | e_original <- eval(dots, as.list(attr(`_data`, "original")), parent.frame())
95 |
96 | inx <- match(names(e), names(`_data`))
97 | matched <- !is.na(inx)
98 |
99 | if (any(matched)) {
100 | nulls <- lengths(e[matched]) == 0
101 |
102 | if (any(!nulls)) {
103 | for (i in seq_along(e)[matched][!nulls]) {
104 | `_data`[, inx[i]] <- e[[i]]
105 | attr(`_data`, "original")[inx[i]] <- as.numeric(e_original[i])
106 | }
107 | }
108 |
109 | if (any(nulls)) {
110 | `_data` <- `_data`[-inx[matched][nulls]]
111 | }
112 | }
113 |
114 | if (!all(matched)) {
115 | nulls <- lengths(e[!matched]) == 0
116 | if (any(!nulls)) {
117 | new_e <- as.matrix(do.call("cbind", e[!matched][!nulls]))
118 | attr(new_e, "original") <- do.call("c", e_original[!matched][!nulls])
119 | attr(new_e, "sim_hash") <- attr(`_data`, "sim_hash")
120 | class(new_e) <- c("clarify_est", class(new_e))
121 | return(cbind.clarify_est(`_data`, new_e))
122 | }
123 | }
124 |
125 | `_data`
126 | }
127 |
128 | #' @exportS3Method cbind clarify_est
129 | #' @rdname transform.clarify_est
130 | cbind.clarify_est <- function(..., deparse.level = 1) {
131 | if (...length() == 0) return(NULL)
132 |
133 | for (i in seq_len(...length())) {
134 | if (!inherits(...elt(i), "clarify_est")) {
135 | .err("all supplied objects must be `clarify_est` objects, the outputs of calls to `sim_apply()` or its wrappers")
136 | }
137 | }
138 |
139 | obj <- list(...)
140 | hashes <- lapply(obj, attr, "sim_hash")
141 |
142 | if (any(lengths(hashes) == 0) || any(!vapply(hashes, chk::vld_string, logical(1L)))) {
143 | .err("all supplied objects must be unmodified `clarify_est` objects")
144 | }
145 | if (!all_the_same(unlist(hashes)) || !all_the_same(unlist(lapply(obj, nrow)))) {
146 | .err("all supplied objects must be the outputs of calls to `sim_apply()` or its wrappers applied to the same `clarify_sim` object")
147 | }
148 |
149 | out <- do.call("cbind", lapply(obj, drop_sim_class))
150 |
151 | attr(out, "original") <- do.call("c", lapply(obj, attr, "original"))
152 | attr(out, "sim_hash") <- hashes[[1]]
153 | class(out) <- c("clarify_est", class(out))
154 |
155 | out
156 | }
157 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | #Utilities
2 | word_list <- function(word.list = NULL, and.or = c("and", "or"), is.are = FALSE, quotes = FALSE) {
3 | #When given a vector of strings, creates a string of the form "a and b"
4 | #or "a, b, and c"
5 | #If is.are, adds "is" or "are" appropriately
6 | L <- length(word.list)
7 | word.list <- add_quotes(word.list, quotes)
8 |
9 | if (L == 0) {
10 | out <- ""
11 | attr(out, "plural") <- FALSE
12 | }
13 | else {
14 | word.list <- word.list[!word.list %in% c(NA_character_, "")]
15 | L <- length(word.list)
16 | if (L == 0) {
17 | out <- ""
18 | attr(out, "plural") <- FALSE
19 | }
20 | else if (L == 1) {
21 | out <- word.list
22 | if (is.are) out <- paste(out, "is")
23 | attr(out, "plural") <- FALSE
24 | }
25 | else {
26 | and.or <- match_arg(and.or)
27 | if (L == 2) {
28 | out <- paste(word.list, collapse = paste0(" ", and.or, " "))
29 | }
30 | else {
31 | out <- paste(paste(word.list[seq_len(L - 1)], collapse = ", "),
32 | word.list[L], sep = paste0(", ", and.or, " "))
33 |
34 | }
35 | if (is.are) out <- paste(out, "are")
36 | attr(out, "plural") <- TRUE
37 | }
38 |
39 | }
40 |
41 | out
42 | }
43 |
44 | #Add quotation marks around a string.
45 | add_quotes <- function(x, quotes = 2L) {
46 | if (!isFALSE(quotes)) {
47 | if (isTRUE(quotes)) quotes <- 2
48 |
49 | if (chk::vld_string(quotes)) x <- paste0(quotes, x, quotes)
50 | else if (chk::vld_whole_number(quotes)) {
51 | if (as.integer(quotes) == 0) return(x)
52 | else if (as.integer(quotes) == 1) x <- paste0("\'", x, "\'")
53 | else if (as.integer(quotes) == 2) x <- paste0("\"", x, "\"")
54 | else stop("`quotes` must be boolean, 1, 2, or a string.")
55 | }
56 | else {
57 | stop("`quotes` must be boolean, 1, 2, or a string.")
58 | }
59 | }
60 | x
61 | }
62 |
63 | #More informative and cleaner version of base::match.arg. From WeightIt with edits.
64 | match_arg <- function(arg, choices, several.ok = FALSE) {
65 | #Replaces match.arg() but gives cleaner error message and processing
66 | #of arg.
67 | if (missing(arg))
68 | stop("No argument was supplied to match_arg().")
69 | arg.name <- deparse1(substitute(arg))
70 |
71 | if (missing(choices)) {
72 | formal.args <- formals(sys.function(sysP <- sys.parent()))
73 | choices <- eval(formal.args[[as.character(substitute(arg))]],
74 | envir = sys.frame(sysP))
75 | }
76 |
77 | if (is.null(arg)) return(choices[1L])
78 | else if (!is.character(arg))
79 | stop(sprintf("The argument to `%s` must be NULL or a character vector", arg.name), call. = FALSE)
80 |
81 | if (!several.ok) {
82 | if (identical(arg, choices)) return(arg[1L])
83 | if (length(arg) > 1L) {
84 | stop(sprintf("The argument to `%s` must be of length 1", arg.name), call. = FALSE)
85 | }
86 | }
87 | else if (length(arg) == 0) {
88 | stop(sprintf("The argument to `%s` must be of length >= 1", arg.name), call. = FALSE)
89 | }
90 |
91 | i <- pmatch(arg, choices, nomatch = 0L, duplicates.ok = TRUE)
92 | if (all(i == 0L))
93 | stop(sprintf("The argument to `%s` should be %s%s.",
94 | arg.name,
95 | ngettext(length(choices), "", if (several.ok) "at least one of " else "one of "),
96 | word_list(choices, and.or = "or", quotes = 2)),
97 | call.
= FALSE) 98 | 99 | i <- i[i > 0L] 100 | 101 | choices[i] 102 | } 103 | 104 | #Format percentage for CI labels 105 | fmt.prc <- function(probs, digits = 3) { 106 | paste(format(100 * probs, trim = TRUE, scientific = FALSE, digits = digits), "%") 107 | } 108 | 109 | #Check if all values are the same 110 | all_the_same <- function(x) { 111 | if (is.list(x)) { 112 | for (i in x) if (!identical(i, x[[1]])) return(FALSE) 113 | return(TRUE) 114 | } 115 | 116 | if (is.numeric(x)) { 117 | return(abs(max(x) - min(x)) < 1e-9) 118 | } 119 | 120 | length(unique(x)) == 1 121 | } 122 | 123 | #Tidy tryCatching 124 | try_chk <- function(expr) { 125 | tryCatch(expr, 126 | error = function(e) .err(conditionMessage(e))) 127 | } 128 | 129 | #mode 130 | Mode <- function(v, na.rm = TRUE) { 131 | if (anyNA(v)) { 132 | if (na.rm) v <- v[!is.na(v)] 133 | else { 134 | #Return NA, keeping type of `v` 135 | v <- v[1] 136 | is.na(v) <- TRUE 137 | return(v) 138 | } 139 | } 140 | 141 | if (length(v) == 0) return(v) 142 | if (is.factor(v)) { 143 | if (nlevels(v) == 1) return(levels(v)[1]) 144 | mode <- levels(v)[which.max(tabulate(v, nbins = nlevels(v)))] 145 | mode <- factor(mode, levels = levels(v)) 146 | } 147 | else { 148 | uv <- unique(v) 149 | if (length(uv) == 1) return(uv) 150 | mode <- uv[which.max(tabulate(match(v, uv)))] 151 | } 152 | mode 153 | } 154 | 155 | #Recursively search a list for a value (key) and return location of value 156 | list.search <- function(x, key) { 157 | for (i in seq_along(x)) { 158 | if (identical(x[[i]], key)) { 159 | return(i) 160 | } 161 | 162 | if (is.list(x[[i]])) { 163 | l <- list.search(x[[i]], key) 164 | if (!is.null(l)) return(c(i, l)) 165 | } 166 | } 167 | 168 | NULL 169 | } 170 | 171 | #Checks if input is "try-error", i.e., failure of try() 172 | is_error <- function(x) { 173 | inherits(x, "try-error") 174 | } 175 | 176 | pkg_caller_call <- function(start = 1) { 177 | package.funs <- c(getNamespaceExports(utils::packageName()), 178 | .getNamespaceInfo(asNamespace(utils::packageName()), "S3methods")[, 3]) 179 | k <- start #skip checking pkg_caller_call() 180 | e_max <- start 181 | while (!is.null(e <- rlang::caller_call(k))) { 182 | if (!is.null(n <- rlang::call_name(e)) && 183 | n %in% package.funs) e_max <- k 184 | k <- k + 1 185 | } 186 | rlang::caller_call(e_max) 187 | } 188 | 189 | .err <- function(...) { 190 | chk::err(..., call = pkg_caller_call(start = 2)) 191 | } 192 | 193 | .wrn <- function(..., immediate = TRUE) { 194 | if (immediate && isTRUE(all.equal(getOption("warn"), 0))) { 195 | op <- options(warn = 1) 196 | on.exit(options(op)) 197 | } 198 | chk::wrn(...) 
199 | } 200 | 201 | drop_sim_class <- function(x) { 202 | class(x) <- class(x)[!startsWith(class(x), "clarify_")] 203 | x 204 | } 205 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | ###pkg load stuff 2 | utils::globalVariables(c(".b1", ".b2")) 3 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "80%" 13 | ) 14 | ``` 15 | 16 | # `clarify`: Simulation-Based Inference for Regression Models 17 | 18 | 19 | [![CRAN status](https://www.r-pkg.org/badges/version/clarify)](https://CRAN.R-project.org/package=clarify) 20 | 21 | 22 | `clarify` implements simulation-based inference for computing functions of model parameters, such as average marginal effects and predictions at representative values of the predictors. See the `clarify` [website](https://iqss.github.io/clarify/) for documentation and other examples. `clarify` was designed to replicate and expand on functionality previously provided by the `Zelig` package. 23 | 24 | ## Installation 25 | 26 | `clarify` can be installed from CRAN using 27 | 28 | ```{r, eval = F} 29 | install.packages("clarify") 30 | ``` 31 | 32 | You can install the development version of `clarify` from [GitHub](https://github.com/iqss/clarify) with 33 | 34 | ```{r, eval = F} 35 | install.packages("remotes") 36 | remotes::install_github("iqss/clarify") 37 | ``` 38 | 39 | ## Example 40 | 41 | Below is an example of performing g-computation for the average treatment effect on the treated (ATT) after logistic regression to compute the average causal risk ratio and its confidence interval. First we load the data (in this case the `lalonde` dataset from `MatchIt`) and fit a logistic regression using functions outside of `clarify`: 42 | 43 | ```{r, fig.width=7, fig.height=3} 44 | library(clarify) 45 | 46 | data("lalonde", package = "MatchIt") 47 | 48 | # Fit the model 49 | fit <- glm(I(re78 > 0) ~ treat + age + educ + race + married + 50 | nodegree + re74 + re75, 51 | data = lalonde, family = binomial) 52 | ``` 53 | 54 | Next, to estimate the ATT risk ratio, we simulate coefficients from their implied distribution and compute the effects of interest in each simulation, yielding a distribution of estimates that we can summarize and use for inference: 55 | 56 | ```{r example, fig.width=7, fig.height=3} 57 | # Simulate coefficients from a multivariate normal distribution 58 | set.seed(123) 59 | sim_coefs <- sim(fit) 60 | 61 | # Marginal risk ratio ATT, simulation-based 62 | sim_est <- sim_ame(sim_coefs, var = "treat", subset = treat == 1, 63 | contrast = "RR", verbose = FALSE) 64 | 65 | sim_est 66 | 67 | # View the estimates, confidence intervals, and p-values 68 | summary(sim_est, null = c(`RR` = 1)) 69 | 70 | # Plot the resulting sampling distributions 71 | plot(sim_est) 72 | ``` 73 | 74 | Below, we provide information on the framework `clarify` uses and some other examples. For a complete vignette, see `vignette("clarify")`. 75 | 76 | ## Introduction 77 | 78 | Simulation-based inference is an alternative to the delta method and bootstrapping for performing inference on quantities that are functions of model parameters. 
It involves simulating model coefficients from their multivariate distribution using their estimated values and covariance from a single model fit to the original data, computing the quantities of interest from each set of model coefficients, and then performing inference using the resulting distribution of the estimates as their sampling distribution. Confidence intervals can be computed using the percentiles of the resulting sampling distribution, and p-values can be computed by inverting the confidence intervals. Alternatively, if the resulting sampling distribution is normally distributed, its standard error can be estimated as the standard deviation of the estimates and normal-theory Wald confidence intervals and p-values can be computed. The methodology of simulation-based inference is explained in King, Tomz, and Wittenberg (2000).
79 |
80 | `clarify` was designed to provide a simple, general interface for simulation-based inference and includes a few convenience functions to perform common tasks like computing average marginal effects. The primary functions of `clarify` are `sim()`, `sim_apply()`, `summary()`, and `plot()`. These work together to create a simple workflow for simulation-based inference.
81 |
82 | * `sim()` simulates model parameters from a fitted model
83 | * `sim_apply()` applies an estimator to the simulated coefficients, or to the original object but with the new coefficients inserted
84 | * `summary()` produces confidence intervals and p-values for the resulting estimates
85 | * `plot()` produces plots of the simulated sampling distribution of the resulting estimates
86 |
87 | There are also several wrappers for `sim_apply()` that perform common operations: `sim_ame()` computes the average marginal effect of a variable, mirroring `marginaleffects::avg_predictions()` and `marginaleffects::avg_slopes()`; `sim_setx()` computes predictions at typical values of the covariates and differences between them, mirroring `Zelig::setx()` and `Zelig::setx1()`; and `sim_adrf()` computes average dose-response functions. `clarify` also offers support for models fit to multiply imputed data with the `misim()` function.
88 |
89 | In the example above, we used `sim_ame()` to compute the ATT, but we could have also done so manually using `sim_apply()`, as demonstrated below:
90 |
91 | ```{r example2, fig.width=7, fig.height=3}
92 | # Write a function that computes the g-computation estimate for the ATT
93 | ATT_fun <- function(fit) {
94 | d <- subset(lalonde, treat == 1)
95 | d$treat <- 1
96 | p1 <- mean(predict(fit, newdata = d, type = "response"))
97 | d$treat <- 0
98 | p0 <- mean(predict(fit, newdata = d, type = "response"))
99 | c(`E[Y(0)]` = p0, `E[Y(1)]` = p1, `RR` = p1 / p0)
100 | }
101 |
102 | # Apply that function to the simulated coefficients
103 | sim_est <- sim_apply(sim_coefs, ATT_fun, verbose = FALSE)
104 |
105 | sim_est
106 |
107 | # View the estimates, confidence intervals, and p-values;
108 | # they are the same as when using sim_ame() above
109 | summary(sim_est, null = c(`RR` = 1))
110 |
111 | # Plot the resulting sampling distributions
112 | plot(sim_est, reference = TRUE, ci = FALSE)
113 | ```
114 |
115 | The plot of the simulated sampling distribution indicates that the sampling distribution for the risk ratio is not normally distributed around the estimate, suggesting that the delta method may be a poor approximation and the asymmetric confidence intervals produced using the simulation may be more valid.
Note that the estimates are those computed from the original model coefficients; the distribution is used only for computing confidence intervals, in line with recommendations by Rainey (2023). 116 | 117 | If we want to compute the risk difference, we can do that using `transform()` on the already-produced output: 118 | 119 | ```{r} 120 | #Transform estimates into new quantities of interest 121 | sim_est <- transform(sim_est, `RD` = `E[Y(1)]` - `E[Y(0)]`) 122 | summary(sim_est, null = c(`RR` = 1, `RD` = 0)) 123 | ``` 124 | 125 | We can also use `clarify` to compute predictions and first differences at set and typical values of the predictors, mimicking the functionality of `Zelig`'s `setx()` and `setx1()` functions, using `sim_setx()`: 126 | 127 | ```{r, fig.width=7, fig.height=3} 128 | # Predictions across age and treat at typical values 129 | # of the other predictors 130 | sim_est <- sim_setx(sim_coefs, x = list(age = 20:50, treat = 0:1), 131 | verbose = FALSE) 132 | 133 | #Plot of predicted values across age for each value of treat 134 | plot(sim_est) 135 | ``` 136 | 137 | See `vignette("Zelig", package = "clarify")` for more examples of translating a `Zelig`-based workflow into one that uses `clarify` to estimate the same quantities of interest. 138 | 139 | `clarify` offers parallel processing for all estimation functions to speed up computation. Functionality is also available for the analysis of models fit to multiply imputed data. See `vignette("clarify")` for more details. 140 | 141 | ## References 142 | 143 | King, G., Tomz, M., & Wittenberg, J. (2000). Making the Most of Statistical Analyses: Improving Interpretation and Presentation. *American Journal of Political Science*, 44(2), 347–361. https://doi.org/10.2307/2669316 144 | 145 | Rainey, C. (2023). A careful consideration of CLARIFY: Simulation-induced bias in point estimates of quantities of interest. *Political Science Research and Methods*, 1–10. https://doi.org/10.1017/psrm.2023.8 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # `clarify`: Simulation-Based Inference for Regression Models 5 | 6 | 7 | 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/clarify)](https://CRAN.R-project.org/package=clarify) 10 | 11 | 12 | `clarify` implements simulation-based inference for computing functions 13 | of model parameters, such as average marginal effects and predictions at 14 | representative values of the predictors. See the `clarify` 15 | [website](https://iqss.github.io/clarify/) for documentation and other 16 | examples. `clarify` was designed to replicate and expand on 17 | functionality previously provided by the `Zelig` package. 18 | 19 | ## Installation 20 | 21 | `clarify` can be installed from CRAN using 22 | 23 | ``` r 24 | install.packages("clarify") 25 | ``` 26 | 27 | You can install the development version of `clarify` from 28 | [GitHub](https://github.com/iqss/clarify) with 29 | 30 | ``` r 31 | install.packages("remotes") 32 | remotes::install_github("iqss/clarify") 33 | ``` 34 | 35 | ## Example 36 | 37 | Below is an example of performing g-computation for the average 38 | treatment effect on the treated (ATT) after logistic regression to 39 | compute the average causal risk ratio and its confidence interval. 
First 40 | we load the data (in this case the `lalonde` dataset from `MatchIt`) and 41 | fit a logistic regression using functions outside of `clarify`: 42 | 43 | ``` r 44 | library(clarify) 45 | 46 | data("lalonde", package = "MatchIt") 47 | 48 | # Fit the model 49 | fit <- glm(I(re78 > 0) ~ treat + age + educ + race + married + 50 | nodegree + re74 + re75, 51 | data = lalonde, family = binomial) 52 | ``` 53 | 54 | Next, to estimate the ATT risk ratio, we simulate coefficients from 55 | their implied distribution and compute the effects of interest in each 56 | simulation, yielding a distribution of estimates that we can summarize 57 | and use for inference: 58 | 59 | ``` r 60 | # Simulate coefficients from a multivariate normal distribution 61 | set.seed(123) 62 | sim_coefs <- sim(fit) 63 | 64 | # Marginal risk ratio ATT, simulation-based 65 | sim_est <- sim_ame(sim_coefs, var = "treat", subset = treat == 1, 66 | contrast = "RR", verbose = FALSE) 67 | 68 | sim_est 69 | #> A `clarify_est` object (from `sim_ame()`) 70 | #> - Average adjusted predictions for `treat` 71 | #> - 1000 simulated values 72 | #> - 3 quantities estimated: 73 | #> E[Y(0)] 0.6830995 74 | #> E[Y(1)] 0.7567568 75 | #> RR 1.1078280 76 | 77 | # View the estimates, confidence intervals, and p-values 78 | summary(sim_est, null = c(`RR` = 1)) 79 | #> Estimate 2.5 % 97.5 % P-value 80 | #> E[Y(0)] 0.683 0.587 0.753 . 81 | #> E[Y(1)] 0.757 0.686 0.813 . 82 | #> RR 1.108 0.971 1.298 0.13 83 | 84 | # Plot the resulting sampling distributions 85 | plot(sim_est) 86 | ``` 87 | 88 | 89 | 90 | Below, we provide information on the framework `clarify` uses and some 91 | other examples. For a complete vignette, see `vignette("clarify")`. 92 | 93 | ## Introduction 94 | 95 | Simulation-based inference is an alternative to the delta method and 96 | bootstrapping for performing inference on quantities that are functions 97 | of model parameters. It involves simulating model coefficients from 98 | their multivariate distribution using their estimated values and 99 | covariance from a single model fit to the original data, computing the 100 | quantities of interest from each set of model coefficients, and then 101 | performing inference using the resulting distribution of the estimates 102 | as their sampling distribution. Confidence intervals can be computed 103 | using the percentiles of the resulting sampling distribution, and 104 | p-values can be computed by inverting the confidence intervals. 105 | Alternatively, if the resulting sampling distribution is normally 106 | distributed, its standard error can be estimated as the standard 107 | deviation of the estimates and normal-theory Wald confidence intervals 108 | and p-values can be computed. The methodology of simulation-based 109 | inference is explained in King, Tomz, and Wittenberg (2000). 110 | 111 | `clarify` was designed to provide a simple, general interface for 112 | simulation-based inference and includes a few convenience functions to 113 | perform common tasks like computing average marginal effects. The 114 | primary functions of `clarify` are `sim()`, `sim_apply()`, `summary()`, 115 | and `plot()`. These work together to create a simple workflow for 116 | simulation-based inference. 
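To make this workflow concrete, here is a minimal sketch that reuses the
`fit` object from the example above (the quantity computed, the average
predicted probability, is chosen only for illustration):

``` r
library(clarify)

# Step 1: simulate coefficients from the fitted model
s <- sim(fit, n = 1000)

# Step 2: apply an estimator to each set of simulated coefficients
est <- sim_apply(s, function(fit) {
  c(`E[Y]` = mean(predict(fit, type = "response")))
}, verbose = FALSE)

# Steps 3 and 4: summarize and plot the simulated sampling distribution
summary(est)
plot(est)
```

The roles of the individual functions are as follows: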
117 |
118 | - `sim()` simulates model parameters from a fitted model
119 | - `sim_apply()` applies an estimator to the simulated coefficients, or
120 | to the original object but with the new coefficients inserted
121 | - `summary()` produces confidence intervals and p-values for the
122 | resulting estimates
123 | - `plot()` produces plots of the simulated sampling distribution of the
124 | resulting estimates
125 |
126 | There are also several wrappers for `sim_apply()` that perform common
127 | operations: `sim_ame()` computes the average marginal effect of a
128 | variable, mirroring `marginaleffects::avg_predictions()` and
129 | `marginaleffects::avg_slopes()`; `sim_setx()` computes predictions at
130 | typical values of the covariates and differences between them, mirroring
131 | `Zelig::setx()` and `Zelig::setx1()`; and `sim_adrf()` computes average
132 | dose-response functions. `clarify` also offers support for models fit to
133 | multiply imputed data with the `misim()` function.
134 |
135 | In the example above, we used `sim_ame()` to compute the ATT, but we
136 | could have also done so manually using `sim_apply()`, as demonstrated
137 | below:
138 |
139 | ``` r
140 | # Write a function that computes the g-computation estimate for the ATT
141 | ATT_fun <- function(fit) {
142 | d <- subset(lalonde, treat == 1)
143 | d$treat <- 1
144 | p1 <- mean(predict(fit, newdata = d, type = "response"))
145 | d$treat <- 0
146 | p0 <- mean(predict(fit, newdata = d, type = "response"))
147 | c(`E[Y(0)]` = p0, `E[Y(1)]` = p1, `RR` = p1 / p0)
148 | }
149 |
150 | # Apply that function to the simulated coefficients
151 | sim_est <- sim_apply(sim_coefs, ATT_fun, verbose = FALSE)
152 |
153 | sim_est
154 | #> A `clarify_est` object (from `sim_apply()`)
155 | #> - 1000 simulated values
156 | #> - 3 quantities estimated:
157 | #> E[Y(0)] 0.6830995
158 | #> E[Y(1)] 0.7567568
159 | #> RR 1.1078280
160 |
161 | # View the estimates, confidence intervals, and p-values;
162 | # they are the same as when using sim_ame() above
163 | summary(sim_est, null = c(`RR` = 1))
164 | #> Estimate 2.5 % 97.5 % P-value
165 | #> E[Y(0)] 0.683 0.587 0.753 .
166 | #> E[Y(1)] 0.757 0.686 0.813 .
167 | #> RR 1.108 0.971 1.298 0.13
168 |
169 | # Plot the resulting sampling distributions
170 | plot(sim_est, reference = TRUE, ci = FALSE)
171 | ```
172 |
173 |
174 |
175 | The plot of the simulated sampling distribution indicates that the
176 | sampling distribution for the risk ratio is not normally distributed
177 | around the estimate, suggesting that the delta method may be a poor
178 | approximation and the asymmetric confidence intervals produced using the
179 | simulation may be more valid. Note that the estimates are those computed
180 | from the original model coefficients; the distribution is used only for
181 | computing confidence intervals, in line with recommendations by Rainey
182 | (2023).
183 |
184 | If we want to compute the risk difference, we can do that using
185 | `transform()` on the already-produced output:
186 |
187 | ``` r
188 | #Transform estimates into new quantities of interest
189 | sim_est <- transform(sim_est, `RD` = `E[Y(1)]` - `E[Y(0)]`)
190 | summary(sim_est, null = c(`RR` = 1, `RD` = 0))
191 | #> Estimate 2.5 % 97.5 % P-value
192 | #> E[Y(0)] 0.6831 0.5872 0.7528 .
193 | #> E[Y(1)] 0.7568 0.6859 0.8134 .
194 | #> RR 1.1078 0.9708 1.2976 0.13 195 | #> RD 0.0737 -0.0215 0.1757 0.13 196 | ``` 197 | 198 | We can also use `clarify` to compute predictions and first differences 199 | at set and typical values of the predictors, mimicking the functionality 200 | of `Zelig`’s `setx()` and `setx1()` functions, using `sim_setx()`: 201 | 202 | ``` r 203 | # Predictions across age and treat at typical values 204 | # of the other predictors 205 | sim_est <- sim_setx(sim_coefs, x = list(age = 20:50, treat = 0:1), 206 | verbose = FALSE) 207 | 208 | #Plot of predicted values across age for each value of treat 209 | plot(sim_est) 210 | ``` 211 | 212 | 213 | 214 | See `vignette("Zelig", package = "clarify")` for more examples of 215 | translating a `Zelig`-based workflow into one that uses `clarify` to 216 | estimate the same quantities of interest. 217 | 218 | `clarify` offers parallel processing for all estimation functions to 219 | speed up computation. Functionality is also available for the analysis 220 | of models fit to multiply imputed data. See `vignette("clarify")` for 221 | more details. 222 | 223 | ## References 224 | 225 | King, G., Tomz, M., & Wittenberg, J. (2000). Making the Most of 226 | Statistical Analyses: Improving Interpretation and Presentation. 227 | *American Journal of Political Science*, 44(2), 347–361. 228 | 229 | 230 | Rainey, C. (2023). A careful consideration of CLARIFY: 231 | Simulation-induced bias in point estimates of quantities of interest. 232 | *Political Science Research and Methods*, 1–10. 233 | 234 | -------------------------------------------------------------------------------- /_dev/sim_chain.R: -------------------------------------------------------------------------------- 1 | # # Function to chain simulations, i.e., to simulate values within each simulation. Intended use is for 2 | # # using outputs of first stage, which are estimated with uncertainty, in second stage. Turned out 3 | # # not to give valid results when tested with propensity score weighting. 4 | # sim_chain <- function(sim, FUN, n = 10, vcov = NULL, coefs = NULL, dist = NULL, verbose = TRUE, 5 | # cl = NULL, ...) 
{ 6 | # coef_template <- get_coef_template(sim$fit, sim$coefs) 7 | # coef_location <- get_coef_location(sim$fit, sim$coefs, coef_template) 8 | # 9 | # opb <- pbapply::pboptions(type = if (verbose) "timer" else "none") 10 | # on.exit(pbapply::pboptions(opb)) 11 | # 12 | # apply_FUN <- make_apply_FUN(FUN, coef_location, coef_template) 13 | # 14 | # # Test apply_FUN() on original model coefficients 15 | # test <- try(apply_FUN(fit = sim$fit, coefs = sim$coefs, ...), silent = TRUE) 16 | # if (is_error(test)) { 17 | # .err("`FUN` failed to run on an initial check with the following error:\n", 18 | # conditionMessage(attr(test, "condition"))) 19 | # } 20 | # test_sim <- sim(test, n = 1, vcov = vcov, coefs = coefs, dist = dist) 21 | # 22 | # if (is.null(names(test))) names(test) <- paste0("est", seq_along(test)) 23 | # 24 | # sim.list <- pbapply::pblapply(seq_len(nrow(sim$sim.coefs)), function(i) { 25 | # sim(apply_FUN(fit = sim$fit, coefs = sim$sim.coefs[i,], ...), 26 | # n = n, vcov = vcov, coefs = coefs, dist = dist) 27 | # }, cl = cl) 28 | # 29 | # out <- list(sim.coefs = do.call("rbind", lapply(sim.list, `[[`, "sim.coefs")), 30 | # coefs = test_sim$coefs, 31 | # fit = test) 32 | # 33 | # attr(out, "dist") <- attr(test_sim, "dist") 34 | # attr(out, "use_fit") <- TRUE 35 | # attr(out, "sim_hash") <- rlang::hash(out$sim.coefs) 36 | # class(out) <- "simbased_sim" 37 | # 38 | # out 39 | # } 40 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://iqss.github.io/clarify/ 2 | template: 3 | bootstrap: 5 4 | 5 | reference: 6 | - title: Main Functions 7 | - contents: 8 | - sim 9 | - sim_apply 10 | - summary.clarify_est 11 | - transform.clarify_est 12 | - title: Wrappers 13 | - contents: 14 | - sim_adrf 15 | - plot.clarify_adrf 16 | - sim_ame 17 | - sim_setx 18 | - plot.clarify_setx 19 | - misim 20 | -------------------------------------------------------------------------------- /clarify.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /clarify/Submission 1/RJreferences.bib: -------------------------------------------------------------------------------- 1 | 2 | @article{kingMakingMostStatistical2000, 3 | title = {Making the Most of Statistical Analyses: Improving Interpretation and Presentation}, 4 | author = {King, Gary and Tomz, Michael and Wittenberg, Jason}, 5 | year = {2000}, 6 | date = {2000}, 7 | journal = {American Journal of Political Science}, 8 | pages = {347--361}, 9 | volume = {44}, 10 | number = {2}, 11 | doi = {10.2307/2669316}, 12 | note = {tex.ids= kingMakingMostStatistical2000a 13 | publisher: [Midwest Political Science Association, Wiley]} 14 | } 15 | 16 | @article{zhouNoteBayesianInference2010, 17 | title = {A Note on Bayesian Inference After Multiple Imputation}, 18 | author = {Zhou, Xiang and Reiter, Jerome P.}, 19 | year = {2010}, 20 | month = {05}, 21 | date = {2010-05}, 22 | 
journal = {The American Statistician}, 23 | pages = {159--163}, 24 | volume = {64}, 25 | number = {2}, 26 | doi = {10.1198/tast.2010.09109}, 27 | langid = {en} 28 | } 29 | 30 | @article{tomzClarifySoftwareInterpreting2003, 31 | title = {Clarify: Software for Interpreting and Presenting Statistical Results}, 32 | author = {Tomz, Michael and Wittenberg, Jason and King, Gary}, 33 | year = {2003}, 34 | month = {01}, 35 | date = {2003-01-15}, 36 | journal = {Journal of Statistical Software}, 37 | pages = {1--30}, 38 | volume = {8}, 39 | doi = {10.18637/jss.v008.i01}, 40 | langid = {en} 41 | } 42 | 43 | @article{imaiCommonFrameworkStatistical2008a, 44 | title = {Toward a Common Framework for Statistical Analysis and Development}, 45 | author = {Imai, Kosuke and King, Gary and Lau, Olivia}, 46 | year = {2008}, 47 | month = {12}, 48 | date = {2008-12-01}, 49 | journal = {Journal of Computational and Graphical Statistics}, 50 | pages = {892--913}, 51 | volume = {17}, 52 | number = {4}, 53 | doi = {10.1198/106186008X384898} 54 | } 55 | 56 | @article{puhrFirthLogisticRegression2017, 57 | title = {Firth's logistic regression with rare events: accurate effect estimates and predictions?}, 58 | author = {Puhr, Rainer and Heinze, Georg and Nold, Mariana and Lusa, Lara and Geroldinger, Angelika}, 59 | year = {2017}, 60 | month = {06}, 61 | date = {2017-06-30}, 62 | journal = {Statistics in Medicine}, 63 | pages = {2302--2317}, 64 | volume = {36}, 65 | number = {14}, 66 | doi = {10.1002/sim.7273}, 67 | note = {Publisher: John Wiley & Sons, Ltd}, 68 | langid = {en} 69 | } 70 | 71 | @article{kingLogisticRegressionRare2001, 72 | title = {Logistic Regression in Rare Events Data}, 73 | author = {King, Gary and Zeng, Langche}, 74 | year = {2001}, 75 | date = {2001}, 76 | journal = {Political Analysis}, 77 | pages = {137--163}, 78 | volume = {9}, 79 | number = {2}, 80 | doi = {10.1093/oxfordjournals.pan.a004868}, 81 | langid = {en} 82 | } 83 | 84 | @article{raineyCarefulConsiderationCLARIFY2023, 85 | title = {A Careful Consideration of {{CLARIFY}}: Simulation-Induced Bias in Point Estimates of Quantities of Interest}, 86 | shorttitle = {A Careful Consideration of {{CLARIFY}}}, 87 | author = {Rainey, Carlisle}, 88 | year = {2023}, 89 | month = apr, 90 | journal = {Political Science Research and Methods}, 91 | pages = {1--10}, 92 | publisher = {{Cambridge University Press}}, 93 | issn = {2049-8470, 2049-8489}, 94 | doi = {10.1017/psrm.2023.8}, 95 | urldate = {2023-05-03}, 96 | langid = {english}, 97 | keywords = {Maximum likelihood estimation (MLE)}, 98 | } 99 | 100 | 101 | @article{rainey2017, 102 | title = {Transformation-Induced Bias: Unbiased Coefficients Do Not Imply Unbiased Quantities of Interest}, 103 | author = {Rainey, Carlisle}, 104 | year = {2017}, 105 | month = {07}, 106 | date = {2017-07}, 107 | journal = {Political Analysis}, 108 | pages = {402--409}, 109 | volume = {25}, 110 | number = {3}, 111 | doi = {10.1017/pan.2017.11}, 112 | langid = {en} 113 | } 114 | 115 | @article{JSSv042i08, 116 | title = {MatchIt: Nonparametric preprocessing for parametric causal inference}, 117 | author = {Ho, Daniel E. 
and Imai, Kosuke and King, Gary and Stuart, Elizabeth A.}, 118 | year = {2011}, 119 | date = {2011}, 120 | journal = {Journal of Statistical Software, Articles}, 121 | pages = {1{\textendash}28}, 122 | volume = {42}, 123 | number = {8}, 124 | doi = {10.18637/jss.v042.i08}, 125 | note = {Citation Key: JSSv042i08 126 | tex.ids= hoMatchItNonparametricPreprocessing2011} 127 | } 128 | 129 | @article{dehejiaCausalEffectsNonexperimental1999, 130 | title = {Causal Effects in Nonexperimental Studies: Reevaluating the Evaluation of Training Programs}, 131 | author = {Dehejia, Rajeev H. and Wahba, Sadek}, 132 | year = {1999}, 133 | month = {12}, 134 | date = {1999-12}, 135 | journal = {Journal of the American Statistical Association}, 136 | pages = {1053--1062}, 137 | volume = {94}, 138 | number = {448}, 139 | doi = {10.1080/01621459.1999.10473858}, 140 | langid = {en} 141 | } 142 | 143 | @article{greiferChoosingCausalEstimand2023, 144 | title = {Choosing the Causal Estimand for Propensity Score Analysis of Observational Studies}, 145 | author = {Greifer, Noah and Stuart, Elizabeth A.}, 146 | year = {2023}, 147 | doi = {10.48550/arXiv.2106.10577} 148 | } 149 | 150 | @book{longRegressionModelsCategorical2014, 151 | title = {Regression models for categorical dependent variables using Stata}, 152 | author = {Long, J. Scott and Freese, Jeremy}, 153 | year = {2014}, 154 | date = {2014}, 155 | publisher = {Stata Press Publication, StataCorp LP}, 156 | edition = {Third edition}, 157 | note = {OCLC: ocn890178695}, 158 | address = {College Station, Texas} 159 | } 160 | -------------------------------------------------------------------------------- /clarify/Submission 1/RJwrapper.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{report} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage[T1]{fontenc} 4 | \usepackage{RJournal} 5 | \usepackage{amsmath,amssymb,array} 6 | \usepackage{booktabs} 7 | 8 | 9 | % tightlist command for lists without linebreak 10 | \providecommand{\tightlist}{% 11 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 12 | 13 | 14 | % Always define CSL refs as bib entries are contained in separate doc 15 | % Pandoc citation processing 16 | \newlength{\cslhangindent} 17 | \setlength{\cslhangindent}{1.5em} 18 | \newlength{\csllabelwidth} 19 | \setlength{\csllabelwidth}{3em} 20 | \newlength{\cslentryspacingunit} % times entry-spacing 21 | \setlength{\cslentryspacingunit}{\parskip} 22 | % for Pandoc 2.8 to 2.10.1 23 | \newenvironment{cslreferences}% 24 | {}% 25 | {\par} 26 | % For Pandoc 2.11+ 27 | \newenvironment{CSLReferences}[2] % #1 hanging-ident, #2 entry spacing 28 | {% don't indent paragraphs 29 | \setlength{\parindent}{0pt} 30 | % turn on hanging indent if param 1 is 1 31 | \ifodd #1 32 | \let\oldpar\par 33 | \def\par{\hangindent=\cslhangindent\oldpar} 34 | \fi 35 | % set entry spacing 36 | \setlength{\parskip}{#2\cslentryspacingunit} 37 | }% 38 | {} 39 | \usepackage{calc} 40 | \newcommand{\CSLBlock}[1]{#1\hfill\break} 41 | \newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} 42 | \newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} 43 | \newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} 44 | 45 | 46 | 47 | \begin{document} 48 | 49 | 50 | %% do not edit, for illustration only 51 | \sectionhead{Contributed research article} 52 | \volume{XX} 53 | \volnumber{YY} 54 | \year{20ZZ} 55 | \month{AAAA} 56 | 57 | \begin{article} 58 | \input{clarify} 59 | \end{article} 60 | 61 | 62 | 
\end{document} 63 | -------------------------------------------------------------------------------- /clarify/Submission 1/clarify.R: -------------------------------------------------------------------------------- 1 | # Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand 2 | # Please edit clarify.Rmd to modify this file 3 | 4 | ## ----setup,include=FALSE------------------------------------------------------ 5 | knitr::opts_chunk$set( 6 | fig.path = "figures/", 7 | fig.align='center', 8 | fig.height = 2 9 | ) 10 | 11 | if (!requireNamespace("clarify")) { 12 | install.packages("clarify") 13 | } 14 | 15 | if (!requireNamespace("MatchIt")) { 16 | install.packages("MatchIt") 17 | } 18 | 19 | if (!requireNamespace("Amelia")) { 20 | install.packages("Amelia") 21 | } 22 | 23 | 24 | ## ----------------------------------------------------------------------------- 25 | library(clarify) 26 | 27 | 28 | ## ----------------------------------------------------------------------------- 29 | data("lalonde", package = "MatchIt") 30 | 31 | lalonde$re78_0 <- ifelse(lalonde$re78 > 0, 1, 0) 32 | 33 | head(lalonde) 34 | 35 | 36 | ## ----------------------------------------------------------------------------- 37 | fit <- glm(re78_0 ~ treat * married + age + educ + race + 38 | nodegree + re74 + re75, data = lalonde, 39 | family = binomial("probit")) 40 | 41 | 42 | ## ----------------------------------------------------------------------------- 43 | set.seed(1234) 44 | 45 | # Drawing 1000 simulated coefficients using an HC2 robust 46 | # covariance matrix 47 | s <- sim(fit, n = 1000, 48 | vcov = "HC2") 49 | 50 | s 51 | 52 | 53 | ## ----------------------------------------------------------------------------- 54 | sim_fun1 <- function(fit) { 55 | predict(fit, newdata = lalonde["PSID1",], type = "response") 56 | } 57 | 58 | 59 | ## ----------------------------------------------------------------------------- 60 | est1 <- sim_apply(s, FUN = sim_fun1, verbose = FALSE) 61 | 62 | est1 63 | 64 | 65 | ## ----------------------------------------------------------------------------- 66 | sim_fun2 <- function(coefs) { 67 | hispan <- unname(coefs["racehispan"]) 68 | white <- unname(coefs["racewhite"]) 69 | 70 | c("w - h" = white - hispan) 71 | } 72 | 73 | est2 <- sim_apply(s, FUN = sim_fun2, verbose = FALSE) 74 | 75 | est2 76 | 77 | 78 | ## ---- fig.width=4------------------------------------------------------------- 79 | plot(est1, reference = TRUE, ci = FALSE) 80 | 81 | 82 | ## ----------------------------------------------------------------------------- 83 | summary(est1) 84 | 85 | 86 | ## ---- fig.width=4------------------------------------------------------------- 87 | plot(est2, reference = TRUE, ci = FALSE) 88 | 89 | summary(est2, method = "wald", null = 0) 90 | 91 | 92 | ## ----------------------------------------------------------------------------- 93 | est3 <- sim_setx(s, 94 | x = list(treat = 0:1, 95 | re75 = c(0, 20000), 96 | race = "black"), 97 | verbose = FALSE) 98 | 99 | 100 | ## ----------------------------------------------------------------------------- 101 | summary(est3) 102 | 103 | 104 | ## ----------------------------------------------------------------------------- 105 | attr(est3, "setx") 106 | 107 | 108 | ## ---- fig.width=5------------------------------------------------------------- 109 | plot(est3, var = "re75", ci = FALSE) 110 | 111 | 112 | ## ----------------------------------------------------------------------------- 113 | est4 <- sim_setx(s, 114 | x = list(treat = 
0:1, 115 | re75 = seq(0, 20000, by = 2000), 116 | race = "black"), 117 | verbose = FALSE) 118 | 119 | 120 | ## ---- fig.width=5------------------------------------------------------------- 121 | plot(est4) 122 | 123 | 124 | ## ----------------------------------------------------------------------------- 125 | est5 <- sim_setx(s, 126 | x = list(treat = 0, re75 = 0), 127 | x1 = list(treat = 1, re75 = 0), 128 | verbose = FALSE) 129 | 130 | 131 | ## ----------------------------------------------------------------------------- 132 | summary(est5) 133 | 134 | 135 | ## ----------------------------------------------------------------------------- 136 | est6 <- sim_ame(s, 137 | var = "treat", 138 | subset = treat == 1, 139 | contrast = "rr", 140 | verbose = FALSE) 141 | 142 | 143 | ## ----------------------------------------------------------------------------- 144 | summary(est6, null = c(`RR` = 1)) 145 | 146 | 147 | ## ----------------------------------------------------------------------------- 148 | est7 <- sim_ame(s, 149 | var = "age", 150 | verbose = FALSE) 151 | 152 | 153 | ## ----------------------------------------------------------------------------- 154 | summary(est7) 155 | 156 | 157 | ## ----------------------------------------------------------------------------- 158 | est6b <- sim_ame(s, 159 | var = "treat", 160 | subset = treat == 1, 161 | by = ~married, 162 | contrast = "rr", 163 | verbose = FALSE) 164 | 165 | summary(est6b) 166 | 167 | 168 | ## ----------------------------------------------------------------------------- 169 | age_seq <- seq(18, 50, by = 2) 170 | 171 | est8 <- sim_adrf(s, 172 | var = "age", 173 | contrast = "adrf", 174 | at = age_seq, 175 | verbose = FALSE) 176 | 177 | 178 | ## ---- fig.width=5------------------------------------------------------------- 179 | plot(est8) 180 | 181 | 182 | ## ----------------------------------------------------------------------------- 183 | summary(est8, parm = 1:4) 184 | 185 | 186 | ## ----------------------------------------------------------------------------- 187 | est9 <- sim_adrf(s, 188 | var = "age", 189 | contrast = "amef", 190 | at = age_seq, 191 | verbose = FALSE) 192 | 193 | 194 | ## ---- fig.width=5------------------------------------------------------------- 195 | plot(est9) 196 | 197 | 198 | ## ----------------------------------------------------------------------------- 199 | lalonde <- transform(lalonde, 200 | re78_0 = ifelse(re78 == 0, 1, 0)) 201 | 202 | 203 | ## ----------------------------------------------------------------------------- 204 | est6 <- transform(est6, 205 | RD = `E[Y(1)]` - `E[Y(0)]`) 206 | 207 | 208 | ## ----------------------------------------------------------------------------- 209 | summary(est6, null = c(`RR` = 1, `RD` = 0)) 210 | 211 | 212 | ## ----------------------------------------------------------------------------- 213 | est6b |> 214 | transform(RR_ratio = `RR[1]` / `RR[0]`) |> 215 | summary(parm = c("RR[0]", "RR[1]", "RR_ratio"), 216 | null = 1) 217 | 218 | 219 | ## ----------------------------------------------------------------------------- 220 | # AME of treat with race = "black" 221 | est10b <- sim_ame(s, var = "treat", subset = race == "black", 222 | contrast = "diff", verbose = FALSE) 223 | summary(est10b) 224 | 225 | # AME of treat with race = "hispan" 226 | est10h <- sim_ame(s, var = "treat", subset = race == "hispan", 227 | contrast = "diff", verbose = FALSE) 228 | summary(est10h) 229 | 230 | 231 | ## 
----------------------------------------------------------------------------- 232 | names(est10b) <- paste(names(est10b), "b", sep = "_") 233 | names(est10h) <- paste(names(est10h), "h", sep = "_") 234 | 235 | 236 | ## ----------------------------------------------------------------------------- 237 | est10 <- cbind(est10b, est10h) 238 | summary(est10) 239 | 240 | 241 | ## ----------------------------------------------------------------------------- 242 | est10 <- transform(est10, 243 | `Dh - Db` = Diff_h - Diff_b) 244 | summary(est10, parm = "Dh - Db") 245 | 246 | 247 | ## ---- include=F--------------------------------------------------------------- 248 | amelia_ok <- requireNamespace("Amelia", quietly = TRUE) 249 | knitr::opts_chunk$set( 250 | eval = amelia_ok 251 | ) 252 | if (amelia_ok) library(Amelia) 253 | 254 | 255 | ## ---- message=F--------------------------------------------------------------- 256 | library(Amelia) 257 | data("africa", package = "Amelia") 258 | 259 | # Multiple imputation 260 | a.out <- amelia(x = africa, m = 10, cs = "country", 261 | ts = "year", logs = "gdp_pc", p2s = 0) 262 | 263 | # Fit model to each dataset 264 | model.list <- with(a.out, lm(gdp_pc ~ infl * trade)) 265 | 266 | # Simulate coefficients, 100 draws per imputation 267 | si <- misim(model.list, n = 100) 268 | 269 | si 270 | 271 | 272 | ## ----------------------------------------------------------------------------- 273 | sim_fun <- function(fit) { 274 | #Extract the original dataset using get_predictors() 275 | X <- insight::get_predictors(fit) 276 | 277 | p0 <- predict(fit) 278 | 279 | #Predictions after perturbing infl slightly 280 | p1 <- predict(fit, newdata = transform(X, infl = infl + 1e-5)) 281 | 282 | c(AME = mean((p1 - p0) / 1e-5)) 283 | } 284 | 285 | est_mi <- sim_apply(si, FUN = sim_fun, verbose = FALSE) 286 | 287 | summary(est_mi) 288 | 289 | 290 | ## ----------------------------------------------------------------------------- 291 | est_mi2 <- sim_ame(si, var = "infl", verbose = FALSE) 292 | 293 | summary(est_mi2) 294 | 295 | -------------------------------------------------------------------------------- /clarify/Submission 1/clarify.log: -------------------------------------------------------------------------------- 1 | This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.9.22) 28 SEP 2023 14:03 2 | entering extended mode 3 | restricted \write18 enabled. 4 | %&-line parsing enabled. 5 | **clarify.tex 6 | (./clarify.tex 7 | LaTeX2e <2023-06-01> patch level 1 8 | L3 programming layer <2023-08-29> 9 | ! Undefined control sequence. 10 | l.7 \maketitle 11 | 12 | Here is how much of TeX's memory you used: 13 | 16 strings out of 476894 14 | 383 string characters out of 5807862 15 | 1917791 words of memory out of 5000000 16 | 21409 multiletter control sequences out of 15000+600000 17 | 558069 words of font info for 36 fonts, out of 8000000 for 9000 18 | 14 hyphenation exceptions out of 8191 19 | 13i,0n,12p,88b,9s stack positions out of 10000i,1000n,20000p,200000b,200000s 20 | 21 | ! ==> Fatal error occurred, no output PDF file produced! 
22 | -------------------------------------------------------------------------------- /clarify/Submission 1/clarify.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/clarify.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-10-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-10-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-14-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-14-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-16-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-16-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-25-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-25-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-28-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-28-1.pdf 
-------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-28-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-28-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-8-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-8-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 1/figures/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 1/figures/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /clarify/Submission 1/initial_checks.log: -------------------------------------------------------------------------------- 1 | Initial check results: 2 | 3 | SUCCESS: Submission has consistently named tex, bib, and R files 4 | WARNING: The archive contains hidden files which will be removed: .DS_Store 5 | SUCCESS: File and directory names are compliant. 6 | SUCCESS: No problematic file found 7 | SUCCESS: Possible motivation letter found: motivation-letter.md 8 | ERROR: The title is not in title case! Suggest title to be changed to: 9 | Clarify: Simulation-Based Inference for Regression Models. 10 | Initial check results: 11 | 12 | SUCCESS: Submission has consistently named tex, bib, and R files 13 | WARNING: The archive contains hidden files which will be removed: .DS_Store 14 | SUCCESS: File and directory names are compliant. 15 | SUCCESS: No problematic file found 16 | SUCCESS: Possible motivation letter found: motivation-letter.md 17 | SUCCESS: The article title is properly formatted. 18 | Initial check results: 19 | 20 | SUCCESS: Submission has consistently named tex, bib, and R files 21 | WARNING: The archive contains hidden files which will be removed: .DS_Store 22 | SUCCESS: File and directory names are compliant. 23 | SUCCESS: No problematic file found 24 | SUCCESS: Possible motivation letter found: motivation-letter.md 25 | SUCCESS: The article title is properly formatted. 26 | -------------------------------------------------------------------------------- /clarify/Submission 1/motivation-letter.md: -------------------------------------------------------------------------------- 1 | --- 2 | output: pdf_document 3 | fontsize: 12pt 4 | --- 5 | 6 | \thispagestyle{empty} 7 | \today 8 | 9 | Editor 10 | The R Journal 11 | \bigskip 12 | 13 | 14 | Thank you for considering our article "`clarify`: Simulation-Based Inference for Regression Models" for publication in the R Journal. The article describes the use of our package `clarify` for performing simulation-based inference of post-estimation quantities from regression, which enhances the interpretation of these models without making the same assumptions and approximations similar methods make. We believe this will be useful to anyone performing statistical analysis with regression and therefore will have broad appeal to R users. 
Please feel free to reach out with any questions or requests for additional materials that might be useful. 15 | 16 | \bigskip 17 | \bigskip 18 | 19 | Regards, 20 | 21 | 22 | 23 | 24 | Noah Greifer 25 | Institute for Quantitative Social Science 26 | Harvard University 27 | Cambridge, MA, USA 28 | ngreifer@iq.harvard.edu 29 | 30 | \bigskip 31 | -------------------------------------------------------------------------------- /clarify/Submission 2/1-review-1.txt: -------------------------------------------------------------------------------- 1 | A bit of background on my perspective as a reviewer. As the paper notes, there is some “disagreement” in practice (and maybe in theory) about the relative value of the delta-method versus simulation-based inference. I fall into the delta-method camp. As such, I’m slightly skeptical of the method the paper proposes. That said, I don’t have a huge problem with the approach that the authors propose—I would never suggest that a researcher should switch from {clarify} to {marginaleffects}, for example. That said, I’m going to make an argument for publication, and that argument is from the perspective of a skeptic. Others might make stronger arguments for publication. 2 | 3 | 4 | 5 | # Strengths 6 | 7 | 8 | 9 | First, the paper is extremely clear about both the software and the method. I sometimes find modern software frustrating because it’s not always clear what the software is doing. I assume that it’s doing good things, but I’m not always sure what it’s doing. That is not the case with this paper. The authors describe the software interface clearly, but also the method that the software is implementing. 10 | 11 | 12 | 13 | Second, this package is important for historical reasons. CLARIFY was (is?) an immensely popular Stata package among political scientists. {Zelig} never took off, but was widely known among political scientists. In my opinion, {clarify} is an excellent, smart replacement for {Zelig}. This paper is worth publishing because of the historical importance of CLARIFY and Zelig among political scientists. Will {clarify} be as popular as {marginaleffects}? I don’t think so. Will it help political scientists trained in and around 2003 to 2018? Absolutely. King, Tomz, and Wittenberg (2000) is one of the most cited political science papers ever, so it would be a real shame to not have a well-documented R package to implement those ideas. (I’m not sure how popular this method was/is outside political science; perhaps it’s also important in some adjacent social sciences.) 14 | 15 | 16 | 17 | These two points seem uncontroversial and should weigh heavily in favor of publication. 18 | 19 | 20 | 21 | # A (Minor) Weakness 22 | 23 | 24 | 25 | I would gently push back on a few stronger claims in the paper regarding the performance of the simulation-based intervals over the delta intervals. Here are a few examples: 26 | 27 | 1. “often more accurate than using the delta method” (p. 1) 28 | 2. “Given its non-Normality, the quantile-based bounds are clearly more appropriate than those resulting from the Normal approximation, as the bounds computed from the Normal approximation would be outside the bounds of the estimate.” (p. 6) 29 | 3. “Inverting the uncertainty interval involves finding the smallest confidence level such that the null value is within the confidence bounds. The p-value for the test is one minus this level.” (p. 6) 30 | 4. 
“including plots to assess the normality of the distributions of simulated values (important for assessing whether Wald-type confidence intervals and p-values are valid)” (p. 16) 31 | 32 | 33 | I think the authors would agree with this summary of their position: “When the simulations of the quantity of interest are not normal, then simulation-based inference should be preferred.” 34 | 35 | 36 | 37 | It isn’t clear to me why this summary would be true. 38 | 39 | 40 | 41 | For example, if I knew the sampling distribution was skewed to the right, then I would want a CI with a longer arm to the left and a shorter arm to the right, else the CI’s misses won’t be symmetric (e.g., 2.5% low, 2.5% high). To make this concrete, suppose you get a point estimate at the 97.5th percentile of a right-skewed sampling distribution. Then you have to go really far back to the left to capture the truth. If you get a point estimate at the 2.5th percentile of this same sampling distribution, then you only need to go a little bit to the right to capture the truth. Thus, the CI with the nominal behavior would seem to require a short arm to the right and a long arm to the left. Simulation-based inference does the opposite of this. It’s my intuition that equal-armed CIs would work better than CIs with a long and short arm on the “wrong” side. This is merely my intuition. 42 | 43 | 44 | 45 | The quote from p. 1 is stated as a matter of fact, but it’s not clear what the authors mean by “more accurate” or what conception of “accurate” would make this statement true. Similarly for “valid” on p. 16. 46 | 47 | 48 | 49 | The most natural reading of “more accurate” seems to be “closer to nominal coverage” (e.g., 95% capture rate). The authors suggest that the delta method will not approximately achieve this coverage when the sampling distribution is far from symmetric. This seems non-controversial. But it seems to me (based on intuition/theory and simulations) that the simulation-based intervals will also behave poorly in these same scenarios (with this poor behavior translating to the p-values). Claims 1, 2, and 4 above seem to assume that simulation-based inference will meaningfully improve on the delta method when the sampling distribution is non-normal, but I can’t quite see why that should be (see discussion above). Claim 3 depends on claims 1, 2, and 4. 50 | 51 | 52 | 53 | If they are able, I suggest that the authors (1) clarify their usage of “accurate” and “valid” and (2) support these points with references and/or brief justifications. It’s certainly beyond the scope of the paper to fully justify these claims, but given the matter-of-factness with which the authors make these claims, perhaps stating the claims more clearly or justifying them briefly would be helpful to readers. 54 | 55 | 56 | 57 | I should emphasize, though, that this is “small beans”—mostly theoretical navel gazing—because asymptotic results apply and simulation-based inference is easy to use and historically important and popular. 58 | 59 | -------------------------------------------------------------------------------- /clarify/Submission 2/1-review-2.txt: -------------------------------------------------------------------------------- 1 | Thank you for giving me the opportunity to review this interesting paper. 2 | 3 | I like the paper a lot. It is clear and well-written. 4 | 5 | The software is of high quality. I have tried it, read the documentation, and skimmed its code base.
The authors follow modern best practices for development, including many unit tests, thorough documentation, and a nice website with a useful vignette and a migration guide for users of the older `Zelig` package. Well done! 6 | 7 | # Motivation 8 | 9 | The motivation for simulation-based inference could be improved. For example, on page 1 the authors state that: 10 | 11 | > Simulation-based inference is not only often more accurate than using the delta method, it is also simpler to understand and implement, as it does not require understanding Taylor series or the calculus that underlies it. This makes it more palatable to nontechnical audiences and easier to learn for students without sacrificing statistical performance. 12 | 13 | This is a red herring. Easy-to-use software implements the delta method by default, and nontechnical audiences and students essentially never have to implement the delta method themselves. 14 | 15 | Also, what does "accurate" mean, exactly? What does "often" mean? Under what conditions, exactly? This is a paper about software, so I don't expect a full theoretical investigation. But there should at least be a little bit more discussion and breadcrumbs for users to follow. Are there good theoretical or simulation studies on the properties of this strategy, identifying when it works better or worse? Currently, the reader might leave the paper with the impression that simulation-based inference strictly dominates the delta method and bootstrapping in all cases, except along the computational cost dimension. Is that really true? I'll freely admit that I'm a bit skeptical, but I would love to see some references to authors who probe this question. I'm sure other readers would find this useful too in deciding whether they should use `clarify`. 16 | 17 | The flip side of this question is: If simulation-based inference is easier and more accurate, why isn't everyone using it already? What's the market failure? 18 | 19 | # Approximations in simulation-based inference 20 | 21 | The authors write: 22 | 23 | > Arriving at the posterior distribution does not require taking any derivatives or making any approximations beyond those usually used for inference on model parameter estimates. 24 | 25 | This may seem trivial, but taking M random draws will lead to a different result than drawing a different set M' of coefficients. Clearly, there's a simulation-related approximation going on. This should be acknowledged early in the intro. The authors should revise the text to avoid saying that simulation-based inference requires no approximation. 26 | 27 | # Assumption or approximation? 28 | 29 | The abstract and introduction sell simulation-based inference as a way to relax "assumptions" of the delta method, but then go on to talk about "approximations" that may fail: 30 | 31 | > The usual method for estimating the uncertainty of the derived quantities is known as the “delta method”, which involves two approximations: 1) that the variance of the derived quantity can be represented as a first-order Taylor series, and 2) that the estimate of the derived quantity is normally distributed. 32 | 33 | The classic textbook treatment of the delta method talks about two assumptions: continuity, and normality of $\hat{\theta}$ --- not of derived quantities $h(\hat{\theta})$. Do the authors see a distinction between a violation of delta method assumptions and a "failure" of approximations in finite samples? If so, this should be cleared up in the abstract and intro to avoid confusion.
34 | 35 | # Backtransformation 36 | 37 | Quotes like these two feel misleading, because standard practice in such GLM models is to build confidence intervals by backtransformation, rather than by naively constructing symmetric intervals: 38 | 39 | > For example, predicted probabilities close to 0 or 1 or ratios of coefficients or probabilities typically do not have normal (or even symmetrical) distributions in finite samples, and the usual Wald-type confidence interval limits produced from delta method standard errors can include values outside the domain of the quantity of interest. 40 | 41 | > One can see again how a delta method or Normal approximation may not have yielded valid uncertainty intervals given the non-Normality of the distributions. 42 | 43 | Most of the alternative packages in `R` will automatically use backtransformation to build confidence intervals that do not stretch outside reasonable bounds. Of course, this is not possible or easy for all model types. But if the authors want to make a big deal out of that critique --- open with it on page 1 and reiterate it in the text --- they should probably show an example where this is a real problem. 44 | 45 | # Minor notes 46 | 47 | * Why are all functions prefixed with `sim_`? I would argue that these prefixes are extraneous, and that this is what the namespace is for. 48 | * Can `plot()` draw on more dimensions if more variables are included in `sim_setx()`? 49 | * It is not clear to me from the documentation what `null` values are acceptable in `summary()`. I see RD and RR. What else? 50 | 51 | > The largest difference is that clarify supports iterative building of more and more complex hypotheses through the transform() method, which quickly computes new quantities and transformation from the existing computed quantities, whereas marginaleffects only supports a single transformation 52 | 53 | Or users can call `posterior_draws()` to manipulate the draws themselves, but this is admittedly less convenient.
54 | 55 | -------------------------------------------------------------------------------- /clarify/Submission 2/RJwrapper.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{report} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage[T1]{fontenc} 4 | \usepackage{RJournal} 5 | \usepackage{amsmath,amssymb,array} 6 | \usepackage{booktabs} 7 | 8 | 9 | % tightlist command for lists without linebreak 10 | \providecommand{\tightlist}{% 11 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 12 | 13 | 14 | % Always define CSL refs as bib entries are contained in separate doc 15 | % Pandoc citation processing 16 | \newlength{\cslhangindent} 17 | \setlength{\cslhangindent}{1.5em} 18 | \newlength{\csllabelwidth} 19 | \setlength{\csllabelwidth}{3em} 20 | \newlength{\cslentryspacingunit} % times entry-spacing 21 | \setlength{\cslentryspacingunit}{\parskip} 22 | % for Pandoc 2.8 to 2.10.1 23 | \newenvironment{cslreferences}% 24 | {}% 25 | {\par} 26 | % For Pandoc 2.11+ 27 | \newenvironment{CSLReferences}[2] % #1 hanging-ident, #2 entry spacing 28 | {% don't indent paragraphs 29 | \setlength{\parindent}{0pt} 30 | % turn on hanging indent if param 1 is 1 31 | \ifodd #1 32 | \let\oldpar\par 33 | \def\par{\hangindent=\cslhangindent\oldpar} 34 | \fi 35 | % set entry spacing 36 | \setlength{\parskip}{#2\cslentryspacingunit} 37 | }% 38 | {} 39 | \usepackage{calc} 40 | \newcommand{\CSLBlock}[1]{#1\hfill\break} 41 | \newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} 42 | \newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} 43 | \newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} 44 | 45 | 46 | 47 | \begin{document} 48 | 49 | 50 | %% do not edit, for illustration only 51 | \sectionhead{Contributed research article} 52 | \volume{XX} 53 | \volnumber{YY} 54 | \year{20ZZ} 55 | \month{AAAA} 56 | 57 | \begin{article} 58 | \input{clarify} 59 | \end{article} 60 | 61 | 62 | \end{document} 63 | -------------------------------------------------------------------------------- /clarify/Submission 2/clarify.R: -------------------------------------------------------------------------------- 1 | # Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand 2 | # Please edit clarify.Rmd to modify this file 3 | 4 | ## ----setup,include=FALSE------------------------------------------------------ 5 | knitr::opts_chunk$set( 6 | fig.path = "figures/", 7 | fig.align='center', 8 | fig.height = 2 9 | ) 10 | 11 | if (!requireNamespace("clarify")) { 12 | install.packages("clarify") 13 | } 14 | 15 | if (!requireNamespace("MatchIt")) { 16 | install.packages("MatchIt") 17 | } 18 | 19 | if (!requireNamespace("Amelia")) { 20 | install.packages("Amelia") 21 | } 22 | 23 | 24 | ## ----------------------------------------------------------------------------- 25 | library(clarify) 26 | 27 | 28 | ## ----------------------------------------------------------------------------- 29 | data("lalonde", package = "MatchIt") 30 | 31 | lalonde$re78_0 <- ifelse(lalonde$re78 > 0, 1, 0) 32 | 33 | head(lalonde) 34 | 35 | 36 | ## ----------------------------------------------------------------------------- 37 | fit <- glm(re78_0 ~ treat * married + age + educ + race + 38 | nodegree + re74 + re75, data = lalonde, 39 | family = binomial("probit")) 40 | 41 | 42 | ## ----------------------------------------------------------------------------- 43 | set.seed(1234) 44 | 45 | # Drawing 1000 simulated coefficients using an HC2 robust 46 | # 
covariance matrix 47 | s <- sim(fit, n = 1000, 48 | vcov = "HC2") 49 | 50 | s 51 | 52 | 53 | ## ----------------------------------------------------------------------------- 54 | sim_fun1 <- function(fit) { 55 | predict(fit, newdata = lalonde["PSID1",], type = "response") 56 | } 57 | 58 | 59 | ## ----------------------------------------------------------------------------- 60 | est1 <- sim_apply(s, FUN = sim_fun1, verbose = FALSE) 61 | 62 | est1 63 | 64 | 65 | ## ----------------------------------------------------------------------------- 66 | sim_fun2 <- function(coefs) { 67 | hispan <- unname(coefs["racehispan"]) 68 | white <- unname(coefs["racewhite"]) 69 | 70 | c("w - h" = white - hispan) 71 | } 72 | 73 | est2 <- sim_apply(s, FUN = sim_fun2, verbose = FALSE) 74 | 75 | est2 76 | 77 | 78 | ## ---- fig.width=4------------------------------------------------------------- 79 | plot(est1, reference = TRUE, ci = FALSE) 80 | 81 | 82 | ## ----------------------------------------------------------------------------- 83 | summary(est1) 84 | 85 | 86 | ## ---- fig.width=4------------------------------------------------------------- 87 | plot(est2, reference = TRUE, ci = FALSE) 88 | 89 | summary(est2, method = "wald", null = 0) 90 | 91 | 92 | ## ----------------------------------------------------------------------------- 93 | est3 <- sim_setx(s, 94 | x = list(treat = 0:1, 95 | re75 = c(0, 20000), 96 | race = "black"), 97 | verbose = FALSE) 98 | 99 | 100 | ## ----------------------------------------------------------------------------- 101 | summary(est3) 102 | 103 | 104 | ## ----------------------------------------------------------------------------- 105 | attr(est3, "setx") 106 | 107 | 108 | ## ---- fig.width=5------------------------------------------------------------- 109 | plot(est3, var = "re75", ci = FALSE) 110 | 111 | 112 | ## ----------------------------------------------------------------------------- 113 | est4 <- sim_setx(s, 114 | x = list(treat = 0:1, 115 | re75 = seq(0, 20000, by = 2000), 116 | race = "black"), 117 | verbose = FALSE) 118 | 119 | 120 | ## ---- fig.width=5------------------------------------------------------------- 121 | plot(est4) 122 | 123 | 124 | ## ----------------------------------------------------------------------------- 125 | est5 <- sim_setx(s, 126 | x = list(treat = 0, re75 = 0), 127 | x1 = list(treat = 1, re75 = 0), 128 | verbose = FALSE) 129 | 130 | 131 | ## ----------------------------------------------------------------------------- 132 | summary(est5) 133 | 134 | 135 | ## ----------------------------------------------------------------------------- 136 | est6 <- sim_ame(s, 137 | var = "treat", 138 | subset = treat == 1, 139 | contrast = "rr", 140 | verbose = FALSE) 141 | 142 | 143 | ## ----------------------------------------------------------------------------- 144 | summary(est6, null = c(`RR` = 1)) 145 | 146 | 147 | ## ----------------------------------------------------------------------------- 148 | est7 <- sim_ame(s, 149 | var = "age", 150 | verbose = FALSE) 151 | 152 | 153 | ## ----------------------------------------------------------------------------- 154 | summary(est7) 155 | 156 | 157 | ## ----------------------------------------------------------------------------- 158 | est6b <- sim_ame(s, 159 | var = "treat", 160 | subset = treat == 1, 161 | by = ~married, 162 | contrast = "rr", 163 | verbose = FALSE) 164 | 165 | summary(est6b) 166 | 167 | 168 | ## ----------------------------------------------------------------------------- 169 | 
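# Grid of ages at which to evaluate the average dose-response function (ADRF)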
age_seq <- seq(18, 50, by = 2) 170 | 171 | est8 <- sim_adrf(s, 172 | var = "age", 173 | contrast = "adrf", 174 | at = age_seq, 175 | verbose = FALSE) 176 | 177 | 178 | ## ---- fig.width=5------------------------------------------------------------- 179 | plot(est8) 180 | 181 | 182 | ## ----------------------------------------------------------------------------- 183 | summary(est8, parm = 1:4) 184 | 185 | 186 | ## ----------------------------------------------------------------------------- 187 | est9 <- sim_adrf(s, 188 | var = "age", 189 | contrast = "amef", 190 | at = age_seq, 191 | verbose = FALSE) 192 | 193 | 194 | ## ---- fig.width=5------------------------------------------------------------- 195 | plot(est9) 196 | 197 | 198 | ## ----------------------------------------------------------------------------- 199 | lalonde <- transform(lalonde, 200 | re78_0 = ifelse(re78 == 0, 1, 0)) 201 | 202 | 203 | ## ----------------------------------------------------------------------------- 204 | est6 <- transform(est6, 205 | RD = `E[Y(1)]` - `E[Y(0)]`) 206 | 207 | 208 | ## ----------------------------------------------------------------------------- 209 | summary(est6, null = c(`RR` = 1, `RD` = 0)) 210 | 211 | 212 | ## ----------------------------------------------------------------------------- 213 | est6b |> 214 | transform(RR_ratio = `RR[1]` / `RR[0]`) |> 215 | summary(parm = c("RR[0]", "RR[1]", "RR_ratio"), 216 | null = 1) 217 | 218 | 219 | ## ----------------------------------------------------------------------------- 220 | # AME of treat with race = "black" 221 | est10b <- sim_ame(s, var = "treat", subset = race == "black", 222 | contrast = "diff", verbose = FALSE) 223 | summary(est10b) 224 | 225 | # AME of treat with race = "hispan" 226 | est10h <- sim_ame(s, var = "treat", subset = race == "hispan", 227 | contrast = "diff", verbose = FALSE) 228 | summary(est10h) 229 | 230 | 231 | ## ----------------------------------------------------------------------------- 232 | names(est10b) <- paste(names(est10b), "b", sep = "_") 233 | names(est10h) <- paste(names(est10h), "h", sep = "_") 234 | 235 | 236 | ## ----------------------------------------------------------------------------- 237 | est10 <- cbind(est10b, est10h) 238 | summary(est10) 239 | 240 | 241 | ## ----------------------------------------------------------------------------- 242 | est10 <- transform(est10, 243 | `Dh - Db` = Diff_h - Diff_b) 244 | summary(est10, parm = "Dh - Db") 245 | 246 | 247 | ## ---- include=F--------------------------------------------------------------- 248 | amelia_ok <- requireNamespace("Amelia", quietly = TRUE) 249 | knitr::opts_chunk$set( 250 | eval = amelia_ok 251 | ) 252 | if (amelia_ok) library(Amelia) 253 | 254 | 255 | ## ---- message=F--------------------------------------------------------------- 256 | library(Amelia) 257 | data("africa", package = "Amelia") 258 | 259 | # Multiple imputation 260 | a.out <- amelia(x = africa, m = 10, cs = "country", 261 | ts = "year", logs = "gdp_pc", p2s = 0) 262 | 263 | # Fit model to each dataset 264 | model.list <- with(a.out, lm(gdp_pc ~ infl * trade)) 265 | 266 | # Simulate coefficients, 100 draws per imputation 267 | si <- misim(model.list, n = 100) 268 | 269 | si 270 | 271 | 272 | ## ----------------------------------------------------------------------------- 273 | sim_fun <- function(fit) { 274 | #Extract the original dataset using get_predictors() 275 | X <- insight::get_predictors(fit) 276 | 277 | p0 <- predict(fit) 278 | 279 | #Predictions after perturbing 
infl slightly 280 | p1 <- predict(fit, newdata = transform(X, infl = infl + 1e-5)) 281 | 282 | c(AME = mean((p1 - p0) / 1e-5)) 283 | } 284 | 285 | est_mi <- sim_apply(si, FUN = sim_fun, verbose = FALSE) 286 | 287 | summary(est_mi) 288 | 289 | 290 | ## ----------------------------------------------------------------------------- 291 | est_mi2 <- sim_ame(si, var = "infl", verbose = FALSE) 292 | 293 | summary(est_mi2) 294 | 295 | -------------------------------------------------------------------------------- /clarify/Submission 2/clarify.log: -------------------------------------------------------------------------------- 1 | This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.9.22) 28 SEP 2023 14:03 2 | entering extended mode 3 | restricted \write18 enabled. 4 | %&-line parsing enabled. 5 | **clarify.tex 6 | (./clarify.tex 7 | LaTeX2e <2023-06-01> patch level 1 8 | L3 programming layer <2023-08-29> 9 | ! Undefined control sequence. 10 | l.7 \maketitle 11 | 12 | Here is how much of TeX's memory you used: 13 | 16 strings out of 476894 14 | 383 string characters out of 5807862 15 | 1917791 words of memory out of 5000000 16 | 21409 multiletter control sequences out of 15000+600000 17 | 558069 words of font info for 36 fonts, out of 8000000 for 9000 18 | 14 hyphenation exceptions out of 8191 19 | 13i,0n,12p,88b,9s stack positions out of 10000i,1000n,20000p,200000b,200000s 20 | 21 | ! ==> Fatal error occurred, no output PDF file produced! 22 | -------------------------------------------------------------------------------- /clarify/Submission 2/clarify.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/clarify.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot1-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot2-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot3-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot4-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot8-1.png 
-------------------------------------------------------------------------------- /clarify/Submission 2/figures/plot9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/plot9-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-10-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-10-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-14-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-14-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-16-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-16-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-25-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-25-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-28-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-28-1.pdf 
-------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-28-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-28-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-8-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-8-1.pdf -------------------------------------------------------------------------------- /clarify/Submission 2/figures/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/clarify/Submission 2/figures/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /clarify/Submission 2/initial_checks.log: -------------------------------------------------------------------------------- 1 | Initial check results: 2 | 3 | SUCCESS: Submission has consistently named tex, bib, and R files 4 | WARNING: The archive contains hidden files which will be removed: .DS_Store 5 | SUCCESS: File and directory names are compliant. 6 | SUCCESS: No problematic file found 7 | SUCCESS: Possible motivation letter found: motivation-letter.md 8 | ERROR: The title is not in title case! Suggest title to be changed to: 9 | Clarify: Simulation-Based Inference for Regression Models. 10 | Initial check results: 11 | 12 | SUCCESS: Submission has consistently named tex, bib, and R files 13 | WARNING: The archive contains hidden files which will be removed: .DS_Store 14 | SUCCESS: File and directory names are compliant. 15 | SUCCESS: No problematic file found 16 | SUCCESS: Possible motivation letter found: motivation-letter.md 17 | SUCCESS: The article title is properly formatted. 18 | Initial check results: 19 | 20 | SUCCESS: Submission has consistently named tex, bib, and R files 21 | WARNING: The archive contains hidden files which will be removed: .DS_Store 22 | SUCCESS: File and directory names are compliant. 23 | SUCCESS: No problematic file found 24 | SUCCESS: Possible motivation letter found: motivation-letter.md 25 | SUCCESS: The article title is properly formatted. 26 | -------------------------------------------------------------------------------- /clarify/Submission 2/motivation-letter.md: -------------------------------------------------------------------------------- 1 | --- 2 | output: pdf_document 3 | fontsize: 12pt 4 | --- 5 | 6 | \thispagestyle{empty} 7 | \today 8 | 9 | Editor 10 | The R Journal 11 | \bigskip 12 | 13 | 14 | Thank you for considering our article "`clarify`: Simulation-Based Inference for Regression Models" for publication in the R Journal. The article describes the use of our package `clarify` for performing simulation-based inference of post-estimation quantities from regression, which enhances the interpretation of these models without making the same assumptions and approximations similar methods make. We believe this will be useful to anyone performing statistical analysis with regression and therefore will have broad appeal to R users. 
Please feel free to reach out with any questions or requests for additional materials that might be useful. 15 | 16 | \bigskip 17 | \bigskip 18 | 19 | Regards, 20 | 21 | 22 | 23 | 24 | Noah Greifer 25 | Institute for Quantitative Social Science 26 | Harvard University 27 | Cambridge, MA, USA 28 | ngreifer@iq.harvard.edu 29 | 30 | \bigskip 31 | -------------------------------------------------------------------------------- /man/clarify-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clarify-package.R 3 | \docType{package} 4 | \name{clarify-package} 5 | \alias{clarify} 6 | \alias{clarify-package} 7 | \title{clarify: Simulation-Based Inference for Regression Models} 8 | \description{ 9 | Performs simulation-based inference as an alternative to the delta method for obtaining valid confidence intervals and p-values for regression post-estimation quantities, such as average marginal effects and predictions at representative values. This framework for simulation-based inference is especially useful when the resulting quantity is not normally distributed and the delta method approximation fails. The methodology is described in King, Tomz, and Wittenberg (2000) \doi{10.2307/2669316}. 'clarify' is meant to replace some of the functionality of the archived package 'Zelig'; see the vignette "Translating Zelig to clarify" for replicating this functionality. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/iqss/clarify} 15 | \item \url{https://iqss.github.io/clarify/} 16 | \item Report bugs at \url{https://github.com/iqss/clarify/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Noah Greifer \email{ngreifer@iq.harvard.edu} (\href{https://orcid.org/0000-0003-3067-7154}{ORCID}) 22 | 23 | Authors: 24 | \itemize{ 25 | \item Steven Worthington \email{sworthington@iq.harvard.edu} (\href{https://orcid.org/0000-0001-9550-5797}{ORCID}) 26 | \item Stefano Iacus \email{siacus@iq.harvard.edu} (\href{https://orcid.org/0000-0002-4884-0047}{ORCID}) 27 | \item Gary King \email{king@harvard.edu} (\href{https://orcid.org/0000-0002-5327-7631}{ORCID}) 28 | } 29 | 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /man/figures/README-example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/man/figures/README-example-1.png -------------------------------------------------------------------------------- /man/figures/README-example2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/man/figures/README-example2-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/man/figures/README-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-7-1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/man/figures/README-unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /man/misim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/misim.R 3 | \name{misim} 4 | \alias{misim} 5 | \title{Simulate model coefficients after multiple imputation} 6 | \usage{ 7 | misim(fitlist, n = 1000, vcov = NULL, coefs = NULL, dist = NULL) 8 | } 9 | \arguments{ 10 | \item{fitlist}{a list of model fits, one for each imputed dataset, or a \code{mira} object (the output of a call to \code{with()} applied to a \code{mids} object in \code{mice}).} 11 | 12 | \item{n}{the number of simulations to run for each imputed dataset; default is 1000. More is always better but resulting calculations will take longer.} 13 | 14 | \item{vcov}{a square covariance matrix of the coefficient estimates, a function to use to extract it from each fit in \code{fitlist}, or a list thereof with an element for each imputed dataset. By default, uses \code{\link[stats:vcov]{stats::vcov()}} or \code{\link[insight:get_varcov]{insight::get_varcov()}} if that doesn't work.} 15 | 16 | \item{coefs}{a vector of coefficient estimates, a function to use to extract it from each fit in \code{fitlist}, or a list thereof with an element for each imputed dataset. By default, uses \code{\link[stats:coef]{stats::coef()}} or \code{\link[insight:get_parameters]{insight::get_parameters()}} if that doesn't work.} 17 | 18 | \item{dist}{a character vector containing the name of the multivariate distribution(s) to use to draw simulated coefficients. Should be one of \code{"normal"} (multivariate normal distribution) or \code{"t_{#}"} (multivariate t distribution), where \verb{\{#\}} corresponds to the desired degrees of freedom (e.g., \code{"t_100"}). If \code{NULL}, the right distributions to use will be determined based on heuristics; see \code{\link[=sim]{sim()}} for details.} 19 | } 20 | \value{ 21 | A \code{clarify_misim} object, which inherits from \code{clarify_sim} and has the following components: 22 | \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation for each imputation} 23 | \item{coefs}{a matrix containing the original coefficients extracted from \code{fitlist} or supplied to \code{coefs}, with a row per imputation.} 24 | \item{fit}{the list of model fits supplied to \code{fitlist}} 25 | \item{imp}{an identifier of which imputed dataset each set of simulated coefficients corresponds to.} 26 | The \code{"dist"} attribute contains \code{"normal"} if the coefficients were sampled from a multivariate normal distribution and \code{"t({df})"} if sampled from a multivariate t distribution. The \code{"clarify_hash"} attribute contains a unique hash generated by \code{\link[rlang:hash]{rlang::hash()}}. 27 | } 28 | \description{ 29 | \code{misim()} simulates model parameters from multivariate normal or t distributions after multiple imputation that are then used by \code{\link[=sim_apply]{sim_apply()}} to calculate quantities of interest. 30 | } 31 | \details{ 32 | \code{misim()} essentially combines multiple \code{sim()} calls applied to a list of model fits, each fit to an imputed dataset, into a single combined pool of simulated coefficients.
When simulation-based inference is to be used with multiply imputed data, many imputations are required; see Zhou and Reiter (2010). 33 | } 34 | \examples{ 35 | \dontshow{if (requireNamespace("Amelia", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} 36 | data("africa", package = "Amelia") 37 | 38 | # Multiple imputation using Amelia 39 | a.out <- Amelia::amelia(x = africa, m = 10, 40 | cs = "country", 41 | ts = "year", logs = "gdp_pc", 42 | p2s = 0) 43 | 44 | fits <- with(a.out, lm(gdp_pc ~ infl * trade)) 45 | 46 | # Simulate coefficients 47 | s <- misim(fits) 48 | s 49 | \dontshow{\}) # examplesIf} 50 | } 51 | \references{ 52 | Zhou, X., & Reiter, J. P. (2010). A Note on Bayesian Inference After Multiple Imputation. \emph{The American Statistician}, 64(2), 159–163. \doi{10.1198/tast.2010.09109} 53 | } 54 | \seealso{ 55 | \itemize{ 56 | \item \code{\link[=sim]{sim()}} for simulating model coefficients for a single dataset 57 | \item \code{\link[=sim_apply]{sim_apply()}} for applying a function to each set of simulated coefficients 58 | \item \code{\link[=sim_ame]{sim_ame()}} for computing average marginal effects in each simulation draw 59 | \item \code{\link[=sim_setx]{sim_setx()}} for computing marginal predictions and first differences at typical values in each simulation draw 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /man/plot.clarify_adrf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.clarify_adrf.R 3 | \name{plot.clarify_adrf} 4 | \alias{plot.clarify_adrf} 5 | \title{Plot marginal predictions from \code{sim_adrf()}} 6 | \usage{ 7 | \method{plot}{clarify_adrf}( 8 | x, 9 | ci = TRUE, 10 | level = 0.95, 11 | method = "quantile", 12 | baseline, 13 | color = "black", 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{a \code{clarify_adrf} object resulting from a call to \code{\link[=sim_adrf]{sim_adrf()}}.} 19 | 20 | \item{ci}{\code{logical}; whether to display confidence bands for the estimates. Default is \code{TRUE}.} 21 | 22 | \item{level}{the confidence level desired. Default is .95 for 95\% confidence intervals.} 23 | 24 | \item{method}{the method used to compute confidence bands. Can be \code{"wald"} to use a Normal approximation or \code{"quantile"} to use the simulated sampling distribution (default). See \code{\link[=summary.clarify_est]{summary.clarify_est()}} for details. Abbreviations allowed.} 25 | 26 | \item{baseline}{\code{logical}; whether to include a horizontal line at \code{y = 0} on the plot. Default is \code{FALSE} for the ADRF (since 0 might not be in the range of the outcome) and \code{TRUE} for the AMEF.} 27 | 28 | \item{color}{the color of the line and confidence band in the plot.} 29 | 30 | \item{...}{for \code{plot()}, further arguments passed to \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}}.} 31 | } 32 | \value{ 33 | A \code{ggplot} object. 34 | } 35 | \description{ 36 | \code{plot.clarify_adrf()} plots the output of \code{\link[=sim_adrf]{sim_adrf()}}. 
For the average dose-response function (ADRF, requested with \code{contrast = "adrf"} in \code{sim_adrf()}), this is a plot of the average marginal mean of the outcome against the requested values of the focal predictor; for the average marginal effects function (AMEF, requested with \code{contrast = "amef"} in \code{sim_adrf()}), this is a plot of the instantaneous average marginal effect of the focal predictor on the outcome against the requested values of the focal predictor. 37 | } 38 | \details{ 39 | These plots are produced using \code{\link[ggplot2:geom_path]{ggplot2::geom_line()}} and \code{\link[ggplot2:geom_ribbon]{ggplot2::geom_ribbon()}}. The confidence bands should be interpreted pointwise (i.e., they do not account for simultaneous inference). 40 | } 41 | \examples{ 42 | ## See help("sim_adrf") for examples 43 | 44 | } 45 | \seealso{ 46 | \code{\link[=summary.clarify_est]{summary.clarify_est()}} for computing p-values and confidence intervals for the estimated quantities. 47 | } 48 | -------------------------------------------------------------------------------- /man/plot.clarify_setx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.clarify_setx.R 3 | \name{plot.clarify_setx} 4 | \alias{plot.clarify_setx} 5 | \title{Plot marginal predictions from \code{sim_setx()}} 6 | \usage{ 7 | \method{plot}{clarify_setx}( 8 | x, 9 | var = NULL, 10 | ci = TRUE, 11 | level = 0.95, 12 | method = "quantile", 13 | reference = FALSE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{a \code{clarify_est} object resulting from a call to \code{\link[=sim_setx]{sim_setx()}}.} 19 | 20 | \item{var}{the name of the focal varying predictor, i.e., the variable to be on the x-axis of the plot. All other variables with varying set values will be used to color the resulting plot. See Details. Ignored if no predictors vary or if only one predictor varies in the reference grid or if \code{x1} was specified in \code{sim_setx()}. If not set, will use the predictor with the greatest number of unique values specified in the reference grid.} 21 | 22 | \item{ci}{\code{logical}; whether to display confidence intervals or bands for the estimates. Default is \code{TRUE}.} 23 | 24 | \item{level}{the confidence level desired. Default is .95 for 95\% confidence intervals.} 25 | 26 | \item{method}{the method used to compute confidence intervals or bands. Can be \code{"wald"} to use a Normal approximation or \code{"quantile"} to use the simulated sampling distribution (default). See \code{\link[=summary.clarify_est]{summary.clarify_est()}} for details. Abbreviations allowed.} 27 | 28 | \item{reference}{\code{logical}; whether to overlay a normal density reference distribution over the plots. Default is \code{FALSE}. Ignored when variables other than the focal varying predictor vary.} 29 | 30 | \item{...}{for \code{plot()}, further arguments passed to \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}}.} 31 | } 32 | \value{ 33 | A \code{ggplot} object. 34 | } 35 | \description{ 36 | \code{plot.clarify_setx()} plots the output of \code{\link[=sim_setx]{sim_setx()}}, providing graphics similar to those of \code{\link[=plot.clarify_est]{plot.clarify_est()}} but with features specifically for plotting marginal predictions. For continuous predictors, this is a plot of the marginal predictions and their confidence bands across levels of the predictor.
Otherwise, this is a plot of the simulated sampling distribution of the marginal predictions. 37 | } 38 | \details{ 39 | \code{plot()} creates one of two kinds of plots depending on how the reference grid was specified in the call to \code{sim_setx()} and what \code{var} is set to. When the focal varying predictor (i.e., the one set in \code{var}) is numeric and takes on three or more unique values in the reference grid, the produced plot is a line graph displaying the value of the marginal prediction (denoted as \code{E[Y|X]}) across values of the focal varying predictor, with confidence bands displayed when \code{ci = TRUE}. If other predictors also vary, lines for different values will be displayed in different colors. These plots are produced using \code{\link[ggplot2:geom_path]{ggplot2::geom_line()}} and \code{\link[ggplot2:geom_ribbon]{ggplot2::geom_ribbon()}}. 40 | 41 | When the focal varying predictor is a factor or character or only takes on two or fewer values in the reference grid, the produced plot is a density plot of the simulated predictions, similar to the plot resulting from \code{\link[=plot.clarify_est]{plot.clarify_est()}}. When other variables vary, densities for different values will be displayed in different colors. These plots are produced using \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}}. 42 | 43 | Marginal predictions are identified by the corresponding levels of the predictors that vary. The user should keep track of whether the non-varying predictors are set at specified or automatically set "typical" levels. 44 | } 45 | \examples{ 46 | ## See help("sim_setx") for examples 47 | 48 | } 49 | \seealso{ 50 | \code{\link[=summary.clarify_est]{summary.clarify_est()}} for computing p-values and confidence intervals for the estimated quantities. 51 | } 52 | -------------------------------------------------------------------------------- /man/sim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim.R 3 | \name{sim} 4 | \alias{sim} 5 | \title{Simulate model parameters} 6 | \usage{ 7 | sim(fit, n = 1000, vcov = NULL, coefs = NULL, dist = NULL) 8 | } 9 | \arguments{ 10 | \item{fit}{a model fit, such as the output of a call to \code{\link[=lm]{lm()}} or \code{\link[=glm]{glm()}}. Can be left unspecified if \code{coefs} and \code{vcov} are not functions.} 11 | 12 | \item{n}{the number of simulations to run; default is 1000. More is always better but resulting calculations will take longer.} 13 | 14 | \item{vcov}{either a square covariance matrix of the coefficient estimates or a function to use to extract it from \code{fit}. By default, uses \code{\link[stats:vcov]{stats::vcov()}} or \code{\link[insight:get_varcov]{insight::get_varcov()}} if that doesn't work.} 15 | 16 | \item{coefs}{either a vector of coefficient estimates or a function to use to extract it from \code{fit}. By default, uses \code{\link[stats:coef]{stats::coef()}} or \code{\link[insight:get_parameters]{insight::get_parameters()}} if that doesn't work.} 17 | 18 | \item{dist}{a string containing the name of the multivariate distribution to use to draw simulated coefficients. Should be one of \code{"normal"} (multivariate normal distribution) or \code{"t({#})"} (multivariate t distribution), where \verb{\{#\}} corresponds to the desired degrees of freedom (e.g., \code{"t(100)"}).
If \code{NULL}, the appropriate distribution to use will be determined heuristically; see Details.} 19 | } 20 | \value{ 21 | A \code{clarify_sim} object, which has the following components: 22 | \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation} 23 | \item{coefs}{the original coefficients extracted from \code{fit} or supplied to \code{coefs}.} 24 | \item{vcov}{the covariance matrix of the coefficients extracted from \code{fit} or supplied to \code{vcov}.} 25 | \item{fit}{the original model fit supplied to \code{fit}.} 26 | The \code{"dist"} attribute contains \code{"normal"} if the coefficients were sampled from a multivariate normal distribution and \code{"t(df)"} if sampled from a multivariate t distribution. The \code{"clarify_hash"} attribute contains a unique hash generated by \code{\link[rlang:hash]{rlang::hash()}}. 27 | } 28 | \description{ 29 | \code{sim()} simulates model parameters from a multivariate normal or t distribution that are then used by \code{\link[=sim_apply]{sim_apply()}} to calculate quantities of interest. 30 | } 31 | \details{ 32 | When \code{dist} is \code{NULL}, \code{sim()} samples from a multivariate normal or t distribution depending on the degrees of freedom extracted from \code{insight::get_df(., type = "wald")}. If \code{Inf}, a normal distribution will be used; otherwise, a t-distribution with the returned degrees of freedom will be used. Models not supported by \code{insight} will use a normal distribution. 33 | 34 | When a multivariate normal distribution is used, draws are made with means equal to the estimated coefficients and covariance matrix equal to the parameter covariance matrix, using \code{\link[mvnfast:rmvn]{mvnfast::rmvn()}}. When a multivariate t distribution is used, draws are made with means equal to the estimated coefficients and scaling matrix equal to \code{cov*(df - 2)/df}, where \code{cov} is the parameter covariance matrix and \code{df} is the residual degrees of freedom for the model, using \code{\link[mvnfast:rmvt]{mvnfast::rmvt()}}.
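For intuition, the t-distribution sampling described above corresponds roughly to the following sketch (hypothetical code shown for illustration only, not \code{sim()}'s actual internals; it assumes a model \code{fit} with finite residual degrees of freedom):

\preformatted{b <- coef(fit)          # estimated coefficients
V <- vcov(fit)          # parameter covariance matrix
df <- df.residual(fit)  # residual degrees of freedom

# Draw 1000 sets of coefficients from a multivariate t distribution
# with the scaling matrix cov*(df - 2)/df described above
sim.coefs <- mvnfast::rmvt(1000, mu = b,
                           sigma = V * (df - 2) / df,
                           df = df)
}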
35 | } 36 | \examples{ 37 | 38 | data("lalonde", package = "MatchIt") 39 | fit <- lm(re78 ~ treat * (age + race + nodegree + re74), data = lalonde) 40 | 41 | # Simulate coefficients 42 | s <- sim(fit) 43 | s 44 | 45 | ## Could also use a robust covariance matrix, e.g., 46 | s <- sim(fit, vcov = "HC3") 47 | 48 | # Simulate coefficients assuming a normal distribution; 49 | # the default for `lm` objects is a t- 50 | # distribution 51 | s <- sim(fit, dist = "normal") 52 | s 53 | 54 | } 55 | \seealso{ 56 | \itemize{ 57 | \item \code{\link[=misim]{misim()}} for simulating model coefficients after multiple imputation 58 | \item \code{\link[=sim_apply]{sim_apply()}} for applying a function to each set of simulated coefficients 59 | \item \code{\link[=sim_ame]{sim_ame()}} for computing average marginal effects in each simulation draw 60 | \item \code{\link[=sim_setx]{sim_setx()}} for computing marginal predictions and first differences at typical values in each simulation draw 61 | \item \code{\link[=sim_adrf]{sim_adrf()}} for computing average dose-response functions in each simulation draw 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /man/sim_adrf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_adrf.R 3 | \name{sim_adrf} 4 | \alias{sim_adrf} 5 | \alias{print.clarify_adrf} 6 | \title{Compute an average dose-response function} 7 | \usage{ 8 | sim_adrf( 9 | sim, 10 | var, 11 | subset = NULL, 12 | by = NULL, 13 | contrast = "adrf", 14 | at = NULL, 15 | n = 21, 16 | outcome = NULL, 17 | type = NULL, 18 | eps = 1e-05, 19 | verbose = TRUE, 20 | cl = NULL 21 | ) 22 | 23 | \method{print}{clarify_adrf}(x, digits = NULL, max.ests = 6, ...) 24 | } 25 | \arguments{ 26 | \item{sim}{a \code{clarify_sim} object; the output of a call to \code{\link[=sim]{sim()}} or 27 | \code{\link[=misim]{misim()}}.} 28 | 29 | \item{var}{the name of a variable for which the ADRF or AMEF is to be computed. This variable must be present in the model supplied to \code{sim()} and must be a numeric variable taking on more than two unique values.} 30 | 31 | \item{subset}{optional; a vector used to subset the data used to compute the ADRF or AMEF. This will be evaluated within the original dataset used to fit the model using \code{\link[=subset]{subset()}}, so nonstandard evaluation is allowed.} 32 | 33 | \item{by}{a one-sided formula or character vector containing the names of variables for which to stratify the estimates. Each quantity will be computed within each level of the complete cross of the variables specified in \code{by}.} 34 | 35 | \item{contrast}{a string naming the type of quantity to be produced: \code{"adrf"} for the ADRF (the default) or \code{"amef"} for the AMEF.} 36 | 37 | \item{at}{the levels of the variable named in \code{var} at which to evaluate the ADRF or AMEF. Should be a vector of numeric values corresponding to possible levels of \code{var}. If \code{NULL}, will be set to a range from slightly below the lowest observed value of \code{var} to slightly above the largest value.} 38 | 39 | \item{n}{when \code{at = NULL}, the number of points to evaluate the ADRF or AMEF. Default is 21. Ignored when \code{at} is not \code{NULL}.} 40 | 41 | \item{outcome}{a string containing the name of the outcome or outcome level for multivariate (multiple outcomes) or multi-category outcomes.
Ignored for univariate (single outcome) and binary outcomes.} 42 | 43 | \item{type}{a string containing the type of predicted values (e.g., the link or the response). Passed to \code{\link[marginaleffects:get_predict]{marginaleffects::get_predict()}} and eventually to \code{predict()} in most cases. The default and allowable options depend on the type of model supplied, but almost always correspond to the response scale (e.g., predicted probabilities for binomial models).} 44 | 45 | \item{eps}{when \code{contrast = "amef"}, the value by which to shift the value of \code{var} to approximate the derivative. See Details.} 46 | 47 | \item{verbose}{\code{logical}; whether to display a text progress bar indicating 48 | progress and estimated time remaining for the procedure. Default is \code{TRUE}.} 49 | 50 | \item{cl}{a cluster object created by \code{\link[parallel:makeCluster]{parallel::makeCluster()}}, or an 51 | integer to indicate the number of child-processes (integer values are 52 | ignored on Windows) for parallel evaluations. See \code{\link[pbapply:pbapply]{pbapply::pblapply()}} for 53 | details. If \code{NULL}, no parallelization will take place.} 54 | 55 | \item{x}{a \code{clarify_adrf} object.} 56 | 57 | \item{digits}{the minimum number of significant digits to be used; passed to \code{\link[=print.data.frame]{print.data.frame()}}.} 58 | 59 | \item{max.ests}{the maximum number of estimates to display.} 60 | 61 | \item{...}{optional arguments passed to \code{FUN}.} 62 | } 63 | \value{ 64 | A \code{clarify_adrf} object, which inherits from \code{clarify_est} and is similar to 65 | the output of \code{sim_apply()}, with the additional attributes \code{"var"} containing 66 | the variable named in \code{var}, \code{"by"} containing the names of the variables specified in \code{by} (if any), \code{"at"} containing values at which the ADRF or AMEF is evaluated, and \code{"contrast"} containing the argument supplied to \code{contrast}. For an ADRF, the average marginal means will be named 67 | \code{E[Y({v})]}, where \code{{v}} is replaced with the values in \code{at}. For an AMEF, the average marginal effects will be 68 | named \code{dY/d({x})|{a}} where \code{{x}} is replaced with \code{var} and \code{{a}} is replaced by the values in \code{at}. 69 | } 70 | \description{ 71 | \code{sim_adrf()} is a wrapper for \code{\link[=sim_apply]{sim_apply()}} that computes average dose-response functions (ADRFs) and average marginal effect functions (AMEFs). An ADRF describes the relationship between values a focal variable can take and the expected value of the outcome were all units to be given each value of the variable. An AMEF describes the relationship between values a focal variable can take and the derivative of the ADRF at each value. 72 | } 73 | \details{ 74 | The ADRF is composed of average marginal means across levels of the focal predictor. For each level of the focal predictor, predicted values of the outcome are computed after setting the value of the predictor to that level, and those values of the outcome are averaged across all units in the sample to arrive at an average marginal mean. Thus, the ADRF represents the relationship between the "dose" (i.e., the level of the focal predictor) and the average "response" (i.e., the outcome variable). It is the continuous analog to the average marginal effect computed for a binary predictor, e.g., using \code{\link[=sim_ame]{sim_ame()}}.
Although inference can be performed at each level of the predictor or between two levels of the predictor, a plot of the ADRF is typically the most useful summary. These plots can be requested using \code{\link[=plot.clarify_adrf]{plot.clarify_adrf()}}. 75 | 76 | The AMEF is the derivative of the ADRF; if we call the derivative of the ADRF at each point a "treatment effect" (i.e., the rate at which the outcome changes corresponding to a small change in the predictor, or "treatment"), the AMEF is a function that relates the size of the treatment effect to the level of the treatment. The shape of the AMEF is usually of less importance than the value of the AMEF at each level of the predictor, which corresponds to the size of the treatment effect at the corresponding level. The AMEF is computed by computing the ADRF at each level of the focal predictor specified in \code{at}, shifting the predictor value by a tiny amount (controlled by \code{eps}), and computing the ratio of the change in the outcome to the shift, then averaging this value across all units. This quantity is related to the average marginal effect of a continuous predictor as computed by \code{\link[=sim_ame]{sim_ame()}}, but rather than averaging these treatment effects across all observed levels of the treatment, the AMEF is a function evaluated at each possible level of the treatment. The "tiny amount" used is \code{eps} times the standard deviation of \code{var}. 77 | } 78 | \examples{ 79 | data("lalonde", package = "MatchIt") 80 | 81 | # Fit the model 82 | fit <- glm(I(re78 > 0) ~ treat + age + race + 83 | married + re74, 84 | data = lalonde, family = binomial) 85 | 86 | # Simulate coefficients 87 | set.seed(123) 88 | s <- sim(fit, n = 100) 89 | 90 | # ADRF for `age` 91 | est <- sim_adrf(s, var = "age", 92 | at = seq(15, 55, length.out = 6), 93 | verbose = FALSE) 94 | est 95 | plot(est) 96 | 97 | # AMEF for `age` 98 | est <- sim_adrf(s, var = "age", contrast = "amef", 99 | at = seq(15, 55, length.out = 6), 100 | verbose = FALSE) 101 | est 102 | summary(est) 103 | plot(est) 104 | 105 | # ADRF for `age` within levels of `married` 106 | est <- sim_adrf(s, var = "age", 107 | at = seq(15, 55, length.out = 6), 108 | by = ~married, 109 | verbose = FALSE) 110 | est 111 | plot(est) 112 | 113 | ## Difference between ADRFs 114 | est_diff <- est[7:12] - est[1:6] 115 | plot(est_diff) + ggplot2::labs(y = "Diff") 116 | } 117 | \seealso{ 118 | \code{\link[=plot.clarify_adrf]{plot.clarify_adrf()}} for plotting the ADRF or AMEF; \code{\link[=sim_ame]{sim_ame()}} for computing average marginal effects; \code{\link[=sim_apply]{sim_apply()}}, which provides a general interface to computing any 119 | quantities for simulation-based inference; \code{\link[=summary.clarify_est]{summary.clarify_est()}} for computing 120 | p-values and confidence intervals for the estimated quantities. 121 | 122 | \code{\link[marginaleffects:slopes]{marginaleffects::avg_slopes()}} and \code{\link[marginaleffects:predictions]{marginaleffects::avg_predictions()}} for delta method-based implementations of computing average marginal effects and average marginal means.
123 | } 124 | -------------------------------------------------------------------------------- /man/sim_apply.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_apply.R 3 | \name{sim_apply} 4 | \alias{sim_apply} 5 | \title{Apply a function to simulated parameter values} 6 | \usage{ 7 | sim_apply(sim, FUN, verbose = TRUE, cl = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{sim}{a \code{clarify_sim} object; the output of a call to \code{\link[=sim]{sim()}} or 11 | \code{\link[=misim]{misim()}}.} 12 | 13 | \item{FUN}{a function to be applied to each set of simulated coefficients. 14 | See Details.} 15 | 16 | \item{verbose}{\code{logical}; whether to display a text progress bar indicating 17 | progress and estimated time remaining for the procedure. Default is \code{TRUE}.} 18 | 19 | \item{cl}{a cluster object created by \code{\link[parallel:makeCluster]{parallel::makeCluster()}}, or an 20 | integer to indicate the number of child-processes (integer values are 21 | ignored on Windows) for parallel evaluations. See \code{\link[pbapply:pbapply]{pbapply::pblapply()}} for 22 | details. If \code{NULL}, no parallelization will take place.} 23 | 24 | \item{...}{optional arguments passed to \code{FUN}.} 25 | } 26 | \value{ 27 | A \code{clarify_est} object, which is a matrix with a column for each 28 | estimated quantity and a row for each simulation. The original estimates 29 | (\code{FUN} applied to the original coefficients or model fit object) are stored 30 | in the attribute \code{"original"}. The \code{"sim_hash"} attribute contains the 31 | simulation hash produced by \code{sim()}. 32 | } 33 | \description{ 34 | \code{sim_apply()} applies a function that produces quantities of 35 | interest to each set of simulated coefficients produced by \code{\link[=sim]{sim()}}; these 36 | calculated quantities form the posterior sampling distribution for the 37 | quantities of interest. Capabilities are available for parallelization. 38 | } 39 | \details{ 40 | \code{sim_apply()} applies a function, \code{FUN}, to each set of simulated 41 | coefficients, similar to \code{\link[=apply]{apply()}}. This function should return a numeric 42 | vector containing one or more estimated quantities. This should be a named 43 | vector to more easily keep track of the meaning of each estimated quantity. 44 | Care should be taken to ensure that the returned vector is the same length 45 | each time \code{FUN} is called. \code{NA}s are allowed in the output but should be 46 | avoided if possible. 47 | 48 | The arguments to \code{FUN} can be specified in a few ways. If \code{FUN} has an 49 | argument called \code{coefs}, a simulated set of coefficients will be passed to 50 | this argument, and \code{FUN} should compute and return a quantity based on the 51 | coefficients (e.g., the difference between two coefficients if one wants to 52 | test whether two coefficients are equal). If \code{FUN} has an argument called 53 | \code{fit}, a model fit object of the same type as the one originally supplied 54 | to \code{sim()} (e.g., an \code{lm} or \code{glm} object) will be passed to this argument, 55 | where the coefficients of the fit object have been replaced by the 56 | simulated coefficients generated by \code{sim()}, and \code{FUN} should compute and 57 | return a quantity based on the model fit (e.g., a computation based on the 58 | output of \code{predict()}). 
If neither \code{coefs} nor \code{fit} is the name of 59 | an argument to \code{FUN}, the model fit object with replaced coefficients will be 60 | supplied to the first argument of \code{FUN}. 61 | 62 | When custom coefficients are supplied to \code{sim()}, i.e., when the \code{coefs} 63 | argument to \code{sim()} is not left at its default value, \code{FUN} must accept a 64 | \code{coefs} argument and a warning will be thrown if it accepts a \code{fit} 65 | argument. This is because \code{sim_apply()} does not know how to reconstruct 66 | the original fit object with the new coefficients inserted. The quantities 67 | computed by \code{sim_apply()} must therefore be computed directly from the 68 | coefficients. 69 | 70 | If \code{FUN} is not supplied at all, the simulated values of the coefficients will be returned in the output with a warning. Set \code{FUN} to \code{NULL} or \code{verbose} to \code{FALSE} to suppress this warning. 71 | \subsection{\code{sim_apply()} with multiply imputed data}{ 72 | 73 | When using \code{\link[=misim]{misim()}} and \code{sim_apply()} with multiply imputed data, the 74 | coefficients are supplied to the model fit corresponding to the imputation 75 | identifier associated with each set of coefficients, which means if \code{FUN} 76 | uses a dataset extracted from a model (e.g., using \code{\link[insight:get_data]{insight::get_data()}}), it will do so from the model fit in 77 | the corresponding imputation. 78 | 79 | The original estimates (see Value below) are computed as the mean of the 80 | estimates across the imputations using the original coefficients averaged 81 | across imputations. That is, first, the coefficients estimated in the 82 | models in the imputed datasets are combined to form a single set of pooled 83 | coefficients; then, for each imputation, the quantities of interest are 84 | computed using the pooled coefficients; finally, the resulting 85 | estimates are averaged across the imputations to yield the "original" estimates. 86 | Note this procedure is only valid for quantities with symmetric sampling 87 | distributions, which excludes quantities like risk ratios and odds ratios, 88 | but includes log risk ratios and log odds ratios. The desired quantities 89 | can be transformed from their log versions using 90 | \code{\link[=transform]{transform()}}.
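As a conceptual sketch of this pooling procedure (hypothetical code shown for illustration only, not \code{sim_apply()}'s actual internals; \code{fits} is assumed to be the list of model fits from the imputed datasets, and \code{compute_qoi()} is a hypothetical stand-in for \code{FUN}):

\preformatted{# Pool the coefficients by averaging them across imputations
pooled <- colMeans(do.call(rbind, lapply(fits, coef)))

# Compute the quantities of interest in each imputation using the
# pooled coefficients, then average the results across imputations
ests <- lapply(fits, function(f) compute_qoi(fit = f, coefs = pooled))
original <- Reduce(`+`, ests) / length(ests)
}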
91 | } 92 | } 93 | \examples{ 94 | 95 | data("lalonde", package = "MatchIt") 96 | fit <- lm(re78 ~ treat + age + race + nodegree + re74, 97 | data = lalonde) 98 | coef(fit) 99 | 100 | set.seed(123) 101 | s <- sim(fit, n = 500) 102 | 103 | # Function to compare predicted values for two units 104 | # using `fit` argument 105 | sim_fun <- function(fit) { 106 | pred1 <- unname(predict(fit, newdata = lalonde[1,])) 107 | pred2 <- unname(predict(fit, newdata = lalonde[2,])) 108 | c(pred1 = pred1, pred2 = pred2) 109 | } 110 | 111 | est <- sim_apply(s, sim_fun, verbose = FALSE) 112 | 113 | # Add difference between predicted values as 114 | # additional quantity 115 | est <- transform(est, `diff 1-2` = pred1 - pred2) 116 | 117 | # Examine estimates and confidence intervals 118 | summary(est) 119 | 120 | # Function to compare coefficients using `coefs` 121 | # argument 122 | sim_fun <- function(coefs) { 123 | setNames(coefs["racewhite"] - coefs["racehispan"], 124 | "wh - his") 125 | } 126 | 127 | est <- sim_apply(s, sim_fun, verbose = FALSE) 128 | 129 | # Examine estimates and confidence intervals 130 | summary(est) 131 | 132 | # Another way to do the above: 133 | est <- sim_apply(s, FUN = NULL) 134 | est <- transform(est, 135 | `wh - his` = `racewhite` - `racehispan`) 136 | 137 | summary(est, parm = "wh - his") 138 | 139 | } 140 | \seealso{ 141 | \itemize{ 142 | \item \code{\link[=sim]{sim()}} for generating the simulated coefficients 143 | \item \code{\link[=summary.clarify_est]{summary.clarify_est()}} for computing p-values and confidence intervals for 144 | the estimated quantities 145 | \item \code{\link[=plot.clarify_est]{plot.clarify_est()}} for plotting estimated 146 | quantities and their simulated posterior sampling distribution. 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /man/sim_setx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_setx.R 3 | \name{sim_setx} 4 | \alias{sim_setx} 5 | \alias{print.clarify_setx} 6 | \title{Compute predictions and first differences at set values} 7 | \usage{ 8 | sim_setx( 9 | sim, 10 | x = list(), 11 | x1 = list(), 12 | outcome = NULL, 13 | type = NULL, 14 | verbose = TRUE, 15 | cl = NULL 16 | ) 17 | 18 | \method{print}{clarify_setx}(x, digits = NULL, max.ests = 6, ...) 19 | } 20 | \arguments{ 21 | \item{sim}{a \code{clarify_sim} object; the output of a call to \code{\link[=sim]{sim()}} or 22 | \code{\link[=misim]{misim()}}.} 23 | 24 | \item{x}{a data.frame containing a reference grid of predictor values or a named list of values each predictor should take defining such a 25 | reference grid, e.g., \code{list(v1 = 1:4, v2 = c("A", "B"))}. 26 | Any omitted predictors are fixed at a "typical" value. See Details. 27 | When \code{x1} is specified, \code{x} should identify a single reference unit. 28 | 29 | For \code{print()}, a \code{clarify_setx} object.} 30 | 31 | \item{x1}{a data.frame or named list of the value each predictor should take to compute the 32 | first difference from the predictor combination specified in \code{x}. \code{x1} can 33 | only identify a single unit. See Details.} 34 | 35 | \item{outcome}{a string containing the name of the outcome or outcome level for multivariate (multiple outcomes) or multi-category outcomes. 
Ignored for univariate (single outcome) and binary outcomes.} 36 | 37 | \item{type}{a string containing the type of predicted values (e.g., the link or the response). Passed to \code{\link[marginaleffects:get_predict]{marginaleffects::get_predict()}} and eventually to \code{predict()} in most cases. The default and allowable options depend on the type of model supplied, but almost always correspond to the response scale (e.g., predicted probabilities for binomial models).} 38 | 39 | \item{verbose}{\code{logical}; whether to display a text progress bar indicating 40 | progress and estimated time remaining for the procedure. Default is \code{TRUE}.} 41 | 42 | \item{cl}{a cluster object created by \code{\link[parallel:makeCluster]{parallel::makeCluster()}}, or an 43 | integer to indicate the number of child-processes (integer values are 44 | ignored on Windows) for parallel evaluations. See \code{\link[pbapply:pbapply]{pbapply::pblapply()}} for 45 | details. If \code{NULL}, no parallelization will take place.} 46 | 47 | \item{digits}{the minimum number of significant digits to be used; passed to \code{\link[=print.data.frame]{print.data.frame()}}.} 48 | 49 | \item{max.ests}{the maximum number of estimates to display.} 50 | 51 | \item{...}{optional arguments passed to \code{FUN}.} 52 | } 53 | \value{ 54 | a \code{clarify_setx} object, which inherits from \code{clarify_est} and is similar to the output of \code{sim_apply()}, with the following additional attributes: 55 | \itemize{ 56 | \item \code{"setx"} - a data frame containing the values at which predictions are to be made 57 | \item \code{"fd"} - whether or not the first difference is to be computed; set to \code{TRUE} if \code{x1} is specified and \code{FALSE} otherwise 58 | } 59 | } 60 | \description{ 61 | \code{sim_setx()} is a wrapper for \code{\link[=sim_apply]{sim_apply()}} that computes predicted values of 62 | the outcome at specified values of the predictors, sometimes called marginal 63 | predictions. One can also compute the difference between two marginal 64 | predictions (the "first difference"). Although any function that accepts 65 | \code{clarify_est} objects can be used with \code{sim_setx()} output objects, a 66 | special plotting function, \code{\link[=plot.clarify_setx]{plot.clarify_setx()}}, can be used to plot marginal 67 | predictions. 68 | } 69 | \details{ 70 | When \code{x} is a named list of predictor values, they will be crossed 71 | to form a reference grid for the marginal predictions. Any predictors not 72 | set in \code{x} are assigned their "typical" value, which, for factor, 73 | character, logical, and binary variables is the mode, for numeric variables 74 | is the mean, and for ordered variables is the median. These values can be 75 | seen in the \code{"setx"} attribute of the output object. If \code{x} is empty, a 76 | prediction will be made at a point corresponding to the typical value of 77 | every predictor. Estimates are identified (in \code{summary()}, etc.) only by 78 | the variables that differ across predictions. 79 | 80 | When \code{x1} is supplied, the first difference is computed, which here is 81 | defined as the difference between two marginal predictions. One marginal 82 | prediction must be specified in \code{x} and another, ideally with a single 83 | predictor changed, specified in \code{x1}.
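A rough sketch of the "typical" value rule described above (hypothetical code shown for illustration only; \code{sim_setx()} computes these values internally, and the return types here are simplified):

\preformatted{typical <- function(x) {
  if (is.ordered(x)) {
    # Median level for ordered variables (via the integer codes)
    return(levels(x)[ceiling(median(as.integer(x)))])
  }
  if (is.numeric(x) && length(unique(x)) > 2) {
    # Mean for non-binary numeric variables
    return(mean(x))
  }
  # Mode for factor, character, logical, and binary variables
  names(sort(table(x), decreasing = TRUE))[1]
}
}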
84 | } 85 | \examples{ 86 | data("lalonde", package = "MatchIt") 87 | 88 | fit <- lm(re78 ~ treat + age + educ + married + race + re74, 89 | data = lalonde) 90 | 91 | # Simulate coefficients 92 | set.seed(123) 93 | s <- sim(fit, n = 100) 94 | 95 | # Predicted values at specified values of predictors, typical 96 | # values for other predictors 97 | est <- sim_setx(s, x = list(treat = 0:1, 98 | re74 = c(0, 10000)), 99 | verbose = FALSE) 100 | summary(est) 101 | plot(est) 102 | 103 | # Predicted values at specified grid of values, typical 104 | # values for other predictors 105 | est <- sim_setx(s, x = list(age = c(20, 25, 30, 35), 106 | married = 0:1), 107 | verbose = FALSE) 108 | summary(est) 109 | plot(est) 110 | 111 | # First differences of treat at specified value of 112 | # race, typical values for other predictors 113 | est <- sim_setx(s, x = data.frame(treat = 0, race = "hispan"), 114 | x1 = data.frame(treat = 1, race = "hispan"), 115 | verbose = FALSE) 116 | summary(est) 117 | plot(est) 118 | 119 | } 120 | \seealso{ 121 | \code{\link[=sim_apply]{sim_apply()}}, which provides a general interface to computing any 122 | quantities for simulation-based inference; \code{\link[=plot.clarify_setx]{plot.clarify_setx()}} for plotting the 123 | output of a call to \code{sim_setx()}; \code{\link[=summary.clarify_est]{summary.clarify_est()}} for computing 124 | p-values and confidence intervals for the estimated quantities. 125 | } 126 | -------------------------------------------------------------------------------- /man/summary.clarify_est.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.clarify_est.R, R/summary.clarify_est.R 3 | \name{plot.clarify_est} 4 | \alias{plot.clarify_est} 5 | \alias{summary.clarify_est} 6 | \alias{confint.clarify_est} 7 | \title{Plotting and inference for \code{clarify_est} objects} 8 | \usage{ 9 | \method{plot}{clarify_est}( 10 | x, 11 | parm, 12 | ci = TRUE, 13 | level = 0.95, 14 | method = "quantile", 15 | reference = FALSE, 16 | ncol = 3, 17 | ... 18 | ) 19 | 20 | \method{summary}{clarify_est}(object, parm, level = 0.95, method = "quantile", null = NA, ...) 21 | 22 | \method{confint}{clarify_est}(object, parm, level = 0.95, method = "quantile", ...) 23 | } 24 | \arguments{ 25 | \item{parm}{a vector of the names or indices of the estimates to plot. If unspecified, all estimates will be displayed.} 26 | 27 | \item{ci}{\code{logical}; whether to display confidence interval limits for the estimates. Default is \code{TRUE}.} 28 | 29 | \item{level}{the confidence level desired. Default is .95 for 95\% confidence intervals.} 30 | 31 | \item{method}{the method used to compute p-values and confidence intervals. Can be \code{"wald"} to use a Normal approximation or \code{"quantile"} to use the simulated sampling distribution (default). See Details. Abbreviations allowed.} 32 | 33 | \item{reference}{\code{logical}; whether to overlay a normal density reference distribution over the plots.
Default is \code{FALSE}.} 34 | 35 | \item{ncol}{the number of columns used when wrapping multiple plots; default is 3.} 36 | 37 | \item{...}{for \code{plot()}, further arguments passed to \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}}.} 38 | 39 | \item{object, x}{a \code{clarify_est} object; the output of a call to \code{\link[=sim_apply]{sim_apply()}} or its wrappers.} 40 | 41 | \item{null}{the values of the parameters under the null hypothesis for the p-value calculations. Should have length equal to the number of quantities estimated, or one, in which case it will be recycled, or it can be a named vector with just the names of quantities for which null values are to be set. Set values to \code{NA} to omit p-values for those quantities. When all values are \code{NA}, the default, no p-values are produced.} 42 | } 43 | \value{ 44 | For \code{summary()}, a \code{summary.clarify_est} object, which is a matrix containing the coefficient estimates, standard errors, test statistics, p-values, and confidence intervals. Not all columns will be present depending on the arguments supplied to \code{summary()}. 45 | 46 | For \code{confint()}, a matrix containing the confidence intervals for the requested quantities. 47 | 48 | For \code{plot()}, a \code{ggplot} object. 49 | } 50 | \description{ 51 | \code{summary()} tabulates the estimates and confidence intervals and (optionally) p-values from a \code{clarify_est} object. \code{confint()} computes confidence intervals. \code{plot()} plots the "posterior" distribution of estimates. 52 | } 53 | \details{ 54 | \code{summary()} uses the estimates computed from the original model as its estimates and uses the simulated parameters for inference only, in line with the recommendations of Rainey (2023). 55 | 56 | When \code{method = "wald"}, the standard deviation of the simulation estimates is used as the standard error, which is used in the z-statistics and the confidence intervals. The p-values and confidence intervals are valid only when the sampling distribution of the resulting statistic is normal (which can be assessed using \code{plot()}). When \code{method = "quantile"}, the confidence interval is calculated using the quantiles of the simulation estimates corresponding to \code{level}, and the p-value is calculated as twice the proportion of simulation estimates less than or greater than \code{null}, whichever is smaller; this is equivalent to inverting the confidence interval but is only truly valid when the true sampling distribution is only a location shift from the sampling distribution under the null hypothesis and should therefore be interpreted with caution. Using \code{method = "quantile"} (the default) is recommended because the confidence intervals will be valid even if the sampling distribution is not Normally distributed. The precision of the p-values and confidence intervals depends on the number of simulations requested (the value of \code{n} supplied to \code{\link[=sim]{sim()}}). 57 | 58 | The plots are produced using \code{\link[ggplot2:geom_density]{ggplot2::geom_density()}} and can be customized with \pkg{ggplot2} functions. When \code{reference = TRUE}, a reference Normal distribution is produced using the empirical mean and standard deviation of the simulated values. A blue reference line is plotted at the median of the simulated values.
For Wald-based inference to be valid, the reference distribution should overlap with the empirical distribution, in which case the quantile-based and Wald-based intervals should be similar. For quantile-based inference to be valid, the median of the simulated estimates should be close to the original estimate, though this is a necessary but not sufficient condition. 59 | } 60 | \examples{ 61 | data("lalonde", package = "MatchIt") 62 | fit <- glm(I(re78 > 0) ~ treat + age + race + nodegree + re74, 63 | data = lalonde, family = binomial) 64 | 65 | s <- sim(fit, n = 100) 66 | 67 | # Compute average marginal means for `treat` 68 | est <- sim_ame(s, var = "treat", verbose = FALSE) 69 | coef(est) 70 | 71 | # Compute average marginal effects on risk difference 72 | # (RD) and risk ratio (RR) scale 73 | est <- transform(est, 74 | RD = `E[Y(1)]` - `E[Y(0)]`, 75 | RR = `E[Y(1)]` / `E[Y(0)]`) 76 | 77 | # Compute confidence intervals and p-values, 78 | # using given null values for computing p-values 79 | summary(est, null = c(`RD` = 0, `RR` = 1)) 80 | 81 | # Same tests using normal approximation and alternate 82 | # syntax for `null` 83 | summary(est, null = c(NA, NA, 0, 1), 84 | method = "wald") 85 | 86 | # Plot the RD and RR with a reference distribution 87 | plot(est, parm = c("RD", "RR"), reference = TRUE, 88 | ci = FALSE) 89 | 90 | # Plot the RD and RR with quantile confidence bounds 91 | plot(est, parm = c("RD", "RR"), ci = TRUE) 92 | 93 | } 94 | \references{ 95 | Rainey, C. (2023). A careful consideration of CLARIFY: Simulation-induced bias in point estimates of quantities of interest. \emph{Political Science Research and Methods}, 1–10. \doi{10.1017/psrm.2023.8} 96 | } 97 | \seealso{ 98 | \itemize{ 99 | \item \code{\link[=sim_apply]{sim_apply()}} for applying a function to each set of simulated coefficients 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /man/transform.clarify_est.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/transform.clarify_est.R 3 | \name{transform.clarify_est} 4 | \alias{transform.clarify_est} 5 | \alias{cbind.clarify_est} 6 | \title{Transform and combine \code{clarify_est} objects} 7 | \usage{ 8 | \method{transform}{clarify_est}(`_data`, ...) 9 | 10 | \method{cbind}{clarify_est}(..., deparse.level = 1) 11 | } 12 | \arguments{ 13 | \item{_data}{the \code{clarify_est} object to be transformed.} 14 | 15 | \item{...}{for \code{transform()}, arguments in the form \code{name = value}, where \code{name} is the name of a new quantity to be computed and \code{value} is an expression that is a function of the existing quantities corresponding to the new quantity to be computed. See Details. For \code{cbind()}, \code{clarify_est} objects to be combined.} 16 | 17 | \item{deparse.level}{ignored.} 18 | } 19 | \value{ 20 | A \code{clarify_est} object, either with new columns added (when using \code{transform()}) or combining two \code{clarify_est} objects. Note that any type attributes corresponding to the \code{sim_apply()} wrapper used (e.g., \code{sim_ame()}) are lost when using either function. This can affect any helper functions (e.g., \code{plot()}) designed to work with the output of specific wrappers. 21 | } 22 | \description{ 23 | \code{transform()} modifies a \code{clarify_est} object by allowing for the calculation of new quantities from the existing quantities without re-simulating them.
\code{cbind()} binds two \code{clarify_est} objects together. 24 | } 25 | \details{ 26 | For \code{transform()}, the expression on the right side of the \code{=} should use the names of the existing quantities (e.g., \code{`E[Y(1)]` - `E[Y(0)]`}), with \verb{`} appropriately included when the quantity names include parentheses or brackets. Alternatively, it can use indexes prefixed by \code{.b}, e.g., \code{.b2 - .b1}, to refer to the corresponding quantity by position. This can aid in computing derived quantities of quantities with complicated names. (Note that if a quantity is named something like \code{.b1}, it will need to be referred to by position rather than name, as the position-based label takes precedence). See examples. Setting an existing value to \code{NULL} will remove that quantity from the object. 27 | 28 | \code{cbind()} does not rename the quantities or check for uniqueness of the names, so it is important to rename them yourself prior to combining the objects. 29 | } 30 | \examples{ 31 | data("lalonde", package = "MatchIt") 32 | 33 | # Fit the model 34 | fit <- lm(re78 ~ treat * (age + educ + race + 35 | married + re74 + re75), 36 | data = lalonde) 37 | 38 | # Simulate coefficients 39 | set.seed(123) 40 | s <- sim(fit, n = 100) 41 | 42 | # Average adjusted predictions for `treat` within 43 | # subsets of `race` 44 | est_b <- sim_ame(s, var = "treat", verbose = FALSE, 45 | subset = race == "black") 46 | est_b 47 | 48 | est_h <- sim_ame(s, var = "treat", verbose = FALSE, 49 | subset = race == "hispan") 50 | est_h 51 | 52 | # Compute differences between adjusted predictions 53 | est_b <- transform(est_b, 54 | diff = `E[Y(1)]` - `E[Y(0)]`) 55 | est_b 56 | 57 | est_h <- transform(est_h, 58 | diff = `E[Y(1)]` - `E[Y(0)]`) 59 | est_h 60 | 61 | # Bind estimates together after renaming 62 | names(est_b) <- paste0(names(est_b), "_b") 63 | names(est_h) <- paste0(names(est_h), "_h") 64 | 65 | est <- cbind(est_b, est_h) 66 | est 67 | 68 | # Compute difference in race-specific differences 69 | est <- transform(est, 70 | `diff-diff` = .b6 - .b3) 71 | 72 | summary(est, 73 | parm = c("diff_b", "diff_h", "diff-diff")) 74 | 75 | # Remove last quantity by using `NULL` 76 | transform(est, `diff-diff` = NULL) 77 | } 78 | \seealso{ 79 | \code{\link[=transform]{transform()}}, \code{\link[=cbind]{cbind()}}, \code{\link[=sim]{sim()}} 80 | } 81 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration?
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(clarify) 11 | 12 | test_check("clarify") 13 | -------------------------------------------------------------------------------- /tests/testthat/fixtures/make_mdata.R: -------------------------------------------------------------------------------- 1 | #Make matched data 2 | m <- MatchIt::matchit(treat ~ age + educ + race + married + re74, 3 | data = MatchIt::lalonde, method = "full", estimand = "ATE", 4 | caliper = .05) 5 | md <- MatchIt::match.data(m, data = MatchIt::lalonde) 6 | md$binY <- as.numeric(md$re78 > 0) 7 | 8 | set.seed(1993) 9 | md$countY <- rpois(nrow(md), 5) 10 | md$propY <- runif(nrow(md)) 11 | 12 | saveRDS(md, test_path("fixtures", "mdata.rds")) 13 | -------------------------------------------------------------------------------- /tests/testthat/fixtures/make_mira.R: -------------------------------------------------------------------------------- 1 | d <- cobalt::lalonde_mis 2 | d$binY <- as.numeric(d$re78 > 0) 3 | 4 | imp <- mice::mice(d, maxit = 5, m = 10, printFlag = FALSE, 5 | seed = 1234567) 6 | 7 | #mira: models fit in each imputed dataset 8 | mira <- with(imp, glm(binY ~ treat + age + educ + race + re74, family = binomial)) 9 | saveRDS(mira, test_path("fixtures", "mira.rds")) 10 | 11 | #list of models: 12 | model_list <- lapply(mice::complete(imp, "all"), function(data) { 13 | glm(binY ~ treat + age + educ + race + re74, family = binomial, 14 | data = data) 15 | }) 16 | saveRDS(model_list, test_path("fixtures", "model_list.rds")) 17 | 18 | m <- MatchThem::matchthem(treat ~ age + educ + race + married + re74 + re75, 19 | imp, estimand = "ATE", method = "full", link = "probit") 20 | 21 | #mimira: models fit in each matched imputed dataset 22 | mimira <- with(m, glm(binY ~ treat + age + educ + race + re74, 23 | family = "quasibinomial")) 24 | 25 | saveRDS(mimira, test_path("fixtures", "mimira.rds")) 26 | -------------------------------------------------------------------------------- /tests/testthat/fixtures/mdata.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/tests/testthat/fixtures/mdata.rds -------------------------------------------------------------------------------- /tests/testthat/fixtures/mimira.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/tests/testthat/fixtures/mimira.rds -------------------------------------------------------------------------------- /tests/testthat/fixtures/mira.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/tests/testthat/fixtures/mira.rds -------------------------------------------------------------------------------- /tests/testthat/fixtures/model_list.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IQSS/clarify/ab7e624ba3c8277b5001482c2c689b5d0bd93a65/tests/testthat/fixtures/model_list.rds -------------------------------------------------------------------------------- /tests/testthat/helper.R: -------------------------------------------------------------------------------- 1 | #testthat helpers 2 | 3 | expect_good_clarify_sim <- function(s) {
expect_s3_class(s, "clarify_sim") 5 | expect_true(all(c("sim.coefs", "coefs", "vcov") %in% names(s))) 6 | 7 | expect_true(is.matrix(s$sim.coefs)) 8 | expect_type(s$sim.coefs, "double") 9 | expect_type(s$coefs, "double") 10 | expect_true(is.matrix(s$vcov)) 11 | expect_type(s$vcov, "double") 12 | 13 | expect_vector(attr(s, "dist"), character(), 1) 14 | expect_vector(attr(s, "sim_hash"), character(), 1) 15 | expect_vector(attr(s, "use_fit"), logical(), 1) 16 | 17 | expect_equal(isTRUE(!is.null(s$fit)), attr(s, "use_fit")) 18 | 19 | expect_equal(length(s$coefs), ncol(s$sim.coefs)) 20 | expect_equal(ncol(s$vcov), nrow(s$vcov)) 21 | expect_equal(length(s$coefs), nrow(s$vcov)) 22 | 23 | expect_false(any(!is.finite(s$sim.coefs))) 24 | expect_false(any(!is.finite(s$coefs))) 25 | expect_false(any(!is.finite(s$vcov))) 26 | } 27 | 28 | expect_good_clarify_est <- function(e) { 29 | expect_s3_class(e, "clarify_est") 30 | expect_length(dim(e), 2L) 31 | expect_type(e, "double") 32 | 33 | expect_vector(attr(e, "original"), numeric(), ncol(e)) 34 | expect_vector(attr(e, "sim_hash"), character(), 1) 35 | 36 | expect_identical(names(e), names(attr(e, "original"))) 37 | 38 | expect_false(any(apply(e, 2, all_the_same))) 39 | } 40 | 41 | expect_good_clarify_misim <- function(s) { 42 | expect_s3_class(s, "clarify_misim") 43 | expect_s3_class(s, "clarify_sim") 44 | expect_true(all(c("sim.coefs", "coefs", "imp") %in% names(s))) 45 | 46 | expect_true(is.matrix(s$sim.coefs)) 47 | expect_type(s$sim.coefs, "double") 48 | expect_true(is.matrix(s$coefs)) 49 | expect_type(s$coefs, "double") 50 | if (!is.null(s$fit)) expect_equal(nrow(s$coefs), length(s$fit)) 51 | expect_type(s$imp, "integer") 52 | expect_equal(max(s$imp), nrow(s$coefs)) 53 | expect_equal(length(s$imp), nrow(s$sim.coefs)) 54 | 55 | expect_vector(attr(s, "dist"), character(), 1) 56 | expect_vector(attr(s, "sim_hash"), character(), 1) 57 | expect_vector(attr(s, "use_fit"), logical(), 1) 58 | 59 | expect_equal(isTRUE(!is.null(s$fit)), attr(s, "use_fit")) 60 | 61 | expect_equal(ncol(s$coefs), ncol(s$sim.coefs)) 62 | 63 | expect_false(any(!is.finite(s$sim.coefs))) 64 | expect_false(any(!is.finite(s$coefs))) 65 | } 66 | -------------------------------------------------------------------------------- /tests/testthat/test-transform.R: -------------------------------------------------------------------------------- 1 | test_that("transform() works", { 2 | mdata <- readRDS(test_path("fixtures", "mdata.rds")) 3 | 4 | fit <- lm(re78 ~ treat * age + educ + race + re74, data = mdata, 5 | weights = weights) 6 | 7 | s <- sim(fit, n = 5) 8 | 9 | e0 <- sim_ame(s, "treat", verbose = FALSE) 10 | 11 | e1 <- transform(e0, diff = `E[Y(1)]` - `E[Y(0)]`) 12 | 13 | expect_good_clarify_est(e1) 14 | expect_equal(length(names(e1)), 3) 15 | expect_equal(as.matrix(e1)[,2] - as.matrix(e1)[,1], 16 | as.matrix(e1)[,3]) 17 | 18 | #Test positional matching 19 | e2 <- transform(e0, diff = .b2 - .b1) 20 | 21 | expect_good_clarify_est(e2) 22 | 23 | expect_equal(e1, e2) 24 | 25 | # test that positional matching is prioritized 26 | e3 <- e0; names(e3) <- c(".b2", ".b1") 27 | 28 | e3 <- transform(e3, diff = .b2 - .b1) 29 | expect_good_clarify_est(e3) 30 | expect_equal(e1[3], e3[3]) 31 | 32 | #Test that NULL removes existing values but not new ones 33 | e4 <- transform(e2, diff = NULL) 34 | expect_good_clarify_est(e4) 35 | expect_equal(e1[-3], e4) 36 | 37 | e4 <- transform(e0, 38 | diff2 = .b1 - .b2, 39 | diff2 = NULL) 40 | expect_good_clarify_est(e4) 41 | expect_equal(length(names(e4)), 3) 42 | }) 
43 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/Zelig.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Translating Zelig to clarify" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Translating Zelig to clarify} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | bibliography: references.bib 9 | --- 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>", 15 | warning = FALSE, 16 | fig.width = 6.5, 17 | fig.height = 2.75 18 | ) 19 | ``` 20 | 21 | ## Introduction 22 | 23 | In this document, we demonstrate some common uses of `Zelig` [@imaiCommonFrameworkStatistical2008a] and how the same tasks can be performed using `clarify`. We'll include examples for computing predictions at representative values (i.e., `setx()` and `sim()` in `Zelig`), the rare-events logit model, estimating the average treatment effect in the treated (ATT) after matching, and combining estimates after multiple imputation. 24 | 25 | The usual workflow in `Zelig` is to fit a model using `zelig()`, specify quantities of interest to simulate using `setx()` on the `zelig()` output, and then simulate those quantities using `sim()`. `clarify` uses a similar approach, except that the model is fit outside `clarify` using functions in a different R package. In addition, `clarify`'s `sim_apply()` allows for the computation of any arbitrary quantity of interest. Unlike `Zelig`, `clarify` follows the recommendations of @raineyCarefulConsiderationCLARIFY2023 to use the estimates computed from the original model coefficients rather than the average of the simulated draws. We'll demonstrate how to replicate a standard `Zelig` analysis using `clarify` step-by-step. Because simulation-based inference involves randomness and some of the algorithms may not perfectly align, one shouldn't expect results to be identical, though in most cases, they should be similar. 26 | 27 | ```{r} 28 | ## library("Zelig") 29 | library("clarify") 30 | set.seed(100) 31 | ``` 32 | 33 | Note that both `Zelig` and `clarify` have a function called "`sim()`", so we will always make it clear which package's `sim()` is being used. 34 | 35 | ## Predictions at representative values 36 | 37 | Here we'll use the `lalonde` dataset in `{MatchIt}` and fit a linear model for `re78` as a function of the treatment `treat` and covariates. 38 | 39 | ```{r} 40 | data("lalonde", package = "MatchIt") 41 | ``` 42 | 43 | We'll be interested in the predicted values of the outcome for a typical unit at each level of treatment and their first difference.
44 | 45 | ### `Zelig` workflow 46 | 47 | In `Zelig`, we fit the model using `zelig()`: 48 | 49 | ```{r, eval = FALSE} 50 | fit <- zelig(re78 ~ treat + age + educ + married + race + 51 | nodegree + re74 + re75, data = lalonde, 52 | model = "ls", cite = FALSE) 53 | ``` 54 | 55 | Next, we use `setx()` and `setx1()` to set our values of `treat`: 56 | 57 | ```{r, eval = FALSE} 58 | fit <- setx(fit, treat = 0) 59 | fit <- setx1(fit, treat = 1) 60 | ``` 61 | 62 | Next, we simulate the values using `sim()`: 63 | 64 | ```{r, eval = FALSE} 65 | fit <- Zelig::sim(fit) 66 | ``` 67 | 68 | Finally, we can print and plot the predicted values and first differences: 69 | 70 | ```{r, eval = FALSE} 71 | fit 72 | ``` 73 | 74 | ```{r, eval = F} 75 | plot(fit) 76 | ``` 77 | 78 | ### `clarify` workflow 79 | 80 | In `clarify`, we fit the model using functions outside `clarify`, like `stats::lm()` or `fixest::feols()`. 81 | 82 | ```{r} 83 | fit <- lm(re78 ~ treat + age + educ + married + race + 84 | nodegree + re74 + re75, data = lalonde) 85 | ``` 86 | 87 | Next, we simulate the model coefficients using `clarify::sim()`: 88 | 89 | ```{r} 90 | s <- clarify::sim(fit) 91 | ``` 92 | 93 | Next, we use `sim_setx()` to set our values of the predictors: 94 | 95 | ```{r} 96 | est <- sim_setx(s, x = list(treat = 0), x1 = list(treat = 1), 97 | verbose = FALSE) 98 | ``` 99 | 100 | Finally, we can summarize and plot the predicted values: 101 | 102 | ```{r} 103 | summary(est) 104 | 105 | plot(est) 106 | ``` 107 | 108 | ## Rare-events logit 109 | 110 | `Zelig` uses a special method for logistic regression with rare events as described in @kingLogisticRegressionRare2001. This is the primary implementation of the method in R. However, newer methods have been developed that perform similarly to or better than the method of King and Zeng [@puhrFirthLogisticRegression2017] and are implemented in R packages that are compatible with `clarify`, such as `logistf` and `brglm2`. 111 | 112 | Here, we'll use the `lalonde` dataset with a constructed rare outcome variable to demonstrate how to perform a rare events logistic regression in `Zelig` and in `clarify`. 113 | 114 | ```{r} 115 | data("lalonde", package = "MatchIt") 116 | 117 | #Rare outcome: 1978 earnings over $20k; ~6% prevalence 118 | lalonde$re78_20k <- lalonde$re78 >= 20000 119 | ``` 120 | 121 | ### `Zelig` workflow 122 | 123 | In `Zelig`, we fit a rare events logistic model using `zelig()` with `model = "relogit"`. 124 | 125 | ```{r, eval = FALSE} 126 | fit <- zelig(re78_20k ~ treat + age + educ + married + race + 127 | nodegree + re74 + re75, data = lalonde, 128 | model = "relogit", cite = FALSE) 129 | 130 | fit 131 | ``` 132 | 133 | We can compute predicted values at representative values using `setx()` and `Zelig::sim()` as above. 134 | 135 | ```{r, eval = FALSE} 136 | fit <- setx(fit, treat = 0) 137 | fit <- setx1(fit, treat = 1) 138 | 139 | fit <- Zelig::sim(fit) 140 | 141 | fit 142 | ``` 143 | 144 | ```{r, eval = FALSE} 145 | plot(fit) 146 | ``` 147 | 148 | ### `clarify` workflow
151 | 152 | ```{r} 153 | fit <- logistf::logistf(re78_20k ~ treat + age + educ + married + race + 154 | nodegree + re74 + re75, data = lalonde, 155 | flic = TRUE) 156 | 157 | summary(fit) 158 | ``` 159 | 160 | We can compute predictions at representative values using `clarify::sim()` and `sim_setx()`. 161 | 162 | ```{r} 163 | s <- clarify::sim(fit) 164 | 165 | est <- sim_setx(s, x = list(treat = 0), x1 = list(treat = 1), 166 | verbose = FALSE) 167 | 168 | summary(est) 169 | ``` 170 | 171 | ```{r} 172 | plot(est) 173 | ``` 174 | 175 | ## Estimating the ATT after matching 176 | 177 | Here we'll use the `lalonde` dataset and perform propensity score matching and then fit a linear model for `re78` as a function of the treatment `treat`, the covariates, and their interaction. From this model, we'll compute the ATT of `treat` using `Zelig` and `clarify`. 178 | 179 | ```{r} 180 | data("lalonde", package = "MatchIt") 181 | 182 | m.out <- MatchIt::matchit(treat ~ age + educ + married + race + 183 | nodegree + re74 + re75, data = lalonde, 184 | method = "nearest") 185 | ``` 186 | 187 | ### `Zelig` workflow 188 | 189 | In `Zelig`, we fit the model using `zelig()` directly on the `matchit` object: 190 | 191 | ```{r, eval = FALSE} 192 | fit <- zelig(re78 ~ treat * (age + educ + married + race + 193 | nodegree + re74 + re75), 194 | data = m.out, model = "ls", cite = FALSE) 195 | ``` 196 | 197 | Next, we use `ATT()` to request the ATT of `treat` and simulate the values: 198 | 199 | ```{r, eval = FALSE} 200 | fit <- ATT(fit, "treat") 201 | ``` 202 | 203 | ```{r, eval = F} 204 | fit 205 | ``` 206 | 207 | ```{r, eval = F} 208 | plot(fit) 209 | ``` 210 | 211 | ### `clarify` workflow 212 | 213 | In `clarify`, we need to extract the matched dataset and fit a model outside `clarify` using another package. 214 | 215 | ```{r} 216 | m.data <- MatchIt::match.data(m.out) 217 | 218 | fit <- lm(re78 ~ treat * (age + educ + married + race + 219 | nodegree + re74 + re75), 220 | data = m.data) 221 | ``` 222 | 223 | Next, we simulate the model coefficients using `clarify::sim()`. Because we performed pair matching, we will request a cluster-robust standard error: 224 | 225 | ```{r} 226 | s <- clarify::sim(fit, vcov = ~subclass) 227 | ``` 228 | 229 | Next, we use `sim_ame()` to request the average marginal effect of `treat` within the subset of treated units: 230 | 231 | ```{r} 232 | est <- sim_ame(s, var = "treat", subset = treat == 1, 233 | contrast = "diff", verbose = FALSE) 234 | ``` 235 | 236 | Finally, we can summarize and plot the ATT: 237 | 238 | ```{r} 239 | summary(est) 240 | 241 | plot(est) 242 | ``` 243 | 244 | ## Combining results after multiple imputation 245 | 246 | Here we'll use the `africa` dataset in `{Amelia}` to demonstrate combining estimates after multiple imputation. This analysis is also demonstrated using `clarify` at the end of `vignette("clarify")`. 247 | 248 | ```{r, message=F} 249 | library(Amelia) 250 | data("africa", package = "Amelia") 251 | ``` 252 | 253 | First we multiply impute the data using `amelia()` using the specification in the `{Amelia}` documentation. 
254 | 255 | ```{r} 256 | # Multiple imputation 257 | a.out <- amelia(x = africa, m = 10, cs = "country", 258 | ts = "year", logs = "gdp_pc", p2s = 0) 259 | ``` 260 | 261 | ### `Zelig` workflow 262 | 263 | With `Zelig`, we can supply the `amelia` object directly to the `data` argument of `zelig()` to fit a model in each imputed dataset: 264 | 265 | ```{r, eval = FALSE} 266 | fit <- zelig(gdp_pc ~ infl * trade, data = a.out, 267 | model = "ls", cite = FALSE) 268 | ``` 269 | 270 | The coefficient estimates, combined across imputations, can be summarized using `summary()`: 271 | 272 | ```{r, eval = FALSE} 273 | summary(fit) 274 | ``` 275 | 276 | We can use `Zelig::sim()` and `setx()` to compute predictions at specified values of the predictors: 277 | 278 | ```{r, eval = FALSE} 279 | fit <- setx(fit, infl = 0, trade = 40) 280 | fit <- setx1(fit, infl = 0, trade = 60) 281 | 282 | fit <- Zelig::sim(fit) 283 | ``` 284 | 285 | `Zelig` does not allow you to combine predicted values across imputations. 286 | 287 | ```{r, eval = F} 288 | fit 289 | ``` 290 | 291 | ```{r, eval = F} 292 | plot(fit) 293 | ``` 294 | 295 | ### `clarify` workflow 296 | 297 | `clarify` does not combine coefficients, unlike `zelig()`; instead, the models should be fit using `with()` on the `amelia` object. To view the combined coefficient estimates, use `Amelia::mi.combine()`. 298 | 299 | ```{r} 300 | #Use Amelia functions to model and combine coefficients 301 | fits <- with(a.out, lm(gdp_pc ~ infl * trade)) 302 | 303 | mi.combine(fits) 304 | ``` 305 | 306 | Derived quantities can be computed using `clarify::misim()` and `sim_apply()` or its wrappers on the `with()` output, which is a list of regression model fits: 307 | 308 | ```{r} 309 | #Simulate coefficients, 100 in each of 10 imputations 310 | s <- misim(fits, n = 100) 311 | 312 | #Compute predictions at specified values 313 | est <- sim_setx(s, x = list(infl = 0, trade = 40), 314 | x1 = list(infl = 0, trade = 60), 315 | verbose = FALSE) 316 | 317 | summary(est) 318 | 319 | plot(est) 320 | ``` 321 | 322 | ## References 323 | -------------------------------------------------------------------------------- /vignettes/references.bib: -------------------------------------------------------------------------------- 1 | 2 | @article{kingMakingMostStatistical2000, 3 | title = {Making the Most of Statistical Analyses: Improving Interpretation and Presentation}, 4 | author = {King, Gary and Tomz, Michael and Wittenberg, Jason}, 5 | year = {2000}, 6 | date = {2000}, 7 | journal = {American Journal of Political Science}, 8 | pages = {347--361}, 9 | volume = {44}, 10 | number = {2}, 11 | doi = {10.2307/2669316}, 12 | url = {https://www.jstor.org/stable/2669316}, 13 | note = {tex.ids= kingMakingMostStatistical2000a 14 | publisher: [Midwest Political Science Association, Wiley]} 15 | } 16 | 17 | @article{zhouNoteBayesianInference2010, 18 | title = {A Note on Bayesian Inference After Multiple Imputation}, 19 | author = {Zhou, Xiang and Reiter, Jerome P.}, 20 | year = {2010}, 21 | month = {05}, 22 | date = {2010-05}, 23 | journal = {The American Statistician}, 24 | pages = {159--163}, 25 | volume = {64}, 26 | number = {2}, 27 | doi = {10.1198/tast.2010.09109}, 28 | url = {http://www.tandfonline.com/doi/abs/10.1198/tast.2010.09109}, 29 | langid = {en} 30 | } 31 | 32 | @article{tomzClarifySoftwareInterpreting2003, 33 | title = {Clarify: Software for Interpreting and Presenting Statistical Results}, 34 | author = {Tomz, Michael and Wittenberg, Jason and King, Gary}, 35 | year = {2003}, 36 | month = {01}, 37

## References

--------------------------------------------------------------------------------
/vignettes/references.bib:
--------------------------------------------------------------------------------

@article{kingMakingMostStatistical2000,
  title = {Making the Most of Statistical Analyses: Improving Interpretation and Presentation},
  author = {King, Gary and Tomz, Michael and Wittenberg, Jason},
  year = {2000},
  date = {2000},
  journal = {American Journal of Political Science},
  pages = {347--361},
  volume = {44},
  number = {2},
  doi = {10.2307/2669316},
  url = {https://www.jstor.org/stable/2669316}
}

@article{zhouNoteBayesianInference2010,
  title = {A Note on Bayesian Inference After Multiple Imputation},
  author = {Zhou, Xiang and Reiter, Jerome P.},
  year = {2010},
  month = {05},
  date = {2010-05},
  journal = {The American Statistician},
  pages = {159--163},
  volume = {64},
  number = {2},
  doi = {10.1198/tast.2010.09109},
  url = {http://www.tandfonline.com/doi/abs/10.1198/tast.2010.09109},
  langid = {en}
}

@article{tomzClarifySoftwareInterpreting2003,
  title = {Clarify: Software for Interpreting and Presenting Statistical Results},
  author = {Tomz, Michael and Wittenberg, Jason and King, Gary},
  year = {2003},
  month = {01},
  date = {2003-01-15},
  journal = {Journal of Statistical Software},
  pages = {1--30},
  volume = {8},
  number = {1},
  doi = {10.18637/jss.v008.i01},
  url = {https://doi.org/10.18637/jss.v008.i01},
  langid = {en}
}

@article{imaiCommonFrameworkStatistical2008a,
  title = {Toward a Common Framework for Statistical Analysis and Development},
  author = {Imai, Kosuke and King, Gary and Lau, Olivia},
  year = {2008},
  month = {12},
  date = {2008-12-01},
  journal = {Journal of Computational and Graphical Statistics},
  pages = {892--913},
  volume = {17},
  number = {4},
  doi = {10.1198/106186008X384898},
  url = {https://doi.org/10.1198/106186008X384898}
}

@article{puhrFirthLogisticRegression2017,
  title = {Firth's Logistic Regression with Rare Events: Accurate Effect Estimates and Predictions?},
  author = {Puhr, Rainer and Heinze, Georg and Nold, Mariana and Lusa, Lara and Geroldinger, Angelika},
  year = {2017},
  month = {06},
  date = {2017-06-30},
  journal = {Statistics in Medicine},
  pages = {2302--2317},
  volume = {36},
  number = {14},
  doi = {10.1002/sim.7273},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/sim.7273},
  langid = {en}
}

@article{kingLogisticRegressionRare2001,
  title = {Logistic Regression in Rare Events Data},
  author = {King, Gary and Zeng, Langche},
  year = {2001},
  date = {2001},
  journal = {Political Analysis},
  pages = {137--163},
  volume = {9},
  number = {2},
  doi = {10.1093/oxfordjournals.pan.a004868},
  url = {https://www.cambridge.org/core/product/identifier/S1047198700003740/type/journal_article},
  langid = {en}
}

@article{raineyCarefulConsiderationCLARIFY2023,
  title = {A Careful Consideration of {{CLARIFY}}: Simulation-Induced Bias in Point Estimates of Quantities of Interest},
  author = {Rainey, Carlisle},
  year = {2023},
  month = {04},
  date = {2023-04},
  journal = {Political Science Research and Methods},
  pages = {1--10},
  doi = {10.1017/psrm.2023.8},
  url = {https://doi.org/10.1017/psrm.2023.8},
  langid = {en}
}

@article{rainey2017,
  title = {Transformation-Induced Bias: Unbiased Coefficients Do Not Imply Unbiased Quantities of Interest},
  author = {Rainey, Carlisle},
  year = {2017},
  month = {07},
  date = {2017-07},
  journal = {Political Analysis},
  pages = {402--409},
  volume = {25},
  number = {3},
  doi = {10.1017/pan.2017.11},
  url = {http://dx.doi.org/10.1017/pan.2017.11},
  langid = {en}
}
--------------------------------------------------------------------------------