├── .Rbuildignore ├── .gitignore ├── .remarkrc ├── DESCRIPTION ├── LICENSE ├── Makefile ├── NAMESPACE ├── README.md ├── Untitled.Rmd ├── WORDLIST ├── _bookdown.yml ├── _common.R ├── _lint.R ├── _notes ├── _main.Rmd ├── biblio.md ├── pauperism.Rmd ├── questions.Rmd ├── realstats.Rmd └── wine.Rmd ├── _output.yml ├── _render.R ├── _serve.R ├── _spelling.R ├── appendix.Rmd ├── bibliography.Rmd ├── bootstrapping.Rmd ├── causal-regression.Rmd ├── cross-validation.Rmd ├── data └── western1995 │ ├── econ_growth.tsv │ ├── income_ineq.tsv │ └── unionization.tsv ├── diagrams ├── _book │ ├── _main_files │ │ └── figure-html │ │ │ ├── unnamed-chunk-11-1.png │ │ │ ├── unnamed-chunk-12-1.png │ │ │ └── unnamed-chunk-3-1.png │ ├── cross-validation.html │ ├── libs │ │ ├── gitbook-2.6.7 │ │ │ ├── css │ │ │ │ ├── fontawesome │ │ │ │ │ └── fontawesome-webfont.ttf │ │ │ │ ├── plugin-bookdown.css │ │ │ │ ├── plugin-fontsettings.css │ │ │ │ ├── plugin-highlight.css │ │ │ │ ├── plugin-search.css │ │ │ │ └── style.css │ │ │ └── js │ │ │ │ ├── app.min.js │ │ │ │ ├── jquery.highlight.js │ │ │ │ ├── lunr.js │ │ │ │ ├── plugin-bookdown.js │ │ │ │ ├── plugin-fontsettings.js │ │ │ │ ├── plugin-search.js │ │ │ │ └── plugin-sharing.js │ │ └── jquery-2.2.3 │ │ │ └── jquery.min.js │ └── search_index.json ├── iv-dag.gv ├── science.mmd └── science2.mmd ├── docs ├── .nojekyll ├── appendix.md ├── bootstrapping.html ├── bootstrapping.md ├── causal-regression.md ├── colinearity-and-multicolinearity.html ├── collinearity-and-multicollinearity.html ├── cross-validation.html ├── cross-validation.md ├── cross-validation_files │ └── figure-html │ │ ├── unnamed-chunk-12-1.svg │ │ ├── unnamed-chunk-13-1.svg │ │ └── unnamed-chunk-4-1.svg ├── eda.md ├── formatting-tables.html ├── img │ └── islr-fig-6.7.png ├── index.html ├── index.md ├── libs │ ├── gitbook-2.6.7 │ │ ├── css │ │ │ ├── fontawesome │ │ │ │ └── fontawesome-webfont.ttf │ │ │ ├── plugin-bookdown.css │ │ │ ├── plugin-fontsettings.css │ │ │ ├── 
plugin-highlight.css │ │ │ ├── plugin-search.css │ │ │ └── style.css │ │ └── js │ │ │ ├── app.min.js │ │ │ ├── jquery.highlight.js │ │ │ ├── lunr.js │ │ │ ├── plugin-bookdown.js │ │ │ ├── plugin-fontsettings.js │ │ │ ├── plugin-search.js │ │ │ └── plugin-sharing.js │ └── jquery-2.2.3 │ │ └── jquery.min.js ├── linear-regression.md ├── matrix-algebra-review.html ├── matrix.md ├── multicolinearity.md ├── multicollinearity.html ├── multicollinearity.md ├── ols-assumptions.html ├── ols-in-matrix-form.html ├── other-did-approaches.html ├── panel-data-fixed-effects-and-difference-in-difference.html ├── panel.md ├── prediction-policy-problems.html ├── prediction.html ├── prediction.md ├── prediction_files │ └── figure-html │ │ ├── unnamed-chunk-10-1.svg │ │ ├── unnamed-chunk-11-1.svg │ │ ├── unnamed-chunk-13-1.svg │ │ ├── unnamed-chunk-16-1.svg │ │ ├── unnamed-chunk-18-1.svg │ │ ├── unnamed-chunk-20-1.svg │ │ ├── unnamed-chunk-21-1.svg │ │ └── unnamed-chunk-9-1.svg ├── presentation.md ├── programming.md ├── purpose.html ├── rd.md ├── rd_files │ └── figure-html │ │ └── unnamed-chunk-2-1.svg ├── references-1.html ├── references-3.html ├── references-4.html ├── references-5.html ├── references.html ├── references.md ├── reganat.md ├── reganat_files │ └── figure-html │ │ ├── unnamed-chunk-3-1.svg │ │ └── unnamed-chunk-7-1.svg ├── regression-anatomy.html ├── regression-discontinuity.html ├── regression.html ├── regularization.html ├── regularization.md ├── regularization_files │ └── figure-html │ │ └── unnamed-chunk-5-1.svg ├── reproducible-research.html ├── reproducible-research.md ├── reproducible_research.md ├── search_index.json ├── section.html ├── tables-and-plots.md ├── tables_and_plots.md ├── typesetting-and-word-processing-programs.html ├── word-processing.md ├── writing-resources.html └── writing.md ├── eda.Rmd ├── img ├── 1000px-Coefficient_of_Determination.svg.png ├── islr-fig-6.7.png ├── laffer.png └── tobias-funke-blue.jpeg ├── includes ├── after_body.html ├── 
before_body.html ├── in_header.html └── preamble.tex ├── index.Rmd ├── intro-methods-notes.Rproj ├── intromethods.bib ├── iv.Rmd ├── linear-regression.Rmd ├── matrix.Rmd ├── model-fit.Rmd ├── multicollinearity.Rmd ├── old-files ├── _main.Rmd ├── multicollinearity.Rmd ├── non-standard-errors.Rmd ├── ols-diagnostics-troubleshooting.Rmd ├── ols-estimator.Rmd ├── ols-inference.Rmd ├── ols-misc.Rmd ├── ovb-measurment-error.Rmd └── resampling-methods.Rmd ├── outliers.Rmd ├── ovb.Rmd ├── package-lock.json ├── panel.Rmd ├── potential-outcomes.Rmd ├── prediction.Rmd ├── presentation.Rmd ├── probability.Rmd ├── programming.Rmd ├── questions.Rmd ├── rd.Rmd ├── references.Rmd ├── reganat.Rmd ├── regression-inference.Rmd ├── regularization.Rmd ├── reproducible-research.Rmd ├── simple-regression.Rmd ├── simpsons.Rmd ├── tables-and-plots.Rmd ├── word-processing.Rmd └── writing.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .*.Rnb.cached 5 | *.rds 6 | *.aux 7 | *.fdb_latexmk 8 | *.fls 9 | *.out 10 | *.toc 11 | *.bak 12 | $*$ 13 | *.sav 14 | _bookdown_files 15 | *.utf8.md 16 | *.knit.md 17 | 18 | /*.html 19 | /*_cache 20 | /*_files 21 | node_modules 22 | *.log 23 | ^/bookdown* 24 | /bookdown* 25 | -------------------------------------------------------------------------------- /.remarkrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "remark-preset-lint-recommended", 4 | "remark-preset-lint-consistent", 5 | "remark-preset-lint-markdown-style-guide", 6 | "remark-frontmatter", 7 | ["remark-lint-file-extension", false], 8 | ["remark-lint-maximum-line-length", 300], 9 | ["remark-lint-no-shortcut-reference-link", 
false], 10 | ["remark-lint-list-item-indent", "tab-size"], 11 | ["remark-lint-no-undefined-references", false], 12 | ["remark-lint-emphasis-marker", false], 13 | ["remark-lint-fenced-code-flag", false], 14 | ["remark-lint-no-duplicate-headings", false] 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: pols503notes 2 | Title: POLS 503 Notes 3 | Version: 0.0.1 4 | Authors@R: c( 5 | person("Jeffrey", "Arnold", , "jeffrey.arnold@gmail.com", c("aut", "cre")) 6 | ) 7 | Depends: R (>= 3.1.0) 8 | URL: https://github.com/jrnold/intro-methods-notes 9 | Imports: 10 | bookdown, 11 | broom, 12 | carData, 13 | datums, 14 | htmltools, 15 | htmlTable, 16 | jrnoldmisc, 17 | MASS, 18 | texreg, 19 | tidyverse, 20 | vcd, 21 | xtable 22 | Remotes: 23 | jrnold/jrnoldmisc, 24 | jrnold/datums 25 | RoxygenNote: 6.0.1 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. 
These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. 
other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 114 | 115 | For more information, please see 116 | 117 | 118 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | Rscript --quiet _render.R 3 | 4 | gitbook: 5 | Rscript --quiet _render.R "bookdown::gitbook" 6 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/README.md -------------------------------------------------------------------------------- /WORDLIST: -------------------------------------------------------------------------------- 1 | AJPS 2 | AME 3 | APSA 4 | ATT 5 | Abadie 6 | AbdulkadirogluAngristPathak 7 | Altonji 8 | AltonjiElderTaber 9 | Angrist 10 | AngristPischke 11 | Arrellano 12 | Ashenfelter 13 | Ashenfelter's 14 | AshenfelterAshemoreLalonde 15 | Ashmore 16 | AtheyImbens 17 | AucTeX 18 | Belloni 19 | BertrandDufloMullainathan 20 | Bizup 21 | Bonferroni 22 | Broockman 23 | BroockmanKallaAronow 24 | CEF 25 | CLT 26 
| CPE 27 | CalTech 28 | CarpenterDobkin 29 | CaugheySekhon 30 | Chernozhukov 31 | DAGs 32 | DGP 33 | DOI 34 | Dafoe 35 | DavidsonMacKinnon 36 | DiD 37 | Duflo 38 | EPE 39 | EggersFowlerHainmuellerEtAl 40 | EggersHainmueller 41 | EsareyMenger 42 | Eubank 43 | Evera 44 | FWL 45 | Frisch 46 | Gardeazabel 47 | Gelman 48 | GelmanHill 49 | GelmanKatz 50 | Gentzkow 51 | Glymour 52 | Grilliches 53 | GrimmerHershFeinsteinEtAl 54 | Guber 55 | HahnToddKlaauw 56 | Hausmann 57 | Herndon 58 | HerndonAshPollin 59 | Hesterberg 60 | Hochberg 61 | Holm 62 | Homas 63 | IPE 64 | ISLR 65 | ImbensKalyanaraman 66 | JacobLefgren 67 | JamesWittenHastieEtAl 68 | Jeter 69 | Katz 70 | Keele 71 | KingTomzWittenberg 72 | Kleinberg 73 | KleinbergLudwigMullainathanEtAl 74 | Krugman 75 | LDV 76 | LDVs 77 | LSDV 78 | LaCour 79 | LaTeXTools 80 | Lalonde 81 | LeeLemieux 82 | LeeMorettiButler 83 | LindenAdamsRoberts 84 | LitschigMorrison 85 | Lovell 86 | MPs 87 | MSS 88 | MacKinnon 89 | Makefiles 90 | Mankiw 91 | Mattenberg 92 | McCloskey 93 | McCrary 94 | Mendeley 95 | Methodologist 96 | Mullainathan 97 | NBER 98 | NHST 99 | Nunn 100 | NunnWantchekon 101 | OVB 102 | Oster 103 | PCSE 104 | PSAS 105 | Passell 106 | PeiPischkeSchwandt 107 | Pischke 108 | Pollin 109 | RDD 110 | RStudio 111 | Reinhart 112 | ReporteR 113 | Rmd 114 | Rnw 115 | Rogoff 116 | Ruud 117 | SEs 118 | SSR 119 | ShareLaTeX 120 | Shmueli 121 | StackExchange 122 | Strunk 123 | TSCS 124 | TSS 125 | Taber 126 | TeXStudio 127 | TeXmaker 128 | TeXshop 129 | ThistlethwaiteCampbell 130 | ThistlethwaiteCampbell1960a 131 | VIF 132 | Varian 133 | Vec 134 | WYS 135 | Wantchekon 136 | Weingast 137 | Wikibook 138 | X'X 139 | X'y 140 | Zan 141 | Zapnik 142 | Zheng 143 | Zotero 144 | Zvi 145 | acec 146 | advstats 147 | al 148 | andrewgelman 149 | apalike 150 | apsrtable 151 | arg 152 | arxiv 153 | asymptotics 154 | avesbiodiv 155 | bc 156 | beamer 157 | biblio 158 | binom 159 | bmatrix 160 | bmj 161 | bookdown 162 | cdots 163 | clubSandwich 164 | 
clusterSE 165 | colinear 166 | collinear 167 | colorlinks 168 | colwiz 169 | confounders 170 | counterfactuals 171 | cov 172 | csic 173 | dX 174 | ddots 175 | df 176 | discretizing 177 | documentclass 178 | docx 179 | doi 180 | downarrow 181 | econometrics 182 | emptyset 183 | estadistica 184 | et 185 | fhat 186 | frac 187 | geq 188 | github 189 | heteroskedastic 190 | heteroskedasticity 191 | homoskedastic 192 | homoskedasticity 193 | htmlTables 194 | http 195 | https 196 | iK 197 | ignorability 198 | igt 199 | ij 200 | ik 201 | infty 202 | intromethods 203 | invertible 204 | itg 205 | jrnold 206 | jrnoldmisc 207 | ki 208 | knitr 209 | lah 210 | leftrightarrow 211 | leq 212 | lim 213 | linearities 214 | literatures 215 | lme 216 | lof 217 | mathbb 218 | mathcal 219 | mathrm 220 | mathsf 221 | mathtt 222 | mattblackwell 223 | mncn 224 | monofont 225 | monofontoptions 226 | multicollinearity 227 | neq 228 | nk 229 | nonsingular 230 | observables 231 | overfit 232 | parametric 233 | perp 234 | plm 235 | policymaker's 236 | politicalsciencereplication 237 | positivity 238 | pre 239 | probabilistically 240 | programmatically 241 | regularization 242 | repo 243 | residualized 244 | roids 245 | rsample 246 | rsquared 247 | se 248 | skedastic 249 | statmethods 250 | tech 251 | teck 252 | texreg 253 | tg 254 | tidyverse 255 | tl 256 | tomhopper 257 | unbiasedness 258 | unconfoundedness 259 | underbrace 260 | underfit 261 | underspecified 262 | unmodeled 263 | untestable 264 | uparrow 265 | varepsilon 266 | vcov 267 | vdots 268 | vec 269 | wc 270 | widehat 271 | wordpress 272 | www 273 | xtable 274 | -------------------------------------------------------------------------------- /_bookdown.yml: -------------------------------------------------------------------------------- 1 | book_filename: "intro-method-notes" 2 | chapter_name: "Chapter " 3 | delete_merged_file: true 4 | new_session: yes 5 | output_dir: docs 6 | before_chapter_script: "_common.R" 7 | edit: 8 | link: 
https://github.com/jrnold/intro-methods-notes/edit/gh-pages/%s 9 | text: "Edit" 10 | rmd_subdir: false 11 | rmd_files: 12 | - index.Rmd 13 | 14 | - eda.Rmd 15 | 16 | - programming.Rmd 17 | 18 | - linear-regression.Rmd 19 | - reganat.Rmd 20 | - matrix.Rmd 21 | - multicollinearity.Rmd 22 | - bootstrapping.Rmd 23 | 24 | - prediction.Rmd 25 | - cross-validation.Rmd 26 | - regularization.Rmd 27 | 28 | - causal-regression.Rmd 29 | - panel.Rmd 30 | - rd.Rmd 31 | 32 | - presentation.Rmd 33 | - tables-and-plots.Rmd 34 | - reproducible-research.Rmd 35 | - word-processing.Rmd 36 | - writing.Rmd 37 | 38 | - appendix.Rmd 39 | - references.Rmd 40 | -------------------------------------------------------------------------------- /_common.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages( 2 | library("tidyverse") 3 | ) 4 | 5 | rpkg_url <- function(pkg) { 6 | paste0("https://cran.r-project.org/package=", pkg) 7 | } 8 | 9 | rpkg <- function(pkg) { 10 | paste0("**[", pkg, "](", rpkg_url(pkg), ")**") 11 | } 12 | 13 | rdoc_url <- function(pkg, fun) { 14 | paste0("https://www.rdocumentation.org/packages/", pkg, "/topics/", fun) # nolint 15 | } 16 | 17 | rdoc <- function(pkg, fun, full_name = FALSE) { 18 | text <- if (full_name) paste0(pkg, "::", fun) else pkg 19 | paste0("[", text, "](", rdoc_url(pkg, fun), ")") 20 | } 21 | 22 | knitr::opts_chunk$set(cache = TRUE, autodep = TRUE) 23 | set.seed(634808943) 24 | -------------------------------------------------------------------------------- /_lint.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | suppressPackageStartupMessages({ 3 | library("rex") 4 | library("lintr") 5 | }) 6 | 7 | lint_dir <- function(path = ".", relative_path = TRUE, 8 | pattern = "\\.([Rr]|Rmd|Rhtml)$", recursive = TRUE, ...) 
{ 9 | lintr:::read_settings(path) 10 | on.exit(lintr:::clear_settings, add = TRUE) 11 | settings <- lintr:::settings 12 | names(settings$exclusions) <- 13 | normalizePath(file.path(path, names(settings$exclusions))) 14 | files <- dir(path = path, pattern = pattern, recursive = TRUE, 15 | full.names = TRUE) 16 | files <- normalizePath(files) 17 | lints <- lintr:::flatten_lints(lapply(files, function(file) { 18 | if (interactive()) { 19 | message(".", appendLF = FALSE) 20 | } 21 | try(lint(file, ..., parse_settings = FALSE)) 22 | })) 23 | if (interactive()) { 24 | message() 25 | } 26 | lints <- lintr:::reorder_lints(lints) 27 | if (relative_path == TRUE) { 28 | lints[] <- lapply(lints, function(x) { 29 | x$filename <- re_substitutes(x$filename, rex(normalizePath(path), 30 | one_of("/", "\\")), "") 31 | x 32 | }) 33 | attr(lints, "path") <- path 34 | } 35 | class(lints) <- "lints" 36 | lints 37 | } 38 | 39 | lint_dir(here::here()) 40 | -------------------------------------------------------------------------------- /_notes/pauperism.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Yule Replication" 3 | output: html_document 4 | --- 5 | 6 | ```{r include=FALSE} 7 | library("dplyr") 8 | library("readr") 9 | library("tidyr") 10 | library("haven") 11 | library("plm") 12 | library("magrittr") 13 | library("purrr") 14 | library("ggplot2") 15 | library("broom") 16 | ``` 17 | 18 | ```{r} 19 | ratiodiff <- function(x) { 20 | z <- x / lag(x) 21 | z[is.infinite(z)] <- NA_real_ 22 | z 23 | } 24 | ``` 25 | 26 | ```{r} 27 | pauperism <- 28 | left_join(yule, yule_plu, by = "plu") 29 | ``` 30 | 31 | Table 2: Metropolitan Group, 1871-1881 32 | ```{r results = 'asis'} 33 | filter(yule_long, Type == "Metropolitan") %>% 34 | filter(year == 1881) %>% 35 | select(ID, Union, pauper_diff, outratio, Prop65_diff, 36 | Popn_diff) %>% 37 | arrange(ID) %>% 38 | select(-ID) %>% 39 | knitr::kable() 40 | ``` 41 | 42 | $$ 43 | 
\begin{aligned}[t] 44 | \Delta\mathtt{Paup} &= \beta_0 \\ 45 | &+ \beta_1 \Delta\mathtt{Out} \\ 46 | &+ \beta_2 \Delta\mathtt{Old} \\ 47 | &+ \beta_3 \Delta\mathtt{Pop} + \varepsilon 48 | \end{aligned} 49 | $$ 50 | 51 | # Summary Statistics 52 | 53 | ```{r} 54 | filter(yule_long, year > 1871) %>% 55 | group_by(year, Type) %>% 56 | select(pauper_diff, outratiodiff, Prop65_diff, Popn_diff) %>% 57 | gather(variable, value, -Type, -year) %>% 58 | group_by(variable, year, Type) %>% 59 | summarize(mean = mean(value, na.rm = TRUE), 60 | sd = sd(value, na.rm = TRUE)) %>% 61 | knitr::kable() 62 | 63 | ``` 64 | 65 | 66 | # Regression 67 | 68 | ```{r} 69 | lm(pauper ~ outratio, data = yule_long) 70 | lm(pauper ~ year + Type + outratio, data = yule_long) 71 | lm(pauper ~ year + Type + outratio + Prop65 + Popn65, data = yule_long) 72 | lm(pauper ~ Type * (year + outratio + Prop65 + Popn65), data = yule_long) 73 | ``` 74 | 75 | 76 | ```{r} 77 | yule_diff <- yule_long %>% 78 | filter(year > 1871) %>% 79 | mutate(year = as.factor(year)) %>% 80 | select(ID, Union, Type, year, pauper_diff, outratiodiff, Popn_diff, 81 | Prop65_diff) 82 | 83 | lm(pauper_diff ~ outratiodiff, data = yule_diff) 84 | lm(pauper_diff ~ Type * year + outratiodiff, data = yule_diff) 85 | lm(pauper_diff ~ Type * year + outratiodiff + Popn_diff + Prop65_diff, data = yule_diff) 86 | lm(pauper_diff ~ (Type * year) * (outratiodiff + Prop65_diff + Popn_diff), 87 | data = yule_diff) 88 | 89 | 90 | 91 | ``` 92 | 93 | Individual regression for each Type and Region 94 | ```{r} 95 | diff_mod_3 <- 96 | yule_long %>% 97 | filter(year %in% c(1881, 1891)) %>% 98 | group_by(year, Type) %>% 99 | do(tidy(lm(pauper_diff ~ outratiodiff + Popn_diff + Prop65_diff, data = .))) 100 | 101 | diff_mod_3 %>% 102 | select(year, Type, term, estimate) %>% 103 | spread(term, estimate) %>% 104 | knitr::kable() 105 | ``` 106 | 107 | ## Summary Statistics 108 | 109 | ### Outratio 110 | 111 | ```{r} 112 | ggplot(select(filter(yule_long, 
!is.na(outratio)), 113 | outratio, ID, year, Type), 114 | aes(x = outratio, y = ..density..)) + 115 | geom_histogram(binwidth = 2) + 116 | facet_grid(year ~ Type) 117 | ``` 118 | 119 | ```{r} 120 | ggplot(select(filter(yule_long, !is.na(outratiodiff)), 121 | outratiodiff, ID, year, Type), 122 | aes(x = outratiodiff, y = ..density..)) + 123 | geom_histogram(binwidth = 20) + 124 | facet_grid(year ~ Type) 125 | ``` 126 | 127 | ## Pauperism 128 | 129 | ```{r} 130 | ggplot(select(filter(yule_long, !is.na(pauper)), 131 | pauper, ID, year, Type), 132 | aes(x = pauper, y = ..density..)) + 133 | geom_histogram(binwidth = .01) + 134 | facet_grid(year ~ Type) 135 | ``` 136 | 137 | There appear to be some big outliers in the ratio difference 138 | in pauperism, 139 | ```{r} 140 | ggplot(select(filter(yule_long, !is.na(pauper_diff)), 141 | pauper_diff, ID, year, Type), 142 | aes(x = pauper_diff, y = ..density..)) + 143 | geom_histogram(binwidth = 15) + 144 | facet_grid(year ~ Type) 145 | ``` 146 | -------------------------------------------------------------------------------- /_notes/questions.Rmd: -------------------------------------------------------------------------------- 1 | # Questions 2 | 3 | ```{r setup,include=FALSE} 4 | library("DiagrammR") 5 | ``` 6 | 7 | # Tukey (1980) 8 | 9 | > Tukey, John W. 1980. "We Need Both Exploratory and Confirmatory" *The American Statistician.* https://dx.doi.org/10.2307/268299 10 | 11 | John Tukey discussed exploratory and confirmatory analysis and the need for both: 12 | 13 | The stylized view of science is the "straight-line paradigm" 14 | ```{r} 15 | mermaid("diagrams/science.mmd") 16 | ``` 17 | 18 | But where does the question or idea come from? 
Tukey notes four issues with this straight-line paradigm: 19 | 20 | - Questions come from theory and insights derived from previous explorations of similar data 21 | - Designs come are also driven by insights from previous studies of similar data 22 | - Data collection is monitored by exploring the data and looking for unexpected patterns 23 | - The analysis proceeds often by exploring the data to avoid bad or pursue good avenues of discovery? 24 | 25 | All science has peeked at the data before answering the question. 26 | In fact, if science as a whole persued the straight-line paradigm only the first question ever posed could be analyzed without some corruption from knowing something about domain of study. 27 | 28 | Instead, a more realistic formulation of the scientific process is 29 | ```{r} 30 | mermaid("diagrams/scienc2.mmd") 31 | ``` 32 | 33 | > The formulation of the question itself involves what can in fact be asked, what designs are feasible, as well as how likely a given design is to give a useful answer. 34 | > Both inchoate insight and extensive exploration (of past data) can---and should---play a role in this process of formulating and question. 35 | > 36 | > Science ... DOES NOT BEGIN WITH A TIDY QUESTION. Nor does it end with a tidy answer. 37 | > 38 | > The picture of a scientist struck---as by lightning---with a question is very far from the truth. 39 | 40 | But if you do do confirmatory analysis: 41 | 42 | 1. randomize 43 | 2. pre-plan 44 | 45 | After choosing a question, limit your analysis to one main question---specified by the entire design, collection, monitoring, and analysis. 46 | 47 | # Peng and Leek 48 | 49 | The epicycles of analysis (CH 2). 50 | There are 5 core activities of data analysis: 51 | 52 | 1. Stating the question 53 | 2. Exploratory data analysis 54 | 3. Model building 55 | 4. Interpreting 56 | 5. Communicating 57 | 58 | Each of those activities consists of three epicycles: 59 | 60 | 1. setting expectations 61 | 2. 
collecting data, comparing data to expectations 62 | 3. if the data don't match expectations, then revise data or expectations and repeat 63 | 64 | Types of questions. There are six types of questions (p. 18--19) 65 | Leek and Peng. What is the question? 2015. *Science* http://science.sciencemag.org/content/347/6228/1314 66 | 67 | 1. Descriptive: Summarizes a characteristic of data. 68 | 2. Exploratory: Find patterns in data. Hypothesis generating analysis. 69 | 3. Inferential: Given a hypothesis, extrapolate from the sample to the population or different sample. 70 | 4. Predictive: Predict new data. In this you don't necessarily care about the predictors, only that the model predicts well. 71 | 5. Causal: Does X cause Y? How does changing one factor change another (on average) in the population? 72 | 6. Mechanistic: How does X cause Y? 73 | 74 | What is a good question (p. 21)? 75 | 76 | 1. interest to the audience 77 | 2. it is not already answered 78 | 3. it stems from a plausible framework 79 | 4. it should be answerable 80 | 5. it is also useful to be specific - because that helps answerability. 81 | 82 | # Exploratory Data Analysis 83 | 84 | Goals of EDA (Art of Data Science, Ch 4.): 85 | 86 | 1. Find problems in the data 87 | 2. Detemine whether the question can be answered with the data at hand (proof of concept) 88 | 3. Develop a "sketch of the answer" 89 | 90 | Their EDA checklist 91 | 92 | 1. Formulate your question 93 | 2. Read in your data 94 | 3. Check the packaging: How many observations and variables? What are the observations and variables in the data? 95 | 4. Look at the top and the bottom of your data: Look at the beginning and end of the data---is it in order, is it properly formatted, in a time series does it have the right times? 96 | 5. Check your "n"s: Always check the number of observations. This is quick way to check that there aren't mistakes in the sample, especially when merging. 97 | 6. 
Validate with at least one external data source: This doesn't need to be formal. But compare values of variables to other known values to ensure they are in the right ballpark. This catches unit-of-measurement issues, variables not measuring what you thought they were measuring, data entry errors. 98 | 7. Make a plot. Comparing the data to what you expect it to look like is a good way to catch both data errors and also to find new patterns. 99 | 8. Try the easy solution first. This is a proof of concept that your answer will work. 100 | 9. Follow up. Challenge the solution. Why might it be wrong. 101 | 102 | - do you have the right data? 103 | - do you need more data? 104 | - do you have the right question? 105 | -------------------------------------------------------------------------------- /_notes/realstats.Rmd: -------------------------------------------------------------------------------- 1 | 3.1 Bivariate Regression Model 2 | 3 | - estimation 4 | 5 | 3.2 Random variation in Coefficient estimates 6 | 7 | - distribution of $\hat{\beta}$ estimates 8 | - $\hat{\beta}$ are normally distributed 9 | 10 | 3.3 Exogeneity and Ubiasedness 11 | 12 | - conditions for unbiased estimator 13 | - characterizing biaas 14 | 15 | 3.4 Precision of Estimate 16 | 3.5 Probability limits and consistency 17 | 18 | - probability limit 19 | - consistency 20 | 21 | 3.6 Homoskedasticity 22 | 23 | - heteroskedasticity 24 | - correlated errors - autocorrelation 25 | -------------------------------------------------------------------------------- /_output.yml: -------------------------------------------------------------------------------- 1 | bookdown::gitbook: 2 | # css: style.css 3 | # math: true 4 | dev: svglite 5 | config: 6 | toc: 7 | collapse: none 8 | before: | 9 |
  • Intro Method Notes
  • 10 | after: | 11 |
  • Published with bookdown
  • 12 | edit: 13 | link: https://github.com/jrnold/intro-method-notes/edit/master/%s 14 | text: "Edit" 15 | sharing: 16 | github: yes 17 | facebook: no 18 | always_allow_html: yes 19 | includes: 20 | in_header: includes/in_header.html 21 | before_body: includes/before_body.html 22 | after_body: includes/after_body.html 23 | bookdown::pdf_book: 24 | includes: 25 | in_header: includes/preamble.tex 26 | latex_engine: xelatex 27 | always_allow_html: yes 28 | -------------------------------------------------------------------------------- /_render.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript --quiet 2 | quiet <- "--quiet" %in% commandArgs(FALSE) 3 | formats <- commandArgs(TRUE) 4 | 5 | # provide default formats if necessary 6 | if (length(formats) == 0) { 7 | formats <- c("bookdown::pdf_book", "bookdown::gitbook") 8 | } 9 | # render the book to all formats unless they are specified via command-line args 10 | for (fmt in formats) { 11 | cmd <- sprintf("bookdown::render_book('index.Rmd', '%s', quiet = %s)", 12 | fmt, quiet) 13 | res <- bookdown:::Rscript(c("-e", shQuote(cmd))) 14 | if (res != 0) stop("Failed to compile the book to ", fmt) 15 | } 16 | -------------------------------------------------------------------------------- /_serve.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript --quiet 2 | quiet <- "--quiet" %in% commandArgs(FALSE) 3 | bookdown::serve_book(dir = ".", 4 | preview = TRUE, 5 | daemon = FALSE, 6 | in_session = FALSE) 7 | -------------------------------------------------------------------------------- /_spelling.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | files <- c(dir(here::here(), pattern = "\\.(Rmd)"), 4 | here::here("README.md")) 5 | words <- readLines(here::here("WORDLIST")) 6 | spelling::spell_check_files(files, ignore = words) 7 | 
-------------------------------------------------------------------------------- /appendix.Rmd: -------------------------------------------------------------------------------- 1 | # (APPENDIX) Appendix {-} 2 | -------------------------------------------------------------------------------- /bibliography.Rmd: -------------------------------------------------------------------------------- 1 | # Annotated Bibliography 2 | 3 | -------------------------------------------------------------------------------- /bootstrapping.Rmd: -------------------------------------------------------------------------------- 1 | # Bootstrapping 2 | 3 | The central analogy of bootstrapping is 4 | 5 | > The population is to the sample as the sample is to the bootstrap samples [@Fox2008a, p. 590] 6 | 7 | To calculate standard errors to use in confidence intervals we need to know sampling distribution of the statistic of interest. 8 | 9 | In the case of a mean, we can appeal to the central limit theorem if the sample size is large enough. 10 | 11 | Bootstrapping takes a different approach. 12 | We use the sample as an estimator of the sampling distribution. 13 | E.g. bootstrap claims 14 | $$ 15 | \text{sample distribution} \approx \text{population distribution} 16 | $$ 17 | and then proceeds to *plug-in* the sample distribution for the population distribution, and then draw new samples to generate a sampling distribution. 18 | 19 | The bootstrap relies upon the **plug-in principle**. 20 | The plug-in principle is that when something is unknown, use an estimate of it. 21 | An example is the use of the *sample standard deviation* in place of the *population standard deviation*, when calculating the standard error of the mean, 22 | $$ 23 | \SE(\bar{x}) = \frac{\sigma}{\sqrt{n}} \approx \frac{\hat{\sigma}}{\sqrt{n}} 24 | $$ 25 | Bootstrap is the plug-in principal on 'roids. 26 | It uses the empirical distribution as a plug-in for the unknown population distribution. 
27 | See Figures 4 and 5 of @Hesterberg2015a. 28 | 29 | Bootstrap principles 30 | 31 | 1. The substitution of the empirical distribution for the population works. 32 | 1. Sample with replacement. 33 | 34 | - The bootstrap is for inference not better estimates. It can estimate uncertainty, not improve $\bar{x}$. It is not generating new data out of nowhere. However, see the section on bagging for how bootstrap aggregation can be used. 35 | 36 | ## Non-parametric bootstrap 37 | 38 | The non-parametric bootstrap resamples the data with replacement $B$ times and calculates the statistic on each resample. 39 | 40 | ## Standard Errors 41 | 42 | The bootstrap is primarily a means to calculate standard errors. 43 | 44 | The bootstrap standard error is 45 | 46 | Suppose there are $r$ bootstrap replicates. 47 | Let $\hat{\theta}^{*}_1, \dots, \hat{\theta}^{*}_r$ be statistics calculated on each bootstrap samples. 48 | $$ 49 | \SE^{*}\left(\hat{\theta}^{*}\right) = \sqrt{\frac{\sum_{b = 1}^r {(\hat{\theta}^{*}_b - \bar{\theta}^{*})}^2}{r - 1}} 50 | $$ 51 | where $\bar{\theta}^{*}$ is the mean of bootstrap statistics, 52 | $$ 53 | \bar{\theta}^{*} = \frac{\sum_{b = 1}^r}{r} . 54 | $$ 55 | 56 | ## Confidence Intervals 57 | 58 | There are multiple ways to calculate confidence intervals from bootstrap. 59 | 60 | - Normal-Theory Intervals 61 | - Percentile Intervals 62 | - ABC Intervals 63 | 64 | ## Alternative methods 65 | 66 | ### Parametric Bootstrap 67 | 68 | The parametric bootstrap draws samples from the estimated model. 69 | 70 | For example, in linear regression, we can start from the model, 71 | $$ 72 | y_i = \Vec{x}_i \Vec{\beta} + \epsilon_i 73 | $$ 74 | 75 | 1. Estimate the regression model to get $\hat{\beta}$ and $\hat{\sigma}$ 76 | 77 | 1. For $1, \dots, r$ bootstrap replicates: 78 | 79 | 1. 
Generate bootstrap sample $(\Vec{y}^{*}, \Mat{X})$, where $\Mat{X}$ are 80 | those from the original sample, and the values of $\Vec{y}^{*}$ are generated 81 | by sampling from the residual distribution, 82 | $$ 83 | y_i^{*}_b = \Vec{x}_i \Vec{\hat{\beta}} + \epsilon^{*}_{i,b} 84 | $$ 85 | where $\epsilon^{*}_{i,b} \sim \mathrm{Normal}(0, \hat{\sigma})$. 86 | 87 | 1. Re-estimate a regression on $(\Vec{y}^{*}, \Mat{X})$ to estimate 88 | $\hat{\beta}^{*}$. 89 | 90 | 1. Calculate any statistics of the regression results. 91 | 92 | Alternatively, we could have drawn the values of $\Vec{\epsilon}^*_b$ from the 93 | empirical distribution of residuals or the [Wild Bootstrap](https://www.math.kth.se/matstat/gru/sf2930/papers/wild.bootstrap.pdf). 94 | 95 | See the the discussion in the `boot::boot()` function, for `sim = "parametric"`. 96 | 97 | ### Clustered bootstrap 98 | 99 | We can incorporate complex sampling methods into the bootstrap [@Fox2008a, Sec 21.5]. 100 | In particular, by resampling clusters instead of individual observations, we get the clustered bootstrap.[@EsareyMenger2017a] 101 | 102 | ### Time series bootstrap 103 | 104 | Since data are not independent in time-series, variations of the bootstrap have to be used. 105 | See the references in the documentation for `boot::tsboot`. 106 | 107 | ### How to sample? 108 | 109 | Draw the bootstrap sample in the same way it was drawn from the population (if possible) [@Hesterberg2015a, p. 19] 110 | 111 | The are a few exceptions: 112 | 113 | - Condition on the observed information. We should fix known quantities, e.g. 
observed sample sizes of sub-samples [@Hesterberg2015a] 114 | - For hypothesis testing, the sampling distribution needs to be modified to represent the null distribution [@Hesterberg2015a] 115 | 116 | ### Caveats 117 | 118 | - Bootstrapping does not work well for the median or other quantities that depend on the small number of observations out of larger sample.[@Hesterberg2015a] 119 | - Uncertainty in the bootstrap estimator is due to both (1) Monte Carlo sampling (taking a finite number of samples), and (2) the sample itself. The former can be decreased by increasing the number of bootstrap samples. The latter is irreducible without a new sample. 120 | - The bootstrap distribution will reflect the data. If the sample was "unusual", then the bootstrap distribution will also be so.[@Hesterberg2015a] 121 | - In small samples there is a narrowness bias. [@Hesterberg2015a, p. 24]. As always, small samples is problematic. 122 | 123 | ### Why use bootstrapping? 124 | 125 | - The common practice of relying on asymmetric results may understate variability by ignoring dependencies or heteroskedasticity. These can be incorporated into bootstrapping.[@Fox2008a, p. 602] 126 | - it is general purpose algorithm that can generate standard errors and confidence intervals in cases where an analytic solution does not exist. 127 | - however, it may require programming to implement and computational power to execute 128 | 129 | ## Bagging 130 | 131 | Note that in all the previous discussion, the original point estimate is used. 132 | Bootstrapping is only used to generate (1) standard errors and confidence intervals (2). 133 | 134 | Bootstrap aggregating or [bagging](https://en.wikipedia.org/wiki/Bootstrap_aggregating) is a meta-algorithm that constructs a point estimate by averaging the point-estimates from bootstrap samples. 135 | Bagging can reduce the variance of some estimators, so can be thought of as a sort of regularization method. 
136 | 137 | ## Hypothesis Testing 138 | 139 | Hypothesis testing with bootstrap is more complicated. 140 | 141 | ## How many samples? 142 | 143 | There is no fixed rule of thumb (it will depend on the statistic you are calculating and the population distribution), but if you want a single number, 1,000 is good lower bound. 144 | 145 | - Higher levels of confidence require more samples 146 | 147 | - Note that the results of the percentile method will be more variable than the normal-approximation method. 148 | The ABC confidence intervals will be even better. 149 | 150 | One ad-hoc recipe suggested [here](https://www.stata.com/support/faqs/statistics/bootstrapped-samples-guidelines/) is: 151 | 152 | 1. Choose a $B$ 153 | 1. Run the bootstrap 154 | 1. Run the bootstrap again (ensure there is a different random number seed) 155 | 1. If results differ, increase the size. 156 | 157 | @DavidsonMacKinnon2000a suggest the following: 158 | 159 | - 5%: 399 160 | - 1%: 1499 161 | 162 | Though it also suggests a pre-test method. 163 | 164 | @Hesterberg2015a suggests far a larger bootstrap sample size: 10,000 for routine use. 165 | It notes that for a t-test, 15,000 samples for the a 95% probability that the one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. 166 | 167 | ## References 168 | 169 | See @Fox2008a [Ch. 21]. 170 | 171 | @Hesterberg2015a is for "teachers of statistics" but is a great overview of bootstrapping. 172 | I found it more useful than the treatment of bootstrapping in many textbooks. 173 | 174 | For some Monte Carlo results on the accuracy of the bootstrap see @Hesterberg2015a, p. 21. 175 | 176 | R packages. For general purpose bootstrapping and cross-validation I suggest the `r rpkg("rsample")` package, which works well with the tidyverse and seems to be 177 | useful going forward. 
178 | 179 | The `r rpkg("boot")` package included in the recommended R packages is a classic package that implements many bootstrapping and resampling methods. Most of them 180 | are parallelized. However, its interface is not as nice as rsample. 181 | 182 | - 183 | - 184 | 185 | See [this spreadsheet](https://docs.google.com/spreadsheets/d/1MNOCwOo7oPKrDB1FMwDzsYzvLoK-IBqoxhKrOsN1M2A/edit#gid=0) for some Monte Carlo simulations on Bootstrap vs. t-statistic. 186 | -------------------------------------------------------------------------------- /data/western1995/econ_growth.tsv: -------------------------------------------------------------------------------- 1 | country econ_growth labor_org social_dem 2 | Australia .51 1.87 30.5 3 | Austria .64 3.06 100.0 4 | Belgium .44 2.80 21.0 5 | Canada .50 .98 .0 6 | Denmark .36 2.77 75.5 7 | Finland .56 2.76 40.2 8 | France .57 .68 1.7 9 | Germany .53 1.80 74.8 10 | Holland .44 1.90 41.2 11 | Italy .53 1.47 6.5 12 | Japan .38 .43 .0 13 | Norway 1.05 3.33 100.0 14 | Sweden .44 3.52 45.9 15 | United Kingdom .26 1.81 86.0 16 | United States .51 .82 .0 17 | -------------------------------------------------------------------------------- /data/western1995/income_ineq.tsv: -------------------------------------------------------------------------------- 1 | country inequality turnout energy socialism 2 | Argentina 2.960 61.8 1,088 2.3 3 | Australia 1.940 85.3 3,918 45.0 4 | Denmark 2.734 86.8 2,829 41.8 5 | Finland 4.441 82.1 1,650 24.9 6 | France 5.653 66.5 2,419 25.1 7 | Germany 3.435 77.6 3,673 27.1 8 | Israel 1.950 84.1 1,243 50.8 9 | Italy 2.196 89.2 1,135 17.0 10 | Japan 3.007 72.3 1,166 27.5 11 | Netherlands 3.457 87.9 2,691 30.8 12 | Norway 2.440 81.9 2,740 52.0 13 | Puerto Rico 3.693 73.3 1,453 0.0 14 | South Africa 9.410 14.3 2,338 1.8 15 | Sweden 3.143 78.1 3,491 48.5 16 | Trinidad and Tobago 3.888 64.7 1,935 18.8 17 | United Kingdom 2.876 72.4 4,907 48.5 18 | United States 2.296 56.8 8,047 0.0 19 | Venezuela 3.515 78.8 
2,623 28.7 20 | -------------------------------------------------------------------------------- /data/western1995/unionization.tsv: -------------------------------------------------------------------------------- 1 | country union_density left_government labor_force_size econ_conc 2 | Sweden 82.4 111.84 3,931 1.55 3 | Israel 80.0 73.17 997 1.71 4 | Iceland 74.3 17.25 81 2.06 5 | Finland 73.3 59.33 2,034 1.56 6 | Belgium 71.9 43.25 3,348 1.52 7 | Denmark 69.8 90.24 2,225 1.52 8 | Ireland 68.1 .00 886 1.75 9 | Austria 65.6 48.67 2,469 1.53 10 | New Zealand 59.4 60.00 1,050 1.64 11 | Norway 58.9 83.08 1,657 1.58 12 | Australia 51.4 33.74 5,436 1.37 13 | Italy 50.6 .00 15,819 .86 14 | United Kingdon 48.0 43.67 25,757 1.13 15 | Germany 39.6 35.33 23,003 .92 16 | Netherlands 37.7 31.50 4,509 1.25 17 | Switzerland 35.4 11.87 2,460 1.68 18 | Canada 31.2 .00 10,516 1.35 19 | Japan 31.0 1.92 39,930 1.11 20 | France 28.2 8.67 18,846 .95 21 | United States 24.5 .00 92,899 1.00 22 | -------------------------------------------------------------------------------- /diagrams/_book/_main_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/diagrams/_book/_main_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /diagrams/_book/_main_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/diagrams/_book/_main_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /diagrams/_book/_main_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/diagrams/_book/_main_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/css/fontawesome/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/diagrams/_book/libs/gitbook-2.6.7/css/fontawesome/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/css/plugin-bookdown.css: -------------------------------------------------------------------------------- 1 | .book .book-header h1 { 2 | padding-left: 20px; 3 | padding-right: 20px; 4 | } 5 | .book .book-header.fixed { 6 | position: fixed; 7 | right: 0; 8 | top: 0; 9 | left: 0; 10 | border-bottom: 1px solid rgba(0,0,0,.07); 11 | } 12 | span.search-highlight { 13 | background-color: #ffff88; 14 | } 15 | @media (min-width: 600px) { 16 | .book.with-summary .book-header.fixed { 17 | left: 300px; 18 | } 19 | } 20 | @media (max-width: 1240px) { 21 | .book .book-body.fixed { 22 | top: 50px; 23 | } 24 | .book .book-body.fixed .body-inner { 25 | top: auto; 26 | } 27 | } 28 | @media (max-width: 600px) { 29 | .book.with-summary .book-header.fixed { 30 | left: calc(100% - 60px); 31 | min-width: 300px; 32 | } 33 | .book.with-summary .book-body { 34 | transform: none; 35 | left: calc(100% - 60px); 36 | min-width: 300px; 37 | } 38 | .book .book-body.fixed { 39 | top: 0; 40 | } 41 | } 42 | 43 | .book .book-body.fixed .body-inner { 44 | top: 50px; 45 | } 46 | .book .book-body .page-wrapper .page-inner section.normal sub, .book .book-body .page-wrapper .page-inner section.normal sup { 47 | font-size: 85%; 48 | } 49 | 50 | @media print { 51 | .book .book-summary, .book .book-body .book-header, .fa { 52 
| display: none !important; 53 | } 54 | .book .book-body.fixed { 55 | left: 0px; 56 | } 57 | .book .book-body,.book .book-body .body-inner, .book.with-summary { 58 | overflow: visible !important; 59 | } 60 | } 61 | .kable_wrapper { 62 | border-spacing: 20px 0; 63 | border-collapse: separate; 64 | border: none; 65 | margin: auto; 66 | } 67 | .kable_wrapper > tbody > tr > td { 68 | vertical-align: top; 69 | } 70 | .book .book-body .page-wrapper .page-inner section.normal table tr.header { 71 | border-top-width: 2px; 72 | } 73 | .book .book-body .page-wrapper .page-inner section.normal table tr:last-child td { 74 | border-bottom-width: 2px; 75 | } 76 | .book .book-body .page-wrapper .page-inner section.normal table td, .book .book-body .page-wrapper .page-inner section.normal table th { 77 | border-left: none; 78 | border-right: none; 79 | } 80 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr, .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr > td { 81 | border-top: none; 82 | } 83 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr:last-child > td { 84 | border-bottom: none; 85 | } 86 | 87 | div.theorem, div.lemma, div.corollary, div.proposition, div.conjecture { 88 | font-style: italic; 89 | } 90 | span.theorem, span.lemma, span.corollary, span.proposition, span.conjecture { 91 | font-style: normal; 92 | } 93 | div.proof:after { 94 | content: "\25a2"; 95 | float: right; 96 | } 97 | .header-section-number { 98 | padding-right: .5em; 99 | } 100 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/css/plugin-search.css: -------------------------------------------------------------------------------- 1 | .book .book-summary .book-search { 2 | padding: 6px; 3 | background: transparent; 4 | position: absolute; 5 | top: -50px; 6 | left: 0px; 7 | right: 0px; 8 | transition: top 0.5s 
ease; 9 | } 10 | .book .book-summary .book-search input, 11 | .book .book-summary .book-search input:focus, 12 | .book .book-summary .book-search input:hover { 13 | width: 100%; 14 | background: transparent; 15 | border: 1px solid #ccc; 16 | box-shadow: none; 17 | outline: none; 18 | line-height: 22px; 19 | padding: 7px 4px; 20 | color: inherit; 21 | box-sizing: border-box; 22 | } 23 | .book.with-search .book-summary .book-search { 24 | top: 0px; 25 | } 26 | .book.with-search .book-summary ul.summary { 27 | top: 50px; 28 | } 29 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/js/jquery.highlight.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["jQuery"], function(jQuery) { 2 | 3 | /* 4 | * jQuery Highlight plugin 5 | * 6 | * Based on highlight v3 by Johann Burkard 7 | * http://johannburkard.de/blog/programming/javascript/highlight-javascript-text-higlighting-jquery-plugin.html 8 | * 9 | * Code a little bit refactored and cleaned (in my humble opinion). 10 | * Most important changes: 11 | * - has an option to highlight only entire words (wordsOnly - false by default), 12 | * - has an option to be case sensitive (caseSensitive - false by default) 13 | * - highlight element tag and class names can be specified in options 14 | * 15 | * Copyright (c) 2009 Bartek Szopka 16 | * 17 | * Licensed under MIT license. 
18 | * 19 | */ 20 | 21 | jQuery.extend({ 22 | highlight: function (node, re, nodeName, className) { 23 | if (node.nodeType === 3) { 24 | var match = node.data.match(re); 25 | if (match) { 26 | var highlight = document.createElement(nodeName || 'span'); 27 | highlight.className = className || 'highlight'; 28 | var wordNode = node.splitText(match.index); 29 | wordNode.splitText(match[0].length); 30 | var wordClone = wordNode.cloneNode(true); 31 | highlight.appendChild(wordClone); 32 | wordNode.parentNode.replaceChild(highlight, wordNode); 33 | return 1; //skip added node in parent 34 | } 35 | } else if ((node.nodeType === 1 && node.childNodes) && // only element nodes that have children 36 | !/(script|style)/i.test(node.tagName) && // ignore script and style nodes 37 | !(node.tagName === nodeName.toUpperCase() && node.className === className)) { // skip if already highlighted 38 | for (var i = 0; i < node.childNodes.length; i++) { 39 | i += jQuery.highlight(node.childNodes[i], re, nodeName, className); 40 | } 41 | } 42 | return 0; 43 | } 44 | }); 45 | 46 | jQuery.fn.unhighlight = function (options) { 47 | var settings = { className: 'highlight', element: 'span' }; 48 | jQuery.extend(settings, options); 49 | 50 | return this.find(settings.element + "." 
+ settings.className).each(function () { 51 | var parent = this.parentNode; 52 | parent.replaceChild(this.firstChild, this); 53 | parent.normalize(); 54 | }).end(); 55 | }; 56 | 57 | jQuery.fn.highlight = function (words, options) { 58 | var settings = { className: 'highlight', element: 'span', caseSensitive: false, wordsOnly: false }; 59 | jQuery.extend(settings, options); 60 | 61 | if (words.constructor === String) { 62 | words = [words]; 63 | } 64 | words = jQuery.grep(words, function(word, i){ 65 | return word !== ''; 66 | }); 67 | words = jQuery.map(words, function(word, i) { 68 | return word.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&"); 69 | }); 70 | if (words.length === 0) { return this; } 71 | 72 | var flag = settings.caseSensitive ? "" : "i"; 73 | var pattern = "(" + words.join("|") + ")"; 74 | if (settings.wordsOnly) { 75 | pattern = "\\b" + pattern + "\\b"; 76 | } 77 | var re = new RegExp(pattern, flag); 78 | 79 | return this.each(function () { 80 | jQuery.highlight(this, re, settings.element, settings.className); 81 | }); 82 | }; 83 | 84 | }); 85 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/js/plugin-bookdown.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | 3 | var gs = gitbook.storage; 4 | 5 | gitbook.events.bind("start", function(e, config) { 6 | 7 | // add the Edit button (edit on Github) 8 | var edit = config.edit; 9 | if (edit && edit.link) gitbook.toolbar.createButton({ 10 | icon: 'fa fa-edit', 11 | label: edit.text || 'Edit', 12 | position: 'left', 13 | onClick: function(e) { 14 | e.preventDefault(); 15 | window.open(edit.link); 16 | } 17 | }); 18 | 19 | var down = config.download; 20 | var normalizeDownload = function() { 21 | if (!down || !(down instanceof Array) || down.length === 0) return; 22 | if (down[0] instanceof Array) return down; 23 | return 
$.map(down, function(file, i) { 24 | return [[file, file.replace(/.*[.]/g, '').toUpperCase()]]; 25 | }); 26 | }; 27 | down = normalizeDownload(down); 28 | if (down) if (down.length === 1 && /[.]pdf$/.test(down[0][0])) { 29 | gitbook.toolbar.createButton({ 30 | icon: 'fa fa-file-pdf-o', 31 | label: down[0][1], 32 | position: 'left', 33 | onClick: function(e) { 34 | e.preventDefault(); 35 | window.open(down[0][0]); 36 | } 37 | }); 38 | } else { 39 | gitbook.toolbar.createButton({ 40 | icon: 'fa fa-download', 41 | label: 'Download', 42 | position: 'left', 43 | dropdown: $.map(down, function(item, i) { 44 | return { 45 | text: item[1], 46 | onClick: function(e) { 47 | e.preventDefault(); 48 | window.open(item[0]); 49 | } 50 | }; 51 | }) 52 | }); 53 | } 54 | 55 | // highlight the current section in TOC 56 | var href = window.location.pathname; 57 | href = href.substr(href.lastIndexOf('/') + 1); 58 | if (href === '') href = 'index.html'; 59 | var li = $('a[href^="' + href + location.hash + '"]').parent('li.chapter').first(); 60 | var summary = $('ul.summary'), chaps = summary.find('li.chapter'); 61 | if (li.length === 0) li = chaps.first(); 62 | li.addClass('active'); 63 | chaps.on('click', function(e) { 64 | chaps.removeClass('active'); 65 | $(this).addClass('active'); 66 | gs.set('tocScrollTop', summary.scrollTop()); 67 | }); 68 | 69 | var toc = config.toc; 70 | // collapse TOC items that are not for the current chapter 71 | if (toc && toc.collapse) (function() { 72 | var type = toc.collapse; 73 | if (type === 'none') return; 74 | if (type !== 'section' && type !== 'subsection') return; 75 | // sections under chapters 76 | var toc_sub = summary.children('li[data-level]').children('ul'); 77 | if (type === 'section') { 78 | toc_sub.hide() 79 | .parent().has(li).children('ul').show(); 80 | } else { 81 | toc_sub.children('li').children('ul').hide() 82 | .parent().has(li).children('ul').show(); 83 | } 84 | li.children('ul').show(); 85 | var toc_sub2 = 
toc_sub.children('li'); 86 | if (type === 'section') toc_sub2.children('ul').hide(); 87 | summary.children('li[data-level]').find('a') 88 | .on('click.bookdown', function(e) { 89 | if (href === $(this).attr('href').replace(/#.*/, '')) 90 | $(this).parent('li').children('ul').toggle(); 91 | }); 92 | })(); 93 | 94 | // add tooltips to the 's that are truncated 95 | $('a').each(function(i, el) { 96 | if (el.offsetWidth >= el.scrollWidth) return; 97 | if (typeof el.title === 'undefined') return; 98 | el.title = el.text; 99 | }); 100 | 101 | // restore TOC scroll position 102 | var pos = gs.get('tocScrollTop'); 103 | if (typeof pos !== 'undefined') summary.scrollTop(pos); 104 | 105 | // highlight the TOC item that has same text as the heading in view as scrolling 106 | if (toc && toc.scroll_highlight !== false) (function() { 107 | // scroll the current TOC item into viewport 108 | var ht = $(window).height(), rect = li[0].getBoundingClientRect(); 109 | if (rect.top >= ht || rect.top <= 0 || rect.bottom <= 0) { 110 | summary.scrollTop(li[0].offsetTop); 111 | } 112 | // current chapter TOC items 113 | var items = $('a[href^="' + href + '"]').parent('li.chapter'), 114 | m = items.length; 115 | if (m === 0) { 116 | items = summary.find('li.chapter'); 117 | m = items.length; 118 | } 119 | if (m === 0) return; 120 | // all section titles on current page 121 | var hs = bookInner.find('.page-inner').find('h1,h2,h3'), n = hs.length, 122 | ts = hs.map(function(i, el) { return $(el).text(); }); 123 | if (n === 0) return; 124 | var scrollHandler = function(e) { 125 | var ht = $(window).height(); 126 | clearTimeout($.data(this, 'scrollTimer')); 127 | $.data(this, 'scrollTimer', setTimeout(function() { 128 | // find the first visible title in the viewport 129 | for (var i = 0; i < n; i++) { 130 | var rect = hs[i].getBoundingClientRect(); 131 | if (rect.top >= 0 && rect.bottom <= ht) break; 132 | } 133 | if (i === n) return; 134 | items.removeClass('active'); 135 | for (var j = 0; j < 
m; j++) { 136 | if (items.eq(j).children('a').first().text() === ts[i]) break; 137 | } 138 | if (j === m) j = 0; // highlight the chapter title 139 | // search bottom-up for a visible TOC item to highlight; if an item is 140 | // hidden, we check if its parent is visible, and so on 141 | while (j > 0 && items.eq(j).is(':hidden')) j--; 142 | items.eq(j).addClass('active'); 143 | }, 250)); 144 | }; 145 | bookInner.on('scroll.bookdown', scrollHandler); 146 | bookBody.on('scroll.bookdown', scrollHandler); 147 | })(); 148 | 149 | // do not refresh the page if the TOC item points to the current page 150 | $('a[href="' + href + '"]').parent('li.chapter').children('a') 151 | .on('click', function(e) { 152 | bookInner.scrollTop(0); 153 | bookBody.scrollTop(0); 154 | return false; 155 | }); 156 | 157 | var toolbar = config.toolbar; 158 | if (!toolbar || toolbar.position !== 'static') { 159 | var bookHeader = $('.book-header'); 160 | bookBody.addClass('fixed'); 161 | bookHeader.addClass('fixed') 162 | .css('background-color', bookBody.css('background-color')) 163 | .on('click.bookdown', function(e) { 164 | // the theme may have changed after user clicks the theme button 165 | bookHeader.css('background-color', bookBody.css('background-color')); 166 | }); 167 | } 168 | 169 | }); 170 | 171 | gitbook.events.bind("page.change", function(e) { 172 | // store TOC scroll position 173 | var summary = $('ul.summary'); 174 | gs.set('tocScrollTop', summary.scrollTop()); 175 | }); 176 | 177 | var bookBody = $('.book-body'), bookInner = bookBody.find('.body-inner'); 178 | var chapterTitle = function() { 179 | return bookInner.find('.page-inner').find('h1,h2').first().text(); 180 | }; 181 | var bookTitle = function() { 182 | return bookInner.find('.book-header > h1').first().text(); 183 | }; 184 | var saveScrollPos = function(e) { 185 | // save scroll position before page is reloaded 186 | gs.set('bodyScrollTop', { 187 | body: bookBody.scrollTop(), 188 | inner: bookInner.scrollTop(), 189 | 
focused: document.hasFocus(), 190 | title: chapterTitle() 191 | }); 192 | }; 193 | $(document).on('servr:reload', saveScrollPos); 194 | 195 | // check if the page is loaded in an iframe (e.g. the RStudio preview window) 196 | var inIFrame = function() { 197 | var inIframe = true; 198 | try { inIframe = window.self !== window.top; } catch (e) {} 199 | return inIframe; 200 | }; 201 | $(window).on('blur unload', function(e) { 202 | if (inIFrame()) saveScrollPos(e); 203 | gs.set('bookTitle', bookTitle()); 204 | }); 205 | 206 | $(function(e) { 207 | if (gs.get('bookTitle', '') !== bookTitle()) localStorage.clear(); 208 | var pos = gs.get('bodyScrollTop'); 209 | if (pos) { 210 | if (pos.title === chapterTitle()) { 211 | if (pos.body !== 0) bookBody.scrollTop(pos.body); 212 | if (pos.inner !== 0) bookInner.scrollTop(pos.inner); 213 | } 214 | if (pos.focused) bookInner.find('.page-wrapper').focus(); 215 | } 216 | // clear book body scroll position 217 | gs.remove('bodyScrollTop'); 218 | }); 219 | 220 | }); 221 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/js/plugin-fontsettings.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var fontState; 3 | 4 | var THEMES = { 5 | "white": 0, 6 | "sepia": 1, 7 | "night": 2 8 | }; 9 | 10 | var FAMILY = { 11 | "serif": 0, 12 | "sans": 1 13 | }; 14 | 15 | // Save current font settings 16 | function saveFontSettings() { 17 | gitbook.storage.set("fontState", fontState); 18 | update(); 19 | } 20 | 21 | // Increase font size 22 | function enlargeFontSize(e) { 23 | e.preventDefault(); 24 | if (fontState.size >= 4) return; 25 | 26 | fontState.size++; 27 | saveFontSettings(); 28 | }; 29 | 30 | // Decrease font size 31 | function reduceFontSize(e) { 32 | e.preventDefault(); 33 | if (fontState.size <= 0) return; 34 | 35 | fontState.size--; 36 | 
saveFontSettings(); 37 | }; 38 | 39 | // Change font family 40 | function changeFontFamily(index, e) { 41 | e.preventDefault(); 42 | 43 | fontState.family = index; 44 | saveFontSettings(); 45 | }; 46 | 47 | // Change type of color 48 | function changeColorTheme(index, e) { 49 | e.preventDefault(); 50 | 51 | var $book = $(".book"); 52 | 53 | if (fontState.theme !== 0) 54 | $book.removeClass("color-theme-"+fontState.theme); 55 | 56 | fontState.theme = index; 57 | if (fontState.theme !== 0) 58 | $book.addClass("color-theme-"+fontState.theme); 59 | 60 | saveFontSettings(); 61 | }; 62 | 63 | function update() { 64 | var $book = gitbook.state.$book; 65 | 66 | $(".font-settings .font-family-list li").removeClass("active"); 67 | $(".font-settings .font-family-list li:nth-child("+(fontState.family+1)+")").addClass("active"); 68 | 69 | $book[0].className = $book[0].className.replace(/\bfont-\S+/g, ''); 70 | $book.addClass("font-size-"+fontState.size); 71 | $book.addClass("font-family-"+fontState.family); 72 | 73 | if(fontState.theme !== 0) { 74 | $book[0].className = $book[0].className.replace(/\bcolor-theme-\S+/g, ''); 75 | $book.addClass("color-theme-"+fontState.theme); 76 | } 77 | }; 78 | 79 | function init(config) { 80 | var $bookBody, $book; 81 | 82 | //Find DOM elements. 
83 | $book = gitbook.state.$book; 84 | $bookBody = $book.find(".book-body"); 85 | 86 | // Instantiate font state object 87 | fontState = gitbook.storage.get("fontState", { 88 | size: config.size || 2, 89 | family: FAMILY[config.family || "sans"], 90 | theme: THEMES[config.theme || "white"] 91 | }); 92 | 93 | update(); 94 | }; 95 | 96 | 97 | gitbook.events.bind("start", function(e, config) { 98 | var opts = config.fontsettings; 99 | 100 | // Create buttons in toolbar 101 | gitbook.toolbar.createButton({ 102 | icon: 'fa fa-font', 103 | label: 'Font Settings', 104 | className: 'font-settings', 105 | dropdown: [ 106 | [ 107 | { 108 | text: 'A', 109 | className: 'font-reduce', 110 | onClick: reduceFontSize 111 | }, 112 | { 113 | text: 'A', 114 | className: 'font-enlarge', 115 | onClick: enlargeFontSize 116 | } 117 | ], 118 | [ 119 | { 120 | text: 'Serif', 121 | onClick: _.partial(changeFontFamily, 0) 122 | }, 123 | { 124 | text: 'Sans', 125 | onClick: _.partial(changeFontFamily, 1) 126 | } 127 | ], 128 | [ 129 | { 130 | text: 'White', 131 | onClick: _.partial(changeColorTheme, 0) 132 | }, 133 | { 134 | text: 'Sepia', 135 | onClick: _.partial(changeColorTheme, 1) 136 | }, 137 | { 138 | text: 'Night', 139 | onClick: _.partial(changeColorTheme, 2) 140 | } 141 | ] 142 | ] 143 | }); 144 | 145 | 146 | // Init current settings 147 | init(opts); 148 | }); 149 | }); 150 | 151 | 152 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/js/plugin-search.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var index = null; 3 | var $searchInput, $searchForm; 4 | var $highlighted, hi = 0, hiOpts = { className: 'search-highlight' }; 5 | var collapse = false; 6 | 7 | // Use a specific index 8 | function loadIndex(data) { 9 | // [Yihui] In bookdown, I use a character matrix to store the chapter 10 | // 
content, and the index is dynamically built on the client side. 11 | // Gitbook prebuilds the index data instead: https://github.com/GitbookIO/plugin-search 12 | // We can certainly do that via R packages V8 and jsonlite, but let's 13 | // see how slow it really is before improving it. On the other hand, 14 | // lunr cannot handle non-English text very well, e.g. the default 15 | // tokenizer cannot deal with Chinese text, so we may want to replace 16 | // lunr with a dumb simple text matching approach. 17 | index = lunr(function () { 18 | this.ref('url'); 19 | this.field('title', { boost: 10 }); 20 | this.field('body'); 21 | }); 22 | data.map(function(item) { 23 | index.add({ 24 | url: item[0], 25 | title: item[1], 26 | body: item[2] 27 | }); 28 | }); 29 | } 30 | 31 | // Fetch the search index 32 | function fetchIndex() { 33 | return $.getJSON(gitbook.state.basePath+"/search_index.json") 34 | .then(loadIndex); // [Yihui] we need to use this object later 35 | } 36 | 37 | // Search for a term and return results 38 | function search(q) { 39 | if (!index) return; 40 | 41 | var results = _.chain(index.search(q)) 42 | .map(function(result) { 43 | var parts = result.ref.split("#"); 44 | return { 45 | path: parts[0], 46 | hash: parts[1] 47 | }; 48 | }) 49 | .value(); 50 | 51 | // [Yihui] Highlight the search keyword on current page 52 | hi = 0; 53 | $highlighted = results.length === 0 ? undefined : $('.page-inner') 54 | .unhighlight(hiOpts).highlight(q, hiOpts).find('span.search-highlight'); 55 | scrollToHighlighted(); 56 | toggleTOC(results.length > 0); 57 | 58 | return results; 59 | } 60 | 61 | // [Yihui] Scroll the chapter body to the i-th highlighted string 62 | function scrollToHighlighted() { 63 | if (!$highlighted) return; 64 | var n = $highlighted.length; 65 | if (n === 0) return; 66 | var $p = $highlighted.eq(hi), p = $p[0], rect = p.getBoundingClientRect(); 67 | if (rect.top < 0 || rect.bottom > $(window).height()) { 68 | ($(window).width() >= 1240 ? 
$('.body-inner') : $('.book-body')) 69 | .scrollTop(p.offsetTop - 100); 70 | } 71 | $highlighted.css('background-color', ''); 72 | // an orange background color on the current item and removed later 73 | $p.css('background-color', 'orange'); 74 | setTimeout(function() { 75 | $p.css('background-color', ''); 76 | }, 2000); 77 | } 78 | 79 | // [Yihui] Expand/collapse TOC 80 | function toggleTOC(show) { 81 | if (!collapse) return; 82 | var toc_sub = $('ul.summary').children('li[data-level]').children('ul'); 83 | if (show) return toc_sub.show(); 84 | var href = window.location.pathname; 85 | href = href.substr(href.lastIndexOf('/') + 1); 86 | if (href === '') href = 'index.html'; 87 | var li = $('a[href^="' + href + location.hash + '"]').parent('li.chapter').first(); 88 | toc_sub.hide().parent().has(li).children('ul').show(); 89 | li.children('ul').show(); 90 | } 91 | 92 | // Create search form 93 | function createForm(value) { 94 | if ($searchForm) $searchForm.remove(); 95 | if ($searchInput) $searchInput.remove(); 96 | 97 | $searchForm = $('
    ', { 98 | 'class': 'book-search', 99 | 'role': 'search' 100 | }); 101 | 102 | $searchInput = $('', { 103 | 'type': 'search', 104 | 'class': 'form-control', 105 | 'val': value, 106 | 'placeholder': 'Type to search' 107 | }); 108 | 109 | $searchInput.appendTo($searchForm); 110 | $searchForm.prependTo(gitbook.state.$book.find('.book-summary')); 111 | } 112 | 113 | // Return true if search is open 114 | function isSearchOpen() { 115 | return gitbook.state.$book.hasClass("with-search"); 116 | } 117 | 118 | // Toggle the search 119 | function toggleSearch(_state) { 120 | if (isSearchOpen() === _state) return; 121 | if (!$searchInput) return; 122 | 123 | gitbook.state.$book.toggleClass("with-search", _state); 124 | 125 | // If search bar is open: focus input 126 | if (isSearchOpen()) { 127 | gitbook.sidebar.toggle(true); 128 | $searchInput.focus(); 129 | } else { 130 | $searchInput.blur(); 131 | $searchInput.val(""); 132 | gitbook.storage.remove("keyword"); 133 | gitbook.sidebar.filter(null); 134 | $('.page-inner').unhighlight(hiOpts); 135 | toggleTOC(false); 136 | } 137 | } 138 | 139 | // Recover current search when page changed 140 | function recoverSearch() { 141 | var keyword = gitbook.storage.get("keyword", ""); 142 | 143 | createForm(keyword); 144 | 145 | if (keyword.length > 0) { 146 | if(!isSearchOpen()) { 147 | toggleSearch(true); // [Yihui] open the search box 148 | } 149 | gitbook.sidebar.filter(_.pluck(search(keyword), "path")); 150 | } 151 | } 152 | 153 | 154 | gitbook.events.bind("start", function(e, config) { 155 | // [Yihui] disable search 156 | if (config.search === false) return; 157 | collapse = !config.toc || config.toc.collapse === 'section' || 158 | config.toc.collapse === 'subsection'; 159 | 160 | // Pre-fetch search index and create the form 161 | fetchIndex() 162 | // [Yihui] recover search after the page is loaded 163 | .then(recoverSearch); 164 | 165 | 166 | // Type in search bar 167 | $(document).on("keyup", ".book-search input", 
function(e) { 168 | var key = (e.keyCode ? e.keyCode : e.which); 169 | // [Yihui] Escape -> close search box; Up/Down: previous/next highlighted 170 | if (key == 27) { 171 | e.preventDefault(); 172 | toggleSearch(false); 173 | } else if (key == 38) { 174 | if (hi <= 0 && $highlighted) hi = $highlighted.length; 175 | hi--; 176 | scrollToHighlighted(); 177 | } else if (key == 40) { 178 | hi++; 179 | if ($highlighted && hi >= $highlighted.length) hi = 0; 180 | scrollToHighlighted(); 181 | } 182 | }).on("input", ".book-search input", function(e) { 183 | var q = $(this).val().trim(); 184 | if (q.length === 0) { 185 | gitbook.sidebar.filter(null); 186 | gitbook.storage.remove("keyword"); 187 | $('.page-inner').unhighlight(hiOpts); 188 | toggleTOC(false); 189 | } else { 190 | var results = search(q); 191 | gitbook.sidebar.filter( 192 | _.pluck(results, "path") 193 | ); 194 | gitbook.storage.set("keyword", q); 195 | } 196 | }); 197 | 198 | // Create the toggle search button 199 | gitbook.toolbar.createButton({ 200 | icon: 'fa fa-search', 201 | label: 'Search', 202 | position: 'left', 203 | onClick: toggleSearch 204 | }); 205 | 206 | // Bind keyboard to toggle search 207 | gitbook.keyboard.bind(['f'], toggleSearch); 208 | }); 209 | 210 | // [Yihui] do not try to recover search; always start fresh 211 | // gitbook.events.bind("page.change", recoverSearch); 212 | }); 213 | -------------------------------------------------------------------------------- /diagrams/_book/libs/gitbook-2.6.7/js/plugin-sharing.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var SITES = { 3 | 'github': { 4 | 'label': 'Github', 5 | 'icon': 'fa fa-github', 6 | 'onClick': function(e) { 7 | e.preventDefault(); 8 | var repo = $('meta[name="github-repo"]').attr('content'); 9 | if (typeof repo === 'undefined') throw("Github repo not defined"); 10 | window.open("https://github.com/"+repo); 
11 | } 12 | }, 13 | 'facebook': { 14 | 'label': 'Facebook', 15 | 'icon': 'fa fa-facebook', 16 | 'onClick': function(e) { 17 | e.preventDefault(); 18 | window.open("http://www.facebook.com/sharer/sharer.php?s=100&p[url]="+encodeURIComponent(location.href)); 19 | } 20 | }, 21 | 'twitter': { 22 | 'label': 'Twitter', 23 | 'icon': 'fa fa-twitter', 24 | 'onClick': function(e) { 25 | e.preventDefault(); 26 | window.open("http://twitter.com/home?status="+encodeURIComponent(document.title+" "+location.href)); 27 | } 28 | }, 29 | 'google': { 30 | 'label': 'Google+', 31 | 'icon': 'fa fa-google-plus', 32 | 'onClick': function(e) { 33 | e.preventDefault(); 34 | window.open("https://plus.google.com/share?url="+encodeURIComponent(location.href)); 35 | } 36 | }, 37 | 'linkedin': { 38 | 'label': 'LinkedIn', 39 | 'icon': 'fa fa-linkedin', 40 | 'onClick': function(e) { 41 | e.preventDefault(); 42 | window.open("https://www.linkedin.com/shareArticle?mini=true&url="+encodeURIComponent(location.href)+"&title="+encodeURIComponent(document.title)); 43 | } 44 | }, 45 | 'weibo': { 46 | 'label': 'Weibo', 47 | 'icon': 'fa fa-weibo', 48 | 'onClick': function(e) { 49 | e.preventDefault(); 50 | window.open("http://service.weibo.com/share/share.php?content=utf-8&url="+encodeURIComponent(location.href)+"&title="+encodeURIComponent(document.title)); 51 | } 52 | }, 53 | 'instapaper': { 54 | 'label': 'Instapaper', 55 | 'icon': 'fa fa-instapaper', 56 | 'onClick': function(e) { 57 | e.preventDefault(); 58 | window.open("http://www.instapaper.com/text?u="+encodeURIComponent(location.href)); 59 | } 60 | }, 61 | 'vk': { 62 | 'label': 'VK', 63 | 'icon': 'fa fa-vk', 64 | 'onClick': function(e) { 65 | e.preventDefault(); 66 | window.open("http://vkontakte.ru/share.php?url="+encodeURIComponent(location.href)); 67 | } 68 | } 69 | }; 70 | 71 | 72 | 73 | gitbook.events.bind("start", function(e, config) { 74 | var opts = config.sharing; 75 | if (!opts) return; 76 | 77 | // Create dropdown menu 78 | var menu = 
_.chain(opts.all) 79 | .map(function(id) { 80 | var site = SITES[id]; 81 | 82 | return { 83 | text: site.label, 84 | onClick: site.onClick 85 | }; 86 | }) 87 | .compact() 88 | .value(); 89 | 90 | // Create main button with dropdown 91 | if (menu.length > 0) { 92 | gitbook.toolbar.createButton({ 93 | icon: 'fa fa-share-alt', 94 | label: 'Share', 95 | position: 'right', 96 | dropdown: [menu] 97 | }); 98 | } 99 | 100 | // Direct actions to share 101 | _.each(SITES, function(site, sideId) { 102 | if (!opts[sideId]) return; 103 | 104 | gitbook.toolbar.createButton({ 105 | icon: site.icon, 106 | label: site.text, 107 | position: 'right', 108 | onClick: site.onClick 109 | }); 110 | }); 111 | }); 112 | }); 113 | -------------------------------------------------------------------------------- /diagrams/iv-dag.gv: -------------------------------------------------------------------------------- 1 | digraph ivdag { 2 | graph [layout = dot] 3 | 4 | edge [color = black] 5 | Z -> D 6 | D -> Y 7 | 8 | edge [color = gray] 9 | U -> D 10 | U -> Y 11 | 12 | } 13 | -------------------------------------------------------------------------------- /diagrams/science.mmd: -------------------------------------------------------------------------------- 1 | graph LR 2 | A(question) --> B(design) 3 | B --> C(collection) 4 | C --> D(analysis) 5 | D --> E(answer) 6 | -------------------------------------------------------------------------------- /diagrams/science2.mmd: -------------------------------------------------------------------------------- 1 | graph LR 2 | A(idea) --> B 3 | B(question) --> C(design) 4 | C --> B 5 | C --> D(collection) 6 | D --> E(analysis) 7 | E --> F(answer) 8 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/docs/.nojekyll 
-------------------------------------------------------------------------------- /docs/appendix.md: -------------------------------------------------------------------------------- 1 | 2 | # (APPENDIX) Appendix {-} 3 | -------------------------------------------------------------------------------- /docs/eda.md: -------------------------------------------------------------------------------- 1 | 2 | # (PART) Exploratory Data Analysis {-} 3 | -------------------------------------------------------------------------------- /docs/img/islr-fig-6.7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/docs/img/islr-fig-6.7.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Data Analysis Notes" 4 | author: "Jeffrey B. Arnold" 5 | date: "2018-05-07" 6 | knit: "bookdown::render_book" 7 | bibliography: ["intromethods.bib"] 8 | biblio-style: "apalike" 9 | link-citations: true 10 | documentclass: book 11 | colorlinks: yes 12 | lot: yes 13 | lof: yes 14 | monofont: "Source Code Pro" 15 | monofontoptions: "Scale=0.7" 16 | site: bookdown::bookdown_site 17 | github-repo: jrnold/intro-methods-notes 18 | description: > 19 | These are notes associated with the course, POLS/CS&SS 503: Advanced Quantitative Political Methodology at the University of Washington. 20 | --- 21 | 22 | # Introduction 23 | 24 | Notes used when teaching "POLS/CS&SS 501: Advanced Political Research Design and Analysis" and "POLS/CS&SS 503: Advanced Quantitative Political Methodology" at the University of Washington. 
25 | 26 | 27 | $$ 28 | $$ 29 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/css/fontawesome/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/docs/libs/gitbook-2.6.7/css/fontawesome/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/css/plugin-bookdown.css: -------------------------------------------------------------------------------- 1 | .book .book-header h1 { 2 | padding-left: 20px; 3 | padding-right: 20px; 4 | } 5 | .book .book-header.fixed { 6 | position: fixed; 7 | right: 0; 8 | top: 0; 9 | left: 0; 10 | border-bottom: 1px solid rgba(0,0,0,.07); 11 | } 12 | span.search-highlight { 13 | background-color: #ffff88; 14 | } 15 | @media (min-width: 600px) { 16 | .book.with-summary .book-header.fixed { 17 | left: 300px; 18 | } 19 | } 20 | @media (max-width: 1240px) { 21 | .book .book-body.fixed { 22 | top: 50px; 23 | } 24 | .book .book-body.fixed .body-inner { 25 | top: auto; 26 | } 27 | } 28 | @media (max-width: 600px) { 29 | .book.with-summary .book-header.fixed { 30 | left: calc(100% - 60px); 31 | min-width: 300px; 32 | } 33 | .book.with-summary .book-body { 34 | transform: none; 35 | left: calc(100% - 60px); 36 | min-width: 300px; 37 | } 38 | .book .book-body.fixed { 39 | top: 0; 40 | } 41 | } 42 | 43 | .book .book-body.fixed .body-inner { 44 | top: 50px; 45 | } 46 | .book .book-body .page-wrapper .page-inner section.normal sub, .book .book-body .page-wrapper .page-inner section.normal sup { 47 | font-size: 85%; 48 | } 49 | 50 | @media print { 51 | .book .book-summary, .book .book-body .book-header, .fa { 52 | display: none !important; 53 | } 54 | .book .book-body.fixed { 55 | left: 0px; 56 | } 57 | .book .book-body,.book .book-body .body-inner, .book.with-summary { 
58 | overflow: visible !important; 59 | } 60 | } 61 | .kable_wrapper { 62 | border-spacing: 20px 0; 63 | border-collapse: separate; 64 | border: none; 65 | margin: auto; 66 | } 67 | .kable_wrapper > tbody > tr > td { 68 | vertical-align: top; 69 | } 70 | .book .book-body .page-wrapper .page-inner section.normal table tr.header { 71 | border-top-width: 2px; 72 | } 73 | .book .book-body .page-wrapper .page-inner section.normal table tr:last-child td { 74 | border-bottom-width: 2px; 75 | } 76 | .book .book-body .page-wrapper .page-inner section.normal table td, .book .book-body .page-wrapper .page-inner section.normal table th { 77 | border-left: none; 78 | border-right: none; 79 | } 80 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr, .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr > td { 81 | border-top: none; 82 | } 83 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr:last-child > td { 84 | border-bottom: none; 85 | } 86 | 87 | div.theorem, div.lemma, div.corollary, div.proposition { 88 | font-style: italic; 89 | } 90 | span.theorem, span.lemma, span.corollary, span.proposition { 91 | font-style: normal; 92 | } 93 | div.proof:after { 94 | content: "\25a2"; 95 | float: right; 96 | } 97 | .header-section-number { 98 | padding-right: .5em; 99 | } 100 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/css/plugin-search.css: -------------------------------------------------------------------------------- 1 | .book .book-summary .book-search { 2 | padding: 6px; 3 | background: transparent; 4 | position: absolute; 5 | top: -50px; 6 | left: 0px; 7 | right: 0px; 8 | transition: top 0.5s ease; 9 | } 10 | .book .book-summary .book-search input, 11 | .book .book-summary .book-search input:focus, 12 | .book .book-summary .book-search input:hover { 13 | width: 100%; 14 | background: transparent; 
15 | border: 1px solid #ccc; 16 | box-shadow: none; 17 | outline: none; 18 | line-height: 22px; 19 | padding: 7px 4px; 20 | color: inherit; 21 | box-sizing: border-box; 22 | } 23 | .book.with-search .book-summary .book-search { 24 | top: 0px; 25 | } 26 | .book.with-search .book-summary ul.summary { 27 | top: 50px; 28 | } 29 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/js/jquery.highlight.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["jQuery"], function(jQuery) { 2 | 3 | /* 4 | * jQuery Highlight plugin 5 | * 6 | * Based on highlight v3 by Johann Burkard 7 | * http://johannburkard.de/blog/programming/javascript/highlight-javascript-text-higlighting-jquery-plugin.html 8 | * 9 | * Code a little bit refactored and cleaned (in my humble opinion). 10 | * Most important changes: 11 | * - has an option to highlight only entire words (wordsOnly - false by default), 12 | * - has an option to be case sensitive (caseSensitive - false by default) 13 | * - highlight element tag and class names can be specified in options 14 | * 15 | * Copyright (c) 2009 Bartek Szopka 16 | * 17 | * Licensed under MIT license. 
18 | * 19 | */ 20 | 21 | jQuery.extend({ 22 | highlight: function (node, re, nodeName, className) { 23 | if (node.nodeType === 3) { 24 | var match = node.data.match(re); 25 | if (match) { 26 | var highlight = document.createElement(nodeName || 'span'); 27 | highlight.className = className || 'highlight'; 28 | var wordNode = node.splitText(match.index); 29 | wordNode.splitText(match[0].length); 30 | var wordClone = wordNode.cloneNode(true); 31 | highlight.appendChild(wordClone); 32 | wordNode.parentNode.replaceChild(highlight, wordNode); 33 | return 1; //skip added node in parent 34 | } 35 | } else if ((node.nodeType === 1 && node.childNodes) && // only element nodes that have children 36 | !/(script|style)/i.test(node.tagName) && // ignore script and style nodes 37 | !(node.tagName === nodeName.toUpperCase() && node.className === className)) { // skip if already highlighted 38 | for (var i = 0; i < node.childNodes.length; i++) { 39 | i += jQuery.highlight(node.childNodes[i], re, nodeName, className); 40 | } 41 | } 42 | return 0; 43 | } 44 | }); 45 | 46 | jQuery.fn.unhighlight = function (options) { 47 | var settings = { className: 'highlight', element: 'span' }; 48 | jQuery.extend(settings, options); 49 | 50 | return this.find(settings.element + "." 
+ settings.className).each(function () { 51 | var parent = this.parentNode; 52 | parent.replaceChild(this.firstChild, this); 53 | parent.normalize(); 54 | }).end(); 55 | }; 56 | 57 | jQuery.fn.highlight = function (words, options) { 58 | var settings = { className: 'highlight', element: 'span', caseSensitive: false, wordsOnly: false }; 59 | jQuery.extend(settings, options); 60 | 61 | if (words.constructor === String) { 62 | words = [words]; 63 | } 64 | words = jQuery.grep(words, function(word, i){ 65 | return word !== ''; 66 | }); 67 | words = jQuery.map(words, function(word, i) { 68 | return word.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&"); 69 | }); 70 | if (words.length === 0) { return this; } 71 | 72 | var flag = settings.caseSensitive ? "" : "i"; 73 | var pattern = "(" + words.join("|") + ")"; 74 | if (settings.wordsOnly) { 75 | pattern = "\\b" + pattern + "\\b"; 76 | } 77 | var re = new RegExp(pattern, flag); 78 | 79 | return this.each(function () { 80 | jQuery.highlight(this, re, settings.element, settings.className); 81 | }); 82 | }; 83 | 84 | }); 85 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/js/plugin-bookdown.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | 3 | var gs = gitbook.storage; 4 | 5 | gitbook.events.bind("start", function(e, config) { 6 | 7 | // add the Edit button (edit on Github) 8 | var edit = config.edit; 9 | if (edit && edit.link) gitbook.toolbar.createButton({ 10 | icon: 'fa fa-edit', 11 | label: edit.text || 'Edit', 12 | position: 'left', 13 | onClick: function(e) { 14 | e.preventDefault(); 15 | window.open(edit.link); 16 | } 17 | }); 18 | 19 | var down = config.download; 20 | var normalizeDownload = function() { 21 | if (!down || !(down instanceof Array) || down.length === 0) return; 22 | if (down[0] instanceof Array) return down; 23 | return $.map(down, 
function(file, i) { 24 | return [[file, file.replace(/.*[.]/g, '').toUpperCase()]]; 25 | }); 26 | }; 27 | down = normalizeDownload(down); 28 | if (down) if (down.length === 1 && /[.]pdf$/.test(down[0][0])) { 29 | gitbook.toolbar.createButton({ 30 | icon: 'fa fa-file-pdf-o', 31 | label: down[0][1], 32 | position: 'left', 33 | onClick: function(e) { 34 | e.preventDefault(); 35 | window.open(down[0][0]); 36 | } 37 | }); 38 | } else { 39 | gitbook.toolbar.createButton({ 40 | icon: 'fa fa-download', 41 | label: 'Download', 42 | position: 'left', 43 | dropdown: $.map(down, function(item, i) { 44 | return { 45 | text: item[1], 46 | onClick: function(e) { 47 | e.preventDefault(); 48 | window.open(item[0]); 49 | } 50 | }; 51 | }) 52 | }); 53 | } 54 | 55 | // highlight the current section in TOC 56 | var href = window.location.pathname; 57 | href = href.substr(href.lastIndexOf('/') + 1); 58 | if (href === '') href = 'index.html'; 59 | var li = $('a[href^="' + href + location.hash + '"]').parent('li.chapter').first(); 60 | var summary = $('ul.summary'), chaps = summary.find('li.chapter'); 61 | if (li.length === 0) li = chaps.first(); 62 | li.addClass('active'); 63 | chaps.on('click', function(e) { 64 | chaps.removeClass('active'); 65 | $(this).addClass('active'); 66 | gs.set('tocScrollTop', summary.scrollTop()); 67 | }); 68 | 69 | var toc = config.toc; 70 | // collapse TOC items that are not for the current chapter 71 | if (toc && toc.collapse) (function() { 72 | var type = toc.collapse; 73 | if (type === 'none') return; 74 | if (type !== 'section' && type !== 'subsection') return; 75 | // sections under chapters 76 | var toc_sub = summary.children('li[data-level]').children('ul'); 77 | if (type === 'section') { 78 | toc_sub.hide() 79 | .parent().has(li).children('ul').show(); 80 | } else { 81 | toc_sub.children('li').children('ul').hide() 82 | .parent().has(li).children('ul').show(); 83 | } 84 | li.children('ul').show(); 85 | var toc_sub2 = toc_sub.children('li'); 86 | if 
(type === 'section') toc_sub2.children('ul').hide(); 87 | summary.children('li[data-level]').find('a') 88 | .on('click.bookdown', function(e) { 89 | if (href === $(this).attr('href').replace(/#.*/, '')) 90 | $(this).parent('li').children('ul').toggle(); 91 | }); 92 | })(); 93 | 94 | // add tooltips to the 's that are truncated 95 | $('a').each(function(i, el) { 96 | if (el.offsetWidth >= el.scrollWidth) return; 97 | if (typeof el.title === 'undefined') return; 98 | el.title = el.text; 99 | }); 100 | 101 | // restore TOC scroll position 102 | var pos = gs.get('tocScrollTop'); 103 | if (typeof pos !== 'undefined') summary.scrollTop(pos); 104 | 105 | // highlight the TOC item that has same text as the heading in view as scrolling 106 | if (toc && toc.scroll_highlight !== false) (function() { 107 | // scroll the current TOC item into viewport 108 | var ht = $(window).height(), rect = li[0].getBoundingClientRect(); 109 | if (rect.top >= ht || rect.top <= 0 || rect.bottom <= 0) { 110 | summary.scrollTop(li[0].offsetTop); 111 | } 112 | // current chapter TOC items 113 | var items = $('a[href^="' + href + '"]').parent('li.chapter'), 114 | m = items.length; 115 | if (m === 0) { 116 | items = summary.find('li.chapter'); 117 | m = items.length; 118 | } 119 | if (m === 0) return; 120 | // all section titles on current page 121 | var hs = bookInner.find('.page-inner').find('h1,h2,h3'), n = hs.length, 122 | ts = hs.map(function(i, el) { return $(el).text(); }); 123 | if (n === 0) return; 124 | var scrollHandler = function(e) { 125 | var ht = $(window).height(); 126 | clearTimeout($.data(this, 'scrollTimer')); 127 | $.data(this, 'scrollTimer', setTimeout(function() { 128 | // find the first visible title in the viewport 129 | for (var i = 0; i < n; i++) { 130 | var rect = hs[i].getBoundingClientRect(); 131 | if (rect.top >= 0 && rect.bottom <= ht) break; 132 | } 133 | if (i === n) return; 134 | items.removeClass('active'); 135 | for (var j = 0; j < m; j++) { 136 | if 
(items.eq(j).children('a').first().text() === ts[i]) break; 137 | } 138 | if (j === m) j = 0; // highlight the chapter title 139 | // search bottom-up for a visible TOC item to highlight; if an item is 140 | // hidden, we check if its parent is visible, and so on 141 | while (j > 0 && items.eq(j).is(':hidden')) j--; 142 | items.eq(j).addClass('active'); 143 | }, 250)); 144 | }; 145 | bookInner.on('scroll.bookdown', scrollHandler); 146 | bookBody.on('scroll.bookdown', scrollHandler); 147 | })(); 148 | 149 | // do not refresh the page if the TOC item points to the current page 150 | $('a[href="' + href + '"]').parent('li.chapter').children('a') 151 | .on('click', function(e) { 152 | bookInner.scrollTop(0); 153 | bookBody.scrollTop(0); 154 | return false; 155 | }); 156 | 157 | var toolbar = config.toolbar; 158 | if (!toolbar || toolbar.position !== 'static') { 159 | var bookHeader = $('.book-header'); 160 | bookBody.addClass('fixed'); 161 | bookHeader.addClass('fixed') 162 | .css('background-color', bookBody.css('background-color')) 163 | .on('click.bookdown', function(e) { 164 | // the theme may have changed after user clicks the theme button 165 | bookHeader.css('background-color', bookBody.css('background-color')); 166 | }); 167 | } 168 | 169 | }); 170 | 171 | gitbook.events.bind("page.change", function(e) { 172 | // store TOC scroll position 173 | var summary = $('ul.summary'); 174 | gs.set('tocScrollTop', summary.scrollTop()); 175 | }); 176 | 177 | var bookBody = $('.book-body'), bookInner = bookBody.find('.body-inner'); 178 | var chapterTitle = function() { 179 | return bookInner.find('.page-inner').find('h1,h2').first().text(); 180 | }; 181 | var bookTitle = function() { 182 | return bookInner.find('.book-header > h1').first().text(); 183 | }; 184 | var saveScrollPos = function(e) { 185 | // save scroll position before page is reloaded 186 | gs.set('bodyScrollTop', { 187 | body: bookBody.scrollTop(), 188 | inner: bookInner.scrollTop(), 189 | focused: 
document.hasFocus(), 190 | title: chapterTitle() 191 | }); 192 | }; 193 | $(document).on('servr:reload', saveScrollPos); 194 | 195 | // check if the page is loaded in an iframe (e.g. the RStudio preview window) 196 | var inIFrame = function() { 197 | var inIframe = true; 198 | try { inIframe = window.self !== window.top; } catch (e) {} 199 | return inIframe; 200 | }; 201 | $(window).on('blur unload', function(e) { 202 | if (inIFrame()) saveScrollPos(e); 203 | gs.set('bookTitle', bookTitle()); 204 | }); 205 | 206 | $(function(e) { 207 | if (gs.get('bookTitle', '') !== bookTitle()) localStorage.clear(); 208 | var pos = gs.get('bodyScrollTop'); 209 | if (pos) { 210 | if (pos.title === chapterTitle()) { 211 | if (pos.body !== 0) bookBody.scrollTop(pos.body); 212 | if (pos.inner !== 0) bookInner.scrollTop(pos.inner); 213 | } 214 | if (pos.focused) bookInner.find('.page-wrapper').focus(); 215 | } 216 | // clear book body scroll position 217 | gs.remove('bodyScrollTop'); 218 | }); 219 | 220 | }); 221 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/js/plugin-fontsettings.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var fontState; 3 | 4 | var THEMES = { 5 | "white": 0, 6 | "sepia": 1, 7 | "night": 2 8 | }; 9 | 10 | var FAMILY = { 11 | "serif": 0, 12 | "sans": 1 13 | }; 14 | 15 | // Save current font settings 16 | function saveFontSettings() { 17 | gitbook.storage.set("fontState", fontState); 18 | update(); 19 | } 20 | 21 | // Increase font size 22 | function enlargeFontSize(e) { 23 | e.preventDefault(); 24 | if (fontState.size >= 4) return; 25 | 26 | fontState.size++; 27 | saveFontSettings(); 28 | }; 29 | 30 | // Decrease font size 31 | function reduceFontSize(e) { 32 | e.preventDefault(); 33 | if (fontState.size <= 0) return; 34 | 35 | fontState.size--; 36 | saveFontSettings(); 37 | }; 38 | 39 
| // Change font family 40 | function changeFontFamily(index, e) { 41 | e.preventDefault(); 42 | 43 | fontState.family = index; 44 | saveFontSettings(); 45 | }; 46 | 47 | // Change type of color 48 | function changeColorTheme(index, e) { 49 | e.preventDefault(); 50 | 51 | var $book = $(".book"); 52 | 53 | if (fontState.theme !== 0) 54 | $book.removeClass("color-theme-"+fontState.theme); 55 | 56 | fontState.theme = index; 57 | if (fontState.theme !== 0) 58 | $book.addClass("color-theme-"+fontState.theme); 59 | 60 | saveFontSettings(); 61 | }; 62 | 63 | function update() { 64 | var $book = gitbook.state.$book; 65 | 66 | $(".font-settings .font-family-list li").removeClass("active"); 67 | $(".font-settings .font-family-list li:nth-child("+(fontState.family+1)+")").addClass("active"); 68 | 69 | $book[0].className = $book[0].className.replace(/\bfont-\S+/g, ''); 70 | $book.addClass("font-size-"+fontState.size); 71 | $book.addClass("font-family-"+fontState.family); 72 | 73 | if(fontState.theme !== 0) { 74 | $book[0].className = $book[0].className.replace(/\bcolor-theme-\S+/g, ''); 75 | $book.addClass("color-theme-"+fontState.theme); 76 | } 77 | }; 78 | 79 | function init(config) { 80 | var $bookBody, $book; 81 | 82 | //Find DOM elements. 
83 | $book = gitbook.state.$book; 84 | $bookBody = $book.find(".book-body"); 85 | 86 | // Instantiate font state object 87 | fontState = gitbook.storage.get("fontState", { 88 | size: config.size || 2, 89 | family: FAMILY[config.family || "sans"], 90 | theme: THEMES[config.theme || "white"] 91 | }); 92 | 93 | update(); 94 | }; 95 | 96 | 97 | gitbook.events.bind("start", function(e, config) { 98 | var opts = config.fontsettings; 99 | 100 | // Create buttons in toolbar 101 | gitbook.toolbar.createButton({ 102 | icon: 'fa fa-font', 103 | label: 'Font Settings', 104 | className: 'font-settings', 105 | dropdown: [ 106 | [ 107 | { 108 | text: 'A', 109 | className: 'font-reduce', 110 | onClick: reduceFontSize 111 | }, 112 | { 113 | text: 'A', 114 | className: 'font-enlarge', 115 | onClick: enlargeFontSize 116 | } 117 | ], 118 | [ 119 | { 120 | text: 'Serif', 121 | onClick: _.partial(changeFontFamily, 0) 122 | }, 123 | { 124 | text: 'Sans', 125 | onClick: _.partial(changeFontFamily, 1) 126 | } 127 | ], 128 | [ 129 | { 130 | text: 'White', 131 | onClick: _.partial(changeColorTheme, 0) 132 | }, 133 | { 134 | text: 'Sepia', 135 | onClick: _.partial(changeColorTheme, 1) 136 | }, 137 | { 138 | text: 'Night', 139 | onClick: _.partial(changeColorTheme, 2) 140 | } 141 | ] 142 | ] 143 | }); 144 | 145 | 146 | // Init current settings 147 | init(opts); 148 | }); 149 | }); 150 | 151 | 152 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/js/plugin-search.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var index = null; 3 | var $searchInput, $searchForm; 4 | var $highlighted, hi = 0, hiOpts = { className: 'search-highlight' }; 5 | var collapse = false; 6 | 7 | // Use a specific index 8 | function loadIndex(data) { 9 | // [Yihui] In bookdown, I use a character matrix to store the chapter 10 | // content, and the 
index is dynamically built on the client side. 11 | // Gitbook prebuilds the index data instead: https://github.com/GitbookIO/plugin-search 12 | // We can certainly do that via R packages V8 and jsonlite, but let's 13 | // see how slow it really is before improving it. On the other hand, 14 | // lunr cannot handle non-English text very well, e.g. the default 15 | // tokenizer cannot deal with Chinese text, so we may want to replace 16 | // lunr with a dumb simple text matching approach. 17 | index = lunr(function () { 18 | this.ref('url'); 19 | this.field('title', { boost: 10 }); 20 | this.field('body'); 21 | }); 22 | data.map(function(item) { 23 | index.add({ 24 | url: item[0], 25 | title: item[1], 26 | body: item[2] 27 | }); 28 | }); 29 | } 30 | 31 | // Fetch the search index 32 | function fetchIndex() { 33 | return $.getJSON(gitbook.state.basePath+"/search_index.json") 34 | .then(loadIndex); // [Yihui] we need to use this object later 35 | } 36 | 37 | // Search for a term and return results 38 | function search(q) { 39 | if (!index) return; 40 | 41 | var results = _.chain(index.search(q)) 42 | .map(function(result) { 43 | var parts = result.ref.split("#"); 44 | return { 45 | path: parts[0], 46 | hash: parts[1] 47 | }; 48 | }) 49 | .value(); 50 | 51 | // [Yihui] Highlight the search keyword on current page 52 | hi = 0; 53 | $highlighted = results.length === 0 ? undefined : $('.page-inner') 54 | .unhighlight(hiOpts).highlight(q, hiOpts).find('span.search-highlight'); 55 | scrollToHighlighted(); 56 | toggleTOC(results.length > 0); 57 | 58 | return results; 59 | } 60 | 61 | // [Yihui] Scroll the chapter body to the i-th highlighted string 62 | function scrollToHighlighted() { 63 | if (!$highlighted) return; 64 | var n = $highlighted.length; 65 | if (n === 0) return; 66 | var $p = $highlighted.eq(hi), p = $p[0], rect = p.getBoundingClientRect(); 67 | if (rect.top < 0 || rect.bottom > $(window).height()) { 68 | ($(window).width() >= 1240 ? 
$('.body-inner') : $('.book-body')) 69 | .scrollTop(p.offsetTop - 100); 70 | } 71 | $highlighted.css('background-color', ''); 72 | // an orange background color on the current item and removed later 73 | $p.css('background-color', 'orange'); 74 | setTimeout(function() { 75 | $p.css('background-color', ''); 76 | }, 2000); 77 | } 78 | 79 | // [Yihui] Expand/collapse TOC 80 | function toggleTOC(show) { 81 | if (!collapse) return; 82 | var toc_sub = $('ul.summary').children('li[data-level]').children('ul'); 83 | if (show) return toc_sub.show(); 84 | var href = window.location.pathname; 85 | href = href.substr(href.lastIndexOf('/') + 1); 86 | if (href === '') href = 'index.html'; 87 | var li = $('a[href^="' + href + location.hash + '"]').parent('li.chapter').first(); 88 | toc_sub.hide().parent().has(li).children('ul').show(); 89 | li.children('ul').show(); 90 | } 91 | 92 | // Create search form 93 | function createForm(value) { 94 | if ($searchForm) $searchForm.remove(); 95 | if ($searchInput) $searchInput.remove(); 96 | 97 | $searchForm = $('
    ', { 98 | 'class': 'book-search', 99 | 'role': 'search' 100 | }); 101 | 102 | $searchInput = $('', { 103 | 'type': 'search', 104 | 'class': 'form-control', 105 | 'val': value, 106 | 'placeholder': 'Type to search' 107 | }); 108 | 109 | $searchInput.appendTo($searchForm); 110 | $searchForm.prependTo(gitbook.state.$book.find('.book-summary')); 111 | } 112 | 113 | // Return true if search is open 114 | function isSearchOpen() { 115 | return gitbook.state.$book.hasClass("with-search"); 116 | } 117 | 118 | // Toggle the search 119 | function toggleSearch(_state) { 120 | if (isSearchOpen() === _state) return; 121 | if (!$searchInput) return; 122 | 123 | gitbook.state.$book.toggleClass("with-search", _state); 124 | 125 | // If search bar is open: focus input 126 | if (isSearchOpen()) { 127 | gitbook.sidebar.toggle(true); 128 | $searchInput.focus(); 129 | } else { 130 | $searchInput.blur(); 131 | $searchInput.val(""); 132 | gitbook.storage.remove("keyword"); 133 | gitbook.sidebar.filter(null); 134 | $('.page-inner').unhighlight(hiOpts); 135 | toggleTOC(false); 136 | } 137 | } 138 | 139 | // Recover current search when page changed 140 | function recoverSearch() { 141 | var keyword = gitbook.storage.get("keyword", ""); 142 | 143 | createForm(keyword); 144 | 145 | if (keyword.length > 0) { 146 | if(!isSearchOpen()) { 147 | toggleSearch(true); // [Yihui] open the search box 148 | } 149 | gitbook.sidebar.filter(_.pluck(search(keyword), "path")); 150 | } 151 | } 152 | 153 | 154 | gitbook.events.bind("start", function(e, config) { 155 | // [Yihui] disable search 156 | if (config.search === false) return; 157 | collapse = !config.toc || config.toc.collapse === 'section' || 158 | config.toc.collapse === 'subsection'; 159 | 160 | // Pre-fetch search index and create the form 161 | fetchIndex() 162 | // [Yihui] recover search after the page is loaded 163 | .then(recoverSearch); 164 | 165 | 166 | // Type in search bar 167 | $(document).on("keyup", ".book-search input", 
function(e) { 168 | var key = (e.keyCode ? e.keyCode : e.which); 169 | // [Yihui] Escape -> close search box; Up/Down: previous/next highlighted 170 | if (key == 27) { 171 | e.preventDefault(); 172 | toggleSearch(false); 173 | } else if (key == 38) { 174 | if (hi <= 0 && $highlighted) hi = $highlighted.length; 175 | hi--; 176 | scrollToHighlighted(); 177 | } else if (key == 40) { 178 | hi++; 179 | if ($highlighted && hi >= $highlighted.length) hi = 0; 180 | scrollToHighlighted(); 181 | } 182 | }).on("input", ".book-search input", function(e) { 183 | var q = $(this).val().trim(); 184 | if (q.length === 0) { 185 | gitbook.sidebar.filter(null); 186 | gitbook.storage.remove("keyword"); 187 | $('.page-inner').unhighlight(hiOpts); 188 | toggleTOC(false); 189 | } else { 190 | var results = search(q); 191 | gitbook.sidebar.filter( 192 | _.pluck(results, "path") 193 | ); 194 | gitbook.storage.set("keyword", q); 195 | } 196 | }); 197 | 198 | // Create the toggle search button 199 | gitbook.toolbar.createButton({ 200 | icon: 'fa fa-search', 201 | label: 'Search', 202 | position: 'left', 203 | onClick: toggleSearch 204 | }); 205 | 206 | // Bind keyboard to toggle search 207 | gitbook.keyboard.bind(['f'], toggleSearch); 208 | }); 209 | 210 | // [Yihui] do not try to recover search; always start fresh 211 | // gitbook.events.bind("page.change", recoverSearch); 212 | }); 213 | -------------------------------------------------------------------------------- /docs/libs/gitbook-2.6.7/js/plugin-sharing.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var SITES = { 3 | 'github': { 4 | 'label': 'Github', 5 | 'icon': 'fa fa-github', 6 | 'onClick': function(e) { 7 | e.preventDefault(); 8 | var repo = $('meta[name="github-repo"]').attr('content'); 9 | if (typeof repo === 'undefined') throw("Github repo not defined"); 10 | window.open("https://github.com/"+repo); 11 | } 12 
| }, 13 | 'facebook': { 14 | 'label': 'Facebook', 15 | 'icon': 'fa fa-facebook', 16 | 'onClick': function(e) { 17 | e.preventDefault(); 18 | window.open("http://www.facebook.com/sharer/sharer.php?s=100&p[url]="+encodeURIComponent(location.href)); 19 | } 20 | }, 21 | 'twitter': { 22 | 'label': 'Twitter', 23 | 'icon': 'fa fa-twitter', 24 | 'onClick': function(e) { 25 | e.preventDefault(); 26 | window.open("http://twitter.com/home?status="+encodeURIComponent(document.title+" "+location.href)); 27 | } 28 | }, 29 | 'google': { 30 | 'label': 'Google+', 31 | 'icon': 'fa fa-google-plus', 32 | 'onClick': function(e) { 33 | e.preventDefault(); 34 | window.open("https://plus.google.com/share?url="+encodeURIComponent(location.href)); 35 | } 36 | }, 37 | 'linkedin': { 38 | 'label': 'LinkedIn', 39 | 'icon': 'fa fa-linkedin', 40 | 'onClick': function(e) { 41 | e.preventDefault(); 42 | window.open("https://www.linkedin.com/shareArticle?mini=true&url="+encodeURIComponent(location.href)+"&title="+encodeURIComponent(document.title)); 43 | } 44 | }, 45 | 'weibo': { 46 | 'label': 'Weibo', 47 | 'icon': 'fa fa-weibo', 48 | 'onClick': function(e) { 49 | e.preventDefault(); 50 | window.open("http://service.weibo.com/share/share.php?content=utf-8&url="+encodeURIComponent(location.href)+"&title="+encodeURIComponent(document.title)); 51 | } 52 | }, 53 | 'instapaper': { 54 | 'label': 'Instapaper', 55 | 'icon': 'fa fa-instapaper', 56 | 'onClick': function(e) { 57 | e.preventDefault(); 58 | window.open("http://www.instapaper.com/text?u="+encodeURIComponent(location.href)); 59 | } 60 | }, 61 | 'vk': { 62 | 'label': 'VK', 63 | 'icon': 'fa fa-vk', 64 | 'onClick': function(e) { 65 | e.preventDefault(); 66 | window.open("http://vkontakte.ru/share.php?url="+encodeURIComponent(location.href)); 67 | } 68 | } 69 | }; 70 | 71 | 72 | 73 | gitbook.events.bind("start", function(e, config) { 74 | var opts = config.sharing; 75 | if (!opts) return; 76 | 77 | // Create dropdown menu 78 | var menu = 
_.chain(opts.all) 79 | .map(function(id) { 80 | var site = SITES[id]; 81 | 82 | return { 83 | text: site.label, 84 | onClick: site.onClick 85 | }; 86 | }) 87 | .compact() 88 | .value(); 89 | 90 | // Create main button with dropdown 91 | if (menu.length > 0) { 92 | gitbook.toolbar.createButton({ 93 | icon: 'fa fa-share-alt', 94 | label: 'Share', 95 | position: 'right', 96 | dropdown: [menu] 97 | }); 98 | } 99 | 100 | // Direct actions to share 101 | _.each(SITES, function(site, sideId) { 102 | if (!opts[sideId]) return; 103 | 104 | gitbook.toolbar.createButton({ 105 | icon: site.icon, 106 | label: site.text, 107 | position: 'right', 108 | onClick: site.onClick 109 | }); 110 | }); 111 | }); 112 | }); 113 | -------------------------------------------------------------------------------- /docs/linear-regression.md: -------------------------------------------------------------------------------- 1 | 2 | # (PART) Linear Regression {-} 3 | -------------------------------------------------------------------------------- /docs/multicolinearity.md: -------------------------------------------------------------------------------- 1 | 2 | # Colinearity and Multicolinearity 3 | 4 | ## (Perfect) Colinearity 5 | 6 | In order to estimate unique $\hat{\beta}$ OLS requires the that the columns of the design matrix $\Vec{X}$ are linearly independent. 7 | 8 | Common examples of groups of variables that are not linearly independent: 9 | 10 | - Categorical variables in which there is no excluded category. 11 | You can also include all categories of a categorical variable if you exclude the intercept. 12 | Note that although they are not (often) used in political science, there are other methods of transforming categorical variables to ensure the columns in the design matrix are independent. 13 | - A constant variable. This can happen in practice with dichotomous variables of rare events; if you drop some observations for whatever reason, you may end up dropping all the 1's in the data. 
So although the variable is not constant in the population, in your sample it is constant and cannot be included in the regression. 14 | - A variable that is a multiple of another variable. E.g. you cannot include $\log(\text{GDP in millions USD})$ and $\log({GDP in USD})$ since $\log(\text{GDP in millions USD}) = \log({GDP in USD}) / 1,000,000$. in 15 | - A variable that is the sum of two other variables. E.g. you cannot include $\log(population)$, $\log(GDP)$, $\log(GDP per capita)$ in a regression since 16 | $$\log(\text{GDP per capita}) = \log(\text{GDP} / \text{population}) = \log(\text{GDP}) - \log(\text{population})$$. 17 | 18 | 19 | ## What to do about it? 20 | 21 | R and most statistical programs will run regressions with colinear variables, but will drop variables until only linearly independent columns in $\Mat{X}$ remain. 22 | 23 | For example, consider the following code. The variable `type` is a categorical variable with categories "bc", "wc", and "prof". 24 | It will 25 | 26 | ```r 27 | data(Duncan, package = "car") 28 | # Create dummy variables for each category 29 | Duncan <- mutate(Duncan, 30 | bc = type == "bc", 31 | wc = type == "wc", 32 | prof = type == "prof") 33 | lm(prestige ~ bc + wc + prof, data = Duncan) 34 | ``` 35 | 36 | ``` 37 | ## 38 | ## Call: 39 | ## lm(formula = prestige ~ bc + wc + prof, data = Duncan) 40 | ## 41 | ## Coefficients: 42 | ## (Intercept) bcTRUE wcTRUE profTRUE 43 | ## 80.44 -57.68 -43.78 NA 44 | ``` 45 | R runs the regression, but coefficient and standard errors for `prof` are set to `NA`. 46 | 47 | You should not rely on the software to fix this for you; once you (or the software) notices the problem check the reasons it occurred. The rewrite your regression to remove whatever was creating linearly dependent variables in $\Mat{X}$. 48 | 49 | 50 | 51 | # Multicollinearity 52 | 53 | Multicollinearity is the (poor) name for less-than-perfect collinearity. 
54 | Even though there is enough variation in $\Mat{X}$ to estimate OLS coefficients, if some set of variables in $\Mat{X}$ is highly correlated it will result in large, but unbiased, standard errors on the esimates. 55 | 56 | What happens if variables are not linearly dependent, but nevertheless highly correlated? 57 | If $\Cor(\Vec{x}_1, vec{x}_2) = 1$, then they are linearly dependent and the regression cannot be estimated (see above). 58 | But if $\Cor(\Vec{x}_1, vec{x}_2) = 0.99$, the OLS can estimate unique values of of $\hat\beta$. However, it everything was fine with OLS estimates until, suddenly, when there is linearly independence everything breaks. The answer is yes, and no. 59 | As $|\Cor(\Vec{x}_1, \Vec{x}_2)| \to 1$ the standard errors on the coefficients of these variables increase, but OLS as an estimator works correctly; $\hat\beta$ and $\se{\hat\beta}$ are unbiased. 60 | With multicollinearly, OLS gives you the "right" answer, but it cannot say much with certainty. 61 | 62 | 63 | For a bivariate regression, the distribution of the slope coefficient has variance, 64 | $$ 65 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{\sum_{i = 1} (x_i - \bar{x})^2} . 66 | $$ 67 | 68 | What affects the standard error of $\hat{\beta}$? 69 | 70 | - The error variance ($\sigma_u^2$). The higher the variance of the residuals, the higher the variance of the coefficients. 71 | - The variance of $\Vec{x}$. The lower variation in $\Mat{x}$, the bigger the standard errors of the slope. 72 | 73 | Now consider a multiple regression, 74 | $$ 75 | \Vec{y} = \beta_0 + \beta_1 \Vec{x}_1 + \beta_2 \Vec{x}_2 + u 76 | $$ 77 | 78 | this becomes, 79 | $$ 80 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{(1 - R^2_1) \sum_{i = 1}^n (x_i - \bar{x})^2} 81 | $$ 82 | where $R^2_1$ is the $R^2$ from the regression of $\Vec{x}_1$ on $\Vec{x}_2$, 83 | $$ 84 | \Vec{x} = \hat{\delta}_0 + \hat{\delta}_1 \Vec{x}_2 . 85 | $$ 86 | 87 | The factors affecting standard errors are 88 | 89 | 1. 
Error variance: higher residuals leads to higher standard errors. 90 | 2. Variance of $\Vec{x}_1$: lower variation in $\Vec{x}_2$ leads to higher standard errors. 91 | 3. The strength of the relationship between $x_1$ and $x_2$. Stronger relationship between $x_1$ and $x_2$ (higher $R^2$ of the regression of $x_1$ on $x_2$) leads to higher standard errors. 92 | 93 | These arguments generalize to more than two predictors. 94 | 95 | ### What do do about it? 96 | 97 | Multicollinearity is not an "error" in the model. 98 | All you can do is: 99 | 100 | 1. Get more data 101 | 2. Find more conditional variation in the predictor of interest 102 | 103 | What it means depends on what you are doing. 104 | 105 | 1. Prediction: then you are interested in $\hat{\Vec{y}}$ and not $\hat{\beta}}$ (or its standard errors). 106 | In this case, multicollinearity is irrelevant. 107 | 108 | 2. Causal inference: in this case you are interested in $\hat{\Vec{\beta}}$. 109 | Multicollinearity does not bias $\hat{\beta}$. 110 | You should include all regressors to achieve balance, and include all relevant pre-treatment variables and not include post-treatment variables. 111 | Multicollinearity is not directly relevant in this choice. 112 | All multicollinearity means is that the variation in the treatment after accounting for selection effects is very low, making it hard to say anything about the treatment effect with that observational data. 113 | More sophisticated methods may trade off some bias for a lower variance (e.g. shrinkage methods), but that must be done systematically, and not ad-hoc dropping relevant pre-treatment variables that simply correlate highly with your treatment variable. 
114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /docs/multicollinearity.md: -------------------------------------------------------------------------------- 1 | 2 | # Collinearity and Multicollinearity 3 | 4 | 5 | ```r 6 | library("tidyverse") 7 | library("carData") 8 | ``` 9 | 10 | ## (Perfect) collinearity 11 | 12 | In order to estimate unique $\hat{\beta}$ OLS requires the that the columns of the design matrix $\Vec{X}$ are linearly independent. 13 | 14 | Common examples of groups of variables that are not linearly independent: 15 | 16 | - Categorical variables in which there is no excluded category. 17 | You can also include all categories of a categorical variable if you exclude the intercept. 18 | Note that although they are not (often) used in political science, there are other methods of transforming categorical variables to ensure the columns in the design matrix are independent. 19 | 20 | - A constant variable. This can happen in practice with dichotomous 21 | variables of rare events; if you drop some observations for whatever 22 | reason, you may end up dropping all the 1's in the data. So although the 23 | variable is not constant in the population, in your sample it is constant 24 | and cannot be included in the regression. 25 | 26 | - A variable that is a multiple of another variable. E.g. you cannot include $\log(\text{GDP in millions USD})$ and $\log({GDP in USD})$ since $\log(\text{GDP in millions USD}) = \log({GDP in USD}) / 1,000,000$. 27 | 28 | - A variable that is the sum of two other variables. E.g. you cannot include $\log(population)$, $\log(GDP)$, $\log(GDP per capita)$ in a regression since 29 | $$\log(\text{GDP per capita}) = \log(\text{GDP} / \text{population}) = \log(\text{GDP}) - \log(\text{population})$$. 30 | 31 | ## What to do about it? 
32 | 33 | R and most statistical programs will run regressions with collinear variables, but will drop variables until only linearly independent columns in $\Mat{X}$ remain. 34 | 35 | For example, consider the following code. The variable `type` is a categorical variable with categories "bc", "wc", and "prof". 36 | 37 | 38 | ```r 39 | data(Duncan, package = "carData") 40 | # Create dummy variables for each category 41 | Duncan <- mutate(Duncan, 42 | bc = type == "bc", 43 | wc = type == "wc", 44 | prof = type == "prof") 45 | lm(prestige ~ bc + wc + prof, data = Duncan) 46 | ``` 47 | 48 | ``` 49 | ## 50 | ## Call: 51 | ## lm(formula = prestige ~ bc + wc + prof, data = Duncan) 52 | ## 53 | ## Coefficients: 54 | ## (Intercept) bcTRUE wcTRUE profTRUE 55 | ## 80.44 -57.68 -43.78 NA 56 | ``` 57 | R runs the regression, but coefficient and standard errors for `prof` are set to `NA`. 58 | 59 | You should not rely on the software to fix this for you; once you (or the software) notices the problem check the reasons it occurred. The rewrite your regression to remove whatever was creating linearly dependent variables in $\Mat{X}$. 60 | 61 | ## Multicollinearity 62 | 63 | Multicollinearity is the (poor) name for less-than-perfect collinearity. 64 | Even though there is enough variation in $\Mat{X}$ to estimate OLS coefficients, if some set of variables in $\Mat{X}$ is highly correlated it will result in large, but unbiased, standard errors on the estimates. 65 | 66 | What happens if variables are not linearly dependent, but nevertheless highly correlated? 67 | If $\Cor(\Vec{x}_1, vec{x}_2) = 1$, then they are linearly dependent and the regression cannot be estimated (see above). 68 | But if $\Cor(\Vec{x}_1, vec{x}_2) = 0.99$, the OLS can estimate unique values of of $\hat\beta$. However, it everything was fine with OLS estimates until, suddenly, when there is linearly independence everything breaks. The answer is yes, and no. 
69 | As $|\Cor(\Vec{x}_1, \Vec{x}_2)| \to 1$ the standard errors on the coefficients of these variables increase, but OLS as an estimator works correctly; $\hat\beta$ and $\se{\hat\beta}$ are unbiased. 70 | With multicollinearity, OLS gives you the "right" answer, but it cannot say much with certainty. 71 | 72 | For a bivariate regression, the distribution of the slope coefficient has variance, 73 | $$ 74 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{\sum_{i = 1} (x_i - \bar{x})^2} . 75 | $$ 76 | 77 | What affects the standard error of $\hat{\beta}$? 78 | 79 | - The error variance ($\sigma_u^2$). The higher the variance of the residuals, the higher the variance of the coefficients. 80 | - The variance of $\Vec{x}$. The lower variation in $\Mat{x}$, the bigger the standard errors of the slope. 81 | 82 | Now consider a multiple regression, 83 | $$ 84 | \Vec{y} = \beta_0 + \beta_1 \Vec{x}_1 + \beta_2 \Vec{x}_2 + u 85 | $$ 86 | 87 | this becomes, 88 | $$ 89 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{(1 - R^2_1) \sum_{i = 1}^n (x_i - \bar{x})^2} 90 | $$ 91 | where $R^2_1$ is the $R^2$ from the regression of $\Vec{x}_1$ on $\Vec{x}_2$, 92 | $$ 93 | \Vec{x} = \hat{\delta}_0 + \hat{\delta}_1 \Vec{x}_2 . 94 | $$ 95 | 96 | The factors affecting standard errors are 97 | 98 | 1. Error variance: higher residuals leads to higher standard errors. 99 | 1. Variance of $\Vec{x}_1$: lower variation in $\Vec{x}_2$ leads to higher standard errors. 100 | 1. The strength of the relationship between $x_1$ and $x_2$. Stronger relationship between $x_1$ and $x_2$ (higher $R^2$ of the regression of $x_1$ on $x_2$) leads to higher standard errors. 101 | 102 | These arguments generalize to more than two predictors. 103 | 104 | ## What do do about it? 105 | 106 | Multicollinearity is not an "error" in the model. 107 | All you can do is: 108 | 109 | 1. Get more data 110 | 1. Find more conditional variation in the predictor of interest 111 | 112 | What it means depends on what you are doing. 
113 | 114 | 1. Prediction: then you are interested in $\hat{\Vec{y}}$ and not $\hat{\beta}}$ (or its standard errors). 115 | In this case, multicollinearity is irrelevant. 116 | 117 | 1. Causal inference: in this case you are interested in $\hat{\Vec{\beta}}$. 118 | Multicollinearity does not bias $\hat{\beta}$. 119 | You should include all regressors to achieve balance, and include all relevant pre-treatment variables and not include post-treatment variables. 120 | Multicollinearity is not directly relevant in this choice. 121 | All multicollinearity means is that the variation in the treatment after accounting for selection effects is very low, making it hard to say anything about the treatment effect with that observational data. 122 | More sophisticated methods may trade off some bias for a lower variance (e.g. shrinkage methods), but that must be done systematically, and not ad-hoc dropping relevant pre-treatment variables that simply correlate highly with your treatment variable. 123 | -------------------------------------------------------------------------------- /docs/presentation.md: -------------------------------------------------------------------------------- 1 | 2 | # (PART) Presentation {-} 3 | -------------------------------------------------------------------------------- /docs/programming.md: -------------------------------------------------------------------------------- 1 | 2 | # (PART) Programming {-} 3 | -------------------------------------------------------------------------------- /docs/rd.md: -------------------------------------------------------------------------------- 1 | 2 | # Regression Discontinuity 3 | 4 | Summary: If there are thresholds whereby some observations receive the 5 | treatment above it, other those below it do not, and those immediately above or 6 | below that threshold are similar, we can use the difference of the outcome 7 | between those just above and those just below the threshold to estimate the 8 | causal 
effect of the treatment. 9 | 10 | Suppose there is a running variable $x$ such that any person receives the treatment, $d$ if $x \geq a$ and does not if $x \leq a$, 11 | $$ 12 | d = \begin{cases} 13 | 1 & x \geq a \\ 14 | 0 & x < a 15 | \end{cases} 16 | $$ 17 | 18 | A simple regression discontinuity model is, 19 | $$ 20 | \begin{aligned}[t] 21 | y_i = \alpha + \beta x_i + \tau d_i + \gamma x_i d_i + \epsilon_i 22 | \end{aligned} 23 | $$ 24 | The local causal effect of the treatment at the discontinuity is $\tau$. 25 | 26 |
    27 | Fake Example of a Regression Discontinuity. The difference at the threshold (50) is the effect of the treatment. 28 |

    (\#fig:unnamed-chunk-2)Fake Example of a Regression Discontinuity. The difference at the threshold (50) is the effect of the treatment.

    29 |
    30 | 31 | However, there are several choices 32 | 33 | - Functional form of the trends before and after the discontinuity 34 | - The size of the window of observations before and after the trend which to compare. 35 | 36 | How to choose? 37 | 38 | - parametric: chooses specific functional forms 39 | - non-parametric: uses flexible forms, and chooses a bandwidth [@ImbensKalyanaraman2011a] 40 | 41 | Sharp vs. Fuzzy Discontinuity? 42 | 43 | - Sharp: the assignment of the treatment occurs with certainty at the threshold. 44 | - Fuzzy: the assignment of the treatment occurs only probabilistically at the threshold. 45 | 46 | Suppose that the causal effect of treatment $T \in \{0, 1\}$ on unit $i$ is $\tau_i = Y_i(1) - Y_i(0)$ where $Y_i(1)$ is the potential outcome of $i$ under the treatment and $Y_i(0)$ is the potential outcome of $i$ under the control. 47 | If potential outcomes are distributed smoothly at the cut-point $c$, then the average causal effect of the treatment at the cut-point, $Z_i = c$: 48 | $$ 49 | \tau_{RD} = \E[Y_{i}(1) - Y_i(0)| Z_i = c] = \lim_{Z_i \downarrow c}\E[Y_{i}(1) | Z_i = c] - \lim_{Z_i \uparrow c}\E[Y_i(0)| Z_i = c] 50 | $$ 51 | 52 | An advantage of RD designs is that unlike selection on observables or IV, its identifying assumptions are more observable and testable. 53 | 54 | There are two basic tests (@LeeLemieux2010a): 55 | 56 | 1. Continuity of pre-treatment covariates. E.g. density test of McCrary (2008). Whether the ratio of treated to control units departs from chance. 57 | A difficulty is that balance only holds in the limit, and covariance balance may still be present in finite samples. 58 | 59 | 1. Irrelevance of covariates to the treatment-outcome relationship. There should be no systematic association between covariates and treatment, so controlling for them shouldn't affect the estimates. 60 | 61 | ## Examples 62 | 63 | - @ThistlethwaiteCampbell1960a was the first example of RD. 
64 | 65 | - Outcome: Career choices in teaching 66 | - Running variable: PSAS scores 67 | - Cutoff: receiving National Merit Finalist 68 | - Discussed: @AngristPischke2014a [Ch 4] 69 | 70 | - @CarpenterDobkin2011a, @CarpenterDobkin2009a 71 | 72 | - Running variable: age 73 | - Cutoff: ability to drink alcohol legally 74 | - Outcome: Death, accidents 75 | - Discussed: @AngristPischke2014a [Ch 4] 76 | 77 | - @AbdulkadirogluAngristPathak2014a 78 | 79 | - Running variable: exam score 80 | - Cutoff: above threshold receive an offer from a school. This is fuzzy since not all those who receive the offer attend. 81 | - Outcome: Educational outcomes 82 | - Discussed: @AngristPischke2014a [Ch 4] 83 | 84 | - @EggersHainmueller2009a 85 | 86 | - units: UK MPs 87 | - outcome: personal wealth 88 | - treatment: winning an election (holding office) 89 | - running variable: vote share 90 | 91 | - @LitschigMorrison2013a 92 | 93 | - units: Brazilian municipalities 94 | - outcome: education, literacy, poverty rate 95 | - treatment: receiving a cash transfer from the central government (there are population cutoffs) 96 | - running variable: population 97 | 98 | - @GelmanHill2007a [p. 213-217] 99 | 100 | - units: US Congressional members 101 | - outcome: ideology of representative 102 | - treatment: winning election 103 | - running variable: vote share 104 | 105 | - @GelmanKatz2007a, @GelmanHill2007a [p. 232] 106 | 107 | - units: patients 108 | - outcome: length of hospital stay 109 | - treatment: new surgery method 110 | - cutoff: not performed on those over 80 111 | - running variable: age 112 | 113 | - @LeeMorettiButler2004a. Also see derived examples in @Bailey2016a [Ex. 6.3]. See @Button2015a for a replication. 
114 | 115 | - units: congressional districts 116 | - outcome: ideology of nominees 117 | - treatment: election 118 | - running variable: vote share 119 | 120 | - @JacobLefgren2004a 121 | 122 | - units: students 123 | - outcome: education achievement 124 | - treatment: summer school, retention 125 | - running variable: standardized test 126 | 127 | ## Example: Close Elections 128 | 129 | A common use of RD in political science and econ is election outcomes. 130 | In this case the "treatment" is winning the election; it is applied to the candidate whose vote exceeds the threshold of 50%, but not to candidates arbitrarily below that threshold. 131 | Thus "close" elections are a common use of RD designs. 132 | This design was formalized in @Lee2008a. 133 | 134 | Several papers question whether close elections satisfy the assumptions of RD: 135 | 136 | - @CaugheySekhon2011a look at US House elections (1942-2008). They find that close elections are more imbalanced. They attribute this to national partisan waves. 137 | - @GrimmerHershFeinsteinEtAl2011a look at all US House elections 1880-2008. They find that structurally advantaged candidates (strong party, incumbents) are more likely to win close elections. 138 | 139 | The ways in which close elections can be non-random are lawsuit challenges and fraud. 140 | 141 | @EggersFowlerHainmuellerEtAl2014a addresses these concerns with a systematic review of 40,000 close elections: "U.S. House in other time periods, statewide, state legislative, and mayoral races in the U.S. and national or local elections in nine other countries" 142 | Only the US House appears to have these issues. 143 | 144 | ## Software 145 | 146 | See the R packages 147 | 148 | - **[rddtools](https://cran.r-project.org/package=rddtools)**: a new and fairly complete package of regression discontinuity from primary data viz to other tests. 
149 | - **[rdd](https://cran.r-project.org/package=rdd)** 150 | - **[rdrobust](https://cran.r-project.org/package=rdrobust)**: Tools for data-driven graphical and analytical statistical inference in RD. 151 | - **[rdpower](https://cran.r-project.org/package=rdpower)**: Calculate power for RD designs. 152 | - **[rdmulti](https://cran.r-project.org/package=rdmulti)**: Analyze designs with multiple cutoffs. 153 | 154 | See entries in the [Econometrics](https://cran.r-project.org/web/views/Econometrics.html) task view. 155 | 156 | ## References 157 | 158 | Textbooks and Reviews: 159 | 160 | - @AngristPischke2014a [Ch. 4] 161 | - @GelmanHill2007a [Sec. 10.4] 162 | - @Bailey2016a [Ch. 11] 163 | - @LindenAdamsRoberts2006a for applications to medicine 164 | - @HahnToddKlaauw2001a An early review of RD in economics 165 | 166 | Methods: 167 | 168 | - @ImbensKalyanaraman2011a propose an optimal bandwidth selection method 169 | -------------------------------------------------------------------------------- /docs/references-3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/docs/references-3.html -------------------------------------------------------------------------------- /docs/references.md: -------------------------------------------------------------------------------- 1 | 2 | # References {-} 3 | -------------------------------------------------------------------------------- /docs/reproducible-research.md: -------------------------------------------------------------------------------- 1 | 2 | # Reproducible Research 3 | -------------------------------------------------------------------------------- /docs/reproducible_research.md: -------------------------------------------------------------------------------- 1 | 2 | # Reproducible Research 3 | -------------------------------------------------------------------------------- 
/docs/word-processing.md: -------------------------------------------------------------------------------- 1 | 2 | # Typesetting and Word Processing Programs 3 | 4 | ## LaTeX 5 | 6 | [LaTeX](https://en.wikipedia.org/wiki/LaTeX) is a document markup language (think something like HTML) that is widely used in academia.[^pronunciation] 7 | Its primary advantages over Word (and word processors) are the separation of content and presentation and its formatting of mathematical equations. 8 | In addition to papers, it is often used for academic slides; many talk slides are prepared with beamer. 9 | 10 | ### Learning LaTeX 11 | 12 | Here are some links to get started learning LaTeX: 13 | 14 | - [Overleaf Free & Interactive Online Introduction to LaTeX](https://www.overleaf.com/latex/learn/free-online-introduction-to-latex-part-1) 15 | - [LaTeX Tutorial](https://www.latex-tutorial.com/tutorials/) has interactive lessons 16 | - [ShareLaTeX Documentation](https://www.sharelatex.com/learn/) 17 | - [Overleaf Example Templates](https://www.overleaf.com/latex/templates/) has many different examples of LaTeX documents. 18 | - [LaTeX Wikibook](https://en.wikibooks.org/wiki/LaTeX) 19 | - [Not So Short Introduction to LaTeX](https://tobi.oetiker.ch/lshort/lshort.pdf) is a classic, but not as as new-user friendly as the others. 20 | 21 | ### Using LaTeX 22 | 23 | - Use an online service such as [Overleaf](https://www.overleaf.com/) or [ShareLaTeX](https://www.sharelatex.com/). These are great for collaboration, but become inflexible 24 | when you want to customize your workflow. 25 | 26 | - Write it with a specialized editor such as [TeXmaker](http://www.xm1math.net/texmaker/), [TeXStudio](http://www.texstudio.org/), or [TeXshop](http://pages.uoregon.edu/koch/texshop/). These generally have 27 | built ways to insert text, and also live preview. I would stay away from editors such as [LyX](https://www.lyx.org/) that are [WYSIWYG](https://en.wikipedia.org/wiki/WYSIWYG). 
28 | 29 | - Write it with a general purpose editor such as [Atom](https://atom.io/) or [Sublime Text](https://www.sublimetext.com/).[^1] Most editors have a plugin 30 | to make writing LaTeX easier. For Atom there is [LaTeXTools](https://atom.io/packages/latextools), and for Sublime Text, [LaTeXTools](https://github.com/SublimeText/LaTeXTools) 31 | 32 | [^1]: And of course [Vim](http://www.vim.org/) or [Emacs](https://www.gnu.org/software/emacs/). 33 | 34 | ### LaTeX with R 35 | 36 | This is pretty easy. Rnw, also called Sweave, documents allow you to mix R chunks with LaTeX. 37 | This is similar to R markdown, but with LaTeX instead of markdown.[^2] 38 | 39 | Many packages, such as [xtable](https://cran.r-project.org/package=xtable), [stargazer](https://cran.r-project.org/package=stargazer), or [texreg](https://cran.r-project.org/package=texreg) produce formatted output in LaTeX. 40 | When you use these programs, do not copy and paste the output. Instead, save it to a file, 41 | and use `\input{}` to include the contents in your document. 42 | 43 | [^2]: And [Sweave](https://www.statistik.lmu.de/~leisch/Sweave/) files preceded R markdown and knitr by many years. 44 | 45 | ## Word 46 | 47 | While I use LaTeX in my own work, Microsoft Word is a powerful piece of software, 48 | and many of the complaints against Word come down to not being aware of its 49 | features. There are many tools you can use to build your research paper; 50 | whatever tool you use, learn how to use it proficiently. 51 | 52 | ### General Advice 53 | 54 | This guide on using [Microsoft Word for Dissertations](http://guides.lib.umich.edu/c.php?g=283073&p=1886001) 55 | covers everything and more that I would have. 
Also see [this](http://www3.nd.edu/~shill2/dtclass/word_2013_word_for_research_projects.pdf). 56 | 57 | - [separate presentation and content](https://en.wikipedia.org/wiki/Separation_of_presentation_and_content) using styles 58 | 59 | - Automatically number figures and tables 60 | 61 | - Use a reference manager like [Mendeley](https://www.mendeley.com/), [Zotero](https://www.zotero.org/), [colwiz](https://www.colwiz.com/app), or [Papers](http://www.papersapp.com/). They have plugins for citations in Word. 62 | 63 | - When exporting figures for Word, if you must use a [raster graphic](https://en.wikipedia.org/wiki/Raster_graphics) use PNG files (not JPEG). For publication, use a high DPI (600) with PNG graphics. 64 | 65 | - Learn to use *Fields*. You can insert figures from files that you can 66 | update using `Insert > Field > Links and References > IncludePicture`. 67 | This is useful for programmatically generating figures to insert into 68 | your document. Likewise, you can insert text from files that you can 69 | update using `Insert > Field > Links and References > IncludeText`. 70 | 71 | ### Using R with Word 72 | 73 | For dynamic reports you can use [R Markdown](http://rmarkdown.rstudio.com/word_document_format.html) and export to a Word document. When doing this, use a reference document to set the styles that you will use. 74 | See [Happy collaboration with Rmd to docx](http://rmarkdown.rstudio.com/articles_docx.html) for more advice on using R Markdown with Word. 75 | 76 | Functions from packages such as [xtable](https://cran.r-project.org/package=xtable), [stargazer](https://cran.r-project.org/package=stargazer), or [texreg](https://cran.r-project.org/package=texreg) can output HTML, which can be copied and pasted into Word. 77 | 78 | Finally, the [ReporteR](http://davidgohel.github.io/ReporteRs/word.html) package is an alternative method to generate Word Documents from R. 
79 | 80 | [^pronunciation]: TeX is pronounced as "teck" because the X is a Greek chi. The pronunciation of of LaTeX is thus lah-teck or lay-teck. It is not 81 | pronounced like the rubber compound. See this [StackExchange](http://tex.stackexchange.com/questions/17502/what-is-the-correct-pronunciation-of-tex-and-latex) question on the pronunciation of LaTeX. 82 | -------------------------------------------------------------------------------- /docs/writing.md: -------------------------------------------------------------------------------- 1 | 2 | # Writing Resources 3 | 4 | ## Writing and Organizing Papers 5 | 6 | - Chris Adolph. [Writing Empirical Papers: 6 Rules & 12 Recommendations](http://faculty.washington.edu/cadolph/503/papers.pdf) 7 | 8 | - Barry R. Weingast. 2015. [CalTech Rules for Writing Papers: How to Structure Your Paper and Write an Introduction](https://web.stanford.edu/group/mcnollgast/cgi-bin/wordpress/wp-content/uploads/2013/10/CALTECH.RUL_..pdf) 9 | 10 | - [The Science of Scientific Writing](http://www.americanscientist.org/issues/id.877,y.0,no.,content.true,page.1,css.print/issue.aspx) *American Scientist* 11 | 12 | - Deidre McCloskey. [Economical Writing](http://www.amazon.com/Economical-Writing-Deirdre-McCloskey/dp/1577660633/) 13 | 14 | - William Thompson. [A Guide for the Young Economist](http://www.amazon.com/Guide-Young-Economist-MIT-Press/dp/026251589X). "Chapter 2: Writing Papers." 15 | 16 | - Stephen Van Evera. [Guide to Methods for Students of Political Science](http://www.amazon.com/Guide-Methods-Students-Political-Science/dp/080148457X). Appendix. 17 | 18 | - Joseph M. Williams and Joseph Bizup. [Style: Lessons in Clarity and Grace](http://www.amazon.com/dp/0321898680/) 19 | 20 | - Strunk and White. 
*The Elements of Style* 21 | 22 | - [Chicago Manual of Style](http://www.chicagomanualofstyle.org/) and [APSA Style Manual for Political Science](http://www.apsanet.org/Portals/54/APSA%20Files/publications/APSAStyleManual2006.pdf) for editorial and style issues. 23 | 24 | - [How to construct a Nature summary paragraph](http://www.nature.com/nature/authors/gta/Letter_bold_para.doc). Though specific to *Nature*, it provides good advice for structuring abstracts or introductions. 25 | 26 | - Ezra Klein. [How researchers are terrible communications, and how they can do better](http://chrisblattman.com/2015/11/05/ezra-klein-how-researchers-are-terrible-communicators-and-how-they-can-do-better/). 27 | 28 | - The advice in the *AJPS* [Instructions for Submitting Authors](http://ajps.org/guidelines-for-manuscripts/) is a concise description of how to write an abstract: 29 | 30 | > The abstract should provide a very concise descriptive summary of the research stream to which the manuscript contributes, the specific research 31 | > topic it addresses, the research strategy employed for the analysis, the results obtained from the analysis, and the implications of the findings. 32 | 33 | - [Concrete Advice for Writing Informative Abstracts](http://connection.sagepub.com/blog/sage-connection/2014/05/15/concrete-advice-for-writing-informative-abstracts/) and [How to Carefully Choose Useless Titles for Academic Writing](http://www.socialsciencespace.com/2014/03/how-to-carefully-choose-useless-titles-for-academic-writing/) 34 | 35 | ## Finding Research Ideas 36 | 37 | - Paul Krugman [How I Work](http://web.mit.edu/krugman/www/howiwork.html) 38 | - Hal Varian. 
[How to build an Economic Model in your spare time](http://people.ischool.berkeley.edu/~hal/Papers/how.pdf) 39 | - Greg Mankiw, [My Rules of Thumb](http://faculty.som.yale.edu/jameschoi/mankiw_tips.pdf): 40 | - The links in [Advice for Grad Students](http://gregmankiw.blogspot.com/2006/05/advice-for-grad-students.html) 41 | 42 | ## Replications 43 | 44 | Gary King has advice on how to turn a replication into a publishable paper: 45 | 46 | - Gary King [How to Write a Publishable Paper as a Class Project](http://gking.harvard.edu/papers) 47 | 48 | - Gary King. 2006. "[Publication, Publication.](http://gking.harvard.edu/files/abs/paperspub-abs.shtml)" *PS: Political Science and Politics*. 49 | 50 | - [Political Science Should Not Stop Young Researchers from Replicating](https://politicalsciencereplication.wordpress.com/2015/06/15/political-science-should-not-stop-young-researchers-from-replicating/) 51 | from the [Political Science Replication](https://politicalsciencereplication.wordpress.com) blog. 52 | 53 | And see the examples of students replications from his Harvard course at . 54 | 55 | Famous replications. 56 | 57 | - "Irregularities in LaCour (2014) [@BroockmanKallaAronow2015a] 58 | - "Does High Public Debt Consistently Stifle Economic Growth? A Critique of Reinhart and Rogoff." [@HerndonAshPollin2013a] 59 | 60 | However, although those replications are famous for finding fraud or obvious 61 | errors in the analysis, replications can lead to extensions and generate new 62 | ideas. This was the intent of @BroockmanKallaAronow2015a when starting the 63 | replication. 
64 | -------------------------------------------------------------------------------- /eda.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Exploratory Data Analysis {-} 2 | -------------------------------------------------------------------------------- /img/1000px-Coefficient_of_Determination.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/img/1000px-Coefficient_of_Determination.svg.png -------------------------------------------------------------------------------- /img/islr-fig-6.7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/img/islr-fig-6.7.png -------------------------------------------------------------------------------- /img/laffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/img/laffer.png -------------------------------------------------------------------------------- /img/tobias-funke-blue.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/img/tobias-funke-blue.jpeg -------------------------------------------------------------------------------- /includes/after_body.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/includes/after_body.html -------------------------------------------------------------------------------- /includes/before_body.html: 
-------------------------------------------------------------------------------- 1 | \[ 2 | \DeclareMathOperator{\E}{E} 3 | \DeclareMathOperator{\mean}{mean} 4 | \DeclareMathOperator{\Var}{Var} 5 | \DeclareMathOperator{\Cov}{Cov} 6 | \DeclareMathOperator{\Cor}{Cor} 7 | \DeclareMathOperator{\Bias}{Bias} 8 | \DeclareMathOperator{\MSE}{MSE} 9 | \DeclareMathOperator{\RMSE}{RMSE} 10 | \DeclareMathOperator{\sd}{sd} 11 | \DeclareMathOperator{\se}{se} 12 | \DeclareMathOperator{\rank}{rank} 13 | \DeclareMathOperator*{\argmin}{arg\,min} 14 | \DeclareMathOperator*{\argmax}{arg\,max} 15 | 16 | \newcommand{\Mat}[1]{\boldsymbol{#1}} 17 | \newcommand{\Vec}[1]{\boldsymbol{#1}} 18 | \newcommand{\T}{'} 19 | 20 | \newcommand{\distr}[1]{\mathcal{#1}} 21 | \newcommand{\dnorm}{\distr{N}} 22 | \newcommand{\dmvnorm}[1]{\distr{N}_{#1}} 23 | \newcommand{\dt}[1]{\distr{T}_{#1}} 24 | 25 | \newcommand{\cia}{\perp\!\!\!\perp} 26 | \DeclareMathOperator*{\plim}{plim} 27 | \] 28 | -------------------------------------------------------------------------------- /includes/in_header.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrnold/intro-methods-notes/4c1342aa322c728ad21bbfaa2eeade554cb79b6c/includes/in_header.html -------------------------------------------------------------------------------- /includes/preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{booktabs} 2 | 3 | \DeclareMathOperator{\E}{E} 4 | \DeclareMathOperator{\mean}{mean} 5 | \DeclareMathOperator{\Var}{Var} 6 | \DeclareMathOperator{\Cov}{Cov} 7 | \DeclareMathOperator{\Cor}{Cor} 8 | \DeclareMathOperator{\Bias}{Bias} 9 | \DeclareMathOperator{\MSE}{MSE} 10 | \DeclareMathOperator{\sd}{sd} 11 | \DeclareMathOperator{\se}{se} 12 | \DeclareMathOperator{\rank}{rank} 13 | \DeclareMathOperator*{\argmin}{arg\,min} 14 | \DeclareMathOperator*{\argmax}{arg\,max} 15 | 16 | \newcommand{\mat}[1]{\boldsymbol{#1}} 17 | 
\renewcommand{\vec}[1]{\boldsymbol{#1}} 18 | \renewcommand{\T}{'} 19 | 20 | \newcommand{\distr}[1]{\mathcal{#1}} 21 | \newcommand{\dnorm}{\distr{N}} 22 | \newcommand{\dmvnorm}[1]{\distr{N}_{#1}} 23 | \newcommand{\dt}[1]{\distr{T}_{#1}} 24 | 25 | \newcommand{\cia}{\perp\!\!\!\perp} 26 | \DeclareMathOperator*{\plim}{plim} 27 | -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Analysis Notes" 3 | author: "Jeffrey B. Arnold" 4 | date: "`r Sys.Date()`" 5 | knit: "bookdown::render_book" 6 | bibliography: ["intromethods.bib"] 7 | biblio-style: "apalike" 8 | link-citations: true 9 | documentclass: book 10 | colorlinks: yes 11 | lot: yes 12 | lof: yes 13 | monofont: "Source Code Pro" 14 | monofontoptions: "Scale=0.7" 15 | site: bookdown::bookdown_site 16 | github-repo: jrnold/intro-methods-notes 17 | description: > 18 | These are notes associated with the course, POLS/CS&SS 503: Advanced Quantitative Political Methodology at the University of Washington. 19 | --- 20 | 21 | # Introduction 22 | 23 | Notes used when teaching "POLS/CS&SS 501: Advanced Political Research Design and Analysis" and "POLS/CS&SS 503: Advanced Quantitative Political Methodology" at the University of Washington. 
24 | 25 | 26 | $$ 27 | $$ 28 | -------------------------------------------------------------------------------- /intro-methods-notes.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Makefile 19 | -------------------------------------------------------------------------------- /linear-regression.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Linear Regression {-} 2 | -------------------------------------------------------------------------------- /model-fit.Rmd: -------------------------------------------------------------------------------- 1 | # Model Fit 2 | 3 | ## Sums of Squares 4 | 5 | There are several "sums of squares" that are important for regression. 6 | 7 | | RSS | Residual Sum of squares | $\sum_{i = 1} (\hat{y}_i - y_i)^2$ | 8 | | MSS | Model Sum of squares | $\sum_{i = 1} (\hat{y}_i - \mean{y})^2$ | 9 | | TSS | Total Sum of squares | $\sum_{i = 1} (y_i - \mean{y}_i)^2$ | 10 | 11 | - The *residual sum of squares* is the total error of the model. How much of $y$ is not explained[^causal] by $x$? 12 | - The *model sum of squares* is the total difference between the regression line and the mean of $y$. How much of $y$ is explained by $x$? 13 | - The *total sum of squares* is the total variation (numerator of the variance of $y$) in $y$ unconditional of $x$. How much of $y$ is there to explain? 14 | 15 | $$ 16 | \begin{aligned}[t] 17 | TSS = MSS + RSS \\ 18 | \text{total variation} = \text{variation explained by model} + \text{remaining variation} 19 | \end{aligned} 20 | $$ 21 | 22 | These terms have many names ... some of which conflict with each other. 
23 | 24 | - RSS (Residual sum of squares) 25 | 26 | - SSE (Sum of squared errors) 27 | - SSR (Sum of squared residuals) 28 | 29 | - MSS (Model sum of squares) 30 | 31 | - RSS (Regression sum of squares) 32 | 33 | - TSS (Total sum of squares) 34 | 35 | The OLS variance decomposition is 36 | 37 | $$ 38 | TSS = MSS + RSS 39 | $$ 40 | 41 | ## Regression Standard Error 42 | 43 | The regression standard error for a linear regression with $n$ observations and $k$ variables is 44 | $$ 45 | \hat{\sigma} = \sqrt{\frac{\sum_{i = 1}^n \hat{\epsilon}_i^2}{n - k - 1}} 46 | $$ 47 | The $n - k - 1$ denominator is the *regression degrees of freedom*. 48 | Since we have already estimated $k$ slope coefficients and the intercept, there are only $n - k - 1$ values left to estimate the regression standard error. 49 | 50 | But recall regression standard error is an estimator for the population $\sigma$, for the population model, 51 | $$ 52 | Y = X \beta + \epsilon 53 | $$ 54 | where $\E(\epsilon) = 0$ and $\Var(\epsilon) = \sigma^2$. 55 | The $n - k - 1$ denominator is needed for the estimator (of the variance) to be unbiased. 56 | 57 | ## (Root) Mean Squared Error 58 | 59 | The statistic mean squared error (MSE) is, 60 | $$ 61 | MSE(\hat{\epsilon}) = \frac{1}{n} \sum_{i = 1}^n \hat{\epsilon}_i^2 . 62 | $$ 63 | 64 | Unlike $\hat{\sigma}$ the denominator is $n$, not $n - k - 1$. 65 | This is because the MSE is used as a descriptive statistic of the sample rather than as an estimator of a population value. 66 | 67 | The MSE is not on the same scale as $y$, so often the root mean squared error (RMSE) is used, 68 | $$ 69 | RMSE(\hat{\epsilon}) = \sqrt{MSE(\hat{\epsilon})}. 70 | $$ 71 | 72 | Both MSE and RMSE are also often used as out-of-sample model fit measures in cross-validation. 73 | 74 | [^causal]: Where "explained" is in **no** way causal. In this case explained means the difference in variation in one variable after conditioning on another variable. 
75 | 76 | ## R-squared 77 | 78 | R squared is also called the **coefficient of determination**. 79 | 80 | $$ 81 | \begin{aligned}[t] 82 | R^2 &= \frac{MSS}{TSS} = 1 - \frac{RSS}{TSS} \\ 83 | &= \frac{\text{model variance}}{\text{total variance}} \\ 84 | &= 1 - \frac{\text{residual variance}}{\text{total variance}} \\ 85 | &= \text{fraction of variance explained} 86 | \end{aligned} 87 | $$ 88 | 89 | - R-squared is so called because for a bivariate regression the $R^2$ is the square of the correlation coefficient ($r$). 90 | 91 | There are a large [number](https://stats.stackexchange.com/questions/13314/is-r2-useful-or-dangerous) of rants about the dangers of focusing on $R^2$. 92 | 93 | ```{r echo=FALSE} 94 | knitr::include_graphics("") 95 | ``` 96 | -------------------------------------------------------------------------------- /multicollinearity.Rmd: -------------------------------------------------------------------------------- 1 | # Collinearity and Multicollinearity 2 | 3 | ```{r} 4 | library("tidyverse") 5 | library("carData") 6 | ``` 7 | 8 | ## (Perfect) collinearity 9 | 10 | In order to estimate unique $\hat{\beta}$ OLS requires the that the columns of the design matrix $\Vec{X}$ are linearly independent. 11 | 12 | Common examples of groups of variables that are not linearly independent: 13 | 14 | - Categorical variables in which there is no excluded category. 15 | You can also include all categories of a categorical variable if you exclude the intercept. 16 | Note that although they are not (often) used in political science, there are other methods of transforming categorical variables to ensure the columns in the design matrix are independent. 17 | 18 | - A constant variable. This can happen in practice with dichotomous 19 | variables of rare events; if you drop some observations for whatever 20 | reason, you may end up dropping all the 1's in the data. 
So although the 21 | variable is not constant in the population, in your sample it is constant 22 | and cannot be included in the regression. 23 | 24 | - A variable that is a multiple of another variable. E.g. you cannot include $\log(\text{GDP in millions USD})$ and $\log({GDP in USD})$ since $\log(\text{GDP in millions USD}) = \log({GDP in USD}) / 1,000,000$. 25 | 26 | - A variable that is the sum of two other variables. E.g. you cannot include $\log(population)$, $\log(GDP)$, $\log(GDP per capita)$ in a regression since 27 | $$\log(\text{GDP per capita}) = \log(\text{GDP} / \text{population}) = \log(\text{GDP}) - \log(\text{population})$$. 28 | 29 | ## What to do about it? 30 | 31 | R and most statistical programs will run regressions with collinear variables, but will drop variables until only linearly independent columns in $\Mat{X}$ remain. 32 | 33 | For example, consider the following code. The variable `type` is a categorical variable with categories "bc", "wc", and "prof". 34 | 35 | ```{r} 36 | data(Duncan, package = "carData") 37 | # Create dummy variables for each category 38 | Duncan <- mutate(Duncan, 39 | bc = type == "bc", 40 | wc = type == "wc", 41 | prof = type == "prof") 42 | lm(prestige ~ bc + wc + prof, data = Duncan) 43 | ``` 44 | R runs the regression, but coefficient and standard errors for `prof` are set to `NA`. 45 | 46 | You should not rely on the software to fix this for you; once you (or the software) notices the problem check the reasons it occurred. The rewrite your regression to remove whatever was creating linearly dependent variables in $\Mat{X}$. 47 | 48 | ## Multicollinearity 49 | 50 | Multicollinearity is the (poor) name for less-than-perfect collinearity. 51 | Even though there is enough variation in $\Mat{X}$ to estimate OLS coefficients, if some set of variables in $\Mat{X}$ is highly correlated it will result in large, but unbiased, standard errors on the estimates. 
52 | 53 | What happens if variables are not linearly dependent, but nevertheless highly correlated? 54 | If $\Cor(\Vec{x}_1, \Vec{x}_2) = 1$, then they are linearly dependent and the regression cannot be estimated (see above). 55 | But if $\Cor(\Vec{x}_1, \Vec{x}_2) = 0.99$, then OLS can estimate unique values of $\hat\beta$. One might wonder whether everything is fine with OLS estimates until, suddenly, at exact linear dependence everything breaks. The answer is yes, and no. 56 | As $|\Cor(\Vec{x}_1, \Vec{x}_2)| \to 1$ the standard errors on the coefficients of these variables increase, but OLS as an estimator works correctly; $\hat\beta$ and $\se{\hat\beta}$ are unbiased. 57 | With multicollinearity, OLS gives you the "right" answer, but it cannot say much with certainty. 58 | 59 | For a bivariate regression, the distribution of the slope coefficient has variance, 60 | $$ 61 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{\sum_{i = 1} (x_i - \bar{x})^2} . 62 | $$ 63 | 64 | What affects the standard error of $\hat{\beta}$? 65 | 66 | - The error variance ($\sigma_u^2$). The higher the variance of the residuals, the higher the variance of the coefficients. 67 | - The variance of $\Vec{x}$. The lower variation in $\Mat{x}$, the bigger the standard errors of the slope. 68 | 69 | Now consider a multiple regression, 70 | $$ 71 | \Vec{y} = \beta_0 + \beta_1 \Vec{x}_1 + \beta_2 \Vec{x}_2 + u 72 | $$ 73 | 74 | this becomes, 75 | $$ 76 | \Var(\hat{\beta}_1) = \frac{\sigma_u^2}{(1 - R^2_1) \sum_{i = 1}^n (x_i - \bar{x})^2} 77 | $$ 78 | where $R^2_1$ is the $R^2$ from the regression of $\Vec{x}_1$ on $\Vec{x}_2$, 79 | $$ 80 | \Vec{x}_1 = \hat{\delta}_0 + \hat{\delta}_1 \Vec{x}_2 . 81 | $$ 82 | 83 | The factors affecting standard errors are 84 | 85 | 1. Error variance: higher residual variance leads to higher standard errors. 86 | 1. Variance of $\Vec{x}_1$: lower variation in $\Vec{x}_1$ leads to higher standard errors. 87 | 1. The strength of the relationship between $x_1$ and $x_2$. 
Stronger relationship between $x_1$ and $x_2$ (higher $R^2$ of the regression of $x_1$ on $x_2$) leads to higher standard errors. 88 | 89 | These arguments generalize to more than two predictors. 90 | 91 | ## What to do about it? 92 | 93 | Multicollinearity is not an "error" in the model. 94 | All you can do is: 95 | 96 | 1. Get more data 97 | 1. Find more conditional variation in the predictor of interest 98 | 99 | What it means depends on what you are doing. 100 | 101 | 1. Prediction: then you are interested in $\hat{\Vec{y}}$ and not $\hat{\beta}$ (or its standard errors). 102 | In this case, multicollinearity is irrelevant. 103 | 104 | 1. Causal inference: in this case you are interested in $\hat{\Vec{\beta}}$. 105 | Multicollinearity does not bias $\hat{\beta}$. 106 | You should include all regressors to achieve balance, and include all relevant pre-treatment variables and not include post-treatment variables. 107 | Multicollinearity is not directly relevant in this choice. 108 | All multicollinearity means is that the variation in the treatment after accounting for selection effects is very low, making it hard to say anything about the treatment effect with that observational data. 109 | More sophisticated methods may trade off some bias for a lower variance (e.g. shrinkage methods), but that must be done systematically, and not ad-hoc dropping relevant pre-treatment variables that simply correlate highly with your treatment variable. 110 | -------------------------------------------------------------------------------- /old-files/multicollinearity.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Collinearity and Multicollinearity 3 | --- 4 | 5 | # Collinearity and Multicollinearity 6 | 7 | ## (Perfect) Collinearity 8 | 9 | In order to estimate unique $\hat{\beta}$ OLS requires that the columns of the design matrix $\Vec{X}$ are linearly independent. 
10 | 11 | Common examples of groups of variables that are not linearly independent: 12 | 13 | - Categorical variables in which there is no excluded category. 14 | You can also include all categories of a categorical variable if you exclude the intercept. 15 | Note that although they are not (often) used in political science, there are other methods of transforming categorical variables to ensure the columns in the design matrix are independent. 16 | - A constant variable. This can happen in practice with dichotomous variables of rare events; if you drop some observations for whatever reason, you may end up dropping all the 1's in the data. So although the variable is not constant in the population, in your sample it is constant and cannot be included in the regression. 17 | - A variable that is a multiple of another variable. E.g. you cannot include $\log(\text{GDP in millions USD})$ and $\log({GDP in USD})$ since $\log(\text{GDP in millions USD}) = \log({GDP in USD}) / 1,000,000$. in 18 | - A variable that is the sum of two other variables. E.g. you cannot include $\log(population)$, $\log(GDP)$, $\log(GDP per capita)$ in a regression since 19 | $$\log(\text{GDP per capita}) = \log(\text{GDP} / \text{population}) = \log(\text{GDP}) - \log(\text{population})$$. 20 | 21 | 22 | #### What to do about it? 23 | 24 | R and most statistical programs will run regressions with collinear variables, but will drop variables until only linearly independent columns in $\Mat{X}$ remain. 25 | 26 | For example, consider the following code. The variable `type` is a categorical variable with categories "bc", "wc", and "prof". 27 | It will 28 | ```{r} 29 | data(Duncan, package = "car") 30 | # Create dummy variables for each category 31 | Duncan <- mutate(Duncan, 32 | bc = type == "bc", 33 | wc = type == "wc", 34 | prof = type == "prof") 35 | lm(prestige ~ bc + wc + prof, data = Duncan) 36 | ``` 37 | R runs the regression, but coefficient and standard errors for `prof` are set to `NA`. 
38 | 39 | You should not rely on the software to fix this for you; once you (or the software) notices the problem check the reasons it occurred. The rewrite your regression to remove whatever was creating linearly dependent variables in $\Mat{X}$. 40 | 41 | 42 | 43 | ## Multicollinearity 44 | 45 | 46 | *Insert plot of highly correlated variables and their coefficients.* 47 | 48 | *Insert plot of uncorrelated variables and their coefficients.* 49 | 50 | ### What to do about it? 51 | 52 | Remember multicollinearity does not violate the assumptions of OLS. If all the other assumptions hold, then OLS is giving you unbiased coefficients and standard errors. What multicollinearity is indicating is that you may not be able to answer the question with the precision you would like. 53 | 54 | 1. If the variable(s) of interest are highly correlated with other variables, then it means that there is not enough variation, controlling for other factors. You may check that you are not controlling for "post-treatment" variables. Dropping control variables if they are correctly included will bias your estimates. But otherwise, there is little you can do other than get more data. You could re-consider your research design and question. What does it mean if there is that little variation in the treatment variable after controlling for other factors? 55 | 2. If control variables are highly correlated with each other, it does not matter. You should not be interpreting their coefficients, so their standard errors do not matter. In fact, controlling for several similar, but correlated variables, may be useful in order to offset measurement error in any one of them. 
56 | -------------------------------------------------------------------------------- /old-files/ols-diagnostics-troubleshooting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression Diagnostics" 3 | --- 4 | 5 | # Regression Diagnostics 6 | 7 | Several packages in R provide large collections of regression diagnostics: 8 | 9 | - [lmtest](https://cran.r-project.org/web/packages/lmtest/index.html) 10 | - [car](https://cran.r-project.org/web/packages/car/index.html) 11 | 12 | Reading the vignettes or documentation of these packages is a good overview of available regression diagnostics. 13 | Also see the [Econometrics Task View](https://cran.r-project.org/web/views/Econometrics.html). 14 | 15 | @Fox2016a has a particularly extensive overview of regression diagnostics. 16 | 17 | Though for Stata, [this tutorial](http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm) has an overview of many regression diagnostics. 18 | -------------------------------------------------------------------------------- /old-files/ols-inference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: OLS Inference 3 | --- 4 | 5 | # OLS Inference 6 | 7 | 8 | ## Sampling Distribution 9 | 10 | The sampling distribution of the OLS parameters is 11 | $$ 12 | \Vec{\beta} \sim \dmvnorm(\Vec{beta}, \sigma^2 (\Mat{X}' \Mat{X})^{-1}). 13 | $$ 14 | Thus, the variance of the coefficients is 15 | $$ 16 | \Var(\hat{\beta}) = \sigma^2 (\Mat{X}' \Mat{X})^{-1} . 
17 | $$ 18 | which is a symmetric matrix, 19 | $$ 20 | \Var(\hat{\beta}) = 21 | \begin{bmatrix} 22 | \Var(\hat{\beta}_0) & \Cov(\hat{\beta}_0, \hat{\beta}_1) & \Cov(\hat{\beta}_0, \hat{\beta}_2) & \cdots & \Cov(\hat{\beta}_0, \hat{\beta}_K) \\ 23 | \Cov(\hat{\beta}_0, \hat{\beta}_1) & \Var(\hat{\beta}_1) & \Cov(\hat{\beta}_1, \hat{\beta}_2) & \cdots & \Cov(\hat{\beta}_1, \hat{\beta}_K) \\ 24 | \Cov(\hat{\beta}_0, \hat{\beta}_2) & \Cov(\hat{\beta}_1, \hat{\beta}_2) & \Var(\hat{\beta}_2) & \cdots & \Cov(\hat{\beta}_2, \hat{\beta}_K) \\ 25 | \vdots & \vdots & \vdots & \ddots & \vdots \\ 26 | \Cov(\hat{\beta}_0, \hat{\beta}_K) & \Cov(\hat{\beta}_1, \hat{\beta}_K) & \Cov(\hat{\beta}_2, \hat{\beta}_K) & \cdots & \Var(\hat{\beta}_K) 27 | \end{bmatrix} 28 | $$ 29 | On the diagonal are the variances of the parameters, and the off-diagonal elements are the covariances of the parameters. 30 | 31 | 32 | ## t-tests for single parameters 33 | 34 | The null hypothesis and alternative hypotheses for two-sided tests are, 35 | $$ 36 | \begin{aligned}[t] 37 | H_0: &\beta_k = \beta_k^0 \\ 38 | H_a: &\beta_k \neq \beta_k^0 39 | \end{aligned} 40 | $$ 41 | 42 | Then in large samples, 43 | $$ 44 | \frac{\hat{\beta}_k - \beta_k}{\se(\widehat{\beta}_k)} \sim \dnorm(0, 1) 45 | $$ 46 | In small samples, 47 | $$ 48 | \frac{\hat{\beta}_k - \beta_k}{\se(\widehat{\beta}_k)} \sim \dt{N - (K + 1)} 49 | $$ 50 | 51 | 52 | The estimated standard errors of $\hat{\beta}$ come from 53 | $$ 54 | \begin{aligned}[t] 55 | \Var(\hat{\Vec{\beta}}) &= \hat{\sigma}^2 (\Mat{X}' \Mat{X})^{-1} \\ 56 | \hat{\sigma}^2 &= \frac{\Vec{\epsilon}'\Vec{\epsilon}}{(N - (K + 1))} 57 | \end{aligned} 58 | $$ 59 | 60 | So, under the common null hypothesis test for $\beta_k = 0$, 61 | $$ 62 | \frac{\hat{\beta}_k}{\se(\widehat{\beta}_k)} \sim \dt{N - (K + 1)} 63 | $$ 64 | 65 | The $(1 - \alpha) \times 100\%$ confidence interval for $\hat{\beta}_k$ is, 66 | $$ 67 | \hat{\beta}_k \pm t^*_{\alpha / 2} \times
\se(\hat{\beta}_K) 68 | $$ 69 | where $t^*_{\alpha / 2}$ is the quantile of the $\dt{n - (K + 1)}$ distribution such that $P(T \leq t^*) > 1 - \alpha / 2$. 70 | 71 | 72 | ## F-tests of Multiple Hypotheses 73 | 74 | TODO 75 | 76 | ## Testing functions of coefficients 77 | 78 | The standard error for non-linear functions of parameters can be approximated with the Delta method: 79 | $$ 80 | \se(f(\Vec{\beta})) = 81 | \left(\frac{d\,f(\Vec{\beta})}{d\,\Vec{beta}} \right)\T 82 | \Var{\Vec{\beta}} 83 | \left(\frac{d\,f(\Vec{\beta})}{d\,\Vec{beta}} \right) . 84 | $$ 85 | -------------------------------------------------------------------------------- /old-files/ols-misc.Rmd: -------------------------------------------------------------------------------- 1 | # Diagnostics and Troubleshooting 2 | 3 | 4 | ## Omitted variables 5 | 6 | - Problem: An omitted variables bias coefficients unless (1) their coefficient is zero, or (2) it is uncorrelated with the variable. 7 | - Solutions: Control for those variables. When estimating a structural or causal effect, care needs to be taken to not include bad controls. 8 | 9 | ### Simulations 10 | 11 | 12 | ### What to do about it? 13 | 14 | - Include more controls 15 | - Estimate the possible bias of omitted variables 16 | - Better design. Do not rely on selection on observables. 17 | 18 | ### Examples 19 | 20 | **TODO:** Find good examples. Perhaps examples of Simpson's Paradox. 21 | 22 | ## Measurement Errors 23 | 24 | - Problem: Measurement error in covariates biases regression coefficient towards zero, and makes it an imperfect control 25 | - Solutions: 26 | - better measures 27 | - instrumental variable or structural equation models 28 | 29 | A regression model allows for measurement error in the outcome variable, since measurement error uncorrelated with $X$ can be thought of as part of the residual $\varepsilon$. 30 | 31 | However, measurement error in the covariates is a different issue. 
32 | Measurement error in a covariate biases its coefficient downward. 33 | This is called **attenuation bias**. That covariate also acts as 34 | an imperfect control, which will bias other coefficients. 35 | 36 | Suppose the population regression function is 37 | $$ 38 | Y_i = \beta_0 + \beta_1 X_{i} + \varepsilon_i 39 | $$ 40 | However, instead of $X_1$, you observe $\tilde{X}_1$, which is observed with measurement error, 41 | $$ 42 | \tilde{X}_1 = X_{i} + \delta_i 43 | $$ 44 | where $\delta_i$ is the *classical measurement error*, which is mean zero and uncorrelated with the covariates or regression disturbances, 45 | $$ 46 | \begin{aligned}[t] 47 | \E(\delta_i) &= 0 \\ 48 | \Cov(X_i, \delta_i) &= \Cov(\epsilon_i, \delta_i) = 0 49 | \end{aligned} 50 | $$ 51 | 52 | Measurement error in a variable $X$ has the following effects 53 | 54 | - Biases its coefficient towards zero (attenuation biase) 55 | - Biases the coefficients of other variables (that $X$ is correlated with) in unknown directions. 56 | - Controlling for other variables *increases* the attenuation bias in $\beta$ 57 | 58 | **TODO** Fill in equations. See Wooldridge Ch 9 (p. 320-323), Mastering Metrics, p. 240; Fox, Ch 3. 59 | 60 | ### What can we do about it? 61 | 62 | - Instrumental variable models, and, more generally, structural equation models, can model the measurement error. 63 | - Use measures that are more closely aligned with your concepts, have less error. 64 | - Combine multiple measures in order to reduce measurement error 65 | 66 | ### Simulations 67 | 68 | 69 | 70 | 71 | 72 | ### Example 73 | 74 | **TODO:** Need example of measurement error in political science. 75 | 76 | ### References 77 | 78 | - Mastering Metrics, Ch 6. p. 240. 79 | - Fox, Ch 6.4. p. 112. 80 | - Kennedy (6 ed) Ch 9, p. 139. 81 | 82 | 83 | ## Functional Form 84 | 85 | **TODO** 86 | 87 | ## Multicollinearity 88 | 89 | - Problem: Correlation between predictors increases the standard errors on those predictors. 
However, coefficients are unbiased, an assuming the other CLM assumptions hold, the standard errors. 90 | - Solution: 91 | - More data 92 | - Remove predictors 93 | - Combine predictors: principal components, indexes 94 | - Regularization: e.g. LASSO or Ridge regression 95 | 96 | ## Residuals 97 | 98 | ### Non-Normal errors 99 | 100 | - Problem: Incorrect standard errors, but generally only an issue if sample size is small. However, this may suggest that the expected value of $Y$ is not a substantively meaningful quantity. 101 | - Solution: 102 | - Transform variables 103 | - Use alternative model more appropriate for the data 104 | 105 | 106 | Diagnostics 107 | 108 | - qqplots 109 | 110 | ### Non-Constant variance 111 | 112 | - Problem: Incorrect standard errors. This may also suggest incorrect functional form. 113 | - Solution: 114 | - If form of non-constant variance is known: weighted least squares 115 | - If form is unknown: robust standard errors 116 | - Since it suggests an incorrect functional form, adjust the model until non-constant variance disappears. 117 | 118 | Diagnostics 119 | 120 | - plots 121 | - compare robust standard errors to non-robust standard errors 122 | -------------------------------------------------------------------------------- /old-files/ovb-measurment-error.Rmd: -------------------------------------------------------------------------------- 1 | ## Measurement Error 2 | 3 | ### What's the problem? 4 | 5 | It biases coefficients. The way in which it biases coefficients depends on which 6 | variables have measurement error. 7 | 8 | 1. Variable with measurement error: biases $\beta$ towards zero (**attenuation bias**) 9 | 2. Other variables: Biases $\beta$ similarly to omitted variable bias. In other words, when a variable has measurement error it is an imperfect control. You can think of omitted variables as the limit of the effect of measurement error as it increases. 10 | 11 | 12 | ### What to do about it? 
13 | 14 | There's no easy fix within the OLS framework. 15 | 16 | 1. If the measurement error is in the variable of interest, then its coefficient will be biased towards zero, so your estimate understates the magnitude of the true effect. 17 | 2. Find better measures with lower measurement errors. If the variable is the variable of interest, then perhaps combine multiple variables into a single index. If the measurement error is in the control variables, then include several measures. That these measures correlate closely increases their standard errors, but the control variables are not the object of the inferential analysis. 18 | 3. More complicated methods: errors-in-variables models, structural equation models, instrumental variable (IV) models, and Bayesian methods. 19 | 20 | -------------------------------------------------------------------------------- /old-files/resampling-methods.Rmd: -------------------------------------------------------------------------------- 1 | # Prediction 2 | 3 | 4 | 5 | 6 | ## Prediction error 7 | 8 | The problem is that we would like to estimate how well the model will fit *new* data. 9 | Since we have not yet seen the new data, this is hard. 
10 | We will have to 11 | 12 | 13 | ## Cross-Validation 14 | 15 | 16 | ### Example 17 | 18 | ```{r} 19 | 20 | ``` 21 | 22 | 23 | ## Application to Science 24 | 25 | TODO 26 | 27 | 28 | ## References 29 | 30 | See the R packages 31 | 32 | - **caret** 33 | - **mlr** 34 | - **recipes** 35 | -------------------------------------------------------------------------------- /outliers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: html_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | # Outliers 8 | 9 | In bivariate regression, the coefficient $\hat{\beta}_1$ can be written as a weighted average of the outcomes, 10 | $$ 11 | \hat{\beta}_1 = \sum_{i = 1}^n w_i (y_i - \bar{y}), 12 | $$ 13 | where 14 | $$ 15 | w_i = \frac{x_i - \bar{x}}{\sum_{i = 1}^n (x_i - \bar{x})^2} . 16 | $$ 17 | 18 | ```{r} 19 | Anscombe <- anscombe %>% 20 | rowid_to_column(var = ".id") %>% 21 | gather(variable, value, -.id) %>% 22 | separate(variable, c("xy", "dataset"), sep = 1) %>% 23 | spread(xy, value) 24 | ``` 25 | 26 | We'll consider the regressions on each dataset. 27 | ```{r} 28 | ggplot(Anscombe, aes(x = x, y = y)) + 29 | geom_point() + 30 | geom_smooth(method = "lm", se = FALSE) + 31 | facet_wrap(~ dataset, ncol = 2) 32 | ``` 33 | 34 | Add the linear regression weights for each observation: 35 | ```{r} 36 | Anscombe <- Anscombe %>% 37 | group_by(dataset) %>% 38 | mutate(w = x - mean(x), 39 | w = w / sum(w ^ 2)) %>% 40 | ungroup() 41 | ``` 42 | Now show the weights of each observation: 43 | ```{r} 44 | ggplot(Anscombe, aes(x = x, y = y, size = abs(w))) + 45 | geom_point() + 46 | geom_smooth(method = "lm", se = FALSE) + 47 | facet_wrap(~ dataset, ncol = 2) 48 | ``` 49 | 50 | 64 | 65 | ## Questions 66 | 67 | - Which observations in linear regression given the most weight in determining $\hat{\beta_1}$? 68 | 69 | - Consider two observations $x_1 = 1$ and $x_2 = 2$. Suppose $\bar{x} = 1$. 
70 | What are the weights of the two observations? What is the implication for 71 | how OLS will respond to outliers? 72 | 73 | ## Influential Weights 74 | 75 | The previous section showed how OLS coefficients are a weighted average of the outcomes. 76 | This suggests that some observations may have more influence than others on our estimates. 77 | 78 | There are three types of extreme values to consider: 79 | 80 | 1. Leverage point: extreme in $x$ 81 | 1. Outlier: extreme in $y$ 82 | 1. Influence point: a leverage point **and** an outlier 83 | 84 | ## Leverage Point 85 | 86 | The **hat matrix** is defined as 87 | $$ 88 | \Mat{H} = \Mat{X} (\Mat{X}' \Mat{X})^{-1} \Mat{X}' 89 | $$ 90 | 91 | Note, 92 | $$ 93 | \begin{aligned}[t] 94 | \hat{\Vec{u}} &= \Vec{y} - \Mat{X} \hat{\Vec{\beta}} \\ 95 | &= \Vec{y} - \Mat{X} \underbrace{(\Mat{X}' \Mat{X})^{-1} \Mat{X}' \Vec{y}}_{\text{OLS estimate}} \\ 96 | &= \Vec{y} - \Mat{H} \Vec{y} \\ 97 | &= (\Mat{I} - \Mat{H}) \Vec{y} 98 | \end{aligned} 99 | $$ 100 | The hat matrix is so-called because it puts the "hat" on $\Vec{y}$: 101 | $$ 102 | \hat{\Vec{y}} = \Mat{H} \Vec{y} 103 | $$ 104 | Properties of the hat matrix: 105 | 106 | - $n \times n$ symmetric matrix 107 | - idempotent: $\Mat{H} \Mat{H} = \Mat{H}$. 108 | -------------------------------------------------------------------------------- /ovb.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: html_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | # Omitted Variable Bias 8 | 9 | **Long regression:** The regression with all variables. 10 | $$ 11 | Y_i = \beta_0 + \beta_1 X_{1,i} + \beta_2 X_{2,i} + u_i 12 | $$ 13 | 14 | **Short regression:** The regression that omits a variable. In this case the short regression omits $X_{2,i}$ 15 | $$ 16 | Y_i = \beta^s_0 + \beta^s_1 X_{1,i} + u_i^s 17 | $$ 18 | 19 | **Question** Will $\E(\widehat{\beta}^s_1) = \beta_1$? Under what assumptions? 
20 | 21 | **Result:** 22 | $$ 23 | \beta^s_1 = \beta_1 + \underbrace{\delta_{1} \beta_2}_{\text{bias}} 24 | $$ 25 | where $\delta_1$ is the coefficient of $X_{2,i}$ on $X_{1,i}$, 26 | $$ 27 | X_{2,i} = \delta_0 + \delta_{1} X_{1,i} . 28 | $$ 29 | 30 | **Omitted variable bias:** bias in $\hat{\beta}^s_1$ due to omitting $X_{2i}$, 31 | $$ 32 | \mathrm{Bias}(\hat{\beta}^s_1) = \E[\widehat{\beta}^s_1] - \beta_1 = \beta_2 \delta_1 . 33 | $$ 34 | 35 | The omitted variable bias is: 36 | $$ 37 | \begin{aligned}[t] 38 | (\text{"effect of $X_{2i}$ on $Y_i$"}) & \times (\text{"effect of $X_{1i}$ on $X_{2i}$"}) \\ 39 | (\mathrm{omitted} \to \mathrm{outcome}) & \times (\mathrm{included} \to \mathrm{omitted}) 40 | \end{aligned} 41 | $$ 42 | 43 | Remember that by OLS, the effect of $X_{1i}$ on $X_{2i}$ is 44 | $$ 45 | \delta_1 = \frac{\Cov(X_{1i}, X_{2i})}{\Var(X_{1i})} . 46 | $$ 47 | 48 | | | $\Cov(X_{1i}, X_{2i}) > 0$ | $\Cov(X_{1i}, X_{2i}) < 0$ | $\Cov(X_{1i}, X_{2i}) = 0$ | 49 | | -------------- | -------------------------- | -------------------------- | -------------------------- | 50 | | $\beta_2 > 0$ | $+$ | $-$ | $\emptyset$ | 51 | | $\beta_2 < 0$ | $-$ | $+$ | $\emptyset$ | 52 | | $\beta_2 = 0$ | $\emptyset$ | $\emptyset$ | $\emptyset$ | 53 | 54 | So $\hat{\beta}^s_1$ is only unbiased if either of the following is true: 55 | 56 | - $\beta_2 = 0$ ($X_{2,i}$ has no effect on $Y_i$) 57 | - $\delta_1 = 0$ ($X_{2,i}$ is uncorrelated with $X_{1,i}$) 58 | 59 | See @AngristPischke2014 [p. 92]. 60 | 61 | ## Including Irrelevant Variables 62 | 63 | How does including an **irrelevant variable** in a regression affect the other coefficients? 64 | 65 | An **irrelevant variable** is one which has no effect on $Y_i$ conditional on the other regressors, thus it would have a coefficient of 0. 66 | 67 | Consider the regression, 68 | $$ 69 | Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + u_i . 70 | $$ 71 | If $X_{2i}$ is irrelevant, then $\beta_2 = 0$, and 72 | $$ 73 | Y_i = \beta_0 + \beta_1 X_{1i} + 0 \times X_{2i} + u_i . 
74 | $$ 75 | 76 | But given the previous results, OLS is still unbiased for all parameters, 77 | $$ 78 | \begin{aligned}[t] 79 | \E[\widehat{\beta}_0] &= \beta_0 \\ 80 | \E[\widehat{\beta}_1] &= \beta_1 \\ 81 | \E[\widehat{\beta}_2] &= 0 82 | \end{aligned} 83 | $$ 84 | 85 | However, including an irrelevant variable will increase the standard errors of $\hat{\beta}_1$ by reducing the conditional variation of $X_{1i}$ and it also removing a degrees of freedom. 86 | 87 | ## Measurement Error 88 | 89 | There are two issues to be concerned about. 90 | 91 | 1. Measurement error in the covariate of interest. 92 | 1. Measurement error in a control variable. 93 | 94 | ## When does Omitted variable bias make sense? 95 | 96 | - **Description**: No. It may be interesting to consider the relationship conditional on another variable, but not doing it doesn't invalidate the method. 97 | 98 | - **Prediction**: No. The $\hat{\beta}$ do not (directly) matter, since we care about $\hat{y}$. Omitted variable bias does not directly affect that. 99 | 100 | - **Causal Inference** Yes. Not only does it make sense, it is the most important assumption for casual inference. 101 | 102 | - model/structural approach: OVB violates Gauss-Markov assumptions and estimator is biased. 103 | - potential outcomes approach: OVB violates the conditional independence assumption. 104 | 105 | ## What to do about it? 106 | 107 | OVB is the most important assumption for regression in any causal setting. 108 | It is also difficult to assess. 109 | How can we know what we omitted? And how can we know that we've including everything relevant from the population model, if we don't know the population model? 110 | 111 | There are effectively two strategies for testing OVB [@PeiPischkeSchwandt2017a] 112 | 113 | 1. balancing tests 114 | 1. 
coefficient comparison tests (regression sensitivity analysis/robustness tests) 115 | 116 | Consider the case of long and short regressions 117 | $$ 118 | \begin{aligned} 119 | Y_i &= \beta_0 + \beta_1 X_{1,i} + \beta_2 X_{2,i} + u_i \\ 120 | Y_i &= \beta^s_0 + \beta^s_1 X_{1,i} + u_i^s 121 | \end{aligned} 122 | $$ 123 | The omitted variable bias for estimating the short regression rather than the long regression is 124 | $$ 125 | 126 | $$ 127 | 128 | **Regression sensitivity analysis** Suppose you are interested in the coefficient on $X_1$. Run the bivariate regression of $X_{1i}$ (without any controls), 129 | $$ 130 | y_i = \hat{\beta}_0^s + \hat{\beta}_1^s X_{1i} + \hat{u}_i^s , 131 | $$ 132 | and the multiple regression with **all** the controls, 133 | $$ 134 | y_i = \hat{\beta}_0 + \hat{\beta}_1 x_{1i} + \sum_{k = 2}^K \hat{\beta}_k X_{ki} + \hat{u}_i. 135 | $$ 136 | The quantity of interest is the difference, 137 | $$ 138 | |\hat{\beta}_{1} - \hat{\beta}_{1}^s| . 139 | $$ 140 | If the coefficient on $x_{1i}$ has a large change with the addition of control variables, it suggest that it is likely that there more omitted variables out there. 141 | If the coefficient on $x_{1i}$ changes little with the addition of control variables, it suggest that few covariates influence its coefficient, and it is less likely that there are omitted covariates that would influence the coefficient of $x_{1i}$ [@AngristPischke2014; p. 74]. 142 | See @NunnWantchekon2011a, @AltonjiElderTaber2005a, @PeiPischkeSchwandt2017a, @Oster2016a. 143 | 144 | You may often see papers include regressors one at a time or in groups. 145 | That is more-or-less pointless (at least in the manner that it is usually done), and provides no more information than the long regression. 146 | 147 | ## References 148 | 149 | Much of this chapter is derived from Matt Blackwell . 
150 | -------------------------------------------------------------------------------- /potential-outcomes.Rmd: -------------------------------------------------------------------------------- 1 | # Potential Outcomes 2 | 3 | -------------------------------------------------------------------------------- /presentation.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Presentation {-} 2 | -------------------------------------------------------------------------------- /probability.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Probability {-} 2 | -------------------------------------------------------------------------------- /programming.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Programming {-} 2 | -------------------------------------------------------------------------------- /questions.Rmd: -------------------------------------------------------------------------------- 1 | # Types of Questions 2 | 3 | 1. Descriptive: Represent data $X$ by a smaller number of values 4 | 1. Predictive: Given some $X$, what is the value of $Y$? 5 | 1. Causal: If I change $X$, what will the value of $Y$ be? 6 | 7 | Causal questions 8 | 9 | > A variable $X$ is a cause of *variable* $Y$ if $Y$ in any way relies on $X$ for its value. ... $X$ is a cause of $Y$ if $Y$ listens to $X$ and decides 10 | > its value in response to what it hears. Pearl et al. p. 5. 11 | 12 | References 13 | 14 | - Keele - Statistics of Causal Inference 15 | - Explanation vs. 
Prediction 16 | - Policy Prediction Questions 17 | -------------------------------------------------------------------------------- /rd.Rmd: -------------------------------------------------------------------------------- 1 | # Regression Discontinuity 2 | 3 | Summary: If there are thresholds whereby some observations receive the 4 | treatment above it, other those below it do not, and those immediately above or 5 | below that threshold are similar, we can use the difference of the outcome 6 | between those just above and those just below the threshold to estimate the 7 | causal effect of the treatment. 8 | 9 | Suppose there is a running variable $x$ such that any person receives the treatment, $d$ if $x \geq a$ and does not if $x \leq a$, 10 | $$ 11 | d = \begin{cases} 12 | 1 & x \geq a \\ 13 | 0 & x < a 14 | \end{cases} 15 | $$ 16 | 17 | A simple regression discontinuity model is, 18 | $$ 19 | \begin{aligned}[t] 20 | y_i = \alpha + \beta x_i + \tau d_i + \gamma x_i d_i + \epsilon_i 21 | \end{aligned} 22 | $$ 23 | The local causal effect of the treatment at the discontinuity is $\tau$. 24 | 25 | ```{r echo=FALSE, fig.cap="Fake Example of a Regression Discontinuity. The difference at the threshold (50) is the effect of the treatment."} 26 | tibble( 27 | x = 1:100, 28 | d = x > 50, 29 | yhat = 0.2 * x + 20 * d - 0.1 * x * d, 30 | y = yhat + rnorm(length(x), 0, 7) 31 | ) %>% 32 | ggplot(aes(x = x)) + 33 | geom_vline(xintercept = 50, colour = "white", size = 2) + 34 | geom_point(aes(y = y)) + 35 | geom_smooth(aes(y = y, group = d), method = "lm") 36 | ``` 37 | 38 | However, there are several choices 39 | 40 | - Functional form of the trends before and after the discontinuity 41 | - The size of the window of observations before and after the trend which to compare. 42 | 43 | How to choose? 44 | 45 | - parametric: chooses specific functional forms 46 | - non-parametric: uses flexible forms, and chooses a bandwidth [@ImbensKalyanaraman2011a] 47 | 48 | Sharp vs. 
Fuzzy Discontinuity? 49 | 50 | - Sharp: the assignment of the treatment occurs with certainty at the threshold. 51 | - Fuzzy: the assignment of the treatment occurs only probabilistically at the threshold. 52 | 53 | Suppose that the causal effect of treatment $T \in \{0, 1\}$ on unit $i$ is $\tau_i = Y_i(1) - Y_i(0)$ where $Y_i(1)$ is the potential outcome of $i$ under the treatment and $Y_i(0)$ is the potential outcome of $i$ under the control. 54 | If potential outcomes are distributed smoothly at the cut-point $c$, then the average causal effect of the treatment at the cut-point, $Z_i = c$: 55 | $$ 56 | \tau_{RD} = \E[Y_{i}(1) - Y_i(0)| Z_i = c] = \lim_{Z_i \downarrow c}\E[Y_{i}(1) | Z_i = c] - \lim_{Z_i \uparrow c}\E[Y_i(0)| Z_i = c] 57 | $$ 58 | 59 | An advantage of RD designs is that unlike selection on observables or IV, its identifying assumptions are more observable and testable. 60 | 61 | There are two basic tests (@LeeLemieux2010a): 62 | 63 | 1. Continuity of pre-treatment covariates. E.g. density test of McCrary (2008). Whether the ratio of treated to control units departs from chance. 64 | A difficulty is that balance only holds in the limit, and covariance balance may still be present in finite samples. 65 | 66 | 1. Irrelevance of covariates to the treatment-outcome relationship. There should be no systematic association between covariates and treatment, so controlling for them shouldn't affect the estimates. 67 | 68 | ## Examples 69 | 70 | - @ThistlethwaiteCampbell1960a was the first example of RD. 
71 | 72 | - Outcome: Career choices in teaching 73 | - Running variable: PSAS scores 74 | - Cutoff: receiving National Merit Finalist 75 | - Discussed: @AngristPischke2014a [Ch 4] 76 | 77 | - @CarpenterDobkin2011a, @CarpenterDobkin2009a 78 | 79 | - Running variable: age 80 | - Cutoff: ability to drink alcohol legally 81 | - Outcome: Death, accidents 82 | - Discussed: @AngristPischke2014a [Ch 4] 83 | 84 | - @AbdulkadirogluAngristPathak2014a 85 | 86 | - Running variable: exam score 87 | - Cutoff: above threshold receive an offer from a school. This is fuzzy since not all those who receive the offer attend. 88 | - Outcome: Educational outcomes 89 | - Discussed: @AngristPischke2014a [Ch 4] 90 | 91 | - @EggersHainmueller2009a 92 | 93 | - units: UK MPs 94 | - outcome: personal wealth 95 | - treatment: winning an election (holding office) 96 | - running variable: vote share 97 | 98 | - @LitschigMorrison2013a 99 | 100 | - units: Brazilian municipalities 101 | - outcome: education, literacy, poverty rate 102 | - treatment: receiving a cash transfer from the central government (there are population cutoffs) 103 | - running variable: population 104 | 105 | - @GelmanHill2007a [p. 213-217] 106 | 107 | - units: US Congressional members 108 | - outcome: ideology of representative 109 | - treatment: winning election 110 | - running variable: vote share 111 | 112 | - @GelmanKatz2007a, @GelmanHill2007a [p. 232] 113 | 114 | - units: patients 115 | - outcome: length of hospital stay 116 | - treatment: new surgery method 117 | - cutoff: not performed on those over 80 118 | - running variable: age 119 | 120 | - @LeeMorettiButler2004a. Also see derived examples in @Bailey2016a [Ex. 6.3]. See @Button2015a for a replication. 
121 | 122 | - units: congressional districts 123 | - outcome: ideology of nominees 124 | - treatment: election 125 | - running variable: vote share 126 | 127 | - @JacobLefgren2004a 128 | 129 | - units: students 130 | - outcome: education achievement 131 | - treatment: summer school, retention 132 | - running variable: standardized test 133 | 134 | ## Example: Close Elections 135 | 136 | A common use of RD in political science and econ is election outcomes. 137 | In this case the "treatment" is winning the election; it is applied to the candidate whose vote exceeds the threshold of 50%, but not to candidates arbitrarily below that threshold. 138 | Thus "close" elections are a common use of RD designs. 139 | This design was formalized in @Lee2008a. 140 | 141 | Several papers question whether close elections satisfy the assumptions of RD: 142 | 143 | - @CaugheySekhon2011a look at US House elections (1942-2008). They find that close elections are more imbalanced. They attribute this to national partisan waves. 144 | - @GrimmerHershFeinsteinEtAl2011a look at all US House elections 1880-2008. They find that structurally advantaged candidates (strong party, incumbents) are more likely to win close elections. 145 | 146 | The ways in which close elections can be non-random are lawsuit challenges and fraud. 147 | 148 | @EggersFowlerHainmuellerEtAl2014a addresses these concerns with a systematic review of 40,000 close elections: "U.S. House in other time periods, statewide, state legislative, and mayoral races in the U.S. and national or local elections in nine other countries" 149 | Only the US House appears to have these issues. 150 | 151 | ## Software 152 | 153 | See the R packages 154 | 155 | - `r rpkg("rddtools")`: a new and fairly complete package of regression discontinuity from primary data viz to other tests. 156 | - `r rpkg("rdd")` 157 | - `r rpkg("rdrobust")`: Tools for data-driven graphical and analytical statistical inference in RD. 
158 | - `r rpkg("rdpower")`: Calculate power for RD designs. 159 | - `r rpkg("rdmulti")`: Analyze designs with multiple cutoffs. 160 | 161 | See entries in the [Econometrics](https://cran.r-project.org/web/views/Econometrics.html) task view. 162 | 163 | ## References 164 | 165 | Textbooks and Reviews: 166 | 167 | - @AngristPischke2014a [Ch. 4] 168 | - @GelmanHill2007a [Sec. 10.4] 169 | - @Bailey2016a [Ch. 11] 170 | - @LindenAdamsRoberts2006a for applications to medicine 171 | - @HahnToddKlaauw2001a An early review of RD in economics 172 | 173 | Methods: 174 | 175 | - @ImbensKalyanaraman2011a propose an optimal bandwidth selection method 176 | -------------------------------------------------------------------------------- /references.Rmd: -------------------------------------------------------------------------------- 1 | `r if (knitr:::is_html_output()) '# References {-}'` 2 | -------------------------------------------------------------------------------- /regularization.Rmd: -------------------------------------------------------------------------------- 1 | # Regularization 2 | 3 | ```{r} 4 | library("glmnet") 5 | library("tidyverse") 6 | library("broom") 7 | ``` 8 | 9 | ```{r} 10 | UScrime <- MASS::UScrime %>% 11 | mutate_at(vars(y, M, Ed, Po1, Po2, LF, M.F, Pop, 12 | NW, U1, U2, GDP, Ineq, Prob, Time), 13 | funs(log)) 14 | 15 | varlist <- c("M", "Ed", "Po1", "Po2", "LF", "M.F", "Pop", "NW", 16 | "U1", "U2", "GDP", "Ineq", "Prob", "Time") 17 | ``` 18 | 19 | By default, `glmnet` will return and entire range of coefficients. 
20 | ```{r} 21 | mod_lasso <- glmnet(as.matrix(UScrime[, varlist]), UScrime[["y"]]) 22 | mod_ridge <- glmnet(as.matrix(UScrime[, varlist]), UScrime[["y"]], alpha = 0) 23 | ``` 24 | 25 | ```{r} 26 | bind_rows( 27 | mutate(tidy(mod_lasso), model = "Lasso"), 28 | mutate(tidy(mod_ridge), model = "Ridge") 29 | ) %>% 30 | filter(term != "(Intercept)") %>% 31 | ggplot(aes(x = dev.ratio, y = estimate, colour = term)) + 32 | geom_line() + 33 | facet_wrap(~ model, ncol = 1) 34 | ``` 35 | 36 | Alternatively, the ridge and lasso regression models are the solutions to the problems 37 | $$ 38 | \hat{\beta}_{\text{ridge}} = \arg \min_\beta \left\{ \sum_{i =1}^n \left(y_i - \beta_0 - \sum_{j = 1}^p \beta_j x_{ij} \right)^{2} \right\} \text{s.t.} \sum_{j = 1}^p \beta_j^2 \leq c, 39 | $$ 40 | and 41 | $$ 42 | \begin{aligned}[t] 43 | \hat{\beta}_{\text{lasso}} &= \arg \min_\beta \left\{ \sum_{i =1}^n \left(y_i - \beta_0 - \sum_{j = 1}^p \beta_j x_{ij} \right)^{2} \right\} \\ 44 | \text{s.t.}& \sum_{j = 1}^p |\beta_j| \leq c 45 | \end{aligned} 46 | $$ 47 | 48 | In other words, these methods try to find the $\Vec{\beta}$ with the smallest sum of squared errors that has a $\Vec{\beta}$ with a norm less than $c$. 49 | The value of $c$ corresponds to some value of $\lambda$ in the previous methods. 50 | 51 | Think of $c$ as a fixed *budget*. The lasso and ridge regressions try to find the variables that explain $y$ the best without going over the budget [@JamesWittenHastieEtAl2013a, p. 221]: 52 | 53 | Consider the case with only two coefficients: $\beta_1$ and $\beta_2$. 
54 | In the lasso, we want to find the values of $\beta_1$ and $\beta_2$ 55 | $$ 56 | |\beta_1| + |\beta_2| \leq c 57 | $$ 58 | 59 | ```{r echo=FALSE} 60 | knitr::include_graphics("img/islr-fig-6.7.png") 61 | ``` 62 | 63 | > never trust OLS with more than five regressors 64 | > --- [Zvi Grilliches](http://www.nber.org/econometrics_minicourse_2015/nber_slides11.pdf) 65 | > 66 | > Regularization theory was one of the first signs of the existence of intelligent inference 67 | > --- [Zapnik](http://www.nber.org/econometrics_minicourse_2015/nber_slides11.pdf) 68 | 69 | Rather than choose the best fit, there is some penalty to avoid over-fitting. 70 | This is to choose the optimal optimal point on the expected predicted value. 71 | 72 | There are two questions 73 | 74 | 1. method of regularization 75 | 1. amount of regularization 76 | 77 | There are several choices of the former, chosen for different reasons. 78 | 79 | The latter is almost always chosen by cross-validation. 80 | 81 | While OLS is okay for estimating $\beta$ (best linear unbiased property). 82 | However, with $K \geq 3$ regressors, OLS is poor. 83 | 84 | The approaches to regularization in regression are 85 | 86 | 1. Shrink estimates to zero (Ridge) 87 | 1. Sparsity, limit number of non-zero estimates (Lasso) 88 | 1. Combination of the two (Bridge) 89 | 90 | ## Ridge Regression 91 | 92 | $$ 93 | \hat{\beta}_{\text{OLS}} = \arg \min_{\beta} \sum_{i=1}^{n} (y_i - \Vec{x}_{i} \Vec{\beta})^{2} 94 | $$ 95 | 96 | Regularized regression adds a penalty that is a function of $\beta$. 97 | This encourages $\beta$ to be close to zero. 98 | $$ 99 | \hat{\beta}_{\text{regularized}} = \arg \min_{\beta} \sum_{i=1}^{n} (y_i - \Vec{x}_{i} \Vec{\beta})^{2} + \lambda f(\beta) 100 | $$ 101 | 102 | Where $\lambda$ is a penalty parameter, and $f(\beta)$ is a function that increases in the total magnitudes of the coefficients. 
103 | 104 | - $\lambda \to \infty$: all coefficients are zero 105 | - $\lambda \to 0$: same as OLS 106 | 107 | How do we choose the value of $\lambda$? 108 | 109 | - Currently: cross-validation 110 | - Historically: there were some default plug-in estimators, especially for ridge regression. 111 | 112 | **Ridge** regression penalizes the $\Vec{\beta}$ vector by the 113 | $$ 114 | \hat{\beta}_{\text{ridge}} = \arg \min_{\beta} \sum_{i=1}^{n} (y_i - \Vec{x}_{i} \Vec{\beta})^{2} + \sum_{k = 1}^{p} \beta_k^2 115 | $$ 116 | 117 | **Lasso** penalizes the coefficients by an the $L1$ norm. 118 | Suppose we want to find the best subset of $\leq k$ covariates . 119 | $$ 120 | \hat{\beta}_{\text{lasso}} = \arg \min_{\beta} \sum_{i=1}^{n} (y_i - \Vec{x}_{i} \Vec{\beta})^{2} + \lambda \sum_{k = 1}^p |\beta_k| 121 | $$ 122 | 123 | - If true distribution of coefficients is a few big ones and many small ones, 124 | LASSO will do better. If many small/modest sized effects, ridge may do better. 125 | 126 | - LASSO does not work well with highly correlated coefficients. 127 | 128 | - Ridge: $\hat{\beta}_{1} + \hat{\beta}_{2} \approx (\beta_1 + \beta_2)/ 2$. 129 | - LASSO: Indifferent between $\hat{\beta}_1 = 0$, $\hat{\beta}_2 = \beta_1 + \beta_2$, $\hat{\beta}_1 = \beta_1 + \beta_2$, and $\hat{\beta}_2 = 0$. 130 | 131 | - Approximate best-subset selection. Suppose that we would really like to select 132 | the best subset of $q < k$ coefficients and set the rest to zero (this is the variable selection problem). 133 | That is a hard problem since there are $\binom{k}{q}$. 134 | Lasso can be viewed as an approximation of the problem. 135 | 136 | - Oracle property. If the true model is sufficiently sparse, we can ignore the 137 | selection stage and use OLS standard errors of the non-zero variables 138 | for inference. 
139 | 140 | **Bridge** regression penalizes the $\Vec{\beta}$ vector by the 141 | $$ 142 | \hat{\beta}_{\text{bridge}} = \arg \min_{\beta} \sum_{i=1}^{n} (y_i - \Vec{x}_{i} \Vec{\beta})^{2} + \lambda_1 \sum_{k = 1}^{p} |\beta_k| + \lambda_2 \sum_{k = 1}^{p} \beta_k^2 143 | $$ 144 | 145 | Bridge regression has some of the properties of both ridge and Lasso. 146 | It will select correlated regressors, yet also shrink coefficients to zero for 147 | a sparse solution. 148 | 149 | The R package `r rpkg("glmnet")` is the most commonly used package to estimate 150 | Lasso, ridge, and bridge regression for linear and generalized linear models. 151 | However, these methods are common enough that all machine learning frameworks 152 | will have some implementation of them. See other packages for variations on the 153 | lasso that take into account other dependencies in the data. 154 | 155 | How to find the value of $\lambda$? Cross validation. 156 | The function `cv.glmet()` uses cross-validation to select the penalty parameter. 157 | 158 | ## Regularization for Causal Inference 159 | 160 | Belloni, Chernozhukov, and Hansen (2014) propose a simple method for using Lasso 161 | for causal effects. 162 | 163 | What's the problem with regularized regression for causal inference? 164 | Suppose we estimate a model with the aim to recover $\beta_1$. 165 | $$ 166 | \Vec{y} = \alpha + \beta x + \gamma_1 z_1 + \cdots + \gamma_k z_{k-1} + \epsilon 167 | $$ 168 | If we estimate it with a regularized model, like lasso, then $\beta_1$ will be shrunk in addition to the controls. 169 | If we instead do not shrink $\beta_1$ but we shrink the controls enough. 170 | It will be closer to **not** controlling for the other variables since any part of 171 | of the treatment prediction of the outcome explained by the controls will be shrunk since those coefficients are penalized, but the treatment coefficient is not. 172 | 173 | 1. Run Lasso with the outcome $y$ on all controls, $z_1, \dots, \z_k$. 
174 | Keep all non-zero coefficients. 175 | 176 | 1. Run Lasso with the treatment $z$ on all controls, $z_1, \dots, z_k$. 177 | Keep all non-zero coefficients. 178 | 179 | 1. Run OLS with the outcome $y$ on the treatment, $x$, and all variables with 180 | a non-zero coefficient in either step 1 or 2. 181 | 182 | If the **true model is sparse** (and asymptotics), then by the Oracle property, 183 | we can treat the standard errors of the OLS coefficients in the last step as 184 | if the selection stage did not occur. 185 | 186 | See and the `r rpkg("hdm")` which implements this method, and extensions to work with high dimensional data in R. 187 | 188 | ## References 189 | 190 | It is a few years old, but the [2015 NBER Summer course](http://www.nber.org/econometrics_minicourse_2015/nber_slides11.pdf) has a good introduction to machine learning that is targeted at social scientists. 191 | -------------------------------------------------------------------------------- /reproducible-research.Rmd: -------------------------------------------------------------------------------- 1 | # Reproducible Research 2 | -------------------------------------------------------------------------------- /simple-regression.Rmd: -------------------------------------------------------------------------------- 1 | # OLS Estimator 2 | 3 | For **unbiasedness** 4 | 5 | 1. Linearity 6 | 1. Random (iid) sample 7 | 1. Variation in $X_i$ 8 | 1. Zero conditional mean of errors 9 | 10 | ## Linearity 11 | 12 | **Assumption 1** The population regression function is linear in the parameters. 13 | $$ 14 | Y = \beta_0 + \beta_1 X_i + u 15 | $$ 16 | 17 | Note that 18 | 19 | - $u$ is the *unobserved* disturbance term for all factors influencing $Y$ other than $X$ 20 | - This is different than the the CEF error - we are interpreting $\beta_1$ structurally. This is an assumption needed for $\hat{\beta}$ to be an unbiased estimator of the population $\beta$. 
It may still be the case that $\hat{\beta}$ is a good estimator for other quantities. 21 | 22 | A violation: 23 | $$ 24 | Y_i = \frac{1}{\beta_0 + \beta_1 X_i} + u_i 25 | $$ 26 | 27 | Sometimes we can transform non-linear cases to be linear. 28 | For example, while this is not linear, 29 | $$ 30 | Y_i = \exp(\beta_0) \exp(\beta_1 X_i) u_i 31 | $$ 32 | the log transformation is linear, 33 | $$ 34 | \log Y_i = \beta_0 + \beta_1 X_i + \log (u_i). 35 | $$ 36 | 37 | ## Random Sample 38 | 39 | **Assumption 2:** We have a iid random sample of size $n$ $\{Y_i, X_i: i = 1, \dots, n\}$ from the population regression model. 40 | 41 | This is a standard assumption for generalizing from a sample to a population. 42 | Violations include time-series and selected samples. 43 | 44 | ## Variation in $X$ 45 | 46 | **Assumption 3:** The in-sample independent variables $\{X_i: i = 1, \dots, n\}$ are not all the same value. 47 | 48 | Recall, the formula for the OLS slope is 49 | $$ 50 | \hat{\beta}_1 = \frac{\sum_{i = 1}^n (x_i - \bar{x}) (y_i - \bar{y})}{\sum_{i = 1}^n (x_i - \bar{x})^2} 51 | $$ 52 | If there is no variation in $x$, then all $x_i = \bar{x}$, 53 | and 54 | $$ 55 | \hat{\beta}_1 = \frac{\sum_{i = 1}^n (\bar{x} - \bar{x}) (y_i - \bar{y})}{\sum_{i = 1}^n (\bar{x} - \bar{x})^2} = \frac{0}{0} \to \text{undefined} . 56 | $$ 57 | 58 | ## Assumption 4 59 | 60 | **Assumption 4** The error $u_i$, has expected value of 0, given the values of the independent variable, 61 | $$ 62 | E(u_i | X_i = x) = 0, 63 | $$ 64 | for all $x$. 65 | 66 | This is the key assumption for a structural interpretation of $Y$. 67 | It says that all the other things that influence $Y$ on average have no effect on $Y$ at every value of $x$. 68 | 69 | When is this most plausible? When $X$ is randomly assigned, so it uncorrelated with the errors by design. 70 | In **observational** data this is difficult to justify. 
71 | 72 | *Consistency* is a property of an estimator that as the sample size gets larger, it approaches the true value, 73 | $$ 74 | \widehat{\beta}_1 \to^{p} \beta_1 75 | $$ 76 | 77 | For consistency, only as weaker version of Assumption 4 is needed. 78 | 79 | **Assumption 4(b)** The error is mean zero, $E(u_i) = 0$, and uncorrelated with $X$, $E(u_i X_i) = 0$. 80 | 81 | That the error is mean zero is not binding as long as we have an intercept in the model. 82 | 83 | That the errors are uncorrelated with the predictor. 84 | This is weaker than Assumption 4 because it only rules out *linear* relationships between $u$ and $X$. 85 | If there are unmodeled non-linearities OLS still captures the best linear approximation to the CEF. 86 | And this weaker assumption says that even if we miss those, we will be consistent estimates of the population line of best fit. 87 | 88 | Note that $\widehat{\beta}$ is a weighted sum of residuals, 89 | $$ 90 | \widehat{\beta}_1 = \beta_1 + \sum_{i = 1}^n W_i u_i . 91 | $$ 92 | So, 93 | $$ 94 | \sum_{i = 1}^n W_i u_i \to^p \frac{\Cov(X_i, u_i)}{V(X_i)} 95 | $$ 96 | Since $Cov(X_i, u_i) = 0$, $\widehat{\beta}_1 \to^p \beta_1$. 97 | 98 | **Where are we?** Under assumptions 1--4, $\widehat{beta} \sim ?(\beta_1, ?)$. 99 | These assumptions establish that the expected value of the sampling distribution is $E(\widehat{\beta}_1) = \beta_1$. 100 | However, they don't say anything about the distributional form (is it Normal?) or the standard deviation of the sampling distribution of $\hat{\beta_1}$. 101 | We need a few more assumptions to deal with that. 102 | 103 | ## Large Sample Inference 104 | 105 | **Assumption 5:** The conditional variance of $Y_i$ given $X_i$ is constant, 106 | $$ 107 | V(Y_i | X_i = x) = V(u_i | X_i = x) = \sigma^2_u . 108 | $$ 109 | 110 | The function which gives the values of the variance of $Y$ as a function of $X$ is called the **skedastic** function. 
111 | 112 | - **homodeskedasticity**: $V(Y | X = x) = V(u | X = x) = \sigma^2_u$ for all $x$ 113 | - **heteroskedasticity**: $V(u | X = x) \neq V(u | X = x')$ for some values of $x$ and $x'$. In other words, the conditional variance is not constant. 114 | 115 | ## Asymptotic Normality of OLS 116 | 117 | Do we need the errors to be distributed normal? No, not in large samples. 118 | The OLS error is a weighted sum of the residuals, 119 | $$ 120 | \hat{\beta}_1 - \beta = \sum_{i = 1}^n W_i u_i 121 | $$ 122 | Since the estimator error is a mean, the CLT holds, and the distribution of the errors (variance) will be distributed standard normal. 123 | $$ 124 | \frac{\hat{\beta}_1 - \beta_1}{SE(\hat{\beta}_)} \to N(0, 1) 125 | $$ 126 | Also, in large samples, we can plug in the estimated standard error for the population standard error, 127 | $$ 128 | \frac{\hat{\beta}_1 - \beta_1}{\widehat{SE}(\hat{\beta}_)} \to N(0, 1) 129 | $$ 130 | 131 | ## Small Sample Model-Based Inference 132 | 133 | The CLT tells us that the sampling distribution of $\beta$ is normal in large samples (asymptotically). 134 | What about small samples? 135 | To use the normal (t-distribution) for hypothesis testing, we need the assumption that the errors are distributed normal. 136 | 137 | **Assumption 6** The conditional distribution of $u$ given $X$ is Normal with mean 0 and variance $\sigma^2_u$. 138 | 139 | $$ 140 | \frac{\widehat{\beta}_1 - \beta_1}{SE(\hat{\beta}_1)} \sim N(0, 1) 141 | $$ 142 | 143 | If we plug in the sample standard error for the population standard error, the sampling distribution has a $t$-distribution with $n - k - 1$ (where $k$ is the number of predictors) degrees of freedom. 144 | $$ 145 | \frac{\widehat{\beta}_1 - \beta_1}{\widehat{SE}(\widehat{\beta}_1)} \sim \text{Student's-} t_{n - k - 1} 146 | $$ 147 | 148 | ## Assumptions Review 149 | 150 | What assumptions do we need to make for various uses of OLS? 151 | 152 | 1. Data description: variation in X 153 | 1. 
Consistency: linearity, iid, variation in X, uncorrelated errors 154 | 1. Unbiasedness: linearity, iid, variation in X, zero conditional mean errors 155 | 1. Large-sample inference: linearity, iid, variation in X, zero conditional mean error, homoskedasticity. 156 | 1. Small-sample inference: linearity, iid, variation in X, zero conditional mean error, homoskedasticity, Normal errors 157 | -------------------------------------------------------------------------------- /simpsons.Rmd: -------------------------------------------------------------------------------- 1 | # Simpson's Paradox 2 | 3 | The "paradox" is data where the a statistical association is present in 4 | every subgroup but the reverse association is present in the population. 5 | 6 | ## Examples 7 | 8 | ### Batting Averages 9 | 10 | This example is from: 11 | 12 | > Ken Ross. "A Mathematician at the Ballpark: Odds and Probabilities for Baseball Fans (Paperback)" Pi Press, 2004. ISBN 0-13-147990-3. 12–13 13 | 14 | This example involves the batting averages for the baseball players, Derek Jeter and David Justice. 15 | In both 1995 and 1996, David Justice had a higher batting average than Derek Jeter. 16 | But when aggregated, Derek Jeter had a higher batting average for 1995-96 than David Justice. 
17 | 18 | ```{r} 19 | batting_yearly <- tribble( 20 | ~ player, ~ year, ~ hits, ~ ab, 21 | "Derek Jeter", 1995, 12, 48, 22 | "Derek Jeter", 1996, 183, 582, 23 | "David Justice", 1995, 104, 411, 24 | "David Justice", 1996, 45, 140 25 | ) %>% 26 | mutate(avg = hits / ab) 27 | ``` 28 | 29 | ```{r} 30 | batting_total <- batting_yearly %>% 31 | group_by(player) %>% 32 | summarise(ab = sum(ab), hits = sum(hits)) %>% 33 | mutate(avg = hits / ab) 34 | ``` 35 | 36 | ```{r} 37 | ggplot() + 38 | geom_point(data = batting_yearly, 39 | mapping = aes(x = as.integer(as.factor(player)), 40 | y = avg, 41 | color = as.factor(year), 42 | size = ab)) + 43 | geom_line(data = batting_yearly, 44 | mapping = aes(x = as.integer(as.factor(player)), 45 | y = avg, color = as.factor(year))) + 46 | geom_point(data = batting_total, 47 | mapping = aes(x = as.integer(as.factor(player)), 48 | y = avg, 49 | size = ab)) + 50 | geom_line(data = batting_total, 51 | mapping = aes(x = as.integer(as.factor(player)), 52 | y = avg)) + 53 | scale_x_continuous("Player", breaks = 1:2, 54 | labels = levels(as.factor(batting_yearly$player))) + 55 | scale_y_continuous("Batting Avg.") + 56 | scale_color_discrete("Year") + 57 | scale_size_continuous("At Bats") 58 | 59 | ``` 60 | 61 | ### Kidney Stones 62 | 63 | See 64 | 65 | ```{r} 66 | kidney_stones <- tribble( 67 | ~ treatment, ~ size, ~ success, ~ n, 68 | 0, "Small", 81, 87, 69 | 0, "Large", 192, 263, 70 | 1, "Small", 234, 270, 71 | 1, "Large", 55, 80 72 | ) %>% 73 | mutate(p = success / n) 74 | ``` 75 | 76 | ```{r} 77 | kidney_stones_total <- kidney_stones %>% 78 | group_by(treatment) %>% 79 | summarise(success = sum(success), n = sum(n)) %>% 80 | mutate(p = success / n) 81 | ``` 82 | 83 | ```{r} 84 | ggplot() + 85 | geom_point(data = kidney_stones, 86 | mapping = aes(x = treatment, y = p, 87 | color = size, size = n)) + 88 | geom_line(data = kidney_stones, 89 | mapping = aes(x = treatment, y = p, color = size)) + 90 | geom_point(data = kidney_stones_total, 91 
| mapping = aes(x = treatment, y = p, size = n)) + 92 | geom_line(data = kidney_stones_total, 93 | mapping = aes(x = treatment, y = p)) + 94 | scale_x_continuous("Treatment", 95 | breaks = c(0, 1), labels = c("A", "B")) + 96 | scale_y_continuous("% Success") + 97 | scale_color_discrete("Kidney Stone Size") + 98 | scale_size_continuous("Number in Group") 99 | 100 | ``` 101 | 102 | ### Blood Pressure Drug 103 | 104 | Example from Pearl et al. (p. 4) original from Simpson (1951). 105 | 106 | Consider data on 700 sick patients given the opportunity to try a new drug, of which 350 *chose* to take the new drug. 107 | The number of patients in each gender ("Male", "Female") and the number recovered were recorded. 108 | 109 | ```{r} 110 | tribble( 111 | ~ gender, ~ drug, ~ recovered, ~ n, 112 | "Men", 1, 81, 87, 113 | "Men", 0, 234, 270, 114 | "Women", 1, 192, 263, 115 | "Women", 0, 289, 350 116 | ) 117 | ``` 118 | 119 | 1. What is the overall recovery rate ? 120 | 1. What is the recovery rate within each gender? 121 | 122 | ### Berkeley Admissions 123 | 124 | This is an example commonly used to illustrate Simpson's Paradox, first appearing in 125 | 126 | > 127 | 128 | It concerns the admission rate of women to graduate University of 129 | Across all graduate departments, the admissions rate of women applicants was less than male applicants. 130 | However, within all departments considered, the admissions rate for women was higher than that of males. 131 | The reversal of association was due to women applying to more selective programs which had lower overall admissions rates. 132 | 133 | The dataset is in the recommended R package `r rpkg("datasets")` as `r rdoc("datasets::UCBAdmissions")`. 134 | 135 | These links provide good visualizations of the data: 136 | 137 | - [Simpson's Paradox](http://vudlab.com/simpsons/) 138 | - 139 | 140 | ### Teacher Salary and Test Scores 141 | 142 | Simpson's Paradox can occur with continuous data. 
This example is from: 143 | 144 | > Deborah Lynn Guber, "Getting what you pay for: the debate over equity in public school expenditures" (1999), *Journal of Statistics Education* 145 | 146 | This example concerns school expenditures and test scores. 147 | The proportion of school expenditure and SAT test scores. 148 | 149 | The data is included in the package `r rpkg("mosaicData")` as the dataset `r rdoc("mosaicData::SAT")`: 150 | 151 | ```{r} 152 | data("SAT", package = "mosaicData") 153 | ``` 154 | 155 | For the fifty US states, there is a negative relationship between school expenditures and SAT test scores. 156 | 157 | ```{r} 158 | ggplot(SAT, aes(label = state, x = salary, y = sat)) + 159 | geom_text() + 160 | geom_smooth(method = "lm", se = FALSE) 161 | ``` 162 | 163 | However, when the states are categorized by the fraction of students taking the SAT, there is a positive or negligible association between school expenditure and SAT expenditure within each subgroup: 164 | 165 | ```{r} 166 | mutate(SAT, 167 | frac_cat = cut_number(frac, 3)) %>% 168 | ggplot(aes(label = state, x = salary, y = sat, color = frac_cat)) + 169 | geom_text() + 170 | geom_smooth(method = "lm", se = FALSE) 171 | ``` 172 | 173 | ### Other Examples 174 | 175 | - U.S. Education and Income. Norris, Floyd. "[Can Every Group Be Worse Than Average? Yes.](https://economix.blogs.nytimes.com/2013/05/01/can-every-group-be-worse-than-average-yes/)". *New York*. May 1, 2013. 176 | - Armstrong, Zan, and Martin Mattenberg. 2014. "[Visualizing Statistical Mix Effects and Simpson’s Paradox](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42901.pdf)" 177 | - Horton, Bob. [Fun with Simpson's Paradox: Simulating Confounders](http://blog.revolutionanalytics.com/2015/11/fun-with-simpsons-paradox-simulating-confounders.html) November 17, 2015. 178 | - Burn-Murdoch, John. 
"[Germany’s election and the trouble with correlation](https://www.ft.com/content/94e3acec-a767-11e7-ab55-27219df83c97)" *Financial Times*. October 2, 2017. 179 | - Income and Party Affiliation in the United States. DOI 10.1561/100.00006026 180 | 181 | ## Casual Inference and Simpson's Paradox 182 | 183 | The Simpson's Paradox does not directly have anything to do with causal inference, which should be obvious given the clearly non-causal baseball example. 184 | 185 | Simpson's example is a special case of *omitted variable bias* (discussed in more detail later). 186 | It does illustrate how associations between variables can be much different when looking at subpopulations (or controlling for a variable). 187 | 188 | It does suggest a major difficulty in conducting causal inference with observational data. However, whether those variables need to be controlled for or not is an *extra-statistical* question and cannot be revealed by the associations alone. 189 | 190 | In the new drug example, women are more likely to take the drug *and* less likely to recover. In other words, gender is a common cause of taking the drug and recovery rate. (this example is from Pearl, Glymour, and Jewell). 191 | To assess how effective the drug is, we need to compare group 192 | 193 | However, consider blood pressure observed after the administration of the drug. 194 | Reducing blood pressure may be a mechanism by which the drug affects recovery rate. 195 | Comparing patients the recovery rate of patients with the same blood pressure level *after* taking the drug does not make sense. 
196 | -------------------------------------------------------------------------------- /word-processing.Rmd: -------------------------------------------------------------------------------- 1 | # Typesetting and Word Processing Programs 2 | 3 | ## LaTeX 4 | 5 | [LaTeX](https://en.wikipedia.org/wiki/LaTeX) is a document markup language (think something like HTML) that is widely used in academia.[^pronunciation] 6 | Its primary advantages over Word (and word processors) are the separation of content and presentation and its formatting of mathematical equations. 7 | In addition to papers, it is often used for academic slides; many talk slides are prepared with beamer. 8 | 9 | ### Learning LaTeX 10 | 11 | Here are some links to get started learning LaTeX: 12 | 13 | - [Overleaf Free & Interactive Online Introduction to LaTeX](https://www.overleaf.com/latex/learn/free-online-introduction-to-latex-part-1) 14 | - [LaTeX Tutorial](https://www.latex-tutorial.com/tutorials/) has interactive lessons 15 | - [ShareLaTeX Documentation](https://www.sharelatex.com/learn/) 16 | - [Overleaf Example Templates](https://www.overleaf.com/latex/templates/) has many different examples of LaTeX documents. 17 | - [LaTeX Wikibook](https://en.wikibooks.org/wiki/LaTeX) 18 | - [Not So Short Introduction to LaTeX](https://tobi.oetiker.ch/lshort/lshort.pdf) is a classic, but not as as new-user friendly as the others. 19 | 20 | ### Using LaTeX 21 | 22 | - Use an online service such as [Overleaf](https://www.overleaf.com/) or [ShareLaTeX](https://www.sharelatex.com/). These are great for collaboration, but become inflexible 23 | when you want to customize your workflow. 24 | 25 | - Write it with a specialized editor such as [TeXmaker](http://www.xm1math.net/texmaker/), [TeXStudio](http://www.texstudio.org/), or [TeXshop](http://pages.uoregon.edu/koch/texshop/). These generally have 26 | built ways to insert text, and also live preview. 
I would stay away from editors such as [LyX](https://www.lyx.org/) that are [WYSIWYG](https://en.wikipedia.org/wiki/WYSIWYG). 27 | 28 | - Write it with an general purpose editor such as [Atom](https://atom.io/) or [Sublime Text](https://www.sublimetext.com/).[^1] Most editors have a plugin 29 | to make writing LaTeX easier. For Atom there is [LaTeXTools](https://atom.io/packages/latextools), and for Sublime Text, [LaTeXTools](https://github.com/SublimeText/LaTeXTools) 30 | 31 | [^1]: And of course [Vim](http://www.vim.org/) or [Emacs](https://www.gnu.org/software/emacs/). 32 | 33 | ### LaTeX with R 34 | 35 | This is pretty easy. Rnw, also called Sweave, documents allow you to mix R chunks with LaTeX. 36 | This is similar to R markdown, but with LaTeX instead of markdown.[^2] 37 | 38 | Many packages, such as [xtable](https://cran.r-project.org/package=xtable), [stargazer](ttps://cran.r-project.org/package=stargazer), or [texreg](ttps://cran.r-project.org/package=texreg) produce formatted output in LaTeX. 39 | When you use these programs, do not copy and paste the output. Instead, save it to a file, 40 | and use `\input{}` to include the contents in your document. 41 | 42 | [^2]: And [Sweave](https://www.statistik.lmu.de/~leisch/Sweave/) files preceded R markdown and knitr by many years. 43 | 44 | ## Word 45 | 46 | While I use LaTeX in my own work, Microsoft Word is powerful piece of software, 47 | and many of the complaints against Word come down to not being aware of its 48 | features. There are many tools you can use to build your research paper; 49 | whatever tool you use, learn how to use it proficiently. 50 | 51 | ### General Advice 52 | 53 | This guide on using [Microsoft Word for Dissertations](http://guides.lib.umich.edu/c.php?g=283073&p=1886001) 54 | covers everything and more that I would have. 
Also see [this](http://www3.nd.edu/~shill2/dtclass/word_2013_word_for_research_projects.pdf) 55 | 56 | - [separate presentation and content](https://en.wikipedia.org/wiki/Separation_of_presentation_and_content) using styles 57 | 58 | - Automatically number figures and tables 59 | 60 | - Use a reference manager like [Mendeley](https://www.mendeley.com/), [Zotero](https://www.zotero.org/), [colwiz](https://www.colwiz.com/app), or [Papers](http://www.papersapp.com/). They have plugins for citations in Word. 61 | 62 | - When exporting figures for Word, if you must use a [raster graphic](https://en.wikipedia.org/wiki/Raster_graphics) use PNG files (not JPEG). For publication, use a high DPI (600) with PNG graphics. 63 | 64 | - Learn to use *Fields*. You can insert figures from files that you can 65 | update using `Insert > Field > Links and References > IncludePicture`. 66 | This is useful for programmatically generating figures to insert into 67 | your document. Likewise, you can insert text from files that you can 68 | update using `Insert > Field > Links and References > IncludeText`. 69 | 70 | ### Using R with Word 71 | 72 | For a dynamic reports you can use [R Markdown](http://rmarkdown.rstudio.com/word_document_format.html) and export to a word document. When doing this, use a reference document to set the the styles that you will use. 73 | See [Happy collaboration with Rmd to docx](http://rmarkdown.rstudio.com/articles_docx.html) for more advice on using R Markdown with Word. 74 | 75 | When using functions from packages such as [xtable](https://cran.r-project.org/package=xtable), [stargazer](ttps://cran.r-project.org/package=stargazer), or [texreg](ttps://cran.r-project.org/package=texreg) output HTML, which can be copy and pasted into word. 76 | 77 | Finally, the [ReporteR](http://davidgohel.github.io/ReporteRs/word.html) package is an alternative method to generate Word Documents from R. 
78 | 79 | [^pronunciation]: TeX is pronounced as "teck" because the X is a Greek chi. The pronunciation of of LaTeX is thus lah-teck or lay-teck. It is not 80 | pronounced like the rubber compound. See this [StackExchange](http://tex.stackexchange.com/questions/17502/what-is-the-correct-pronunciation-of-tex-and-latex) question on the pronunciation of LaTeX. 81 | -------------------------------------------------------------------------------- /writing.Rmd: -------------------------------------------------------------------------------- 1 | # Writing Resources 2 | 3 | ## Writing and Organizing Papers 4 | 5 | - Chris Adolph. [Writing Empirical Papers: 6 Rules & 12 Recommendations](http://faculty.washington.edu/cadolph/503/papers.pdf) 6 | 7 | - Barry R. Weingast. 2015. [CalTech Rules for Writing Papers: How to Structure Your Paper and Write an Introduction](https://web.stanford.edu/group/mcnollgast/cgi-bin/wordpress/wp-content/uploads/2013/10/CALTECH.RUL_..pdf) 8 | 9 | - [The Science of Scientific Writing](http://www.americanscientist.org/issues/id.877,y.0,no.,content.true,page.1,css.print/issue.aspx) *American Scientist* 10 | 11 | - Deidre McCloskey. [Economical Writing](http://www.amazon.com/Economical-Writing-Deirdre-McCloskey/dp/1577660633/) 12 | 13 | - William Thompson. [A Guide for the Young Economist](http://www.amazon.com/Guide-Young-Economist-MIT-Press/dp/026251589X). "Chapter 2: Writing Papers." 14 | 15 | - Stephen Van Evera. [Guide to Methods for Students of Political Science](http://www.amazon.com/Guide-Methods-Students-Political-Science/dp/080148457X). Appendix. 16 | 17 | - Joseph M. Williams and Joseph Bizup. [Style: Lessons in Clarity and Grace](http://www.amazon.com/dp/0321898680/) 18 | 19 | - Strunk and White. 
*The Elements of Style* 20 | 21 | - [Chicago Manual of Style](http://www.chicagomanualofstyle.org/) and [APSA Style Manual for Political Science](http://www.apsanet.org/Portals/54/APSA%20Files/publications/APSAStyleManual2006.pdf) for editorial and style issues. 22 | 23 | - [How to construct a Nature summary paragraph](http://www.nature.com/nature/authors/gta/Letter_bold_para.doc). Though specific to *Nature*, it provides good advice for structuring abstracts or introductions. 24 | 25 | - Ezra Klein. [How researchers are terrible communications, and how they can do better](http://chrisblattman.com/2015/11/05/ezra-klein-how-researchers-are-terrible-communicators-and-how-they-can-do-better/). 26 | 27 | - The advice in the *AJPS* [Instructions for Submitting Authors](http://ajps.org/guidelines-for-manuscripts/) is a concise description of how to write an abstract: 28 | 29 | > The abstract should provide a very concise descriptive summary of the research stream to which the manuscript contributes, the specific research 30 | > topic it addresses, the research strategy employed for the analysis, the results obtained from the analysis, and the implications of the findings. 31 | 32 | - [Concrete Advice for Writing Informative Abstracts](http://connection.sagepub.com/blog/sage-connection/2014/05/15/concrete-advice-for-writing-informative-abstracts/) and [How to Carefully Choose Useless Titles for Academic Writing](http://www.socialsciencespace.com/2014/03/how-to-carefully-choose-useless-titles-for-academic-writing/) 33 | 34 | ## Finding Research Ideas 35 | 36 | - Paul Krugman [How I Work](http://web.mit.edu/krugman/www/howiwork.html) 37 | - Hal Varian. 
[How to build an Economic Model in your spare time](http://people.ischool.berkeley.edu/~hal/Papers/how.pdf) 38 | - Greg Mankiw, [My Rules of Thumb](http://faculty.som.yale.edu/jameschoi/mankiw_tips.pdf): 39 | - The links in [Advice for Grad Students](http://gregmankiw.blogspot.com/2006/05/advice-for-grad-students.html) 40 | 41 | ## Replications 42 | 43 | Gary King has advice on how to turn a replication into a publishable paper: 44 | 45 | - Gary King [How to Write a Publishable Paper as a Class Project](http://gking.harvard.edu/papers) 46 | 47 | - Gary King. 2006. "[Publication, Publication.](http://gking.harvard.edu/files/abs/paperspub-abs.shtml)" *PS: Political Science and Politics*. 48 | 49 | - [Political Science Should Not Stop Young Researchers from Replicating](https://politicalsciencereplication.wordpress.com/2015/06/15/political-science-should-not-stop-young-researchers-from-replicating/) 50 | from the [Political Science Replication](https://politicalsciencereplication.wordpress.com) blog. 51 | 52 | And see the examples of students replications from his Harvard course at . 53 | 54 | Famous replications. 55 | 56 | - "Irregularities in LaCour (2014) [@BroockmanKallaAronow2015a] 57 | - "Does High Public Debt Consistently Stifle Economic Growth? A Critique of Reinhart and Rogoff." [@HerndonAshPollin2013a] 58 | 59 | However, although those replications are famous for finding fraud or obvious 60 | errors in the analysis, replications can lead to extensions and generate new 61 | ideas. This was the intent of @BroockmanKallaAronow2015a when starting the 62 | replication. 63 | --------------------------------------------------------------------------------