├── .Rbuildignore
├── .Rprofile
├── .gitattributes
├── .github
│   ├── .gitignore
│   └── workflows
│       └── deploy_bookdown.yml
├── .gitignore
├── 01-introduction.Rmd
├── 02-overview-surveys.Rmd
├── 03-survey-data-documentation.Rmd
├── 04-set-up.Rmd
├── 05-descriptive-analysis.Rmd
├── 06-statistical-testing.Rmd
├── 07-modeling.Rmd
├── 08-communicating-results.Rmd
├── 09-reproducible-data.Rmd
├── 10-sample-designs-replicate-weights.Rmd
├── 11-missing-data.Rmd
├── 12-successful-survey-data-analysis.Rmd
├── 13-ncvs-vignette.Rmd
├── 14-ambarom-vignette.Rmd
├── 90-AppendixA-DataImport.Rmd
├── 91-AppendixB-ANES-CB-latex.Rmd
├── 91-AppendixB-ANES-CB.Rmd
├── 92-AppendixC-RECS-CB-latex.Rmd
├── 92-AppendixC-RECS-CB.Rmd
├── 93-AppendixD-Solutions-latex.Rmd
├── 93-AppendixD-Solutions.Rmd
├── 94-AppendixE-Corrections-Remarks.Rmd
├── 99-references.Rmd
├── LICENSE.md
├── README.md
├── _bookdown.yml
├── _output.yml
├── assets
│   └── hero-image.html
├── book.bib
├── css
│   └── style.css
├── images
│   ├── PetExample1.png
│   ├── PetExample2.png
│   ├── codebook-example.jpg
│   ├── codebook-ncvs-weapon-handgun.jpg
│   ├── codebook-ncvs-weapon-li.jpg
│   ├── cover.png
│   ├── favicon.png
│   ├── header.png
│   ├── logo.png
│   ├── questionnaire-example-2.jpg
│   ├── questionnaire-example.jpg
│   └── questionnaire-ncvs-weapon.jpg
├── index.Rmd
├── krantz.cls
├── latex
│   ├── after_body.tex
│   ├── before_body_temp.tex
│   └── preamble.tex
├── plausible.html
├── renv.lock
├── renv
│   ├── .gitignore
│   ├── activate.R
│   ├── settings.dcf
│   └── settings.json
└── tidy-survey-book.Rproj

/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^renv$
2 | ^renv\.lock$
3 | ^LICENSE\.md$
4 | ^\.github$
--------------------------------------------------------------------------------
/.Rprofile:
--------------------------------------------------------------------------------
1 | source("renv/activate.R")
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.csv filter=lfs diff=lfs merge=lfs -text
2 | *.pdf filter=lfs diff=lfs merge=lfs -text
3 | *.xlsx filter=lfs diff=lfs merge=lfs -text
4 | *.rds filter=lfs diff=lfs merge=lfs -text
5 | *.sav filter=lfs diff=lfs merge=lfs -text
6 | *.dta filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
--------------------------------------------------------------------------------
/.github/workflows/deploy_bookdown.yml:
--------------------------------------------------------------------------------
1 | on:
2 |   push:
3 |     branches:
4 |       - main
5 | 
6 | name: renderbook
7 | 
8 | jobs:
9 |   bookdown:
10 |     name: Render-Book
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v3
14 |         with:
15 |           lfs: True
16 |       - uses: r-lib/actions/setup-r@v2
17 |         with:
18 |           r-version: '4.4.0'
19 |           use-public-rspm: true
20 |       - uses: r-lib/actions/setup-pandoc@v2
21 |       - name: Set RENV_PATHS_ROOT
22 |         shell: bash
23 |         run: |
24 |           echo "RENV_PATHS_ROOT=${{ runner.temp }}/renv" >> $GITHUB_ENV
25 |       - name: Cache packages
26 |         uses: actions/cache@v3
27 |         with:
28 |           path: ${{ env.RENV_PATHS_ROOT }}
29 |           key: ${{ runner.os }}-renv-${{ hashFiles('**/renv.lock') }}
30 |           restore-keys: |
31 |             ${{ runner.os }}-renv-
32 |       - name: Install tidycensus system dependency
33 |         shell: bash
34 |         run: |
35 |           sudo apt-get install 
libudunits2-dev libgdal-dev libgeos-dev libproj-dev libfontconfig1-dev libglpk-dev 36 | - name: Restore packages 37 | shell: Rscript {0} 38 | run: | 39 | if (!requireNamespace("renv", quietly = TRUE)) install.packages("renv") 40 | renv::restore() 41 | - name: Render Book 42 | env: 43 | CENSUS_KEY: ${{ secrets.CENSUS_KEY }} 44 | OSF_PAT: ${{ secrets.OSF_PAT }} 45 | run: Rscript -e 'bookdown::render_book("index.Rmd")' 46 | - uses: actions/upload-artifact@v4.4.0 47 | with: 48 | name: _book 49 | path: _book/ 50 | 51 | # Need to first create an empty gh-pages branch 52 | # see https://pkgdown.r-lib.org/reference/deploy_site_github.html 53 | # and also add secrets for a GH_PAT and EMAIL to the repository 54 | # gh-action from Cecilapp/GitHub-Pages-deploy 55 | checkout-and-deploy: 56 | runs-on: ubuntu-latest 57 | needs: bookdown 58 | steps: 59 | - name: Checkout 60 | uses: actions/checkout@main 61 | - name: Download artifact 62 | uses: actions/download-artifact@v4.1.8 63 | with: 64 | # Artifact name 65 | name: _book # optional 66 | # Destination path 67 | path: _book # optional 68 | - name: Deploy to GitHub Pages 69 | uses: Cecilapp/GitHub-Pages-deploy@v3 70 | env: 71 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 72 | with: 73 | email: ${{ secrets.EMAIL }} 74 | build_dir: _book/ 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # don't push local rendered files in master branch 2 | _book/ 3 | _bookdown_files/ 4 | bookdown_files/ 5 | bookdown_cache/ 6 | libs/ 7 | *.log 8 | *.pdf 9 | # ignore all pdf files except: 10 | !RawData/RECS_2015/microdata_v3.pdf 11 | !RawData/RECS_2020/microdata-guide.pdf 12 | !RawData/ANES_2020/*.pdf 13 | *.epub 14 | **/*.bib 15 | !book.bib 16 | **/*.html 17 | # ignore all html files except: 18 | !**/mathjax_header.html 19 | !**/plausible.html 20 | !**/hero-image.html 21 | # R and Rstudio specific files 22 | .Rproj.user 23 | .Rhistory 24 | .RData 25 | .Ruserdata 26 | .DS_Store 27 | # LAPOP raw data 28 | RawData/LAPOP_2021/ 29 | # ignore complete NCVS data files due to DUA 30 | RawData/NCVS_2021/ICPSR_38429/ 31 | osf_dl/*.rds 32 | latex/before_body_ded.tex -------------------------------------------------------------------------------- /01-introduction.Rmd: -------------------------------------------------------------------------------- 1 | \mainmatter 2 | 3 | # (PART) Introduction {-} 4 | 5 | # Introduction {#c01-intro} 6 | 7 | Surveys are valuable tools for gathering information about a population. Researchers, governments, and businesses use surveys to better understand public opinion and behaviors. For example, a non-profit group may analyze societal trends to measure their impact, government agencies may study behaviors to inform policy, or companies may seek to learn customer product preferences to refine business strategy. With survey data, we can explore the world around us. 8 | 9 | Surveys are often conducted with a sample of the population. Therefore, to use the survey data to understand the population, we use weights to adjust the survey results for unequal probabilities of selection, nonresponse, and post-stratification. These adjustments ensure the sample accurately represents the population of interest [@gard2023weightsdef]. To account for the intricate nature of the survey design, analysts rely on statistical software such as SAS, Stata, SUDAAN, and R. 10 | 11 | In this book, we focus on R to introduce survey analysis. 
Our goal is to provide a comprehensive guide for individuals new to survey analysis but with some familiarity with statistics and R programming. We use a combination of the {survey} and {srvyr} packages and present the code following best practices from the tidyverse [@R-srvyr; @lumley2010complex; @tidyverse2019].
12 | 
13 | ## Survey analysis in R
14 | 
15 | The {survey} package was released on the [Comprehensive R Archive Network (CRAN)](https://cran.r-project.org/src/contrib/Archive/survey/) in 2003 and has been continuously developed over time. This package, primarily authored by Thomas Lumley, offers an extensive array of features, including:
16 | 
17 | * Calculation of point estimates and estimates of their uncertainty, including means, totals, ratios, quantiles, and proportions
18 | * Estimation of regression models, including generalized linear models, log-linear models, and survival curves
19 | * Variances by Taylor linearization or by replicate weights, including balanced repeated replication, jackknife, bootstrap, multistage bootstrap, or user-supplied methods
20 | * Hypothesis testing for means, proportions, and other parameters
21 | 
22 | The {srvyr} package builds on the {survey} package by providing wrappers for functions that align with the tidyverse philosophy. This is our motivation for using and recommending the {srvyr} package. We find that it is user-friendly for those familiar with the tidyverse packages in R.
23 | 
24 | For example, while many functions in the {survey} package access variables through formulas, the {srvyr} package uses tidy selection to pass variable names, a common feature in the tidyverse [@R-tidyselect]. Users of the tidyverse are also likely familiar with the magrittr pipe operator (`%>%`), which seamlessly works with functions from the {srvyr} package. Moreover, several common functions from {dplyr}, such as `filter()`, `mutate()`, and `summarize()`, can be applied to survey objects [@R-dplyr]. This enables users to streamline their analysis workflow and leverage the benefits of both the {srvyr} and {tidyverse} packages.
25 | 
26 | While the {srvyr} package offers many advantages, there is one notable limitation: it doesn't fully incorporate the modeling capabilities of the {survey} package into tidy wrappers. When discussing modeling and hypothesis testing, we primarily rely on the {survey} package. However, we provide information on how to apply the pipe operator to these functions to maintain clarity and consistency in analyses.
27 | 
28 | ## What to expect {#what-to-expect}
29 | 
30 | This book covers many aspects of survey design and analysis, from understanding how to create design objects to conducting descriptive analysis, statistical tests, and models. We emphasize coding best practices and effective presentation techniques while using real-world data and practical examples to help readers gain proficiency in survey analysis. 
31 | 32 | Below is a summary of each chapter: 33 | 34 | - **Chapter \@ref(c02-overview-surveys) - Overview of surveys**: 35 | - Overview of survey design processes 36 | - References for more in-depth knowledge 37 | - **Chapter \@ref(c03-survey-data-documentation) - Survey data documentation**: 38 | - Guide to survey documentation types 39 | - How to read survey documentation 40 | - **Chapter \@ref(c04-getting-started) - Getting started**: 41 | - Installation of packages 42 | - Introduction to the {srvyrexploR} package and its analytic datasets 43 | - Outline of the survey analysis process 44 | - Comparison between the {dplyr} and {srvyr} packages 45 | - **Chapter \@ref(c05-descriptive-analysis) - Descriptive analyses**: 46 | - Calculation of point estimates 47 | - Estimation of standard errors and confidence intervals 48 | - Calculation of design effects 49 | - **Chapter \@ref(c06-statistical-testing) - Statistical testing**: 50 | - Statistical testing methods 51 | - Comparison of means and proportions 52 | - Goodness-of-fit tests, tests of independence, and tests of homogeneity 53 | - **Chapter \@ref(c07-modeling) - Modeling**: 54 | - Overview of model formula specifications 55 | - Linear regression, ANOVA, and logistic regression modeling 56 | - **Chapter \@ref(c08-communicating-results) - Communication of results**: 57 | - Strategies for communicating survey results 58 | - Tools and guidance for creating publishable tables and graphs 59 | - **Chapter \@ref(c09-reprex-data) - Reproducible research**: 60 | - Tools and methods for achieving reproducibility 61 | - Resources for reproducible research 62 | - **Chapter \@ref(c10-sample-designs-replicate-weights) - Sample designs and replicate weights**: 63 | - Overview of common sampling designs 64 | - Replicate weight methods 65 | - How to specify survey designs in R 66 | - **Chapter \@ref(c11-missing-data) - Missing data**: 67 | - Overview of missing data in surveys 68 | - Approaches to dealing with missing data 69 | - **Chapter \@ref(c12-recommendations) - Successful survey analysis recommendations**: 70 | - Tips for successful analysis 71 | - Recommendations for debugging 72 | - **Chapter \@ref(c13-ncvs-vignette) - National Crime Victimization Survey Vignette**: 73 | - Vignette on analyzing National Crime Victimization Survey (NCVS) data 74 | - Illustration of analysis requiring multiple files for victimization rates 75 | - **Chapter \@ref(c14-ambarom-vignette) - AmericasBarometer Vignette**: 76 | - Vignette on analyzing AmericasBarometer survey data 77 | - Creation of choropleth maps with survey estimates 78 | 79 | The majority of chapters contain code that readers can follow. Each of these chapters starts with a "Prerequisites" section, which includes the code needed to load the packages and datasets used in the chapter. We then provide the main idea of the chapter and examples of how to use the functions. Most chapters conclude with exercises to work through. We provide the solutions to the exercises in the [online version of the book](https://tidy-survey-r.github.io/tidy-survey-book/). 80 | 81 | While we provide a brief overview of survey methodology and statistical theory, this book is not intended to be the sole resource for these topics. We reference other materials and encourage readers to seek them out for more information. 82 | 83 | ## Prerequisites 84 | 85 | To get the most out of this book, we assume a survey has already been conducted and readers have obtained a microdata file. 
Microdata, also known as respondent-level or row-level data, differ from summarized data typically found in tables. Microdata contain individual survey responses, along with analysis weights and design variables such as strata or clusters. 86 | 87 | Additionally, the survey data should already include weights and design variables. These are required to accurately calculate unbiased estimates. The concepts and techniques discussed in this book help readers to extract meaningful insights from survey data, but this book does not cover how to create weights, as this is a separate complex topic. If weights are not already created for the survey data, we recommend reviewing other resources focused on weight creation such as @Valliant2018weights. 88 | 89 | This book is tailored for analysts already familiar with R and the tidyverse, but who may be new to complex survey analysis in R. We anticipate that readers of this book can: 90 | 91 | * Install R and their Integrated Development Environment (IDE) of choice, such as RStudio 92 | * Install and load packages from CRAN and GitHub repositories 93 | * Run R code 94 | * Read data from a folder or their working directory 95 | * Understand fundamental tidyverse concepts such as tidy/long/wide data, tibbles, the magrittr pipe (`%>%`), and tidy selection 96 | * Use the tidyverse packages to wrangle, tidy, and visualize data 97 | 98 | If these concepts or skills are unfamiliar, we recommend starting with introductory resources to cover these topics before reading this book. R for Data Science [@wickham2023r4ds] is a beginner-friendly guide for getting started in data science using R. It offers guidance on preliminary installation steps, basic R syntax, and tidyverse workflows and packages. 99 | 100 | ## Datasets used in this book 101 | 102 | We work with two key datasets throughout the book: the Residential Energy Consumption Survey [RECS -- @recs-2020-tech] and the American National Election Studies [ANES -- @debell]. We introduce the loading and preparation of these datasets in Chapter \@ref(c04-getting-started). 103 | 104 | ## Conventions 105 | 106 | Throughout the book, we use the following typographical conventions: 107 | 108 | * Package names are surrounded by curly brackets: {srvyr} 109 | * Function names are in constant-width text format and include parentheses: `survey_mean()` 110 | * Object and variable names are in constant-width text format: `anes_des` 111 | 112 | ## Getting help 113 | 114 | We recommend first trying to resolve errors and issues independently using the tips provided in Chapter \@ref(c12-recommendations). 115 | 116 | There are several community forums for asking questions, including: 117 | 118 | * [Posit Community](https://forum.posit.co/) 119 | * [R for Data Science Slack Community](https://rfordatasci.com/) 120 | * [Stack Overflow](https://stackoverflow.com/) 121 | 122 | Please report any bugs and issues to the book's [GitHub repository](https://github.com/tidy-survey-r/tidy-survey-book/issues). 123 | 124 | ## Acknowledgments 125 | 126 | We would like to thank Holly Cast, Greg Freedman Ellis, Joe Murphy, and Sheila Saia for their reviews of the initial draft. Their detailed and honest feedback helped improve this book, and we are grateful for their input. Additionally, this book started with two short courses. The first was at the Annual Conference for the American Association for Public Opinion Research (AAPOR) and the second was a series of webinars for the Midwest Association of Public Opinion Research (MAPOR). 
We would like to also thank those who assisted us by moderating breakout rooms and answering questions from attendees: Greg Freedman Ellis, Raphael Nishimura, and Benjamin Schneider. 127 | 128 | ## Colophon 129 | 130 | This book was written in [bookdown](http://bookdown.org/) using [RStudio](http://www.rstudio.com/ide/). The complete source is available on [GitHub](https://github.com/tidy-survey-r/tidy-survey-book). 131 | 132 | This version of the book was built with `r R.version.string` and with the packages listed in Table \@ref(tab:intro-packages-tab). 133 | 134 | ```{r} 135 | #| label: intro-colophon-pkgs 136 | #| echo: false 137 | #| warning: false 138 | #| message: false 139 | library(prettyunits) 140 | library(DiagrammeR) 141 | library(tidyverse) 142 | library(tidycensus) 143 | library(survey) 144 | library(srvyr) 145 | library(srvyrexploR) 146 | library(broom) 147 | library(gt) 148 | library(gtsummary) 149 | library(censusapi) 150 | library(naniar) 151 | library(haven) 152 | library(sf) 153 | library(rnaturalearth) 154 | library(rnaturalearthdata) 155 | library(ggpattern) 156 | library(osfr) 157 | library(janitor) 158 | library(kableExtra) 159 | library(knitr) 160 | library(labelled) 161 | library(bookdown) 162 | library(rmarkdown) 163 | library(tidyselect) 164 | 165 | ``` 166 | 167 | (ref:intro-packages-tab) Package versions and sources used in building this book 168 | 169 | ```{r} 170 | #| label: intro-packages-tab 171 | #| echo: FALSE 172 | #| warning: FALSE 173 | renv_in <- renv::lockfile_read() 174 | renv_pack <- renv_in$Packages %>% 175 | map(as_tibble) %>%list_rbind() %>% 176 | distinct(Package, Version, Source, Repository, RemoteSha, RemoteUsername, RemoteRepo) 177 | 178 | packinfo <- sessioninfo::package_info() 179 | packinfo_attach <- packinfo %>% 180 | filter(attached|package %in% c("renv")) 181 | 182 | packinfo_tib <- 183 | renv_pack %>% 184 | filter(Package %in% c(pull(packinfo_attach, package))) %>% 185 | rename(SourceInit=Source) %>% 186 | mutate( 187 | ShortSha=str_sub(RemoteSha, 1, 7), 188 | Source=case_when( 189 | Repository=="CRAN"~"CRAN", 190 | TRUE ~ glue::glue("{SourceInit} ({RemoteUsername}/{RemoteRepo}@{ShortSha})") 191 | ) 192 | ) %>% 193 | select(Package, Version, Source) 194 | 195 | packinfo_tib %>% 196 | gt() %>% 197 | cols_align(align="left") %>% 198 | cols_label( 199 | Package=md("**Package**"), 200 | Version=md("**Version**"), 201 | Source=md("**Source**"), 202 | ) %>% 203 | print_gt_book(knitr::opts_current$get()[["label"]]) 204 | ``` 205 | -------------------------------------------------------------------------------- /03-survey-data-documentation.Rmd: -------------------------------------------------------------------------------- 1 | # Survey data documentation {#c03-survey-data-documentation} 2 | 3 | ```{r} 4 | #| label: understand-pkgs 5 | #| echo: FALSE 6 | #| error: FALSE 7 | #| warning: FALSE 8 | #| message: FALSE 9 | library(tidyverse) 10 | ``` 11 | 12 | ## Introduction 13 | 14 | Survey documentation helps us prepare before we look at the actual survey data. The documentation includes technical guides, questionnaires, codebooks, errata, and other useful resources. By taking the time to review these materials, we can gain a comprehensive understanding of the survey data (including research and design decisions discussed in Chapters \@ref(c02-overview-surveys) and \@ref(c10-sample-designs-replicate-weights)) and conduct our analysis more effectively. 15 | 16 | Survey documentation can vary in organization, type, and ease of use. 
The information may be stored in any format---PDFs, Excel spreadsheets, Word documents, and so on. Some surveys bundle documentation together, such as providing the codebook and questionnaire in a single document. Others keep them in separate files. Despite these variations, we can gain a general understanding of the documentation types and what aspects to focus on in each. 17 | 18 | ## Types of survey documentation 19 | 20 | ### Technical documentation 21 | 22 | The technical documentation, also known as user guides or methodology/analysis guides, highlights the variables necessary to specify the survey design. We recommend concentrating on these key sections: 23 | 24 | * Introduction: The introduction orients us to the survey. This section provides the project's background, the study's purpose, and \index{Research topic|(}the main research questions.\index{Research topic|)} 25 | * Study design: The study design section describes how researchers prepared and administered the survey. 26 | * \index{Sampling error|(}\index{Sampling frame|(}\index{Sample|(}Sample: The sample section describes the sample frame, any known sampling errors, and limitations of the sample.\index{Sampling frame|)} \index{Weighting|(}This section can contain recommendations on how to use sampling weights. Look for weight information, whether the survey design contains strata, clusters/PSUs, or replicate weights. Also, look for population sizes, finite population correction, or replicate weight scaling information. Additional detail on sample designs is available in Chapter \@ref(c10-sample-designs-replicate-weights).\index{Sampling error|)}\index{Sample|)}\index{Weighting|)} 27 | * Notes on fielding: Any additional notes on fielding, such as response rates, may be found in the technical documentation. 28 | 29 | The technical documentation may include other helpful resources. For example, some technical documentation includes syntax for SAS, SUDAAN, Stata, and/or R, so we do not have to create this code from scratch. 30 | 31 | ### Questionnaires 32 | 33 | \index{Questionnaire|(}A questionnaire is a series of questions used to collect information from people in a survey. It can ask about opinions, behaviors, demographics, or even just numbers like the count of lightbulbs, square footage, or farm size. Questionnaires can employ different types of questions, such as closed-ended (e.g., select one or check all that apply), open-ended (e.g., numeric or text), Likert scales (e.g., a 5- or 7-point scale specifying a respondent's level of agreement to a statement), or ranking questions (e.g., a list of options that a respondent ranks by preference). It may randomize the display order of responses or include instructions that help respondents understand the questions. A survey may have one questionnaire or multiple, depending on its scale and scope. 34 | 35 | The questionnaire is another important resource for understanding and interpreting the survey data (see Section \@ref(overview-design-questionnaire)), and we should use it alongside any analysis. It provides details about each of the questions asked in the survey, such as question name, question wording, response options, skip logic, randomizations, display specifications, mode differences, and the universe (the subset of respondents who were asked a question). 36 | 37 | \index{American National Election Studies (ANES)|(} 38 | In Figure \@ref(fig:understand-que-examp), we show an example from the American National Election Studies (ANES) 2020 questionnaire [@anes-svy]. 
The figure shows the question name (`POSTVOTE_RVOTE`), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (this question was only asked if `vote_pre` = 0), and other specifications. The section also includes the variable name, which we can link to the codebook. 39 | 40 | ```{r} 41 | #| label: understand-que-examp 42 | #| echo: false 43 | #| fig.cap: ANES 2020 questionnaire example 44 | #| fig.alt: Question information about the variable postvote_rvote from ANES 2020 questionnaire Survey question, Universe, Logic, Web Spec, Response Order, and Released Variable are included. 45 | 46 | knitr::include_graphics(path = "images/questionnaire-example.jpg") 47 | ``` 48 | 49 | \index{American National Election Studies (ANES)|)} 50 | 51 | The content and structure of questionnaires vary depending on the specific survey. For instance, question names may be informative (like the ANES example above), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure \@ref(fig:understand-que-examp-2) shows an example from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) [@brfss-svy]. 52 | 53 | ```{r} 54 | #| label: understand-que-examp-2 55 | #| echo: false 56 | #| fig.cap: BRFSS 2021 questionnaire example 57 | #| fig.alt: Question information about the variable BPHIGH6 from BRFSS 2021 questionnaire. Question number, question text, variable names, responses, skip info and CATI note, interviewer notes, and columns are included. 58 | 59 | knitr::include_graphics(path = "images/questionnaire-example-2.jpg") 60 | ``` 61 | 62 | \index{Mode|(}We should factor in the details of a survey when conducting our analyses. For example, surveys that use various modes (e.g., web and mail) may have differences in question wording or skip logic, as web surveys can include fills or automate skip logic. If large enough, these variations could warrant separate analyses for each mode.\index{Mode|)} \index{Questionnaire|)} 63 | 64 | ### Codebooks 65 | 66 | \index{Missing data|(} \index{Codebook|(} \index{Data dictionary|see {Codebook}} 67 | While a questionnaire provides information about the questions posed to respondents, the codebook explains how the survey data were coded and recorded. It lists details such as variable names, variable labels, variable meanings, codes for missing data, value labels, and value types (whether categorical, continuous, etc.). The codebook helps us understand and use the variables appropriately in our analysis. In particular, the codebook (as opposed to the questionnaire) often includes information on missing data. Note that the term data dictionary is sometimes used interchangeably with codebook, but a data dictionary may include more details on the structure and elements of the data. 68 | \index{Missing data|)} 69 | 70 | \index{American National Election Studies (ANES)|(} 71 | Figure \@ref(fig:understand-codebook-examp) is a question from the ANES 2020 codebook [@anes-cb]. This section indicates a variable's name (`V202066`), question wording, value labels, universe, and associated survey question (`POSTVOTE_RVOTE`). 
72 | 73 | ```{r} 74 | #| label: understand-codebook-examp 75 | #| echo: false 76 | #| fig.cap: ANES 2020 codebook example 77 | #| fig.alt: Variable information about the variable V202066 from ANES 2020 questionnaire Variable meaning, Value labels, Universe, and Survey Question(s) are included. 78 | 79 | knitr::include_graphics(path="images/codebook-example.jpg") 80 | ``` 81 | 82 | Reviewing the questionnaires and codebooks in parallel can clarify how to interpret the variables (Figures \@ref(fig:understand-que-examp) and \@ref(fig:understand-codebook-examp)), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. \index{American National Election Studies (ANES)|)} 83 | \index{Codebook|)} 84 | 85 | ### Errata 86 | 87 | An erratum (singular) or errata (plural) is a document that lists errors found in a publication or dataset. The purpose of an erratum is to correct or update inaccuracies in the original document. Examples of errata include: 88 | 89 | * Issuing a corrected data table after realizing a typo or mistake in a table cell 90 | * Reporting incorrectly programmed skips in an electronic survey where questions are skipped by the respondent when they should not have been 91 | 92 | For example, the 2004 ANES dataset released an erratum, notifying analysts to remove a specific row from the data file due to the inclusion of a respondent who should not have been part of the sample. Adhering to an issued erratum helps us increase the accuracy and reliability of analysis. 93 | 94 | ### Additional resources 95 | 96 | Survey documentation may include additional material, such as interviewer instructions or "show cards" provided to respondents during interviewer-administered surveys to help respondents answer questions. Explore the survey website to find out what resources were used and in what contexts. 97 | 98 | ## Missing data coding 99 | 100 | \index{Missing data|(} 101 | Some observations in a dataset may have missing data. This can be due to design or nonresponse, and these concepts are detailed in Chapter \@ref(c11-missing-data). In that chapter, we also discuss how to analyze data with missing values. This chapter walks through how to understand documentation related to missing data. 102 | 103 | \index{Codebook|(} 104 | The survey documentation, often the codebook, represents the missing data with a code. The codebook may list different codes depending on why certain data points are missing. In the example of variable `V202066` from the ANES (Figure \@ref(fig:understand-codebook-examp)), `-9` represents "Refused," `-7` means that the response was deleted due to an incomplete interview, `-6` means that there is no response because there was no follow-up interview, and `-1` means "Inapplicable" (due to a designed skip pattern). 105 | 106 | \index{National Crime Victimization Survey (NCVS)|(} 107 | As another example, there may be a summary variable that describes the missingness of a set of variables --- particularly with "select all that apply" or "multiple response" questions. In the National Crime Victimization Survey (NCVS), respondents who are victims of a crime and saw the offender are asked if the offender had a weapon and then asked what the type of weapon was. This part of the questionnaire from 2021 is shown in Figure \@ref(fig:understand-ncvs-weapon-q) [@ncvs_survey_2020]. 
108 | 
109 | ```{r}
110 | #| label: understand-ncvs-weapon-q
111 | #| echo: false
112 | #| fig.cap: Excerpt from the NCVS 2020-2021 Crime Incident Report - Weapon Type
113 | #| fig.alt: Questions 22 and 23a from the NCVS 2020-2021 Crime Incident Report, see https://bjs.ojp.gov/content/pub/pdf/ncvs20_cir.pdf
114 | 
115 | knitr::include_graphics(path="images/questionnaire-ncvs-weapon.jpg")
116 | ```
117 | 
118 | For these multiple response variables (select all that apply), the NCVS codebook includes what they call a "lead-in" variable that summarizes the response. This lead-in variable provides metadata information on how a respondent answered the question. For example, for question 23a on the weapon type, the lead-in variable V4050 (shown in Figure \@ref(fig:understand-ncvs-weapon-cb)) indicates the quality and type of response [@ncvs_cb_2020]. In the codebook, this variable is then followed by a set of variables for each weapon type. An example of one of the individual variables from the codebook, the handgun (V4051), is shown in Figure \@ref(fig:understand-ncvs-weapon-cb-hg) [@ncvs_cb_2020]. We will dive into how to analyze this variable in Chapter \@ref(c11-missing-data).
119 | 
120 | ```{r}
121 | #| label: understand-ncvs-weapon-cb
122 | #| echo: false
123 | #| fig.cap: Excerpt from the NCVS 2021 Codebook for V4050 - LI WHAT WAS WEAPON
124 | #| fig.alt: Codebook includes location of variable (files and columns), variable type (numeric), question (What was the weapon? Anything else?), and the coding of this lead in variable
125 | knitr::include_graphics(path="images/codebook-ncvs-weapon-li.jpg")
126 | ```
127 | 
128 | 
129 | ```{r}
130 | #| label: understand-ncvs-weapon-cb-hg
131 | #| echo: false
132 | #| fig.cap: "Excerpt from the NCVS 2021 Codebook for V4051 - C WEAPON: HAND GUN"
133 | #| fig.alt: Codebook includes location of variable (files and columns), variable type (numeric), question (What was the weapon? Anything else?), and the coding of this categorical variable
134 | knitr::include_graphics(path="images/codebook-ncvs-weapon-handgun.jpg")
135 | ```
136 | 
137 | When data are read into R, some values may be system missing, that is, they are coded as `NA` even if that is not evident in a codebook. We discuss in Chapter \@ref(c11-missing-data) how to analyze data with `NA` values and review how R handles missing data in calculations.
138 | \index{National Crime Victimization Survey (NCVS)|)} \index{Missing data|)} \index{Codebook|)}
139 | 
140 | ## Example: ANES 2020 survey documentation
141 | 
142 | \index{American National Election Studies (ANES)|(}
143 | Let's look at the survey documentation for the ANES 2020, available on their [website](https://electionstudies.org/data-center/2020-time-series-study/). Navigating to "User Guide and Codebook" [@anes-cb], we can download the PDF that contains the survey documentation, titled "ANES 2020 Time Series Study Full Release: User Guide and Codebook." Do not be daunted by the 796-page PDF. Below, we focus on the most critical information.
144 | 
145 | #### Introduction {-}
146 | 
147 | The first section in the User Guide explains that the ANES 2020 Time Series Study continues a series of election surveys conducted since 1948. These surveys contain data on public opinion and voting behavior in U.S. presidential elections. 
\index{Mode|(}The introduction also includes information about the modes used for data collection (web, live video interviewing, or CATI).\index{Mode|)} Additionally, there is a summary of the number of pre-election interviews (8,280) and post-election re-interviews (7,449).
148 | 
149 | #### Sample design and respondent recruitment {-}
150 | 
151 | \index{Mode|(}The section "Sample Design and Respondent Recruitment" provides more detail about the survey's sequential mixed-mode design. All three modes were conducted one after another and not at the same time.\index{Mode|)} Additionally, it indicates that for the 2020 survey, they resampled all respondents who participated in the 2016 ANES, along with a newly drawn cross-section:
152 | 
153 | > The target population for the fresh cross-section was the 231 million non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or the District of Columbia.
154 | 
155 | The document continues with more details on the sample groups.
156 | 
157 | #### Data analysis, weights, and variance estimation {-}
158 | 
159 | \index{Weighting|(}The section "Data Analysis, Weights, and Variance Estimation" includes information on weights and strata/cluster variables. Reading through, we can find the full sample weight variables:
160 | 
161 | > For analysis of the complete set of cases using pre-election data only, including all cases and representative of the 2020 electorate, use the full sample pre-election weight, **V200010a**. For analysis including post-election data for the complete set of participants (i.e., analysis of post-election data only or a combination of pre- and post-election data), use the full sample post-election weight, **V200010b**. Additional weights are provided for analysis of subsets of the data...
162 | 
163 | The document provides more information about the design variables, summarized in Table \@ref(tab:aneswgts).
164 | 
165 | Table: (\#tab:aneswgts) Weight and variance information for ANES
166 | 
167 | For weight | Variance unit/cluster | Variance stratum
168 | :-----------:|:-----------:|:-----------:
169 | V200010a| V200010c| V200010d
170 | V200010b| V200010c| V200010d
171 | 
172 | #### Methodology {-}
173 | 
174 | The user guide mentions a supplemental document called "How to Analyze ANES Survey Data" [@debell] as a how-to guide for analyzing the data. In this document, we learn more about the weights, and that they sum to the sample size and not the population. If our goal is to calculate estimates for the entire U.S. population instead of just the sample, we must adjust the weights to the U.S. population. To create accurate weights for the population, we need to determine the total population size at the time of the survey. Let's review the "Sample Design and Respondent Recruitment" section for more details:
175 | 
176 | > The target population for the fresh cross-section was the 231 million non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or the District of Columbia.
177 | 
178 | \index{Current Population Survey (CPS)|(}
179 | The documentation suggests that the population should equal around 231 million, but this is a very imprecise count. Upon further investigation of the available resources, we can find the methodology file titled "Methodology Report for the ANES 2020 Time Series Study" [@anes-2020-tech]. This file states that we can use the population total from the Current Population Survey (CPS), a monthly survey sponsored by the U.S. Census Bureau and the U.S. Bureau of Labor Statistics. 
The CPS provides a more accurate population estimate for a specific month. Therefore, we can use the CPS to get the total population number for March 2020, when the ANES was conducted. Chapter \@ref(c04-getting-started) goes into detailed instructions on how to calculate and adjust this value in the data. 180 | \index{Weighting|)} \index{American National Election Studies (ANES)|)} \index{Current Population Survey (CPS)|)} -------------------------------------------------------------------------------- /04-set-up.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Analysis {-} 2 | 3 | # Getting started {#c04-getting-started} 4 | 5 | ```{r} 6 | #| label: setup-styler 7 | #| echo: false 8 | #| message: false 9 | knitr::opts_chunk$set(tidy = 'styler') 10 | library(magrittr) 11 | library(tidyselect) 12 | ``` 13 | 14 | ## Introduction 15 | 16 | This chapter provides an overview of the packages, data, and design objects we use frequently throughout this book. As mentioned in Chapter \@ref(c02-overview-surveys), understanding how a survey was conducted helps us make sense of the results and interpret findings. Therefore, we provide background on the datasets used in examples and exercises. Next, we walk through how to create the survey design objects necessary to begin an analysis. Finally, we provide an overview of the {srvyr} package and the steps needed for analysis. Please report any bugs and issues encountered while going through the book to the book's [GitHub repository](https://github.com/tidy-survey-r/tidy-survey-book). 17 | 18 | ## Setup 19 | 20 | This section provides details on the required packages and data, as well as the steps for preparing survey design objects. For a streamlined learning experience, we recommend taking the time to walk through the code provided here and making sure everything is properly set up. 21 | 22 | ### Packages {#setup-load-pkgs} 23 | 24 | We use several packages throughout the book, but let's install and load specific ones for this chapter. Many functions in the examples and exercises are from three packages: {tidyverse}, {survey}, and {srvyr} [@tidyverse2019; @lumley2010complex; @R-srvyr]. The packages can be installed from the Comprehensive R Archive Network (CRAN) using the code below: 25 | 26 | ```{r} 27 | #| label: setup-install-core1 28 | #| eval: FALSE 29 | install.packages(c("tidyverse", "survey", "srvyr")) 30 | ``` 31 | 32 | We bundled the datasets used in the book in an R package, {srvyrexploR} [@R-srvyrexploR]. To install it from GitHub, use the {pak} package [@R-pak]: 33 | 34 | ```{r} 35 | #| label: setup-install-core2 36 | #| eval: FALSE 37 | #| warning: FALSE 38 | install.packages("pak") 39 | pak::pak("tidy-survey-r/srvyrexploR") 40 | ``` 41 | 42 | After installing these packages, load them using the `library()` function: 43 | 44 | ```{r} 45 | #| label: setup-pkgs-core 46 | #| error: FALSE 47 | #| warning: FALSE 48 | #| message: FALSE 49 | library(tidyverse) 50 | library(survey) 51 | library(srvyr) 52 | library(srvyrexploR) 53 | ``` 54 | 55 | \index{gtsummary|(} \index{gt package|(} 56 | The packages {broom}, {gt}, and {gtsummary} play a role in displaying output and creating formatted tables [@R-gt; @R-broom; @gtsummarysjo]. 
Install them with the provided code^[Note: {broom} is already included in the tidyverse, so no separate installation is required.]: 57 | 58 | ```{r} 59 | #| label: setup-install-extra 60 | #| eval: FALSE 61 | install.packages(c("gt", "gtsummary")) 62 | ``` 63 | 64 | After installing these packages, load them using the `library()` function: 65 | 66 | ```{r} 67 | #| label: setup-pkgs-extra 68 | #| error: FALSE 69 | #| warning: FALSE 70 | #| message: FALSE 71 | library(broom) 72 | library(gt) 73 | library(gtsummary) 74 | ``` 75 | \index{gtsummary|)} \index{gt package|)} 76 | 77 | \index{Current Population Survey (CPS)|(} 78 | Install and load the {censusapi} package to access the Current Population Survey (CPS), which we use to ensure accurate weighting of a key dataset in the book [@R-censusapi]. Run the code below to install {censusapi}: 79 | 80 | ```{r} 81 | #| label: setup-install-census 82 | #| eval: FALSE 83 | install.packages("censusapi") 84 | ``` 85 | 86 | After installing this package, load it using the `library()` function: 87 | 88 | ```{r} 89 | #| label: setup-pkgs-census 90 | #| error: FALSE 91 | #| warning: FALSE 92 | #| message: FALSE 93 | library(censusapi) 94 | ``` 95 | 96 | Note that the {censusapi} package requires a Census API key, available for free from the [U.S. Census Bureau website](https://api.census.gov/data/key_signup.html) (refer to the package documentation for more information). We recommend storing the Census API key in the R environment instead of directly in the code. To do this, run the `Sys.setenv()` script below, substituting the API key where it says `YOUR_API_KEY_HERE`. 97 | 98 | ```{r} 99 | #| label: setup-census-api-setup 100 | #| eval: FALSE 101 | Sys.setenv(CENSUS_KEY = "YOUR_API_KEY_HERE") 102 | ``` 103 | 104 | Then, restart the R session. Once the Census API key is stored, we can retrieve it in our R code with `Sys.getenv("CENSUS_KEY")`. 105 | \index{Current Population Survey (CPS)|)} 106 | 107 | There are a few other packages used in the book in limited frequency. We list them in the Prerequisite boxes at the beginning of each chapter. As we work through the book, make sure to check the Prerequisite box and install any missing packages before proceeding. 108 | 109 | ### Data 110 | 111 | The {srvyrexploR} package contains the datasets used in the book. Once installed and loaded, explore the documentation using the `help()` function. Read the descriptions of the datasets to understand what they contain: 112 | 113 | ```{r} 114 | #| label: setup-datapkg-help 115 | #| eval: FALSE 116 | help(package = "srvyrexploR") 117 | ``` 118 | 119 | This book uses two main datasets: the American National Election Studies [ANES -- @debell] and the Residential Energy Consumption Survey [RECS -- @recs-2020-tech], which are included as `anes_2020` and `recs_2020` in the {srvyrexploR} package, respectively. 120 | 121 | #### American National Election Studies Data {-} 122 | 123 | \index{American National Election Studies (ANES)|(} 124 | American National Election Studies (ANES) collect data from election surveys dating back to 1948. These surveys contain information on public opinion and voting behavior in U.S. presidential elections and some midterm elections^[In the United States, presidential elections are held in years divisible by four. In other even years, there are elections at the federal level for Congress, which are referred to as midterm elections as they occur at the middle of the term of a president.]. 
They cover topics such as party affiliation, voting choice, and level of trust in the government. The 2020 survey (data used in this book) was fielded online, through live video interviews, or via computer-assisted telephone interviews (CATI). 125 | 126 | When working with new survey data, we should review the survey documentation (see Chapter \@ref(c03-survey-data-documentation)) to understand the data collection methods. The original ANES data contains variables starting with `V20` [@debell], so to assist with our analysis throughout the book, we created descriptive variable names. For example, the respondent's age is now in a variable called `Age`, and gender is in a variable called `Gender`. These descriptive variables are included in the {srvyrexploR} package. A complete overview of all variables can be found in `r if (!knitr:::is_html_output()) 'the online Appendix ('`Appendix \@ref(anes-cb)`r if (!knitr:::is_html_output()) ')'`. 127 | 128 | Before beginning an analysis, it is useful to view the data to understand the available variables. The `dplyr::glimpse()` function produces a list of all variables, their types (e.g., function, double), and a few example values. Below, we remove variables containing a "V" followed by numbers with `select(-matches("^V\\d"))` before using `glimpse()` to get a quick overview of the data with descriptive variable names: 129 | 130 | ```{r} 131 | #| label: setup-anes-glimpse 132 | anes_2020 %>% 133 | select(-matches("^V\\d")) %>% 134 | glimpse() 135 | ``` 136 | 137 | From the output, we can see there are `r nrow(anes_2020 %>% select(-matches("^V\\d"))) %>% formatC(big.mark = ",")` rows and `r ncol(anes_2020 %>% select(-matches("^V\\d"))) %>% formatC(big.mark = ",")` variables in the ANES data. This output also indicates that most of the variables are factors (e.g., `InterviewMode`), while a few variables are in double (numeric) format (e.g., `Age`). 138 | \index{American National Election Studies (ANES)|)} 139 | 140 | #### Residential Energy Consumption Survey Data {-} 141 | 142 | \index{Residential Energy Consumption Survey (RECS)|(} 143 | Residential Energy Consumption Survey (RECS) is a study that measures energy consumption and expenditure in American households. Funded by the Energy Information Administration, RECS data are collected through interviews with household members and energy suppliers. These interviews take place in person, over the phone, via mail, and on the web, with modes changing over time. The survey has been fielded 14 times between 1950 and 2020. It includes questions about appliances, electronics, heating, air conditioning (A/C), temperatures, water heating, lighting, energy bills, respondent demographics, and energy assistance. 144 | 145 | We should read the survey documentation (see Chapter \@ref(c03-survey-data-documentation)) to understand how the data were collected and implemented. An overview of all variables can be found in `r if (!knitr:::is_html_output()) 'the online Appendix ('`Appendix \@ref(recs-cb)`r if (!knitr:::is_html_output()) ')'`. 146 | 147 | Before starting an analysis, we recommend viewing the data to understand the types of data and variables that are included. The `dplyr::glimpse()` function produces a list of all variables, the type of the variable (e.g., function, double), and a few example values. 
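One selection helper is worth a brief note before the next step: `matches()` chooses variables with a regular expression, and the `^` anchor ties the pattern to the start of the name. So `^NWEIGHT` (used next) matches `NWEIGHT`, `NWEIGHT1`, and so on, just as `^V\\d` above matched the original ANES variable names. Here is a minimal sketch of this selection on its own; this snippet is our illustration and not part of the book's analysis workflow:

```r
# Sketch: matches() selects variables by regular expression;
# "^NWEIGHT" matches only names that begin with NWEIGHT
recs_2020 %>%
  select(matches("^NWEIGHT")) %>%
  names()
```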
Below, we remove the weight variables with `select(-matches("^NWEIGHT"))` before using `glimpse()` to get a quick overview of the data: 148 | 149 | ```{r} 150 | #| label: setup-recs-glimpse 151 | recs_2020 %>% 152 | select(-matches("^NWEIGHT")) %>% 153 | glimpse() 154 | ``` 155 | 156 | From the output, we can see that the RECS data has `r nrow(recs_2020 %>% select(-matches("^NWEIGHT"))) %>% formatC(big.mark = ",")` rows and `r ncol(recs_2020 %>% select(-matches("^NWEIGHT"))) %>% formatC(big.mark = ",")` non-weight variables. This output also indicates that most of the variables are in double (numeric) format (e.g., `TOTSQFT_EN`), with some factor (e.g., `Region`), Boolean (e.g., `ACUsed`), character (e.g., `REGIONC`), and ordinal (e.g., `YearMade`) variables. \index{Residential Energy Consumption Survey (RECS)|)} 157 | 158 | ### Design objects {#setup-des-obj} 159 | 160 | \index{Design object|(}The design object is the backbone for survey analysis. It is where we specify the sampling design, weights, and other necessary information to ensure we account for errors in the data. Before creating the design object, we should carefully review the survey documentation to understand how to create the design object for accurate analysis. 161 | 162 | In this section, we provide details on how to code the design object for the ANES and RECS data used in the book. However, we only provide a high-level overview to get readers started. For a deeper understanding of creating design objects for a variety of sampling designs, see Chapter \@ref(c10-sample-designs-replicate-weights). 163 | 164 | While we recommend conducting exploratory data analysis on the original data before diving into complex survey analysis (see Chapter \@ref(c12-recommendations)), the actual survey analysis and inference should be performed with the survey design objects instead of the original survey data. For example, the ANES data is called `anes_2020`. If we create a survey design object called `anes_des`, our survey analyses should begin with `anes_des` and not `anes_2020`. Using the survey design object ensures that our calculations appropriately account for the details of the survey design. 165 | 166 | #### American National Election Studies Design Object {-} 167 | 168 | \index{American National Election Studies (ANES)|(} \index{Current Population Survey (CPS)|(} 169 | The ANES documentation [@debell] details the sampling and weighting implications for analyzing the survey data. From this documentation and as noted in Chapter \@ref(c03-survey-data-documentation), the 2020 ANES data are weighted to the sample, not the population. To make generalizations about the population, we need to weigh the data against the full population count. The ANES methodology recommends using the Current Population Survey (CPS) to determine the number of non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or D.C. in March 2020. 170 | 171 | We can use the {censusapi} package to obtain the information needed for the survey design object. The `getCensus()` function allows us to retrieve the CPS data for March (`cps/basic/mar`) in 2020 (`vintage = 2020`). 
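As an aside, readers who want to see which other variables this endpoint offers can list them with the {censusapi} metadata helper. Below is a small sketch using the package's `listCensusMetadata()` function; this exploratory call is our addition and is not required for the steps that follow:

```r
# Sketch: list the variables available on the March 2020 basic monthly
# CPS endpoint; returns a data frame of variable names and descriptions
cps_vars <- listCensusMetadata(name = "cps/basic/mar",
                               vintage = 2020,
                               type = "variables")
head(cps_vars)
```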
Additionally, we extract several variables from the CPS:
172 | 
173 | - month (`HRMONTH`) and year (`HRYEAR4`) of the interview: to confirm the correct time period
174 | - age (`PRTAGE`) of the respondent: to narrow the population to 18 and older (eligible age to vote)
175 | - citizenship status (`PRCITSHP`) of the respondent: to narrow the population to only those eligible to vote
176 | - final person-level weight (`PWSSWGT`)
177 | 
178 | Detailed information for these variables can be found in the [CPS data dictionary](https://www2.census.gov/programs-surveys/cps/datasets/2020/basic/2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt).
179 | 
180 | ```{r}
181 | #| label: setup-anes-cps-get
182 | #| message: false
183 | 
184 | cps_state_in <- getCensus(name = "cps/basic/mar",
185 |                           vintage = 2020,
186 |                           region = "state",
187 |                           vars = c("HRMONTH", "HRYEAR4",
188 |                                    "PRTAGE", "PRCITSHP", "PWSSWGT"),
189 |                           key = Sys.getenv("CENSUS_KEY"))
190 | 
191 | cps_state <- cps_state_in %>%
192 |   as_tibble() %>%
193 |   mutate(across(.cols = everything(),
194 |                 .fns = as.numeric))
195 | ```
196 | 
197 | In the code above, we include `region = "state"`. The default region type for the CPS data is at the state level. While not required, including the region can be helpful for understanding the geographical context of the data.
198 | 
199 | In `getCensus()`, we specified the month (March, through the `cps/basic/mar` dataset name) and year (`vintage = 2020`) of our request. Therefore, we expect that all interviews within our output were conducted during that particular month and year. We can confirm that the data are from March 2020 by running the code below:
200 | 
201 | ```{r}
202 | #| label: setup-anes-cps-date
203 | cps_state %>%
204 |   distinct(HRMONTH, HRYEAR4)
205 | ```
206 | 
207 | We can narrow down the dataset using the age and citizenship variables to include only individuals who are 18 years or older (`PRTAGE >= 18`) and have U.S. citizenship (`PRCITSHP %in% c(1:4)`):
208 | 
209 | ```{r}
210 | #| label: setup-anes-cps-narrowresp
211 | cps_narrow_resp <- cps_state %>%
212 |   filter(PRTAGE >= 18,
213 |          PRCITSHP %in% c(1:4))
214 | ```
215 | 
216 | To calculate the U.S. population from the filtered data, we sum the person weights (`PWSSWGT`):
217 | 
218 | ```{r}
219 | #| label: setup-anes-cps-targetpop
220 | targetpop <- cps_narrow_resp %>%
221 |   pull(PWSSWGT) %>%
222 |   sum()
223 | 
224 | scales::comma(targetpop)
225 | ```
226 | 
227 | 
228 | The population of interest in 2020 is `r scales::comma(targetpop)`. This result gives us what we need to create the survey design object for estimating population statistics. Using the `anes_2020` data, we adjust the weighting variable (`V200010b`) using the population of interest we just calculated (`targetpop`). We determine the proportion of the total weight for each individual weight (`V200010b / sum(V200010b)`) and then multiply that proportion by the calculated population of interest. 
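Written as an equation, this adjustment (a restatement of the step just described, not additional methodology) gives each respondent $i$ the adjusted weight

$$w_i^{\text{adj}} = \frac{w_i}{\sum_{j} w_j} \times N$$

where $w_i$ is the original weight (`V200010b`) and $N$ is the CPS population total (`targetpop`). By construction, the adjusted weights sum to $N$ rather than to the sample size.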
229 | \index{Current Population Survey (CPS)|)} 230 | 231 | ```{r} 232 | #| label: setup-anes-adjust 233 | anes_adjwgt <- anes_2020 %>% 234 | mutate(Weight = V200010b / sum(V200010b) * targetpop) 235 | ``` 236 | \index{Stratified sampling|(} \index{Functions in srvyr!as\_survey\_design|(} \index{as\_survey\_design|see {Functions in srvyr}} \index{Clustered sampling|(} \index{Primary sampling unit|(} \index{PSU|see {Primary sampling unit}} \index{Cluster|see {Primary sampling unit}} 237 | Once we have the adjusted weights, we can refer to the rest of the documentation to create the survey design. The documentation indicates that the study uses a stratified cluster sampling design. Therefore, we need to specify variables for `strata` and `ids` (cluster) and fill in the `nest` argument. The documentation provides guidance on which strata and cluster variables to use depending on whether we are analyzing pre- or post-election data. In this book, we analyze post-election data, so we need to use the post-election weight `V200010b`, strata variable `V200010d`, and Primary Sampling Unit (PSU)/cluster variable `V200010c`. Additionally, we set `nest=TRUE` to ensure the clusters are nested within the strata. \index{Weighting|)} 238 | 239 | ```{r} 240 | #| label: setup-anes-des 241 | anes_des <- anes_adjwgt %>% 242 | as_survey_design(weights = Weight, 243 | strata = V200010d, 244 | ids = V200010c, 245 | nest = TRUE) 246 | 247 | anes_des 248 | ``` 249 | 250 | We can examine this new object to learn more about the survey design, such that the ANES is a "Stratified 1 - level Cluster Sampling design (with replacement) With (101) clusters." Additionally, the output displays the sampling variables and then lists the remaining variables in the dataset. This design object is used throughout this book to conduct survey analysis. \index{Stratified sampling|)} \index{Functions in srvyr!as\_survey\_design|)} \index{American National Election Studies (ANES)|)} \index{Clustered sampling|)} \index{Primary sampling unit|)} 251 | 252 | #### Residential Energy Consumption Survey Design Object {-} 253 | 254 | \index{Replicate weights|(} \index{Replicate weights!Jackknife} \index{Jackknife|see {Replicate weights}} \index{Residential Energy Consumption Survey (RECS)|(} 255 | The RECS documentation [@recs-2020-tech] provides information on the survey's sampling and weighting implications for analysis. The documentation shows the 2020 RECS uses Jackknife weights, where the main analytic weight is `NWEIGHT`, and the Jackknife weights are `NWEIGHT1`-`NWEIGHT60`. We can specify these in the ``weights`` and ``repweights`` arguments in the survey design object code, respectively. 256 | 257 | With Jackknife weights, additional information is required: `type`, `scale`, and `mse`. Chapter \@ref(c10-sample-designs-replicate-weights) discusses in depth each of these arguments; but to quickly get started, the RECS documentation lets us know that `type=JK1`, `scale=59/60`, and `mse = TRUE`. 
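For readers curious where that scale factor comes from, the general JK1 form is worth a quick look (a standard jackknife result, summarized here for convenience; Chapter \@ref(c10-sample-designs-replicate-weights) has the full treatment). With $R$ replicate weights, the variance estimate of a statistic $\hat{\theta}$ is

$$\hat{V}(\hat{\theta}) = \frac{R-1}{R} \sum_{r=1}^{R} \left(\hat{\theta}_r - \hat{\theta}\right)^2$$

where $\hat{\theta}_r$ is the estimate computed with the $r$-th replicate weight. With $R = 60$, the leading factor is $59/60$, and `mse = TRUE` indicates that the squared deviations are centered on the full-sample estimate $\hat{\theta}$.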
\index{Functions in srvyr!as\_survey\_rep|(}We can use the following code to create the survey design object: \index{as\_survey\_rep|see {Functions in srvyr}} \index{Replicate weights!Jackknife}
258 | 
259 | ```{r}
260 | #| label: setup-recs-des
261 | 
262 | recs_des <- recs_2020 %>%
263 |   as_survey_rep(
264 |     weights = NWEIGHT,
265 |     repweights = NWEIGHT1:NWEIGHT60,
266 |     type = "JK1",
267 |     scale = 59 / 60,
268 |     mse = TRUE
269 |   )
270 | 
271 | recs_des
272 | ```
273 | 
274 | Viewing this new object provides information about the survey design; we can see that RECS is an "Unstratified cluster jacknife (JK1) with 60 replicates and MSE variances." Additionally, the output shows the sampling variables (`NWEIGHT1`-`NWEIGHT60`) and then lists the remaining variables in the dataset. This design object is used throughout this book to conduct survey analysis. \index{Functions in srvyr!as\_survey\_rep|)} \index{Replicate weights|)} \index{Residential Energy Consumption Survey (RECS)|)}
275 | 
276 | ## Survey analysis process {#survey-analysis-process}
277 | 
278 | \index{Survey analysis process|(}
279 | 
280 | There is a general process for analyzing data to create estimates with the {srvyr} package:
281 | 
282 | 1. Create a `tbl_svy` object (a survey object) using: `as_survey_design()` or `as_survey_rep()`
283 | 
284 | 2. Subset data (if needed) using `filter()` (to create subpopulations)
285 | 
286 | 3. Specify domains of analysis using `group_by()`
287 | 
288 | 4. Within `summarize()`, specify variables to calculate, including means, totals, proportions, quantiles, and more
289 | 
290 | In Section \@ref(setup-des-obj), we follow Step 1 to create the survey design objects for the ANES and RECS data featured in this book. Additional details on how to create design objects can be found in Chapter \@ref(c10-sample-designs-replicate-weights). Then, once we have the design object, we can filter the data to any subpopulation of interest (if needed). It is important to filter the data after creating the design object. This ensures that we are accurately accounting for the survey design in our calculations. Finally, we can use `group_by()`, `summarize()`, and other functions from the {survey} and {srvyr} packages to analyze the survey data by estimating means, totals, and so on.
291 | 
292 | \index{Survey analysis process|)}\index{Design object|)}
293 | 
294 | ## Similarities between {dplyr} and {srvyr} functions {#similarities-dplyr-srvyr}
295 | 
296 | The {dplyr} package from the tidyverse offers flexible and intuitive functions for data wrangling [@R-dplyr]. One of the major advantages of using {srvyr} is that it applies {dplyr}-like syntax to the {survey} package [@R-srvyr]. We can use pipes, such as `%>%` from the {magrittr} package, to specify a survey design object, apply a function, and then feed that output into the next function's first argument [@R-magrittr]. Functions follow the 'tidy' convention of snake_case function names.
297 | 
298 | To help explain the similarities between {dplyr} functions and {srvyr} functions, we use the `towny` dataset from the {gt} package and the `apistrat` data that comes with the {survey} package. The `towny` dataset provides population data for municipalities in Ontario, Canada for census years between 1996 and 2021. Taking a look at `towny` with `dplyr::glimpse()`, we can see the dataset has `r ncol(towny)` columns with a mix of character and numeric data.
299 | 
300 | ```{r}
301 | #| label: setup-towny-surveydata
302 | towny %>%
303 |   glimpse()
304 | ```
305 | 
306 | Let's examine the `towny` object's class. We verify that it is a tibble, as indicated by `"tbl_df"`, by running the code below:
307 | 
308 | ```{r}
309 | #| label: setup-towny-class
310 | class(towny)
311 | ```
312 | 
313 | All tibbles are data.frames, but not all data.frames are tibbles. Compared to data.frames, tibbles have some advantages, with the printing behavior being a noticeable one. When working with tidyverse-style code, we recommend making all your datasets tibbles for ease of analysis.
314 | 
315 | The {survey} package contains datasets related to the California Academic Performance Index, which measures student performance in schools with at least 100 students in California. We can access these datasets by loading the {survey} package and running `data(api)`.
316 | 
317 | \index{Stratified sampling|(} \index{Functions in srvyr!as\_survey\_design|(}
318 | Let's work with the `apistrat` dataset, which is a stratified random sample, stratified by school type (`stype`) with three levels: `E` for elementary school, `M` for middle school, and `H` for high school. We first create the survey design object (see Chapter \@ref(c10-sample-designs-replicate-weights) for more information). The sample is stratified by the `stype` variable, and the sampling weights are found in the `pw` variable. We can use this information to construct the design object, `apistrat_des`. \index{Stratified sampling|)}
319 | 
320 | ```{r}
321 | #| label: setup-api-surveydata
322 | data(api)
323 | 
324 | apistrat_des <- apistrat %>%
325 |   as_survey_design(strata = stype,
326 |                    weights = pw)
327 | ```
328 | 
329 | When we check the class of `apistrat_des`, it is not a typical `data.frame`. Applying the `as_survey_design()` function transforms the data into a `tbl_svy`, a special class specifically for survey design objects. The {srvyr} package is designed to work with the `tbl_svy` class of objects. \index{Functions in srvyr!as\_survey\_design|)}
330 | 
331 | ```{r}
332 | #| label: setup-api-class
333 | class(apistrat_des)
334 | ```
335 | 
336 | Let's look at how {dplyr} works with regular data frames. The example below calculates the mean and median for the `land_area_km2` variable in the `towny` dataset.
337 | 
338 | ```{r}
339 | #| label: setup-dplyr-examp
340 | towny %>%
341 |   summarize(area_mean = mean(land_area_km2),
342 |             area_median = median(land_area_km2))
343 | ```
344 | 
345 | In the code below, we calculate the mean and median of the variable `api00` using `apistrat_des`. Note the similarity in the syntax; however, with the survey design object, the standard error of each statistic is calculated in addition to the statistic itself. \index{Functions in srvyr!survey\_mean|(} \index{Functions in srvyr!summarize|(} \index{summarize|see {Functions in srvyr}} \index{survey\_mean|see {Functions in srvyr}}
346 | 
347 | ```{r}
348 | #| label: setup-srvyr-examp
349 | apistrat_des %>%
350 |   summarize(api00_mean = survey_mean(api00),
351 |             api00_med = survey_median(api00))
352 | ```
353 | 
354 | The functions in {srvyr} also play nicely with other tidyverse functions. For example, if we want to select columns with shared characteristics, we can use {tidyselect} functions such as `starts_with()`, `num_range()`, etc. [@R-tidyselect]. 
In the examples below, we use a combination of `across()` and `starts_with()` to calculate the mean of variables starting with "population" in the `towny` data frame and those beginning with `api` in the `apistrat_des` survey object. \index{Functions in srvyr!summarize|)} 355 | 356 | ```{r} 357 | #| label: setup-dplyr-select 358 | towny %>% 359 | summarize(across(starts_with("population"), 360 | ~mean(.x, na.rm = TRUE))) 361 | ``` 362 | 363 | ```{r} 364 | #| label: setup-srvyr-select 365 | apistrat_des %>% 366 | summarize(across(starts_with("api"), 367 | survey_mean)) 368 | ``` 369 | \index{Functions in srvyr!survey\_mean|)} 370 | 371 | We have the flexibility to use {dplyr} verbs such as `mutate()`, `filter()`, and `select()` on our survey design object. As mentioned in Section \@ref(survey-analysis-process), these steps should be performed on the survey design object. This ensures our survey design is properly considered in all our calculations. 372 | 373 | ```{r} 374 | #| label: setup-srvyr-mutate 375 | apistrat_des_mod <- apistrat_des %>% 376 | mutate(api_diff = api00 - api99) %>% 377 | filter(stype == "E") %>% 378 | select(stype, api99, api00, api_diff, api_students = api.stu) 379 | 380 | apistrat_des_mod 381 | 382 | apistrat_des 383 | ``` 384 | 385 | Several functions in {srvyr} must be called within `srvyr::summarize()`, with the exception of \index{Functions in srvyr!survey\_count}`srvyr::survey_count()` and \index{Functions in srvyr!survey\_tally}`srvyr::survey_tally()`. This is similar to how `dplyr::count()` and `dplyr::tally()` are not called within `dplyr::summarize()`. The `summarize()` function can be used in conjunction with the `group_by()` function or `by/.by` arguments, which applies the functions on a group-by-group basis to create grouped summaries. \index{survey\_count|see {Functions in srvyr}} 386 | 387 | ```{r} 388 | #| label: setup-dplyr-groupby 389 | towny %>% 390 | group_by(csd_type) %>% 391 | dplyr::summarize(area_mean = mean(land_area_km2), 392 | area_median = median(land_area_km2)) 393 | ``` 394 | 395 | We use a similar setup to summarize data in {srvyr}: \index{Functions in srvyr!survey\_mean|(} \index{Functions in srvyr!survey\_median|(} \index{Functions in srvyr!summarize|(} \index{survey\_median|see {Functions in srvyr}} 396 | 397 | ```{r} 398 | #| label: setup-srvyr-groupby 399 | apistrat_des %>% 400 | group_by(stype) %>% 401 | summarize(api00_mean = survey_mean(api00), 402 | api00_median = survey_median(api00)) 403 | ``` 404 | 405 | An alternative way to do grouped analysis on the `towny` data would be with the `.by` argument: 406 | 407 | ```{r} 408 | #| label: setup-dplyr-by-alt 409 | towny %>% 410 | dplyr::summarize(area_mean = mean(land_area_km2), 411 | area_median = median(land_area_km2), 412 | .by=csd_type) 413 | ``` 414 | 415 | The `.by` syntax is similarly implemented in {srvyr} for grouped analysis: 416 | 417 | ```{r} 418 | #| label: setup-srvyr-by-alt 419 | apistrat_des %>% 420 | summarize(api00_mean = survey_mean(api00), 421 | api00_median = survey_median(api00), 422 | .by = stype) 423 | ``` 424 | \index{Functions in srvyr!survey\_median|)} 425 | 426 | As mentioned above, {srvyr} functions are meant for `tbl_svy` objects. Attempting to manipulate data on non-`tbl_svy` objects, like the `towny` example shown below, results in an error. Running the code lets us know what the issue is: `Survey context not set`. 
427 | 428 | ```{r} 429 | #| label: setup-nsobj-error 430 | #| error: true 431 | towny %>% 432 | summarize(area_mean = survey_mean(land_area_km2)) 433 | ``` 434 | \index{Functions in srvyr!survey\_mean|)} 435 | 436 | A few functions in {srvyr} have counterparts in {dplyr}, such as `srvyr::summarize()` and `srvyr::group_by()`. Unlike {srvyr}-specific verbs, {srvyr} recognizes these parallel functions if applied to a non-survey object. Instead of causing an error, the package provides the equivalent output from {dplyr}: 437 | 438 | ```{r} 439 | #| label: setup-nsobj-noerr 440 | towny %>% 441 | srvyr::summarize(area_mean = mean(land_area_km2)) 442 | ``` 443 | 444 | Because this book focuses on survey analysis, most of our pipes stem from a survey object. When we load the {dplyr} and {srvyr} packages, the functions automatically figure out the class of data and use the appropriate one from {dplyr} or {srvyr}. Therefore, we do not need to include the namespace for each function (e.g., `srvyr::summarize()`). \index{Functions in srvyr!summarize|)} 445 | -------------------------------------------------------------------------------- /09-reproducible-data.Rmd: -------------------------------------------------------------------------------- 1 | # Reproducible research {#c09-reprex-data} 2 | 3 | ```{r} 4 | #| label: reprex-styler 5 | #| include: false 6 | #| message: false 7 | knitr::opts_chunk$set(tidy = 'styler') 8 | ``` 9 | 10 | ## Introduction 11 | 12 | Reproducing results is an important aspect of any research. First, reproducibility serves as a form of quality assurance. If we pass an analysis project to another person, they should be able to run the entire project from start to finish and obtain the same results. They can critically assess the methodology and code while detecting potential errors. Another goal of reproducibility is enabling the verification of our analysis. When someone else is able to check our results, it ensures the integrity of the analyses by determining that the conclusions are not dependent on a particular person running the code or workflow on a particular day or in a particular environment. 13 | 14 | Not only is reproducibility a key component in ethical and accurate research, but it is also a requirement for many scientific journals. For example, the *Journal of Survey Statistics and Methodology* (JSSAM) and *Public Opinion Quarterly* (POQ) require authors to make code, data, and methodology transparent and accessible to other researchers who wish to verify or build on existing work. 15 | 16 | Reproducible research requires that the key components of analysis are available, discoverable, documented, and shared with others. 
The four main components that we should consider are:
17 | 
18 | - Code: source code used for data cleaning, analysis, modeling, and reporting
19 | - Data: raw data used in the workflow, or if data are sensitive or proprietary, as much data as possible that would allow others to run our workflow or provide details on how to access the data (e.g., access to a restricted use file (RUF))
20 | - Environment: environment of the project, including the R version, packages, operating system, and other dependencies used in the analysis
21 | - Methodology: survey and analysis methodology, including rationale behind sample, questionnaire, and analysis decisions, interpretations, and assumptions
22 | 
23 | In Chapter \@ref(c08-communicating-results), we briefly mention how each of these is important to include in the methodology report and when communicating the findings of a study. However, to be transparent and effective analysts, we need to ensure we not only discuss these through text but also provide files and additional information when requested. Often, when starting a project, we may be eager to jump into the data and make decisions as we go without full documentation. This can be challenging if we need to go back and make changes or even understand what we did a few months ago. It benefits other analysts and potentially our future selves to document everything from the start. The good news is that many tools, practices, and project management techniques make survey analysis projects easy to reproduce. For best results, we should decide which techniques and tools to use before starting a project (or very early on).
24 | 
25 | This chapter covers some of our suggestions for tools and techniques we can use in projects. This list is not comprehensive but aims to provide a starting point for those looking to create a reproducible workflow.
26 | 
27 | ## Project-based workflows
28 | 
29 | \index{R projects|(}
30 | We recommend a project-based workflow for analysis projects as described by @wickham2023r4ds. A project-based workflow maintains a "source of truth" for our analyses. It helps with file system discipline by putting everything related to a project in a designated folder. Since all associated files are in a single location, they are easy to find and organize. When we reopen the project, we can recreate the environment in which we originally ran the code to reproduce our results.
31 | 
32 | The RStudio IDE has built-in support for projects. When we create a project in RStudio, it creates an `.Rproj` file that stores settings specific to that project. Once we have created a project, we can create folders that help us organize our workflow. For example, a project directory could look like this:
33 | 
34 | ```
35 | | anes_analysis/
36 | |   anes_analysis.Rproj
37 | |   README.md
38 | |   codebooks
39 | |     codebook2020.pdf
40 | |     codebook2016.pdf
41 | |   rawdata
42 | |     anes2020_raw.csv
43 | |     anes2016_raw.csv
44 | |   scripts
45 | |     data-prep.R
46 | |   data
47 | |     anes2020_clean.csv
48 | |     anes2016_clean.csv
49 | |   report
50 | |     anes_report.Rmd
51 | |     anes_report.html
52 | |     anes_report.pdf
53 | ```
54 | 
55 | \index{here package|(}
56 | In a project-based workflow, all paths are relative and, by default, relative to the folder the `.Rproj` file is located in. By using relative paths, others can open and run our files even if their directory configuration differs from ours (e.g., Mac and Windows users have different directory path structures). 
The {here} package enables easy file referencing, and we can start by using the `here::here()` function to build the path for loading or saving data [@R-here]. Below, we ask R to read the CSV file `anes2020_clean.csv` in the project directory's `data` folder:
57 | 
58 | ```{r}
59 | #| label: reprex-project-file-example
60 | #| eval: false
61 | anes <-
62 |   read_csv(here::here("data", "anes2020_clean.csv"))
63 | ```
64 | 
65 | The combination of projects and the {here} package keeps all associated files organized. This workflow makes it more likely that our analyses can be reproduced by us or our colleagues.
66 | \index{here package|)} \index{R projects|)}
67 | 
68 | ## Functions and packages
69 | 
70 | We may find that we are repeating ourselves in our script, and the chance of errors increases whenever we copy and paste our code. By creating a function, we build a consistent set of commands that reduces the likelihood of mistakes. Functions also organize our code, improve the code readability, and allow others to execute the same commands. For example, in Chapter \@ref(c13-ncvs-vignette), we create a function to run sequences of `rename()`, `filter()`, `group_by()`, and `summarize()` statements across different variables. Creating functions helps us avoid overlooking necessary steps.
71 | 
72 | A package is made up of a collection of functions. If we find ourselves sharing functions with others to replicate the same series of commands in a separate project, creating a package can be a useful tool for sharing the code along with data and documentation.
73 | 
74 | ## Version control with Git
75 | 
76 | \index{Version control|(} \index{Git| see {Version control }}
77 | Often, a survey analysis project produces a lot of code. Keeping track of the latest version can become challenging, as files evolve throughout a project. If a team of analysts is working on the same script, someone may use an outdated version, resulting in incorrect results or redundant work.
78 | 
79 | Version control systems like Git can help alleviate these pains. Git is a system that tracks changes in files. We can use Git to follow code evolution and manage asynchronous work. With Git, it is easy to see any changes made in a script, revert changes, and resolve differences between code versions (called conflicts).
80 | 
81 | Services such as GitHub or GitLab provide hosting and sharing of files as well as version control with Git. For example, we can visit the [GitHub repository for this book](https://github.com/tidy-survey-r/tidy-survey-book) and see the files that build the book, when they were committed to the repository, and the history of modifications over time.
82 | 
83 | In addition to code scripts, platforms like GitHub can store data and documentation. They provide a way to maintain a history of data modifications through versioning and timestamps. By saving the data and documentation alongside the code, it becomes easier for others to refer to and access everything they need in one place.
84 | 
85 | Using version control in analysis projects makes collaboration and maintenance more manageable. To connect Git with R, we recommend referencing the book [Happy Git and GitHub for the useR](https://happygitwithr.com/) [@git-w-R].
86 | 
87 | \index{Version control|)}
88 | 
89 | ## Package management with {renv}
90 | 
91 | \index{renv package|(} \index{Package management|see {renv package}}
92 | Ensuring reproducibility involves not only using version control of code but also managing the versions of packages. 
If two people run the same code but use different package versions, the results might differ because of changes to those packages. For example, this book currently uses a version of the {srvyr} package from GitHub and not from CRAN. This is because the version of {srvyr} on CRAN has some bugs (errors) that result in incorrect calculations. The version on GitHub has corrected these errors, so we have asked readers to install the GitHub version to obtain the same results.
93 | 
94 | One way to handle different package versions is with the {renv} package. This package allows researchers to set the versions for each used package and manage package dependencies. Specifically, {renv} creates isolated, project-specific environments that record the packages and their versions used in the code. When initiated by a new user, {renv} checks whether the installed packages are consistent with the recorded versions for the project. If not, it installs the appropriate versions so that others can replicate the project's environment to rerun the code and obtain consistent results [@R-renv].
95 | 
96 | \index{renv package|)}
97 | 
98 | ## R environments with Docker
99 | 
100 | \index{Environment management|(} \index{Docker|see {Environment management}}
101 | Just as different versions of packages can introduce discrepancies or compatibility issues, the version of R can also prevent reproducibility. Tools such as Docker can help with this potential issue by creating isolated environments that define the version of R being used, along with other dependencies and configurations. The entire environment is bundled in a container. The container, defined by a Dockerfile, can be shared so that anybody, regardless of their local setup, can run the R code in the same environment.
102 | \index{Environment management|)}
103 | 
104 | ## Workflow management with {targets}
105 | 
106 | With complex studies involving multiple code files and dependencies, it is important to ensure each step is executed in the intended sequence. We can do this manually, e.g., by numbering files to indicate the order or providing detailed documentation on the order. Alternatively, we can automate the process so the code flows sequentially. Making sure that the code runs in the correct order helps ensure that the research is reproducible. Anyone should be able to pick up the set of scripts and get the same results by following the workflow.
107 | 
108 | The {targets} package is an increasingly popular workflow manager that documents, automates, and executes complex data workflows with multiple steps and dependencies. With this package, we first define the order of execution for our code, and then it consistently executes the code in that order each time it is run. One beneficial feature of {targets} is that when a script changes later in the workflow, only the affected code and its downstream targets (i.e., the subsequent code files) are re-executed. The {targets} package also provides interactive progress monitoring and reporting, allowing us to track the status and progress of our analysis pipeline [@targetslandau].
109 | 
110 | ## Documentation with Quarto and R Markdown
111 | 
112 | \index{R Markdown|(} \index{Quarto|(}
113 | Tools like Quarto and R Markdown aid in reproducibility by creating documents that weave together code, text, and results. We can present analysis results alongside the report's narrative, so there is no need to copy and paste code output into the final documentation. 
By eliminating manual steps, we can reduce the chances of errors in the final output.
114 | 
115 | Quarto and R Markdown documents also allow users to re-execute the underlying code when needed. Another analyst can see the steps we took, follow the scripts, and recreate the report. We can include details about our work in one place thanks to the combination of text and code, making our work transparent and easier to verify [@R-quarto; @rmarkdown2020man].
116 | 
117 | ### Parameterization
118 | 
119 | Another useful feature of Quarto and R Markdown is the ability to reduce repetitive code by parameterizing the files. Parameters can control various aspects of the analysis, such as dates, geography, or other analysis variables. We can define and modify these parameters to explore different scenarios or inputs. For example, suppose we start by creating a document that provides survey analysis results for North Carolina but then later decide we want to look at another state. In that case, we can define a `state` parameter and rerun the same analysis for a state like Washington without having to edit the code throughout the document.
120 | 
121 | Parameters can be defined in the header or code chunks of our Quarto or R Markdown documents and are easily modified and documented. By updating a parameter instead of manually editing code throughout the script, we reduce the errors that manual edits may introduce and offer a flexible way for others to replicate the analysis and explore variations.
122 | 
123 | \index{R Markdown|)} \index{Quarto|)}
124 | 
125 | ## Other tips for reproducibility
126 | 
127 | ### Random number seeds
128 | 
129 | Some tasks in survey analysis require randomness, such as imputation\index{Imputation}, model training, or creating random samples. By default, the random numbers generated by R change each time we rerun the code, making it difficult to reproduce the same results. By "setting the seed," we can control the randomness and ensure that the random numbers remain consistent whenever we rerun the code. Others can use the same seed value to reproduce our random numbers and achieve the same results.
130 | 
131 | In R, we can use the `set.seed()` function to control the randomness in our code. We set a seed value by providing an integer in the function argument. The following code chunk sets a seed using `999`, then runs a random number function (`runif()`) to get five random numbers from a uniform distribution.
132 | 
133 | ```{r}
134 | #| label: reprex-set-seed
135 | set.seed(999)
136 | runif(5)
137 | ```
138 | 
139 | Since the seed is set to `999`, rerunning this code chunk always produces the same output. The choice of the seed number is up to the analyst. For example, this could be the date (`20240102`) or time of day (`1056`) when the analysis was first conducted, a phone number (`8675309`), or the first few numbers that come to mind (`369`). As long as the seed is set for a given analysis, the actual number is up to the analyst to decide. It is important to note that `set.seed()` should be used before random number generation. Run it once per program, and the seed is applied to the entire script. We recommend setting the seed at the beginning of a script, where libraries are loaded.
140 | 
141 | ### Descriptive names and labels
142 | 
143 | \index{American National Election Studies (ANES)|(}
144 | Using descriptive variable names or labeling data can also assist with reproducible research. For example, in the ANES data, the variable names in the raw data all start with `V20`, followed by a string of numbers. 
To make things easier to reproduce in this book, we opted to change the variable names to be more descriptive of what they contained (e.g., `Age`).\index{American National Election Studies (ANES)|)} This can also be done with the data values themselves. \index{Categorical data|(}\index{Factor|(}One way to accomplish this is by creating factors for categorical data, which can ensure that we know that a value of `1` really means `Female`, for example.\index{Factor|)} There are other ways of handling this, such as attaching labels to the data instead of recoding variables to be descriptive (see Chapter \@ref(c11-missing-data)). \index{Categorical data|)} As with random number seeds, the exact method is up to the analyst, but providing this information can help ensure our research is reproducible. 145 | 146 | ## Additional resources 147 | 148 | We can promote accuracy and verification of results by making our analysis reproducible. There are various tools and guides available to help achieve reproducibility in analysis work, a few of which were described in this chapter. Here are additional resources to explore: 149 | 150 | * [R for Data Science chapter on project-based workflows](https://r4ds.hadley.nz/workflow-scripts.html#projects) 151 | * [Building reproducible analytical pipelines with R](https://raps-with-r.dev/) 152 | * [Posit Solutions Site page on reproducible environments](https://solutions.posit.co/envs-pkgs/environments/) 153 | -------------------------------------------------------------------------------- /11-missing-data.Rmd: -------------------------------------------------------------------------------- 1 | # Missing data {#c11-missing-data} 2 | 3 | \index{Missing data|(} 4 | ```{r} 5 | #| label: missing-styler 6 | #| include: false 7 | knitr::opts_chunk$set(tidy = 'styler') 8 | ``` 9 | 10 | ::: {.prereqbox-header} 11 | `r if (knitr:::is_html_output()) '### Prerequisites {- #prereq11}'` 12 | ::: 13 | 14 | ::: {.prereqbox data-latex="{Prerequisites}"} 15 | For this chapter, load the following packages: 16 | ```{r} 17 | #| label: missing-setup 18 | #| error: FALSE 19 | #| warning: FALSE 20 | #| message: FALSE 21 | library(tidyverse) 22 | library(survey) 23 | library(srvyr) 24 | library(srvyrexploR) 25 | library(naniar) 26 | library(haven) 27 | library(gt) 28 | ``` 29 | 30 | 31 | We are using data from ANES and RECS described in Chapter \@ref(c04-getting-started). As a reminder, here is the code to create the design objects for each to use throughout this chapter. For ANES, we need to adjust the weight so it sums to the population instead of the sample (see the ANES documentation and Chapter \@ref(c04-getting-started) for more information). 32 | 33 | ```{r} 34 | #| label: missing-anes-des 35 | #| eval: FALSE 36 | targetpop <- 231592693 37 | 38 | anes_adjwgt <- anes_2020 %>% 39 | mutate(Weight = Weight / sum(Weight) * targetpop) 40 | 41 | anes_des <- anes_adjwgt %>% 42 | as_survey_design( 43 | weights = Weight, 44 | strata = Stratum, 45 | ids = VarUnit, 46 | nest = TRUE 47 | ) 48 | ``` 49 | 50 | For RECS, details are included in the RECS documentation and Chapter \@ref(c10-sample-designs-replicate-weights). 
51 | 52 | ```{r} 53 | #| label: missing-recs-des 54 | #| eval: FALSE 55 | recs_des <- recs_2020 %>% 56 | as_survey_rep( 57 | weights = NWEIGHT, 58 | repweights = NWEIGHT1:NWEIGHT60, 59 | type = "JK1", 60 | scale = 59/60, 61 | mse = TRUE 62 | ) 63 | ``` 64 | ::: 65 | 66 | ## Introduction 67 | 68 | Missing data in surveys refer to situations where participants do not provide complete responses to survey questions. Respondents may not have seen a question by design. Or, they may not respond to a question for various other reasons, such as not wanting to answer a particular question, not understanding the question, or simply forgetting to answer. Missing data are important to consider and account for, as they can introduce bias and reduce the representativeness of the data. This chapter provides an overview of the types of missing data, how to assess missing data in surveys, and how to conduct analysis when missing data are present. Understanding this complex topic can help ensure accurate reporting of survey results and provide insight into potential changes to the survey design for the future. 69 | 70 | ## Missing data mechanisms 71 | 72 | \index{Item nonresponse|(}There are two main categories that missing data typically fall into: missing by design and unintentional missing data. Missing by design is part of the survey plan and can be more easily incorporated into weights and analyses. Unintentional missing data, on the other hand, can lead to bias in survey estimates if not correctly accounted for. Below we provide more information on the types of missing data. 73 | 74 | 1. Missing by design/questionnaire skip logic: This type of missingness occurs when certain respondents are intentionally directed to skip specific questions based on their previous responses or characteristics. For example, in a survey about employment, if a respondent indicates that they are not employed, they may be directed to skip questions related to their job responsibilities. Additionally, some surveys randomize questions or modules so that not all participants respond to all questions. In these instances, respondents would have missing data for the modules not randomly assigned to them. 75 | 76 | 2. Unintentional missing data: This type of missingness occurs when researchers do not intend for there to be missing data on a particular question, for example, if respondents did not finish the survey or refused to answer individual questions. There are three main types of unintentional missing data that each should be considered and handled differently [@mack; @Schafer2002]: 77 | 78 | a. Missing completely at random (MCAR): The missing data are unrelated to both observed and unobserved data, and the probability of being missing is the same across all cases. For example, if a respondent missed a question because they had to leave the survey early due to an emergency. 79 | 80 | b. Missing at random (MAR): The missing data are related to observed data but not unobserved data, and the probability of being missing is the same within groups. For example, we know the respondents' ages and older respondents choose not to answer specific questions but younger respondents do answer them. 81 | 82 | c. Missing not at random (MNAR): The missing data are related to unobserved data, and the probability of being missing varies for reasons we are not measuring. For example, if respondents with depression do not answer a question about depression severity. 
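
To make these three mechanisms concrete, the following is a small simulated sketch of how each type of unintentional missingness could arise. The data and variable names here are invented purely for illustration and do not come from any survey used in this book.

```{r}
#| label: missing-mechanisms-sim
set.seed(52)

toy_miss <- tibble(
  age = sample(18:80, size = 100, replace = TRUE),
  severity = rnorm(100, mean = 10, sd = 3)
) %>%
  mutate(
    # MCAR: every case has the same 10% chance of being missing,
    # regardless of age or severity
    severity_mcar = if_else(runif(100) < 0.1, NA_real_, severity),
    # MAR: missingness depends only on the observed age variable
    severity_mar = if_else(age > 65 & runif(100) < 0.4, NA_real_, severity),
    # MNAR: missingness depends on the unobserved value itself
    severity_mnar = if_else(severity > 13 & runif(100) < 0.5, NA_real_, severity)
  )

# Count how many values each mechanism set to missing
toy_miss %>%
  summarize(across(starts_with("severity_"), ~ sum(is.na(.x))))
```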
83 | 
84 | 
85 | ## Assessing missing data
86 | 
87 | Before beginning an analysis, we should explore the data to determine if there are missing data and what types of missing data are present. Conducting descriptive analysis can help with the analysis and reporting of survey data and can inform the survey design in future studies. For example, large amounts of unexpected missing data may indicate the questions were unclear or difficult to recall. There are several ways to explore missing data, which we walk through below. When assessing the missing data, we recommend using a data.frame object and not the survey object, as most of the analysis is about patterns of records, and weights are not necessary.
88 | 
89 | ### Summarize data
90 | 
91 | \index{American National Election Studies (ANES)|(}
92 | A very rudimentary first exploration is to use the `summary()` function to summarize the data, which highlights `NA` values in the data. Let's look at a few analytic variables on the ANES 2020 data using `summary()`:
93 | 
94 | ```{r}
95 | #| label: missing-anes-summary
96 | 
97 | anes_2020 %>%
98 |   select(V202051:EarlyVote2020) %>%
99 |   summary()
100 | ```
101 | 
102 | We see that there are `NA` values in several of the derived variables (those not beginning with "V") and negative values in the original variables (those beginning with "V"). We can also use the `count()` function to get an understanding of the different types of missing data on the original variables. For example, let's look at the count of data for `V202072`, which corresponds to our `VotedPres2020` variable.
103 | 
104 | ```{r}
105 | #| label: missing-anes-count
106 | 
107 | anes_2020 %>%
108 |   count(VotedPres2020, V202072)
109 | ```
110 | 
111 | Here, we can see that there are three types of missing data, and the majority of them fall under the "Inapplicable" category. This is usually a term associated with data missing due to skip patterns and is considered to be missing data by design. Based on the documentation from ANES [@debell], we can see that this question was only asked of respondents who voted in the election.
112 | 
113 | ### Visualization of missing data
114 | 
115 | It can be challenging to look at tables for every variable; instead, it may be more efficient to view missing data in a graphical format to help narrow in on patterns or unique variables. The {naniar} package is very useful in exploring missing data visually. We can use the `vis_miss()` function available in both {visdat} and {naniar} packages to view the amount of missing data by variable (see Figure \@ref(fig:missing-anes-vismiss)) [@visdattierney; @naniar2023].
116 | 
117 | ```{r}
118 | #| label: missing-anes-vismiss
119 | #| warning: FALSE
120 | #| message: FALSE
121 | #| fig.cap: "Visual depiction of missing data in the ANES 2020 data"
122 | #| fig.alt: "This chart shows the missingness of the selected variables where missing is highlighted in a dark color. Each row of the plot is an observation and each column is a variable. There are some patterns observed such as a large block of missing for `VotedPres2016_selection` and many of the same respondents also having missing for `VotedPres2020_selection`."
123 | 
124 | anes_2020_derived <- anes_2020 %>%
125 |   select(
126 |     -starts_with("V2"), -CaseID, -InterviewMode,
127 |     -Weight, -Stratum, -VarUnit)
128 | 
129 | anes_2020_derived %>%
130 |   vis_miss(cluster = TRUE, show_perc = FALSE) +
131 |   scale_fill_manual(values = book_colors[c(3, 1)],
132 |                     labels = c("Present", "Missing"),
133 |                     name = "") +
134 |   theme(
135 |     plot.margin = margin(5.5, 30, 5.5, 5.5, "pt"),
136 |     axis.text.x = element_text(angle = 70))
137 | 
138 | ```
139 | 
140 | From the visualization in Figure \@ref(fig:missing-anes-vismiss), we can start to get a picture of what questions may be connected in terms of missing data. Even if we did not have the informative variable names, we could deduce that `VotedPres2020`, `VotedPres2020_selection`, and `EarlyVote2020` are likely connected since their missing data patterns are similar.
141 | 
142 | We can also look at `VotedPres2016_selection` and see that there are a lot of missing data in that variable. The missing data are likely due to a skip pattern, and we can look at other graphics to see how they relate to other variables. The {naniar} package has multiple visualization functions that can help dive deeper, such as the `gg_miss_fct()` function, which looks at missing data for all variables by levels of another variable (see Figure \@ref(fig:missing-anes-ggmissfct)).
143 | 
144 | ```{r}
145 | #| label: missing-anes-ggmissfct
146 | #| warning: FALSE
147 | #| message: FALSE
148 | #| fig.cap: Missingness in variables for each level of 'VotedPres2016,' in the ANES 2020 data
149 | #| fig.alt: "This chart has x-axis 'Voted for President in 2016' with labels Yes, No and NA and has y-axis 'Variable' with labels Age, AgeGroup, CampaignInterest, EarlyVote2020, Education, Gender, Income, Income7, PartyID, RaceEth, TrustGovernment, TrustPeople, VotedPres2016_selection, VotedPres2020 and VotedPres2020_selection. There is a legend indicating fill is used to show pct_miss, ranging from 0 represented by fill very pale blue to 100 shown as fill dark blue. Among those that voted for president in 2016, they had little missing for other variables (light color) but those that did not vote have more missing data in their 2020 voting patterns and their 2016 president selection."
150 | 
151 | anes_2020_derived %>%
152 |   gg_miss_fct(VotedPres2016) +
153 |   scale_fill_gradientn(
154 |     guide = "colorbar",
155 |     name = "% Miss",
156 |     colors = book_colors[c(3, 2, 1)]
157 |   ) +
158 |   ylab("Variable") +
159 |   xlab("Voted for President in 2016")
160 | ```
161 | 
162 | In Figure \@ref(fig:missing-anes-ggmissfct), we can see that if respondents did not vote for president in 2016 or did not answer that question, then they were not asked about who they voted for in 2016 (the percentage of missing data is 100%). Additionally, we can see with Figure \@ref(fig:missing-anes-ggmissfct) that there are more missing data across all questions if respondents did not provide an answer to `VotedPres2016`.
163 | \index{American National Election Studies (ANES)|)}
164 | 
165 | \index{Residential Energy Consumption Survey (RECS)|(}
166 | There are other visualizations that work well with numeric data. For example, in the RECS 2020 data, we can plot two continuous variables and the missing data associated with them to see if there are any patterns in the missingness. To do this, we can use the `bind_shadow()` function from the {naniar} package. 
This creates a nabular (a combination of "na" with "tabular"), which features the original columns followed by the same number of columns with a specific `NA` format. These `NA` columns are indicators of whether the value in the original data is missing or not. The example printed below shows that where `HeatingBehavior` is not missing, the shadow variable `HeatingBehavior_NA` takes the value `!NA`, and where `HeatingBehavior` is missing, `HeatingBehavior_NA` takes the value `NA`.
167 | 
168 | ```{r}
169 | #| label: missing-recs-shadow
170 | 
171 | recs_2020_shadow <- recs_2020 %>%
172 |   bind_shadow()
173 | 
174 | ncol(recs_2020)
175 | ncol(recs_2020_shadow)
176 | 
177 | recs_2020_shadow %>%
178 |   count(HeatingBehavior, HeatingBehavior_NA)
179 | ```
180 | 
181 | We can then use these new variables to plot the missing data alongside the actual data. For example, let's plot a histogram of the total energy cost, grouped by whether the heating behavior question is missing or not (see Figure \@ref(fig:missing-recs-hist)).
182 | 
183 | ```{r}
184 | #| label: missing-recs-hist
185 | #| fig.cap: "Histogram of energy cost by heating behavior missing data"
186 | #| fig.alt: "This chart has title 'Histogram of Energy Cost by Heating Behavior Missing Data'. It has x-axis 'Total Energy Cost (Truncated at $5000)' with labels 0, 1000, 2000, 3000, 4000 and 5000. It has y-axis 'Number of Households' with labels 0, 500, 1000 and 1500. There is a legend indicating fill is used to show HeatingBehavior_NA, with 2 levels: !NA shown as very pale blue fill and NA shown as dark blue fill. The chart is a bar chart with 30 vertical bars. These are stacked, as sorted by HeatingBehavior_NA."
187 | recs_2020_shadow %>%
188 |   filter(TOTALDOL < 5000) %>%
189 |   ggplot(aes(x = TOTALDOL, fill = HeatingBehavior_NA)) +
190 |   geom_histogram() +
191 |   scale_fill_manual(
192 |     values = book_colors[c(3, 1)],
193 |     labels = c("Present", "Missing"),
194 |     name = "Heating Behavior"
195 |   ) +
196 |   theme_minimal() +
197 |   xlab("Total Energy Cost (Truncated at $5000)") +
198 |   ylab("Number of Households")
199 | ```
200 | 
201 | Figure \@ref(fig:missing-recs-hist) indicates that respondents who did not provide a response for the heating behavior question may have a different distribution of total energy cost compared to respondents who did provide a response. This view of the raw data and missingness could indicate some bias in the data. Researchers take these different bias aspects into account when calculating weights, and we need to make sure that we incorporate the weights when analyzing the data. \index{Residential Energy Consumption Survey (RECS)|)}
202 | 
203 | There are many other visualizations that can be helpful in reviewing the data, and we recommend reviewing the {naniar} documentation for more information [@naniar2023].
204 | 
205 | 
206 | ## Analysis with missing data
207 | 
208 | \index{Imputation|(}
209 | Once we understand the types of missingness, we can begin the analysis of the data. Different missingness types may be handled in different ways. In most publicly available datasets, researchers have already calculated weights and imputed missing values if necessary. Often, there are imputation flags included in the data that indicate if each value in a given variable is imputed. For example, in the RECS data we may see a logical variable of `ZWinterTempNight`, where a value of `TRUE` means that the value of `WinterTempNight` for that respondent was imputed, and `FALSE` means that it was not imputed. 
We may use these imputation flags if we are interested in examining the nonresponse rates in the original data. For those interested in learning more about how to calculate weights and impute data for different missing data mechanisms, we recommend @Kim2021 and @Valliant2018weights.
210 | 
211 | Even with weights and imputation, missing data are most likely still present and need to be accounted for in analysis. This section provides an overview of how to recode missing data in R and how to account for skip patterns in analysis.
212 | \index{Imputation|)}
213 | 
214 | ### Recoding missing data
215 | 
216 | \index{American National Election Studies (ANES)|(}
217 | Even within a variable, there can be different reasons for missing data. In publicly released data, negative values are often present to provide different meanings for values. For example, the ANES 2020 data use the following negative values to represent different types of missing data:
218 | 
219 | 
220 | * --9: Refused
221 | * --8: Don't Know
222 | * --7: No post-election data, deleted due to incomplete interview
223 | * --6: No post-election interview
224 | * --5: Interview breakoff (sufficient partial IW)
225 | * --4: Technical error
226 | * --3: Restricted
227 | * --2: Other missing reason (question specific)
228 | * --1: Inapplicable
229 | 
230 | When we created the derived variables for use in this book, we coded all negative values as `NA` and proceeded to analyze the data. For most cases, this is an appropriate approach as long as we filter the data appropriately to account for skip patterns (see Section \@ref(missing-skip-patt)). However, the {naniar} package does have the option to code special missing values. For example, if we wanted to have two `NA` values, one that indicated the question was missing by design (e.g., due to skip patterns) and one for the other missing categories, we can use the `nabular` format to incorporate these with the `recode_shadow()` function.
231 | 
232 | 
233 | ```{r}
234 | #| label: missing-anes-shadow-recode
235 | 
236 | anes_2020_shadow <- anes_2020 %>%
237 |   select(starts_with("V2")) %>%
238 |   mutate(across(everything(), ~ case_when(.x < -1 ~ NA,
239 |                                           TRUE ~ .x))) %>%
240 |   bind_shadow() %>%
241 |   recode_shadow(V201103 = .where(V201103 == -1 ~ "skip"))
242 | 
243 | anes_2020_shadow %>%
244 |   count(V201103, V201103_NA)
245 | ```
246 | 
247 | However, it is important to note that at the time of publication, there is no easy way to apply `recode_shadow()` to multiple variables at once (e.g., we cannot use the tidyverse feature of `across()`). The example code above only implements this for a single variable, so this would have to be done manually or in a loop for all variables of interest. \index{American National Election Studies (ANES)|)}
248 | 
249 | ### Accounting for skip patterns {#missing-skip-patt}
250 | 
251 | When questions are skipped by design in a survey, the fact that the data are missing is itself meaningful. For example, the RECS asks people how they control the heat in their home in the winter (`HeatingBehavior`). This question is only asked of those who have heat in their home (`SpaceHeatingUsed`). If there is no heating equipment used, the value of `HeatingBehavior` is missing. We have several choices when analyzing these data: (1) only including those with a valid value of `HeatingBehavior` and specifying the universe as those with heat, or (2) including those who do not have heat. It is important to specify what population an analysis generalizes to. 
252 | 
253 | \index{Residential Energy Consumption Survey (RECS)|(}
254 | Here is an example where we only include those with a valid value of `HeatingBehavior` (choice 1). Note that we use the design object (`recs_des`) and then \index{Functions in srvyr!filter|(}filter to those that are not missing on `HeatingBehavior`.\index{Functions in srvyr!survey\_prop} \index{Functions in srvyr!summarize|(}
255 | 
256 | ```{r}
257 | #| label: missing-recs-heatcc
258 | 
259 | heat_cntl_1 <- recs_des %>%
260 |   filter(!is.na(HeatingBehavior)) %>%
261 |   group_by(HeatingBehavior) %>%
262 |   summarize(
263 |     p = survey_prop()
264 |   )
265 | 
266 | heat_cntl_1
267 | ```
268 | \index{Functions in srvyr!filter|)}
269 | 
270 | Here is an example where we include those who do not have heat (choice 2). To help understand what we are looking at, we have included the output to show both variables, `SpaceHeatingUsed` and `HeatingBehavior`. \index{Functions in srvyr!survey\_prop} \index{Functions in srvyr!interact|(}
271 | 
272 | ```{r}
273 | #| label: missing-recs-heatpop
274 | 
275 | heat_cntl_2 <- recs_des %>%
276 |   group_by(interact(SpaceHeatingUsed, HeatingBehavior)) %>%
277 |   summarize(
278 |     p = survey_prop()
279 |   )
280 | 
281 | heat_cntl_2
282 | ```
283 | \index{Functions in srvyr!interact|)} \index{Functions in srvyr!summarize|)}
284 | 
285 | ```{r}
286 | #| label: missing-recs-heattext
287 | #| echo: FALSE
288 | 
289 | pct_1 <- heat_cntl_1 %>%
290 |   filter(str_detect(HeatingBehavior, "Program")) %>%
291 |   mutate(p = round(p * 100, 1)) %>%
292 |   pull(p)
293 | 
294 | pct_2 <- heat_cntl_2 %>%
295 |   filter(str_detect(HeatingBehavior, "Program")) %>%
296 |   mutate(p = round(p * 100, 1)) %>%
297 |   pull(p)
298 | 
299 | ```
300 | 
301 | If we ran the first analysis, we would say that `r pct_1`% of **households with heat** use a programmable or smart thermostat for heating their home. If we used the results from the second analysis, we would say that `r pct_2`% of **households** use a programmable or smart thermostat for heating their home. The distinction between the two statements is made bold for emphasis. Skip patterns often change the universe we are talking about and need to be carefully examined. \index{Residential Energy Consumption Survey (RECS)|)}
302 | 
303 | \index{American National Election Studies (ANES)|(}
304 | Filtering to the correct universe is important when handling these types of missing data. The `nabular` we created above can also help with this. If we have `NA_skip` values in the shadow, we can make sure that we filter out all of these values and only include relevant missing values. To do this with survey data, we could first create the `nabular`, then create the \index{Functions in srvyr!as\_survey\_design|(} design object on that data, and then use the shadow variables to assist with filtering the data. Let's use the `nabular` we created above for ANES 2020 (`anes_2020_shadow`) to create the design object.
305 | 
306 | ```{r}
307 | #| label: missing-anes-shadow-des
308 | #| warning: FALSE
309 | 
310 | anes_adjwgt_shadow <- anes_2020_shadow %>%
311 |   mutate(V200010b = V200010b / sum(V200010b) * targetpop)
312 | 
313 | anes_des_shadow <- anes_adjwgt_shadow %>%
314 |   as_survey_design(
315 |     weights = V200010b,
316 |     strata = V200010d,
317 |     ids = V200010c,
318 |     nest = TRUE
319 |   )
320 | ```
321 | \index{Functions in srvyr!as\_survey\_design|)}
322 | 
323 | Then, we can use this design object to look at the percentage of the population who voted for each candidate in 2016 (`V201103`). 
First, let's look at the percentages without removing any cases: \index{Functions in srvyr!survey\_prop} \index{Functions in srvyr!summarize|(}
324 | 
325 | ```{r}
326 | #| label: missing-anes-shadow-ex1
327 | 
328 | pres16_select1 <- anes_des_shadow %>%
329 |   group_by(V201103) %>%
330 |   summarize(
331 |     All_Missing = survey_prop()
332 |   )
333 | 
334 | pres16_select1
335 | ```
336 | 
337 | Next, we look at the percentages, removing only those missing due to skip patterns (i.e., they did not receive this question). \index{Functions in srvyr!survey\_prop} \index{Functions in srvyr!filter|(}
338 | 
339 | ```{r}
340 | #| label: missing-anes-shadow-ex2
341 | 
342 | pres16_select2 <- anes_des_shadow %>%
343 |   filter(V201103_NA != "NA_skip") %>%
344 |   group_by(V201103) %>%
345 |   summarize(
346 |     No_Skip_Missing = survey_prop()
347 |   )
348 | 
349 | pres16_select2
350 | ```
351 | 
352 | Finally, we look at the percentages, removing all missing values both due to skip patterns and due to those who refused to answer the question. \index{Functions in srvyr!survey\_prop}
353 | 
354 | ```{r}
355 | #| label: missing-anes-shadow-ex3
356 | 
357 | pres16_select3 <- anes_des_shadow %>%
358 |   filter(V201103_NA == "!NA") %>%
359 |   group_by(V201103) %>%
360 |   summarize(
361 |     No_Missing = survey_prop()
362 |   )
363 | 
364 | pres16_select3
365 | ```
366 | \index{Functions in srvyr!filter|)} \index{Functions in srvyr!summarize|)}
367 | 
368 | ```{r}
369 | #| label: missing-anes-shadow-gt
370 | #| echo: FALSE
371 | 
372 | pres16_select_gt <- pres16_select1 %>%
373 |   full_join(pres16_select2, by = "V201103") %>%
374 |   full_join(pres16_select3, by = "V201103") %>%
375 |   mutate(Candidate = case_when(V201103 == -1 ~ "Did Not Vote for President in 2016",
376 |                                V201103 == 1 ~ "Hillary Clinton",
377 |                                V201103 == 2 ~ "Donald Trump",
378 |                                V201103 == 5 ~ "Other Candidate",
379 |                                TRUE ~ "Missing")) %>%
380 |   select(Candidate, everything()) %>%
381 |   select(-V201103) %>%
382 |   gt() %>%
383 |   cols_label(All_Missing = "%",
384 |              All_Missing_se = "s.e. (%)",
385 |              No_Skip_Missing = "%",
386 |              No_Skip_Missing_se = "s.e. (%)",
387 |              No_Missing = "%",
388 |              No_Missing_se = "s.e. 
(%)") %>%
389 |   tab_spanner(label = "Including All Missing Data",
390 |               columns = c(All_Missing, All_Missing_se)) %>%
391 |   tab_spanner(label = "Removing Skip Patterns Only",
392 |               columns = c(No_Skip_Missing, No_Skip_Missing_se)) %>%
393 |   tab_spanner(label = "Removing All Missing Data",
394 |               columns = c(No_Missing, No_Missing_se)) %>%
395 |   fmt_number(decimals = 1, scale_by = 100)
396 | ```
397 | 
398 | (ref:missing-anes-shadow-tab) Percentage of votes by candidate for different missing data inclusions
399 | 
400 | ```{r}
401 | #| label: missing-anes-shadow-tab
402 | #| echo: FALSE
403 | #| warning: FALSE
404 | 
405 | pres16_select_gt %>%
406 |   print_gt_book(knitr::opts_current$get()[["label"]])
407 | ```
408 | 
409 | ```{r}
410 | #| label: missing-anes-shadow-tab-text
411 | #| echo: FALSE
412 | #| warning: FALSE
413 | 
414 | pres16_select1_1 <- pres16_select1 %>%
415 |   filter(V201103 == 1) %>%
416 |   pull(All_Missing)
417 | 
418 | pres16_select1_2 <- pres16_select1 %>%
419 |   filter(V201103 == 2) %>%
420 |   pull(All_Missing)
421 | 
422 | pres16_select1_out <- round(pres16_select1_1 * 100 - pres16_select1_2 * 100, 1)
423 | 
424 | 
425 | pres16_select2_1 <- pres16_select2 %>%
426 |   filter(V201103 == 1) %>%
427 |   pull(No_Skip_Missing)
428 | 
429 | pres16_select2_2 <- pres16_select2 %>%
430 |   filter(V201103 == 2) %>%
431 |   pull(No_Skip_Missing)
432 | 
433 | pres16_select2_out <- round(pres16_select2_1 * 100 - pres16_select2_2 * 100, 1)
434 | 
435 | ```
436 | 
437 | 
438 | As Table \@ref(tab:missing-anes-shadow-tab) shows, the results can vary greatly depending on which types of missing data are removed. If we remove only the skip patterns, the margin between Clinton and Trump is `r pres16_select2_out` percentage points, but if we include all data, even those who did not vote in 2016, the margin is `r pres16_select1_out` percentage points. How we handle the different types of missing values is important for interpreting the data.
439 | \index{Item nonresponse|)} \index{American National Election Studies (ANES)|)}
440 | 
441 | \index{Missing data|)}
442 | 
--------------------------------------------------------------------------------
/12-successful-survey-data-analysis.Rmd:
--------------------------------------------------------------------------------
1 | # Successful survey analysis recommendations {#c12-recommendations}
2 | 
3 | ```{r}
4 | #| label: recommendations-styler
5 | #| include: false
6 | knitr::opts_chunk$set(tidy = 'styler')
7 | ```
8 | 
9 | ::: {.prereqbox-header}
10 | `r if (knitr:::is_html_output()) '### Prerequisites {- #prereq12}'`
11 | :::
12 | 
13 | ::: {.prereqbox data-latex="{Prerequisites}"}
14 | For this chapter, load the following packages:
15 | ```{r}
16 | #| label: recommendations-setup
17 | #| error: FALSE
18 | #| warning: FALSE
19 | #| message: FALSE
20 | library(tidyverse)
21 | library(survey)
22 | library(srvyr)
23 | library(srvyrexploR)
24 | ```
25 | 
26 | To illustrate the importance of data visualization, we discuss Anscombe's Quartet. The dataset can be replicated by running the code below:
27 | 
28 | ```{r}
29 | #| label: recommendations-anscombe-setup
30 | anscombe_tidy <- anscombe %>%
31 |   mutate(obs = row_number()) %>%
32 |   pivot_longer(-obs, names_to = "key", values_to = "value") %>%
33 |   separate(key, c("variable", "set"), 1, convert = TRUE) %>%
34 |   mutate(set = c("I", "II", "III", "IV")[set]) %>%
35 |   pivot_wider(names_from = variable, values_from = value)
36 | ```
37 | 
38 | We create an example survey dataset to explain potential pitfalls and how to overcome them in survey analysis. 
To recreate the dataset, run the code below: 39 | 40 | ```{r} 41 | #| label: recommendations-example-dat 42 | example_srvy <- tribble( 43 | ~id, ~region, ~q_d1, ~q_d2_1, ~gender, ~weight, 44 | 1L, 1L, 1L, "Somewhat interested", "female", 1740, 45 | 2L, 1L, 1L, "Not at all interested", "female", 1428, 46 | 3L, 2L, NA, "Somewhat interested", "female", 496, 47 | 4L, 2L, 1L, "Not at all interested", "female", 550, 48 | 5L, 3L, 1L, "Somewhat interested", "female", 1762, 49 | 6L, 4L, NA, "Very interested", "female", 1004, 50 | 7L, 4L, NA, "Somewhat interested", "female", 522, 51 | 8L, 3L, 2L, "Not at all interested", "female", 1099, 52 | 9L, 4L, 2L, "Somewhat interested", "female", 1295, 53 | 10L, 2L, 2L, "Somewhat interested", "male", 983 54 | ) 55 | 56 | example_des <- 57 | example_srvy %>% 58 | as_survey_design(weights = weight) 59 | ``` 60 | ::: 61 | 62 | ## Introduction 63 | 64 | The previous chapters in this book aimed to provide the technical skills and knowledge required for running survey analyses. This chapter builds upon the previously mentioned best practices to present a curated set of recommendations for running a successful survey analysis. We hope this list provides practical insights that assist in producing meaningful and reliable results. 65 | 66 | ## Follow the survey analysis process {#recs-survey-process} 67 | 68 | \index{Survey analysis process|(}As we first introduced in Chapter \@ref(c04-getting-started), there are four main steps to successfully analyze survey data: 69 | 70 | 1. Create a `tbl_svy` object (a survey object) using: `as_survey_design()` or `as_survey_rep()` 71 | 72 | 2. Subset data (if needed) using `filter()` (to create subpopulations) 73 | 74 | 3. Specify domains of analysis using `group_by()` 75 | 76 | 4. Within `summarize()`, specify variables to calculate, including means, totals, proportions, quantiles, and more 77 | 78 | The order of these steps matters in survey analysis. For example, if we need to subset the data, \index{Functions in srvyr!filter}we must use `filter()` on our data after creating the survey design. If we do this before the survey design is created, we may not be correctly accounting for the study design, resulting in inaccurate findings.\index{Survey analysis process|)} 79 | 80 | Additionally, correctly identifying the survey design is one of the most important steps in survey analysis. Knowing the type of sample design (e.g., clustered, stratified) helps ensure the underlying error structure is correctly calculated and weights are correctly used. Learning about complex design factors such as clustering, stratification, and weighting is foundational to complex survey analysis, and we recommend that all analysts review Chapter \@ref(c10-sample-designs-replicate-weights) before creating their first design object. Reviewing the documentation (see Chapter \@ref(c03-survey-data-documentation)) helps us understand what variables to use from the data. 81 | 82 | Making sure to use the survey analysis functions from the {srvyr} and {survey} packages is also important in survey analysis. For example, using `mean()` and \index{Functions in srvyr!survey\_mean}`survey_mean()` on the same data results in different findings and outputs. Each of the survey functions from {srvyr} and {survey} impacts standard errors and variance, and we cannot treat complex surveys as unweighted simple random samples if we want to produce unbiased estimates [@R-srvyr; @lumley2010complex]. 
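
To see this difference in practice, we can compare `mean()` and `survey_mean()` using the `example_srvy` data and the `example_des` design object created in the Prerequisites box. This is a minimal sketch using `q_d1`, one of the numeric variables in that example dataset; the unweighted calculation ignores the weights entirely, while the design-based calculation uses them and also returns a standard error.

```{r}
#| label: recommendations-mean-compare
# Unweighted mean, treating the data as a simple random sample
example_srvy %>%
  summarize(q_d1_mean = mean(q_d1, na.rm = TRUE))

# Design-based mean, incorporating the weights
example_des %>%
  summarize(q_d1_mean = survey_mean(q_d1, na.rm = TRUE))
```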
83 | 84 | ## Begin with descriptive analysis 85 | 86 | When receiving a fresh batch of data, it is tempting to jump right into running models to find significant results. However, a successful data analyst begins by exploring the dataset. Chapter \@ref(c11-missing-data) talks about the importance of reviewing data when examining missing data patterns. In this chapter, we illustrate the value of reviewing all types of data. This involves running descriptive analysis on the dataset as a whole, as well as on individual variables and combinations of variables. As described in Chapter \@ref(c05-descriptive-analysis), descriptive analyses should always precede statistical analysis to prevent avoidable (and potentially embarrassing) mistakes. 87 | 88 | ### Table review 89 | 90 | \index{Cross-tabulation|(} 91 | Even before applying weights, consider running cross-tabulations on the raw data. Cross-tabs can help us spot patterns that are alarming or worth further investigation. 92 | \index{Cross-tabulation|)} 93 | 94 | For example, let’s explore the example survey dataset introduced in the Prerequisites box, `example_srvy`. We run the code below on the unweighted data to inspect the `gender` variable: 95 | 96 | ```{r} 97 | #| label: recommendations-example-desc 98 | example_srvy %>% 99 | group_by(gender) %>% 100 | summarize(n = n()) 101 | ``` 102 | 103 | The data show that females comprise 9 out of 10, or 90%, of the sample. Generally, we assume something close to a 50/50 split between male and female respondents in a population. The sizable female proportion could indicate either a unique sample or a potential error in the data. If we review the survey documentation and see this was a deliberate part of the design, we can continue our analysis using the appropriate methods. If this was not an intentional choice by the researchers, the results alert us that something may be incorrect in the data or our code, and we can verify if there’s an issue by comparing the results with the weighted means. 104 | 105 | ### Graphical review 106 | 107 | Tables provide a quick check of our assumptions, but there is no substitute for graphs and plots to visualize the distribution of data. We might miss outliers or nuances if we scan only summary statistics. 108 | 109 | For example, Anscombe's Quartet demonstrates the importance of visualization in analysis. Let's say we have a dataset with x- and y-variables in an object called `anscombe_tidy`. Let's take a look at how the dataset is structured: 110 | 111 | ```{r} 112 | #| label: recommendations-anscombe-head 113 | head(anscombe_tidy) 114 | ``` 115 | 116 | We can begin by checking one set of variables. For Set I, the x-variables have an average of 9 with a standard deviation of 3.3; for y, we have an average of 7.5 with a standard deviation of 2.03. The two variables have a correlation of 0.81. 117 | 118 | ```{r} 119 | #| label: recommendations-anscombe-calc 120 | anscombe_tidy %>% 121 | filter(set == "I") %>% 122 | summarize( 123 | x_mean = mean(x), 124 | x_sd = sd(x), 125 | y_mean = mean(y), 126 | y_sd = sd(y), 127 | correlation = cor(x, y) 128 | ) 129 | ``` 130 | 131 | These are useful statistics. We can note that the data do not have high variability, and the two variables are strongly correlated. Now, let’s check all the sets (I-IV) in the Anscombe data. Notice anything interesting?
132 | 133 | ```{r} 134 | #| label: recommendations-anscombe-calc-2 135 | anscombe_tidy %>% 136 | group_by(set) %>% 137 | summarize( 138 | x_mean = mean(x), 139 | x_sd = sd(x, na.rm = TRUE), 140 | y_mean = mean(y), 141 | y_sd = sd(y, na.rm = TRUE), 142 | correlation = cor(x, y) 143 | ) 144 | ``` 145 | 146 | The summary results for these four sets are nearly identical! Based on this, we might assume that each distribution is similar. Let's look at a graphical visualization to see if our assumption is correct (see Figure \@ref(fig:recommendations-anscombe-plot)). 147 | 148 | ```{r} 149 | #| label: recommendations-anscombe-plot 150 | #| warning: false 151 | #| error: false 152 | #| message: false 153 | #| fig.cap: "Plot of Anscombe's Quartet data and the importance of reviewing data graphically" 154 | #| fig.alt: "This figure shows four plots, one for each of Anscombe's sets. The upper left plot is a plot of set I and has a trend line with a slope of 0.5 and an intercept of 3. The data points are distributed evenly around the trend line. The upper right plot is a plot of set II and has the same trend line as set I. The data points are curved around the trend line. The lower left plot is a plot of set III and has the same trend line as set I. The data points closely follow the trend line with one outlier where the y-value for the point is much larger than the others. The lower right plot is a plot of set IV and has the same trend line as set I. The data points all share the same x-value but different y-values with the exception of one data point, which has much larger x- and y-values." 155 | 156 | ggplot(anscombe_tidy, aes(x, y)) + 157 | geom_point() + 158 | facet_wrap( ~ set) + 159 | geom_smooth(method = "lm", se = FALSE, alpha = 0.5) + 160 | theme_minimal() 161 | ``` 162 | 163 | Although each of the four sets has the same summary statistics and regression line, when reviewing the plots (see Figure \@ref(fig:recommendations-anscombe-plot)), it becomes apparent that the distributions of the data are not the same at all. Each set of points results in different shapes and distributions. Imagine sharing each set (I-IV) and the corresponding plot with a different colleague. The interpretations and descriptions of the data would be very different even though the statistics are similar. Plotting data can also ensure that we are using the correct analysis method on the data, so understanding the underlying distributions is an important first step. 164 | 165 | ## Check variable types 166 | 167 | When we pull the data from surveys into R, the data may be listed as character, factor, numeric, or logical/Boolean. The tidyverse functions that read in data (e.g., `read_csv()`, `read_excel()`) load all strings as character variables by default. This is important when dealing with survey data, as many strings may be better suited for factors than character variables. For example, let's revisit the `example_srvy` data. Taking a `glimpse()` of the data gives us insight into what it contains: 168 | 169 | ```{r} 170 | #| label: recommendations-example-dat-glimpse 171 | example_srvy %>% 172 | glimpse() 173 | ``` 174 | 175 | \index{Factor|(} 176 | The output shows that `q_d2_1` is a character variable, but the values of that variable show three options (Very interested / Somewhat interested / Not at all interested). In this case, we most likely want to change `q_d2_1` to be a factor variable and order the factor levels to indicate that this is an ordinal variable.
Here is some code on how we might approach this task using the {forcats} package [@R-forcats]: 177 | 178 | ```{r} 179 | #| label: recommendations-example-dat-fct 180 | example_srvy_fct <- example_srvy %>% 181 | mutate(q_d2_1_fct = factor( 182 | q_d2_1, 183 | levels = c("Very interested", 184 | "Somewhat interested", 185 | "Not at all interested") 186 | )) 187 | 188 | example_srvy_fct %>% 189 | glimpse() 190 | 191 | example_srvy_fct %>% 192 | count(q_d2_1_fct, q_d2_1) 193 | ``` 194 | 195 | \index{Codebook|(} \index{Categorical data|(} 196 | This example dataset also includes a column called `region`, which is imported as a number (`<dbl>`). This is a good reminder to use the questionnaire and codebook along with the data to find out if the values actually reflect a number or are perhaps a coded categorical variable (see Chapter \@ref(c03-survey-data-documentation) for more details). R calculates a mean even when it is not appropriate, which makes it easy to mistakenly report an average of categorical codes when a proportion is the suitable measure. For example, for ease of coding, we may use the `across()` function to calculate the mean across all numeric variables: \index{Functions in srvyr!survey\_mean|(} \index{Functions in srvyr!summarize|(} \index{Codebook|)} \index{Categorical data|)} 197 | 198 | ```{r} 199 | #| label: recommendations-example-dat-num-calc 200 | example_des %>% 201 | select(-weight) %>% 202 | summarize(across(where(is.numeric), ~ survey_mean(.x, na.rm = TRUE))) 203 | ``` 204 | 205 | In this example, if we do not adjust `region` to be a factor variable type, we might accidentally report an average region of `r round(example_des %>% summarize(across(where(is.numeric), ~ survey_mean(.x, na.rm = TRUE))) %>% pull(region), 2)` in our findings, which is meaningless. Checking that our variables are appropriate avoids this pitfall and ensures the measures and models are suitable for the variable type. 206 | \index{Factor|)} 207 | 208 | 209 | ## Improve debugging skills 210 | 211 | \index{Debugging|(} 212 | It is common for analysts working in R to come across warning or error messages, and learning how to debug these messages (i.e., find and fix issues) ensures we can proceed with our work and avoid potential mistakes. 213 | 214 | We've discussed a few examples in this book. For example, if we calculate an average with `survey_mean()` and get `NA` instead of a number, it may be because our column has missing values. 215 | 216 | ```{r} 217 | #| label: recommendations-missing-dat 218 | example_des %>% 219 | summarize(mean = survey_mean(q_d1)) 220 | ``` 221 | 222 | Including `na.rm = TRUE` resolves the issue: 223 | 224 | ```{r} 225 | #| label: recommendations-missing-dat-fix 226 | example_des %>% 227 | summarize(mean = survey_mean(q_d1, na.rm = TRUE)) 228 | ``` 229 | \index{Functions in srvyr!summarize|)} \index{Functions in srvyr!survey\_mean|)} 230 | 231 | Another common error message in survey analysis looks something like the following: \index{Functions in survey!svyttest|(} 232 | 233 | ```{r} 234 | #| label: recommendations-desobj-loc 235 | #| error: true 236 | example_des %>% 237 | svyttest(q_d1~gender) 238 | ``` 239 | 240 | \index{Dot notation|(} 241 | In this case, we need to remember that with functions from the {survey} package like `svyttest()`, the design object is not the first argument, and we have to use the dot (`.`) notation (see Chapter \@ref(c06-statistical-testing)). Adding the named argument `design = .` fixes this error.
242 | 243 | ```{r} 244 | #| label: recommendations-desobj-locfix 245 | example_des %>% 246 | svyttest(q_d1 ~ gender, 247 | design = .) 248 | ``` 249 | 250 | \index{Dot notation|)} 251 | 252 | Often, debugging involves interpreting the message from R. For example, if our code results in this error: 253 | 254 | ``` 255 | Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 256 | contrasts can be applied only to factors with 2 or more levels 257 | ``` 258 | 259 | \index{Factor|(} 260 | We can see that the error has to do with a function requiring a factor with two or more levels and that it has been applied to something else. This ties back to our section on using appropriate variable types. We can check the variable of interest to examine whether it is the correct type. \index{Functions in survey!svyttest|)} 261 | \index{Factor|)} 262 | 263 | The internet also offers many resources for debugging. Searching for a specific error message can often lead to a solution. In addition, we can post on community forums like [Posit Community](https://forum.posit.co/) for direct help from others. \index{Debugging|)} 264 | 265 | ## Think critically about conclusions 266 | 267 | Once we have our findings, we need to learn to think critically about them. As mentioned in Chapter \@ref(c02-overview-surveys), many aspects of the study design can impact our interpretation of the results, for example, the number and types of response options provided to the respondent or who was asked the question (both thinking about the full sample and any skip patterns). Knowing the overall study design can help us accurately think through what the findings may mean and identify any issues with our analyses. Additionally, we should make sure that our survey design object is correctly defined (see Chapter \@ref(c10-sample-designs-replicate-weights)), carefully consider how we are managing missing data (see Chapter \@ref(c11-missing-data)), and follow statistical analysis procedures such as avoiding model overfitting by using too many variables in our formulas. 268 | 269 | These considerations allow us to conduct our analyses and review findings for statistically significant results. It is important to note that statistically significant results are not necessarily meaningful or important; with a large enough sample, even small differences can reach statistical significance. Therefore, we want to look at our results in context, such as comparing them with results from other studies or analyzing them in conjunction with confidence intervals and other measures. 270 | 271 | Communicating the results (see Chapter \@ref(c08-communicating-results)) in an unbiased manner is also a critical step in any analysis project. If we present results without error measures or only present results that support our initial hypotheses, we are not thinking critically and may incorrectly represent the data. As survey data analysts, we often interpret the survey data for the public. We must ensure that we are the best stewards of the data and work to bring light to meaningful and interesting findings that the public wants and needs to know about.
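As one final illustration of pairing estimates with error measures, here is a minimal sketch using the example design object from the Prerequisites box (the chunk label and variable choice are ours); `survey_prop()` with `vartype = "ci"` returns each proportion along with a design-based confidence interval:

```{r}
#| label: recommendations-ci-sketch
# Proportion of each gender with a design-based confidence interval
example_des %>%
  group_by(gender) %>%
  summarize(prop = survey_prop(vartype = "ci"))
```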
-------------------------------------------------------------------------------- /90-AppendixA-DataImport.Rmd: -------------------------------------------------------------------------------- 1 | \cleardoublepage 2 | 3 | # (APPENDIX) Appendices {-} 4 | 5 | # Importing survey data into R {#importing-survey-data-into-r} 6 | 7 | ```{r} 8 | #| label: readr-styler 9 | #| include: false 10 | knitr::opts_chunk$set(tidy = 'styler') 11 | ``` 12 | 13 | To analyze a survey, we need to bring the survey data into R. This process is often referred to as importing, loading, or reading in data. Survey files come in different formats depending on the software used to create them. One of the many advantages of R is its flexibility in handling various data formats, regardless of their file extensions. Here are examples of common public-use survey file formats we may encounter: 14 | 15 | * Delimiter-separated text files 16 | * Excel spreadsheets in `.xls` or `.xlsx` format 17 | * R native `.rda` files 18 | * Stata datasets in `.dta` format 19 | * SAS datasets in `.sas7bdat` format 20 | * SPSS datasets in `.sav` format 21 | * Application Programming Interfaces (APIs), often in JavaScript Object Notation (JSON) format 22 | * Data stored in databases 23 | 24 | This appendix guides analysts through the process of importing these various types of survey data into R. 25 | 26 | ## Importing delimiter-separated files into R 27 | 28 | Delimiter-separated files use specific characters, known as delimiters, to separate values within the file. For example, CSV (comma-separated values) files use commas as delimiters, while TSV (tab-separated values) files use tabs. These file formats are widely used because of their simplicity and compatibility with various software applications. 29 | 30 | The {readr} package, part of the tidyverse ecosystem, offers efficient ways to import delimiter-separated files into R [@R-readr]. Its advantages include automatic data type detection and flexible handling of missing values, depending on one's survey analysis needs. The {readr} package includes functions for: 31 | 32 | * `read_csv()`: This function is specifically designed to read CSV files. 33 | * `read_tsv()`: Use this function for TSV files. 34 | * `read_delim()`: This function can handle a broader range of delimiter-separated files, including CSV and TSV. Specify the delimiter using the `delim` argument (see the short example after this list). 35 | * `read_fwf()`: This function is useful for importing fixed-width files (FWF), where columns have predetermined widths, and values are aligned in specific positions. 36 | * `read_table()`: Use this function when dealing with whitespace-separated files, such as those with spaces or multiple spaces as delimiters. 37 | * `read_log()`: This function can read and parse web log files.
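For instance, a semicolon-delimited extract could be read with `read_delim()`; the file path below is hypothetical, and reading every column as character is one cautious choice when the column types are not yet known:

```r
library(readr)

# Hypothetical semicolon-delimited survey extract;
# read all columns as character and assign types later
survey_extract <- read_delim(
  file = "data/survey_extract.txt",
  delim = ";",
  col_types = cols(.default = "c")
)
```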
38 | 39 | The syntax for `read_csv()` is: 40 | 41 | ``` 42 | read_csv( 43 | file, 44 | col_names = TRUE, 45 | col_types = NULL, 46 | col_select = NULL, 47 | id = NULL, 48 | locale = default_locale(), 49 | na = c("", "NA"), 50 | comment = "", 51 | trim_ws = TRUE, 52 | skip = 0, 53 | n_max = Inf, 54 | guess_max = min(1000, n_max), 55 | name_repair = "unique", 56 | num_threads = readr_threads(), 57 | progress = show_progress(), 58 | show_col_types = should_show_types(), 59 | skip_empty_rows = TRUE, 60 | lazy = should_read_lazy() 61 | ) 62 | ``` 63 | 64 | The arguments are: 65 | 66 | * `file`: the path to the CSV file to import 67 | * `col_names`: a value of `TRUE` imports the first row of the `file` as column names, so it is not included in the data frame. A value of `FALSE` creates automated column names. Alternatively, we can provide a vector of column names. 68 | * `col_types`: by default, R infers the column variable types. We can also provide a column specification using `list()` or `cols()`; for example, use `col_types = cols(.default = "c")` to read all the columns as characters. Alternatively, we can use a string to specify the variable types for each column. 69 | * `col_select`: the columns to include in the results 70 | * `id`: a column for storing the file path. This is useful for keeping track of the input file when importing multiple CSVs at a time. 71 | * `locale`: the location-specific defaults for the file 72 | * `na`: a character vector of values to interpret as missing 73 | * `comment`: a character vector of values to interpret as comments 74 | * `trim_ws`: a value of `TRUE` trims leading and trailing white space 75 | * `skip`: number of lines to skip before importing the data 76 | * `n_max`: maximum number of lines to read 77 | * `guess_max`: maximum number of lines used for guessing column types 78 | * `name_repair`: whether and how to repair column names; the default ensures column names are unique 79 | * `num_threads`: the number of processing threads to use for initial parsing and lazy reading of data 80 | * `progress`: a value of `TRUE` displays a progress bar 81 | * `show_col_types`: a value of `TRUE` displays the column types 82 | * `skip_empty_rows`: a value of `TRUE` ignores blank rows 83 | * `lazy`: a value of `TRUE` reads values lazily 84 | 85 | The other functions share a similar syntax to `read_csv()`. To find more details, run `??` followed by the function name. For example, run `??read_tsv` in the Console for additional information on importing TSV files. 86 | 87 | In the example below, we use {readr} to import a CSV file named 'anes_timeseries_2020_csv_20220210.csv' into an R object called `anes_csv`. The `read_csv()` function imports the file and stores the data in the `anes_csv` object. We can then use this object for further analysis. 88 | 89 | ```r 90 | library(readr) 91 | 92 | anes_csv <- 93 | read_csv(file = "data/anes_timeseries_2020_csv_20220210.csv") 94 | ``` 95 | 96 | ## Importing Excel files into R 97 | 98 | Excel, a widely used spreadsheet program created by Microsoft, produces files that are a common format in survey research. We can import Excel spreadsheets into the R environment using the {readxl} package. The package supports both the legacy `.xls` files and the modern `.xlsx` format. 99 | 100 | To import Excel data into R, we can use the `read_excel()` function from the {readxl} package. This function offers a range of options for the import process.
Let's explore the syntax: 101 | 102 | ``` 103 | read_excel( 104 | path, 105 | sheet = NULL, 106 | range = NULL, 107 | col_names = TRUE, 108 | col_types = NULL, 109 | na = "", 110 | trim_ws = TRUE, 111 | skip = 0, 112 | n_max = Inf, 113 | guess_max = min(1000, n_max), 114 | progress = readxl_progress(), 115 | .name_repair = "unique" 116 | ) 117 | ``` 118 | 119 | The arguments are: 120 | 121 | * `path`: the path to the Excel file to import 122 | * `sheet`: the name or index of the sheet (sometimes called tabs) within the Excel file 123 | * `range`: the range of cells to import (for example, `P15:T87`) 124 | * `col_names`: indicates whether the first row of the dataset contains column names 125 | * `col_types`: specifies the data types of columns 126 | * `na`: defines the representation of missing values (for example, `NULL`) 127 | * `trim_ws`: controls whether leading and trailing whitespaces should be trimmed 128 | * `skip` and `n_max`: enable skipping rows and limit the number of rows imported 129 | * `guess_max`: sets the maximum number of rows used for data type guessing 130 | * `progress`: specifies a progress bar for large imports 131 | * `.name_repair`: determines how column names are repaired if they are not valid 132 | 133 | In the code example below, we import an Excel spreadsheet named 'anes_timeseries_2020_csv_20220210.xlsx' into R. The resulting data is saved as a tibble in the `anes_excel` object, ready for further analysis. 134 | 135 | ```r 136 | library(readxl) 137 | 138 | anes_excel <- 139 | read_excel(path = "data/anes_timeseries_2020_csv_20220210.xlsx") 140 | ``` 141 | 142 | ## Importing Stata, SAS, and SPSS files into R 143 | 144 | The {haven} package, also from the tidyverse ecosystem, imports various proprietary data formats: Stata `.dta` files, SPSS `.sav` files, and SAS `.sas7bdat` and `.sas7bcat` files [@R-haven]. One of the notable strengths of the {haven} package is its ability to handle multiple proprietary formats within a unified framework. It offers dedicated functions for each supported proprietary format, making it straightforward to import data regardless of the program. Here, we introduce `read_dta()` for Stata files, `read_sav()` for SPSS files, and `read_sas()` for SAS files. 
145 | 146 | ### Syntax 147 | 148 | Let's explore the syntax for importing Stata files `.dta` files using `haven::read_dta()`: 149 | 150 | ```r 151 | read_dta( 152 | file, 153 | encoding = NULL, 154 | col_select = NULL, 155 | skip = 0, 156 | n_max = Inf, 157 | .name_repair = "unique" 158 | ) 159 | ``` 160 | 161 | The arguments are: 162 | 163 | * `file`: the path to the proprietary data file to import 164 | * `encoding`: specifies the character encoding of the data file 165 | * `col_select`: selects specific columns for import 166 | * `skip` and `n_max`: control the number of rows skipped and the maximum number of rows imported 167 | * `.name_repair`: determines how column names are repaired if they are not valid 168 | 169 | The syntax for `read_sav()` is similar to `read_dta()`: 170 | 171 | ``` 172 | read_sav( 173 | file, 174 | encoding = NULL, 175 | user_na = FALSE, 176 | col_select = NULL, 177 | skip = 0, 178 | n_max = Inf, 179 | .name_repair = "unique" 180 | ) 181 | ``` 182 | 183 | The arguments are: 184 | 185 | * `file`: the path to the proprietary data file to import 186 | * `encoding`: specifies the character encoding of the data file 187 | * `col_select`: selects specific columns for import 188 | * `user_na`: a value of `TRUE` reads variables with user-defined missing labels into `labelled_spss()` objects 189 | * `skip` and `n_max`: control the number of rows skipped and the maximum number of rows imported 190 | * `.name_repair`: determines how column names are repaired if they are not valid 191 | 192 | The syntax for importing SAS files with `read_sas()` is as follows: 193 | 194 | ```r 195 | read_sas( 196 | data_file, 197 | catalog_file = NULL, 198 | encoding = NULL, 199 | catalog_encoding = encoding, 200 | col_select = NULL, 201 | skip = 0L, 202 | n_max = Inf, 203 | .name_repair = "unique" 204 | ) 205 | ``` 206 | 207 | The arguments are: 208 | 209 | * `data_file`: the path to the proprietary data file to import 210 | * `catalog_file`: the path to the catalog file to import 211 | * `encoding`: specifies the character encoding of the data file 212 | * `catalog_encoding`: specifies the character encoding of the catalog file 213 | * `col_select`: selects specific columns for import 214 | * `skip` and `n_max`: control the number of rows skipped and the maximum number of rows imported 215 | * `.name_repair`: determines how column names are repaired if they are not valid 216 | 217 | In the code examples below, we demonstrate how to import Stata, SPSS, and SAS files into R using the respective {haven} functions. The resulting data are stored in `anes_dta`, `anes_sav`, and `anes_sas` objects as tibbles, ready for use in R. For the Stata example, we show how to import the data from the {srvyrexploR} package to use in examples. 
218 | 219 | Stata: \index{American National Election Studies (ANES)|(} 220 | 221 | ```{r} 222 | #| label: readr-stata 223 | library(haven) 224 | 225 | anes_dta <- 226 | read_dta(file = system.file("extdata", 227 | "anes_2020_stata_example.dta", 228 | package = "srvyrexploR")) 229 | ``` 230 | 231 | \index{American National Election Studies (ANES)|)} 232 | 233 | SPSS: 234 | 235 | ```r 236 | library(haven) 237 | 238 | anes_sav <- 239 | read_sav(file = "data/anes_timeseries_2020_spss_20220210.sav") 240 | ``` 241 | 242 | SAS: 243 | 244 | ```r 245 | library(haven) 246 | 247 | anes_sas <- 248 | read_sas( 249 | data_file = "data/anes_timeseries_2020_sas_20220210.sas7bdat" 250 | ) 251 | ``` 252 | 253 | ### Working with labeled data 254 | 255 | \index{American National Election Studies (ANES)|(} \index{Categorical data|(} 256 | Stata, SPSS, and SAS files can contain labeled variables and values. These labels provide descriptive information about categorical data, making them easier to understand and analyze. When importing data from Stata, SPSS, or SAS, we want to preserve these labels to maintain data fidelity. 257 | 258 | Consider a variable like 'Education Level' with coded values (e.g., 1, 2, 3). Without labels, these codes can be cryptic. However, with labels ('High School Graduate,' 'Bachelor's Degree,' 'Master's Degree'), the data become more informative and easier to work with. 259 | 260 | With the {haven} package, we can import and work with labeled data from Stata, SPSS, and SAS files. The package uses a special class of data called `haven_labelled` to store labeled variables. When a dataset label is defined in Stata, it is stored in the 'label' attribute of the tibble when imported, ensuring that the information is not lost. 261 | 262 | We can use functions like `select()`, `glimpse()`, and `is.labelled()` to inspect the imported data and verify if the variables are labeled. Take a look at the ANES Stata file. Notice that categorical variables `V200002` and `V201006` are marked with a type of `<dbl+lbl>`. This notation indicates that these variables are labeled. 263 | 264 | ```{r} 265 | #| label: readr-glimpse 266 | #| message: false 267 | library(dplyr) 268 | 269 | anes_dta %>% 270 | select(1:6) %>% 271 | glimpse() 272 | ``` 273 | 274 | We can confirm their label status using the `haven::is.labelled()` function. 275 | 276 | ```{r} 277 | #| label: readr-islabelled 278 | haven::is.labelled(anes_dta$V200002) 279 | ``` 280 | 281 | To explore the labels further, we can use the `attributes()` function. This function provides insights into both the variable labels (`$label`) and the associated value labels (`$labels`). 282 | 283 | ```{r} 284 | #| label: readr-attributes 285 | attributes(anes_dta$V200002) 286 | ``` 287 | 288 | When we import a labeled dataset using {haven}, it results in a tibble containing both the data and label information. However, this is meant to be an intermediary data structure and not intended to be the final data format for analysis. Instead, we should convert it into a regular R data frame before continuing our data workflow. There are two primary methods to achieve this conversion: (1) convert to factors or (2) remove the labels. 289 | 290 | #### Option 1: Convert the vector into a factor {-} 291 | 292 | \index{Factor|(} 293 | Factors are native R data types for working with categorical data. They consist of integer values that correspond to character values, known as levels. Below is a dummy example of factors.
The `factors` show the four different levels in the data: `strongly agree`, `agree`, `disagree`, and `strongly disagree`. 294 | 295 | ```{r} 296 | #| label: readr-factor 297 | response <- 298 | c("strongly agree", "agree", "agree", "disagree", "strongly disagree") 299 | 300 | response_levels <- 301 | c("strongly agree", "agree", "disagree", "strongly disagree") 302 | 303 | factors <- factor(response, levels = response_levels) 304 | 305 | factors 306 | ``` 307 | 308 | Factors are integer vectors, though they may look like character strings. We can confirm by looking at the vector's structure: 309 | 310 | ```{r} 311 | #| label: readr-factor-view 312 | glimpse(factors) 313 | ``` 314 | 315 | R's factors differ from Stata, SPSS, or SAS labeled vectors. However, we can convert labeled variables into factors using the `as_factor()` function. 316 | 317 | ```{r} 318 | #| label: readr-factor-create 319 | anes_dta %>% 320 | transmute(V200002 = as_factor(V200002)) 321 | ``` 322 | 323 | The `as_factor()` function can be applied to all columns in a data frame or individual ones. Below, we convert all `<dbl+lbl>` columns into factors. 324 | 325 | ```{r} 326 | #| label: readr-factor-glimpse 327 | anes_dta_factor <- 328 | anes_dta %>% 329 | as_factor() 330 | 331 | anes_dta_factor %>% 332 | select(1:6) %>% 333 | glimpse() 334 | ``` 335 | 336 | \index{Factor|)} 337 | 338 | #### Option 2: Strip the labels {-} 339 | 340 | The second option is to remove the labels altogether, converting the labeled data into a regular R data frame. To remove, or 'zap,' the labels from our tibble, we can use the {haven} package's `zap_label()` and `zap_labels()` functions. This approach removes the labels but retains the data values in their original form. 341 | 342 | The ANES Stata file columns contain variable labels. Using the `map()` function from {purrr}, we can review the labels using `attr()`. In the example below, we list the first two variables and their labels. For instance, the label for `V200002` is "Mode of interview: pre-election interview." 343 | 344 | ```{r} 345 | #| label: readr-label-show 346 | purrr::map(anes_dta, ~ attr(.x, "label")) %>% 347 | head(2) 348 | ``` 349 | 350 | Use `zap_label()` to remove the variable labels but retain the value labels. Notice that the labels return as `NULL`. 351 | 352 | ```{r} 353 | #| label: readr-zaplabel 354 | zap_label(anes_dta) %>% 355 | purrr::map( ~ attr(.x, "label")) %>% 356 | head(2) 357 | ``` 358 | 359 | To remove the value labels, use `zap_labels()`. Notice the previous `<dbl+lbl>` columns are now `<dbl>`. 360 | 361 | ```{r} 362 | #| label: readr-zaplabels 363 | zap_labels(anes_dta) %>% 364 | select(1:6) %>% 365 | glimpse() 366 | ``` 367 | 368 | While it is important to convert labeled datasets into regular R data frames for working in R, the labels themselves often contain valuable information that provides context and meaning to the survey variables. To aid with interpretability and documentation, we can create a data dictionary from the labeled dataset. A data dictionary is a reference document that provides detailed information about the variables and values of a survey. 369 | \index{Categorical data|)} 370 | 371 | The {labelled} package offers a convenient function, `generate_dictionary()`, that creates data dictionaries directly from a labeled dataset [@R-labelled]. This function extracts variable labels, value labels, and other metadata and organizes them into a structured document that we can browse and reference throughout our analysis.
372 | 373 | Let's create a data dictionary from the ANES Stata dataset as an example: 374 | 375 | ```{r} 376 | #| label: readr-dictionary-create 377 | library(labelled) 378 | 379 | dictionary <- generate_dictionary(anes_dta) 380 | ``` 381 | 382 | Once we've generated the data dictionary, we can take a look at the `V200002` variable and see the label, column type, number of missing entries, and associated values. 383 | 384 | ```{r} 385 | #| label: readr-dictionary-view 386 | dictionary %>% 387 | filter(variable == "V200002") 388 | ``` 389 | 390 | \index{American National Election Studies (ANES)|)} 391 | 392 | ### Labeled missing data values 393 | 394 | \index{Missing data|(} 395 | In survey data analysis, dealing with missing values is a crucial aspect of data preparation. Stata, SPSS, and SAS files each have their own method for handling missing values. 396 | 397 | * Stata has "extended" missing values, `.A` through `.Z`. 398 | * SAS has "special" missing values, `.A` through `.Z` and `._`. 399 | * SPSS has per-column "user" missing values. Each column can declare up to three distinct values or a range of values (plus one distinct value) that should be treated as missing. 400 | 401 | SAS and Stata use a concept known as 'tagged' missing values, which extend R's regular `NA`. A 'tagged' missing value is essentially an `NA` with an additional single-character label. These values behave identically to regular `NA` in standard R operations while preserving the informative tag associated with the missing value. 402 | 403 | Here is an example from the NORC at the University of Chicago’s 2018 General Social Survey, where Don't Know (`DK`) responses are tagged as `NA(d)`, Inapplicable (`IAP`) responses are tagged as `NA(i)`, and `No Answer` responses are tagged as `NA(n)` [@gss-codebook]. 404 | 405 | ```r 406 | head(gss_dta$HEALTH) 407 | #> <labelled<double>[6]>: condition of health 408 | #> [1] 2 1 NA(i) NA(i) 1 2 409 | #> 410 | #> Labels: 411 | #> value label 412 | #> 1 excellent 413 | #> 2 good 414 | #> 3 fair 415 | #> 4 poor 416 | #> NA(d) DK 417 | #> NA(i) IAP 418 | #> NA(n) NA 419 | ``` 420 | 421 | In contrast, SPSS uses a different approach called 'user-defined values' to denote missing values. Each column in an SPSS dataset can have up to three distinct values designated as missing or a specified range of missing values. To model these additional user-defined missing values, {haven} provides the `labelled_spss()` subclass of `labelled()`. When we import SPSS data using {haven}, these user-defined missing values are handled correctly. We can work with these data in R while preserving the unique missing value conventions from SPSS. 422 | 423 | Here is what the GSS SPSS dataset looks like when loaded with {haven}. 424 | 425 | ``` 426 | head(gss_sps$HEALTH) 427 | #> <labelled_spss<double>[6]>: Condition of health 428 | #> [1] 2 1 0 0 1 2 429 | #> Missing values: 0, 8, 9 430 | #> 431 | #> Labels: 432 | #> value label 433 | #> 0 IAP 434 | #> 1 EXCELLENT 435 | #> 2 GOOD 436 | #> 3 FAIR 437 | #> 4 POOR 438 | #> 8 DK 439 | #> 9 NA 440 | ``` 441 | 442 | \index{Missing data|)} 443 | 444 | ## Importing data from APIs into R 445 | 446 | In addition to working with data saved as files, we may also need to retrieve data through Application Programming Interfaces (APIs). APIs provide a structured way to access data hosted on external servers and import them directly into R for analysis. 447 | 448 | To access these data, we need to understand how to construct API requests.
Each API has unique endpoints, parameters, and authentication requirements. Pay attention to: 449 | 450 | * Endpoints: These are URLs that point to specific data or services 451 | * Parameters: Information passed to the API to customize the request (e.g., date ranges, filters) 452 | * Authentication: APIs may require API keys or tokens for access 453 | * Rate Limits: APIs may have usage limits, so be aware of any rate limits or quotas 454 | 455 | Typically, we begin by making a GET request to an API endpoint. The {httr2} package allows us to generate and process HTTP requests [@R-httr2]. We first build a request that points to the URL containing the data we would like and then perform it: 456 | 457 | ```r 458 | library(httr2) 459 | 460 | api_url <- "https://api.example.com/survey-data" 461 | response <- request(api_url) %>% req_perform() 462 | ``` 463 | 464 | Once we make the request, we obtain the data as the `response`. The data often come in JSON format. We can extract and parse the data using the {jsonlite} package, allowing us to work with them in R [@jsonliteooms]. The `fromJSON()` function, shown below, converts the JSON text of the response body into an R object. 465 | 466 | ```r 467 | survey_data <- fromJSON(resp_body_string(response)) 468 | ``` 469 | 470 | Note that these are dummy examples. Please review the documentation to understand how to make requests from a specific API. 471 | 472 | R offers several packages that simplify API access by providing ready-to-use functions for popular APIs. These packages are called "wrappers," as they "wrap" the API in R to make it easier to use. For example, the {tidycensus} package used in this book simplifies access to U.S. Census data, allowing us to retrieve data with R commands instead of writing API requests from scratch [@R-tidycensus]. Behind the scenes, `get_pums()` is making a GET request to the Census API, and the {tidycensus} functions are converting the response into an R-friendly format. For example, if we are interested in the age, sex, race, and Hispanicity of those in the American Community Survey sample of Durham County, North Carolina^[The public use microdata areas (PUMA) for Durham County were identified using the 2020 PUMA Names File: https://www2.census.gov/geo/pdfs/reference/puma2020/2020_PUMA_Names.pdf], we can use the `get_pums()` function to extract the microdata as shown in the code below. We can then use the replicate weights to create a survey object and calculate estimates for Durham County. 473 | 474 | ```{r} 475 | #| label: readr-pumsin 476 | #| results: false 477 | library(tidycensus) 478 | 479 | durh_pums <- get_pums( 480 | variables = c("PUMA", "SEX", "AGEP", "RAC1P", "HISP"), 481 | state = "NC", 482 | puma = c("01301", "01302"), 483 | survey = "acs1", 484 | year = 2022, 485 | rep_weights = "person" 486 | ) 487 | ``` 488 | 489 | ```{r} 490 | #| label: readr-pumsprint 491 | 492 | durh_pums 493 | ``` 494 | 495 | In Chapter \@ref(c04-getting-started), we used the {censusapi} package to get data from the Census data API for the Current Population Survey. To discover if there is an R package that directly interfaces with a specific survey or data source, search for "[survey] R wrapper" or "[data source] R package" online. 496 | 497 | ## Importing data from databases in R 498 | 499 | Databases provide a secure and organized solution as the volume and complexity of data grow. We can access, manage, and update data stored in databases in a systematic way.
Because of how the data are organized, teams can draw from the same source and obtain any metadata that would be helpful for analysis. 500 | 501 | There are various ways of using R to work with databases. If using RStudio, we can connect to different databases through the Connections Pane in the top right of the IDE. We can also use packages like {DBI} and {odbc} to access database tables in R files. Here is an example script connecting to a database: 502 | 503 | ```r 504 | con <- 505 | DBI::dbConnect( 506 | odbc::odbc(), 507 | Driver = "[driver name]", 508 | Server = "[server path]", 509 | UID = rstudioapi::askForPassword("Database user"), 510 | PWD = rstudioapi::askForPassword("Database password"), 511 | Database = "[database name]", 512 | Warehouse = "[warehouse name]", 513 | Schema = "[schema name]" 514 | ) 515 | ``` 516 | 517 | The {dbplyr} and {dplyr} packages allow us to make queries and run data analysis entirely using {dplyr} syntax. All of the code can be written in R, so we do not have to switch between R and SQL to explore the data. Here is some sample code: 518 | 519 | ```r 520 | q1 <- tbl(con, "bank") %>% 521 | group_by(month_idx, year, month) %>% 522 | summarize(subscribe = sum(ifelse(term_deposit == "yes", 1, 0)), 523 | total = n()) 524 | 525 | show_query(q1) 526 | ``` 527 | 528 | Be sure to check the documentation to configure a database connection. 529 | 530 | ## Importing data from other formats 531 | 532 | R also offers dedicated packages such as {googlesheets4} for Google Sheets or {qualtRics} for Qualtrics. With less common or proprietary file formats, the broader data science community can often provide guidance. Online resources like [Stack Overflow](https://stackoverflow.com/) and dedicated forums like [Posit Community](https://forum.posit.co/) are valuable sources of information for importing data into R. 533 | -------------------------------------------------------------------------------- /91-AppendixB-ANES-CB-latex.Rmd: -------------------------------------------------------------------------------- 1 | # ANES derived variable codebook {#anes-cb} 2 | 3 | \index{American National Election Studies (ANES)|(} 4 | The full codebook with the original variables is available at @anes-cb. 5 | 6 | The ANES codebook for the data used in this book (`anes_2020` from {srvyrexploR}) is available in the online version of the book at [https://tidy-survey-r.github.io/tidy-survey-book/anes-cb.html](https://tidy-survey-r.github.io/tidy-survey-book/anes-cb.html). 
7 | \index{American National Election Studies (ANES)|)} -------------------------------------------------------------------------------- /91-AppendixB-ANES-CB.Rmd: -------------------------------------------------------------------------------- 1 | # ANES derived variable codebook {#anes-cb} 2 | 3 | ```{r} 4 | #| label: anes-cb-setup 5 | #| echo: FALSE 6 | #| error: FALSE 7 | #| warning: FALSE 8 | #| message: FALSE 9 | library(tidyverse) 10 | library(janitor) 11 | library(kableExtra) 12 | library(knitr) 13 | 14 | data(anes_2020) 15 | 16 | attrlist <- map(anes_2020, attributes) 17 | 18 | NULL_to_NA <- function(x){ 19 | if (is.null(x)){ 20 | NA 21 | }else{ 22 | x 23 | } 24 | } 25 | 26 | anes_var_info <- tibble( 27 | Vars=names(attrlist), 28 | Section=map_chr(attrlist, "Section") %>% unname(), 29 | Question=map(attrlist, "Question") %>% map(NULL_to_NA) %>% unlist(use.names = FALSE), 30 | Description=map_chr(attrlist, "label") %>% unname(), 31 | VarType=map(anes_2020, class) ,) %>% 32 | rowwise() %>% 33 | mutate( 34 | VarClass=str_c(VarType, collapse=", "), 35 | VarType=case_when( 36 | Vars%in%c("V200001", "CaseID")~ list("ID"), 37 | Vars=="V201507x"~list("numeric2"), 38 | TRUE~list(VarType) 39 | ) 40 | ) %>% 41 | ungroup() 42 | 43 | cb_count <- function(dat, var){ 44 | t <- dat %>% 45 | count(.data[[var]]) %>% 46 | mutate(`Unweighted Freq` = n / sum(n)) %>% 47 | janitor::adorn_totals(where="row", fill="-", na.rm=TRUE, name="Total", n, `Unweighted Freq`) %>% 48 | mutate(`Unweighted Freq`= round(`Unweighted Freq`, 3)) %>% 49 | kbl(position="H") 50 | 51 | if (knitr:::is_html_output()){ 52 | t %>% kable_minimal() %>% print() 53 | } else{ 54 | t %>% print() 55 | } 56 | } 57 | 58 | cb_count_labelled <- function(dat, var){ 59 | dat2 <- dat %>% 60 | mutate( 61 | Label=as_factor(.data[[var]], levels="labels"), 62 | ) %>% haven::zap_labels() 63 | 64 | 65 | t <- dat2 %>% 66 | count(.data[[var]], Label) %>% 67 | mutate(`Unweighted Freq` = n / sum(n)) %>% 68 | janitor::adorn_totals(where="row", fill="-", na.rm=TRUE, name="Total", n, `Unweighted Freq`) %>% 69 | mutate(`Unweighted Freq`= round(`Unweighted Freq`, 3)) %>% 70 | kbl(position="H") 71 | 72 | if (knitr:::is_html_output()){ 73 | t %>% kable_minimal() %>% print() 74 | } else{ 75 | t %>% print() 76 | } 77 | } 78 | 79 | 80 | cb_continuous <- function(dat, var){ 81 | t <- dat %>% 82 | summarize( 83 | `N Missing`=sum(is.na(.data[[var]])), 84 | Minimum = min(.data[[var]], na.rm = TRUE), 85 | Median = median(.data[[var]], na.rm = TRUE), 86 | Maximum = max(.data[[var]], na.rm = TRUE)) %>% 87 | kbl(position="H") 88 | 89 | if (knitr:::is_html_output()){ 90 | t %>% kable_minimal() %>% print() 91 | } else{ 92 | t %>% print() 93 | } 94 | 95 | } 96 | 97 | cb_continuous_spec <- function(dat, var){ 98 | dat2 <- dat %>% 99 | haven::zap_labels() 100 | 101 | t_valid <- dat2 %>% 102 | filter(.data[[var]]>0) %>% 103 | summarize( 104 | `N Missing`=sum(is.na(.data[[var]])), 105 | Minimum = min(.data[[var]], na.rm = TRUE), 106 | Median = median(.data[[var]], na.rm = TRUE), 107 | Maximum = max(.data[[var]], na.rm = TRUE)) 108 | 109 | t_ref <- dat2 %>% 110 | filter(.data[[var]]==-9) %>% 111 | count(name="N Refused (-9)") 112 | 113 | t <- t_ref %>% bind_cols(t_valid) %>% 114 | select(`N Missing`, `N Refused (-9)`, everything()) %>% 115 | kbl(position="H") 116 | 117 | if (knitr:::is_html_output()){ 118 | t %>% kable_minimal() %>% print() 119 | } else{ 120 | t %>% print() 121 | } 122 | 123 | } 124 | 125 | 126 | make_section <- function(sec){ 127 | cat(str_c("## ", sec, 
"\n\n")) 128 | 129 | make_sum <- function(var){ 130 | cat(str_c("#### ", var, " {-} \n\n")) 131 | vi <- anes_var_info %>% filter(Vars==var) 132 | de <- vi %>% pull(Description) 133 | cat(str_c("Description: ", de, "\n\n")) 134 | qt <- vi %>% pull(Question) 135 | if (!is.na(qt)) cat(str_c("Question text: ", qt, "\n\n")) 136 | vc <- vi %>% pull(VarClass) 137 | cat(str_c("Variable class: ", vc, "\n\n")) 138 | vt <- vi %>% pull(VarType) %>% unlist() 139 | 140 | if (any(c("factor", "character", "logical") %in% vt)){ 141 | anes_2020 %>% cb_count(var) 142 | cat("\n") 143 | } else if ("haven_labelled" %in% vt){ 144 | anes_2020 %>% cb_count_labelled(var) 145 | cat("\n") 146 | } else if ("numeric" %in% vt){ 147 | anes_2020 %>% cb_continuous(var) 148 | cat("\n") 149 | } else if ("numeric2" %in% vt){ 150 | anes_2020 %>% cb_continuous_spec(var) 151 | cat("\n") 152 | } 153 | 154 | } 155 | 156 | anes_var_info %>% filter(Section==sec) %>% pull(Vars) %>% 157 | walk(make_sum) 158 | } 159 | 160 | ``` 161 | 162 | The full codebook with the original variables is available at @anes-cb. 163 | 164 | This is a codebook for the ANES data used in this book (`anes_2020`) from the {srvyrexploR} package. 165 | 166 | 167 | ```{r} 168 | #| label: anes-cb-write 169 | #| echo: FALSE 170 | #| results: asis 171 | anes_var_info %>% 172 | distinct(Section) %>% 173 | pull(Section) %>% 174 | walk(make_section) 175 | ``` 176 | -------------------------------------------------------------------------------- /92-AppendixC-RECS-CB-latex.Rmd: -------------------------------------------------------------------------------- 1 | # RECS derived variable codebook {#recs-cb} 2 | 3 | \index{Residential Energy Consumption Survey (RECS)|(} 4 | The full codebook with the original variables is available at [https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata](https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata) - "Variable and Response Codebook." 5 | 6 | The RECS codebook for the data used in this book (`recs_2020` from {srvyrexploR}) is available in the online version of the book at [https://tidy-survey-r.github.io/tidy-survey-book/recs-cb.html](https://tidy-survey-r.github.io/tidy-survey-book/recs-cb.html). 7 | \index{Residential Energy Consumption Survey (RECS)|)} -------------------------------------------------------------------------------- /92-AppendixC-RECS-CB.Rmd: -------------------------------------------------------------------------------- 1 | # RECS derived variable codebook {#recs-cb} 2 | 3 | ```{r} 4 | #| label: recs-cb-cb-setup 5 | #| echo: FALSE 6 | #| error: FALSE 7 | #| warning: FALSE 8 | #| message: FALSE 9 | library(tidyverse) 10 | library(janitor) 11 | library(kableExtra) 12 | library(knitr) 13 | 14 | data(recs_2020) 15 | ``` 16 | 17 | The full codebook with the original variables is available at [https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata](https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata) - "Variable and Response Codebook." 18 | 19 | This is a codebook for the RECS data used in this book (`recs_2020`) from the {srvyrexploR} package. 
20 | 21 | 22 | ```{r} 23 | #| label: recs-cb-prep 24 | #| echo: FALSE 25 | 26 | attrlist <- map(recs_2020, attributes) 27 | 28 | recs_var_info <- tibble( 29 | Vars=names(attrlist), 30 | Section=map_chr(attrlist, "Section") %>% unname(), 31 | Question=map(attrlist, "Question") %>% map(NULL_to_NA) %>% unlist(use.names = FALSE), 32 | Description=map_chr(attrlist, "label") %>% unname(), 33 | VarType=map(recs_2020, class) , 34 | ) %>% 35 | mutate( 36 | VarType=if_else(Vars=="DOEID", list("ID"), VarType) 37 | ) 38 | 39 | 40 | cb_count <- function(dat, var){ 41 | t <- dat %>% 42 | count(.data[[var]]) %>% 43 | mutate(`Unweighted Freq` = n / sum(n)) %>% 44 | janitor::adorn_totals(where="row", fill="-", na.rm=TRUE, name="Total", n, `Unweighted Freq`) %>% 45 | mutate(`Unweighted Freq`= round(`Unweighted Freq`, 3)) %>% 46 | kbl(position="H") 47 | 48 | if (knitr:::is_html_output()){ 49 | t %>% kable_minimal() %>% print() 50 | } else{ 51 | t %>% print() 52 | } 53 | } 54 | 55 | cb_continuous <- function(dat, var){ 56 | t <- dat %>% 57 | summarize( 58 | `N Missing`=sum(is.na(.data[[var]])), 59 | Minimum = min(.data[[var]], na.rm = TRUE), 60 | Median = median(.data[[var]], na.rm = TRUE), 61 | Maximum = max(.data[[var]], na.rm = TRUE)) %>% 62 | kbl(position="H") 63 | 64 | if (knitr:::is_html_output()){ 65 | t %>% kable_minimal() %>% print() 66 | } else{ 67 | t %>% print() 68 | } 69 | 70 | } 71 | 72 | 73 | make_section <- function(sec){ 74 | cat(str_c("## ", sec, "\n\n")) 75 | 76 | make_sum <- function(var){ 77 | cat(str_c("#### ", var, " {-} \n\n")) 78 | vi <- recs_var_info %>% filter(Vars==var) 79 | de <- vi %>% pull(Description) 80 | cat(str_c("Description: ", de, "\n\n")) 81 | qt <- vi %>% pull(Question) 82 | if (!is.na(qt)) cat(str_c("Question text: ", qt, "\n\n")) 83 | vt <- vi %>% pull(VarType) %>% unlist() 84 | 85 | if (any(c("factor", "character", "logical") %in% vt)){ 86 | recs_2020 %>% cb_count(var) 87 | cat("\n") 88 | } else if ("numeric" %in% vt){ 89 | recs_2020 %>% cb_continuous(var) 90 | cat("\n") 91 | } 92 | 93 | } 94 | 95 | recs_var_info %>% filter(Section==sec) %>% pull(Vars) %>% 96 | walk(make_sum) 97 | } 98 | 99 | 100 | 101 | 102 | ``` 103 | 104 | 105 | ```{r} 106 | #| label: recs-cb-write 107 | #| echo: FALSE 108 | #| results: asis 109 | recs_var_info %>% 110 | distinct(Section) %>% 111 | pull(Section) %>% 112 | walk(make_section) 113 | ``` 114 | 115 | -------------------------------------------------------------------------------- /93-AppendixD-Solutions-latex.Rmd: -------------------------------------------------------------------------------- 1 | # Exercise solutions {#exercise-solutions} 2 | 3 | Exercise solutions are available in the online version of the book at https://tidy-survey-r.github.io/tidy-survey-book/exercise-solutions.html. -------------------------------------------------------------------------------- /94-AppendixE-Corrections-Remarks.Rmd: -------------------------------------------------------------------------------- 1 | # Corrections & Remarks {#corrections-remarks} 2 | 3 | These errata are in the print version. They have been corrected in the online version. 4 | 5 | ## Chapter 2 6 | 7 | The word **broad** not **board** should be in the following sentence. "If we are looking at more **broad** populations of interest, like all adults in the United States, the sampling frame is likely imperfect." 8 | 9 | ## Appendix A.3 {-} 10 | 11 | Stata files have the extension of `.dta`. 
There are a few instances of using the function `read_dat()` instead of `read_dta()` on Page 315. -------------------------------------------------------------------------------- /99-references.Rmd: -------------------------------------------------------------------------------- 1 | `r if (knitr:::is_html_output()) '# References {-}'` 2 | 3 | ```{r} 4 | #| label: refs-package-list 5 | #| include: FALSE 6 | # generate a BibTeX database automatically for some R packages 7 | library(targets) 8 | library(visdat) 9 | library(readxl) 10 | library(httr2) 11 | library(jsonlite) 12 | library(tidycensus) 13 | library(pak) 14 | 15 | 16 | our_write_bib <- function (x = .packages(), file = "", tweak = TRUE, width = NULL, 17 | prefix = getOption("knitr.bib.prefix", "R-"), lib.loc = NULL) 18 | { 19 | .this.year = sprintf(' year = {%s},', format(Sys.Date(), '%Y')) 20 | system.file = function(...) base::system.file(..., lib.loc = lib.loc) 21 | citation = function(...) utils::citation(..., lib.loc = lib.loc) 22 | x = x[nzchar(x)] 23 | idx = mapply(system.file, package = x) == "" 24 | if (any(idx)) { 25 | warning("package(s) ", paste(x[idx], collapse = ", "), 26 | " not found") 27 | x = x[!idx] 28 | } 29 | x = setdiff(x, setdiff(xfun::base_pkgs(), "base")) 30 | x = sort(x) 31 | bib = sapply(x, function(pkg) { 32 | cite = citation(pkg, auto = if (pkg != "base") { 33 | meta = packageDescription(pkg, lib.loc = lib.loc) 34 | if (identical(meta$Repository, "CRAN") && !is.null(meta$URL)) { 35 | if (!grepl("[, ]", meta$URL)) 36 | meta$Repository = NULL 37 | } 38 | meta 39 | }) 40 | 41 | 42 | if (tweak) { 43 | cite$title = gsub(sprintf("^(%s: )(\\1)", pkg), 44 | "\\1", cite$title) 45 | cite$title = gsub(pkg, paste0("{", pkg, "}"), cite$title) 46 | cite$title = gsub("\\b(R)\\b", "{R}", cite$title) 47 | cite$title = gsub("\\b(ggplot2)\\b", "{ggplot2}", cite$title) 48 | cite$title = gsub("\\b(dplyr)\\b", "{dplyr}", cite$title) 49 | cite$title = gsub("\\b(tidyverse)\\b", "{tidyverse}", cite$title) 50 | cite$title = gsub("\\b(sf)\\b", "{sf}", cite$title) 51 | cite$title = gsub(" & ", " \\\\& ", cite$title) 52 | } 53 | entry = toBibtex(cite) 54 | entry[1] = sub("\\{,$", sprintf("{%s%s,", prefix, pkg), 55 | entry[1]) 56 | entry 57 | }, simplify = FALSE) 58 | if (tweak) { 59 | for (i in intersect(names(knitr:::.tweak.bib), x)) { 60 | message("tweaking ", i) 61 | bib[[i]] = merge_list(bib[[i]], knitr:::.tweak.bib[[i]]) 62 | } 63 | bib = lapply(bib, function(b) { 64 | b["author"] = sub("Duncan Temple Lang", "Duncan {Temple Lang}", 65 | b["author"]) 66 | # b["title"] = gsub("(^|\\W)'([^']+)'(\\W|$)", "\\1\\2\\3", 67 | # b["title"]) 68 | if (!is.na(b["note"])) 69 | b["note"] = gsub("(^.*?https?://.*?),\\s+https?://.*?(},\\s*)$", 70 | "\\1\\2", b["note"]) 71 | if (!("year" %in% names(b))) 72 | b["year"] = .this.year 73 | b 74 | }) 75 | } 76 | bib2 = lapply(x, function(pkg) { 77 | if (pkg == "base") 78 | return() 79 | if (system.file("CITATION", package = pkg) == "") 80 | return() 81 | cites = citation(pkg, auto = FALSE) 82 | cites = Filter(x = cites, function(cite) { 83 | !isTRUE(grepl("R package version", cite$note)) 84 | }) 85 | s = knitr:::make_unique(unlist(lapply(cites, function(cite) { 86 | if (is.null(cite$year)) 87 | format(Sys.Date(), "%Y") 88 | else cite$year 89 | }))) 90 | mapply(cites, s, FUN = function(cite, suffix) { 91 | if (isTRUE(grepl("R package version", cite$note))) 92 | return() 93 | entry = toBibtex(cite) 94 | entry[1] = sub("\\{,$", sprintf("{%s%s,", pkg, suffix), 95 | entry[1]) 96 | entry 97 | }, SIMPLIFY 
= FALSE) 98 | }) 99 | bib = c(bib, unlist(bib2, recursive = FALSE)) 100 | bib = lapply(bib, function(b) { 101 | idx = which(names(b) == "") 102 | if (!is.null(width)) 103 | b[-idx] = str_wrap(b[-idx], width, 2, 4) 104 | structure(c(b[idx[1L]], b[-idx], b[idx[2L]], ""), class = "Bibtex") 105 | }) 106 | if (!is.null(file) && length(x)) 107 | xfun::write_utf8(unlist(bib), file) 108 | invisible(bib) 109 | } 110 | 111 | 112 | 113 | our_write_bib(c( 114 | .packages(), 'bookdown', 'knitr', 'rmarkdown', 'renv', 'here' 115 | ), 'packages.bib') 116 | ``` 117 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-NoDerivatives 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 60 | 61 | Section 1 – Definitions. 62 | 63 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 64 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 65 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 66 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 67 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 68 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 69 | Licensor means the individual(s) or entity(ies) granting rights under this Public License. 70 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 
71 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 72 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 73 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 74 | 75 | Section 2 – Scope. 76 | 77 | License grant. 78 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 79 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 80 | produce and reproduce, but not Share, Adapted Material for NonCommercial purposes only. 81 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 82 | Term. The term of this Public License is specified in Section 6(a). 83 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 84 | Downstream recipients. 85 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 86 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 87 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 88 | 89 | Other rights. 90 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 
91 | Patent and trademark rights are not licensed under this Public License. 92 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 93 | 94 | Section 3 – License Conditions. 95 | 96 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 97 | 98 | Attribution. 99 | 100 | If You Share the Licensed Material, You must: 101 | retain the following if it is supplied by the Licensor with the Licensed Material: 102 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 103 | a copyright notice; 104 | a notice that refers to this Public License; 105 | a notice that refers to the disclaimer of warranties; 106 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 107 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 108 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 109 | For the avoidance of doubt, You do not have permission under this Public License to Share Adapted Material. 110 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 111 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 112 | 113 | Section 4 – Sui Generis Database Rights. 114 | 115 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 116 | 117 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only and provided You do not Share Adapted Material; 118 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and 119 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 120 | 121 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 122 | 123 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 124 | 125 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. 
This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 126 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 127 | 128 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 129 | 130 | Section 6 – Term and Termination. 131 | 132 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 133 | 134 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 135 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 136 | upon express reinstatement by the Licensor. 137 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 138 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 139 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 140 | 141 | Section 7 – Other Terms and Conditions. 142 | 143 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 144 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 145 | 146 | Section 8 – Interpretation. 147 | 148 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 149 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 150 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 
151 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 152 | 153 | 154 | 155 | ======================================================================= 156 | 157 | Creative Commons is not a party to its public 158 | licenses. Notwithstanding, Creative Commons may elect to apply one of 159 | its public licenses to material it publishes and in those instances 160 | will be considered the “Licensor.” The text of the Creative Commons 161 | public licenses is dedicated to the public domain under the CC0 Public 162 | Domain Dedication. Except for the limited purpose of indicating that 163 | material is shared under a Creative Commons public license or as 164 | otherwise permitted by the Creative Commons policies published at 165 | creativecommons.org/policies, Creative Commons does not authorize the 166 | use of the trademark "Creative Commons" or any other trademark or logo 167 | of Creative Commons without its prior written consent including, 168 | without limitation, in connection with any unauthorized modifications 169 | to any of its public licenses or any other arrangements, 170 | understandings, or agreements concerning use of licensed material. For 171 | the avoidance of doubt, this paragraph does not form part of the 172 | public licenses. 173 | 174 | Creative Commons may be contacted at creativecommons.org. 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Exploring Complex Survey Data Analysis Using R: A Tidy Introduction with {srvyr} and {survey} 2 | 3 | Surveys are a powerful tool for gathering information, drawing insights, and driving decisions. However, they require specific analysis methods to ensure the correct interpretation of results. This book will provide an in-depth introduction to conducting survey analysis with the {srvyr} package and {tidyverse} family of functions. Readers will understand how and why to use survey analysis techniques as well as how to interpret and communicate results. 4 | 5 | ## Chapters 6 | 7 |
    8 |
  1. Introduction 9 |
      10 |
    1. Introduction 11 |
    2. Overview of surveys 12 |
    3. Survey data documentation 13 |
  2. Analysis 14 | 15 |
      16 |
    1. Getting started 17 |
    2. Descriptive analyses 18 |
    3. Statistical testing 19 |
    4. Modeling 20 |
  3. Reporting 21 | 22 |
      23 |
    1. Communication of results 24 |
    2. Reproducible research 25 |
  4. Real-life data 26 | 27 |
      28 |
    1. Sample designs and replicate weights 29 |
    2. Missing data 30 |
    3. Successful survey analysis recommendations 31 |
  5. Vignettes 32 | 33 |
      34 |
    1. National Crime Victimization Survey vignette 35 |
    2. AmericasBarometer vignette 36 | 37 |
38 | 39 | ## Appendices 40 | 41 |
    42 |
  1. Importing survey data into R 43 |
  2. ANES derived variable codebook 44 |
  3. RECS derived variable codebook 45 |
  4. Exercise solutions 46 |
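
## A minimal srvyr example

As a quick taste of the workflow the book teaches, the sketch below declares a survey design object once and then analyzes it with dplyr-style verbs. It assumes a hypothetical data frame `survey_data` with columns `psu`, `stratum`, `weight`, and `age`; these are illustrative placeholders, not a dataset shipped with this repository.

```r
library(dplyr)
library(srvyr)

# NOTE: `survey_data` and its columns are hypothetical placeholders.
# Declare the design once: ids, strata, and weights describe how the
# sample was drawn.
design <- survey_data %>%
  as_survey_design(
    ids = psu,        # primary sampling units
    strata = stratum, # sampling strata
    weights = weight, # analysis weights
    nest = TRUE       # treat PSU ids as nested within strata
  )

# Analyze with dplyr-style verbs; variance estimation accounts for
# the design information declared above.
design %>%
  summarize(mean_age = survey_mean(age, vartype = "ci"))
```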
47 | -------------------------------------------------------------------------------- /_bookdown.yml: -------------------------------------------------------------------------------- 1 | book_filename: bookdown 2 | clean: [packages.bib, bookdown.bbl] 3 | delete_merged_file: true 4 | rmd_files: 5 | html: ['index.Rmd', '01-introduction.Rmd', '02-overview-surveys.Rmd', '03-survey-data-documentation.Rmd', '04-set-up.Rmd', '05-descriptive-analysis.Rmd', '06-statistical-testing.Rmd', '07-modeling.Rmd', '08-communicating-results.Rmd', '09-reproducible-data.Rmd', '10-sample-designs-replicate-weights.Rmd', '11-missing-data.Rmd', '12-successful-survey-data-analysis.Rmd', '13-ncvs-vignette.Rmd', '14-ambarom-vignette.Rmd', '90-AppendixA-DataImport.Rmd', '91-AppendixB-ANES-CB.Rmd', '92-AppendixC-RECS-CB.Rmd', '93-AppendixD-Solutions.Rmd', '94-AppendixE-Corrections-Remarks.Rmd', '99-references.Rmd'] 6 | latex: ['index.Rmd', '01-introduction.Rmd', '02-overview-surveys.Rmd', '03-survey-data-documentation.Rmd', '04-set-up.Rmd', '05-descriptive-analysis.Rmd', '06-statistical-testing.Rmd', '07-modeling.Rmd', '08-communicating-results.Rmd', '09-reproducible-data.Rmd', '10-sample-designs-replicate-weights.Rmd', '11-missing-data.Rmd', '12-successful-survey-data-analysis.Rmd', '13-ncvs-vignette.Rmd', '14-ambarom-vignette.Rmd', '90-AppendixA-DataImport.Rmd', '91-AppendixB-ANES-CB-latex.Rmd', '92-AppendixC-RECS-CB-latex.Rmd', '93-AppendixD-Solutions-latex.Rmd', '99-references.Rmd'] 7 | language: 8 | label: 9 | fig: "FIGURE " 10 | tab: "TABLE " 11 | ui: 12 | edit: "Edit" 13 | chapter_name: "Chapter " 14 | -------------------------------------------------------------------------------- /_output.yml: -------------------------------------------------------------------------------- 1 | bookdown::gitbook: 2 | css: css/style.css 3 | highlight: kate 4 | includes: 5 | in_header: plausible.html 6 | before_body: assets/hero-image.html 7 | config: 8 | toc: 9 | collapse: section 10 | before: | 11 |
        <li>Exploring Complex Survey Data Analysis Using R</li> 12 | 13 | after: | 14 |
        <li>Published with bookdown</li> 15 | download: no 16 | fontsettings: no 17 | view: https://github.com/tidy-survey-r/tidy-survey-book/blob/main/%s 18 | sharing: 19 | facebook: false 20 | github: true 21 | twitter: true 22 | linkedin: true 23 | weibo: false 24 | instapaper: false 25 | vk: false 26 | whatsapp: false 27 | all: ['facebook', 'twitter', 'linkedin', 'github'] 28 | bookdown::pdf_book: 29 | includes: 30 | in_header: latex/preamble.tex 31 | before_body: latex/before_body_ded.tex 32 | after_body: latex/after_body.tex 33 | keep_tex: true 34 | dev: "cairo_pdf" 35 | latex_engine: xelatex 36 | citation_package: natbib 37 | template: null 38 | pandoc_args: --top-level-division=chapter 39 | toc_depth: 2 40 | toc_unnumbered: false 41 | toc_appendix: true 42 | quote_footer: ["\\VA{", "}{}"] 43 | highlight_bw: true -------------------------------------------------------------------------------- /assets/hero-image.html: 1 | 2 |
    3 |

    4 | Join us at useR! on August 8, 2025, for our workshop: Complex Survey Data Analysis: A Tidy Introduction with {srvyr} and {survey}. 5 | Register here! 6 |

    7 | 8 |
    -------------------------------------------------------------------------------- /book.bib: -------------------------------------------------------------------------------- 1 | @book{xie2015, 2 | title = {Dynamic Documents with {R} and knitr}, 3 | author = {Yihui Xie}, 4 | year = 2015, 5 | publisher = {Chapman and Hall/CRC}, 6 | address = {Boca Raton, Florida}, 7 | note = {ISBN 978-1498716963}, 8 | howpublished = {\url{http://yihui.name/knitr/}}, 9 | edition = {2nd} 10 | } 11 | @book{lohr2021sampling, 12 | title = {Sampling: design and analysis}, 13 | author = {Lohr, Sharon L}, 14 | year = 2021, 15 | publisher = {Chapman and Hall/CRC} 16 | } 17 | @book{cox2011business, 18 | title = {Business survey methods}, 19 | author = {Cox, Brenda G and Binder, David A and Chinnappa, B Nanjamma and Christianson, Anders and Colledge, Michael J and Kott, Phillip S}, 20 | year = 2011, 21 | publisher = {John Wiley \& Sons} 22 | } 23 | @book{wolter2007introduction, 24 | title = {Introduction to variance estimation}, 25 | author = {Wolter, Kirk M}, 26 | year = 2007, 27 | publisher = {Springer}, 28 | volume = 53 29 | } 30 | @book{cochran1977sampling, 31 | title = {Sampling techniques}, 32 | author = {Cochran, William G}, 33 | year = 1977, 34 | publisher = {John Wiley \& Sons} 35 | } 36 | @book{valliant2013practical, 37 | title = {Practical tools for designing and weighting survey samples}, 38 | author = {Valliant, Richard and Dever, Jill A and Kreuter, Frauke}, 39 | year = 2013, 40 | publisher = {Springer}, 41 | volume = 1 42 | } 43 | @book{levy2013sampling, 44 | title = {Sampling of populations: methods and applications}, 45 | author = {Levy, Paul S and Lemeshow, Stanley}, 46 | year = 2013, 47 | publisher = {John Wiley \& Sons} 48 | } 49 | @book{sarndal2003model, 50 | title = {Model assisted survey sampling}, 51 | author = {S{\"a}rndal, Carl-Erik and Swensson, Bengt and Wretman, Jan}, 52 | year = 2003, 53 | publisher = {Springer Science \& Business Media} 54 | } 55 | @book{fuller2011sampling, 56 | title = {Sampling statistics}, 57 | author = {Fuller, Wayne A}, 58 | year = 2011, 59 | publisher = {John Wiley \& Sons} 60 | } 61 | @misc{lapopdat, 62 | title = {The {A}mericas{B}arometer by the {LAPOP} Lab}, 63 | year = 2023, 64 | howpublished = {\url{www.vanderbilt.edu/lapop}} 65 | } 66 | @misc{lapop, 67 | title = {The {A}mericas{B}arometer by the {LAPOP} Lab}, 68 | author = {{LAPOP}}, 69 | year = 2023, 70 | howpublished = {\url{www.vanderbilt.edu/lapop}} 71 | } 72 | @misc{lapop-about, 73 | title = {About the {A}mericas{B}arometer}, 74 | author = {{LAPOP}}, 75 | year = 2023, 76 | howpublished = {\url{https://www.vanderbilt.edu/lapop/about-americasbarometer.php}} 77 | } 78 | @techreport{lapop-tech, 79 | title = {Americas{B}arometer 2021: Technical Information}, 80 | author = {{LAPOP}}, 81 | year = 2021, 82 | howpublished = {\url{https://www.vanderbilt.edu/lapop/ab2021/AB2021-Technical-Report-v1.0-FINAL-eng-030722.pdf}}, 83 | institution = {Vanderbilt University} 84 | } 85 | @techreport{lapop-can, 86 | title = {Americas{B}arometer 2021 - {C}anada: Technical Information}, 87 | author = {{LAPOP}}, 88 | year = 2021, 89 | howpublished = {\url{http://datasets.americasbarometer.org/database/files/ABCAN2021-Technical-Report-v1.0-FINAL-eng-110921.pdf}}, 90 | institution = {Vanderbilt University} 91 | } 92 | @techreport{lapop-usa, 93 | title = {Americas{B}arometer 2021 - {U.S.}: Technical Information}, 94 | author = {{LAPOP}}, 95 | year = 2021, 96 | howpublished = 
{\url{http://datasets.americasbarometer.org/database/files/ABUSA2021-Technical-Report-v1.0-FINAL-eng-110921.pdf}}, 97 | institution = {Vanderbilt University} 98 | } 99 | @misc{lapop-svy, 100 | title = {Core Questionnaire}, 101 | author = {{LAPOP}}, 102 | year = 2021, 103 | howpublished = {\url{https://www.vanderbilt.edu/lapop/ab2021/AB2021-Core-Questionnaire-v17.5-Eng-210514-W-v2.pdf}} 104 | } 105 | @book{deming1991sample, 106 | title = {Sample design in business research}, 107 | author = {Deming, W Edwards}, 108 | year = 1991, 109 | publisher = {John Wiley \& Sons}, 110 | volume = 23 111 | } 112 | @misc{ncvs_data_2021, 113 | title = {National {Crime} {Victimization} {Survey}, [{United} {States}], 2021}, 114 | author = {{U.S. Bureau of Justice Statistics}}, 115 | year = 2022, 116 | publisher = {Inter-university Consortium for Political and Social Research [distributor]}, 117 | doi = {10.3886/ICPSR38429.v1}, 118 | note = {Type: dataset}, 119 | howpublished = {\url{https://www.icpsr.umich.edu/web/NACJD/studies/38429}} 120 | } 121 | @misc{ncvs_user_guide, 122 | title = {Users' guide to the {National} {Crime} {Victimization} {Survey} ({NCVS}) direct variance estimation}, 123 | author = {{Shook-Sa}, Bonnie and Couzens, G. Lance and Berzofsky, Marcus}, 124 | year = 2015, 125 | publisher = {U. S. Bureau of Justice Statistics}, 126 | howpublished = {\url{https://bjs.ojp.gov/sites/g/files/xyckuh236/files/media/document/ncvs_variance_user_guide_11.06.14.pdf}} 127 | } 128 | @misc{ncvs_tech_2016, 129 | title = {National {Crime} {Victimization} {Survey}, 2016: {Technical} {Documentation}}, 130 | author = {{U. S. Bureau of Justice Statistics}}, 131 | year = 2017, 132 | month = dec, 133 | howpublished = {\url{https://bjs.ojp.gov/sites/g/files/xyckuh236/files/media/document/ncvstd16.pdf}} 134 | } 135 | @misc{ncvs_survey_2020, 136 | title = {National {Crime} {Victimization} {Survey} NCVS-2 CRIME INCIDENT REPORT}, 137 | author = {{U. S. Bureau of Justice Statistics}}, 138 | year = 2020, 139 | howpublished = {\url{https://bjs.ojp.gov/content/pub/pdf/ncvs20_cir.pdf}} 140 | } 141 | @misc{ncvs_cb_2020, 142 | title = {{National Crime Victimization Survey}, [{United States}], 2021}, 143 | author = {{U. S. Bureau of Justice Statistics}}, 144 | year = 2022, 145 | howpublished = {\url{https://www.icpsr.umich.edu/web/NACJD/studies/38429/datadocumentation}}, 146 | note = {Download - DS0 Study-Level Files - Codebook [{PDF}]} 147 | } 148 | @article{gelman2007weights, 149 | title = {{Struggles with Survey Weighting and Regression Modeling}}, 150 | author = {Andrew Gelman}, 151 | year = 2007, 152 | journal = {Statistical Science}, 153 | publisher = {Institute of Mathematical Statistics}, 154 | volume = 22, 155 | number = 2, 156 | pages = {153--164}, 157 | doi = {10.1214/088342306000000691}, 158 | howpublished = {\url{https://doi.org/10.1214/088342306000000691}}, 159 | keywords = {multilevel modeling, poststratification, sampling weights, shrinkage} 160 | } 161 | @article{gard2023weightsdef, 162 | title = {Why weight? Analytic approaches for large-scale population neuroscience data}, 163 | journal = {Developmental Cognitive Neuroscience}, 164 | volume = {59}, 165 | pages = {101196}, 166 | year = {2023}, 167 | issn = {1878-9293}, 168 | doi = {https://doi.org/10.1016/j.dcn.2023.101196}, 169 | author = {Arianna M. Gard and Luke W. Hyde and Steven G. Heeringa and Brady T. 
West and Colter Mitchell}, 170 | howpublished = {\url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9843279/}} 171 | } 172 | @article{bollen2016weightsreg, 173 | title = {Are Survey Weights Needed? A Review of Diagnostic Tests in Regression Analysis}, 174 | author = {Bollen, Kenneth A. and Biemer, Paul P. and Karr, Alan F. and Tueller, Stephen and Berzofsky, Marcus E.}, 175 | year = 2016, 176 | journal = {Annual Review of Statistics and Its Application}, 177 | volume = 3, 178 | number = 1, 179 | pages = {375--392}, 180 | doi = {10.1146/annurev-statistics-011516-012958}, 181 | howpublished = {\url{https://doi.org/10.1146/annurev-statistics-011516-012958}} 182 | } 183 | @book{lumley2010complex, 184 | title = {Complex surveys: a guide to analysis Using {R}}, 185 | author = {Thomas Lumley}, 186 | year = 2010, 187 | publisher = {John Wiley \& Sons} 188 | } 189 | @article{hansen1987, 190 | title = {Some history and reminiscences on survey sampling}, 191 | author = {Hansen, Morris H}, 192 | year = 1987, 193 | journal = {Statistical Science}, 194 | publisher = {Institute of Mathematical Statistics}, 195 | volume = 2, 196 | number = 2, 197 | pages = {180--190} 198 | } 199 | @misc{anes-svy, 200 | title = {ANES 2020 Time Series Study: Pre-Election and Post-Election Survey Questionnaires}, 201 | author = {{American National Election Studies}}, 202 | year = 2021, 203 | howpublished = {\url{https://electionstudies.org/wp-content/uploads/2021/07/anes_timeseries_2020_questionnaire_20210719.pdf}} 204 | } 205 | @misc{anes-cb, 206 | title = {ANES 2020 Time Series Study Full Release: User Guide and Codebook}, 207 | author = {{American National Election Studies}}, 208 | year = 2022, 209 | howpublished = {\url{https://electionstudies.org/wp-content/uploads/2022/02/anes_timeseries_2020_userguidecodebook_20220210.pdf}} 210 | } 211 | @misc{brfss-svy, 212 | title = {Behavioral Risk Factor Surveillance System Survey Questionnaire}, 213 | author = {{Centers for Disease Control and Prevention (CDC)}}, 214 | year = 2021, 215 | howpublished = {\url{https://www.cdc.gov/brfss/questionnaires/pdf-ques/2021-BRFSS-Questionnaire-1-19-2022-508.pdf}}, 216 | institution = {U.S. 
Department of Health and Human Services, Centers for Disease Control and Prevention} 217 | } 218 | @book{allison, 219 | title = {Missing Data}, 220 | author = {Allison, Paul}, 221 | year = 2002, 222 | publisher = {SAGE Publications}, 223 | doi = {10.4135/9781412985079} 224 | } 225 | @inbook{mack, 226 | title = {Types of Missing Data}, 227 | author = {Mack, Christina and Su, Zhaohui and Westreich, Daniel}, 228 | year = 2018, 229 | booktitle = {Managing Missing Data in Patient Registries: Addendum to Registries for Evaluating Patient Outcomes: A User’s Guide, Third Edition [Internet]}, 230 | howpublished = {\url{https://www.ncbi.nlm.nih.gov/books/NBK493614/}}, 231 | publisher = {Rockville (MD): Agency for Healthcare Research and Quality (US)} 232 | } 233 | @techreport{debell, 234 | title = {How to Analyze ANES Survey Data}, 235 | author = {DeBell, Matthew}, 236 | year = 2010, 237 | number = {nes012492}, 238 | type = {ANES Technical Report Series}, 239 | howpublished = {\url{https://electionstudies.org/wp-content/uploads/2018/05/HowToAnalyzeANESData.pdf}}, 240 | institution = {Palo Alto, CA: Stanford University and Ann Arbor, MI: the University of Michigan} 241 | } 242 | @article{Schafer2002, 243 | title = {Missing Data: Our View of the State of the Art}, 244 | author = {Joseph L Schafer and John W Graham}, 245 | year = 2002, 246 | journal = {Psychological Methods}, 247 | volume = 7, 248 | pages = {147--177}, 249 | doi = {10.1037//1082-989X.7.2.147}, 250 | issue = 2 251 | } 252 | @article{kruskal1980, 253 | title = {Representative sampling, IV: The history of the concept in statistics, 1895-1939}, 254 | author = {Kruskal, William and Mosteller, Frederick}, 255 | year = 1980, 256 | journal = {International Statistical Review/Revue Internationale de Statistique}, 257 | publisher = {JSTOR}, 258 | pages = {169--195} 259 | } 260 | @inproceedings{shahvaish, 261 | title = {Confidence intervals for quantile estimation from complex survey data}, 262 | author = {Shah, Babubhai V and Vaish, Akhil K}, 263 | year = 2006, 264 | booktitle = {Proceedings of the Section on Survey Research Methods}, 265 | howpublished = {\url{http://www.asasrms.org/Proceedings/y2006/Files/JSM2006-000749.pdf}} 266 | } 267 | @book{wickham2019advanced, 268 | title = {Advanced {R}}, 269 | author = {Wickham, Hadley}, 270 | year = 2019, 271 | publisher = {CRC {P}ress}, 272 | howpublished = {\url{https://adv-r.hadley.nz/}} 273 | } 274 | @book{wickham2023r4ds, 275 | title = {R for Data Science: Import, Tidy, Transform, Visualize, and Model Data}, 276 | author = {Wickham, Hadley and Çetinkaya-Rundel, Mine and Grolemund, Garrett}, 277 | edition = {2nd}, 278 | year = 2023, 279 | publisher = {O'Reilly Media}, 280 | howpublished = {\url{https://r4ds.hadley.nz/}} 281 | } 282 | @misc{R-quarto, 283 | author = {Allaire, J.J. and Teague, Charles and Scheidegger, Carlos and Xie, Yihui and Dervieux, Christophe}, 284 | doi = {10.5281/zenodo.5960048}, 285 | month = feb, 286 | title = {{Quarto}}, 287 | howpublished = {\url{https://github.com/quarto-dev/quarto-cli}}, 288 | version = {1.4}, 289 | note = {Type: Software}, 290 | year = {2024} 291 | } 292 | @misc{acs-pums-2021, 293 | title = {{Understanding and Using the American Community Survey Public Use Microdata Sample Files What Data Users Need to Know}}, 294 | author = {{U.S. Census Bureau}}, 295 | year = 2021, 296 | howpublished = {\url{https://www.census.gov/content/dam/Census/library/publications/2021/acs/acs_pums_handbook_2021.pdf}}, 297 | institution = {U.S. 
Government Printing Office} 298 | } 299 | @misc{recs-2015-micro, 300 | title = {{Residential Energy Consumption Survey (RECS): Using the 2015 microdata file to compute estimates and standard errors (RSEs)}}, 301 | author = {{U.S. Energy Information Administration}}, 302 | year = 2017, 303 | howpublished = {\url{https://www.eia.gov/consumption/residential/data/2015/pdf/microdata_v3.pdf}} 304 | } 305 | @misc{recs-2020-micro, 306 | title = {{2020 Residential Energy Consumption Survey: Using the microdata file to compute estimates and relative standard errors (RSEs)}}, 307 | author = {{U.S. Energy Information Administration}}, 308 | year = 2023, 309 | howpublished = {\url{https://www.eia.gov/consumption/residential/data/2020/pdf/microdata-guide.pdf}} 310 | } 311 | @misc{recs-2020-tech, 312 | title = {{2020 Residential Energy Consumption Survey: Household Characteristics Technical Documentation Summary}}, 313 | author = {{U.S. Energy Information Administration}}, 314 | year = 2023, 315 | howpublished = {\url{https://www.eia.gov/consumption/residential/data/2020/pdf/2020%20RECS_Methodology%20Report.pdf}} 316 | } 317 | @misc{recs-2020-meth, 318 | title = {{2020 Residential Energy Consumption Survey: Consumption and Expenditures Technical Documentation Summary}}, 319 | author = {{U.S. Energy Information Administration}}, 320 | year = 2023, 321 | howpublished = {\url{https://www.eia.gov/consumption/residential/data/2020/pdf/2020%20RECS%20CE%20Methodology_Final.pdf}} 322 | } 323 | @misc{anes-2020-tech, 324 | title = {{Methodology Report for the ANES 2020 Time Series Study}}, 325 | author = {DeBell, Matthew and Amsbary, Michelle and Brader, Ted and Brock, Shelley and Good, Cindy and Kamens, Justin and Maisel, Natalya and Pinto, Sarah}, 326 | year = 2022, 327 | howpublished = {\url{https://electionstudies.org/wp-content/uploads/2022/08/anes_timeseries_2020_methodology_report.pdf}} 328 | } 329 | @misc{acs-5yr-doc, 330 | title = {{American Community Survey 2017-2021 5-Year: PUMS User Guide and Overview}}, 331 | author = {{U.S. Census Bureau}}, 332 | year = 2023, 333 | howpublished = {\url{https://www2.census.gov/programs-surveys/acs/tech_docs/pums/2017_2021ACS_PUMS_User_Guide.pdf}} 334 | } 335 | @article{tse-doc, 336 | title = {Total Survey Error: Design, Implementation, and Evaluation}, 337 | author = {Biemer, Paul P.}, 338 | year = 2010, 339 | month = {01}, 340 | journal = {Public Opinion Quarterly}, 341 | volume = 74, 342 | number = 5, 343 | pages = {817--848}, 344 | doi = {10.1093/poq/nfq058}, 345 | issn = {0033-362X}, 346 | howpublished = {\url{https://doi.org/10.1093/poq/nfq058}}, 347 | eprint = {\url{https://academic.oup.com/poq/article-pdf/74/5/817/5138301/nfq058.pdf}} 348 | } 349 | @book{dillman2014mode, 350 | title = {Internet, phone, mail, and mixed-mode surveys: The tailored design method}, 351 | author = {Dillman, Don A and Smyth, Jolene D and Christian, Leah Melani}, 352 | year = 2014, 353 | publisher = {John Wiley \& Sons} 354 | } 355 | @book{groves2009survey, 356 | title = {Survey methodology}, 357 | author = {Groves, Robert M and Fowler Jr, Floyd J and Couper, Mick P and Lepkowski, James M and Singer, Eleanor and Tourangeau, Roger}, 358 | year = 2009, 359 | publisher = {John Wiley \& Sons} 360 | } 361 | @book{biemer2003survqual, 362 | title = {Introduction to survey quality}, 363 | author = {Biemer, Paul P. 
and Lyberg, Lars E.}, 364 | year = 2003, 365 | publisher = {John Wiley \& Sons} 366 | } 367 | @techreport{harter2016address, 368 | title = {Address-based sampling}, 369 | author = {Harter, Rachel and Battaglia, Michael P and Buskirk, Trent D and Dillman, Don A and English, Ned and Fahimi, Mansour and Frankel, Martin R and Kennel, Timothy and McMichael, Joseph P and McPhee, Cameron Brook and Montaquila, Jill and Yancey, Tracie and Zuckerberg, Andrew L.}, 370 | year = 2016, 371 | howpublished = {\url{https://aapor.org/wp-content/uploads/2022/11/AAPOR_Report_1_7_16_CLEAN-COPY-FINAL-2.pdf}}, 372 | institution = {American Association for Public Opinion Research}, 373 | type = {Task force report} 374 | } 375 | @article{DeLeeuw_2018, 376 | title = {Mixed-Mode: Past, Present, and Future}, 377 | author = {DeLeeuw, Edith D.}, 378 | year = 2018, 379 | month = {Aug.}, 380 | journal = {Survey Research Methods}, 381 | volume = 12, 382 | number = 2, 383 | pages = {75–89}, 384 | doi = {10.18148/srm/2018.v12i2.7402}, 385 | howpublished = {\url{https://ojs.ub.uni-konstanz.de/srm/article/view/7402}} 386 | } 387 | @article{biemer_choiceplus, 388 | title = {{Using Bonus Monetary Incentives to Encourage Web Response in Mixed-Mode Household Surveys}}, 389 | author = {Biemer, Paul P. and Murphy, Joe and Zimmer, Stephanie and Berry, Chip and Deng, Grace and Lewis, Katie}, 390 | year = 2017, 391 | month = {06}, 392 | journal = {Journal of Survey Statistics and Methodology}, 393 | volume = 6, 394 | number = 2, 395 | pages = {240--261}, 396 | doi = {10.1093/jssam/smx015}, 397 | issn = {2325-0984}, 398 | howpublished = {\url{https://doi.org/10.1093/jssam/smx015}}, 399 | eprint = {https://academic.oup.com/jssam/article-pdf/6/2/240/24807375/smx015.pdf} 400 | } 401 | @book{Bradburn2004, 402 | title = {Asking Questions: The Definitive Guide to Questionnaire Design}, 403 | author = {Norman M. Bradburn and Seymour Sudman and Brian Wansink}, 404 | year = 2004, 405 | publisher = {Jossey-Bass}, 406 | edition = {2nd} 407 | } 408 | @book{Fowler1989, 409 | title = {Standardized Survey Interviewing}, 410 | author = {Floyd J Fowler and Thomas W. Mangione}, 411 | year = 1989, 412 | publisher = {SAGE} 413 | } 414 | @book{Kim2021, 415 | title = {Statistical Methods for Handling Incomplete Data}, 416 | author = {Jae Kwang Kim and Jun Shao}, 417 | year = 2021, 418 | publisher = {Chapman \& Hall/CRC Press} 419 | } 420 | @book{Schouten2018, 421 | title = {Adaptive Survey Design}, 422 | author = {Barry Schouten and Andy Peytchev and James Wagner}, 423 | year = 2018, 424 | publisher = {Chapman \& Hall/CRC Press} 425 | } 426 | @book{Tourangeau2000psych, 427 | title = {Psychology of Survey Response}, 428 | author = {Roger Tourangeau and Lance J. Rips and Kenneth Rasinski}, 429 | year = 2000, 430 | publisher = {Cambridge University Press} 431 | } 432 | @article{Tourangeau2004spacing, 433 | title = {Spacing, Position, and Order: Interpretive Heuristics for Visual Features of Survey Questions}, 434 | author = {Roger Tourangeau and Mick P. 
Couper and Frederick Conrad}, 435 | year = 2004, 436 | journal = {Public Opinion Quarterly}, 437 | publisher = {Oxford University Press}, 438 | volume = 68, 439 | pages = {368--393}, 440 | isbn = {0033-362X}, 441 | issn = {0033362X}, 442 | issue = 3, 443 | howpublished = {\url{http://www.jstor.org/stable/3521676 http://www.jstor.org/page/info/about/policies/terms.jsp}} 444 | } 445 | @book{Valliant2018weights, 446 | title = {Survey Weights: A Step-by-step Guide to Calculation}, 447 | author = {Richard Valliant and Jill A. Dever}, 448 | year = 2018, 449 | publisher = {Stata Press} 450 | } 451 | @article{deLeeuw2005, 452 | title = {To Mix or Not to Mix Data Collection Modes in Surveys}, 453 | author = {DeLeeuw, Edith D.}, 454 | year = 2005, 455 | journal = {Journal of Official Statistics}, 456 | volume = 21, 457 | pages = {233--255}, 458 | issue = 2 459 | } 460 | @inbook{Skinner2009, 461 | title = {Chapter 15: Statistical Disclosure Control for Survey Data}, 462 | author = {Chris Skinner}, 463 | year = 2009, 464 | booktitle = {Handbook of Statistics: Sample Surveys: Design, Methods and Applications}, 465 | publisher = {Elsevier B.V.}, 466 | pages = {381--396}, 467 | editor = {C.R. Rao} 468 | } 469 | @misc{pennstate506, 470 | title = {STAT 506: Sampling Theory and Methods [Online Course]}, 471 | author = {{Penn State}}, 472 | year = 2019, 473 | howpublished = {\url{https://online.stat.psu.edu/stat506/}} 474 | } 475 | @inproceedings{Scott2007, 476 | title = {Rao-Scott corrections and their impact}, 477 | author = {Alastair Scott}, 478 | year = 2007, 479 | booktitle = {Section on Survey Research Methods}, 480 | pages = {3514--3518}, 481 | howpublished = {\url{http://www.asasrms.org/Proceedings/y2007/Files/JSM2007-000874.pdf}} 482 | } 483 | @misc{git-w-R, 484 | title = {Happy {G}it and {G}itHub for the use{R}}, 485 | author = {Jenny Bryan}, 486 | year = {2023}, 487 | howpublished = {\url{https://happygitwithr.com/}} 488 | } 489 | @incollection{mccullagh1989binary, 490 | title = {Binary data}, 491 | author = {McCullagh, Peter and Nelder, John Ashworth}, 492 | booktitle = {Generalized linear models}, 493 | pages = {98--148}, 494 | year = {1989}, 495 | publisher = {Springer} 496 | } 497 | @misc{eia-cdd, 498 | title = {Units and calculators explained: Degree days}, 499 | author = {{U.S. Energy Information Administration}}, 500 | howpublished = {\url{https://www.eia.gov/energyexplained/units-and-calculators/degree-days.php}}, 501 | year = 2023 502 | } 503 | @article{npr-voting-trend, 504 | author = {Barbara Sprunt}, 505 | title = {93 Million And Counting: Americans Are Shattering Early Voting Records}, 506 | journal = {National Public Radio}, 507 | year = 2020, 508 | howpublished = {\url{https://www.npr.org/2020/10/26/927803214/62-million-and-counting-americans-are-breaking-early-voting-records}} 509 | } 510 | @misc{nhis-svy-des, 511 | title = {{National Health Interview Survey, 2022 survey description}}, 512 | author = {{National Center for Health Statistics}}, 513 | year = 2023, 514 | howpublished = {\url{https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NHIS/2022/srvydesc-508.pdf}} 515 | } 516 | @misc{gss-codebook, 517 | title = {{General Social Survey 2016-2020 Panel Codebook}}, 518 | author = {Davern, Michael and Bautista, Rene and Freese, Jeremy and Morgan, Stephen L.
and Smith, Tom W.}, 519 | editor = {NORC, Chicago}, 520 | year = 2021, 521 | howpublished = {\url{https://gss.norc.org/Documents/codebook/2016-2020%20GSS%20Panel%20Codebook%20-%20R1a.pdf}} 522 | } 523 | 524 | @Book{ggplot2wickham, 525 | author = {Hadley Wickham}, 526 | title = {{ggplot2}: Elegant Graphics for Data Analysis}, 527 | publisher = {Springer-Verlag New York}, 528 | year = {2016}, 529 | isbn = {978-3-319-24277-4}, 530 | howpublished = {\url{https://ggplot2.tidyverse.org}} 531 | } 532 | 533 | @Article{gtsummarysjo, 534 | author = {Daniel D. Sjoberg and Karissa Whiting and Michael Curry and Jessica A. Lavery and Joseph Larmarange}, 535 | title = {Reproducible Summary Tables with the {gtsummary} Package}, 536 | journal = {{The R Journal}}, 537 | year = {2021}, 538 | howpublished = {\url{https://doi.org/10.32614/RJ-2021-053}}, 539 | doi = {10.32614/RJ-2021-053}, 540 | volume = {13}, 541 | issue = {1}, 542 | pages = {570-580}, 543 | } 544 | 545 | @Article{targetslandau, 546 | title = {The {targets} {R} package: a dynamic {Make}-like function-oriented pipeline toolkit for reproducibility and high-performance computing}, 547 | author = {William Michael Landau}, 548 | journal = {Journal of Open Source Software}, 549 | year = {2021}, 550 | volume = {6}, 551 | number = {57}, 552 | pages = {2959}, 553 | doi = {10.21105/joss.02959}, 554 | } 555 | 556 | @Article{jsonliteooms, 557 | title = {The {jsonlite} Package: A Practical and Consistent Mapping Between JSON Data and {R} Objects}, 558 | author = {Jeroen Ooms}, 559 | journal = {arXiv:1403.2805 [stat.CO]}, 560 | year = {2014}, 561 | howpublished = {\url{https://arxiv.org/abs/1403.2805}}, 562 | } 563 | 564 | @Article{visdattierney, 565 | title = {{visdat}: Visualising Whole Data Frames}, 566 | author = {Nicholas Tierney}, 567 | doi = {10.21105/joss.00355}, 568 | howpublished = {\url{http://dx.doi.org/10.21105/joss.00355}}, 569 | year = {2017}, 570 | journal = {Journal of Open Source Software}, 571 | volume = {2}, 572 | number = {16}, 573 | pages = {355} 574 | } 575 | 576 | @Book{sf2023man, 577 | author = {Edzer Pebesma and Roger Bivand}, 578 | title = {{Spatial Data Science: With applications in R}}, 579 | year = {2023}, 580 | publisher = {{Chapman \& Hall/CRC}}, 581 | howpublished = {\url{https://r-spatial.org/book/}}, 582 | doi = {10.1201/9780429459016} 583 | } 584 | @Book{rmarkdown2020man, 585 | title = {R Markdown Cookbook}, 586 | author = {Yihui Xie and Christophe Dervieux and Emily Riederer}, 587 | publisher = {Chapman \& Hall/CRC}, 588 | year = {2020}, 589 | isbn = {9780367563837}, 590 | howpublished = {\url{https://bookdown.org/yihui/rmarkdown-cookbook}} 591 | } 592 | @misc{recs-svy, 593 | title = {Residential Energy Consumption Survey (RECS) Form EIA-457A 2020 Household Questionnaire}, 594 | author = {{U.S. 
Energy Information Administration}}, 595 | year = 2020, 596 | howpublished = {\url{https://www.eia.gov/survey/form/eia_457/archive/2020_RECS-457A.pdf}} 597 | } -------------------------------------------------------------------------------- /css/style.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css2?family=Telex&family=Ubuntu:ital,wght@0,300;0,400;0,500;0,700;1,300;1,400;1,500;1,700&display=swap'); 2 | 3 | /* The next two rules make the horizontal line go straight across in top navbar */ 4 | 5 | .summary > li:first-child { 6 | height: 50px; 7 | padding-top: 10px; 8 | border-bottom: 1px solid rgba(0, 0, 0, 0.07); 9 | } 10 | 11 | .book .book-summary ul.summary li.divider { 12 | height: 0px; 13 | } 14 | 15 | /*____Code Chunks____*/ 16 | .sourceCode pre { 17 | margin: 0 !important; /* makes space above and below symmetrical */ 18 | } 19 | 20 | .sourceCode { 21 | margin-bottom: 0.85em; /* adds same amount of margin as a
<p>
    would */ 22 | } 23 | 24 | p.caption { 25 | color: #777; 26 | margin-top: 10px; 27 | } 28 | 29 | p code { 30 | white-space: inherit; 31 | } 32 | 33 | pre { 34 | word-break: normal; 35 | word-wrap: normal; 36 | } 37 | 38 | pre code { 39 | white-space: inherit; 40 | } 41 | 42 | p.flushright { 43 | text-align: right; 44 | } 45 | 46 | blockquote > p:last-child { 47 | text-align: right; 48 | } 49 | 50 | blockquote > p:first-child { 51 | text-align: inherit; 52 | } 53 | 54 | p, div { 55 | font-family: 'Telex', sans-serif; 56 | } 57 | 58 | .title { 59 | font-family: 'Ubuntu', serif; 60 | font-size: 2em !important; 61 | text-transform: uppercase; 62 | color: #0B3954; 63 | } 64 | 65 | .subtitle { 66 | font-family: 'Ubuntu', serif; 67 | text-transform: none; 68 | font-weight: 400; 69 | color: #0B3954; 70 | font-style: normal !important; 71 | } 72 | 73 | h1, h2, h3, h4, h5, h6 { 74 | font-family: 'Ubuntu', sans-serif; 75 | color: #0B3954; 76 | text-transform: uppercase; 77 | font-weight: 700; 78 | } 79 | 80 | ul.ro { 81 | list-style-type: circle; 82 | } 83 | 84 | li.ro::marker { 85 | font-size: 150%; 86 | } 87 | 88 | .prereqbox { 89 | padding: 1em; 90 | background: #C6E0E6; 91 | color: black; 92 | border: 2px solid #087E8B; 93 | border-bottom-left-radius: 10px; 94 | border-bottom-right-radius: 10px; 95 | } 96 | 97 | .prereqbox-header { 98 | padding-left: 1em; 99 | padding-right: 1em; 100 | background: #087E8B; 101 | color: white; 102 | border: 2px solid #087E8B; 103 | border-top-left-radius: 10px; 104 | border-top-right-radius: 10px; 105 | } 106 | 107 | h3.hasAnchor#prereq3 #prereq4 #prereq5 #prereq6 #prereq7 #prereq8 #prereq9 #prereq10 { 108 | margin-top: 0em !important; 109 | margin-bottom: 0em !important; 110 | } 111 | 112 | .summary { 113 | font-family: 'Ubuntu', sans-serif; 114 | } 115 | 116 | .book .book-summary { 117 | background: white; 118 | border-right: none; 119 | } 120 | 121 | .book.with-summary .book-header.fixed { 122 | left: 350px; 123 | } 124 | 125 | .book.with-summary .book-body { 126 | left: 350px; 127 | } 128 | 129 | .book .book-summary { 130 | width: 350px; 131 | } 132 | 133 | /*-----Body Links-------*/ 134 | .book .book-body .page-wrapper .page-inner section.normal a { 135 | color: #419599; 136 | } 137 | 138 | .book .book-body .page-wrapper .page-inner section.normal a:hover { 139 | color: #19757F; /* darker color when hovering */ 140 | text-decoration: none; 141 | } 142 | 143 | /* HERO IMAGE */ 144 | .hero-image-container { 145 | position: absolute; 146 | top: 0; 147 | left: 0; 148 | right: 0; 149 | height: 310px; 150 | background-color: #0b3954; 151 | } 152 | 153 | .hero-image { 154 | width: 100%; 155 | height: 310px; 156 | object-fit: cover; 157 | } 158 | 159 | .page-inner { 160 | padding-top: 400px !important; 161 | } 162 | 163 | .book .book-summary ul.summary li a, 164 | .book .book-summary ul.summary li span { 165 | padding-top: 8px; 166 | padding-bottom: 8px; 167 | padding-left: 15px; 168 | padding-right: 15px; 169 | color: #0b3954; 170 | } 171 | 172 | .summary a:hover { 173 | color: #19757F !important; 174 | } 175 | 176 | .book .book-summary ul.summary li.active>a { /*active TOC links*/ 177 | color: #8d5b94 !important; 178 | border-left: solid 4px; 179 | border-color: #0b3954; 180 | padding-left: 11px !important; 181 | } 182 | 183 | li.appendix span, li.part span { /* for TOC part names */ 184 | margin-top: 1em; 185 | color: #1c3046 !important; 186 | opacity: 1 !important; 187 | text-transform: uppercase; 188 | } 189 | 190 | /*--- LOGO ---*/ 191 | .toc-logo { 192 | 
width: 150px !important; 193 | object-fit: contain; 194 | margin: 0 auto; 195 | display: block; 196 | } 197 | 198 | .toc-logo img { 199 | max-width: 100%; 200 | margin-bottom: -10px; 201 | } 202 | 203 | .summary > li:first-child { 204 | height: auto !important; 205 | } 206 | 207 | em { 208 | font-style: normal !important; 209 | } 210 | 211 | .prereqbox-header h3 { 212 | color: white !important; 213 | } 214 | 215 | -------------------------------------------------------------------------------- /images/PetExample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/PetExample1.png -------------------------------------------------------------------------------- /images/PetExample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/PetExample2.png -------------------------------------------------------------------------------- /images/codebook-example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/codebook-example.jpg -------------------------------------------------------------------------------- /images/codebook-ncvs-weapon-handgun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/codebook-ncvs-weapon-handgun.jpg -------------------------------------------------------------------------------- /images/codebook-ncvs-weapon-li.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/codebook-ncvs-weapon-li.jpg -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/cover.png -------------------------------------------------------------------------------- /images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/favicon.png -------------------------------------------------------------------------------- /images/header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/header.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/logo.png -------------------------------------------------------------------------------- /images/questionnaire-example-2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/questionnaire-example-2.jpg -------------------------------------------------------------------------------- /images/questionnaire-example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/questionnaire-example.jpg -------------------------------------------------------------------------------- /images/questionnaire-ncvs-weapon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidy-survey-r/tidy-survey-book/006de5409c0b56bd6d930bfb9a8d05e2eb7be165/images/questionnaire-ncvs-weapon.jpg -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploring Complex Survey Data Analysis Using R" 3 | subtitle: "A Tidy Introduction with {srvyr} and {survey}" 4 | author: "Stephanie A. Zimmer, Rebecca J. Powell, and Isabella C. Velásquez" 5 | date: "`r Sys.Date()`" 6 | documentclass: krantz 7 | bibliography: [book.bib, packages.bib] 8 | biblio-style: apalike 9 | link-citations: yes 10 | colorlinks: true 11 | lot: true 12 | lof: true 13 | site: bookdown::bookdown_site 14 | description: "Exploring Complex Survey Data Analysis Using R: A Tidy Introduction with {srvyr} and {survey}" 15 | github-repo: tidy-survey-r/tidy-survey-book 16 | graphics: yes 17 | favicon: images/favicon.png 18 | cover-image: images/cover.png 19 | header-includes: 20 | - \usepackage[titles]{tocloft} 21 | --- 22 | 23 | 24 | ```{r setup} 25 | #| include: false 26 | 27 | knitr::opts_chunk$set(fig.pos = "h!", out.extra = "") 28 | 29 | library(styler) 30 | options( 31 | htmltools.dir.version = FALSE, formatR.indent = 2, digits = 4 32 | ) 33 | if (knitr:::is_html_output()){ 34 | options(width=72) 35 | } else{ 36 | options(width=72) 37 | } 38 | 39 | library(prettyunits) 40 | 41 | book_colors <- c("#0b3954", "#087e8b", "#bfd7ea", "#ff8484", "#8d6b94") 42 | 43 | as_latex_with_caption <- function(gtobj, chunk_label) { 44 | lt <- nrow(gtobj[["_data"]]) >= 5 45 | gt_l <- gtobj %>% tab_options(latex.use_longtable=lt, latex.tbl.pos="H") %>% gt::as_latex() 46 | caption <- paste0( 47 | "\\caption{\\label{tab:", chunk_label, "}(ref:", chunk_label, ")}") 48 | if (lt){ 49 | caption <- paste0(caption, " \\\\") 50 | } 51 | latex <- strsplit(gt_l[1], split = "\n")[[1]] 52 | idxtable <- which(stringr::str_detect(latex, "begin") & stringr::str_detect(latex, "table")) 53 | # https://tex.stackexchange.com/questions/95236/as-first-character-in-table-row 54 | idxparen <- which(stringr::str_detect(latex, "^\\(")) 55 | if (length(idxparen)>0){ 56 | latex[(idxparen-1)] <- stringr::str_c(latex[(idxparen-1)], "\\relax") 57 | } 58 | latex1 <- stringi::stri_replace_all(latex, regex="(?=\\d*)-{1,2}(\\d)", replacement="--$1") 59 | latex2 <- c(latex1[1:idxtable], caption, latex1[-c(1:idxtable)]) 60 | latex3 <- paste(latex2, collapse = "\n") 61 | gt_l[1] <- latex3 62 | return(gt_l) 63 | } 64 | 65 | print_gt_book <- function(gtobj, ref){ 66 | if ("gtsummary" %in% class(gtobj)){ 67 | gtobj <- as_gt(gtobj) 68 | } 69 | 70 | if (knitr::is_latex_output()){ 71 | gtobj %>% 72 | as_latex_with_caption(ref) 73 | } else { 74 | gtobj %>% 75 | tab_caption(glue::glue("(ref:{ref})")) 76 | } 77 | 78 | 79 | } 80 | 81 | ``` 82 | 
83 | `r if (knitr:::is_html_output()) '# Welcome {-}'` 84 | 85 | ```{r} 86 | #| label: index-printversion-text 87 | #| echo: false 88 | #| results: asis 89 | 90 | printversion_text <- "This is the online version of the book published by CRC Press in November 2024. You can purchase a copy of this book directly from [Routledge](https://www.routledge.com/Exploring-Complex-Survey-Data-Analysis-Using-R-A-Tidy-Introduction-with-srvyr-and-survey/Zimmer-Powell-Velasquez/p/book/9781032302867) or your preferred bookstore. The cover artwork was designed and created by [Allison Horst](https://allisonhorst.com/)." 91 | 92 | if (knitr:::is_html_output()){ 93 | cat(printversion_text) 94 | } 95 | 96 | rm(printversion_text) 97 | ``` 98 | 99 | 100 | ```{r} 101 | #| label: index-printversion-coverimage 102 | #| echo: false 103 | #| fig.cap: "" 104 | #| fig.alt: "Image of print book cover with author names, title, and cover image" 105 | #| out.width: 70% 106 | #| fig.align: center 107 | 108 | if (knitr:::is_html_output()){ 109 | knitr::include_graphics(path="images/cover.png") 110 | } 111 | ``` 112 | 113 | `r if (knitr:::is_html_output()) '## Dedication {-}'` 114 | 115 | ```{r} 116 | #| label: index-dedication-text 117 | #| echo: false 118 | #| results: asis 119 | 120 | thanks <- "To Will, Tom, and Drew, thanks for all the help with additional chores and plenty of Git consulting!" 121 | 122 | if (knitr:::is_html_output()){ 123 | cat(thanks) 124 | } else if(knitr:::is_latex_output()){ 125 | bb <- readLines(here::here("latex", "before_body_temp.tex")) 126 | bb[which(bb=="placeholder")] <- thanks 127 | writeLines(bb, here::here("latex", "before_body_ded.tex")) 128 | rm(bb) 129 | } 130 | 131 | rm(thanks) 132 | ``` 133 | 134 | `r if (knitr:::is_html_output()) '## Citation {-}'` 135 | 136 | ```{r} 137 | #| label: index-citation-text 138 | #| echo: false 139 | #| results: asis 140 | 141 | citation_text <- "To cite this book, we recommend the following citation: \n 142 | Zimmer, S. A., Powell, R. J., & Velásquez, I. C. (2024). Exploring Complex Survey Data Analysis Using R: A Tidy Introduction with {srvyr} and {survey}. Chapman & Hall: CRC Press." 
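# Like the print-version note earlier in this file, the citation text is
# emitted for the HTML build only; there is no LaTeX branch here, and the
# temporary variable is removed after use.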
143 | 144 | if (knitr:::is_html_output()){ 145 | cat(citation_text) 146 | } 147 | 148 | rm(citation_text) 149 | ``` 150 | -------------------------------------------------------------------------------- /latex/after_body.tex: -------------------------------------------------------------------------------- 1 | \backmatter 2 | \printindex 3 | -------------------------------------------------------------------------------- /latex/before_body_temp.tex: -------------------------------------------------------------------------------- 1 | % you may need to leave a few empty pages before the dedication page 2 | 3 | %\cleardoublepage\newpage\thispagestyle{empty}\null 4 | %\cleardoublepage\newpage\thispagestyle{empty}\null 5 | %\cleardoublepage\newpage 6 | \thispagestyle{empty} 7 | 8 | \begin{center} 9 | placeholder 10 | \end{center} 11 | 12 | \setlength{\abovedisplayskip}{-5pt} 13 | \setlength{\abovedisplayshortskip}{-5pt} 14 | -------------------------------------------------------------------------------- /latex/preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{booktabs} 2 | \usepackage{longtable} 3 | \usepackage[bf,singlelinecheck=off]{caption} 4 | \captionsetup[table]{labelsep=space} 5 | \captionsetup[figure]{labelsep=space} 6 | \usepackage[scale=.8]{sourcecodepro} 7 | 8 | \usepackage{framed,color} 9 | \definecolor{shadecolor}{RGB}{248,248,248} 10 | 11 | \renewcommand{\textfraction}{0.05} 12 | \renewcommand{\topfraction}{0.8} 13 | \renewcommand{\bottomfraction}{0.8} 14 | \renewcommand{\floatpagefraction}{0.75} 15 | 16 | \renewenvironment{quote}{\begin{VF}}{\end{VF}} 17 | \usepackage{hyperref} 18 | \let\oldhref\href 19 | \renewcommand{\href}[2]{#2\footnote{\url{#1}}} 20 | 21 | \makeatletter 22 | \newenvironment{kframe}{% 23 | \medskip{} 24 | \setlength{\fboxsep}{.8em} 25 | \def\at@end@of@kframe{}% 26 | \ifinner\ifhmode% 27 | \def\at@end@of@kframe{\end{minipage}}% 28 | \begin{minipage}{\columnwidth}% 29 | \fi\fi% 30 | \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep 31 | \colorbox{shadecolor}{##1}\hskip-\fboxsep 32 | % There is no \\@totalrightmargin, so: 33 | \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% 34 | \MakeFramed {\advance\hsize-\width 35 | \@totalleftmargin\z@ \linewidth\hsize 36 | \@setminipage}}% 37 | {\par\unskip\endMakeFramed% 38 | \at@end@of@kframe} 39 | \makeatother 40 | 41 | \makeatletter 42 | \@ifundefined{Shaded}{ 43 | }{\renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}}} 44 | \makeatother 45 | 46 | \usepackage{makeidx} 47 | \makeindex 48 | 49 | \urlstyle{tt} 50 | 51 | \usepackage{amsthm} 52 | \makeatletter 53 | \def\thm@space@setup{% 54 | \thm@preskip=8pt plus 2pt minus 4pt 55 | \thm@postskip=\thm@preskip 56 | } 57 | \makeatother 58 | 59 | \usepackage{tcolorbox} 60 | \newtcolorbox{prereqbox}[1]{ 61 | colback=white, 62 | colframe=gray!75!black, 63 | fonttitle=\bfseries, 64 | title={#1} 65 | } 66 | 67 | \frontmatter 68 | -------------------------------------------------------------------------------- /plausible.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | sandbox/ 2 | library/ 3 | local/ 4 | cellar/ 5 | lock/ 6 | python/ 7 | staging/ -------------------------------------------------------------------------------- /renv/settings.dcf: 
-------------------------------------------------------------------------------- 1 | bioconductor.version: 2 | external.libraries: 3 | ignored.packages: 4 | package.dependency.fields: Imports, Depends, LinkingTo 5 | r.version: 6 | snapshot.type: implicit 7 | use.cache: TRUE 8 | vcs.ignore.cellar: TRUE 9 | vcs.ignore.library: TRUE 10 | vcs.ignore.local: TRUE 11 | -------------------------------------------------------------------------------- /renv/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "bioconductor.version": null, 3 | "external.libraries": [], 4 | "ignored.packages": [], 5 | "package.dependency.fields": [ 6 | "Imports", 7 | "Depends", 8 | "LinkingTo" 9 | ], 10 | "r.version": null, 11 | "snapshot.type": "implicit", 12 | "use.cache": true, 13 | "vcs.ignore.cellar": true, 14 | "vcs.ignore.library": true, 15 | "vcs.ignore.local": true, 16 | "vcs.manage.ignores": true 17 | } 18 | -------------------------------------------------------------------------------- /tidy-survey-book.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: d4da462a-2af3-4992-9c46-d0495d3b7477 3 | 4 | RestoreWorkspace: No 5 | SaveWorkspace: No 6 | AlwaysSaveHistory: No 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: knitr 14 | LaTeX: pdfLaTeX 15 | 16 | BuildType: Website 17 | --------------------------------------------------------------------------------
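The renv configuration above, together with `renv.lock`, pins the package library used to build the book. A minimal sketch of reproducing the build from a fresh clone (an assumption-laden illustration: it presumes a local R installation, that renv activates on project startup as standard renv projects do, and that any chapter-specific credentials or raw data files are available):

```r
# Restore the pinned package versions recorded in renv.lock,
# then render the bookdown site (bookdown's default output is _book/).
install.packages("renv")             # skip if renv is already installed
renv::restore()
bookdown::render_book("index.Rmd")
```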