├── .gitignore
├── Data
    ├── anes.rds
    └── recs.rds
├── DataCleaningScripts
    ├── ANES_DataPrep.Rmd
    ├── ANES_DataPrep.md
    ├── RECS_DataPrep.Rmd
    └── RECS_DataPrep.md
├── Exercises
    ├── CategorialExercises.R
    ├── CategorialExercises.Rmd
    ├── CategorialExercises_solutions.R
    ├── CategorialExercises_solutions.Rmd
    ├── CategorialExercises_solutions.html
    ├── ContinuousExercises.R
    ├── ContinuousExercises.Rmd
    ├── ContinuousExercises_solutions.R
    ├── ContinuousExercises_solutions.Rmd
    ├── ContinuousExercises_solutions.html
    ├── WarmUpExercises.R
    ├── WarmUpExercises.Rmd
    ├── WarmUpExercises_solutions.R
    ├── WarmUpExercises_solutions.Rmd
    └── WarmUpExercises_solutions.html
├── FinalizeMaterials.R
├── LICENSE
├── Presentation
    ├── Slides.R
    ├── Slides.Rmd
    ├── Slides.html
    ├── Slides.pdf
    ├── Slides.pptx
    ├── Slides_files
    │   └── figure-html
    │   │   └── plot_sf_elbill_disp-1.png
    └── xaringan-themer.css
├── README.md
├── RawData
    ├── ANES_2016
    │   ├── anes_timeseries_2016.sav
    │   ├── anes_timeseries_2016_qnaire_post.pdf
    │   ├── anes_timeseries_2016_qnaire_pre.pdf
    │   └── anes_timeseries_2016_userguidecodebook.pdf
    └── RECS_2015
    │   ├── 2020_RECS-457A.pdf
    │   ├── README.md
    │   ├── codebook_publicv4.xlsx
    │   ├── microdata_v3.pdf
    │   └── recs2015_public_v4.csv
└── tidy-survey-short-course.Rproj


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | debug.log


--------------------------------------------------------------------------------
/Data/anes.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/Data/anes.rds


--------------------------------------------------------------------------------
/Data/recs.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/Data/recs.rds


--------------------------------------------------------------------------------
/DataCleaningScripts/ANES_DataPrep.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "American National Election Studies (ANES) 2016 Time Series Study Data Prep"
  3 | output: github_document
  4 | ---
  5 | 
  6 | ```{r setup, include=FALSE}
  7 | knitr::opts_chunk$set(echo = TRUE)
  8 | ```
  9 | 
 10 | ## Data information
 11 | 
 12 | All data and resources were downloaded from https://electionstudies.org/data-center/2016-time-series-study/ on April 3, 2021.
 13 | 
 14 | American National Election Studies. 2019. ANES 2016 Time Series Study [dataset and documentation]. September 4, 2019 version. www.electionstudies.org
 15 | ```{r loadpackageh, message=FALSE}
 16 | library(here) #easy relative paths
 17 | ```
 18 | 
 19 | 
 20 | 
 21 | ```{r loadpackages}
 22 | library(tidyverse) #data manipulation
 23 | library(haven) #data import
 24 | library(tidylog) #informative logging messages
 25 | ```
 26 | ## Import data and create derived variables
 27 | 
 28 | ```{r derivedata}
 29 | anes_in <- read_sav(here("RawData", "ANES_2016", "anes_timeseries_2016.sav")) # raw ANES 2016 SPSS file
 30 | 
 31 | # Keep the raw items needed below, then derive labelled analysis variables from them.
 32 | anes <- anes_in %>%
 33 |    select("V160102", "V160201", "V160202", "V160501", "V161004", "V161005", "V161006", "V161024x", "V161158x", "V161215", "V161219", "V161267", "V161270", "V161310x", "V161342", "V161361x", "V162031", "V162031x", "V162034", "V162034a", "V162062x"
 34 |    ) %>%
 35 |    mutate(
 36 |       InterviewMode=fct_recode(as.character(V160501), FTF="1", Web="2"),
 37 |       Weight=V160102,
 38 |       Stratum=as.factor(V160201),
 39 |       VarUnit=as.factor(V160202),
 40 |       Age=if_else(V161267>0, as.numeric(V161267), NA_real_), # non-positive codes (refused / don't know) become NA
 41 |       AgeGroup=cut(Age, c(17, 29, 39, 49, 59, 69, 200), # right-closed intervals: (17,29] is 18-29, ..., (69,200] is 70 or older
 42 |                    labels=c("18-29", "30-39", "40-49", "50-59", "60-69", "70 or older")),
 43 |       Gender=factor(
 44 |          case_when(
 45 |             V161342==1~"Male",
 46 |             V161342==2~"Female",
 47 |             V161342==3~"Other",
 48 |             TRUE~NA_character_ # any other code becomes NA (same pattern in every recode below)
 49 |          ),
 50 |          levels=c("Male", "Female", "Other")
 51 |       ),
 52 |       RaceEth=factor(
 53 |          case_when(
 54 |             V161310x==1~"White",
 55 |             V161310x==2~"Black",
 56 |             V161310x==5~"Hispanic",
 57 |             V161310x==3~"Asian, NH/PI",
 58 |             near(V161310x, 4)~"AI/AN",
 59 |             near(V161310x, 6)~"Other/multiple race",
 60 |             TRUE ~ NA_character_
 61 |          ),
 62 |          levels=c("White", "Black", "Hispanic", "Asian, NH/PI", "AI/AN", "Other/multiple race") # unmatched codes stay NA rather than becoming a level
 63 |       ),
 64 |       PartyID=factor(
 65 |          case_when(
 66 |             V161158x==1~"Strong democrat",
 67 |             V161158x==2~"Not very strong democrat",
 68 |             V161158x==3~"Independent-democrat",
 69 |             V161158x==4~"Independent",
 70 |             V161158x==5~"Independent-republican",
 71 |             V161158x==6~"Not very strong republican",
 72 |             V161158x==7~"Strong republican",
 73 |             TRUE ~ NA_character_
 74 |          ),
 75 |          levels=c("Strong democrat", "Not very strong democrat", "Independent-democrat", "Independent", "Independent-republican", "Not very strong republican", "Strong republican")
 76 |       ),
 77 |       Education=factor(
 78 |          case_when(
 79 |             V161270 <=0~NA_character_, # missing codes first; the <= tests below rely on this ordering
 80 |             V161270 <= 8~"Less than HS",
 81 |             V161270==9|V161270==90~"High school", # code 90 is grouped with High school
 82 |             V161270<=12~"Post HS",
 83 |             V161270==13~"Bachelor's",
 84 |             V161270<=16~"Graduate",
 85 |             TRUE~NA_character_ # remaining codes (above 16, other than 90) become NA
 86 |          ),
 87 |          levels=c("Less than HS", "High school", "Post HS", "Bachelor's", "Graduate")
 88 |       ),
 89 |       Income=cut(V161361x, c(-5, 1:28), # intervals (-5,1], (1,2], ..., (27,28]: codes 1-28 take the labels in order; codes of -5 or below become NA
 90 |                  labels=c("Under $5k", # NOTE(review): "$12.5-15" below lacks the "k" suffix of the other labels; left unchanged because it is stored in the saved data
 91 |                           "$5-10k", "$10-12.5k", "$12.5-15", "$15-17.5k", "$17.5-20k", "$20-22.5k", "$22.5-25k", "$25-27.5k", "$27.5-30k", "$30-35k", "$35-40k", "$40-45k", "$45-50k", "$50-55k", "$55-60k", "$60-65k","$65-70k", "$70-75k", "$75-80k", "$80-90k", "$90-100k","$100-110k", "$110-125k", "$125-150k", "$150-175k", "$175-250k", "$250k or more"  )
 92 |       ), 
 93 |       Income7=fct_collapse( # collapse the 28 Income brackets into 7 broader ones
 94 |          Income,
 95 |          "Under $20k"=c("Under $5k", "$5-10k", "$10-12.5k", "$12.5-15", "$15-17.5k", "$17.5-20k"),
 96 |          "$20-40k"=c("$20-22.5k", "$22.5-25k", "$25-27.5k", "$27.5-30k", "$30-35k", "$35-40k"),
 97 |          "$40-60k"=c( "$40-45k", "$45-50k", "$50-55k", "$55-60k"),
 98 |          "$60-80k"=c( "$60-65k", "$65-70k", "$70-75k", "$75-80k"),
 99 |          "$80-100k"=c("$80-90k", "$90-100k"),
100 |          "$100-125k"=c("$100-110k", "$110-125k"),
101 |          "$125k or more"=c("$125-150k", "$150-175k", "$175-250k", "$250k or more")
102 |       ),
103 |       CampaignInterest=factor(
104 |          case_when(
105 |             V161004==1~"Very much interested",
106 |             V161004==2~"Somewhat interested",
107 |             V161004==3~"Not much interested",
108 |             TRUE~NA_character_
109 |          ),
110 |          levels=c("Very much interested", "Somewhat interested", "Not much interested")
111 |       ),
112 |       TrustGovernment=factor(
113 |          case_when(
114 |             V161215==1~"Always",
115 |             V161215==2~"Most of the time",
116 |             V161215==3~"About half the time",
117 |             V161215==4~"Some of the time",
118 |             V161215==5~"Never",
119 |             TRUE~NA_character_
120 |          ),
121 |          levels=c("Always", "Most of the time", "About half the time", "Some of the time", "Never")
122 |       ),
123 |       TrustPeople=factor(
124 |          case_when(
125 |             V161219==1~"Always",
126 |             V161219==2~"Most of the time",
127 |             V161219==3~"About half the time",
128 |             V161219==4~"Some of the time",
129 |             V161219==5~"Never",
130 |             TRUE ~ NA_character_
131 |          ),
132 |          levels=c("Always", "Most of the time", "About half the time", "Some of the time", "Never")
133 |       ),
134 |       VotedPres2012=factor(
135 |          case_when(
136 |             V161005==1~"Yes",
137 |             V161005==2~"No",
138 |             TRUE~NA_character_
139 |          ), levels=c("Yes", "No")
140 |       ),
141 |       VotedPres2012_selection=factor(
142 |          case_when(
143 |             V161006==1~"Obama",
144 |             V161006==2~"Romney",
145 |             V161006==5~"Other",
146 |             TRUE~NA_character_
147 |          ), levels=c("Obama", "Romney", "Other")
148 |       ),
149 |       VotedPres2016=factor(
150 |          case_when(
151 |             V162031x==1~"Yes",
152 |             V162031x==0~"No",
153 |             TRUE~NA_character_
154 |          ), levels=c("Yes", "No")
155 |       ),
156 |       VotedPres2016_selection=factor(
157 |          case_when(
158 |             V162062x==1~"Clinton",
159 |             V162062x==2~"Trump",
160 |             V162062x >=3 ~"Other", # every code of 3 or above is grouped as "Other"
161 |             TRUE~NA_character_
162 |          ), levels=c("Clinton", "Trump", "Other")
163 |       ),
164 |       EarlyVote2016=factor(
165 |          case_when(
166 |             V161024x==4~"Yes", # code 4 of V161024x is treated as having voted early
167 |             VotedPres2016=="Yes"~"No", # otherwise "No" for anyone recorded as having voted (VotedPres2016, derived above)
168 |             TRUE~NA_character_
169 |          ), levels=c("Yes", "No")
170 |       )
171 |    )
172 | 
173 | 
174 | # Overview of every raw and derived column
175 | summary(anes)
176 | ```
177 | 
178 | 
179 | ## Check derived variables for correct coding
180 | 
181 | ```{r checkvars}
182 | 
183 | count(anes, InterviewMode, V160501) # cross-tabulate each derived variable against its raw source item
184 | summarise(group_by(anes, AgeGroup), minAge=min(Age), maxAge=max(Age), minV=min(V161267), maxV=max(V161267))
185 | count(anes, Gender, V161342)
186 | count(anes, RaceEth, V161310x)
187 | count(anes, PartyID, V161158x)
188 | count(anes, Education, V161270)
189 | print(count(anes, Income, Income7, V161361x), n=30)
190 | count(anes, CampaignInterest, V161004)
191 | count(anes, TrustGovernment, V161215)
192 | count(anes, TrustPeople, V161219)
193 | count(anes, VotedPres2012, V161005)
194 | count(anes, VotedPres2012_selection, V161006)
195 | count(anes, VotedPres2016, V162031x)
196 | count(anes, VotedPres2016_selection, V162062x)
197 | count(anes, EarlyVote2016, V161024x, VotedPres2016)
198 | # Sum of the Weight column over all rows
199 | pull(
200 |    summarise(anes, WtSum=sum(Weight)),
201 |    WtSum)
202 | 
203 | ```
204 | ## Save data
205 | 
206 | ```{r savedat}
207 | readr::write_rds(anes, here::here("Data", "anes.rds"), compress = "gz") # write the derived data set to Data/anes.rds
208 | ```
209 | 
210 | 
211 | 


--------------------------------------------------------------------------------
/DataCleaningScripts/ANES_DataPrep.md:
--------------------------------------------------------------------------------
  1 | American National Election Studies (ANES) 2016 Time Series Study Data
  2 | Prep
  3 | ================
  4 | 
  5 | ## Data information
  6 | 
  7 | All data and resources were downloaded from
  8 | <https://electionstudies.org/data-center/2016-time-series-study/> on
  9 | April 3, 2021.
 10 | 
 11 | American National Election Studies. 2019. ANES 2016 Time Series Study
 12 | \[dataset and documentation\]. September 4, 2019 version.
 13 | www.electionstudies.org
 14 | 
 15 | ``` r
 16 | library(here) #easy relative paths
 17 | ```
 18 | 
 19 | ``` r
 20 | library(tidyverse) #data manipulation
 21 | ```
 22 | 
 23 |     ## -- Attaching packages ----------------------------- tidyverse 1.3.0 --
 24 | 
 25 |     ## v ggplot2 3.3.3     v purrr   0.3.4
 26 |     ## v tibble  3.1.0     v dplyr   1.0.5
 27 |     ## v tidyr   1.1.3     v stringr 1.4.0
 28 |     ## v readr   1.4.0     v forcats 0.5.1
 29 | 
 30 |     ## -- Conflicts -------------------------------- tidyverse_conflicts() --
 31 |     ## x dplyr::filter() masks stats::filter()
 32 |     ## x dplyr::lag()    masks stats::lag()
 33 | 
 34 | ``` r
 35 | library(haven) #data import
 36 | library(tidylog) #informative logging messages
 37 | ```
 38 | 
 39 |     ## 
 40 |     ## Attaching package: 'tidylog'
 41 | 
 42 |     ## The following objects are masked from 'package:dplyr':
 43 |     ## 
 44 |     ##     add_count, add_tally, anti_join, count, distinct, distinct_all,
 45 |     ##     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
 46 |     ##     full_join, group_by, group_by_all, group_by_at, group_by_if,
 47 |     ##     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
 48 |     ##     relocate, rename, rename_all, rename_at, rename_if, rename_with,
 49 |     ##     right_join, sample_frac, sample_n, select, select_all, select_at,
 50 |     ##     select_if, semi_join, slice, slice_head, slice_max, slice_min,
 51 |     ##     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
 52 |     ##     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
 53 |     ##     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
 54 |     ##     transmute_if, ungroup
 55 | 
 56 |     ## The following objects are masked from 'package:tidyr':
 57 |     ## 
 58 |     ##     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
 59 |     ##     spread, uncount
 60 | 
 61 |     ## The following object is masked from 'package:stats':
 62 |     ## 
 63 |     ##     filter
 64 | 
 65 | ## Import data and create derived variables
 66 | 
 67 | ``` r
 68 | anes_in <- read_sav(here("RawData", "ANES_2016", "anes_timeseries_2016.sav"))
 69 | 
 70 | 
 71 | anes <- anes_in %>%
 72 |    select('V160102',    'V160201',  'V160202',  'V160501',  'V161004',  'V161005',  'V161006',  'V161024x', 'V161158x', 'V161215',  'V161219',  'V161267',  'V161267',  'V161270',  'V161310x', 'V161342',  'V161361x', 'V162031',  'V162031x', 'V162034',  'V162034a', 'V162062x', 'V162062x'
 73 |    ) %>%
 74 |    mutate(
 75 |       InterviewMode=fct_recode(as.character(V160501), FTF="1", Web="2"),
 76 |       Weight=V160102,
 77 |       Stratum=as.factor(V160201),
 78 |       VarUnit=as.factor(V160202),
 79 |       Age=if_else(V161267>0, as.numeric(V161267), NA_real_),
 80 |       AgeGroup=cut(Age, c(17, 29, 39, 49, 59, 69, 200),
 81 |                    labels=c("18-29", "30-39", "40-49", "50-59", "60-69", "70 or older")),
 82 |       Gender=factor(
 83 |          case_when(
 84 |             V161342==1~"Male",
 85 |             V161342==2~"Female",
 86 |             V161342==3~"Other",
 87 |             TRUE~NA_character_
 88 |          ),
 89 |          levels=c("Male", "Female", "Other")
 90 |       ),
 91 |       RaceEth=factor(
 92 |          case_when(
 93 |             V161310x==1~"White",
 94 |             V161310x==2~"Black",
 95 |             V161310x==5~"Hispanic",
 96 |             V161310x==3~"Asian, NH/PI",
 97 |             near(V161310x, 4)~"AI/AN",
 98 |             near(V161310x, 6)~"Other/multiple race",
 99 |             TRUE ~ NA_character_
100 |          ),
101 |          levels=c("White", "Black", "Hispanic", "Asian, NH/PI", "AI/AN", "Other/multiple race", NA_character_)
102 |       ),
103 |       PartyID=factor(
104 |          case_when(
105 |             V161158x==1~"Strong democrat",
106 |             V161158x==2~"Not very strong democrat",
107 |             V161158x==3~"Independent-democrat",
108 |             V161158x==4~"Independent",
109 |             V161158x==5~"Independent-republican",
110 |             V161158x==6~"Not very strong republican",
111 |             V161158x==7~"Strong republican",
112 |             TRUE ~ NA_character_
113 |          ),
114 |          levels=c("Strong democrat", "Not very strong democrat", "Independent-democrat", "Independent", "Independent-republican", "Not very strong republican", "Strong republican")
115 |       ),
116 |       Education=factor(
117 |          case_when(
118 |             V161270 <=0~NA_character_,
119 |             V161270 <= 8~"Less than HS",
120 |             V161270==9|V161270==90~"High school",
121 |             V161270<=12~"Post HS",
122 |             V161270==13~"Bachelor's",
123 |             V161270<=16~"Graduate",
124 |             TRUE~NA_character_
125 |          ),
126 |          levels=c("Less than HS", "High school", "Post HS", "Bachelor's", "Graduate")
127 |       ),
128 |       Income=cut(V161361x, c(-5, 1:28),
129 |                  labels=c("Under $5k", 
130 |                           "$5-10k", "$10-12.5k", "$12.5-15", "$15-17.5k", "$17.5-20k", "$20-22.5k", "$22.5-25k", "$25-27.5k", "$27.5-30k", "$30-35k", "$35-40k", "$40-45k", "$45-50k", "$50-55k", "$55-60k", "$60-65k","$65-70k", "$70-75k", "$75-80k", "$80-90k", "$90-100k","$100-110k", "$110-125k", "$125-150k", "$150-175k", "$175-250k", "$250k or more"  )
131 |       ), 
132 |       Income7=fct_collapse(
133 |          Income,
134 |          "Under $20k"=c("Under $5k", "$5-10k", "$10-12.5k", "$12.5-15", "$15-17.5k", "$17.5-20k"),
135 |          "$20-40k"=c("$20-22.5k", "$22.5-25k", "$25-27.5k", "$27.5-30k", "$30-35k", "$35-40k"),
136 |          "$40-60k"=c( "$40-45k", "$45-50k", "$50-55k", "$55-60k"),
137 |          "$60-80k"=c( "$60-65k", "$65-70k", "$70-75k", "$75-80k"),
138 |          "$80-100k"=c("$80-90k", "$90-100k"),
139 |          "$100-125k"=c("$100-110k", "$110-125k"),
140 |          "$125k or more"=c("$125-150k", "$150-175k", "$175-250k", "$250k or more")
141 |       ),
142 |       CampaignInterest=factor(
143 |          case_when(
144 |             V161004==1~"Very much interested",
145 |             V161004==2~"Somewhat interested",
146 |             V161004==3~"Not much interested",
147 |             TRUE~NA_character_
148 |          ),
149 |          levels=c("Very much interested", "Somewhat interested", "Not much interested")
150 |       ),
151 |       TrustGovernment=factor(
152 |          case_when(
153 |             V161215==1~"Always",
154 |             V161215==2~"Most of the time",
155 |             V161215==3~"About half the time",
156 |             V161215==4~"Some of the time",
157 |             V161215==5~"Never",
158 |             TRUE~NA_character_
159 |          ),
160 |          levels=c("Always", "Most of the time", "About half the time", "Some of the time", "Never")
161 |       ),
162 |       TrustPeople=factor(
163 |          case_when(
164 |             V161219==1~"Always",
165 |             V161219==2~"Most of the time",
166 |             V161219==3~"About half the time",
167 |             V161219==4~"Some of the time",
168 |             V161219==5~"Never",
169 |             TRUE ~ NA_character_
170 |          ),
171 |          levels=c("Always", "Most of the time", "About half the time", "Some of the time", "Never")
172 |       ),
173 |       VotedPres2012=factor(
174 |          case_when(
175 |             V161005==1~"Yes",
176 |             V161005==2~"No",
177 |             TRUE~NA_character_
178 |          ), levels=c("Yes", "No")
179 |       ),
180 |       VotedPres2012_selection=factor(
181 |          case_when(
182 |             V161006==1~"Obama",
183 |             V161006==2~"Romney",
184 |             V161006==5~"Other",
185 |             TRUE~NA_character_
186 |          ), levels=c("Obama", "Romney", "Other")
187 |       ),
188 |       VotedPres2016=factor(
189 |          case_when(
190 |             V162031x==1~"Yes",
191 |             V162031x==0~"No",
192 |             TRUE~NA_character_
193 |          ), levels=c("Yes", "No")
194 |       ),
195 |       VotedPres2016_selection=factor(
196 |          case_when(
197 |             V162062x==1~"Clinton",
198 |             V162062x==2~"Trump",
199 |             V162062x >=3 ~"Other",
200 |             TRUE~NA_character_
201 |          ), levels=c("Clinton", "Trump", "Other")
202 |       ),
203 |       EarlyVote2016=factor(
204 |          case_when(
205 |             V161024x==4~"Yes",
206 |             VotedPres2016=="Yes"~"No",
207 |             TRUE~NA_character_
208 |          ), levels=c("Yes", "No")
209 |       )
210 |    )
211 | ```
212 | 
213 |     ## select: dropped 1,821 variables (version, V160001, V160001_orig, V160101, V160101f, …)
214 | 
215 |     ## mutate: new variable 'InterviewMode' (factor) with 2 unique values and 0% NA
216 | 
217 |     ##         new variable 'Weight' (double) with 2,609 unique values and 0% NA
218 | 
219 |     ##         new variable 'Stratum' (factor) with 132 unique values and 0% NA
220 | 
221 |     ##         new variable 'VarUnit' (factor) with 3 unique values and 0% NA
222 | 
223 |     ##         new variable 'Age' (double) with 74 unique values and 3% NA
224 | 
225 |     ##         new variable 'AgeGroup' (factor) with 7 unique values and 3% NA
226 | 
227 |     ##         new variable 'Gender' (factor) with 4 unique values and 1% NA
228 | 
229 |     ##         new variable 'RaceEth' (factor) with 7 unique values and 1% NA
230 | 
231 |     ##         new variable 'PartyID' (factor) with 8 unique values and 1% NA
232 | 
233 |     ##         new variable 'Education' (factor) with 6 unique values and 1% NA
234 | 
235 |     ##         new variable 'Income' (factor) with 29 unique values and 5% NA
236 | 
237 |     ##         new variable 'Income7' (factor) with 8 unique values and 5% NA
238 | 
239 |     ##         new variable 'CampaignInterest' (factor) with 3 unique values and 0% NA
240 | 
241 |     ##         new variable 'TrustGovernment' (factor) with 6 unique values and 1% NA
242 | 
243 |     ##         new variable 'TrustPeople' (factor) with 6 unique values and <1% NA
244 | 
245 |     ##         new variable 'VotedPres2012' (factor) with 3 unique values and <1% NA
246 | 
247 |     ##         new variable 'VotedPres2012_selection' (factor) with 4 unique values and 28% NA
248 | 
249 |     ##         new variable 'VotedPres2016' (factor) with 3 unique values and 22% NA
250 | 
251 |     ##         new variable 'VotedPres2016_selection' (factor) with 4 unique values and 34% NA
252 | 
253 |     ##         new variable 'EarlyVote2016' (factor) with 3 unique values and 32% NA
254 | 
255 | ``` r
256 | summary(anes)
257 | ```
258 | 
259 |     ##     V160102          V160201          V160202         V160501     
260 |     ##  Min.   :0.0000   Min.   :  1.00   Min.   :1.000   Min.   :1.000  
261 |     ##  1st Qu.:0.3934   1st Qu.: 36.00   1st Qu.:1.000   1st Qu.:1.000  
262 |     ##  Median :0.7481   Median : 71.00   Median :1.500   Median :2.000  
263 |     ##  Mean   :0.8541   Mean   : 69.58   Mean   :1.505   Mean   :1.724  
264 |     ##  3rd Qu.:1.1294   3rd Qu.:105.00   3rd Qu.:2.000   3rd Qu.:2.000  
265 |     ##  Max.   :6.4445   Max.   :133.00   Max.   :3.000   Max.   :2.000  
266 |     ##                                                                   
267 |     ##     V161004       V161005          V161006           V161024x    
268 |     ##  Min.   :1.0   Min.   :-9.000   Min.   :-9.0000   Min.   :1.000  
269 |     ##  1st Qu.:1.0   1st Qu.: 1.000   1st Qu.:-1.0000   1st Qu.:3.000  
270 |     ##  Median :1.0   Median : 1.000   Median : 1.0000   Median :3.000  
271 |     ##  Mean   :1.6   Mean   : 1.232   Mean   : 0.6773   Mean   :2.804  
272 |     ##  3rd Qu.:2.0   3rd Qu.: 2.000   3rd Qu.: 2.0000   3rd Qu.:3.000  
273 |     ##  Max.   :3.0   Max.   : 2.000   Max.   : 6.0000   Max.   :4.000  
274 |     ##                                                                  
275 |     ##     V161158x         V161215         V161219          V161267     
276 |     ##  Min.   :-9.000   Min.   :-9.00   Min.   :-9.000   Min.   :-9.00  
277 |     ##  1st Qu.: 2.000   1st Qu.: 3.00   1st Qu.: 2.000   1st Qu.:33.00  
278 |     ##  Median : 4.000   Median : 4.00   Median : 3.000   Median :49.00  
279 |     ##  Mean   : 3.792   Mean   : 3.49   Mean   : 2.831   Mean   :47.92  
280 |     ##  3rd Qu.: 6.000   3rd Qu.: 4.00   3rd Qu.: 4.000   3rd Qu.:63.00  
281 |     ##  Max.   : 7.000   Max.   : 5.00   Max.   : 5.000   Max.   :90.00  
282 |     ##                                                                   
283 |     ##     V161270         V161310x         V161342          V161361x    
284 |     ##  Min.   :-9.00   Min.   :-2.000   Min.   :-9.000   Min.   :-9.00  
285 |     ##  1st Qu.: 9.00   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 8.00  
286 |     ##  Median :11.00   Median : 1.000   Median : 2.000   Median :15.00  
287 |     ##  Mean   :11.66   Mean   : 1.787   Mean   : 1.432   Mean   :14.25  
288 |     ##  3rd Qu.:13.00   3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.:22.00  
289 |     ##  Max.   :95.00   Max.   : 6.000   Max.   : 3.000   Max.   :28.00  
290 |     ##                                                                   
291 |     ##     V162031          V162031x          V162034           V162034a      
292 |     ##  Min.   :-8.000   Min.   :-8.0000   Min.   :-9.0000   Min.   :-9.0000  
293 |     ##  1st Qu.:-1.000   1st Qu.: 0.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
294 |     ##  Median : 4.000   Median : 1.0000   Median : 1.0000   Median : 1.0000  
295 |     ##  Mean   : 1.759   Mean   : 0.2349   Mean   :-0.4625   Mean   :-0.1468  
296 |     ##  3rd Qu.: 4.000   3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 2.0000  
297 |     ##  Max.   : 4.000   Max.   : 1.0000   Max.   : 2.0000   Max.   : 9.0000  
298 |     ##                                                                        
299 |     ##     V162062x       InterviewMode     Weight          Stratum     VarUnit 
300 |     ##  Min.   :-9.0000   FTF:1180      Min.   :0.0000   123    :  57   1:2135  
301 |     ##  1st Qu.:-2.0000   Web:3090      1st Qu.:0.3934   121    :  55   2:2115  
302 |     ##  Median : 1.0000                 Median :0.7481   126    :  55   3:  20  
303 |     ##  Mean   : 0.3393                 Mean   :0.8541   118    :  52           
304 |     ##  3rd Qu.: 2.0000                 3rd Qu.:1.1294   108    :  50           
305 |     ##  Max.   : 5.0000                 Max.   :6.4445   107    :  46           
306 |     ##                                                   (Other):3955           
307 |     ##       Age               AgeGroup      Gender                    RaceEth    
308 |     ##  Min.   :18.00   18-29      :651   Male  :1987   White              :3038  
309 |     ##  1st Qu.:34.00   30-39      :761   Female:2231   Black              : 397  
310 |     ##  Median :50.00   40-49      :620   Other :  11   Hispanic           : 450  
311 |     ##  Mean   :49.58   50-59      :781   NA's  :  41   Asian, NH/PI       : 148  
312 |     ##  3rd Qu.:63.00   60-69      :769                 AI/AN              :  27  
313 |     ##  Max.   :90.00   70 or older:567                 Other/multiple race: 177  
314 |     ##  NA's   :121     NA's       :121                 NA's               :  33  
315 |     ##                        PartyID           Education          Income    
316 |     ##  Strong democrat           :890   Less than HS: 282   Under $5k: 275  
317 |     ##  Strong republican         :721   High school : 815   $80-90k  : 231  
318 |     ##  Independent               :579   Post HS     :1499   $30-35k  : 213  
319 |     ##  Not very strong democrat  :559   Bachelor's  : 955   $60-65k  : 205  
320 |     ##  Not very strong republican:508   Graduate    : 680   $50-55k  : 204  
321 |     ##  (Other)                   :990   NA's        :  39   (Other)  :2940  
322 |     ##  NA's                      : 23                       NA's     : 202  
323 |     ##           Income7                CampaignInterest            TrustGovernment
324 |     ##  $20-40k      :773   Very much interested:2230    Always             :  66  
325 |     ##  Under $20k   :703   Somewhat interested :1519    Most of the time   : 429  
326 |     ##  $40-60k      :621   Not much interested : 521    About half the time:1382  
327 |     ##  $125k or more:615                                Some of the time   :1826  
328 |     ##  $60-80k      :576                                Never              : 545  
329 |     ##  (Other)      :780                                NA's               :  22  
330 |     ##  NA's         :202                                                          
331 |     ##               TrustPeople   VotedPres2012 VotedPres2012_selection VotedPres2016
332 |     ##  Always             :  50   Yes :3117     Obama :1728             Yes :2887    
333 |     ##  Most of the time   :1765   No  :1137     Romney:1268             No  : 444    
334 |     ##  About half the time:1305   NA's:  16     Other :  58             NA's: 939    
335 |     ##  Some of the time   : 947                 NA's  :1216                          
336 |     ##  Never              : 188                                                      
337 |     ##  NA's               :  15                                                      
338 |     ##                                                                                
339 |     ##  VotedPres2016_selection EarlyVote2016
340 |     ##  Clinton:1364            Yes : 156    
341 |     ##  Trump  :1245            No  :2731    
342 |     ##  Other  : 202            NA's:1383    
343 |     ##  NA's   :1459                         
344 |     ##                                       
345 |     ##                                       
346 |     ## 
347 | 
348 | ## Check derived variables for correct coding
349 | 
350 | ``` r
351 | anes %>% count(InterviewMode, V160501)
352 | ```
353 | 
354 |     ## count: now 2 rows and 3 columns, ungrouped
355 | 
356 |     ## # A tibble: 2 x 3
357 |     ##   InterviewMode          V160501     n
358 |     ##   <fct>                <dbl+lbl> <int>
359 |     ## 1 FTF           1 [1. FTF /CASI]  1180
360 |     ## 2 Web           2 [2. Web]        3090
361 | 
362 | ``` r
363 | anes %>% group_by(AgeGroup) %>% summarise(minAge=min(Age), maxAge=max(Age), minV=min(V161267), maxV=max(V161267))
364 | ```
365 | 
366 |     ## group_by: one grouping variable (AgeGroup)
367 | 
368 |     ## summarise: now 7 rows and 5 columns, ungrouped
369 | 
370 |     ## # A tibble: 7 x 5
371 |     ##   AgeGroup    minAge maxAge                   minV                          maxV
372 |     ##   <fct>        <dbl>  <dbl>              <dbl+lbl>                     <dbl+lbl>
373 |     ## 1 18-29           18     29 18                     29                           
374 |     ## 2 30-39           30     39 30                     39                           
375 |     ## 3 40-49           40     49 40                     49                           
376 |     ## 4 50-59           50     59 50                     59                           
377 |     ## 5 60-69           60     69 60                     69                           
378 |     ## 6 70 or older     70     90 70                     90 [90. Age 90 or older]     
379 |     ## 7 NA              NA     NA -9 [-9. RF (year of b~ -8 [-8. DK (year of birth, F~
380 | 
381 | ``` r
382 | anes %>% count(Gender, V161342)
383 | ```
384 | 
385 |     ## count: now 4 rows and 3 columns, ungrouped
386 | 
387 |     ## # A tibble: 4 x 3
388 |     ##   Gender          V161342     n
389 |     ##   <fct>         <dbl+lbl> <int>
390 |     ## 1 Male    1 [1. Male]      1987
391 |     ## 2 Female  2 [2. Female]    2231
392 |     ## 3 Other   3 [3. Other]       11
393 |     ## 4 NA     -9 [-9. Refused]    41
394 | 
395 | ``` r
396 | anes %>% count(RaceEth, V161310x)
397 | ```
398 | 
399 |     ## count: now 7 rows and 3 columns, ungrouped
400 | 
401 |     ## # A tibble: 7 x 3
402 |     ##   RaceEth                                                         V161310x     n
403 |     ##   <fct>                                                          <dbl+lbl> <int>
404 |     ## 1 White             1 [1. White, non-Hispanic]                              3038
405 |     ## 2 Black             2 [2. Black, non-Hispanic]                               397
406 |     ## 3 Hispanic          5 [5. Hispanic]                                          450
407 |     ## 4 Asian, NH/PI      3 [3. Asian, native Hawaiian or other Pacif Islr,non-~   148
408 |     ## 5 AI/AN             4 [4. Native American or Alaska Native, non-Hispanic]     27
409 |     ## 6 Other/multiple ~  6 [6. Other non-Hispanic incl multiple races [WEB: bl~   177
410 |     ## 7 NA               -2 [-2. Missing]                                           33
411 | 
412 | ``` r
413 | anes %>% count(PartyID, V161158x)
414 | ```
415 | 
416 |     ## count: now 9 rows and 3 columns, ungrouped
417 | 
418 |     ## # A tibble: 9 x 3
419 |     ##   PartyID                                                         V161158x     n
420 |     ##   <fct>                                                          <dbl+lbl> <int>
421 |     ## 1 Strong democrat         1 [1. Strong Democrat]                             890
422 |     ## 2 Not very strong democ~  2 [2. Not very strong Democract]                   559
423 |     ## 3 Independent-democrat    3 [3. Independent-Democrat]                        490
424 |     ## 4 Independent             4 [4. Independent]                                 579
425 |     ## 5 Independent-republican  5 [5. Independent-Republican]                      500
426 |     ## 6 Not very strong repub~  6 [6. Not very strong Republican]                  508
427 |     ## 7 Strong republican       7 [7. Strong Republican]                           721
428 |     ## 8 NA                     -9 [-9. RF (-9) in V161155 (FTF only) /-9 in V16~    12
429 |     ## 9 NA                     -8 [-8. DK (-8) in V161156 or V161157 (FTF only)]    11
430 | 
431 | ``` r
432 | anes %>% count(Education, V161270)
433 | ```
434 | 
435 |     ## count: now 19 rows and 3 columns, ungrouped
436 | 
437 |     ## # A tibble: 19 x 3
438 |     ##    Education                                                       V161270     n
439 |     ##    <fct>                                                         <dbl+lbl> <int>
440 |     ##  1 Less than HS  1 [1. Less than 1st grade]                                    1
441 |     ##  2 Less than HS  2 [2. 1st, 2nd, 3rd or 4th grade]                             3
442 |     ##  3 Less than HS  3 [3. 5th or 6th grade]                                      15
443 |     ##  4 Less than HS  4 [4. 7th or 8th grade]                                      22
444 |     ##  5 Less than HS  5 [5. 9th grade]                                             32
445 |     ##  6 Less than HS  6 [6. 10th grade]                                            40
446 |     ##  7 Less than HS  7 [7. 11th grade]                                            62
447 |     ##  8 Less than HS  8 [8. 12th grade no diploma]                                107
448 |     ##  9 High school   9 [9. High school graduate- high school diploma or equiv~   810
449 |     ## 10 High school  90 [90. Other specify given as: high school graduate]          5
450 |     ## 11 Post HS      10 [10. Some college but no degree]                          898
451 |     ## 12 Post HS      11 [11. Associate degree in college - occupational /vocat~   313
452 |     ## 13 Post HS      12 [12. Associate degree in college -- academic program]     288
453 |     ## 14 Bachelor's   13 [13. Bachelor's degree (for example: BA, AB, BS)]         955
454 |     ## 15 Graduate     14 [14. Master's degree (for example: MA, MS, MENG, MED, ~   499
455 |     ## 16 Graduate     15 [15. Professional school degree (for example: MD, DDS,~    88
456 |     ## 17 Graduate     16 [16. Doctorate degree (for example: PHD, EDD)]             93
457 |     ## 18 NA           -9 [-9. Refused]                                              15
458 |     ## 19 NA           95 [95. Other SPECIFY]                                        24
459 | 
460 | ``` r
461 | anes %>% count(Income, Income7, V161361x) %>% print(n=30)
462 | ```
463 | 
464 |     ## count: now 30 rows and 4 columns, ungrouped
465 | 
466 |     ## # A tibble: 30 x 4
467 |     ##    Income       Income7                                           V161361x     n
468 |     ##    <fct>        <fct>                                            <dbl+lbl> <int>
469 |     ##  1 Under $5k    Under $20k     1 [01. Under $5,000]                          275
470 |     ##  2 $5-10k       Under $20k     2 [02. $5,000-$9,999]                          96
471 |     ##  3 $10-12.5k    Under $20k     3 [03. $10,000-$12,499]                       133
472 |     ##  4 $12.5-15     Under $20k     4 [04. $12,500-$14,999]                        37
473 |     ##  5 $15-17.5k    Under $20k     5 [05. $15,000-$17,499]                       110
474 |     ##  6 $17.5-20k    Under $20k     6 [06. $17,500-$19,999]                        52
475 |     ##  7 $20-22.5k    $20-40k        7 [07. $20,000-$22,499]                       153
476 |     ##  8 $22.5-25k    $20-40k        8 [08. $22,500-$24,999]                        64
477 |     ##  9 $25-27.5k    $20-40k        9 [09. $25,000-$27,499]                       143
478 |     ## 10 $27.5-30k    $20-40k       10 [10. $27,500-$29,999]                        34
479 |     ## 11 $30-35k      $20-40k       11 [11. $30,000-$34,999]                       213
480 |     ## 12 $35-40k      $20-40k       12 [12. $35,000-$39,999]                       166
481 |     ## 13 $40-45k      $40-60k       13 [13. $40,000-$44,999]                       178
482 |     ## 14 $45-50k      $40-60k       14 [14. $45,000-$49,999]                       154
483 |     ## 15 $50-55k      $40-60k       15 [15. $50,000-$54,999]                       204
484 |     ## 16 $55-60k      $40-60k       16 [16. $55,000-$59,999]                        85
485 |     ## 17 $60-65k      $60-80k       17 [17. $60,000-$64,999]                       205
486 |     ## 18 $65-70k      $60-80k       18 [18. $65,000-$69,999]                       107
487 |     ## 19 $70-75k      $60-80k       19 [19. $70,000-$74,999]                       138
488 |     ## 20 $75-80k      $60-80k       20 [20. $75,000-$79,999]                       126
489 |     ## 21 $80-90k      $80-100k      21 [21. $80,000-$89,999]                       231
490 |     ## 22 $90-100k     $80-100k      22 [22. $90,000-$99,999]                       176
491 |     ## 23 $100-110k    $100-125k     23 [23. $100,000-$109,999]                     191
492 |     ## 24 $110-125k    $100-125k     24 [24. $110,000-$124,999]                     182
493 |     ## 25 $125-150k    $125k or more 25 [25. $125,000-$149,999]                     166
494 |     ## 26 $150-175k    $125k or more 26 [26. $150,000-$174,999]                     154
495 |     ## 27 $175-250k    $125k or more 27 [27. $175,000-$249,999]                     154
496 |     ## 28 $250k or mo~ $125k or more 28 [28. $250,000 or more]                      141
497 |     ## 29 NA           NA            -9 [-9. Refused]                               190
498 |     ## 30 NA           NA            -5 [-5. Interview breakoff (sufficient part~    12
499 | 
500 | ``` r
501 | anes %>% count(CampaignInterest, V161004)
502 | ```
503 | 
504 |     ## count: now 3 rows and 3 columns, ungrouped
505 | 
506 |     ## # A tibble: 3 x 3
507 |     ##   CampaignInterest                         V161004     n
508 |     ##   <fct>                                  <dbl+lbl> <int>
509 |     ## 1 Very much interested 1 [1. Very much interested]  2230
510 |     ## 2 Somewhat interested  2 [2. Somewhat interested]   1519
511 |     ## 3 Not much interested  3 [3. Not much interested]    521
512 | 
513 | ``` r
514 | anes %>% count(TrustGovernment, V161215)
515 | ```
516 | 
517 |     ## count: now 7 rows and 3 columns, ungrouped
518 | 
519 |     ## # A tibble: 7 x 3
520 |     ##   TrustGovernment                            V161215     n
521 |     ##   <fct>                                    <dbl+lbl> <int>
522 |     ## 1 Always               1 [1. Always]                    66
523 |     ## 2 Most of the time     2 [2. Most of the time]         429
524 |     ## 3 About half the time  3 [3. About half the time]     1382
525 |     ## 4 Some of the time     4 [4. Some of the time]        1826
526 |     ## 5 Never                5 [5. Never]                    545
527 |     ## 6 NA                  -9 [-9. Refused]                  19
528 |     ## 7 NA                  -8 [-8. Don't know (FTF only)]     3
529 | 
530 | ``` r
531 | anes %>% count(TrustPeople, V161219)
532 | ```
533 | 
534 |     ## count: now 7 rows and 3 columns, ungrouped
535 | 
536 |     ## # A tibble: 7 x 3
537 |     ##   TrustPeople                                V161219     n
538 |     ##   <fct>                                    <dbl+lbl> <int>
539 |     ## 1 Always               1 [1. Always]                    50
540 |     ## 2 Most of the time     2 [2. Most of the time]        1765
541 |     ## 3 About half the time  3 [3. About half the time]     1305
542 |     ## 4 Some of the time     4 [4. Some of the time]         947
543 |     ## 5 Never                5 [5. Never]                    188
544 |     ## 6 NA                  -9 [-9. Refused]                  14
545 |     ## 7 NA                  -8 [-8. Don't know (FTF only)]     1
546 | 
547 | ``` r
548 | anes %>% count(VotedPres2012, V161005)
549 | ```
550 | 
551 |     ## count: now 4 rows and 3 columns, ungrouped
552 | 
553 |     ## # A tibble: 4 x 3
554 |     ##   VotedPres2012                        V161005     n
555 |     ##   <fct>                              <dbl+lbl> <int>
556 |     ## 1 Yes            1 [1. Yes, voted]              3117
557 |     ## 2 No             2 [2. No, didn't vote]         1137
558 |     ## 3 NA            -9 [-9. Refused]                   2
559 |     ## 4 NA            -8 [-8. Don't know (FTF only)]    14
560 | 
561 | ``` r
562 | anes %>% count(VotedPres2012_selection, V161006)
563 | ```
564 | 
565 |     ## count: now 7 rows and 3 columns, ungrouped
566 | 
567 |     ## # A tibble: 7 x 3
568 |     ##   VotedPres2012_select~                                            V161006     n
569 |     ##   <fct>                                                          <dbl+lbl> <int>
570 |     ## 1 Obama                  1 [1. Barack Obama]                                1728
571 |     ## 2 Romney                 2 [2. Mitt Romney]                                 1268
572 |     ## 3 Other                  5 [5. Other SPECIFY]                                 58
573 |     ## 4 NA                    -9 [-9. Refused]                                      47
574 |     ## 5 NA                    -8 [-8. Don't know (FTF only)]                        13
575 |     ## 6 NA                    -1 [-1. Inap, 2,-8,-9 in V161005]                   1153
576 |     ## 7 NA                     6 [6. Other specify - specified as:  Did not vot~     3
577 | 
578 | ``` r
579 | anes %>% count(VotedPres2016, V162031x)
580 | ```
581 | 
582 |     ## count: now 4 rows and 3 columns, ungrouped
583 | 
584 |     ## # A tibble: 4 x 3
585 |     ##   VotedPres2016                                                   V162031x     n
586 |     ##   <fct>                                                          <dbl+lbl> <int>
587 |     ## 1 Yes            1 [1. Voted in 2016]                                       2887
588 |     ## 2 No             0 [0. Did not vote in 2016]                                 444
589 |     ## 3 NA            -8 [-8. Don't know (in V162031)]                               1
590 |     ## 4 NA            -2 [-2. Missing, 3 in V162022 /FTF: -8,-9 in V162022 /WEB~   938
591 | 
592 | ``` r
593 | anes %>% count(VotedPres2016_selection, V162062x)
594 | ```
595 | 
596 |     ## count: now 8 rows and 3 columns, ungrouped
597 | 
598 |     ## # A tibble: 8 x 3
599 |     ##   VotedPres2016_select~                                           V162062x     n
600 |     ##   <fct>                                                          <dbl+lbl> <int>
601 |     ## 1 Clinton                1 [1. Hillary Clinton]                             1364
602 |     ## 2 Trump                  2 [2. Donald Trump]                                1245
603 |     ## 3 Other                  3 [3. Gary Johnson]                                 118
604 |     ## 4 Other                  4 [4. Jill Stein]                                    32
605 |     ## 5 Other                  5 [5. Other candidate SPECIFY]                       52
606 |     ## 6 NA                    -9 [-9. Refused]                                      31
607 |     ## 7 NA                    -8 [-8. Don't know (FTF only)]                         2
608 |     ## 8 NA                    -2 [-2. Missing, no vote for Pres in Post /no Pos~  1426
609 | 
610 | ``` r
611 | anes %>% count(EarlyVote2016, V161024x, VotedPres2016)
612 | ```
613 | 
614 |     ## count: now 10 rows and 4 columns, ungrouped
615 | 
616 |     ## # A tibble: 10 x 4
617 |     ##    EarlyVote2016                                    V161024x VotedPres2016     n
618 |     ##    <fct>                                           <dbl+lbl> <fct>         <int>
619 |     ##  1 Yes           4 [4. Registered and voted early]           Yes             156
620 |     ##  2 No            1 [1. Not (or DK /RF if) registered, does ~ Yes              28
621 |     ##  3 No            2 [2. Not (or DK /RF if) registered, inten~ Yes              65
622 |     ##  4 No            3 [3. Registered but did not vote early (o~ Yes            2638
623 |     ##  5 NA            1 [1. Not (or DK /RF if) registered, does ~ No               31
624 |     ##  6 NA            1 [1. Not (or DK /RF if) registered, does ~ NA              322
625 |     ##  7 NA            2 [2. Not (or DK /RF if) registered, inten~ No               46
626 |     ##  8 NA            2 [2. Not (or DK /RF if) registered, inten~ NA              120
627 |     ##  9 NA            3 [3. Registered but did not vote early (o~ No              367
628 |     ## 10 NA            3 [3. Registered but did not vote early (o~ NA              497
629 | 
630 | ``` r
631 | anes %>%
632 |    summarise(WtSum=sum(Weight)) %>%
633 |    pull(WtSum)
634 | ```
635 | 
636 |     ## summarise: now one row and one column, ungrouped
637 | 
638 |     ## [1] 3646.921
639 | 
640 | ## Save data
641 | 
642 | ``` r
643 | write_rds(anes, here("Data", "anes.rds"), compress="gz")
644 | ```
645 | 


--------------------------------------------------------------------------------
/DataCleaningScripts/RECS_DataPrep.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Residential Energy Consumption Survey (RECS) 2015 Data Prep"
  3 | output: github_document
  4 | ---
  5 | 
  6 | ```{r setup, include=FALSE}
  7 | knitr::opts_chunk$set(echo = TRUE)
  8 | ```
  9 | 
 10 | ## Data information
 11 | 
 12 | All data and resources were downloaded from https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata on March 3, 2021.
 13 | 
 14 | ```{r loadpackageh, message=FALSE}
 15 | library(here) #easy relative paths
 16 | ```
 17 | 
 18 | ```{r loadpackages}
 19 | library(tidyverse) #data manipulation
 20 | library(haven) #data import
 21 | library(tidylog) #informative logging messages
 22 | ```
 23 | ## Import data and create derived variables
 24 | 
 25 | ```{r derivedata}
 26 | recs_in <- read_csv(here("RawData", "RECS_2015", "recs2015_public_v4.csv"))
 27 | # Keep the needed raw columns (renaming the wood/pellet BTU fields), then add the recoded analysis variables
 28 | recs <- recs_in %>%
 29 |   select(DOEID, REGIONC, DIVISION, METROMICRO, UATYP10, TYPEHUQ, YEARMADERANGE, HEATHOME, EQUIPMUSE, TEMPHOME, TEMPGONE, TEMPNITE, AIRCOND, USECENAC, TEMPHOMEAC, TEMPGONEAC, TEMPNITEAC, TOTCSQFT, TOTHSQFT, TOTSQFT_EN, TOTUCSQFT, TOTUSQFT, NWEIGHT, starts_with("BRRWT"), CDD30YR, CDD65, CDD80, CLIMATE_REGION_PUB, IECC_CLIMATE_PUB, HDD30YR, HDD65, HDD50, GNDHDD65, BTUEL, DOLLAREL, BTUNG, DOLLARNG, BTULP, DOLLARLP, BTUFO, DOLLARFO, TOTALBTU, TOTALDOL, BTUWOOD = WOODBTU, BTUPELLET = PELLETBTU) %>%
 30 |   mutate(
 31 |     Region = parse_factor(
 32 |       case_when(
 33 |         REGIONC == 1 ~ "Northeast",
 34 |         REGIONC == 2 ~ "Midwest",
 35 |         REGIONC == 3 ~ "South",
 36 |         REGIONC == 4 ~ "West"
 37 |       ), levels = c("Northeast", "Midwest", "South", "West")),
 38 |     Division = parse_factor(
 39 |       case_when(
 40 |         DIVISION == 1 ~ "New England",
 41 |         DIVISION == 2 ~ "Middle Atlantic",
 42 |         DIVISION == 3 ~ "East North Central",
 43 |         DIVISION == 4 ~ "West North Central",
 44 |         DIVISION == 5 ~ "South Atlantic",
 45 |         DIVISION == 6 ~ "East South Central",
 46 |         DIVISION == 7 ~ "West South Central",
 47 |         DIVISION == 8 ~ "Mountain North",
 48 |         DIVISION == 9 ~ "Mountain South",
 49 |         DIVISION == 10 ~ "Pacific"
 50 |       ), levels = c("New England", "Middle Atlantic", "East North Central", "West North Central", "South Atlantic", "East South Central", "West South Central", "Mountain North", "Mountain South", "Pacific")),
 51 |     MSAStatus = fct_recode(METROMICRO, "Metropolitan Statistical Area" = "METRO", "Micropolitan Statistical Area" = "MICRO", "None" = "NONE"),
 52 |     Urbanicity = parse_factor(
 53 |       case_when(
 54 |         UATYP10 == "U" ~ "Urban Area",
 55 |         UATYP10 == "C" ~ "Urban Cluster",
 56 |         UATYP10 == "R" ~ "Rural"
 57 |       ),
 58 |       levels = c("Urban Area", "Urban Cluster", "Rural")
 59 |     ),
 60 |     HousingUnitType = parse_factor(
 61 |       case_when(
 62 |         TYPEHUQ == 1 ~ "Mobile home",
 63 |         TYPEHUQ == 2 ~ "Single-family detached",
 64 |         TYPEHUQ == 3 ~ "Single-family attached",
 65 |         TYPEHUQ == 4 ~ "Apartment: 2-4 Units",
 66 |         TYPEHUQ == 5 ~ "Apartment: 5 or more units"
 67 |       ), levels = c("Mobile home", "Single-family detached", "Single-family attached", "Apartment: 2-4 Units", "Apartment: 5 or more units")),
 68 |     YearMade = parse_factor(
 69 |       case_when(
 70 |         YEARMADERANGE == 1 ~ "Before 1950",
 71 |         YEARMADERANGE == 2 ~ "1950-1959",
 72 |         YEARMADERANGE == 3 ~ "1960-1969",
 73 |         YEARMADERANGE == 4 ~ "1970-1979",
 74 |         YEARMADERANGE == 5 ~ "1980-1989",
 75 |         YEARMADERANGE == 6 ~ "1990-1999",
 76 |         YEARMADERANGE == 7 ~ "2000-2009",
 77 |         YEARMADERANGE == 8 ~ "2010-2015"
 78 |       ),
 79 |       levels = c("Before 1950", "1950-1959", "1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000-2009", "2010-2015"),
 80 |       ordered = TRUE
 81 |     ),
 82 |     SpaceHeatingUsed = as.logical(HEATHOME),
 83 |     HeatingBehavior = parse_factor(
 84 |       case_when(
 85 |         EQUIPMUSE == 1 ~ "Set one temp and leave it",
 86 |         EQUIPMUSE == 2 ~ "Manually adjust at night/no one home",
 87 |         EQUIPMUSE == 3 ~ "Program thermostat to change at certain times",
 88 |         EQUIPMUSE == 4 ~ "Turn on or off as needed",
 89 |         EQUIPMUSE == 5 ~ "No control",
 90 |         EQUIPMUSE == 9 ~ "Other",
 91 |         EQUIPMUSE == -9 ~ NA_character_),
 92 |       levels = c("Set one temp and leave it", "Manually adjust at night/no one home", "Program thermostat to change at certain times", "Turn on or off as needed", "No control", "Other")
 93 |     ),
 94 |     WinterTempDay = if_else(TEMPHOME > 0, TEMPHOME, NA_real_),
 95 |     WinterTempAway = if_else(TEMPGONE > 0, TEMPGONE, NA_real_),
 96 |     WinterTempNight = if_else(TEMPNITE > 0, TEMPNITE, NA_real_),
 97 |     ACUsed = as.logical(AIRCOND),
 98 |     ACBehavior = parse_factor(
 99 |       case_when(
100 |         USECENAC == 1 ~ "Set one temp and leave it",
101 |         USECENAC == 2 ~ "Manually adjust at night/no one home",
102 |         USECENAC == 3 ~ "Program thermostat to change at certain times",
103 |         USECENAC == 4 ~ "Turn on or off as needed",
104 |         USECENAC == 5 ~ "No control",
105 |         USECENAC == -9 ~ NA_character_),
106 |       levels = c("Set one temp and leave it", "Manually adjust at night/no one home", "Program thermostat to change at certain times", "Turn on or off as needed", "No control")
107 |     ),
108 |     SummerTempDay = if_else(TEMPHOMEAC > 0, TEMPHOMEAC, NA_real_),
109 |     SummerTempAway = if_else(TEMPGONEAC > 0, TEMPGONEAC, NA_real_),
110 |     SummerTempNight = if_else(TEMPNITEAC > 0, TEMPNITEAC, NA_real_),
111 |     ClimateRegion_BA = parse_factor(CLIMATE_REGION_PUB),
112 |     ClimateRegion_IECC = factor(IECC_CLIMATE_PUB)
113 |     # NOTE(review): the knitted log reports 0% NA for HeatingBehavior/ACBehavior, so parse_factor() appears to keep NA as an explicit level - confirm this is intended
114 |   )
115 | 
116 | ```
117 | 
118 | 
119 | ## Check derived variables for correct coding
120 | 
121 | ```{r checkvars}
122 | # Cross-tabulate each derived variable against the raw column it was built from
123 | count(recs, Region, REGIONC)
124 | count(recs, Division, DIVISION)
125 | count(recs, MSAStatus, METROMICRO)
126 | count(recs, Urbanicity, UATYP10)
127 | count(recs, HousingUnitType, TYPEHUQ)
128 | count(recs, YearMade, YEARMADERANGE)
129 | count(recs, SpaceHeatingUsed, HEATHOME)
130 | count(recs, HeatingBehavior, EQUIPMUSE)
131 | count(recs, ACUsed, AIRCOND)
132 | count(recs, ACBehavior, USECENAC)
133 | count(recs, ClimateRegion_BA, CLIMATE_REGION_PUB)
134 | count(recs, ClimateRegion_IECC, IECC_CLIMATE_PUB)
135 | ```
136 | ## Save data
137 | 
138 | ```{r savedat}
139 | recs_out <- select(
140 |   recs, DOEID, Region, Division, MSAStatus, Urbanicity, HousingUnitType, YearMade, SpaceHeatingUsed, HeatingBehavior, WinterTempDay, WinterTempAway, WinterTempNight, ACUsed, ACBehavior, SummerTempDay, SummerTempAway, SummerTempNight, TOTCSQFT, TOTHSQFT, TOTSQFT_EN, TOTUCSQFT, TOTUSQFT, NWEIGHT, starts_with("BRRWT"), CDD30YR, CDD65, CDD80, ClimateRegion_BA, ClimateRegion_IECC, HDD30YR, HDD65, HDD50, GNDHDD65, BTUEL, DOLLAREL, BTUNG, DOLLARNG, BTULP, DOLLARLP, BTUFO, DOLLARFO, TOTALBTU, TOTALDOL, BTUWOOD, BTUPELLET)
141 | # Print a summary of the output columns, then write them to Data/recs.rds with gzip compression
142 | summary(recs_out)
143 | write_rds(recs_out, here("Data", "recs.rds"), compress = "gz")
144 | ```
145 | 
146 | 
147 | 


--------------------------------------------------------------------------------
/DataCleaningScripts/RECS_DataPrep.md:
--------------------------------------------------------------------------------
  1 | Residential Energy Consumption Survey (RECS) 2015 Data Prep
  2 | ================
  3 | 
  4 | ## Data information
  5 | 
  6 | All data and resources were downloaded from
  7 | <https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata>
  8 | on March 3, 2021.
  9 | 
 10 | ``` r
 11 | library(here) #easy relative paths
 12 | ```
 13 | 
 14 |     ## Warning: package 'here' was built under R version 4.0.4
 15 | 
 16 | ``` r
 17 | library(tidyverse) #data manipulation
 18 | ```
 19 | 
 20 |     ## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.3.0 --
 21 | 
 22 |     ## v ggplot2 3.3.2     v purrr   0.3.4
 23 |     ## v tibble  3.0.3     v dplyr   1.0.2
 24 |     ## v tidyr   1.1.2     v stringr 1.4.0
 25 |     ## v readr   1.3.1     v forcats 0.5.0
 26 | 
 27 |     ## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
 28 |     ## x dplyr::filter() masks stats::filter()
 29 |     ## x dplyr::lag()    masks stats::lag()
 30 | 
 31 | ``` r
 32 | library(haven) #data import
 33 | library(tidylog) #informative logging messages
 34 | ```
 35 | 
 36 |     ## Warning: package 'tidylog' was built under R version 4.0.4
 37 | 
 38 |     ## 
 39 |     ## Attaching package: 'tidylog'
 40 | 
 41 |     ## The following objects are masked from 'package:dplyr':
 42 |     ## 
 43 |     ##     add_count, add_tally, anti_join, count, distinct, distinct_all,
 44 |     ##     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
 45 |     ##     full_join, group_by, group_by_all, group_by_at, group_by_if,
 46 |     ##     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
 47 |     ##     relocate, rename, rename_all, rename_at, rename_if, rename_with,
 48 |     ##     right_join, sample_frac, sample_n, select, select_all, select_at,
 49 |     ##     select_if, semi_join, slice, slice_head, slice_max, slice_min,
 50 |     ##     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
 51 |     ##     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
 52 |     ##     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
 53 |     ##     transmute_if, ungroup
 54 | 
 55 |     ## The following objects are masked from 'package:tidyr':
 56 |     ## 
 57 |     ##     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
 58 |     ##     spread, uncount
 59 | 
 60 |     ## The following object is masked from 'package:stats':
 61 |     ## 
 62 |     ##     filter
 63 | 
 64 | ## Import data and create derived variables
 65 | 
 66 | ``` r
 67 | recs_in <- read_csv(here("RawData", "RECS_2015", "recs2015_public_v4.csv"))
 68 | ```
 69 | 
 70 |     ## Parsed with column specification:
 71 |     ## cols(
 72 |     ##   .default = col_double(),
 73 |     ##   METROMICRO = col_character(),
 74 |     ##   UATYP10 = col_character(),
 75 |     ##   CLIMATE_REGION_PUB = col_character(),
 76 |     ##   IECC_CLIMATE_PUB = col_character()
 77 |     ## )
 78 | 
 79 |     ## See spec(...) for full column specifications.
 80 | 
 81 | ``` r
 82 | recs <- recs_in %>%
 83 |    select(DOEID, REGIONC, DIVISION, METROMICRO, UATYP10, TYPEHUQ, YEARMADERANGE, HEATHOME, EQUIPMUSE, TEMPHOME, TEMPGONE, TEMPNITE, AIRCOND, USECENAC, TEMPHOMEAC, TEMPGONEAC, TEMPNITEAC, TOTCSQFT, TOTHSQFT, TOTSQFT_EN, TOTUCSQFT, TOTUSQFT, NWEIGHT, starts_with("BRRWT"), CDD30YR, CDD65, CDD80, CLIMATE_REGION_PUB, IECC_CLIMATE_PUB, HDD30YR, HDD65, HDD50, GNDHDD65, BTUEL, DOLLAREL, BTUNG, DOLLARNG, BTULP, DOLLARLP, BTUFO, DOLLARFO, TOTALBTU, TOTALDOL, BTUWOOD=WOODBTU, BTUPELLET=PELLETBTU ) %>%
 84 |    mutate(
 85 |       Region=parse_factor(
 86 |          case_when(
 87 |             REGIONC==1~"Northeast",
 88 |             REGIONC==2~"Midwest",
 89 |             REGIONC==3~"South",
 90 |             REGIONC==4~"West",
 91 |       ), levels=c("Northeast", "Midwest", "South", "West")),
 92 |       Division=parse_factor(
 93 |          case_when(
 94 |             DIVISION==1~"New England",
 95 |             DIVISION==2~"Middle Atlantic",
 96 |             DIVISION==3~"East North Central",
 97 |             DIVISION==4~"West North Central",
 98 |             DIVISION==5~"South Atlantic",
 99 |             DIVISION==6~"East South Central",
100 |             DIVISION==7~"West South Central",
101 |             DIVISION==8~"Mountain North",
102 |             DIVISION==9~"Mountain South",
103 |             DIVISION==10~"Pacific",
104 |       ), levels=c("New England", "Middle Atlantic", "East North Central", "West North Central", "South Atlantic", "East South Central", "West South Central", "Mountain North", "Mountain South", "Pacific")),
105 |       MSAStatus=fct_recode(METROMICRO, "Metropolitan Statistical Area"="METRO", "Micropolitan Statistical Area"="MICRO", "None"="NONE"),
106 |       Urbanicity=parse_factor(
107 |          case_when(
108 |             UATYP10=="U"~"Urban Area",
109 |             UATYP10=="C"~"Urban Cluster",
110 |             UATYP10=="R"~"Rural"
111 |          ),
112 |          levels=c("Urban Area", "Urban Cluster", "Rural")
113 |       ),
114 |       HousingUnitType=parse_factor(
115 |          case_when(
116 |             TYPEHUQ==1~"Mobile home",
117 |             TYPEHUQ==2~"Single-family detached",
118 |             TYPEHUQ==3~"Single-family attached",
119 |             TYPEHUQ==4~"Apartment: 2-4 Units",
120 |             TYPEHUQ==5~"Apartment: 5 or more units",
121 |       ), levels=c("Mobile home", "Single-family detached", "Single-family attached", "Apartment: 2-4 Units", "Apartment: 5 or more units")),
122 |       YearMade=parse_factor(
123 |          case_when(
124 |             YEARMADERANGE==1~"Before 1950",
125 |             YEARMADERANGE==2~"1950-1959",
126 |             YEARMADERANGE==3~"1960-1969",
127 |             YEARMADERANGE==4~"1970-1979",
128 |             YEARMADERANGE==5~"1980-1989",
129 |             YEARMADERANGE==6~"1990-1999",
130 |             YEARMADERANGE==7~"2000-2009",
131 |             YEARMADERANGE==8~"2010-2015",
132 |          ),
133 |          levels=c("Before 1950", "1950-1959", "1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000-2009", "2010-2015"),
134 |          ordered = TRUE
135 |       ),
136 |       SpaceHeatingUsed=as.logical(HEATHOME),
137 |       HeatingBehavior=parse_factor(
138 |          case_when(
139 |             EQUIPMUSE==1~"Set one temp and leave it",
140 |             EQUIPMUSE==2~"Manually adjust at night/no one home",
141 |             EQUIPMUSE==3~"Program thermostat to change at certain times",
142 |             EQUIPMUSE==4~"Turn on or off as needed",
143 |             EQUIPMUSE==5~"No control",
144 |             EQUIPMUSE==9~"Other",
145 |             EQUIPMUSE==-9~NA_character_),
146 |          levels=c("Set one temp and leave it", "Manually adjust at night/no one home", "Program thermostat to change at certain times", "Turn on or off as needed", "No control", "Other")
147 |       ),
148 |       WinterTempDay=if_else(TEMPHOME>0, TEMPHOME, NA_real_),
149 |       WinterTempAway=if_else(TEMPGONE>0, TEMPGONE, NA_real_),
150 |       WinterTempNight=if_else(TEMPNITE>0, TEMPNITE, NA_real_),
151 |       ACUsed=as.logical(AIRCOND),
152 |       ACBehavior=parse_factor(
153 |          case_when(
154 |             USECENAC==1~"Set one temp and leave it",
155 |             USECENAC==2~"Manually adjust at night/no one home",
156 |             USECENAC==3~"Program thermostat to change at certain times",
157 |             USECENAC==4~"Turn on or off as needed",
158 |             USECENAC==5~"No control",
159 |             USECENAC==-9~NA_character_),
160 |          levels=c("Set one temp and leave it", "Manually adjust at night/no one home", "Program thermostat to change at certain times", "Turn on or off as needed", "No control")
161 |       ),
162 |       SummerTempDay=if_else(TEMPHOMEAC>0, TEMPHOMEAC, NA_real_),
163 |       SummerTempAway=if_else(TEMPGONEAC>0, TEMPGONEAC, NA_real_),
164 |       SummerTempNight=if_else(TEMPNITEAC>0, TEMPNITEAC, NA_real_),
165 |       ClimateRegion_BA=parse_factor(CLIMATE_REGION_PUB),
166 |       ClimateRegion_IECC=factor(IECC_CLIMATE_PUB)
167 |       
168 |    )
169 | ```
170 | 
171 |     ## select: renamed 2 variables (BTUWOOD, BTUPELLET) and dropped 619 variables
172 | 
173 |     ## mutate: new variable 'Region' (factor) with 4 unique values and 0% NA
174 | 
175 |     ##         new variable 'Division' (factor) with 10 unique values and 0% NA
176 | 
177 |     ##         new variable 'MSAStatus' (factor) with 3 unique values and 0% NA
178 | 
179 |     ##         new variable 'Urbanicity' (factor) with 3 unique values and 0% NA
180 | 
181 |     ##         new variable 'HousingUnitType' (factor) with 5 unique values and 0% NA
182 | 
183 |     ##         new variable 'YearMade' (ordered factor) with 8 unique values and 0% NA
184 | 
185 |     ##         new variable 'SpaceHeatingUsed' (logical) with 2 unique values and 0% NA
186 | 
187 |     ##         new variable 'HeatingBehavior' (factor) with 7 unique values and 0% NA
188 | 
189 |     ##         new variable 'WinterTempDay' (double) with 35 unique values and 5% NA
190 | 
191 |     ##         new variable 'WinterTempAway' (double) with 37 unique values and 5% NA
192 | 
193 |     ##         new variable 'WinterTempNight' (double) with 38 unique values and 5% NA
194 | 
195 |     ##         new variable 'ACUsed' (logical) with 2 unique values and 0% NA
196 | 
197 |     ##         new variable 'ACBehavior' (factor) with 6 unique values and 0% NA
198 | 
199 |     ##         new variable 'SummerTempDay' (double) with 38 unique values and 13% NA
200 | 
201 |     ##         new variable 'SummerTempAway' (double) with 35 unique values and 13% NA
202 | 
203 |     ##         new variable 'SummerTempNight' (double) with 36 unique values and 13% NA
204 | 
205 |     ##         new variable 'ClimateRegion_BA' (factor) with 5 unique values and 0% NA
206 | 
207 |     ##         new variable 'ClimateRegion_IECC' (factor) with 11 unique values and 0% NA
208 | 
209 | ## Check derived variables for correct coding
210 | 
211 | ``` r
212 | count(recs, Region, REGIONC)  # Region label vs. raw REGIONC code: expect one label per code
213 | ```
214 | 
215 |     ## count: now 4 rows and 3 columns, ungrouped
216 | 
217 |     ## # A tibble: 4 x 3
218 |     ##   Region    REGIONC     n
219 |     ##   <fct>       <dbl> <int>
220 |     ## 1 Northeast       1   794
221 |     ## 2 Midwest         2  1327
222 |     ## 3 South           3  2010
223 |     ## 4 West            4  1555
224 | 
225 | ``` r
226 | count(recs, Division, DIVISION)  # Division label vs. raw DIVISION code: expect one label per code
227 | ```
228 | 
229 |     ## count: now 10 rows and 3 columns, ungrouped
230 | 
231 |     ## # A tibble: 10 x 3
232 |     ##    Division           DIVISION     n
233 |     ##    <fct>                 <dbl> <int>
234 |     ##  1 New England               1   253
235 |     ##  2 Middle Atlantic           2   541
236 |     ##  3 East North Central        3   836
237 |     ##  4 West North Central        4   491
238 |     ##  5 South Atlantic            5  1058
239 |     ##  6 East South Central        6   372
240 |     ##  7 West South Central        7   580
241 |     ##  8 Mountain North            8   228
242 |     ##  9 Mountain South            9   242
243 |     ## 10 Pacific                  10  1085
244 | 
245 | ``` r
246 | count(recs, MSAStatus, METROMICRO)  # MSAStatus label vs. raw METROMICRO string: expect a one-to-one pairing
247 | ```
248 | 
249 |     ## count: now 3 rows and 3 columns, ungrouped
250 | 
251 |     ## # A tibble: 3 x 3
252 |     ##   MSAStatus                     METROMICRO     n
253 |     ##   <fct>                         <chr>      <int>
254 |     ## 1 Metropolitan Statistical Area METRO       4745
255 |     ## 2 Micropolitan Statistical Area MICRO        584
256 |     ## 3 None                          NONE         357
257 | 
258 | ``` r
259 | count(recs, Urbanicity, UATYP10)  # Urbanicity label vs. raw UATYP10 letter: expect a one-to-one pairing
260 | ```
261 | 
262 |     ## count: now 3 rows and 3 columns, ungrouped
263 | 
264 |     ## # A tibble: 3 x 3
265 |     ##   Urbanicity    UATYP10     n
266 |     ##   <fct>         <chr>   <int>
267 |     ## 1 Urban Area    U        3928
268 |     ## 2 Urban Cluster C         598
269 |     ## 3 Rural         R        1160
270 | 
271 | ``` r
272 | count(recs, HousingUnitType, TYPEHUQ)  # HousingUnitType label vs. raw TYPEHUQ code: expect one label per code
273 | ```
274 | 
275 |     ## count: now 5 rows and 3 columns, ungrouped
276 | 
277 |     ## # A tibble: 5 x 3
278 |     ##   HousingUnitType            TYPEHUQ     n
279 |     ##   <fct>                        <dbl> <int>
280 |     ## 1 Mobile home                      1   286
281 |     ## 2 Single-family detached           2  3752
282 |     ## 3 Single-family attached           3   479
283 |     ## 4 Apartment: 2-4 Units             4   311
284 |     ## 5 Apartment: 5 or more units       5   858
285 | 
286 | ``` r
287 | count(recs, YearMade, YEARMADERANGE)  # ordered YearMade levels should line up with the YEARMADERANGE codes
288 | ```
289 | 
290 |     ## count: now 8 rows and 3 columns, ungrouped
291 | 
292 |     ## # A tibble: 8 x 3
293 |     ##   YearMade    YEARMADERANGE     n
294 |     ##   <ord>               <dbl> <int>
295 |     ## 1 Before 1950             1   858
296 |     ## 2 1950-1959               2   544
297 |     ## 3 1960-1969               3   565
298 |     ## 4 1970-1979               4   928
299 |     ## 5 1980-1989               5   874
300 |     ## 6 1990-1999               6   786
301 |     ## 7 2000-2009               7   901
302 |     ## 8 2010-2015               8   230
303 | 
304 | ``` r
305 | count(recs, SpaceHeatingUsed, HEATHOME)  # logical flag vs. raw HEATHOME: TRUE should pair only with 1
306 | ```
307 | 
308 |     ## count: now 2 rows and 3 columns, ungrouped
309 | 
310 |     ## # A tibble: 2 x 3
311 |     ##   SpaceHeatingUsed HEATHOME     n
312 |     ##   <lgl>               <dbl> <int>
313 |     ## 1 FALSE                   0   258
314 |     ## 2 TRUE                    1  5428
315 | 
316 | ``` r
317 | count(recs, HeatingBehavior, EQUIPMUSE)  # HeatingBehavior label vs. raw EQUIPMUSE code: -2 should show up only as NA
318 | ```
319 | 
320 |     ## count: now 7 rows and 3 columns, ungrouped
321 | 
322 |     ## # A tibble: 7 x 3
323 |     ##   HeatingBehavior                               EQUIPMUSE     n
324 |     ##   <fct>                                             <dbl> <int>
325 |     ## 1 Set one temp and leave it                             1  2156
326 |     ## 2 Manually adjust at night/no one home                  2  1414
327 |     ## 3 Program thermostat to change at certain times         3   972
328 |     ## 4 Turn on or off as needed                              4   761
329 |     ## 5 No control                                            5   114
330 |     ## 6 Other                                                 9    11
331 |     ## 7 <NA>                                                 -2   258
332 | 
333 | ``` r
334 | count(recs, ACUsed, AIRCOND)  # logical flag vs. raw AIRCOND: TRUE should pair only with 1
335 | ```
336 | 
337 |     ## count: now 2 rows and 3 columns, ungrouped
338 | 
339 |     ## # A tibble: 2 x 3
340 |     ##   ACUsed AIRCOND     n
341 |     ##   <lgl>    <dbl> <int>
342 |     ## 1 FALSE        0   737
343 |     ## 2 TRUE         1  4949
344 | 
345 | ``` r
346 | count(recs, ACBehavior, USECENAC)  # ACBehavior label vs. raw USECENAC code: -2 should show up only as NA
347 | ```
348 | 
349 |     ## count: now 6 rows and 3 columns, ungrouped
350 | 
351 |     ## # A tibble: 6 x 3
352 |     ##   ACBehavior                                    USECENAC     n
353 |     ##   <fct>                                            <dbl> <int>
354 |     ## 1 Set one temp and leave it                            1  1661
355 |     ## 2 Manually adjust at night/no one home                 2   984
356 |     ## 3 Program thermostat to change at certain times        3   727
357 |     ## 4 Turn on or off as needed                             4   438
358 |     ## 5 No control                                           5     2
359 |     ## 6 <NA>                                                -2  1874
360 | 
361 | ``` r
362 | count(recs, ClimateRegion_BA, CLIMATE_REGION_PUB)  # factor levels should match the source strings exactly
363 | ```
364 | 
365 |     ## count: now 5 rows and 3 columns, ungrouped
366 | 
367 |     ## # A tibble: 5 x 3
368 |     ##   ClimateRegion_BA  CLIMATE_REGION_PUB     n
369 |     ##   <fct>             <chr>              <int>
370 |     ## 1 Hot-Dry/Mixed-Dry Hot-Dry/Mixed-Dry    750
371 |     ## 2 Hot-Humid         Hot-Humid           1036
372 |     ## 3 Mixed-Humid       Mixed-Humid         1468
373 |     ## 4 Cold/Very Cold    Cold/Very Cold      2008
374 |     ## 5 Marine            Marine               424
375 | 
376 | ``` r
377 | count(recs, ClimateRegion_IECC, IECC_CLIMATE_PUB)  # factor levels should match the source strings exactly
378 | ```
379 | 
380 |     ## count: now 11 rows and 3 columns, ungrouped
381 | 
382 |     ## # A tibble: 11 x 3
383 |     ##    ClimateRegion_IECC IECC_CLIMATE_PUB     n
384 |     ##    <fct>              <chr>            <int>
385 |     ##  1 1A-2A              1A-2A              846
386 |     ##  2 2B                 2B                 106
387 |     ##  3 3A                 3A                 637
388 |     ##  4 3B-4B              3B-4B              644
389 |     ##  5 3C                 3C                 209
390 |     ##  6 4A                 4A                1021
391 |     ##  7 4C                 4C                 215
392 |     ##  8 5A                 5A                1240
393 |     ##  9 5B-5C              5B-5C              332
394 |     ## 10 6A-6B              6A-6B              376
395 |     ## 11 7A-7B-7AK-8AK      7A-7B-7AK-8AK       60
396 | 
397 | ## Save data
398 | 
399 | ``` r
400 | recs_out <- select(  # subset recs to the columns that are saved; every other column is dropped
401 |    recs, DOEID, Region, Division, MSAStatus, Urbanicity, HousingUnitType, YearMade, SpaceHeatingUsed, HeatingBehavior, WinterTempDay, WinterTempAway, WinterTempNight, ACUsed, ACBehavior, SummerTempDay, SummerTempAway, SummerTempNight, TOTCSQFT, TOTHSQFT, TOTSQFT_EN, TOTUCSQFT, TOTUSQFT, NWEIGHT, starts_with("BRRWT"), CDD30YR, CDD65, CDD80, ClimateRegion_BA, ClimateRegion_IECC, HDD30YR, HDD65, HDD50, GNDHDD65, BTUEL, DOLLAREL, BTUNG, DOLLARNG, BTULP, DOLLARLP, BTUFO, DOLLARFO, TOTALBTU, TOTALDOL, BTUWOOD, BTUPELLET)
402 | ```
403 | 
404 |     ## select: dropped 18 variables (REGIONC, DIVISION, METROMICRO, UATYP10, TYPEHUQ, …)
405 | 
406 | ``` r
407 | recs_out %>% summary()  # per-column summaries of the output data as a final look before saving
408 | ```
409 | 
410 |     ##      DOEID             Region                   Division   
411 |     ##  Min.   :10001   Northeast: 794   Pacific           :1085  
412 |     ##  1st Qu.:11422   Midwest  :1327   South Atlantic    :1058  
413 |     ##  Median :12844   South    :2010   East North Central: 836  
414 |     ##  Mean   :12844   West     :1555   West South Central: 580  
415 |     ##  3rd Qu.:14265                    Middle Atlantic   : 541  
416 |     ##  Max.   :15686                    West North Central: 491  
417 |     ##                                   (Other)           :1095  
418 |     ##                          MSAStatus            Urbanicity  
419 |     ##  Metropolitan Statistical Area:4745   Urban Area   :3928  
420 |     ##  Micropolitan Statistical Area: 584   Urban Cluster: 598  
421 |     ##  None                         : 357   Rural        :1160  
422 |     ##                                                           
423 |     ##                                                           
424 |     ##                                                           
425 |     ##                                                           
426 |     ##                    HousingUnitType        YearMade   SpaceHeatingUsed
427 |     ##  Mobile home               : 286   1970-1979  :928   Mode :logical   
428 |     ##  Single-family detached    :3752   2000-2009  :901   FALSE:258       
429 |     ##  Single-family attached    : 479   1980-1989  :874   TRUE :5428      
430 |     ##  Apartment: 2-4 Units      : 311   Before 1950:858                   
431 |     ##  Apartment: 5 or more units: 858   1990-1999  :786                   
432 |     ##                                    1960-1969  :565                   
433 |     ##                                    (Other)    :774                   
434 |     ##                                       HeatingBehavior WinterTempDay  
435 |     ##  Set one temp and leave it                    :2156   Min.   :50.00  
436 |     ##  Manually adjust at night/no one home         :1414   1st Qu.:68.00  
437 |     ##  Program thermostat to change at certain times: 972   Median :70.00  
438 |     ##  Turn on or off as needed                     : 761   Mean   :70.06  
439 |     ##  No control                                   : 114   3rd Qu.:72.00  
440 |     ##  Other                                        :  11   Max.   :90.00  
441 |     ##  NA                                           : 258   NA's   :258    
442 |     ##  WinterTempAway  WinterTempNight   ACUsed       
443 |     ##  Min.   :50.00   Min.   :50.00   Mode :logical  
444 |     ##  1st Qu.:65.00   1st Qu.:65.00   FALSE:737      
445 |     ##  Median :68.00   Median :68.00   TRUE :4949     
446 |     ##  Mean   :67.12   Mean   :68.06                  
447 |     ##  3rd Qu.:70.00   3rd Qu.:70.00                  
448 |     ##  Max.   :90.00   Max.   :90.00                  
449 |     ##  NA's   :258     NA's   :258                    
450 |     ##                                          ACBehavior   SummerTempDay  
451 |     ##  Set one temp and leave it                    :1661   Min.   :50.00  
452 |     ##  Manually adjust at night/no one home         : 984   1st Qu.:70.00  
453 |     ##  Program thermostat to change at certain times: 727   Median :72.00  
454 |     ##  Turn on or off as needed                     : 438   Mean   :72.66  
455 |     ##  No control                                   :   2   3rd Qu.:76.00  
456 |     ##  NA                                           :1874   Max.   :90.00  
457 |     ##                                                       NA's   :737    
458 |     ##  SummerTempAway  SummerTempNight    TOTCSQFT         TOTHSQFT      TOTSQFT_EN  
459 |     ##  Min.   :50.00   Min.   :50.00   Min.   :   0.0   Min.   :   0   Min.   : 221  
460 |     ##  1st Qu.:71.00   1st Qu.:70.00   1st Qu.: 466.2   1st Qu.:1008   1st Qu.:1100  
461 |     ##  Median :75.00   Median :72.00   Median :1218.5   Median :1559   Median :1774  
462 |     ##  Mean   :74.63   Mean   :71.82   Mean   :1454.5   Mean   :1816   Mean   :2081  
463 |     ##  3rd Qu.:78.00   3rd Qu.:75.00   3rd Qu.:2094.0   3rd Qu.:2400   3rd Qu.:2766  
464 |     ##  Max.   :90.00   Max.   :90.00   Max.   :8066.0   Max.   :8066   Max.   :8501  
465 |     ##  NA's   :737     NA's   :737                                                   
466 |     ##    TOTUCSQFT         TOTUSQFT         NWEIGHT           BRRWT1      
467 |     ##  Min.   :   0.0   Min.   :   0.0   Min.   :  1236   Min.   :  1836  
468 |     ##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.: 13874   1st Qu.:  9859  
469 |     ##  Median : 400.0   Median : 250.0   Median : 18510   Median : 16942  
470 |     ##  Mean   : 793.9   Mean   : 432.6   Mean   : 20789   Mean   : 20789  
471 |     ##  3rd Qu.:1150.0   3rd Qu.: 569.8   3rd Qu.: 24840   3rd Qu.: 27219  
472 |     ##  Max.   :7986.0   Max.   :6660.0   Max.   :139307   Max.   :203902  
473 |     ##                                                                     
474 |     ##      BRRWT2             BRRWT3             BRRWT4             BRRWT5        
475 |     ##  Min.   :   685.9   Min.   :   543.9   Min.   :   699.7   Min.   :   649.3  
476 |     ##  1st Qu.:  9733.0   1st Qu.:  9575.3   1st Qu.:  9518.5   1st Qu.:  9598.5  
477 |     ##  Median : 16993.7   Median : 16698.7   Median : 17034.2   Median : 16487.5  
478 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
479 |     ##  3rd Qu.: 27825.1   3rd Qu.: 27941.8   3rd Qu.: 27931.5   3rd Qu.: 27856.7  
480 |     ##  Max.   :189788.1   Max.   :180155.3   Max.   :159902.6   Max.   :141796.4  
481 |     ##                                                                             
482 |     ##      BRRWT6             BRRWT7             BRRWT8           BRRWT9        
483 |     ##  Min.   :   638.7   Min.   :   564.1   Min.   :   591   Min.   :   545.2  
484 |     ##  1st Qu.:  9501.7   1st Qu.:  9534.4   1st Qu.:  9653   1st Qu.:  9595.0  
485 |     ##  Median : 16150.6   Median : 16332.5   Median : 16802   Median : 17352.7  
486 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3  
487 |     ##  3rd Qu.: 28092.8   3rd Qu.: 27992.5   3rd Qu.: 27926   3rd Qu.: 27753.7  
488 |     ##  Max.   :189031.8   Max.   :192311.7   Max.   :195071   Max.   :117167.3  
489 |     ##                                                                           
490 |     ##     BRRWT10            BRRWT11            BRRWT12            BRRWT13      
491 |     ##  Min.   :   732.5   Min.   :   586.1   Min.   :   549.8   Min.   :   668  
492 |     ##  1st Qu.:  9077.6   1st Qu.:  9448.5   1st Qu.:  9388.2   1st Qu.:  9757  
493 |     ##  Median : 16601.9   Median : 16172.3   Median : 16167.4   Median : 16584  
494 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789  
495 |     ##  3rd Qu.: 28089.9   3rd Qu.: 28022.1   3rd Qu.: 28075.4   3rd Qu.: 27455  
496 |     ##  Max.   :183073.4   Max.   :195408.4   Max.   :197373.3   Max.   :182228  
497 |     ##                                                                           
498 |     ##     BRRWT14            BRRWT15            BRRWT16            BRRWT17        
499 |     ##  Min.   :   544.5   Min.   :   671.4   Min.   :   603.4   Min.   :   563.3  
500 |     ##  1st Qu.:  9491.8   1st Qu.:  9341.8   1st Qu.:  9804.6   1st Qu.:  9593.2  
501 |     ##  Median : 17028.9   Median : 15996.8   Median : 16562.6   Median : 16750.8  
502 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
503 |     ##  3rd Qu.: 27975.3   3rd Qu.: 28117.5   3rd Qu.: 27322.1   3rd Qu.: 27458.0  
504 |     ##  Max.   :173341.2   Max.   :179152.7   Max.   :210507.2   Max.   :195346.9  
505 |     ##                                                                             
506 |     ##     BRRWT18            BRRWT19          BRRWT20            BRRWT21        
507 |     ##  Min.   :   517.2   Min.   :   657   Min.   :   682.2   Min.   :   689.4  
508 |     ##  1st Qu.:  9839.6   1st Qu.:  9776   1st Qu.:  9569.2   1st Qu.:  9663.9  
509 |     ##  Median : 16560.5   Median : 16779   Median : 16881.2   Median : 16503.8  
510 |     ##  Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3   Mean   : 20789.3  
511 |     ##  3rd Qu.: 27636.2   3rd Qu.: 27986   3rd Qu.: 27467.7   3rd Qu.: 27863.0  
512 |     ##  Max.   :158094.9   Max.   :197236   Max.   :146347.4   Max.   :181583.8  
513 |     ##                                                                           
514 |     ##     BRRWT22            BRRWT23            BRRWT24            BRRWT25        
515 |     ##  Min.   :   581.3   Min.   :   658.4   Min.   :   698.7   Min.   :   541.3  
516 |     ##  1st Qu.:  9805.3   1st Qu.:  9597.1   1st Qu.:  9387.9   1st Qu.:  9502.9  
517 |     ##  Median : 16711.4   Median : 16205.0   Median : 16398.2   Median : 17120.6  
518 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
519 |     ##  3rd Qu.: 27503.4   3rd Qu.: 27855.2   3rd Qu.: 27791.0   3rd Qu.: 28108.8  
520 |     ##  Max.   :173557.2   Max.   :182366.0   Max.   :170970.0   Max.   :128220.6  
521 |     ##                                                                             
522 |     ##     BRRWT26            BRRWT27          BRRWT28            BRRWT29      
523 |     ##  Min.   :   832.9   Min.   :  1372   Min.   :   764.7   Min.   :   854  
524 |     ##  1st Qu.:  9593.2   1st Qu.:  9333   1st Qu.:  9358.0   1st Qu.:  9596  
525 |     ##  Median : 16642.2   Median : 16671   Median : 16663.4   Median : 16336  
526 |     ##  Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3   Mean   : 20789  
527 |     ##  3rd Qu.: 28018.5   3rd Qu.: 27832   3rd Qu.: 28065.9   3rd Qu.: 27506  
528 |     ##  Max.   :176770.0   Max.   :176453   Max.   :210413.6   Max.   :194434  
529 |     ##                                                                         
530 |     ##     BRRWT30            BRRWT31            BRRWT32            BRRWT33        
531 |     ##  Min.   :   680.6   Min.   :   868.4   Min.   :   645.1   Min.   :   714.2  
532 |     ##  1st Qu.:  9689.3   1st Qu.:  9493.1   1st Qu.:  9370.6   1st Qu.:  9530.8  
533 |     ##  Median : 16683.8   Median : 16876.0   Median : 16594.5   Median : 16839.7  
534 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
535 |     ##  3rd Qu.: 27613.1   3rd Qu.: 27807.8   3rd Qu.: 28250.9   3rd Qu.: 27610.2  
536 |     ##  Max.   :118557.6   Max.   :197960.8   Max.   :182658.3   Max.   :183414.8  
537 |     ##                                                                             
538 |     ##     BRRWT34          BRRWT35            BRRWT36            BRRWT37        
539 |     ##  Min.   :  1880   Min.   :   629.3   Min.   :   980.2   Min.   :   634.6  
540 |     ##  1st Qu.:  9703   1st Qu.:  9842.0   1st Qu.:  9439.6   1st Qu.:  9276.7  
541 |     ##  Median : 16380   Median : 17204.4   Median : 16440.6   Median : 16620.9  
542 |     ##  Mean   : 20789   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
543 |     ##  3rd Qu.: 27846   3rd Qu.: 27533.4   3rd Qu.: 28354.2   3rd Qu.: 27754.3  
544 |     ##  Max.   :130246   Max.   :125674.9   Max.   :171375.9   Max.   :209103.9  
545 |     ##                                                                           
546 |     ##     BRRWT38            BRRWT39            BRRWT40          BRRWT41      
547 |     ##  Min.   :   738.1   Min.   :   684.5   Min.   :  1531   Min.   :  1406  
548 |     ##  1st Qu.:  9737.9   1st Qu.:  9389.5   1st Qu.:  9624   1st Qu.:  9776  
549 |     ##  Median : 16862.8   Median : 16797.7   Median : 16644   Median : 16910  
550 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789   Mean   : 20789  
551 |     ##  3rd Qu.: 27710.0   3rd Qu.: 27850.3   3rd Qu.: 27858   3rd Qu.: 27616  
552 |     ##  Max.   :187208.7   Max.   :136106.4   Max.   :165612   Max.   :145467  
553 |     ##                                                                         
554 |     ##     BRRWT42            BRRWT43            BRRWT44            BRRWT45      
555 |     ##  Min.   :   943.8   Min.   :   683.3   Min.   :   866.4   Min.   :  1105  
556 |     ##  1st Qu.:  9446.7   1st Qu.:  9563.6   1st Qu.:  9595.5   1st Qu.:  9563  
557 |     ##  Median : 16177.2   Median : 16999.1   Median : 17034.6   Median : 16629  
558 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789  
559 |     ##  3rd Qu.: 28089.3   3rd Qu.: 27724.1   3rd Qu.: 27593.8   3rd Qu.: 27773  
560 |     ##  Max.   :189726.6   Max.   :192302.9   Max.   :190671.5   Max.   :160108  
561 |     ##                                                                           
562 |     ##     BRRWT46            BRRWT47          BRRWT48            BRRWT49        
563 |     ##  Min.   :   750.7   Min.   :  1230   Min.   :   684.4   Min.   :   627.1  
564 |     ##  1st Qu.:  9616.2   1st Qu.:  9362   1st Qu.:  9383.9   1st Qu.:  9489.0  
565 |     ##  Median : 16821.6   Median : 16243   Median : 16720.3   Median : 17068.6  
566 |     ##  Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3   Mean   : 20789.3  
567 |     ##  3rd Qu.: 27563.3   3rd Qu.: 27547   3rd Qu.: 27965.8   3rd Qu.: 27829.1  
568 |     ##  Max.   :183963.8   Max.   :196001   Max.   :199079.7   Max.   :203407.7  
569 |     ##                                                                           
570 |     ##     BRRWT50          BRRWT51            BRRWT52            BRRWT53        
571 |     ##  Min.   :  1638   Min.   :   922.9   Min.   :   749.9   Min.   :   871.8  
572 |     ##  1st Qu.:  9601   1st Qu.:  9704.7   1st Qu.:  9496.9   1st Qu.:  9489.1  
573 |     ##  Median : 16788   Median : 16706.2   Median : 16442.9   Median : 16494.9  
574 |     ##  Mean   : 20789   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
575 |     ##  3rd Qu.: 27667   3rd Qu.: 27755.8   3rd Qu.: 27621.2   3rd Qu.: 28075.0  
576 |     ##  Max.   :223546   Max.   :161561.8   Max.   :146056.0   Max.   :143796.6  
577 |     ##                                                                           
578 |     ##     BRRWT54            BRRWT55          BRRWT56            BRRWT57        
579 |     ##  Min.   :   687.9   Min.   :  2056   Min.   :   623.7   Min.   :   713.4  
580 |     ##  1st Qu.:  9623.3   1st Qu.:  9595   1st Qu.:  9798.4   1st Qu.:  9393.8  
581 |     ##  Median : 16662.9   Median : 16589   Median : 16624.8   Median : 17198.4  
582 |     ##  Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3   Mean   : 20789.3  
583 |     ##  3rd Qu.: 27612.8   3rd Qu.: 27857   3rd Qu.: 27650.0   3rd Qu.: 27964.1  
584 |     ##  Max.   :174657.5   Max.   :206797   Max.   :226169.8   Max.   :162193.6  
585 |     ##                                                                           
586 |     ##     BRRWT58            BRRWT59            BRRWT60          BRRWT61        
587 |     ##  Min.   :   905.5   Min.   :   630.7   Min.   :  1275   Min.   :   546.4  
588 |     ##  1st Qu.:  9559.2   1st Qu.:  9623.7   1st Qu.:  9577   1st Qu.:  9387.4  
589 |     ##  Median : 16540.0   Median : 16656.6   Median : 16197   Median : 16376.3  
590 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3  
591 |     ##  3rd Qu.: 27780.9   3rd Qu.: 27577.8   3rd Qu.: 27781   3rd Qu.: 28016.5  
592 |     ##  Max.   :211170.6   Max.   :206702.7   Max.   :169387   Max.   :122260.9  
593 |     ##                                                                           
594 |     ##     BRRWT62            BRRWT63            BRRWT64            BRRWT65      
595 |     ##  Min.   :   739.7   Min.   :   671.5   Min.   :   926.4   Min.   :  1144  
596 |     ##  1st Qu.:  9643.5   1st Qu.:  9455.3   1st Qu.:  9400.5   1st Qu.:  9597  
597 |     ##  Median : 17067.2   Median : 16632.1   Median : 16508.1   Median : 16442  
598 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789  
599 |     ##  3rd Qu.: 27540.6   3rd Qu.: 28020.8   3rd Qu.: 27693.9   3rd Qu.: 27348  
600 |     ##  Max.   :158200.9   Max.   :196933.9   Max.   :217490.7   Max.   :239712  
601 |     ##                                                                           
602 |     ##     BRRWT66          BRRWT67            BRRWT68          BRRWT69      
603 |     ##  Min.   :  1264   Min.   :   684.8   Min.   :  1053   Min.   :  1676  
604 |     ##  1st Qu.:  9758   1st Qu.:  9588.0   1st Qu.:  9245   1st Qu.:  9371  
605 |     ##  Median : 16565   Median : 16560.8   Median : 16464   Median : 16682  
606 |     ##  Mean   : 20789   Mean   : 20789.3   Mean   : 20789   Mean   : 20789  
607 |     ##  3rd Qu.: 27884   3rd Qu.: 27838.7   3rd Qu.: 28108   3rd Qu.: 27957  
608 |     ##  Max.   :157193   Max.   :179204.9   Max.   :183266   Max.   :193274  
609 |     ##                                                                       
610 |     ##     BRRWT70            BRRWT71            BRRWT72            BRRWT73      
611 |     ##  Min.   :   758.4   Min.   :   892.2   Min.   :   695.5   Min.   :   875  
612 |     ##  1st Qu.:  9622.5   1st Qu.:  9451.9   1st Qu.:  9516.0   1st Qu.:  9734  
613 |     ##  Median : 16676.4   Median : 16482.8   Median : 16717.8   Median : 16930  
614 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789  
615 |     ##  3rd Qu.: 27897.7   3rd Qu.: 27882.7   3rd Qu.: 27611.7   3rd Qu.: 27756  
616 |     ##  Max.   :146583.8   Max.   :126528.3   Max.   :196704.6   Max.   :184412  
617 |     ##                                                                           
618 |     ##     BRRWT74            BRRWT75            BRRWT76          BRRWT77        
619 |     ##  Min.   :   541.6   Min.   :   669.7   Min.   :   617   Min.   :   560.5  
620 |     ##  1st Qu.:  9503.9   1st Qu.:  9835.9   1st Qu.:  9385   1st Qu.:  9673.8  
621 |     ##  Median : 16128.6   Median : 16921.5   Median : 17000   Median : 16713.6  
622 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789   Mean   : 20789.3  
623 |     ##  3rd Qu.: 27849.9   3rd Qu.: 27352.3   3rd Qu.: 27558   3rd Qu.: 27712.8  
624 |     ##  Max.   :125833.8   Max.   :194829.8   Max.   :212262   Max.   :234971.4  
625 |     ##                                                                           
626 |     ##     BRRWT78            BRRWT79            BRRWT80            BRRWT81        
627 |     ##  Min.   :   526.7   Min.   :   651.1   Min.   :   675.7   Min.   :   681.2  
628 |     ##  1st Qu.:  9744.1   1st Qu.:  9549.7   1st Qu.:  9554.4   1st Qu.:  9489.0  
629 |     ##  Median : 17098.9   Median : 16676.0   Median : 16707.8   Median : 16769.3  
630 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
631 |     ##  3rd Qu.: 27459.8   3rd Qu.: 27857.9   3rd Qu.: 27688.3   3rd Qu.: 27901.5  
632 |     ##  Max.   :152055.4   Max.   :180157.0   Max.   :165661.6   Max.   :191740.1  
633 |     ##                                                                             
634 |     ##     BRRWT82            BRRWT83            BRRWT84            BRRWT85        
635 |     ##  Min.   :   563.6   Min.   :   656.9   Min.   :   652.7   Min.   :   675.4  
636 |     ##  1st Qu.:  9216.4   1st Qu.:  9634.4   1st Qu.:  9432.5   1st Qu.:  9551.2  
637 |     ##  Median : 16121.6   Median : 16516.9   Median : 16454.8   Median : 16902.2  
638 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
639 |     ##  3rd Qu.: 28253.1   3rd Qu.: 27725.8   3rd Qu.: 28006.4   3rd Qu.: 27325.4  
640 |     ##  Max.   :171004.8   Max.   :184719.0   Max.   :191550.3   Max.   :198238.4  
641 |     ##                                                                             
642 |     ##     BRRWT86            BRRWT87            BRRWT88            BRRWT89        
643 |     ##  Min.   :   680.3   Min.   :   551.7   Min.   :   704.2   Min.   :   644.9  
644 |     ##  1st Qu.:  9619.8   1st Qu.:  9436.6   1st Qu.:  9393.1   1st Qu.:  9643.2  
645 |     ##  Median : 16772.0   Median : 16799.0   Median : 16778.6   Median : 16586.1  
646 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
647 |     ##  3rd Qu.: 27638.1   3rd Qu.: 28046.3   3rd Qu.: 27789.9   3rd Qu.: 28075.4  
648 |     ##  Max.   :232065.5   Max.   :179835.0   Max.   :166866.1   Max.   :144299.3  
649 |     ##                                                                             
650 |     ##     BRRWT90            BRRWT91            BRRWT92            BRRWT93        
651 |     ##  Min.   :   649.2   Min.   :   568.2   Min.   :   591.9   Min.   :   545.3  
652 |     ##  1st Qu.:  9467.7   1st Qu.:  9506.3   1st Qu.:  9610.6   1st Qu.:  9688.4  
653 |     ##  Median : 16212.0   Median : 16781.5   Median : 16524.1   Median : 16258.4  
654 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3  
655 |     ##  3rd Qu.: 28020.8   3rd Qu.: 27876.1   3rd Qu.: 27915.1   3rd Qu.: 27728.8  
656 |     ##  Max.   :175279.5   Max.   :205917.4   Max.   :225638.4   Max.   :117260.5  
657 |     ##                                                                             
658 |     ##     BRRWT94            BRRWT95            BRRWT96            CDD30YR    
659 |     ##  Min.   :   716.2   Min.   :   566.4   Min.   :   551.1   Min.   :   0  
660 |     ##  1st Qu.:  9561.6   1st Qu.:  9530.2   1st Qu.:  9533.2   1st Qu.: 712  
661 |     ##  Median : 17099.7   Median : 16577.2   Median : 16358.9   Median :1150  
662 |     ##  Mean   : 20789.3   Mean   : 20789.3   Mean   : 20789.3   Mean   :1451  
663 |     ##  3rd Qu.: 27853.9   3rd Qu.: 27441.4   3rd Qu.: 27823.1   3rd Qu.:1880  
664 |     ##  Max.   :207264.3   Max.   :205015.8   Max.   :171550.8   Max.   :5792  
665 |     ##                                                                         
666 |     ##      CDD65          CDD80                 ClimateRegion_BA ClimateRegion_IECC
667 |     ##  Min.   :   0   Min.   :   0.0   Hot-Dry/Mixed-Dry: 750    5A     :1240      
668 |     ##  1st Qu.: 793   1st Qu.:  10.0   Hot-Humid        :1036    4A     :1021      
669 |     ##  Median :1378   Median :  60.0   Mixed-Humid      :1468    1A-2A  : 846      
670 |     ##  Mean   :1719   Mean   : 174.7   Cold/Very Cold   :2008    3B-4B  : 644      
671 |     ##  3rd Qu.:2231   3rd Qu.: 208.0   Marine           : 424    3A     : 637      
672 |     ##  Max.   :6607   Max.   :2297.0                             6A-6B  : 376      
673 |     ##                                                            (Other): 922      
674 |     ##     HDD30YR          HDD65          HDD50         GNDHDD65    
675 |     ##  Min.   :    0   Min.   :   0   Min.   :   0   Min.   :    0  
676 |     ##  1st Qu.: 2102   1st Qu.:1881   1st Qu.: 260   1st Qu.: 1337  
677 |     ##  Median : 4353   Median :3878   Median :1260   Median : 3704  
678 |     ##  Mean   : 4087   Mean   :3708   Mean   :1486   Mean   : 3578  
679 |     ##  3rd Qu.: 5967   3rd Qu.:5467   3rd Qu.:2499   3rd Qu.: 5630  
680 |     ##  Max.   :12184   Max.   :9843   Max.   :4956   Max.   :11851  
681 |     ##                                                               
682 |     ##      BTUEL             DOLLAREL           BTUNG           DOLLARNG     
683 |     ##  Min.   :   201.6   Min.   :  18.72   Min.   :     0   Min.   :   0.0  
684 |     ##  1st Qu.: 20221.3   1st Qu.: 815.12   1st Qu.:     0   1st Qu.:   0.0  
685 |     ##  Median : 32582.4   Median :1253.02   Median : 17961   Median : 231.8  
686 |     ##  Mean   : 37630.7   Mean   :1403.78   Mean   : 33331   Mean   : 346.8  
687 |     ##  3rd Qu.: 49670.6   3rd Qu.:1830.83   3rd Qu.: 57126   3rd Qu.: 605.1  
688 |     ##  Max.   :215695.7   Max.   :8121.56   Max.   :306594   Max.   :2789.8  
689 |     ##                                                                        
690 |     ##      BTULP           DOLLARLP           BTUFO           DOLLARFO      
691 |     ##  Min.   :     0   Min.   :   0.00   Min.   :     0   Min.   :   0.00  
692 |     ##  1st Qu.:     0   1st Qu.:   0.00   1st Qu.:     0   1st Qu.:   0.00  
693 |     ##  Median :     0   Median :   0.00   Median :     0   Median :   0.00  
694 |     ##  Mean   :  3192   Mean   :  67.72   Mean   :  3569   Mean   :  64.08  
695 |     ##  3rd Qu.:     0   3rd Qu.:   0.00   3rd Qu.:     0   3rd Qu.:   0.00  
696 |     ##  Max.   :220435   Max.   :5121.27   Max.   :273608   Max.   :4700.03  
697 |     ##                                                                       
698 |     ##     TOTALBTU           TOTALDOL           BTUWOOD         BTUPELLET       
699 |     ##  Min.   :   201.6   Min.   :   60.46   Min.   :     0   Min.   :     0.0  
700 |     ##  1st Qu.: 42655.8   1st Qu.: 1175.49   1st Qu.:     0   1st Qu.:     0.0  
701 |     ##  Median : 68663.3   Median : 1724.60   Median :     0   Median :     0.0  
702 |     ##  Mean   : 77722.9   Mean   : 1882.34   Mean   :  4140   Mean   :   197.4  
703 |     ##  3rd Qu.:103832.9   3rd Qu.: 2385.84   3rd Qu.:     0   3rd Qu.:     0.0  
704 |     ##  Max.   :490187.4   Max.   :10135.99   Max.   :295476   Max.   :115500.0  
705 |     ## 
706 | 
707 | ``` r
708 | write_rds(recs_out, here("Data", "recs.rds"), compress="gz")
709 | ```
710 | 


--------------------------------------------------------------------------------
/Exercises/CategorialExercises.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Categorical Data Analysis Exercises"
 3 | #' output:
 4 | #'   html_document:
 5 | #'     df_print: paged
 6 | #' ---
 7 | #' 
 8 | #' # Set-up
 9 | ## -------------------------------------------------------------------
10 | library(tidyverse) # for tidyverse
11 | library(here) # for file paths
12 | library(survey) # for survey analysis
13 | library(srvyr) # for tidy survey analysis
14 | 
15 | anes <- read_rds(here("Data", "anes.rds")) %>%
16 |    mutate(Weight=Weight/sum(Weight)*224059005) 
17 | # adjust weight to sum to citizen pop, 18+ in Nov 2016 per ANES methodology documentation
18 | 
19 | anes_des <- anes %>%
20 |    as_survey_design(weights = Weight,
21 |                     strata = Stratum,
22 |                     ids = VarUnit,
23 |                     nest = TRUE)
24 | 
25 | #' 
26 | #' # Part 1
27 | #' 
28 | #' 1. How many females have a graduate degree?
29 | #' 
30 | ## -------------------------------------------------------------------
31 | 
32 | 
33 | 
34 | #' 
35 | #' 2. What percentage of people identify as "Strong democrat"?
36 | #' 
37 | ## -------------------------------------------------------------------
38 | 
39 | 
40 | #' 
41 | #' 3. What percentage of people who voted in the 2016 election identify as "Strong republican"?
42 | #' 
43 | ## -------------------------------------------------------------------
44 | 
45 | 
46 | #' 
47 | #' 4. What percentage of people voted in both the 2012 election and in the 2016 election?  Include the confidence interval.
48 | #' 
49 | ## -------------------------------------------------------------------
50 | 
51 | 
52 | #' 
53 | #' 5. What is the design effect for the proportion of people who voted early?
54 | #' 
55 | ## -------------------------------------------------------------------
56 | 
57 | 
58 | #' 
59 | #' # Part 2
60 | #' 
61 | #' 1. Is there a relationship between PartyID and when people voted in the 2016 election (on election day or early voting)?
62 | #' 
63 | ## -------------------------------------------------------------------
64 | 
65 | 
66 | #' 
67 | #' 2. Is there a relationship between PartyID and trust in the government?
68 | #' 
69 | ## -------------------------------------------------------------------
70 | 
71 | 
72 | #' 
73 | #' 
74 | #' # Bonus
75 | #' 
76 | #' 1. What percentage of people lean republican?
77 | #' 
78 | ## -------------------------------------------------------------------
79 | 
80 | 
81 | #' 
82 | #' 2. Were people who lean democrat more likely to vote early in the 2016 election?
83 | #' 
84 | ## -------------------------------------------------------------------
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/Exercises/CategorialExercises.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Categorical Data Analysis Exercises"
 3 | output:
 4 |   html_document:
 5 |     df_print: paged
 6 | ---
 7 | 
 8 | # Set-up
 9 | ```{r}
10 | library(tidyverse) # for tidyverse
11 | library(here) # for file paths
12 | library(survey) # for survey analysis
13 | library(srvyr) # for tidy survey analysis
14 | 
15 | anes <- read_rds(here("Data", "anes.rds")) %>%
16 |    mutate(Weight=Weight/sum(Weight)*224059005) 
17 | # adjust weight to sum to citizen pop, 18+ in Nov 2016 per ANES methodology documentation
18 | 
19 | anes_des <- anes %>%
20 |    as_survey_design(weights = Weight,
21 |                     strata = Stratum,
22 |                     ids = VarUnit,
23 |                     nest = TRUE)
24 | ```
25 | 
26 | # Part 1
27 | 
28 | 1. How many females have a graduate degree?
29 | 
30 | ```{r}
31 | 
32 | 
33 | ```
34 | 
35 | 2. What percentage of people identify as "Strong democrat"?
36 | 
37 | ```{r}
38 | 
39 | ```
40 | 
41 | 3. What percentage of people who voted in the 2016 election identify as "Strong republican"?
42 | 
43 | ```{r}
44 | 
45 | ```
46 | 
47 | 4. What percentage of people voted in both the 2012 election and in the 2016 election?  Include the confidence interval.
48 | 
49 | ```{r}
50 | 
51 | ```
52 | 
53 | 5. What is the design effect for the proportion of people who voted early?
54 | 
55 | ```{r}
56 | 
57 | ```
58 | 
59 | # Part 2
60 | 
61 | 1. Is there a relationship between PartyID and when people voted in the 2016 election (on election day or early voting)?
62 | 
63 | ```{r}
64 | 
65 | ```
66 | 
67 | 2. Is there a relationship between PartyID and trust in the government?
68 | 
69 | ```{r}
70 | 
71 | ```
72 | 
73 | 
74 | # Bonus
75 | 
76 | 1. What percentage of people lean republican?
77 | 
78 | ```{r}
79 | 
80 | ```
81 | 
82 | 2. Were people who lean democrat more likely to vote early in the 2016 election?
83 | 
84 | ```{r}
85 | 
86 | ```


--------------------------------------------------------------------------------
/Exercises/CategorialExercises_solutions.R:
--------------------------------------------------------------------------------
  1 | #' ---
  2 | #' title: "Categorical Data Analysis Exercise Solutions"
  3 | #' output:
  4 | #'   html_document:
  5 | #'     df_print: paged
  6 | #' ---
  7 | #' 
  8 | #' # Set-up
  9 | ## -------------------------------------------------------------------
 10 | library(tidyverse) # for tidyverse
 11 | library(here) # for file paths
 12 | library(survey) # for survey analysis
 13 | library(srvyr) # for tidy survey analysis
 14 | 
 15 | anes <- read_rds(here("Data", "anes.rds")) %>%
 16 |    mutate(Weight=Weight/sum(Weight)*224059005) 
 17 | # adjust weight to sum to citizen pop, 18+ in Nov 2016 per ANES methodology documentation
 18 | 
 19 | anes_des <- anes %>%
 20 |    as_survey_design(weights = Weight,
 21 |                     strata = Stratum,
 22 |                     ids = VarUnit,
 23 |                     nest = TRUE)
 24 | 
 25 | #' 
 26 | #' # Part 1
 27 | #' 
 28 | #' 1. How many females have a graduate degree?
 29 | #' 
 30 | ## -------------------------------------------------------------------
 31 | #Option 1:
 32 | anes_des %>%
 33 |   filter(Gender=="Female", Education=="Graduate") %>%
 34 |   survey_count(name="n")
 35 | #Option 2:
 36 | anes_des %>%
 37 |   filter(Gender=="Female", Education=="Graduate") %>%
 38 |   summarize(
 39 |     N=survey_total(),  .groups="drop"
 40 |   )
 41 | 
 42 | 
 43 | #' 
 44 | #' 2. What percentage of people identify as "Strong democrat"?
 45 | #' 
 46 | ## -------------------------------------------------------------------
 47 | anes_des %>%
 48 |   group_by(PartyID) %>% 
 49 |   summarize(
 50 |     p=survey_mean()
 51 |   ) %>%
 52 |   filter(PartyID=="Strong democrat")
 53 | 
 54 | #' 
 55 | #' 3. What percentage of people who voted in the 2016 election identify as "Strong republican"?
 56 | #' 
 57 | ## -------------------------------------------------------------------
 58 | anes_des %>%
 59 |   filter(VotedPres2016=="Yes") %>% 
 60 |   group_by(PartyID) %>% 
 61 |   summarize(
 62 |     p=survey_mean()
 63 |   )
 64 | 
 65 | #' 
 66 | #' 4. What percentage of people voted in both the 2012 election and in the 2016 election?  Include the confidence interval.
 67 | #' 
 68 | ## -------------------------------------------------------------------
 69 | anes_des %>%
 70 |   group_by(groups=interaction(VotedPres2012, VotedPres2016)) %>%
 71 |   filter(!is.na(groups)) %>% # drop respondents missing either vote indicator
 72 |   summarize(
 73 |     VotedPres2012=VotedPres2012[1], # carry the original labels alongside the combined group
 74 |     VotedPres2016=VotedPres2016[1],
 75 |     p=survey_mean(vartype="ci") # spell out vartype; var= only worked through partial matching
 76 |   )
 77 | 
 78 | #' 
 79 | #' 5. What is the design effect for the proportion of people who voted early?
 80 | #' 
 81 | ## -------------------------------------------------------------------
 82 | anes_des %>%
 83 |    filter(!is.na(EarlyVote2016)) %>%
 84 |    group_by(EarlyVote2016) %>%
 85 |    summarize(
 86 |       p=survey_mean(deff=TRUE),
 87 |       N=survey_total()
 88 |    )
 89 | 
 90 | #' 
 91 | #' # Part 2
 92 | #' 
 93 | #' 1. Is there a relationship between PartyID and when people voted in the 2016 election (on election day or early voting)?
 94 | #' 
 95 | ## -------------------------------------------------------------------
 96 | anes_des %>%
 97 |    svychisq(design=.,
 98 |             formula=~PartyID +EarlyVote2016)
 99 | 
100 | #' 
101 | #' 2. Is there a relationship between PartyID and trust in the government?
102 | #' 
103 | ## -------------------------------------------------------------------
104 | anes_des %>%
105 |    svychisq(design=.,
106 |             formula=~PartyID+TrustGovernment,
107 |             statistic="Wald")
108 | 
109 | #' 
110 | #' 
111 | #' # Bonus
112 | #' 
113 | #' 1. What percentage of people lean republican?
114 | #' 
115 | ## -------------------------------------------------------------------
116 | 
117 | #Solution 1: Using forcats package
118 | anes_des %>%
119 |    mutate(PartyID3=fct_collapse(PartyID,
120 |                                 LeanDem=c("Strong democrat",
121 |                                           "Not very strong democrat",
122 |                                           "Independent-democrat"),
123 |                                 LeanRep=c("Strong republican",
124 |                                           "Not very strong republican",
125 |                                           "Independent-republican"),
126 |                                 other_level="Other")) %>% 
127 |    group_by(PartyID3) %>% 
128 |    summarize(p=survey_mean())
129 | 
130 | #Solution 2: Using case_when
131 | anes_des %>%
132 |    mutate(PartyID3=case_when(PartyID %in% c("Strong democrat",
133 |                                             "Not very strong democrat",
134 |                                             "Independent-democrat")~"LeanDem",
135 |                              PartyID %in% c("Strong republican",
136 |                                             "Not very strong republican",
137 |                                             "Independent-republican")~"LeanRep",
138 |                              is.na(PartyID)~NA_character_,
139 |                              TRUE~"Other")) %>% 
140 |    group_by(PartyID3) %>% 
141 |    summarize(p=survey_mean())
142 | 
143 | 
144 | #' 
145 | #' 2. Were people who lean democrat more likely to vote early in the 2016 election?
146 | #' 
147 | ## -------------------------------------------------------------------
148 | 
149 | earlyv_glm<-anes_des %>%
150 |    mutate(PartyID3=fct_collapse(PartyID,
151 |                                 LeanDem=c("Strong democrat",
152 |                                           "Not very strong democrat",
153 |                                           "Independent-democrat"),
154 |                                 LeanRep=c("Strong republican",
155 |                                           "Not very strong republican",
156 |                                           "Independent-republican"),
157 |                                 other_level="Other")) %>% 
158 |    svyglm(design=.,
159 |           formula=(EarlyVote2016=="Yes")~PartyID3,
160 |           family=quasibinomial(),
161 |           na.action=na.omit)
162 | 
163 | summary(earlyv_glm)
164 | 
165 | 


--------------------------------------------------------------------------------
/Exercises/CategorialExercises_solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Categorical Data Analysis Exercise Solutions"
  3 | output:
  4 |   html_document:
  5 |     df_print: paged
  6 | ---
  7 | 
  8 | # Set-up
  9 | ```{r}
 10 | library(tidyverse) # for tidyverse
 11 | library(here) # for file paths
 12 | library(survey) # for survey analysis
 13 | library(srvyr) # for tidy survey analysis
 14 | 
 15 | anes <- read_rds(here("Data", "anes.rds")) %>%
 16 |    mutate(Weight=Weight/sum(Weight)*224059005) 
 17 | # adjust weight to sum to citizen pop, 18+ in Nov 2016 per ANES methodology documentation
 18 | 
 19 | anes_des <- anes %>%
 20 |    as_survey_design(weights = Weight,
 21 |                     strata = Stratum,
 22 |                     ids = VarUnit,
 23 |                     nest = TRUE)
 24 | ```
 25 | 
 26 | # Part 1
 27 | 
 28 | 1. How many females have a graduate degree?
 29 | 
 30 | ```{r}
 31 | #Option 1:
 32 | anes_des %>%
 33 |   filter(Gender=="Female", Education=="Graduate") %>%
 34 |   survey_count(name="n")
 35 | #Option 2:
 36 | anes_des %>%
 37 |   filter(Gender=="Female", Education=="Graduate") %>%
 38 |   summarize(
 39 |     N=survey_total(),  .groups="drop"
 40 |   )
 41 | 
 42 | ```
 43 | 
 44 | 2. What percentage of people identify as "Strong democrat"?
 45 | 
 46 | ```{r}
 47 | anes_des %>%
 48 |   group_by(PartyID) %>% 
 49 |   summarize(
 50 |     p=survey_mean()
 51 |   ) %>%
 52 |   filter(PartyID=="Strong democrat")
 53 | ```
 54 | 
 55 | 3. What percentage of people who voted in the 2016 election identify as "Strong republican"?
 56 | 
 57 | ```{r}
 58 | anes_des %>%
 59 |   filter(VotedPres2016=="Yes") %>% 
 60 |   group_by(PartyID) %>% 
 61 |   summarize(
 62 |     p=survey_mean()
 63 |   )
 64 | ```
 65 | 
 66 | 4. What percentage of people voted in both the 2012 election and in the 2016 election?  Include the confidence interval.
 67 | 
 68 | ```{r}
 69 | anes_des %>%
 70 |   group_by(groups=interaction(VotedPres2012, VotedPres2016)) %>%
 71 |   filter(!is.na(groups)) %>% # drop respondents missing either vote indicator
 72 |   summarize(
 73 |     VotedPres2012=VotedPres2012[1], # carry the original labels alongside the combined group
 74 |     VotedPres2016=VotedPres2016[1],
 75 |     p=survey_mean(vartype="ci") # spell out vartype; var= only worked through partial matching
 76 |   )
 77 | ```
 78 | 
 79 | 5. What is the design effect for the proportion of people who voted early?
 80 | 
 81 | ```{r}
 82 | anes_des %>%
 83 |    filter(!is.na(EarlyVote2016)) %>%
 84 |    group_by(EarlyVote2016) %>%
 85 |    summarize(
 86 |       p=survey_mean(deff=TRUE),
 87 |       N=survey_total()
 88 |    )
 89 | ```
 90 | 
 91 | # Part 2
 92 | 
 93 | 1. Is there a relationship between PartyID and when people voted in the 2016 election (on election day or early voting)?
 94 | 
 95 | ```{r}
 96 | anes_des %>%
 97 |    svychisq(design=.,
 98 |             formula=~PartyID +EarlyVote2016)
 99 | ```
100 | 
101 | 2. Is there a relationship between PartyID and trust in the government?
102 | 
103 | ```{r}
104 | anes_des %>%
105 |    svychisq(design=.,
106 |             formula=~PartyID+TrustGovernment,
107 |             statistic="Wald")
108 | ```
109 | 
110 | 
111 | # Bonus
112 | 
113 | 1. What percentage of people lean republican?
114 | 
115 | ```{r}
116 | 
117 | #Solution 1: Using forcats package
118 | anes_des %>%
119 |    mutate(PartyID3=fct_collapse(PartyID,
120 |                                 LeanDem=c("Strong democrat",
121 |                                           "Not very strong democrat",
122 |                                           "Independent-democrat"),
123 |                                 LeanRep=c("Strong republican",
124 |                                           "Not very strong republican",
125 |                                           "Independent-republican"),
126 |                                 other_level="Other")) %>% 
127 |    group_by(PartyID3) %>% 
128 |    summarize(p=survey_mean())
129 | 
130 | #Solution 2: Using case_when
131 | anes_des %>%
132 |    mutate(PartyID3=case_when(PartyID %in% c("Strong democrat",
133 |                                             "Not very strong democrat",
134 |                                             "Independent-democrat")~"LeanDem",
135 |                              PartyID %in% c("Strong republican",
136 |                                             "Not very strong republican",
137 |                                             "Independent-republican")~"LeanRep",
138 |                              is.na(PartyID)~NA_character_,
139 |                              TRUE~"Other")) %>% 
140 |    group_by(PartyID3) %>% 
141 |    summarize(p=survey_mean())
142 | 
143 | ```
144 | 
145 | 2. Were people who lean democrat more likely to vote early in the 2016 election?
146 | 
147 | ```{r}
148 | 
149 | earlyv_glm<-anes_des %>%
150 |    mutate(PartyID3=fct_collapse(PartyID,
151 |                                 LeanDem=c("Strong democrat",
152 |                                           "Not very strong democrat",
153 |                                           "Independent-democrat"),
154 |                                 LeanRep=c("Strong republican",
155 |                                           "Not very strong republican",
156 |                                           "Independent-republican"),
157 |                                 other_level="Other")) %>% 
158 |    svyglm(design=.,
159 |           formula=(EarlyVote2016=="Yes")~PartyID3,
160 |           family=quasibinomial(),
161 |           na.action=na.omit)
162 | 
163 | summary(earlyv_glm)
164 | ```


--------------------------------------------------------------------------------
/Exercises/ContinuousExercises.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Continuous Data Analysis Exercises"
 3 | #' output:
 4 | #'   html_document:
 5 | #'     df_print: paged
 6 | #' ---
 7 | #' 
 8 | #' # Set-up
 9 | ## -------------------------------------------------------------------
10 | library(tidyverse) # for tidyverse
11 | library(here) # for file paths
12 | library(survey) # for survey analysis
13 | library(srvyr) # for tidy survey analysis
14 | 
15 | recs <- read_rds(here("Data", "recs.rds"))
16 | 
17 | recs_des <- recs %>%
18 |    as_survey_rep(weights=NWEIGHT,
19 |                  repweights=starts_with("BRRWT"),
20 |                  type="Fay",
21 |                  rho=0.5,
22 |                  mse=TRUE)
23 | 
24 | #' 
25 | #' # Part 1
26 | #' 
27 | #' 1. Find the average square footage of housing units (TOTSQFT_EN) with a 90% confidence interval.
28 | #' 
29 | ## -------------------------------------------------------------------
30 | 
31 | 
32 | #' 
33 | #' 2. Estimate the ratio of cooled square footage (TOTCSQFT) to the total square footage of housing units (TOTSQFT_EN) with its standard error.
34 | #' 
35 | ## -------------------------------------------------------------------
36 | 
37 | 
38 | #' 
39 | #' 3. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_median` function.
40 | #' 
41 | ## -------------------------------------------------------------------
42 | 
43 | 
44 | #' 
45 | #' 4. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_quantile` function.
46 | #' 
47 | ## -------------------------------------------------------------------
48 | 
49 | 
50 | #' 
51 | #' # Part 2
52 | #' 
53 | #' 1. Estimate the total average energy cost (TOTALDOL) by region, division, and urbanicity.
54 | #' 
55 | ## -------------------------------------------------------------------
56 | 
57 | 
58 | #' 
59 | #' 2. What is the median electric cost (DOLLAREL) for housing units in the South Region? What is the 95% confidence interval?
60 | #' 
61 | ## -------------------------------------------------------------------
62 | 
63 | 
64 | #' 
65 | #' 3. Test whether daytime winter and daytime summer temperatures of homes are set the same.
66 | #' 
67 | ## -------------------------------------------------------------------
68 | 
69 | 
70 | #' 
71 | #' 4. Test whether average electric bill (DOLLAREL) varies by region (Region).
72 | #' 
73 | ## -------------------------------------------------------------------
74 | 
75 | 
76 | #' 
77 | #' 5. Fit a regression between the cooled square footage of a housing unit (TOTCSQFT) and the total amount spent on energy (TOTALDOL).
78 | #' 
79 | ## -------------------------------------------------------------------
80 | 
81 | 
82 | #' 
83 | 


--------------------------------------------------------------------------------
/Exercises/ContinuousExercises.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Continuous Data Analysis Exercises"
 3 | output:
 4 |   html_document:
 5 |     df_print: paged
 6 | ---
 7 | 
 8 | # Set-up
 9 | ```{r}
10 | library(tidyverse) # for tidyverse
11 | library(here) # for file paths
12 | library(survey) # for survey analysis
13 | library(srvyr) # for tidy survey analysis
14 | 
15 | recs <- read_rds(here("Data", "recs.rds"))
16 | 
17 | recs_des <- recs %>%
18 |    as_survey_rep(weights=NWEIGHT,
19 |                  repweights=starts_with("BRRWT"),
20 |                  type="Fay",
21 |                  rho=0.5,
22 |                  mse=TRUE)
23 | ```
24 | 
25 | # Part 1
26 | 
27 | 1. Find the average square footage of housing units (TOTSQFT_EN) with a 90% confidence interval.
28 | 
29 | ```{r}
30 | 
31 | ```
32 | 
33 | 2. Estimate the ratio of cooled square footage (TOTCSQFT) to the total square footage of housing units (TOTSQFT_EN) with its standard error.
34 | 
35 | ```{r}
36 | 
37 | ```
38 | 
39 | 3. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_median` function.
40 | 
41 | ```{r}
42 | 
43 | ```
44 | 
45 | 4. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_quantile` function.
46 | 
47 | ```{r}
48 | 
49 | ```
50 | 
51 | # Part 2
52 | 
53 | 1. Estimate the total average energy cost (TOTALDOL) by region, division, and urbanicity.
54 | 
55 | ```{r}
56 | 
57 | ```
58 | 
59 | 2. What is the median electric cost (DOLLAREL) for housing units in the South Region? What is the 95% confidence interval?
60 | 
61 | ```{r}
62 | 
63 | ```
64 | 
65 | 3. Test whether daytime winter and daytime summer temperatures of homes are set the same.
66 | 
67 | ```{r}
68 | 
69 | ```
70 | 
71 | 4. Test whether average electric bill (DOLLAREL) varies by region (Region).
72 | 
73 | ```{r}
74 | 
75 | ```
76 | 
77 | 5. Fit a regression between the cooled square footage of a housing unit (TOTCSQFT) and the total amount spent on energy (TOTALDOL).
78 | 
79 | ```{r}
80 | 
81 | ```
82 | 
83 | 


--------------------------------------------------------------------------------
/Exercises/ContinuousExercises_solutions.R:
--------------------------------------------------------------------------------
  1 | #' ---
  2 | #' title: "Continuous Data Analysis Exercise Solutions"
  3 | #' output:
  4 | #'   html_document:
  5 | #'     df_print: paged
  6 | #' ---
  7 | #' 
  8 | #' # Set-up
  9 | ## -------------------------------------------------------------------
 10 | library(tidyverse) # for tidyverse
 11 | library(here) # for file paths
 12 | library(survey) # for survey analysis
 13 | library(srvyr) # for tidy survey analysis
 14 | 
 15 | recs <- read_rds(here("Data", "recs.rds"))
 16 | 
 17 | recs_des <- recs %>%
 18 |    as_survey_rep(weights=NWEIGHT,
 19 |                  repweights=starts_with("BRRWT"),
 20 |                  type="Fay",
 21 |                  rho=0.5,
 22 |                  mse=TRUE)
 23 | 
 24 | #' 
 25 | #' # Part 1
 26 | #' 
 27 | #' 1. Find the average square footage of housing units (TOTSQFT_EN) with a 90% confidence interval.
 28 | #' 
 29 | ## -------------------------------------------------------------------
 30 | recs_des %>%
 31 |    summarize(
 32 |       SF_HU=survey_mean(TOTSQFT_EN,
 33 |                           vartype = "ci",
 34 |                           level = 0.9)
 35 |    )
 36 | 
 37 | #' 
 38 | #' 2. Estimate the ratio of cooled square footage (TOTCSQFT) to the total square footage of housing units (TOTSQFT_EN) with its standard error.
 39 | #' 
 40 | ## -------------------------------------------------------------------
 41 | recs_des %>%
 42 |    summarize(
 43 |       PropCooled=survey_ratio(
 44 |          numerator = TOTCSQFT,
 45 |          denominator = TOTSQFT_EN,
 46 |          vartype = "se")
 47 |    )
 48 | 
 49 | #' 
 50 | #' 3. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_median` function.
 51 | #' 
 52 | ## -------------------------------------------------------------------
 53 | recs_des %>%
 54 |    summarize(
 55 |       WinterNightTemp=survey_median(WinterTempNight,
 56 |                      vartype = "se",
 57 |                      na.rm = TRUE)
 58 |    )
 59 | 
 60 | #' 
 61 | #' 4. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_quantile` function.
 62 | #' 
 63 | ## -------------------------------------------------------------------
 64 | recs_des %>%
 65 |    summarize(
 66 |       WinterNightTemp=survey_quantile(WinterTempNight, # the exercise asks for survey_quantile
 67 |                             quantiles = 0.5, # numeric probability; 0.5 is the median
 68 |                             vartype = "se",
 69 |                             na.rm = TRUE)
 70 |    )
 71 | 
 72 | #' 
 73 | #' # Part 2
 74 | #' 
 75 | #' 1. Estimate the total average energy cost (TOTALDOL) by region, division, and urbanicity.
 76 | #' 
 77 | ## -------------------------------------------------------------------
 78 | # option 1
 79 | recs_des %>%
 80 |    group_by(Region, Division, Urbanicity) %>%
 81 |    cascade(
 82 |       EnergyCost=survey_mean(TOTALDOL)
 83 |    )
 84 | # option 2
 85 | # one way
 86 | recs_des %>%
 87 |    group_by(Region, Division, Urbanicity) %>%
 88 |    summarize(
 89 |       EnergyCost=survey_mean(TOTALDOL)
 90 |    )
 91 | 
 92 | #' 
 93 | #' 2. What is the median electric cost (DOLLAREL) for housing units in the South Region? What is the 95% confidence interval?
 94 | #' 
 95 | ## -------------------------------------------------------------------
 96 | recs_des %>%
 97 |    filter(Region=="South") %>%
 98 |    summarize(
 99 |       MedElBill=survey_median(DOLLAREL,
100 |                               vartype="ci")
101 |    )
102 | 
103 | #' 
104 | #' 3. Test whether daytime winter and daytime summer temperatures of homes are set the same.
105 | #' 
106 | ## -------------------------------------------------------------------
107 | recs_des %>%
108 |    svyttest(design=.,
109 |             formula = I(WinterTempDay-SummerTempDay)~0,
110 |             na.rm = TRUE)
111 | 
112 | #' 
113 | #' 4. Test whether average electric bill (DOLLAREL) varies by region (Region).
114 | #' 
115 | ## -------------------------------------------------------------------
116 | m1 <- recs_des %>%
117 |    svyglm(design=.,
118 |           formula=DOLLAREL~Region,
119 |           na.action=na.omit)
120 | summary(m1)
121 | 
122 | #' 
123 | #' 5. Fit a regression between the cooled square footage of a housing unit (TOTCSQFT) and the total amount spent on energy (TOTALDOL).
124 | #' 
125 | ## -------------------------------------------------------------------
126 | m2 <- recs_des %>%
127 |    svyglm(design=.,
128 |           formula=TOTALDOL~TOTCSQFT,
129 |           na.action=na.omit)
130 | summary(m2)
131 | 
132 | #' 
133 | 


--------------------------------------------------------------------------------
/Exercises/ContinuousExercises_solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Continuous Data Analysis Exercise Solutions"
  3 | output:
  4 |   html_document:
  5 |     df_print: paged
  6 | ---
  7 | 
  8 | # Set-up
  9 | ```{r}
 10 | library(tidyverse) # for tidyverse
 11 | library(here) # for file paths
 12 | library(survey) # for survey analysis
 13 | library(srvyr) # for tidy survey analysis
 14 | 
 15 | recs <- read_rds(here("Data", "recs.rds"))
 16 | 
 17 | recs_des <- recs %>%
 18 |    as_survey_rep(weights=NWEIGHT,
 19 |                  repweights=starts_with("BRRWT"),
 20 |                  type="Fay",
 21 |                  rho=0.5,
 22 |                  mse=TRUE)
 23 | ```
 24 | 
 25 | # Part 1
 26 | 
 27 | 1. Find the average square footage of housing units (TOTSQFT_EN) with a 90% confidence interval.
 28 | 
 29 | ```{r}
 30 | recs_des %>%
 31 |    summarize(
 32 |       SF_HU=survey_mean(TOTSQFT_EN,
 33 |                           vartype = "ci",
 34 |                           level = 0.9)
 35 |    )
 36 | ```
 37 | 
 38 | 2. Estimate the ratio of cooled square footage (TOTCSQFT) to the total square footage of housing units (TOTSQFT_EN) with its standard error.
 39 | 
 40 | ```{r}
 41 | recs_des %>%
 42 |    summarize(
 43 |       PropCooled=survey_ratio(
 44 |          numerator = TOTCSQFT,
 45 |          denominator = TOTSQFT_EN,
 46 |          vartype = "se")
 47 |    )
 48 | ```
 49 | 
 50 | 3. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_median` function.
 51 | 
 52 | ```{r}
 53 | recs_des %>%
 54 |    summarize(
 55 |       WinterNightTemp=survey_median(WinterTempNight,
 56 |                      vartype = "se",
 57 |                      na.rm = TRUE)
 58 |    )
 59 | ```
 60 | 
 61 | 4. Estimate the median temperature housing units are set to during the night in the winter (WinterTempNight) using the `survey_quantile` function.
 62 | 
 63 | ```{r}
 64 | recs_des %>%
 65 |    summarize(
 66 |       WinterNightTemp=survey_quantile(WinterTempNight, # the exercise asks for survey_quantile
 67 |                             quantiles = 0.5, # numeric probability; 0.5 is the median
 68 |                             vartype = "se",
 69 |                             na.rm = TRUE)
 70 |    )
 71 | ```
 72 | 
 73 | # Part 2
 74 | 
 75 | 1. Estimate the total average energy cost (TOTALDOL) by region, division, and urbanicity.
 76 | 
 77 | ```{r}
 78 | # option 1
 79 | recs_des %>%
 80 |    group_by(Region, Division, Urbanicity) %>%
 81 |    cascade(
 82 |       EnergyCost=survey_mean(TOTALDOL)
 83 |    )
 84 | # option 2
 85 | # one way
 86 | recs_des %>%
 87 |    group_by(Region, Division, Urbanicity) %>%
 88 |    summarize(
 89 |       EnergyCost=survey_mean(TOTALDOL)
 90 |    )
 91 | ```
 92 | 
 93 | 2. What is the median electric cost (DOLLAREL) for housing units in the South Region? What is the 95% confidence interval?
 94 | 
 95 | ```{r}
 96 | recs_des %>%
 97 |    filter(Region=="South") %>%
 98 |    summarize(
 99 |       MedElBill=survey_median(DOLLAREL,
100 |                               vartype="ci")
101 |    )
102 | ```
103 | 
104 | 3. Test whether daytime winter and daytime summer temperatures of homes are set the same.
105 | 
106 | ```{r}
107 | recs_des %>%
108 |    svyttest(design=.,
109 |             formula = I(WinterTempDay-SummerTempDay)~0,
110 |             na.rm = TRUE)
111 | ```
112 | 
113 | 4. Test whether average electric bill (DOLLAREL) varies by region (Region).
114 | 
115 | ```{r}
116 | m1 <- recs_des %>%
117 |    svyglm(design=.,
118 |           formula=DOLLAREL~Region,
119 |           na.action=na.omit)
120 | summary(m1)
121 | ```
122 | 
123 | 5. Fit a regression between the cooled square footage of a housing unit (TOTCSQFT) and the total amount spent on energy (TOTALDOL).
124 | 
125 | ```{r}
126 | m2 <- recs_des %>%
127 |    svyglm(design=.,
128 |           formula=TOTALDOL~TOTCSQFT,
129 |           na.action=na.omit)
130 | summary(m2)
131 | ```
132 | 
133 | 


--------------------------------------------------------------------------------
/Exercises/WarmUpExercises.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Warm-up Exercises"
 3 | #' output:
 4 | #'   html_document:
 5 | #'     df_print: paged
 6 | #' ---
 7 | #' 
 8 | #' # Course set-up
 9 | #' First, let's make sure you have everything you need for the course. Run the following library statements. If something is not installed, install it.
10 | #' 
11 | ## -------------------------------------------------------------------
12 | # install.packages("tidyverse")
13 | # install.packages("srvyr")
14 | # install.packages("here")
15 | # install.packages("palmerpenguins")
16 | # install.packages("remotes")
17 | 
18 | library(tidyverse) # for tidyverse
19 | library(here) # for file paths
20 | 
21 | remotes::install_github("bschneidr/survey", ref = "c217689")
22 | library(srvyr)
23 | library(palmerpenguins)
24 | 
25 | 
26 | 
27 | #' 
28 | #' # Warm-up exercises: Play with penguin data!!!
29 | #' 
30 | ## -------------------------------------------------------------------
31 | penguins
32 | 
33 | #' 
34 | #' How many penguins of each species are there? Hint: use `count`
35 | ## -------------------------------------------------------------------
36 | 
37 | 
38 | #' 
39 | #' How many penguins of each species and sex are there? Hint: use `count`
40 | #' 
41 | ## -------------------------------------------------------------------
42 | 
43 | 
44 | #' 
45 | #' What is the mean length of flipper by species? Hint: use `group_by` and `summarise`
46 | ## -------------------------------------------------------------------
47 | 
48 | 
49 | #' 
50 | #' 
51 | #' What is the mean flipper length by species and sex? Hint: use `group_by` and `summarise`
52 | ## -------------------------------------------------------------------
53 | 
54 | 
55 | #' 
56 | #' # Advanced warm-up exercises
57 | #' 
58 | #' Fit a simple linear regression between body mass and flipper length.
59 | #' 
60 | ## -------------------------------------------------------------------
61 | 
62 | 
63 | #' 
64 | #' 
65 | #' Test whether the average flipper length is significantly different between male and female penguins. Use t-test, lm, or glm
66 | #' 
67 | ## -------------------------------------------------------------------
68 | 
69 | 
70 | #' 
71 | 


--------------------------------------------------------------------------------
/Exercises/WarmUpExercises.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Warm-up Exercises"
 3 | output:
 4 |   html_document:
 5 |     df_print: paged
 6 | ---
 7 | 
 8 | # Course set-up
 9 | First, let's make sure you have everything you need for the course. Run the following library statements. If something is not installed, install it.
10 | 
11 | ```{r}
12 | # install.packages("tidyverse")
13 | # install.packages("srvyr")
14 | # install.packages("here")
15 | # install.packages("palmerpenguins")
16 | # install.packages("remotes")
17 | 
18 | library(tidyverse) # for tidyverse
19 | library(here) # for file paths
20 | 
21 | remotes::install_github("bschneidr/survey", ref = "c217689")
22 | library(srvyr)
23 | library(palmerpenguins)
24 | 
25 | 
26 | ```
27 | 
28 | # Warm-up exercises: Play with penguin data!!!
29 | 
30 | ```{r}
31 | penguins
32 | ```
33 | 
34 | How many penguins of each species are there? Hint: use `count`
35 | ```{r}
36 | 
37 | ```
38 | 
39 | How many penguins of each species and sex are there? Hint: use `count`
40 | 
41 | ```{r}
42 | 
43 | ```
44 | 
45 | What is the mean length of flipper by species? Hint: use `group_by` and `summarise`
46 | ```{r}
47 | 
48 | ```
49 | 
50 | 
51 | What is the mean flipper length by species and sex? Hint: use `group_by` and `summarise`
52 | ```{r}
53 | 
54 | ```
55 | 
56 | # Advanced warm-up exercises
57 | 
58 | Fit a simple linear regression between body mass and flipper length.
59 | 
60 | ```{r}
61 | 
62 | ```
63 | 
64 | 
65 | Test whether the average flipper length is significantly different between male and female penguins. Use t-test, lm, or glm
66 | 
67 | ```{r}
68 | 
69 | ```
70 | 
71 | 


--------------------------------------------------------------------------------
/Exercises/WarmUpExercises_solutions.R:
--------------------------------------------------------------------------------
 1 | #' ---
 2 | #' title: "Warm-up Exercise Solutions"
 3 | #' output:
 4 | #'   html_document:
 5 | #'     df_print: paged
 6 | #' ---
 7 | #' 
 8 | #' # Course set-up
 9 | #' First, let's make sure you have everything you need for the course. Run the following library statements. If something is not installed, install it.
10 | #' 
11 | ## -------------------------------------------------------------------
12 | # install.packages("tidyverse")
13 | # install.packages("srvyr")
14 | # install.packages("here")
15 | # install.packages("palmerpenguins")
16 | # install.packages("remotes")
17 | 
18 | library(tidyverse) # for tidyverse
19 | library(here) # for file paths
20 | 
21 | remotes::install_github("bschneidr/survey", ref = "c217689")
22 | library(srvyr)
23 | library(palmerpenguins)
24 | 
25 | 
26 | 
27 | #' 
28 | #' # Warm-up exercises: Play with penguin data!!!
29 | #' 
30 | ## -------------------------------------------------------------------
31 | penguins
32 | 
33 | #' 
34 | #' How many penguins of each species are there? Hint: use `count`
35 | ## -------------------------------------------------------------------
36 | penguins %>%
37 |    count(species)
38 | 
39 | #' 
40 | #' How many penguins of each species and sex are there? Hint: use `count`
41 | #' 
42 | ## -------------------------------------------------------------------
43 | penguins %>%
44 |    count(species, sex)
45 | 
46 | #' 
47 | #' What is the mean length of flipper by species? Hint: use `group_by` and `summarise`
48 | ## -------------------------------------------------------------------
49 | penguins %>%
50 |    group_by(species) %>%
51 |    summarize(
52 |      MeanFlipperLength=mean(flipper_length_mm, 
53 |                             na.rm=TRUE))
54 | 
55 | #' 
56 | #' 
57 | #' What is the mean flipper length by species and sex? Hint: use `group_by` and `summarise`
58 | ## -------------------------------------------------------------------
59 | penguins %>%
60 |    group_by(species, sex) %>%
61 |    summarize(
62 |      MeanFlipperLength=mean(flipper_length_mm,
63 |                             na.rm=TRUE))
64 | 
65 | #' 
66 | #' # Advanced warm-up exercises
67 | #' 
68 | #' Fit a simple linear regression between body mass and flipper length.
69 | #' 
70 | ## -------------------------------------------------------------------
71 | mod1 <- lm(body_mass_g ~ flipper_length_mm,
72 |            data=penguins)
73 | summary(mod1)
74 | 
75 | #' 
76 | #' 
77 | #' Test whether the average flipper length is significantly different between male and female penguins. Use t-test, lm, or glm
78 | #' 
79 | ## -------------------------------------------------------------------
80 | t.test(flipper_length_mm ~ sex, data=penguins)
81 | 
82 | mod3 <- lm(flipper_length_mm ~ sex, data=penguins)
83 | summary(mod3)
84 | 
85 | mod4 <- glm(flipper_length_mm ~ sex, data=penguins)
86 | summary(mod4)
87 | 
88 | #' 
89 | 


--------------------------------------------------------------------------------
/Exercises/WarmUpExercises_solutions.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Warm-up Exercise Solutions"
 3 | output:
 4 |   html_document:
 5 |     df_print: paged
 6 | ---
 7 | 
 8 | # Course set-up
 9 | First, let's make sure you have everything you need for the course. Run the following library statements. If something is not installed, install it.
10 | 
11 | ```{r}
12 | # install.packages("tidyverse")
13 | # install.packages("srvyr")
14 | # install.packages("here")
15 | # install.packages("palmerpenguins")
16 | # install.packages("remotes")
17 | 
18 | library(tidyverse) # for tidyverse
19 | library(here) # for file paths
20 | 
21 | remotes::install_github("bschneidr/survey", ref = "c217689")
22 | library(srvyr)
23 | library(palmerpenguins)
24 | 
25 | 
26 | ```
27 | 
28 | # Warm-up exercises: Play with penguin data!!!
29 | 
30 | ```{r}
31 | penguins
32 | ```
33 | 
34 | How many penguins of each species are there? Hint: use `count`
35 | ```{r}
36 | penguins %>%
37 |    count(species)
38 | ```
39 | 
40 | How many penguins of each species and sex are there? Hint: use `count`
41 | 
42 | ```{r}
43 | penguins %>%
44 |    count(species, sex)
45 | ```
46 | 
47 | What is the mean length of flipper by species? Hint: use `group_by` and `summarise`
48 | ```{r}
49 | penguins %>%
50 |    group_by(species) %>%
51 |    summarize(
52 |      MeanFlipperLength=mean(flipper_length_mm, 
53 |                             na.rm=TRUE))
54 | ```
55 | 
56 | 
57 | What is the mean flipper length by species and sex? Hint: use `group_by` and `summarise`
58 | ```{r}
59 | penguins %>%
60 |    group_by(species, sex) %>%
61 |    summarize(
62 |      MeanFlipperLength=mean(flipper_length_mm,
63 |                             na.rm=TRUE))
64 | ```
65 | 
66 | # Advanced warm-up exercises
67 | 
68 | Fit a simple linear regression between body mass and flipper length.
69 | 
70 | ```{r}
71 | mod1 <- lm(body_mass_g ~ flipper_length_mm,
72 |            data=penguins)
73 | summary(mod1)
74 | ```
75 | 
76 | 
77 | Test whether the average flipper length is significantly different between male and female penguins. Use t-test, lm, or glm
78 | 
79 | ```{r}
80 | t.test(flipper_length_mm ~ sex, data=penguins)
81 | 
82 | mod3 <- lm(flipper_length_mm ~ sex, data=penguins)
83 | summary(mod3)
84 | 
85 | mod4 <- glm(flipper_length_mm ~ sex, data=penguins)
86 | summary(mod4)
87 | ```
88 | 
89 | 


--------------------------------------------------------------------------------
/FinalizeMaterials.R:
--------------------------------------------------------------------------------
 1 | ### This program creates R files from the Rmd files, plus PDF and PowerPoint slides
 2 | 
 3 | library(knitr)
 4 | library(here)
 5 | 
 6 | # Purl <folder>/<fn>.Rmd into <folder>/<fn>.R (documentation=2 keeps the prose as #' comments)
 7 | mypurl <- function(folder, fn){
 8 |    rmd_path <- here(folder, stringr::str_c(fn, ".Rmd"))
 9 |    r_path <- here(folder, stringr::str_c(fn, ".R"))
10 |    purl(rmd_path, output=r_path, documentation=2)
11 | }
12 | 
13 | mypurl("Exercises", "CategorialExercises")
14 | mypurl("Exercises", "ContinuousExercises")
15 | mypurl("Exercises", "WarmUpExercises")
16 | 
17 | mypurl("Exercises", "CategorialExercises_solutions")
18 | mypurl("Exercises", "ContinuousExercises_solutions")
19 | mypurl("Exercises", "WarmUpExercises_solutions")
20 | 
21 | mypurl("Presentation", "Slides")
22 | 
23 | # remotes::install_github("jhelvy/xaringanBuilder")
24 | # remotes::install_github('rstudio/chromote')
25 | xaringanBuilder::build_pdf(
26 |    input=here("Presentation", "Slides.html"),
27 |    output_file=here("Presentation", "Slides.pdf"),
28 |    partial_slides= TRUE)
29 | xaringanBuilder::build_pptx(
30 |    input=here("Presentation", "Slides.pdf"),
31 |    output_file=here("Presentation", "Slides.pptx"),
32 |    partial_slides= TRUE)
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/Presentation/Slides.Rmd:
--------------------------------------------------------------------------------
   1 | ---
   2 | title: "Tidy Survey Analysis in R using the srvyr Package"
   3 | subtitle: "AAPOR 2021 Short Course"
   4 | author:
   5 |    - Stephanie Zimmer, RTI International
   6 |    - Rebecca Powell, RTI International
   7 | date: "2021-05-06"
   8 | output:
   9 |   xaringan::moon_reader:
  10 |     css: xaringan-themer.css
  11 |     nature:
  12 |       slideNumberFormat: "%current%"
  13 |       highlightStyle: github
  14 |       highlightLines: true
  15 |       ratio: 16:9
  16 |       countIncrementalSlides: true
  17 | ---
  18 | 
  19 | ```{r setup, include=FALSE}
  20 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, tidy = FALSE)
  21 | ```
  22 | 
  23 | 
  24 | ```{r xaringan-themer, include=FALSE, warning=FALSE}
  25 | library(xaringanthemer)
  26 | style_duo_accent(
  27 |   primary_color = "#1E4F96",
  28 |   secondary_color = "#00A3E0",
  29 |   inverse_header_color = "#FFFFFF"
  30 | )
  31 | ```
  32 | 
  33 | class: inverse center middle
  34 | # Introduction
  35 | 
  36 | ---
  37 | 
  38 | ```{css, echo = FALSE}
  39 | .small .remark-code { /*Change made here*/
  40 |   font-size: 80% !important;
  41 | }
  42 | .smaller .remark-code { /*Change made here*/
  43 |   font-size: 70% !important;
  44 | }
  45 | ```
  46 | 
  47 | ## Overview
  48 | 
  49 | - At the end of this course, you should be able to 
  50 |   - Calculate point estimates and their standard errors with survey data 
  51 |       - Means & Proportions
  52 |       - Totals
  53 |       - Quantiles
  54 |   - Perform t-tests and chi-squared tests
  55 |   - Fit regression models
  56 |   - Specify a survey design in R to create a survey object
  57 |   
  58 | - We will not be going over the following but provide some resources at the end
  59 |   - Weighting (calibration, post-stratification, raking, etc.)
  60 |   - Survival analysis
  61 |   - Nonlinear models
  62 | 
  63 | 
  64 | 
  65 | ---
  66 | ## Overview: Course Roadmap
  67 | 
  68 | - Get familiar with RStudio Cloud with a warm-up exercise using the tidyverse
  69 | 
  70 | - Introduce the survey data we'll be using in the course
  71 | 
  72 | - Analysis of continuous data with time for practice
  73 | 
  74 | - Analysis of categorical data with time for practice
  75 | 
  76 | - Specify a survey design object in R with exercises
  77 | 
  78 | - Resources for other survey analysis topics
  79 | 
  80 | - Closing
  81 | 
  82 | ---
  83 | ## Logistics
  84 | 
  85 | - We will be using RStudio Cloud today to ensure everyone has access
  86 | 
  87 |     - Sign-up for a free RStudio Cloud account 
  88 |     - Access the project and files via link in email and Zoom chat
  89 |     - Click "START" to open the project and get started
  90 |     - RStudio Cloud has the same features and appearance as RStudio for ease of use
  91 | 
  92 | - All slides and code are available on GitHub: https://github.com/szimmer/tidy-survey-aapor-2021
  93 | 
  94 | ???
  95 | The GitHub repo is for future reference; all materials are on RStudio Cloud
  96 | ---
  97 | ## Intro to RStudio Cloud: Penguins!!
  98 | 
  99 | - Using `palmerpenguins` data for warm-up exercises
 100 | 
 101 | - Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.
 102 | 
 103 | - Access data through `palmerpenguins` package https://github.com/allisonhorst/palmerpenguins/
 104 | 
 105 | 
 106 | #### If you are using your own RStudio environment:
 107 | - Make sure you have `tidyverse`, `here`, and `palmerpenguins` installed
 108 | 
 109 | ```{r inst_packages, error=FALSE, warning=FALSE, eval=FALSE}
 110 | # Run package installation if you don't have these packages already
 111 | # As a reminder, installing takes package from internet to your computer 
 112 | # and only needs to be done once, not each session
 113 | 
 114 | install.packages(c("tidyverse", "here", "palmerpenguins"))
 115 | ```
 116 | 
 117 | ---
 118 | ## Intro to RStudio Cloud: Penguins!!
 119 | 
 120 | - Load `tidyverse`, `here`, and `palmerpenguins`
 121 | 
 122 | - Look at the penguins dataset using `glimpse`
 123 | 
 124 | ```{r load_pack1, error=FALSE, warning=FALSE}
 125 | library(tidyverse) # for tidyverse
 126 | library(here) # for file paths
 127 | library(palmerpenguins) # for warm-up data
 128 | glimpse(penguins)
 129 | ```
 130 | 
 131 | ---
 132 | ## Warm-up Exercises: WarmUpExercises.Rmd
 133 | 
 134 | - <b>Let's open RStudio Cloud and do some warm-up examples</b>
 135 |   - We will do one together and then give you 5 minutes to work through the other examples and get familiar with RStudio Cloud
 136 | 
 137 | - Explore the penguins data
 138 |   - How many penguins of each species are there? <i>We will do this one together</i>
 139 |   - How many penguins of each species and sex are there? Hint: use `count`
 140 |   - What is the mean length of flipper by species? Hint: use `group_by` and `summarize`
 141 |   - What is the mean flipper length by species and sex? 
 142 | 
 143 | - More advanced warm-up
 144 |   - Fit a simple linear regression between body mass and flipper length.
 145 |   - Test whether the average flipper length is significantly different between male and female penguins. Use t-test, lm, or glm
 146 | 
 147 | ---
 148 | ## Ex. 1: How many penguins of each species are there? 
 149 | 
 150 | ```{r peng1}
 151 | penguins %>%
 152 |    count(species)
 153 | ```
 154 | 
 155 | ???
 156 | - Using `count` we see there are 3 different species and the number of penguins for each species
 157 | ---
 158 | ## Ex. 2: How many penguins of each species and sex are there?
 159 | 
 160 | ```{r peng2}
 161 | penguins %>%
 162 |    count(species, sex)
 163 | ```
 164 | ???
 165 | - `count` can take more than one variable to produce a cross-tabulation of those variables
 166 | 
 167 | ---
 168 | ## Ex. 3: What is the mean length of flipper by species? 
 169 | 
 170 | ```{r peng3}
 171 | penguins %>%
 172 |    group_by(species) %>%
 173 |    summarize(
 174 |      MeanFlipperLength=mean(flipper_length_mm, 
 175 |                             na.rm=TRUE))
 176 | ```
 177 | ???
 178 | - `group_by` allows us to look at metrics by different subgroups like species
 179 | - when using `group_by` follow it with `summarize` to get metrics (like average) at the group level
 180 | - `na.rm=TRUE` removes missing data from the calculation
 181 |     - forgetting this argument will result in a value of `NA` as the function will try to average missing data
 182 | 
 183 | ---
 184 | ## Ex. 4: What is the mean flipper length by species and sex?
 185 | 
 186 | ```{r peng4}
 187 | penguins %>%
 188 |    group_by(species, sex) %>%
 189 |    summarize(
 190 |      MeanFlipperLength=mean(flipper_length_mm,
 191 |                             na.rm=TRUE))
 192 | ```
 193 | ???
 194 | - As with `count`, you can `group_by` multiple variables
 195 | 
 196 | ---
 197 | ## Advanced Ex. 1: Linear regression (body mass & flipper length)
 198 | .small[
 199 | ```{r pengad1}
 200 | mod1 <- lm(body_mass_g ~ flipper_length_mm, data=penguins)
 201 | summary(mod1)
 202 | ```
 203 | ]
 204 | ???
 205 | - use `lm` (linear model) function
 206 | - equation is written as y-variable ~ x-variables
 207 | 
 208 | ---
 209 | ## Advanced Ex. 2: Flipper length differences by sex: t-test
 210 | 
 211 | ```{r pengad2a}
 212 | t.test(flipper_length_mm ~ sex, data=penguins)
 213 | ```
 214 | ???
 215 | - `~` is also used in `t.test` to separate what we want to measure (our y) and the groups of interest (our x)
 216 | ---
 217 | ## Advanced Ex. 2: Flipper length differences by sex: lm 
 218 | .small[
 219 | ```{r pengad2b}
 220 | mod3 <- lm(flipper_length_mm ~ sex, data=penguins)
 221 | summary(mod3)
 222 | ```
 223 | ]
 224 | ---
 225 | ## Advanced Ex. 2: Flipper length differences by sex: glm 
 226 | .small[
 227 | ```{r pengad2c}
 228 | mod4 <- glm(flipper_length_mm ~ sex, data=penguins)
 229 | summary(mod4)
 230 | ```
 231 | ]
 232 | ???
 233 | - `glm` takes the same arguments as `lm`, but is more flexible for working with non-normal data
 234 | ---
 235 | class: inverse center middle
 236 | # Survey Datasets
 237 | ---
 238 | ## Residential Energy Consumption Survey (RECS) 2015
 239 | 
 240 | - Energy consumption/expenditures collected through energy suppliers
 241 | 
 242 | - Fielded 14 times between 1978 and 2015
 243 | 
 244 | - Topics include appliances, electronics, heating, a/c, temperatures, water heating, lighting, energy bills, respondent demographics, and energy assistance
 245 | 
 246 | - Funded by the Energy Information Administration
 247 | 
 248 | - <b>Target Population</b>: Primary occupied housing units in the US
 249 | 
 250 | - <b>Mode</b>: In-person, paper, and web interview mode
 251 | 
 252 | - <b>Sample Information</b>: BRR Replicate weights included for variance estimation
 253 | 
 254 | https://www.eia.gov/consumption/residential/index.php
 255 | 
 256 | ???
 257 | - We have subset the columns of this data and created derived variables; the code is in the repository
 258 | ---
 259 | ## American National Election Studies (ANES) 2016 
 260 | 
 261 | - Pre and post election surveys
 262 | 
 263 | - Fielded almost every 2 years since 1948
 264 | 
 265 | - Topics include voter registration status, candidate preference, opinions on country and government, party and ideology affiliation, opinions on policy, news sources, and more
 266 | 
 267 | - Collaboration between Stanford University and the University of Michigan, funded by the National Science Foundation
 268 | 
 269 | - <b>Target Population</b>: US citizens, 18 and older living in US 
 270 | 
 271 | - <b>Mode</b>: FTF with CASI and Web
 272 | 
 273 | - <b>Sample Information</b>: Pseudo-strata and pseudo-clusters included for variance estimation
 274 | 
 275 | https://electionstudies.org/
 276 | 
 277 | ???
 278 | Chose not to use 2020 data because it is still preliminary
 279 | 
 280 | ---
 281 | class: inverse center middle
 282 | # Continuous data analysis
 283 | ---
 284 | ## Overview of Survey Analysis using `srvyr` Package
 285 | 
 286 | 1. Create a `tbl_svy` object using: `as_survey_design` or `as_survey_rep`
 287 | 
 288 | 2. Subset data (if needed) using `filter` (subpopulations)
 289 | 
 290 | 3. Specify domains of analysis using `group_by` 
 291 | 
 292 | 4. Within `summarize`, specify variables to calculate including means, totals, proportions, quantiles and more
 293 | 
 294 | 
 295 | <b>Note: We will be teaching this in the reverse order!!!</b>
 296 | ---
 297 | ## Set-up for Analysis
 298 | - `srvyr` package uses tidy-syntax but uses the `survey` package behind it to do calculations
 299 | 
 300 | - If using your own RStudio environment, install both packages:
 301 | ```{r inst_srv, eval=FALSE}
 302 | # Install survey and srvyr packages
 303 | 
 304 | remotes::install_github("bschneidr/survey", ref = "c217689")
 305 | install.packages("srvyr")
 306 | ```
 307 | 
 308 | - First, we will set up a design object and later talk about what it means
 309 | ```{r recs_des, error=FALSE, warning=FALSE}
 310 | library(survey) # for survey analysis
 311 | library(srvyr) # for tidy survey analysis
 312 | 
 313 | recs <- read_rds(here("Data", "recs.rds"))
 314 | 
 315 | recs_des <- recs %>%
 316 |    as_survey_rep(weights=NWEIGHT,
 317 |                  repweights=starts_with("BRRWT"),
 318 |                  type="Fay",
 319 |                  rho=0.5,
 320 |                  mse=TRUE)
 321 | 
 322 | ```
 323 | ???
 324 | - You need to install the GitHub version of the survey package if you want CIs with quantiles
 325 | ---
 326 | ## Weighted Analysis for Continuous Variables
 327 | 
 328 | - Common functions for continuous summaries
 329 |    - survey_mean
 330 |    - survey_total (like sum)
 331 |    - survey_median
 332 |    - survey_quantile
 333 |    - survey_ratio
 334 | 
 335 | - Always call within `summarize`/`summarise`
 336 | ---
 337 | ## `survey_mean` Syntax
 338 | 
 339 | ```{r survey_mean_syn, eval=FALSE}
 340 | survey_mean(
 341 |   x,
 342 |   na.rm = FALSE,
 343 |   vartype = c("se", "ci", "var", "cv"),
 344 |   level = 0.95,
 345 |   proportion = FALSE,
 346 |   deff = FALSE,
 347 |   df = NULL,
 348 |   ...
 349 | )
 350 | ```
 351 | 
 352 | To calculate a survey mean, we use this in `summarize`/`summarise`
 353 | ```{r survey_mean_syn2, eval=FALSE}
 354 | survey_design_object %>%
 355 |    summarize(
 356 |       mean_varname=survey_mean(x = continuous_varname)
 357 |       )
 358 | ```
 359 | 
 360 | ???
 361 | Only required argument is the variable
 362 | 
 363 | ---
 364 | ## `survey_mean` Example 1: Mean dollars spent on energy
 365 | 
 366 | This is an example using the `recs_des` survey design object and `survey_mean` function defaults
 367 | 
 368 | ```{r survey_mean_ex1}
 369 | recs_des %>%
 370 |    summarize(
 371 |       TD_mean=survey_mean(x = TOTALDOL)
 372 |       )
 373 | ```
 374 | ---
 375 | ## `survey_mean` Example 2: Mean temperature setting for summer during the day
 376 | 
 377 | Run this code. What happens? Why?
 378 | 
 379 | ```{r survey_mean_ex2, eval=FALSE}
 380 | recs_des %>%
 381 |    summarize(
 382 |       TD_mean=survey_mean(x = SummerTempDay)
 383 |       )
 384 | ```
 385 | ---
 386 | ## `survey_mean` Example 2: Mean temperature setting for summer during the day
 387 | 
 388 | Run this code. What happens? Why?
 389 | 
 390 | ```{r survey_mean_ex2_r, error=TRUE}
 391 | recs_des %>%
 392 |    summarize(
 393 |       TD_mean=survey_mean(x = SummerTempDay)
 394 |       )
 395 | ```
 396 | 
 397 | <b>How do we fix this code?</b>
 398 | 
 399 | ???
 400 | - missing data in temperature, need `na.rm=TRUE`
 401 | ---
 402 | ## `survey_mean` Example 2: Missing data solution
 403 | 
 404 | ```{r survey_mean_ex2_sol, error=TRUE, tidy=FALSE}
 405 | recs_des %>%
 406 |    summarize(
 407 |       TD_mean = survey_mean(
 408 |         x = SummerTempDay, 
 409 |         na.rm = TRUE )#<<
 410 |       )
 411 | ```
 412 | 
 413 | ---
 414 | ## `survey_median` Syntax
 415 | 
 416 | ```{r survey_median_syn, eval=FALSE}
 417 | survey_median(
 418 |   x,
 419 |   na.rm = FALSE,
 420 |   vartype = c("se", "ci"),
 421 |   level = 0.95,
 422 |   df = NULL,
 423 |   ...
 424 | )
 425 | ```
 426 | 
 427 | ???
 428 | Only required argument is the variable
 429 | 
 430 | 
 431 | ---
 432 | ## `survey_median` Example: Median temperature setting for summer during day
 433 | 
 434 | Fill in the blank:
 435 | 
 436 | ```{r survey_median_fib, eval=FALSE}
 437 | recs_des %>%
 438 |    summarize(
 439 |       TD_median=survey_median(x=_________,
 440 |                           na.rm=_________)
 441 |       )
 442 | ```
 443 | 
 444 | --
 445 | 
 446 | ```{r survey_median_fib_sol}
 447 | recs_des %>%
 448 |   summarize(
 449 |     # na.rm = TRUE is required because SummerTempDay has missing values
 450 |     TD_median = survey_median(x = SummerTempDay, na.rm = TRUE)
 451 |   )
 452 | ```
 453 | 
 454 | 
 455 | 
 456 | ---
 457 | ## `survey_quantile` Syntax
 458 | 
 459 | ```{r survey_quantile_syn, eval=FALSE}
 460 | survey_quantile(
 461 |   x,
 462 |   quantiles, #<<
 463 |   na.rm = FALSE,
 464 |   vartype = c("se", "ci", "var", "cv"),
 465 |   level = 0.95,
 466 |   df = NULL,
 467 |   ...
 468 | )
 469 | ```
 470 | 
 471 | ???
 472 | - need both the variable and the quantiles in a vector e.g. (c(.25, .75))
 473 | ---
 474 | ## `survey_quantile` Example 1: 1st and 3rd quartile of dollars spent on energy
 475 | 
 476 | ```{r survey_quantile_ex1, error=TRUE}
 477 | recs_des %>%
 478 |    summarize(
 479 |       Spent=survey_quantile(
 480 |         x = TOTALDOL,
 481 |         quantiles = c(.25, .75)) #<<
 482 |       )
 483 | ```
 484 | ???
 485 | - This estimates the 25th and 75th percentile
 486 | 
 487 | ---
 488 | ## `survey_quantile` Example 2: 1st and 3rd quartile of dollars spent on energy, now with confidence intervals
 489 | 
 490 | ```{r survey_quantile_ex2, error=TRUE}
 491 | recs_des %>%
 492 |    summarize(
 493 |       Spent=survey_quantile(x = TOTALDOL,
 494 |                             quantiles = c(.25, .75),
 495 |                             vartype = "ci" #<<
 496 |          )
 497 |       )
 498 | ```
 499 | ---
 500 | ## `survey_ratio` Syntax
 501 | 
 502 | - Note this estimates the ratio of totals, $\sum x_i/\sum y_i$, not the sum of individual ratios, $\sum \frac{x_i}{y_i}$
 503 | 
 504 | ```{r survey_ratio_syn, eval=FALSE}
 505 | survey_ratio(
 506 |   numerator, #<<
 507 |   denominator, #<<
 508 |   na.rm = FALSE,
 509 |   vartype = c("se", "ci", "var", "cv"),
 510 |   level = 0.95,
 511 |   deff = FALSE,
 512 |   df = NULL,
 513 |   ...
 514 | )
 515 | ```
 516 | 
 517 | 
 518 | ---
 519 | ## `survey_ratio` Example: mean dollars per BTU spent on energy
 520 | 
 521 | ```{r survey_ratio_ex}
 522 | recs_des %>%
 523 |    summarize(
 524 |       DolPerBTU=survey_ratio(
 525 |          numerator = TOTALDOL, #<<
 526 |          denominator = TOTALBTU, #<<
 527 |          na.rm = TRUE
 528 |          )
 529 |       )
 530 | ```
 531 | ---
 532 | ## Practice on your own
 533 | 
 534 | - Open ContinuousExercises.Rmd and work through Part 1
 535 | 
 536 | - We will take 15 minutes. Use this time for the exercises and a break
 537 | ---
 538 | ## Weighted Analysis for Continuous Variables: Domain Analysis
 539 | 
 540 | - If we want to get estimates by another variable, we need to add a `group_by` statement before doing the analysis.
 541 | 
 542 | - Example: Average dollars spent on electricity by whether AC is used
 543 | 
 544 | ```{r domain_ex}
 545 | # Mean electric bill within each level of ACUsed
 546 | recs_des %>%
 547 |   group_by(ACUsed) %>% #<<
 548 |   summarize(
 549 |     ElBill = survey_mean(DOLLAREL, na.rm = TRUE)
 550 |   )
 551 | ```
 552 | ---
 553 | ## Domain Analysis: Totals
 554 | 
 555 | - If we want the overall electric bill too, use the `cascade` function instead of `summarize`
 556 | 
 557 | ```{r domain_ex_casc}
 558 | recs_des %>%
 559 |    group_by(ACUsed) %>%
 560 |    cascade(
 561 |       ElBill=survey_mean(DOLLAREL, 
 562 |                          na.rm=TRUE)
 563 |    )
 564 | 
 565 | ```
 566 | 
 567 | ???
 568 | - Note the overall appears as NA
 569 | 
 570 | ---
 571 | ## Domain Analysis: Totals
 572 | 
 573 | - Also can add sample and pop sizes
 574 | 
 575 | ```{r domain_tot}
 576 | # cascade() reports each ACUsed group plus an overall row
 577 | recs_des %>%
 578 |   group_by(ACUsed) %>%
 579 |   cascade(
 580 |     ElBill = survey_mean(DOLLAREL, na.rm = TRUE),
 581 |     N = survey_total(!is.na(DOLLAREL)), #<<
 582 |     n = unweighted(sum(!is.na(DOLLAREL))) #<<
 583 |   )
 584 | ```
 585 | 
 586 | ???
 587 | - survey_total gets a weighted total
 588 | - unweighted does just that, an unweighted estimate, can also get an unweighted mean or any other stat
 589 | 
 590 | ---
 591 | ## Weighted Analysis for Specific Subpopulations
 592 | 
 593 | - Filtering (subsetting) the data should be done AFTER specifying the design to ensure accurate standard errors
 594 | 
 595 | - Use the `filter` function after creating the survey design object and before summarizing
 596 | 
 597 | Wrong way:
 598 | ```{r filter_bad, eval = FALSE}
 599 | data %>%
 600 |   filter(state=="NC") %>% #<<
 601 |   as_survey_design(...) %>%
 602 |   summarize(AvgAge=survey_mean(Age)) # design-based mean, not base mean()
 603 | ```
 604 | 
 605 | Right way:
 606 | ```{r filter_good, eval=FALSE}
 607 | data %>%
 608 |   as_survey_design(...) %>%
 609 |   filter(state=="NC") %>% #<<
 610 |   summarize(AvgAge=survey_mean(Age)) # design-based mean, not base mean()
 611 | ```
 612 | 
 613 | ???
 614 | - The difference in these two methods occurs when the subpopulation doesn't occur in all strata or PSUs
 615 | 
 616 | ---
 617 | ## Subpopulation Example 1: Average electric cost of single family homes
 618 | 
 619 | ```{r subpop1}
 620 | # Restrict the design object (not the raw data) to single-family homes
 621 | recs_des %>%
 622 |   filter(
 623 |     HousingUnitType %in%
 624 |       c("Single-family detached", "Single-family attached")
 625 |   ) %>%
 626 |   summarize(ElBill = survey_mean(DOLLAREL, na.rm = TRUE))
 627 | ```
 628 | 
 629 | ---
 630 | ## Comparisons with t-tests: `svyttest` Syntax
 631 | 
 632 | - t-tests are done in the package `survey` not `srvyr` but you can use the same design object
 633 | 
 634 | ```{r ttest_syn, eval=FALSE}
 635 | svyttest(formula, # outcome~group for two-sample, outcome~0 for one-sample
 636 |          design, # survey design object (the same one used with srvyr)
 637 |          na.rm = FALSE, # the examples set this to TRUE
 638 |          ...)
 639 | ```
 640 | 
 641 | ???
 642 | - Uses standard R formula notation
 643 | - will go over examples of 1-sample, 2-sample, and paired t-test
 644 | 
 645 | ---
 646 | ## `svyttest` Example 1: One-sample t-test
 647 | 
 648 | - I keep my house at 68 degrees at night during the summer. Is this different from the national average?
 649 | 
 650 | ```{r ttest_ex1}
 651 | recs_des %>%
 652 |    svyttest(design=.,
 653 |             formula=I(SummerTempNight-68)~0,
 654 |             na.rm=TRUE)
 655 | ```
 656 | 
 657 | ???
 658 | - Note the I notation, this does the arithmetic before modeling
 659 | 
 660 | ---
 661 | ## `svyttest` Example 2: Comparing two variables
 662 | 
 663 | - Do people keep their house the same temperature at night during the summer and the winter?
 664 | 
 665 | ```{r ttest_ex2}
 666 | recs_des %>%
 667 |    svyttest(design=.,
 668 |             formula=I(SummerTempNight-WinterTempNight)~0,
 669 |             na.rm=TRUE)
 670 | ```
 671 | 
 672 | ???
 673 | - this is a paired t-test
 674 | - testing whether the difference is 0 for each household
 675 | ---
 676 | ## `svyttest` Example 3: Two-sample t-test
 677 | 
 678 | - Are electric bills different between those with and without A/C?
 679 | 
 680 | ```{r ttest_ex3}
 681 | recs_des %>%
 682 |    svyttest(design=.,
 683 |             formula=DOLLAREL~ACUsed,
 684 |             na.rm=TRUE)
 685 | ```
 686 | 
 687 | 
 688 | 
 689 | ---
 690 | ## Linear Regression or ANOVA: `svyglm` Syntax
 691 | 
 692 | - As with t-tests, regressions are done in the package `survey` not `srvyr` but you can use the same design object
 693 | 
 694 | - Syntax is similar between t-test and glm
 695 | 
 696 | ```{r glm_syn, eval=FALSE}
 697 | svyglm(formula, # outcome~predictor(s), standard R formula notation
 698 |        design, # survey design object (the same one used with srvyr)
 699 |        na.action, #default is na.omit
 700 |        ...)
 701 | ```
 702 | ---
 703 | ## `svyglm` Example: Two-sample
 704 | 
 705 | Same example as two-sample t-test: Are electric bills different between those with and without A/C?
 706 | 
 707 | <b>t-test:</b>
 708 | ```{r twosamp_ex_ttest, eval=FALSE}
 709 | recs_des %>%
 710 |    svyttest(design=.,
 711 |             formula=DOLLAREL~ACUsed,
 712 |             na.rm=TRUE) #<<
 713 | ```
 714 | 
 715 | <b>glm:</b>
 716 | ```{r twosamp_ex_glm, eval=FALSE}
 717 | recs_des %>%
 718 |    svyglm(design=.,
 719 |           formula=DOLLAREL~ACUsed,
 720 |           na.action=na.omit) #<<
 721 | ```
 722 | 
 723 | ???
 724 | - one major difference in how you specify to ignore NA values
 725 | - svyttest can only have 2-levels in group variable
 726 | - svyglm, the variable on right can be anything (continuous or factor)
 727 | 
 728 | ---
 729 | ## `svyglm` Example: Two-sample
 730 | 
 731 | Are electric bills different between those with and without A/C?
 732 | .small[
 733 | ```{r twosamp_ex_ttest_run}
 734 | recs_des %>%
 735 |    svyglm(design=.,
 736 |           formula=DOLLAREL~ACUsed,
 737 |           na.action=na.omit) %>%
 738 |   summary()
 739 | ```
 740 | ]
 741 | 
 742 | ???
 743 | - same results as t-test
 744 | 
 745 | ---
 746 | ## `svyglm` Example 1: ANOVA Test
 747 | 
 748 | Does temperature of AC at night vary by region?
 749 | .smaller[
 750 | ```{r anova_ex}
 751 | recs_des %>%
 752 |    svyglm(design=.,
 753 |           formula=SummerTempNight~Region,
 754 |           na.action=na.omit) %>%
 755 |   summary()
 756 | 
 757 | ```
 758 | ]
 759 | 
 760 | ???
 761 | - Region is a factor variable, if it is numeric - this will treat it like a linear model
 762 | 
 763 | ---
 764 | ## `svyglm` Example 2: Linear Model
 765 | 
 766 | - Is there a relationship between square footage and electric bill?
 767 | - Let's review the data first with a ggplot. <i>Note we use the original data and do <b>NOT</b> use the survey design object.</i>
 768 | 
 769 | ```{r plot_sf_elbill}
 770 | # Hex-binned plot built from the raw data; NWEIGHT is the weight aesthetic
 771 | p <- ggplot(recs, aes(x = TOTSQFT_EN, y = DOLLAREL, weight = NWEIGHT)) +
 772 |   geom_hex() +
 773 |   theme(legend.position = "right") +
 774 |   guides(fill = guide_legend(title = "HUs"))
 775 | ```
 776 | ---
 777 | ## `svyglm` Example 2: Linear Model
 778 | ```{r plot_sf_elbill_disp, echo=FALSE, fig.asp=9/16, fig.align="center", out.width="90%", dpi=300}
 779 | p +
 780 |    theme_xaringan() 
 781 | ```
 782 | 
 783 | 
 784 | ---
 785 | ## `svyglm` Example 2: Linear Model
 786 | .small[
 787 | ```{r lm_ex}
 788 | m_electric_sqft <- recs_des %>%
 789 |    svyglm(design=.,
 790 |           formula=DOLLAREL~TOTSQFT_EN,
 791 |           na.action=na.omit)
 792 | summary(m_electric_sqft)
 793 | ```
 794 | ]
 795 | 
 796 | ???
 797 | - for every square foot bigger, on average 24.6c more in electric
 798 | ---
 799 | ## Practice on your own
 800 | 
 801 | - Open ContinuousExercises.Rmd and work through Part 2
 802 | 
 803 | - We will take 15 minutes. Use this time for the exercises and a break
 804 | ---
 805 | class: inverse center middle
 806 | # Categorical data analysis
 807 | ---
 808 | ## Weighted Analysis for Categorical Variables
 809 | 
 810 | - Functions to use within `summarize` after `group_by`
 811 |    - survey_mean
 812 |    - survey_total
 813 | 
 814 | - Functions to get counts
 815 |   - survey_count
 816 | 
 817 | ???
 818 | 
 819 | - we use the same mean and total functions as with continuous variables
 820 | - `survey_count` is new
 821 |     - has a similar structure as the standard (non-survey) version of count
 822 | 
 823 | ---
 824 | ## Set-up ANES Data for Examples
 825 | 
 826 | ```{r anes_des}
 827 | # Rescale Weight to sum to the citizen population, 18+ in Nov 2016
 828 | # (224,059,005 per ANES methodology documentation)
 829 | anes <- read_rds(here("Data", "anes.rds")) %>%
 830 |   mutate(Weight = Weight / sum(Weight) * 224059005)
 831 | 
 832 | anes_des <- anes %>%
 833 |   as_survey_design(
 834 |     weights = Weight, strata = Stratum, ids = VarUnit, nest = TRUE
 835 |   )
 836 | ```
 837 | ???
 838 | 
 839 | - American National Election Studies
 840 | - provides weights that sum to the sample, but we want to get population estimates
 841 |     - need to adjust the weight to get it to the population count
 842 | - as we mentioned before we will cover setting up the sample design object later
 843 | 
 844 | ---
 845 | ## `survey_count` Syntax
 846 | 
 847 | - `survey_count` functions similarly to `count` in that it is <b>NOT</b> called within `summarize`
 848 | 
 849 | - Produces weighted counts and variance of your choice of those counts
 850 | ```{r survey_count_syn, eval=FALSE}
 851 | survey_count(
 852 |   x,
 853 |   ...,
 854 |   wt = NULL,
 855 |   sort = FALSE,
 856 |   name = "n",
 857 |   .drop = dplyr::group_by_drop_default(x),
 858 |   vartype = c("se", "ci", "var", "cv")
 859 | )
 860 | ```
 861 | ???
 862 | - similar to count in that it takes one or many variables
 863 | - can change the variance type as we have seen in the other survey functions
 864 | 
 865 | ---
 866 | ## `survey_count` Example
 867 | 
 868 | - Cross-tab of population in each age group and gender
 869 | ```{r survey_count_ex}
 870 | anes_des %>%
 871 |   survey_count(AgeGroup, Gender, name="n")
 872 | 
 873 | ```
 874 | ???
 875 | - `survey_count` is placed on its own like `count`
 876 | - it does NOT go in a `summarize` function
 877 | - can take multiple variables
 878 | - can change the output count name, `n` is the default
 879 | 
 880 | ---
 881 | ## `survey_mean` and `survey_total` Examples
 882 | 
 883 | - `survey_mean` used with no x (variable) calculates a proportion of groups specified in `group_by`
 884 | - `survey_total` used with no x (variable) calculates a population count estimate for the groups specified in `group_by`
 885 | 
 886 | Proportion and population count by whether people voted in 2016
 887 | ```{r survey_p_ex1}
 888 | anes_des %>%
 889 |   filter(!is.na(VotedPres2016)) %>%
 890 |   group_by(VotedPres2016) %>%
 891 |   summarize(
 892 |     p=survey_mean(),
 893 |     N=survey_total(),
 894 |     n=unweighted(n()), .groups="drop"
 895 |   )
 896 | ```
 897 | ???
 898 | - to get proportions we use `group_by` and `survey_mean`
 899 | - also use `survey_total` to get a population count estimate as before
 900 | ---
 901 | ## Conditional proportions with more than one group
 902 | 
 903 | - Specifying more than one group calculates conditional proportions
 904 | - Example: people voting in 2012 and 2016
 905 | 
 906 | ```{r survey_p_cond}
 907 | anes_des %>%
 908 |   filter(!is.na(VotedPres2012), !is.na(VotedPres2016)) %>%
 909 |   group_by(VotedPres2012, VotedPres2016) %>%
 910 |   summarize(
 911 |     p=survey_mean(),
 912 |     N=survey_total(),
 913 |     n=unweighted(n()), .groups="drop"
 914 |   )
 915 | ```
 916 | ???
 917 | - Note that this is the proportion of voting in 2016 by whether people voted in 2012
 918 | - What if we don't want conditional proportions?
 919 | 
 920 | ---
 921 | ## Joint proportions with more than one group
 922 | 
 923 | - Specify an interaction to get joint distribution
 924 | - Example: people voting in 2012 and 2016
 925 | 
 926 | ```{r survey_p_joint}
 927 | anes_des %>%
 928 |   filter(!is.na(VotedPres2012), !is.na(VotedPres2016)) %>%
 929 |   group_by(groups = interaction(VotedPres2016, VotedPres2012)) %>% #<<
 930 |   summarize(
 931 |     p=survey_mean(),
 932 |     N=survey_total(),
 933 |     .groups="drop"
 934 |   )
 935 | ```
 936 | ???
 937 | - We add an interaction for the groups
 938 | - This outputs the joint distribution, but the `groups` variable is hard to interpret
 939 | 
 940 | ---
 941 | ## Joint proportions with more than one group
 942 | 
 943 | - Specify an interaction to get joint distribution
 944 | - Example: people voting in 2012 and 2016
 945 | 
 946 | ```{r survey_p_joint2}
 947 | anes_des %>%
 948 |   filter(!is.na(VotedPres2012), !is.na(VotedPres2016)) %>%
 949 |   group_by(groups = interaction(VotedPres2016, VotedPres2012)) %>% 
 950 |   summarize(
 951 |     VotedPres2012=VotedPres2012[1], #<<
 952 |     VotedPres2016=VotedPres2016[1], #<<
 953 |     p=survey_mean(),
 954 |     N=survey_total(),
 955 |     .groups="drop"
 956 |   )
 957 | ```
 958 | ???
 959 | - We can add in two variables one for `VotedPres2012` and `VotedPres2016`
 960 | - using the 1 in brackets pulls out the labels for these two variables so we see the "Yes" and "No" labels
 961 | 
 962 | ---
 963 | ## Proportions with Design Effects
 964 | 
 965 | ```{r survey_p_deff}
 966 | anes_des %>%
 967 |   filter(!is.na(VotedPres2012), !is.na(VotedPres2016)) %>%
 968 |   group_by(VotedPres2012, VotedPres2016) %>%
 969 |   summarize(
 970 |     p=survey_mean(deff=TRUE),
 971 |     N=survey_total()
 972 |   )
 973 | ```
 974 | ???
 975 | - Use `deff=TRUE` option in the `survey_mean` function
 976 | 
 977 | ---
 978 | ## `svychisq` Syntax
 979 | 
 980 | - As with testing on continuous variables, `svychisq` comes from the `survey` package
 981 | 
 982 | ```{r svychisq_syn, eval=FALSE}
 983 | svychisq(formula,
 984 |          design, 
 985 |          statistic = c("F",  "Chisq", "Wald", "adjWald", "lincom", "saddlepoint"),
 986 |          na.rm=TRUE,
 987 |          ...)
 988 |                        
 989 | ```
 990 | ???
 991 | - when we want to test categorical distributions we use `svychisq`
 992 | - it takes a formula, and the survey design data
 993 | 
 994 | ---
 995 | ## `svychisq` Example 1: Function Defaults
 996 | 
 997 | - How often can you trust the federal gov't to do what is right?
 998 | - How often can you trust other people?
 999 | 
1000 | ```{r svychisq_ex1}
1001 | anes_des %>%
1002 |    svychisq(design=.,
1003 |             formula=~TrustPeople +TrustGovernment)
1004 | 
1005 | 
1006 | ```
1007 | ???
1008 | - We want to test whether the answers to these two questions are associated
1009 | 
1010 | ---
1011 | ## `svychisq` Example 2: Wald Statistic
1012 | 
1013 | - How often can you trust the federal gov't to do what is right?
1014 | - Who did you vote for? Clinton, Trump, or Other
1015 | 
1016 | ```{r svychisq_ex2}
1017 | anes_des %>%
1018 |    svychisq(design=.,
1019 |             formula=~TrustGovernment +VotedPres2016_selection,
1020 |             statistic="Wald")
1021 | 
1022 | 
1023 | ```
1024 | ???
1025 | - Can use different statistics
1026 | ---
1027 | ## Practice on your own
1028 | 
1029 | - Open `CategorialExercises.Rmd` (the categorical exercises file) and work through the exercises
1030 | 
1031 | - We will take 10 minutes. Use this time for the exercises and a break
1032 | ---
1033 | class: inverse center middle
1034 | # Sample design object
1035 | ---
1036 | ## `tbl_svy` Object: Taylor Series Linearization
1037 | 
1038 | - `tbl_svy` object defines the sampling design or replicate weights
1039 | 
1040 | - Key information is usually found in documentation of a public use file
1041 | 
1042 | ```{r sd_tsl_syn, eval=FALSE}
1043 | as_survey_design(
1044 |   .data,
1045 |   ids = NULL,#cluster IDs/PSUs
1046 |   strata = NULL,#strata variables
1047 |   variables = NULL,#defaults to all in .data
1048 |   fpc = NULL,#variables defining the fpc
1049 |   nest = FALSE,#TRUE/FALSE - relabel clusters to nest within strata
1050 |   check_strata = !nest, #check that clusters are nested in strata
1051 |   weights = NULL,# weight variable
1052 |   ...
1053 | )
1054 | ```
1055 | 
1056 | ???
1057 | - discussing TSL first
1058 | ---
1059 | ## `tbl_svy` for Common Designs
1060 | 
1061 | ```{r sd_tsl_gen_ex, eval=FALSE}
1062 | # simple random sample (SRS)
1063 | apisrs %>% as_survey_design(fpc = fpc)
1064 | 
1065 | # stratified sample
1066 | apistrat %>% as_survey_design(strata = stype, weights = pw)
1067 | 
1068 | # one-stage cluster sample
1069 | apiclus1 %>% as_survey_design(ids = dnum, weights = pw, fpc = fpc)
1070 | 
1071 | # two-stage cluster sample, weights computed from pop size
1072 | apiclus2 %>% as_survey_design(ids = c(dnum, snum), fpc = c(fpc1, fpc2))
1073 | 
1074 | # stratified, cluster design
1075 | apistrat %>% as_survey_design(ids = dnum, strata = stype, weights =pw, nest = TRUE)
1076 | 
1077 | ```
1078 | 
1079 | - examples from `srvyr` help documentation
1080 | 
1081 | ---
1082 | ## ANES Design Object
1083 | 
1084 | .smaller[
1085 | ```{r sd_anes, eval=TRUE}
1086 | anes_des <- anes %>%
1087 |    as_survey_design(weights = Weight,
1088 |                     strata = Stratum,
1089 |                     ids = VarUnit,
1090 |                     nest = TRUE)
1091 | summary(anes_des)
1092 | ```
1093 | ]
1094 | 
1095 | ???
1096 | - Pseudo-strata (Stratum) and pseudo-cluster (VarUnit) included for variance estimation
1097 | - we renamed these variables to be more obvious but original documentation has diff var names
1098 | 
1099 | ---
1100 | ## `tbl_svy` Objects with Supplied Replicate Weights
1101 | 
1102 | - Key information is usually found in documentation of a public use file
1103 | 
1104 | ```{r sd_rep_syn, eval=FALSE}
1105 | as_survey_rep(
1106 |   .data,
1107 |   variables = NULL,#defaults to all in .data
1108 |   repweights = NULL,#variables specifying replicate weights
1109 |   weights = NULL,#variable for analysis weight
1110 |   type = c("BRR", "Fay", "JK1", "JKn", "bootstrap", "other"),
1111 |   rho = NULL,#shrinkage factor for Fay's method,
1112 |   mse = getOption("survey.replicates.mse"), #	if TRUE, compute variances based on 
1113 |   # sum of squares around the point estimate, rather than the mean of the replicates
1114 |   scale = NULL, # overall multiplier for squared deviations
1115 |   ...
1116 | )
1117 | 
1118 | ```
1119 | ---
1120 | ## RECS Design Object
1121 | 
1122 | .smaller[
1123 | ```{r sd_recs, eval=TRUE}
1124 | recs_des <- recs %>%
1125 |    as_survey_rep(weights=NWEIGHT,
1126 |                  repweights=starts_with("BRRWT"),
1127 |                  type="Fay",
1128 |                  rho=0.5,
1129 |                  mse=TRUE)
1130 | summary(recs_des)
1131 | ```
1132 | ]
1133 | 
1134 | ???
1135 | - Fay's method of BRR weight with $\epsilon=0.5$
1136 | - RECS documentation includes syntax for creating survey design object
1137 | - https://www.eia.gov/consumption/residential/data/2015/pdf/microdata_v3.pdf
1138 | 
1139 | ---
1140 | ## Create Replicate Weights: jackknife
1141 | 
1142 | - You can also start with a design object specified by the design and create replicate weights
1143 | .smaller[
1144 | ```{r sd_create_rep}
1145 | data(api)
1146 | # Build the one-stage cluster design first, then convert it to replicate weights
1147 | dclus1 <- as_survey_design(apiclus1, ids = dnum, weights = pw, fpc = fpc)
1148 | rclus1 <- dclus1 %>% as_survey_rep()
1149 | summary(rclus1)
1150 | ```
1151 | ]
1152 | ---
1153 | ## Create Replicate Weights: bootstrap
1154 | 
1155 | - You can also start with a design object specified by the design and create replicate weights
1156 | .small[
1157 | ```{r sd_create_boot}
1158 | bclus1 <- as_survey_rep(dclus1, type="bootstrap", replicates=100)
1159 | summary(bclus1)
1160 | 
1161 | ```
1162 | ]
1163 | ---
1164 | ## Create Survey Design Object for ACS
1165 | 
1166 | Fill in the blanks
1167 | - Analysis weight: PWGTP
1168 | - replicate weights: PWGTP1-PWGTP80
1169 | - jackknife with scale adjustment of 4/80
1170 | ```{r sd_acs_fib, eval=FALSE}
1171 | acs_des <- acs_pums %>%
1172 |   as_survey_rep(
1173 |     weights=___________,
1174 |     repweights=___________,
1175 |     type=___________,
1176 |     scale=_________
1177 |   )
1178 | ```
1179 | --
1180 | 
1181 | ```{r sd_acs_fib_sol, eval=FALSE}
1182 | acs_des <- acs_pums %>%
1183 |    as_survey_rep(
1184 |       weights=PWGTP,
1185 |       repweights=stringr::str_c("PWGTP", 1:80),
1186 |       type="JK1",
1187 |       scale=4/80
1188 |    )
1189 | 
1190 | ```
1191 | ---
1192 | ## Create Survey Design Object for CPS 2011 Supplement
1193 | 
1194 | Fill in the blanks
1195 | - Analysis weight: wtsupp
1196 | - replicate weights: repwtp1-repwtp160
1197 | - BRR
1198 | ```{r sd_cps_fib, eval=FALSE}
1199 | cps_des <- cps %>%
1200 |   as_survey_rep(
1201 |     weights=___________,
1202 |     repweights=___________,
1203 |     type=___________
1204 |   )
1205 | ```
1206 | --
1207 | ```{r sd_cps_fib_sol, eval=FALSE}
1208 | cps_des <- cps %>%
1209 |   as_survey_rep(
1210 |     weights=wtsupp,
1211 |     repweights=starts_with("repwtp"),
1212 |     type="BRR"
1213 |   )
1214 | ```
1215 | ---
1216 | ## Create Survey Design Object for NHANES
1217 | 
1218 | Fill in the blanks
1219 | - Analysis weight: WTINT2YR
1220 | - Variance Stratum: SDMVSTRA
1221 | - Variance Primary Sampling Unit: VPSU
1222 | ```{r sd_nhanes_fib, eval=FALSE}
1223 | nhanes_des <- nhanes %>%
1224 |   as_survey_design(
1225 |     weights=___________,
1226 |     ids=___________,
1227 |     strata=___________,
1228 |     fpc=___________
1229 |   )
1230 | ```
1231 | --
1232 | ```{r sd_nhanes_fib_sol, eval=FALSE}
1233 | nhanes_des <- nhanes %>%
1234 |   as_survey_design(
1235 |     weights=WTINT2YR,
1236 |     ids=VPSU,
1237 |     strata=SDMVSTRA,
1238 |     fpc=NULL, nest=TRUE # NHANES PSU ids repeat across strata, so nest them
1239 |   )
1240 | ```
1241 | ---
1242 | ## Create Survey Design Object for LEMAS 2016
1243 | 
1244 | Fill in the blanks
1245 | - Analysis weight: ANALYSISWEIGHT
1246 | - Variance Stratum: STRATA
1247 | - FPC: FRAMESIZE
1248 | ```{r sd_lemas_fib, eval=FALSE}
1249 | lemas_des <- lemas %>%
1250 |   as_survey_design(
1251 |     weights=___________,
1252 |     ids=___________,
1253 |     strata=___________,
1254 |     fpc=___________
1255 |   )
1256 | ```
1257 | --
1258 | 
1259 | ```{r sd_lemas_fib_sol, eval=FALSE}
1260 | lemas_des <- lemas %>%
1261 |   as_survey_design(
1262 |     weights=ANALYSISWEIGHT,
1263 |     ids=1,
1264 |     strata=STRATA,
1265 |     fpc=FRAMESIZE
1266 |   )
1267 | ```
1268 | 
1269 | ---
1270 | class: inverse center middle
1271 | # Closing
1272 | ---
1273 | ## Resources for more learning
1274 | 
1275 | - https://cran.r-project.org/web/packages/srvyr/vignettes/srvyr-vs-survey.html
1276 | 
1277 | - https://r-survey.r-forge.r-project.org/survey/ 
1278 |   - Includes more advanced modeling
1279 | 
1280 | 
1281 | ---
1282 | ## Thank You!
1283 | 
1284 | ### We hope you learned a lot in this short course!
1285 | 
1286 | Please let us know if you have any feedback on this course.  You will receive an email from AAPOR asking you to fill out a survey about this course. All feedback is welcome!
1287 | 
1288 | 
1289 | ## Questions?
1290 | 
1291 | ---
1292 | ## Sources
1293 | 
1294 | - <font size="2">The American National Election Studies (https://electionstudies.org/). These materials are based on work supported by the National Science Foundation under grant numbers SES 1444721, 2014-2017, the University of Michigan, and Stanford University.  </font>
1295 | 
1296 | - <font size="2">*Residential Energy Consumption Survey: Using the 2015 Microdata File to Compute Estimates and Standard Errors.* U.S. Department of Energy (2017) https://www.eia.gov/consumption/residential/data/2015/pdf/microdata_v3.pdf </font>
1297 | 
1298 | - <font size="2">Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer Archipelago (Antarctica) penguin data. R package version 0.1.0. https://allisonhorst.github.io/palmerpenguins/ </font>
1299 | 
1300 | - <font size="2">T. Lumley (2020) "survey: analysis of complex survey samples". R package version 4.0. https://r-survey.r-forge.r-project.org/survey/ </font>
1301 | 
1302 | - <font size="2">Greg Freedman Ellis and Ben Schneider (2020). srvyr: 'dplyr'-Like Syntax for Summary Statistics of Survey Data. R package version 1.0.0. https://CRAN.R-project.org/package=srvyr </font>
1303 | 
1304 | - <font size="2">Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.5. https://CRAN.R-project.org/package=dplyr </font>
1305 | 


--------------------------------------------------------------------------------
/Presentation/Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/Presentation/Slides.pdf


--------------------------------------------------------------------------------
/Presentation/Slides.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/Presentation/Slides.pptx


--------------------------------------------------------------------------------
/Presentation/Slides_files/figure-html/plot_sf_elbill_disp-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/Presentation/Slides_files/figure-html/plot_sf_elbill_disp-1.png


--------------------------------------------------------------------------------
/Presentation/xaringan-themer.css:
--------------------------------------------------------------------------------
  1 | /* -------------------------------------------------------
  2 |  *
  3 |  *     !! This file was generated by xaringanthemer !!
  4 |  *
  5 |  *  Changes made to this file directly will be overwritten
  6 |  *  if you used xaringanthemer in your xaringan slides Rmd
  7 |  *
  8 |  *  Issues or likes?
  9 |  *    - https://github.com/gadenbuie/xaringanthemer
 10 |  *    - https://www.garrickadenbuie.com
 11 |  *
 12 |  *  Need help? Try:
 13 |  *    - vignette(package = "xaringanthemer")
 14 |  *    - ?xaringanthemer::style_xaringan
 15 |  *    - xaringan wiki: https://github.com/yihui/xaringan/wiki
 16 |  *    - remarkjs wiki: https://github.com/gnab/remark/wiki
 17 |  *
 18 |  *  Version: 0.3.3
 19 |  *
 20 |  * ------------------------------------------------------- */
 21 | @import url(https://fonts.googleapis.com/css?family=Noto+Sans:400,400i,700,700i&display=swap);
 22 | @import url(https://fonts.googleapis.com/css?family=Cabin:600,600i&display=swap);
 23 | @import url(https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700&display=swap);
 24 | 
 25 | 
 26 | :root { /* Theme-wide custom properties; rules later in this file read them through var(). */
 27 |   /* Fonts */
 28 |   --text-font-family: 'Noto Sans';
 29 |   --text-font-is-google: 1; /* the *-is-google flags are not read by any var() in this file; presumably generator metadata (confirm) */
 30 |   --text-font-family-fallback: -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial;
 31 |   --text-font-base: sans-serif;
 32 |   --header-font-family: Cabin;
 33 |   --header-font-is-google: 1;
 34 |   --header-font-family-fallback: Georgia, serif;
 35 |   --code-font-family: 'Source Code Pro';
 36 |   --code-font-is-google: 1;
 37 |   --base-font-size: 20px; /* assigned to html's font-size, so rem units resolve against it */
 38 |   --text-font-size: 1rem; /* not read by any var() in this file; .remark-slide-content hard-codes 1rem */
 39 |   --code-font-size: 0.9rem;
 40 |   --code-inline-font-size: 1em;
 41 |   --header-h1-font-size: 2.75rem;
 42 |   --header-h2-font-size: 2.25rem;
 43 |   --header-h3-font-size: 1.75rem;
 44 | 
 45 |   /* Colors */
 46 |   --text-color: #000000;
 47 |   --header-color: #1E4F96;
 48 |   --background-color: #FFFFFF;
 49 |   --link-color: #1E4F96;
 50 |   --text-bold-color: #1E4F96;
 51 |   --code-highlight-color: rgba(255,255,0,0.5); /* yellow at 50% opacity */
 52 |   --inverse-text-color: #000000;
 53 |   --inverse-background-color: #00A3E0;
 54 |   --inverse-header-color: #FFFFFF;
 55 |   --inverse-link-color: #1E4F96;
 56 |   --title-slide-background-color: #1E4F96;
 57 |   --title-slide-text-color: #FFFFFF;
 58 |   --header-background-color: #1E4F96;
 59 |   --header-background-text-color: #FFFFFF;
 60 |   --primary: #1E4F96; /* --primary / --secondary / --white / --black back the .NAME and .bg-NAME utility classes */
 61 |   --secondary: #00A3E0;
 62 |   --white: #FFFFFF;
 63 |   --black: #000000;
 64 | }
 65 | 
 66 | html { /* Root font size; rem-based sizes elsewhere in this file scale from it. */
 67 |   font-size: var(--base-font-size);
 68 | }
 69 | 
 70 | body { /* Body text: theme font first, then the fallback stack, then the generic family. */
 71 |   font-family: var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base);
 72 |   font-weight: normal;
 73 |   color: var(--text-color);
 74 | }
 75 | h1, h2, h3 { /* Headings: their own font stack, weight 600, and the header color. */
 76 |   font-family: var(--header-font-family), var(--header-font-family-fallback);
 77 |   font-weight: 600;
 78 |   color: var(--header-color);
 79 | }
 80 | .remark-slide-content { /* Slide content box: background, text size, padding, and full width/height. */
 81 |   background-color: var(--background-color);
 82 |   font-size: 1rem; /* literal value; equals --text-font-size but is not read from it */
 83 |   padding: 16px 64px 16px 64px; /* top right bottom left */
 84 |   width: 100%;
 85 |   height: 100%;
 86 | }
 87 | .remark-slide-content h1 { /* Heading sizes inside slides come from the --header-h*-font-size properties. */
 88 |   font-size: var(--header-h1-font-size);
 89 | }
 90 | .remark-slide-content h2 {
 91 |   font-size: var(--header-h2-font-size);
 92 | }
 93 | .remark-slide-content h3 {
 94 |   font-size: var(--header-h3-font-size);
 95 | }
 96 | .remark-code, .remark-inline-code { /* Code font: theme code font, then a monospace fallback list. */
 97 |   font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
 98 | }
 99 | .remark-code {
100 |   font-size: var(--code-font-size);
101 | }
102 | .remark-inline-code {
103 |   font-size: var(--code-inline-font-size);
104 |   color: #1E4F96; /* literal; same value as --primary */
105 | }
106 | .remark-slide-number {
107 |   color: #1E4F96; /* literal; same value as --primary */
108 |   opacity: 1;
109 |   font-size: 0.9em;
110 | }
111 | strong { color: var(--text-bold-color); } /* bold text takes the theme's bold color */
112 | a, a > code { /* Links, and code directly inside a link: link color, no underline. */
113 |   color: var(--link-color);
114 |   text-decoration: none;
115 | }
116 | .footnote { /* Absolutely positioned 60px above the bottom edge of its containing block. */
117 |   position: absolute;
118 |   bottom: 60px;
119 |   padding-right: 4em;
120 |   font-size: 0.9em;
121 | }
122 | .remark-code-line-highlighted {
123 |   background-color: var(--code-highlight-color);
124 | }
125 | .inverse { /* Inverse variant: swaps in the --inverse-* background and text colors. */
126 |   background-color: var(--inverse-background-color);
127 |   color: var(--inverse-text-color);
128 |   
129 | }
130 | .inverse h1, .inverse h2, .inverse h3 {
131 |   color: var(--inverse-header-color);
132 | }
133 | .inverse a, .inverse a > code {
134 |   color: var(--inverse-link-color);
135 | }
136 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { /* Title slide: one text color for body and headings. */
137 |   color: var(--title-slide-text-color);
138 | }
139 | .title-slide {
140 |   background-color: var(--title-slide-background-color);
141 | }
142 | .title-slide .remark-slide-number { /* no slide number inside .title-slide */
143 |   display: none;
144 | }
145 | /* Two-column layout: .left-column (20% wide, floated left) beside .right-column (75% wide, floated right) */
146 | .left-column {
147 |   width: 20%;
148 |   height: 92%;
149 |   float: left;
150 | }
151 | .left-column h2, .left-column h3 {
152 |   color: #1E4F9699; /* 8-digit hex: #1E4F96 with alpha 0x99 (60%) */
153 | }
154 | .left-column h2:last-of-type, .left-column h3:last-child { /* the last h2 among its siblings, or an h3 that is the last child: full-opacity color */
155 |   color: #1E4F96;
156 | }
157 | .right-column {
158 |   width: 75%;
159 |   float: right;
160 |   padding-top: 1em;
161 | }
162 | .pull-left { /* .pull-left / .pull-right: two floated columns of 47% width each */
163 |   float: left;
164 |   width: 47%;
165 | }
166 | .pull-right {
167 |   float: right;
168 |   width: 47%;
169 | }
170 | .pull-right + * { /* the element immediately after .pull-right clears both floats */
171 |   clear: both;
172 | }
173 | img, video, iframe { /* embedded media never exceed their container's width */
174 |   max-width: 100%;
175 | }
176 | blockquote {
177 |   border-left: solid 5px #00A3E080; /* 8-digit hex: #00A3E0 with alpha 0x80 (about 50%) */
178 |   padding-left: 1em;
179 | }
180 | .remark-slide table { /* Slide tables: auto margins, with a rule above and below. */
181 |   margin: auto;
182 |   border-top: 1px solid #666;
183 |   border-bottom: 1px solid #666;
184 | }
185 | .remark-slide table thead th { /* lighter rule under the header cells */
186 |   border-bottom: 1px solid #ddd;
187 | }
188 | th, td {
189 |   padding: 5px;
190 | }
191 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { /* shaded header, footer, and even rows */
192 |   background: #CCECF8;
193 | }
194 | table.dataTable tbody { /* Overrides for .dataTable / .dataTables_* markup so it follows the theme colors. */
195 |   background-color: var(--background-color);
196 |   color: var(--text-color);
197 | }
198 | table.dataTable.display tbody tr.odd {
199 |   background-color: var(--background-color);
200 | }
201 | table.dataTable.display tbody tr.even { /* same shade as the even rows of plain slide tables */
202 |   background-color: #CCECF8;
203 | }
204 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { /* hovered row: white at 50% opacity */
205 |   background-color: rgba(255, 255, 255, 0.5);
206 | }
207 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate {
208 |   color: var(--text-color);
209 | }
210 | .dataTables_wrapper .dataTables_paginate .paginate_button {
211 |   color: var(--text-color) !important; /* NOTE(review): !important presumably needed to beat the widget's own stylesheet; confirm */
212 | }
213 | 
214 | /* Slide Header Background for h1 elements: a direct-child h1 of a .header_background slide becomes a full-width colored bar */
215 | .remark-slide-content.header_background > h1 {
216 |   display: block;
217 |   position: absolute; /* top/left 0 and width 100% are relative to the containing block */
218 |   top: 0;
219 |   left: 0;
220 |   width: 100%;
221 |   background: var(--header-background-color);
222 |   color: var(--header-background-text-color);
223 |   padding: 2rem 64px 1.5rem 64px; /* 64px side padding matches .remark-slide-content */
224 |   margin-top: 0;
225 |   box-sizing: border-box; /* keeps the padding inside the 100% width */
226 | }
227 | .remark-slide-content.header_background {
228 |   padding-top: 7rem; /* presumably reserves room for the absolutely positioned h1 bar, which takes no space in normal flow */
229 | }
230 | 
231 | @page { margin: 0; } /* no page margins for paged (print) output */
232 | @media print { /* When printing: force .remark-slide-scaler to full size, unscaled, at the top-left origin. */
233 |   .remark-slide-scaler {
234 |     width: 100% !important; /* NOTE(review): !important presumably overrides sizing applied to the scaler at runtime; confirm */
235 |     height: 100% !important;
236 |     transform: scale(1) !important;
237 |     top: 0 !important;
238 |     left: 0 !important;
239 |   }
240 | }
241 | 
242 | .primary { /* Color utilities: .NAME sets the text color and .bg-NAME the background, each from the custom property --NAME. */
243 |   color: var(--primary);
244 | }
245 | .bg-primary {
246 |   background-color: var(--primary);
247 | }
248 | .secondary {
249 |   color: var(--secondary);
250 | }
251 | .bg-secondary {
252 |   background-color: var(--secondary);
253 | }
254 | .white {
255 |   color: var(--white);
256 | }
257 | .bg-white {
258 |   background-color: var(--white);
259 | }
260 | .black {
261 |   color: var(--black);
262 | }
263 | .bg-black {
264 |   background-color: var(--black);
265 | }
266 | 
267 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | A new version of this course is at: https://github.com/tidy-survey-r/tidy-survey-short-course
 2 | 
 3 | # Tidy Survey Analysis in R using the srvyr Package
 4 | Materials for the [AAPOR short course](https://www.aapor.org/Conference-Events/Annual-Meeting/Short-Courses.aspx) on Tidy Survey Analysis in R using the `srvyr` Package, held in May 2021
 5 | 
 6 | - **RawData** folder contains public use file data along with any documentation
 7 |    - American National Election Studies, 2016
 8 |    - Residential Energy Consumption Survey, 2015
 9 | - **DataCleaningScripts** folder contains scripts for making public use files analysis ready
10 |    - Creates derived variables
11 |    - Renames some variables
12 |    - Selects fewer variables just for examples
13 | - **Data** folder contains data files ready for analysis in presentation and examples
14 | - **Presentation** folder contains the slides for the course
15 |    - Includes Rmd to create slides
16 |    - Slides are available in HTML, PPTX, R, and PDF
17 | - **Exercises** folder contains Rmd and R files with exercises and solutions to practice concepts
18 | 
19 | ## Sources
20 | 
21 | - The American National Election Studies (https://electionstudies.org/). These materials are based on work supported by the National Science Foundation under grant numbers SES 1444721, 2014-2017, the University of Michigan, and Stanford University. 
22 | 
23 | - *Residential Energy Consumption Survey: Using the 2015 Microdata File to Compute Estimates and Standard Errors.* U.S. Department of Energy (2017) https://www.eia.gov/consumption/residential/data/2015/pdf/microdata_v3.pdf
24 | 
25 | - Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer Archipelago (Antarctica) penguin data. R package version 0.1.0. https://allisonhorst.github.io/palmerpenguins/
26 | 
27 | - T. Lumley (2020) "survey: analysis of complex survey samples". R package version 4.0. https://r-survey.r-forge.r-project.org/survey/
28 | 
29 | - Greg Freedman Ellis and Ben Schneider (2020). srvyr: 'dplyr'-Like Syntax for Summary Statistics of Survey Data. R package version 1.0.0. https://CRAN.R-project.org/package=srvyr
30 | 
31 | - Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.5. https://CRAN.R-project.org/package=dplyr
32 | 


--------------------------------------------------------------------------------
/RawData/ANES_2016/anes_timeseries_2016.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/ANES_2016/anes_timeseries_2016.sav


--------------------------------------------------------------------------------
/RawData/ANES_2016/anes_timeseries_2016_qnaire_post.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/ANES_2016/anes_timeseries_2016_qnaire_post.pdf


--------------------------------------------------------------------------------
/RawData/ANES_2016/anes_timeseries_2016_qnaire_pre.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/ANES_2016/anes_timeseries_2016_qnaire_pre.pdf


--------------------------------------------------------------------------------
/RawData/ANES_2016/anes_timeseries_2016_userguidecodebook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/ANES_2016/anes_timeseries_2016_userguidecodebook.pdf


--------------------------------------------------------------------------------
/RawData/RECS_2015/2020_RECS-457A.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/RECS_2015/2020_RECS-457A.pdf


--------------------------------------------------------------------------------
/RawData/RECS_2015/README.md:
--------------------------------------------------------------------------------
1 | # Residential Energy Consumption Survey (RECS) 2015
2 | 
3 | All data and resources were downloaded from https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata on March 3, 2021.


--------------------------------------------------------------------------------
/RawData/RECS_2015/codebook_publicv4.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/RECS_2015/codebook_publicv4.xlsx


--------------------------------------------------------------------------------
/RawData/RECS_2015/microdata_v3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szimmer/tidy-survey-aapor-2021/5a82c36eb619bf67ccec2593a770904eedcd1d18/RawData/RECS_2015/microdata_v3.pdf


--------------------------------------------------------------------------------
/tidy-survey-short-course.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 3
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------