├── .DS_Store
├── .Rhistory
├── .gitignore
├── Census-Data-in-R.Rproj
├── LICENSE
├── Lessons
    ├── Census-Data-in-R-Challenges.Rmd
    ├── Census-Data-in-R-Challenges.html
    ├── Census-Data-in-R-Slides.Rmd
    ├── Census-Data-in-R-Slides.html
    ├── Census-Data-in-R.Rmd
    └── Census-Data-in-R.html
├── README.md
├── Solutions
    ├── Census-Data-in-R-Solutions.Rmd
    └── Census-Data-in-R-Solutions.html
├── data
    ├── .DS_Store
    ├── .Rapp.history
    ├── census2010_vars.png
    ├── census_data_by_prod_geo.png
    ├── census_geo_hierarchy.png
    ├── census_geodata.png
    ├── census_page.png
    ├── cenvar_lookup.csv
    ├── mapview_example.png
    ├── request_api_key.png
    ├── swd.png
    └── tidycensus_articles.png
├── data_out
    └── readme_data_out.txt
├── install.R
├── previous_versions
    ├── Rcensus_data_maps-slides.Rmd
    ├── Rcensus_data_maps-slides.html
    ├── Rcensus_data_maps-tutorial.Rmd
    ├── Rcensus_data_maps-tutorial.html
    └── snippets_to_save_for_later.Rmd
└── runtime.txt


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/.DS_Store


--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
  1 | year = 2010,
  2 | geometry=TRUE,
  3 | shift_geo=TRUE)
  4 | ## Shift Happens!
  5 | plot(pop2010geo_shifted$geometry)
  6 | st_write(pop2010geo_shifted, here("data_out/usa_pop2010_shifted.shp"))
  7 | # Check to see if the data was written out to a shapefile
  8 | dir(here("data_out"))
  9 | ca_med_age <- get_decennial(geography = "county",
 10 | variables = "P013001",
 11 | year = 2010,
 12 | state='CA',
 13 | geometry=TRUE)
 14 | # map it with plot
 15 | plot(ca_med_age['value'])
 16 | # map it with ggplot - setting CRS to 3310
 17 | ca_med_age %>%
 18 | ggplot(aes(fill = value)) +
 19 | geom_sf(color=NA) +
 20 | coord_sf(crs = 3310) +
 21 | scale_fill_viridis_c(option = "viridis")
 22 | vars_acs2019 <- load_variables(year=2019,      # end year 2016-2020 period
 23 | dataset = 'acs5', # the ACS data product
 24 | cache = T) # Save locally for future access
 25 | # how many variables?
 26 | dim(vars_acs2019)
 27 | vars_acs2019 <- load_variables(year=2020,      # end year 2016-2020 period
 28 | dataset = 'acs5', # the ACS data product
 29 | cache = T) # Save locally for future access
 30 | # how many variables?
 31 | dim(vars_acs2019)
 32 | alco_mhhincome2 <- get_acs(geography='tract',
 33 | variables=c(median_hhincome = "B19013_001"),
 34 | year = 2015,
 35 | state='CA',
 36 | county='Alameda',
 37 | geometry=TRUE
 38 | )
 39 | plot(alco_mhhincome['estimate'])
 40 | plot(alco_mhhincome2['estimate'])
 41 | plot(alco_mhhincome['estimate'])
 42 | plot(alco_mhhincome2['estimate'])
 43 | ```
 44 | plot(alco_mhhincome['estimate'])
 45 | plot(alco_mhhincome2['estimate'])
 46 | ```
 47 | alco_mhhincome2 <- get_acs(geography='tract',
 48 | variables=c(median_hhincome = "B19013_001"),
 49 | year = 2015,
 50 | state='CA',
 51 | county='Alameda',
 52 | geometry=TRUE
 53 | )
 54 | plot(alco_mhhincome2['estimate'])
 55 | ```
 56 | vars_acs2015 <- load_variables(year=2015,      # end year 2016-2020 period
 57 | dataset = 'acs5', # the ACS data product
 58 | cache = T) # Save locally for future access
 59 | View(vars_acs2015)
 60 | alco_mhhincome2 <- get_acs(geography='tract',
 61 | variables=c(median_hhincome = "B19019_001"),
 62 | year = 2015,
 63 | state='CA',
 64 | county='Alameda',
 65 | geometry=TRUE
 66 | )
 67 | ```
 68 | alco_mhhincome2 <- get_acs(geography='tract',
 69 | variables=c(median_hhincome = "B19019_001"),
 70 | year = 2015,
 71 | state='CA',
 72 | county='Alameda',
 73 | geometry=TRUE
 74 | )
 75 | ```
 76 | B19013_001
 77 | alco_mhhincome2 <- get_acs(geography='tract',
 78 | variables=c(median_hhincome = "B19013_001"),
 79 | year = 2015,
 80 | state='CA',
 81 | county='Alameda',
 82 | geometry=TRUE
 83 | )
 84 | alco_mhhincome2 <- get_acs(geography='tract',
 85 | variables=c(median_hhincome = "B19019_001"),
 86 | year = 2015,
 87 | state='CA',
 88 | county='Alameda',
 89 | geometry=TRUE
 90 | )
 91 | plot(alco_mhhincome['estimate'])
 92 | alco_mhhincome <- get_acs(geography='tract',
 93 | variables=c(median_hhincome = "B19013_001"),
 94 | year = 2020,
 95 | state='CA',
 96 | county='Alameda',
 97 | geometry=TRUE
 98 | )
 99 | plot(alco_mhhincome['estimate'])
100 | plot(alco_mhhincome2['estimate'])
101 | ```
102 | plot(alco_mhhincome['estimate'])
103 | plot(alco_mhhincome2['estimate'])
104 | ```
105 | plot(alco_mhhincome['estimate'])
106 | plot(alco_mhhincome2['estimate'])
107 | ```
108 | plot(alco_mhhincome['estimate'])
109 | plot(alco_mhhincome2['estimate'])
110 | ```
111 | alco_mhhincome2015 <- get_acs(geography='tract',
112 | variables=c(median_hhincome = "B19019_001"),
113 | year = 2015,
114 | state='CA',
115 | county='Alameda',
116 | geometry=TRUE
117 | )
118 | plot(alco_mhhincome2015['estimate'])
119 | head(alco_mhhincome)
120 | plot(alco_mhhincome['estimate'])
121 | alco_mhhincome2015 <- get_acs(geography='tract',
122 | variables=c(median_hhincome = "B19019_001"),
123 | year = 2015,
124 | state='CA',
125 | county='Alameda',
126 | geometry=TRUE
127 | )
128 | plot(alco_mhhincome2015['estimate'])
129 | alco_mhhincome <- get_acs(geography='tract',
130 | variables=c(median_hhincome = "B19013_001"),
131 | year = 2020,
132 | state='CA',
133 | county='San Francisco',
134 | geometry=TRUE
135 | )
136 | head(alco_mhhincome)
137 | plot(alco_mhhincome['estimate'])
138 | mapview(alco_mhhincome['estimate'])
139 | alco_mhhincome <- get_acs(geography='tract',
140 | variables=c(median_hhincome = "B19013_001", pop20='P001001'),
141 | year = 2020,
142 | state='CA',
143 | county='San Francisco',
144 | geometry=TRUE
145 | )
146 | mapview(alco_mhhincome['estimate'])
147 | View(vars_acs2019)
148 | alco_mhhincome <- get_acs(geography='tract',
149 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
150 | year = 2020,
151 | state='CA',
152 | county='San Francisco',
153 | geometry=TRUE
154 | )
155 | mapview(alco_mhhincome['estimate'])
156 | mapview(alco_mhhincome)
157 | head(alco_mhhincome)
158 | alco_mhhincome <- get_acs(geography='tract',
159 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
160 | year = 2020,
161 | state='CA',
162 | county='San Francisco',
163 | geometry=TRUE,
164 | wide=T
165 | )
166 | head(alco_mhhincome)
167 | alco_mhhincome <- get_acs(geography='tract',
168 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
169 | year = 2020,
170 | state='CA',
171 | county='San Francisco',
172 | geometry=TRUE,
173 | output="wide"
174 | )
175 | head(alco_mhhincome)
176 | mapview(alco_mhhincome)
177 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'])
178 | mapview(alco_mhhincome[alco_mhhincome['median_hhincomeE'].isna(),])
179 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
180 | What is the variable?
181 | ```{r}
182 | mapview(alco_mhhincome)
183 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
184 | What is the variable?
185 | ```{r}
186 | mapview(alco_mhhincome)
187 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
188 | What is the variable?
189 | ```{r}
190 | mapview(alco_mhhincome)
191 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
192 | What is the variable?
193 | ```{r}
194 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = NA)
195 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
196 | What is the variable?
197 | ```{r}
198 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = black)
199 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
200 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = black)
201 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black")
202 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome)
203 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, col.regions="white")
204 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20", col.regions="white")
205 | alco_mhhincome <- get_acs(geography='tract',
206 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
207 | year = 2020,
208 | state='CA',
209 | county='San Francisco',
210 | geometry=TRUE,
211 | output="wide"
212 | )
213 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20", col.regions="white")
214 | What is the variable?
215 | ```{r}
216 | #mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20")
217 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),])
218 | mapview(alco_mhhincome, zcol="pop20")
219 | head(alco_mhhincome)
220 | mapview(alco_mhhincome, zcol="pop20E")
221 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20E", col.regions="white")
222 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow") + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue")
223 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow") + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5)
224 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow",alpha.regions) + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5)
225 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5)
226 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "yellow",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="lightblue", alpha.regions=0.5)
227 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.5)
228 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.75) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.5)
229 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.75) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.25)
230 | alco_mhhincome <- get_acs(geography='tract',
231 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
232 | year = 2020,
233 | state='CA',
234 | county='San Francisco',
235 | geometry=TRUE, # get the geography too
236 | output="wide"
237 | )
238 | head(alco_mhhincome)
239 | alco_mhhincome <- get_acs(geography='tract',
240 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'),
241 | year = 2020,
242 | state='CA',
243 | county='San Francisco',
244 | geometry=TRUE # get the geography too
245 | )
246 | head(alco_mhhincome)
247 | alco_mhhincome <- get_acs(geography='tract',
248 | variables="B19013_001",
249 | year = 2020,
250 | state='CA',
251 | county='San Francisco',
252 | geometry=TRUE # get the geography too
253 | )
254 | head(alco_mhhincome)
255 | alco_mhhincome <- get_acs(geography='tract',
256 | variables="B19013_001",
257 | year = 2020,
258 | state='CA',
259 | county='San Francisco',
260 | geometry=TRUE, # get the geography too
261 | output="wide"
262 | )
263 | head(alco_mhhincome)
264 | ```{r}
265 | plot(alco_mhhincome['estimate'])
266 | alco_mhhincome <- get_acs(geography='tract',
267 | variables="B19013_001",
268 | year = 2020,
269 | state='CA',
270 | county='San Francisco',
271 | geometry=TRUE, # get the geography too
272 | output="wide"
273 | )
274 | ```{r}
275 | plot(alco_mhhincome['estimate'])
276 | alco_mhhincome <- get_acs(geography='tract',
277 | variables="B19013_001",
278 | year = 2020,
279 | state='CA',
280 | county='San Francisco',
281 | geometry=TRUE # get the geography too
282 | )
283 | ```{r}
284 | head(alco_mhhincome)
285 | ```
286 | ```{r}
287 | plot(alco_mhhincome['estimate'])
288 | alco_mhhincome['estimate'] %>% select(estimate != NA) %>% plot()
289 | ```
290 | alco_mhhincome['estimate'] %>% select(estimate != NA) %>% plot(estimate)
291 | ```
292 | alco_mhhincome['estimate'] %>% filter(na.rm(estimate)) %>% plot(estimate)
293 | ```
294 | alco_mhhincome %>% filter(estimate, na.rm= TRUE %>% plot(estimate)
295 | ```
296 | alco_mhhincome %>% filter(estimate, na.rm= TRUE) %>% plot(estimate)
297 | ```
298 | alco_mhhincome %>% filter(estimate, na.rm= TRUE) #%>% plot(estimate)
299 | ```
300 | alco_mhhincome %>% filter(! is.na(estimate)) #%>% plot(estimate)
301 | ```
302 | alco_mhhincome %>% filter(! is.na(estimate)) #%>% plot(estimate)
303 | ```
304 | alco_mhhincome %>% filter(! is.na(estimate)) %>% plot(estimate)
305 | ```
306 | alco_mhhincome %>% filter(! is.na(estimate)) %>% plot(alco_mhhincome['estimate'])
307 | ```
308 | ```{r}
309 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate'])
310 | plot(alco_mhhincome['estimate'])
311 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate'])
312 | plot(alco_mhhincome['estimate'])
313 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate'])
314 | plot(alco_mhhincome['estimate'])
315 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate'])
316 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),][,'estimate'])
317 | plot(alco_mhhincome['estimate'])
318 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),][,'estimate'])
319 | ggplot(alco_mhhincome, aes(fill = estimate)) +
320 | geom_sf()
321 | alco_mhhincome %>% filter(! is.na(estimate)) %>%
322 | ggplot(alco_mhhincome, aes(fill = estimate)) +
323 | geom_sf()
324 | ggplot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),], aes(fill = estimate)) +
325 | geom_sf()
326 | ggplot(alco_mhhincome, aes(fill = estimate)) +
327 | geom_sf() +
328 | xlim(122.5, 122.35)
329 | ggplot(alco_mhhincome, aes(fill = estimate)) +
330 | geom_sf() +
331 | xlim(122.25, 122.35)
332 | ggplot(alco_mhhincome, aes(fill = estimate)) +
333 | geom_sf() +
334 | xlim(122.75, 122.35)
335 | ggplot(alco_mhhincome, aes(fill = estimate)) +
336 | geom_sf() +
337 | xlim(122.55, 122.35)
338 | alco_mhhincome <- get_acs(geography='tract',
339 | variables="B19013_001",
340 | year = 2020,
341 | state='CA',
342 | county='Alameda',
343 | geometry=TRUE # get the geography too
344 | )
345 | ```{r}
346 | head(alco_mhhincome)
347 | ```
348 | plot(alco_mhhincome['estimate'])
349 | ggplot(alco_mhhincome, aes(fill = estimate)) +
350 | geom_sf()
351 | ggplot(alco_mhhincome, aes(fill = estimate)) +
352 | geom_sf()
353 | ```{r}
354 | plot(med_hhincome['estimate'])
355 | med_hhincome <- get_acs(geography='tract',
356 | variables="B19013_001",
357 | year = 2020,
358 | state='CA',
359 | county='San Francisco',
360 | geometry=TRUE # get the geography too
361 | )
362 | ```{r}
363 | head(med_hhincome)
364 | ```
365 | ```{r}
366 | plot(med_hhincome['estimate'])
367 | ggplot(med_hhincome, aes(fill = estimate)) +
368 | geom_sf()
369 | ggplot(med_hhincome, aes(fill = estimate)) +
370 | geom_sf() +
371 | xlim(-122.55, -122.3)
372 | inc_by_race <- c(White = "B19013H_001",
373 | Black = "B19013B_001",
374 | Asian = "B19013D_001",
375 | Hispanic = "B19013I_001" )
376 | alco_mhhinc_by_race <- get_acs(geography='tract',
377 | variables=inc_by_race,
378 | year = 2019,
379 | state='CA',
380 | county='Alameda',
381 | geometry=T )
382 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
383 | ggplot(aes(fill = estimate)) +
384 | facet_wrap(~variable) +
385 | geom_sf(color=NA) +
386 | scale_fill_viridis_c(option="magma")
387 | # Display the map
388 | medhhinc_facet_map
389 | #
390 | ```
391 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
392 | ggplot(aes(fill = estimate)) +
393 | facet_wrap(~variable) +
394 | geom_sf()
395 | #geom_sf(color=NA) +
396 | scale_fill_viridis_c(option="magma")
397 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
398 | ggplot(aes(fill = estimate)) +
399 | facet_wrap(~variable) +
400 | geom_sf() +
401 | #geom_sf(color=NA) +
402 | scale_fill_viridis_c(option="magma")
403 | # Display the map
404 | medhhinc_facet_map
405 | #
406 | ```
407 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
408 | ggplot(aes(fill = estimate)) +
409 | facet_wrap(~variable) +
410 | geom_sf(color=NA) +   # why color=NA?
411 | scale_fill_viridis_c(option="magma")
412 | # Display the map
413 | medhhinc_facet_map
414 | #
415 | ```
416 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
417 | ggplot(aes(fill = estimate)) +
418 | facet_wrap(~variable) +
419 | geom_sf(color=NA) +   # why color=NA?
420 | scale_fill_viridis_c(option="magma")
421 | # Display the map
422 | medhhinc_facet_map
423 | #
424 | ```
425 | # Create the map
426 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
427 | ggplot(aes(fill = estimate)) +
428 | facet_wrap(~variable) +
429 | geom_sf(color=NA) +   # why color=NA?
430 | scale_fill_viridis_c(option="plasma")
431 | # Display the map
432 | medhhinc_facet_map
433 | # Create the map
434 | medhhinc_facet_map <- alco_mhhinc_by_race %>%
435 | ggplot(aes(fill = estimate)) +
436 | facet_wrap(~variable) +
437 | geom_sf(color=NA) +   # why color=NA?
438 | scale_fill_viridis_c(option="magma")
439 | # Display the map
440 | medhhinc_facet_map
441 | mapview(med_hhincome)
442 | mapview(med_hhincome, zcol="estimate")
443 | mapview(med_hhincome)
444 | # Create a color palette
445 | pal <- colorNumeric(
446 | palette = "YlOrRd",
447 | domain = med_hhincome$estimate
448 | )
449 | # specify dataset
450 | leaflet(med_hhincome) %>%
451 | addProviderTiles(providers$CartoDB.Positron) %>%
452 | # adjust color palette and ploygon features.
453 | addPolygons(stroke = FALSE, smoothFactor = 0.2, fillOpacity = .5,
454 | color = ~pal(estimate)) %>%
455 | # add legend
456 | addLegend(pal = pal, values = ~estimate,
457 | title = "Median Household Income",
458 | labFormat = labelFormat(prefix = "$"),
459 | position = "bottomleft")
460 | sf_rented <- get_decennial(geography = "tract",  # census tabulation unit
461 | variables =  "H004004", #number of households rented
462 | year = 2010,
463 | summary_var = "H004001",  # Total households
464 | state='CA',
465 | county='San Francisco',
466 | geometry=T)
467 | sf_pct_rented <- sf_rented[sf_rented$value > 0,] %>%
468 | mutate(pct = 100 * (value / summary_value))
469 | # Take a look
470 | head(sf_pct_rented)
471 | ### Map the result
472 | ```{r, eval=F}
473 | plot(sf_pct_rented['pct'])
474 | ```
475 | sf_medrent <- get_acs(geography="tract",
476 | variables=c(median_rent2019="B25064_001"),
477 | year =2019,
478 | state="CA",
479 | county=c("San Francisco"),
480 | geometry=T)
481 | plot(sf_medrent[!is.na(sf_medrent$estimate),]['estimate'])
482 | sf_medrent %>%
483 | drop_na(estimate)  %>%
484 | ggplot(aes(fill = estimate)) +
485 | geom_sf(color=NA) +
486 | coord_sf(crs = 26910) +  # CRS for Northern CA - UTM 10
487 | scale_fill_viridis_c(option = "magma")
488 | mapview(sf_medrent)
489 | mapview(sdf_medrent, zcol='estimate')
490 | ```
491 | mapview(sdf_medrent, zcol='estimate')
492 | ```
493 | mapview(sf_medrent, zcol='estimate')
494 | ```
495 | View(sf_medrent)
496 | mapview(sf_medrent, zcol='moe')
497 | ?get_acs
498 | ?write_csv
499 | write_csv(state_pop, here('data_out/state_pop_2010.csv' )
500 | )
501 | ?write_csv
502 | write.csv(state_pop, here('data_out/state_pop_2010.csv') )
503 | write_csv(state_pop, here('data_out/state_pop_2010.csv') )
504 | # Uncomment this to install packages, if necessary.
505 | # install.packages(c("here", "tidyverse", "sf", "leaflet", "mapview", "tigris", "tidycensus"))
506 | library(here)
507 | library(tidyverse)
508 | library(sf)
509 | library(leaflet)
510 | library(mapview)
511 | library(tigris)
512 | library(tidycensus)
513 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/Census-Data-in-R.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: XeLaTeX
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 dlab-geo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Lessons/Census-Data-in-R-Challenges.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "R-Census-Data-Maps-Challenges.Rmd"
  3 | author: "Avery Richards"
  4 | date: "3/21/2022"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)  # default for every chunk: show its source code in the knitted output
 10 | ```
 11 | 
 12 | ```{r}
 13 | 
 14 | # Install pacman (package management) only when it is not already available.
 15 | if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
 16 | 
 17 | pacman::p_load(
 18 |   here, # locate files 
 19 |   tidyverse, # data wrangling
 20 |   sf,  # geospatial data management
 21 |   leaflet, mapview, # interactive mapping (mapview is used in Challenge 7)
 22 |   tigris, tidycensus  # census data 
 23 | )
 24 | 
 25 | ```
 26 | 
 27 | These seven packages should be loaded in your environment now. 
 28 | 
 29 | ```{r}
 30 | 
 31 | # If you run this chunk, output from the "here" function should be visible below. This is your local directory path. We can use this to import files later on. 
 32 | here()
 33 | 
 34 | ```
 35 | 
 36 | *Solutions are available in the Solutions folder, as needed.* 
 37 | 
 38 | ### Challenge 1
 39 | 
 40 | > Use the `get_decennial` function like we've seen above, but fill in the code arguments to fetch total population in 2010 just for California. 
 41 | 
 42 | ```{r, challenge_1_code_A }
 43 | 
 44 | # add your code here. 
 45 | 
 46 |         
 47 | ```
 48 | 
 49 | Alter the above code to fetch total pop in 2010 for CA, TX and FL
 50 | ```{r, challenge_1_code_B }
 51 | 
 52 | # add your code here. 
 53 | 
 54 | ```
 55 | 
 56 | 
 57 | ### Challenge 2
 58 | 
 59 | > You can also filter tidycensus results by `county`. Alter the code below to fetch 2010 population for Alameda & San Francisco Counties.
 60 | 
 61 | 
 62 | ```{r}
 63 | 
 64 | get_decennial(geography = "county",  # tabulate results per county
 65 |               variables = "P001001", # census variable code(s) to request
 66 |               year = 2010,           # which decennial census
 67 |               state = "CA",          # keep only California
 68 |               county = "Alameda")    # keep only Alameda County
 69 | ```
 70 | 
 71 | 
 72 | 
 73 | ```{r}
 74 | 
 75 | # add your code here. 
 76 | 
 77 | ```
 78 | 
 79 | What was the total population in the US in 2010?
 80 | ```{r, challenge_2_code_B }
 81 | 
 82 | # add your code here. 
 83 | 
 84 | ```
 85 | 
 86 | What census region had the largest population in the US in 2010?
 87 | ```{r, challenge_2_code_C }
 88 | 
 89 | # add your code here. 
 90 | 
 91 | ```
 92 | 
 93 | ## Challenge 3
 94 | 
 95 | > Use the `get_decennial` function to fetch an `Avg Family Size` variable by CA County in `2010`, and save the result as a dataframe named `ca_fam_size`. Once you've done that, plot the dataframe with the `ggplot` call below. __Hint: "P037001"__ 
 96 | 
 97 | ```{r, challenge_3A}
 98 | 
 99 | # add your code here to create a ca_fam_size dataframe with get_decennial.
100 | 
101 | ```
102 | 
103 | 
104 | 
105 | ```{r, challenge_3B}
106 | 
107 | # uncomment and plot from the ca_fam_size dataframe.
108 | # ca_fam_size %>%
109 | #   ggplot(aes(x = value, 
110 | #              y = reorder(NAME, value))) + 
111 | #       geom_point()
112 | 
113 | ```
114 | 
115 | ## Challenge 4
116 | 
117 | > Repeat the previous challenge with data from the `2000` decennial census. Don't assume variable names are the same across the 2000 and 2010 censuses.
118 | 
119 | - Use `load_variables` to check!
120 | 
121 | 
122 | ```{r, challenge_4}
123 | # Add your code below
124 | 
125 | 
126 | ```
127 | 
128 | 
129 | ## Challenge 5
130 | 
131 | Create a `map` of CA Median Age by county in 2010. 
132 | 
133 | ```{r, challenge_5A}
134 | 
135 | # Add your code to fetch the data for the map. 
136 | 
137 | ```
138 | 
139 | 
140 | ```{r, challenge_5B}
141 | 
142 | # Map the data with the plot function
143 | 
144 | 
145 | ```
146 | 
147 | 
148 | ```{r, challenge_5C}
149 | 
150 | # Map the data with ggplot, setting the CRS to 3310 (CA Albers), the preferred default CRS for statewide maps of CA.
151 | 
152 | ```
153 | 
154 | 
155 | ## Challenge 6
156 | 
157 | > Make a ggplot map of MEDIAN GROSS RENT (`"B25064_001"`) in San Francisco County by tract
158 | using data from the ACS 2016—2020 5-year dataset 
159 | 
160 | 
161 | ```{r, challenge_6}
162 | 
163 | # Add your code here.
164 | 
165 | ```
166 | 
167 | ## Challenge 7
168 | 
169 | > Use `mapview` to create an interactive map of SF median gross rent (the data from Challenge 6).
170 | 
171 | ```{r, challenge_7}
172 | 
173 | # Use mapview to create an interactive map of median household rent (from challenge 6).
174 | # Your code here
175 | 
176 | ```
177 | 
178 | > *Check Census-Data-in-R-Solutions.Rmd for answers, as needed.* 
179 | 


--------------------------------------------------------------------------------
/Lessons/Census-Data-in-R-Slides.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Census Data Wrangling and Mapping in R"
  3 | author: "Patty Frontiera, Irene Farah, Avery Richards"
  4 | date: "4/1/2022"
  5 | output: 
  6 |   ioslides_presentation
  7 | editor_options: 
  8 |   chunk_output_type: console
  9 | ---
 10 | 
 11 | ```{r setup, include=FALSE}
 12 | knitr::opts_chunk$set(echo = TRUE)  # default for every chunk: show its source code in the rendered slides
 13 | 
 14 | ```
 15 | 
 16 | # Getting Started
 17 | 
 18 | ## Setup
 19 | 
 20 | Welcome! While we're waiting:
 21 | 
 22 | -   Navigate to the workshop webpage: <https://github.com/dlab-berkeley/Census-Data-in-R>
 23 | 
 24 | -   Scroll down and read the `Readme` section.
 25 | 
 26 | -   **Clone or download** the workshop files by clicking on the green `CODE` button.
 27 | 
 28 |     -   If you download the zipfile, **unzip it**.
 29 | 
 30 |     -   Make a note of the folder in which the workshop files reside.
 31 | 
 32 | ## Introduction
 33 | 
 34 | -   About me
 35 | 
 36 | -   About you
 37 | 
 38 |     -   Your familiarity with US Census data
 39 |     -   With geospatial data
 40 |     -   With geospatial data in R
 41 | 
 42 | ## Outline
 43 | 
 44 | -   **Brief** overview of the primary US Census data products
 45 | 
 46 | -   Introduce R packages for working with census data
 47 | 
 48 | -   Use those packages to fetch census data
 49 | 
 50 | -   Use those packages to fetch census data plus census geographic boundary files
 51 | 
 52 | -   Make maps of census data
 53 | 
 54 | # Census Data Overview
 55 | 
 56 | ## US Census Bureau
 57 | 
 58 | The "nation's leading provider of quality data about its people and economy."
 59 | 
 60 | <img src="../data/census_page.png" width="700px"/> - <https://www.census.gov>
 61 | 
 62 | ## Primary Census Products
 63 | 
 64 | -   Decennial Census
 65 | 
 66 | -   American Community Survey (ACS)
 67 | 
 68 | ## Decennial Census
 69 | 
 70 | Complete count of the population every 10 years since `1790`
 71 | 
 72 | A snapshot of the American population in time, with an `April 1` reference date.
 73 | 
 74 | Includes data on
 75 | 
 76 | -   `Population`: by sex, age, race/ethnicity, and family / household relationships
 77 | 
 78 | -   `Housing`: by occupancy (occupied, vacant), tenure (owned, rented), and group quarters
 79 | 
 80 | From 1840 to 2000, additional questions were asked of a `sample` of the population.
 81 | 
 82 | -   See <https://www.census.gov/history/www/through_the_decades/index_of_questions/>
 83 | 
 84 | ## American Community Survey (ACS)
 85 | 
 86 | Since 2005, the `American Community Survey (ACS)` has replaced the decennial census `sample` data questions.
 87 | 
 88 | -   Annual survey of a sample of about 3.5 million households, released as 1-, 3-, or 5-year period estimates.
 89 | 
 90 | -   Provides `period estimates` of demographic, social, economic, and housing characteristics
 91 | 
 92 | -   Includes `margin of error` values for the estimates
 93 | 
 94 | ## ACS Data Products
 95 | 
 96 | ACS 1-year and 5-year estimates are currently available through 2020
 97 | 
 98 | -   New data is released at the end of the next year (e.g., 2020 data in Dec 2021)
 99 | -   But COVID is causing a [delay in the release dates](https://www.census.gov/programs-surveys/acs/news/data-releases/2020/release-schedule.html) and the `2020 data was just released!!`
100 | 
101 | `ACS 3-year` no longer available (2008---2013)
102 | 
103 | -   More data tables are available for the ACS 5-year estimates than for the ACS 1 year or ACS 3 year estimates.
104 | 
105 | See: [Census ACS: Guidance for Data Users](https://www.census.gov/programs-surveys/acs/guidance.html)
106 | 
107 | ## [ACS Period Estimates](https://www.census.gov/programs-surveys/decennial-census/decade/2020/news/blog-posts.html)
108 | 
109 | The `ACS 1 year estimates` include data from a sample of the population collected over a one year period.
110 | 
111 | Five years of data are pooled together, weighted and processed as a whole dataset to create the `ACS 5 year estimates`.
112 | 
113 | Use the `ACS 1 year estimates` when you want the `most current data` and are less concerned about precision (larger margins of error). However, the ACS 1 year estimates are only available for areas with large populations (65,000 or more) and for a subset of data tables.
114 | 
115 | Use the `ACS 5 year estimates` when you want `more stability in the estimates`, more data tables, and smaller geographic tabulation units. But it can be tricky to interpret the data if the five-year period is not stable (e.g., COVID and the 2016-2020 ACS 5-year estimates).
116 | 
117 | ## 
118 | 
119 | | Demographic\*        | Social            | Economic          | Housing           |
120 | |------------------|------------------|------------------|------------------|
121 | | Sex                  | Families          | Income            | Tenure\*          |
122 | | Age                  | Education         | Benefits          | Occupancy\*       |
123 | | Race                 | Marital Status    | Employment Status | Group quarters\*  |
124 | | Hispanic Origin      | Fertility         | Occupation        | Housing Value     |
125 | | Relationships        | Grandparents      | Industry          | Taxes & Insurance |
126 | |                      | Veterans          | Commuting         | Utilities         |
127 | |                      | Disability Status | Place of Work     | Mortgage          |
128 | |                      | Language at Home  | Health Insurance  | Monthly Rent      |
129 | |                      | Citizenship       |                   | Structure Type    |
130 | | *\*decennial census* | Mobility          |                   |                   |
131 | 
132 | ## Census Geographies
133 | 
134 | Census data is collected from individuals. The individual-level response data is called `microdata`.
135 | 
136 | For privacy reasons, only a very limited subset of census microdata is publicly available as the [Public Use Microdata Samples (PUMS)](https://www.census.gov/programs-surveys/acs/microdata.html) data.
137 | 
138 | Most census data is made publicly available only when aggregated to a geographic `tabulation unit`.
139 | 
140 | -   Tabulation units include states, counties, census tracts, block groups, blocks, etc.
141 | 
142 | Not all census data is available for all geographic tabulation units. For example, only decennial census data are available at the block level.
143 | 
144 | ## Census Geographic Tabulation Units
145 | 
146 | <img src="../data/census_geo_hierarchy.png" width="600px"/></img>
147 | 
148 | ## Census Data and Census Geographies
149 | 
150 | <img src="../data/census_data_by_prod_geo.png" width="1000px"/></img>
151 | 
152 | ## Census Data Workflow
153 | 
154 | Identify your
155 | 
156 | -   `Topic of interest`, e.g., population by age, income, monthly rents, etc...
157 | -   `Dataset`: Decennial Census or ACS 1-yr or ACS 5-yr?
158 | -   `Year(s)`: for what time period?
159 | -   `Geographic tabulation unit` of aggregation (county, tract, etc.)
160 | -   `Geographic filter` by state(s) or counties
161 | 
162 | Then determine what specific census variables are available for your topic.
163 | 
164 | ## CAUTION
165 | 
166 | "If you want to measure change you can't change the measures!"
167 | 
168 | **Census tables, variables, geographies, and geographic boundaries change over time!**
169 | 
170 | Measuring change over time with census data is *its own thing*, complex, and not covered by this workshop!
171 | 
172 | ## Getting Census Data
173 | 
174 | Here are three of the primary websites from which you can directly download census data:
175 | 
176 | -   [data.census.gov](https://data.census.gov/cedsci)
177 | -   [NHGIS.org](https://www.nhgis.org/)
178 | -   [Social Explorer](https://www.socialexplorer.com/), a subscription web platform but FREE for UCB community
179 | 
180 | You can download Census `geographic data` directly on the [Census website](https://www.census.gov/programs-surveys/geography/guidance/tiger-data-products-guide.html).
181 | 
182 | ## Census APIs
183 | 
184 | You can write code to fetch data from the [Census Web APIs](https://www.census.gov/data/developers/data-sets.html)
185 | 
186 | -   `API`: application programming interface
187 | 
188 | -   `Web API`: URLs can be formatted to make queries that return data
189 | 
190 | Or you can leverage an existing R package to make this easier!
191 | 
192 | -   That's what we will do!
193 | 
194 | *Only a subset of recent Census data products are available via APIs.*
195 | 
196 | # R Packages for Working with Census Data
197 | 
198 | ## R Packages for Working with Census Data
199 | 
200 | These are the ones we recommend and will use today.
201 | 
202 | -   [tidycensus](https://walker-data.com/tidycensus/)
203 | 
204 | -   [tidyverse](https://www.tidyverse.org/)
205 | 
206 | -   [sf](https://r-spatial.github.io/sf/)
207 | 
208 | -   [mapview](https://r-spatial.github.io/mapview/)
209 | 
210 | ## [tidycensus](https://walker-data.com/tidycensus/)
211 | 
212 | An R package with functions that make it easier to fetch decennial census and ACS data from the Census APIs.
213 | 
214 | Only a limited set of Census data available via `tidycensus`
215 | 
216 | -   Decennial census: 1990, 2000, and 2010
217 | 
218 | -   ACS 1 yr: 2005 through 2019
219 | 
220 | -   ACS 5 yr: 2005---2009 through 2015---2019 are available.
221 | 
222 | Actively maintained and expanding to include more census data products (see `tidycensus` website)
223 | 
224 | ## About [tidycensus](https://walker-data.com/tidycensus/)
225 | 
226 | Developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census APIs in **R** in a `tidy` format to analyze, plot, and map.
227 | 
228 | Check out his website (<https://walker-data.com/>) to keep abreast of his great packages, blog posts, and tutorials.
229 | 
230 | And his new ebook [Analyzing the US Census with R](https://walker-data.com/census-r/), currently available to read online.
231 | 
232 | ## [tidycensus tutorials](https://walker-data.com/tidycensus/articles/basic-usage.html)
233 | 
234 | <img src="../data/tidycensus_articles.png"/></img>
235 | 
236 | ## [tidyverse](https://www.tidyverse.org)
237 | 
238 | The **tidyverse** package is an umbrella package that installs all the core tidyverse packages and makes them easier to manage and load in R, including:
239 | 
240 | -   `ggplot2`, for data visualization
241 | -   `dplyr`, for data manipulation
242 | -   `tidyr`, for data tidying
243 | -   `readr`, for data import
244 | -   `purrr`, for functional programming
245 | -   `tibble`, for tibbles, a modern re-imagining of data frames
246 | -   `stringr`, for strings
247 | -   `forcats`, for factors
248 | 
249 | ## [sf](https://r-spatial.github.io/sf/) package
250 | 
251 | `Simple features` for geospatial data objects and methods.
252 | 
253 | -   The main R package for working with vector geospatial data
254 |     -   `vector`: locations represented as points, lines and polygons
255 | 
256 | `sf` is loaded and used automatically by `tidycensus`.
257 | 
258 | The online book [Geocomputation with R](https://geocompr.robinlovelace.net/) is a great resource for learning about the `sf` package and working with geospatial data in R.
259 | 
260 | ## [mapview](https://r-spatial.github.io/mapview/)
261 | 
262 | `mapview` provides functions to quickly and easily create interactive maps for data exploration.
263 | 
264 | <img src="../data/mapview_example.png" width="600px"/></img>
265 | 
266 | ## Requesting a Census API key
267 | 
268 | Before you can fetch data from the Census APIs, you must have a free `Census API Key`
269 | 
270 | Request one now if you don't have one yet!
271 | 
272 | -   (<https://api.census.gov/data/key_signup.html>)
273 | 
274 | # Hands-on Tutorial Time!
275 | 
276 | ## Setup
277 | 
278 | **Clone or download and unzip** the workshop files from: <https://github.com/dlab-berkeley/Census-Data-in-R>
279 | 
280 | Then:
281 | 
282 | 1.  Open the folder with the workshop files
283 | 
284 | 2.  Double-click on the R Project file `Census-Data-in-R.Rproj`
285 | 
286 | 3.  This should open RStudio - with the `Files` panel displaying the workshop folder contents.
287 | 
288 | 4.  Double-click on the file `Census-Data-in-R.Rmd` in the `Lessons` folder to follow along!
289 | 
290 | - You can also open the file `Census-Data-in-R.html` to follow along in a web browser.
291 | 


--------------------------------------------------------------------------------
/Lessons/Census-Data-in-R.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Census Data Wrangling and Mapping in R"
  3 | author: "Patty Frontiera"
  4 | date: "04/01/2022"
  5 | theme: readable
  6 | output: 
  7 |   html_document: 
  8 |     toc: yes
  9 |     toc_float: yes
 10 | ---
 11 | 
 12 | ```{r setup, include=FALSE}
 13 | knitr::opts_chunk$set(echo = TRUE)
 14 | ```
 15 | 
 16 | # Census Data and `tidycensus`
 17 | 
 18 | In this tutorial, we will work through several exercises using the [tidycensus](https://walker-data.com/tidycensus/index.html) R package to fetch, wrangle, and map census data.
 19 | 
 20 | The key `tidycensus` functions we will use today are:
 21 | 
 22 | -   `census_api_key`: makes your Census API key available to tidycensus
 23 | 
 24 | -   `load_variables`: retrieves a dataframe of available census data variables
 25 | 
 26 | -   `get_decennial`: fetch census data from a recent decennial census - 2000, 2010 (and soon 2020)
 27 | 
 28 | -   `get_acs`: fetch census data from an ACS (American Community Survey) 1 or 5 year dataset, 2005 - 2020.
 29 | 
 30 | # Setup
 31 | 
 32 | Be sure to **clone** or **download and unzip** the workshop files from: <https://github.com/dlab-berkeley/Census-Data-in-R>
 33 | 
 34 | Then:
 35 | 
 36 | 1.  Open the folder with the workshop files
 37 | 
 38 | 2.  Double-click on the R Project file `Census-Data-in-R.Rproj`
 39 | 
 40 | 3.  This should open RStudio - with the `Files` panel displaying the workshop folder contents.
 41 | 
 42 | 4.  Double-click on the file `Census-Data-in-R.Rmd` to follow along!
 43 | 
 44 | *You can also click on the file `Census-Data-in-R.html` in the Files tab to open the workshop tutorial in a web browser.*
 45 | 
 46 | ## Install packages
 47 | 
 48 | > If you installed any of these packages a while ago (especially `tidycensus`), it's a good idea to install updates when you can (though not during the workshop as things can break!).
 49 | 
 50 | ```{r}
 51 | 
 52 | # Uncomment this to install packages, if necessary.
 53 | # install.packages(c("here", "tidyverse", "sf", "leaflet", "mapview", "tigris", "tidycensus"))
 54 | 
 55 | library(here)
 56 | library(tidyverse)
 57 | library(sf)
 58 | library(leaflet)
 59 | library(mapview)
 60 | library(tigris)
 61 | library(tidycensus)
 62 | 
 63 | ```
 64 | 
 65 | These seven libraries should be loaded in your environment now.
 66 | 
 67 | ```{r}
 68 | 
 69 | # If you run this chunk, output from the "here" function should be visible below. This is your local directory path. We can use this to import files later on. 
 70 | here()
 71 | 
 72 | ```
 73 | 
 74 | ## Census API Key
 75 | 
 76 | You need a `Census API key` to programmatically fetch census data.
 77 | 
 78 | -   Get it here (pretty quickly): <https://api.census.gov/data/key_signup.html>
 79 | 
 80 | -   The key will be sent to your email and you will need to click to activate it.
 81 | 
 82 | -   Keep the email with the key open for use in this workshop.
 83 | 
 84 | *For more info on all available Census APIs see: <https://www.census.gov/data/developers/data-sets.html>*
 85 | 
 86 | ### Add Your Census API Key
 87 | 
 88 | To use your Census API Key in R
 89 | 
 90 | 1.  Copy and paste your Census API key from your email
 91 | 
 92 | 2.  Use the tidycensus function **census_api_key** to register your API key with tidycensus. **Don't forget to put quotes around the key!**
 93 | 
 94 | ```{r, eval=FALSE}
 95 | 
 96 | # Install your census api key - long alphanumeric string
 97 | census_api_key("THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS")
 98 | 
 99 | ```
100 | 
101 | Another way to add your Census API Key:
102 | 
103 | I keep my key in a file so no one can see it. One way to do this is by making a script that creates a variable key, and then using the `source` function to add that script as an object into your coding environment. The code chunk below is an example of how you might do that:
104 | 
105 | ```{r,eval=FALSE}
106 | 
107 | # source (run) an r script that creates a variable with my key
108 | #source("/Users/pattyf/Documents/Dlab/workshops/keys/census_api_key.R")
109 | 
110 | #print(my_census_api_key) 
111 | 
112 | # register the key
113 | census_api_key(key = my_census_api_key)
114 | ```
115 | 
116 | # Decennial Census Data
117 | 
118 | ## The `get_decennial` function
119 | 
120 | We start by fetching `total population` from the 2010 Census with tidycensus's `get_decennial` function. Let's first talk about the code.
121 | 
122 | ```{r}
123 | pop2010 <- get_decennial(geography = "state",   # census tabulation unit
124 |                          variables = "P001001", # variable(s) of interest
125 |                          year = 2010)           # census year
126 | 
127 | head(pop2010)          
128 | ```
129 | 
130 | ## Fetching data for more than one Census variable
131 | 
132 | We can pass a vector of census identifiers to the `variables` function argument if we want to get data for more than one variable. Below we add `P002002` for population in urban areas.
133 | 
134 | ```{r}
135 | pop2010 <- get_decennial(geography="state",
136 |                          variables = c("P001001","P002002"), # variable(s) of interest
137 |                          year = 2010)           # census year
138 | 
139 | # take a look
140 | head(pop2010)
141 | ```
142 | 
143 | We can see the data for both variables if we sort the output by state name.
144 | 
145 | ```{r}
146 | # Sort dataframe by state names (the NAME column)
147 | pop2010 %>% arrange(NAME) %>% head()
148 | ```
149 | 
150 | ## tidycensus returns `tidy` data
151 | 
152 | By default, tidycensus returns data in a `tidy`, or `long` format that allows data for multiple variables to be contained within the `variable` and `value` columns.
153 | 
154 | This is in contrast to untidy, or `wide` data where each variable is in its own column.
155 | 
156 | tidycensus can return `wide` data if you add the parameter `output = "wide"` to the function call.
157 | 
158 | ```{r}
159 | 
160 | # wide format
161 | pop2010w <- get_decennial(geography = "state",   # census tabulation unit
162 |                          variables = c("P001001","P002002"), # variable(s) of interest
163 |                          year = 2010,           # census year
164 |                          output="wide")         # get output in wide format
165 | head(pop2010w) 
166 | ```
167 | 
168 | ### The `GEOID` column
169 | 
170 | The `GEOID` column is included in tidycensus output by default.
171 | 
172 | This is a Census geographic identifier for the tabulation unit.
173 | 
174 | The `GEOID` is sometimes called the Census `FIPS` code and for most tabulation units these are the same.
175 | 
176 | The `GEOID` makes it possible to link Census demographic data to Census geographic data and make maps. We will do this in a bit.
177 | 
178 | The `GEOID` is a text string and must be quoted.
179 | 
180 | -   Beware of GEOID leading zeros, since some software will remove these and convert GEOID values to numbers (rather than text strings).
181 | 
182 | > Question: **What is the GEOID for California?**
183 | 
184 | # Census Tabulation Units
185 | 
186 | Public census data is typically aggregated by census geographies to protect privacy.
187 | 
188 | These census geographies are called `Census tabulation units`.
189 | 
190 | -   Some of these are real administrative units like states and counties.
191 | -   Others are statistical units created by the census, like census tracts and block groups.
192 | 
193 | Some of the most common geographic tabulation units and their tidycensus function abbreviations are shown below, along with **required** and available filters that limit what data are returned.
194 | 
195 | | Geography     | Definition           | Filter(s)             | Used in tidycensus         |
196 | |-----------------|-----------------|-----------------|---------------------|
197 | | "us"          | United States        |                       | get_acs(), get_decennial() |
198 | | "region"      | Census region        |                       | get_acs(), get_decennial() |
199 | | "state"       | State or equivalent  | state                 | get_acs(), get_decennial() |
200 | | "county"      | County or equivalent | state, county         | get_acs(), get_decennial() |
201 | | "place"       | Census place         | state                 | get_acs(), get_decennial() |
202 | | "tract"       | Census tract         | **state**, county     | get_acs(), get_decennial() |
203 | | "block group" | Census block group   | **state**, county     | get_acs(), get_decennial() |
204 | | "block"       | Census block         | **state**, **county** | get_decennial() only!      |
205 | 
206 | ## `get_decennial` Tabulation Units and Filters
207 | 
208 | Let's take a few minutes to practice fetching population data with the `get_decennial` function.
209 | 
210 | -   See `?get_decennial` for help
211 | 
212 | ### Challenge 1
213 | 
214 | > Open **Census-Data-in-R-Challenges.Rmd** and use the `get_decennial` function like we've seen above to fetch population data. *Solutions are available in the Solutions folder, as needed.*
215 | 
216 | ### Changing the tabulation unit
217 | 
218 | Let's fetch 2010 population data for CA counties
219 | 
220 | **What changes in the code?**
221 | 
222 | ```{r}
223 | 
224 | get_decennial(geography = "county",              # census tabulation unit
225 |                           variables = "P001001", # variable(s) of interest
226 |                           year = 2010,           # census year
227 |                           state='CA')            # Filter by state is CA
228 | ```
229 | 
230 | #### Questions
231 | 
232 | -   How do we specify the state of CA above? How else can we?
233 | -   Can you fetch population data for all counties in the USA or do you need to have a `state=` filter?
234 | 
235 | ### Adding a county filter
236 | 
237 | You can also filter tidycensus results by `county`
238 | 
239 | ```{r}
240 | 
241 | get_decennial(geography = "county",              # census tabulation unit
242 |                           variables = "P001001", # variable(s) of interest
243 |                           year = 2010,           # census year
244 |                           state='CA',            # Filter by state is CA
245 |                           county='Alameda')      # Filter by county Alameda
246 | ```
247 | 
248 | ### Challenge 2
249 | 
250 | > In **Census-Data-in-R-Challenges.Rmd**, alter the code above to fetch 2010 population for Alameda & San Francisco Counties. Then try Challenge 2B & 2C.
251 | 
252 | ## Visualizing Results
253 | 
254 | We can visualize data to get a quick overview of the distribution of the values.
255 | 
256 | It's a first step in exploratory data analysis and a last step in data communication.
257 | 
258 | `ggplot2` is the most commonly used R package for data visualization.
259 | 
260 | -   It is loaded when you load the `tidyverse` package.
261 | 
262 | Let's use it to visualize the population data.
263 | 
264 | ### Plot 2010 Population by state
265 | 
266 | Use `ggplot2` to create an ordered horizontal bar chart.
267 | 
268 | ```{r}
269 | 
270 | # Plot total population only: pop2010 also holds the urban count (P002002).
271 | pop_plot <- ggplot(data = dplyr::filter(pop2010, variable == "P001001"),
272 |                    # x = population in millions, y = states ordered by population
273 |                    aes(x = value / 1000000, y = reorder(NAME, value))) +
274 |                    # draw one bar per state, using the values as-is
275 |                    geom_bar(stat = "identity") +
276 |                    # add theme and titles.
277 |                    theme_minimal() +
278 |                    labs(title = "2010 US Population by State") +
279 |                    xlab("Population (in Millions)") +
280 |                    ylab("State")
281 | 
282 | # display the plot.
283 | pop_plot
284 | 
285 | ```
286 | 
287 | Developing your `ggplot2` knowledge can really enhance your data analysis skills.
288 | 
289 | In combination with `tidycensus` it creates a powerful, reproducible data science workflow.
290 | 
291 | # Identifying Census Variables
292 | 
293 | In the code above we fetched data for total population in 2010 using the variable `"P001001"`.
294 | 
295 | That is not an obvious variable name, so how do we get those census data identifiers?
296 | 
297 | We can use the tidycensus `load_variables` function for this.
298 | 
299 | ## `load_variables` function
300 | 
301 | For any census dataset like the decennial census or the ACS 1 or 5-year estimates, use the `load_variables` function to fetch all available variables and identifiers.
302 | 
303 | Since these datasets have many, many variables, save the resulting dataframe to a variable and cache it locally so you do not need to repeatedly retrieve it over the web.
304 | 
305 | ```{r}
306 | 
307 | vars2010 <- load_variables(year=2010,        # Year or end year for ACS-5yr
308 |                            dataset = 'sf1',  # 'sf1' for decennial census
309 |                            cache = TRUE)     # Save fetched data locally
310 | 
311 | # How large is the output
312 | dim(vars2010)
313 | 
314 | # Take a look with head or View
315 | head(vars2010)
316 | 
317 | ```
318 | 
319 | ## 2010 Decennial Census Tables & Variables
320 | 
321 | -   Over 3,000 unique variables that describe population and housing characteristics
322 | 
323 | -   Organized in `333` Tables
324 | 
325 |     -   177 population tables (identified with a ''P'') available to the block level
326 |     -   58 housing tables (identified with an ''H'') available to the block level
327 |     -   82 population tables (identified with a ''PCT'') available to the census tract level
328 |     -   4 housing tables (identified with an "HCT") available to the census tract level
329 |     -   10 population tables (identified with a "PCO") available to the county level
330 |     -   plus 2 additional PCT tables
331 | 
332 | <small>See: <https://www.census.gov/data/datasets/2010/dec/summary-file-1.html></small>
333 | 
334 | #### What Variable Has the 2010 Total Population value?
335 | 
336 | We know this from our previous code blocks, but let's find it for practice navigating the dataframe.
337 | 
338 | -   Let's sort and filter the `vars2010` dataframe to find it.
339 | 
340 | #### *Questions:*
341 | 
342 | What 2010 decennial census variable contains...
343 | 
344 | -   `Median Age`
345 | 
346 | -   `Average Family Size`
347 | 
348 | -   `Number of occupied housing units`
349 | 
350 | \*See **Census-Data-in-R-Solutions.Rmd** if needed (under Variable Questions)
351 | 
352 | ### Challenge 3
353 | 
354 | > Return to **Census-Data-in-R-Challenges.Rmd** and use the `get_decennial` function to fetch and plot an `Avg Family Size` variable by CA County in `2010`, saving the result as a dataframe named `ca_fam_size`. Once you've done that, plot the dataframe with the `ggplot` call below. **Hint: "P037001"**
355 | 
356 | ### Challenge 4
357 | 
358 | > Repeat the previous challenge with data from the `2000` decennial census. Don't assume variable names are the same across the 2000 and 2010 census
359 | 
360 | > **Use `load_variables` to check the variable name!**
361 | 
362 | # Census Tract Data
363 | 
364 | Census tracts are the most commonly used census tabulation unit.
365 | 
366 | Let's fetch population data, setting the census tabulation unit to *tract*.
367 | 
368 | > Because of the large number of census tracts, you **MUST** specify a state when requesting these data with tidycensus.
369 | 
370 | ```{r}
371 | ## Fetch population by **tract** for California.
372 | ca_tract_pop2010 <- get_decennial(geography = "tract",   # census tab unit
373 |                                    variables = "P001001", # var of interest
374 |                                    year = 2010,           # census year
375 |                                    state='CA')      # State filter
376 | 
377 | # How many tracts in CA
378 | dim(ca_tract_pop2010)
379 | 
380 | # take a look
381 | head(ca_tract_pop2010)
382 | ```
383 | 
384 | ## Fetching Census Tract Data
385 | 
386 | Census tract data can be quite large!
387 | 
388 | Fortunately, you can also limit the results to one or more counties.
389 | 
390 | ```{r}
391 | 
392 | tract_pop2010 <- get_decennial(geography = "tract",   # census tabulation unit
393 |                          variables = "P001001",       # variable of interest
394 |                          year = 2010,                 # census year - only one!
395 |                          state="CA",                  # limit to California
396 |                          county=c("Alameda","Contra Costa"))  # & counties
397 | 
398 | dim(tract_pop2010)
399 | 
400 | ```
401 | 
402 | ## Customizing tidycensus output
403 | 
404 | What **two** things are new here?
405 | 
406 | ```{r}
407 | 
408 | #urban and rural pop for 3 CA counties
409 | ur_pop10 <- get_decennial(geography = "county",  # census tabulation unit
410 |                            variables = c(urban="P002002",rural="P002005"),
411 |                            year = 2010, 
412 |                            summary_var = "P002001",  # The denominator
413 |                            state='CA',
414 |                            county=c("Napa","Sonoma","Mendocino"))
415 | 
416 | ```
417 | 
418 | #### When fetching census data...
419 | 
420 | We have already specified more than one variable:
421 | 
422 |     variables = c("P002002","P002005")
423 | 
424 | 1.  You can also rename the values in the output 'variable' column.
425 | 
426 | <!-- -->
427 | 
428 |     variables = c(urban="P002002",rural="P002005")
429 | 
430 | 2.  You can identify a `summary_var` (a denominator - here, the total count of all people or households surveyed. Can be used for calculations like percent of total.)
431 | 
432 | <!-- -->
433 | 
434 |     summary_var = "P002001"
435 | 
436 | #### Now let's take a look at the resultant dataframe
437 | 
438 | ```{r}
439 | # take a look at the results
440 | ur_pop10
441 | ```
442 | 
443 | #### Calculating Percents
444 | 
445 | The `summary_value` column comes in handy when you want to compute percent of total, for example:
446 | 
447 | ```{r}
448 | 
449 | # Calculate the percent of population that is Urban or Rural
450 | ur_pop10 <- ur_pop10 %>%
451 |             mutate(pct = 100 * (value / summary_value))
452 | 
453 | # Take a look at the output.
454 | ur_pop10 
455 | 
456 | ```
457 | 
458 | A plot gives us compact visual summaries of the data.
459 | 
460 | ```{r}
461 | 
462 | ## Plot it with ggplot2
463 | myplot <- ggplot(data = ur_pop10, 
464 |           mapping = aes(x = NAME, fill = variable, 
465 |                      y = ifelse(test = variable == "urban", 
466 |                                 yes = -pct, no = pct))) +
467 |           geom_bar(stat = "identity") +
468 |           scale_y_continuous(labels = abs, limits=c(-100,100)) +
469 |           labs(title="Urban & Rural Population in Wine Country", 
470 |                x="County", y = " Percent of Population", fill="") +
471 |           coord_flip()
472 | 
473 | myplot
474 | 
475 | ```
476 | 
477 | Don't worry if you don't get all the ggplot code now. It's here for reference.
478 | 
479 | -   You may want to check out D-Lab's *R Data Visualization with ggplot* workshop!
480 | 
481 | ## Data Wrangling to Combine Data from 2 Censuses
482 | 
483 | You can use your R skills to reformat the data and make it more usable.
484 | 
485 | Let's fetch population data for 2010 and 2000 by state.
486 | 
487 | Then we will combine these into one data frame using the `dplyr::bind_rows` function
488 | 
489 | ```{r}
490 | 
491 | # Fetch 2000 population data by state
492 | pop2000 <- get_decennial(geography = "state",
493 |                          variables = c(pop2000="P001001"), 
494 |                          year = 2000)
495 | 
496 | # Fetch 2010 population data by state
497 | pop2010 <- get_decennial(geography = "state",
498 |                          variables = c(pop2010="P001001"), 
499 |                          year = 2010)
500 | 
501 | # Use tidyverse `bind_rows` function to combine the data for these years
502 | state_pop <- bind_rows(pop2000, pop2010)
503 | 
504 | # Take a look with head or View
505 | state_pop %>% arrange(NAME) %>% head(10)
506 | ```
507 | 
508 | # Saving `tidycensus` output
509 | 
510 | The data we fetch using `tidycensus` is stored in an R dataframe.
511 | 
512 | We can use the `write.csv` or `write_csv` function to save the contents of a dataframe to a CSV file.
513 | 
514 | ```{r}
515 | write_csv(state_pop, here('data_out/state_pop_2010.csv') )
516 | ```
517 | 
518 | > **Any Questions?**
519 | 
520 | # Mapping Census Data
521 | 
522 | You can fetch census geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions
523 | 
524 | -   Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs.
525 | 
526 | You can then use your favorite R mapping packages like `sf`, `ggplot`, `tmap`, `mapview` and `leaflet` to make maps.
527 | 
528 | ## Geometry Options
529 | 
530 | Before fetching census geographic data, we need to set the option `tigris_use_cache` to TRUE
531 | 
532 | Caching saves data locally. This greatly speeds things up if you fetch the same census geographic data repeatedly.
533 | 
534 | ```{r}
535 | 
536 | # Tigris options - used by tidycensus
537 | # Cache retrieved geographic data locally
538 | options(tigris_use_cache = TRUE)  
539 | 
540 | ```
541 | 
542 | ## Fetch Geographic Boundary Data with `tidycensus`
543 | 
544 | We fetch the census geographic data by setting **geometry=TRUE**.
545 | 
546 | ```{r}
547 | 
548 | pop2010geo <- get_decennial(geography = "state", 
549 |                           variables = c(pop10="P001001"), 
550 |                           year = 2010, 
551 |                           output="wide", 
552 |                           geometry=TRUE) # Fetch geometry data for mapping
553 |  
554 | ```
555 | 
556 | ### Take a look
557 | 
558 | Let's take a minute to discuss the format of an `sf` spatial object.
559 | 
560 | ```{r}
561 | 
562 | head(pop2010geo, 3)
563 | 
564 | ```
565 | 
566 | # Geospatial Data in R
567 | 
568 | The `tidycensus` package uses the R `sf` package to manage geospatial data.
569 | 
570 | R `sf` objects include:
571 | 
572 | -   a dataframe with a `geometry` column labeled `geometry`
573 | 
574 |     -   The geometry can be of type POINT, LINESTRING, POLYGON
575 |     -   or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON
576 | 
577 | -   a `CRS` (coordinate reference system), specified by
578 | 
579 |     -   epsg(SRID) code
580 |     -   proj4string
581 | 
582 | For a deeper understanding of the `sf` package and its functionality, we recommend
583 | 
584 | -   our [R-Geospatial-Fundamentals](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals) workshop
585 | 
586 | -   the [Geocomputation with R ebook](https://geocompr.robinlovelace.net/)
587 | 
588 | ## Census Data Coordinate Reference System (CRS)
589 | 
590 | All geospatial data are referenced to the surface of the earth with a `CRS`, or `coordinate reference system`. Anyone working with geospatial data will need to develop an understanding of CRSs.
591 | 
592 | Fortunately, many of us are familiar with longitude and latitude, which are geographic coordinates. But there are different versions of geographic CRSs. And there are also projected CRSs which transform longitude and latitude to a 2 dimensional surface for mapping & analysis.
593 | 
594 | All census geographic data use the `NAD83` geographic CRS. `NAD83` stands for North American Datum of 1983. This CRS (or version of latitude and longitude) is best for locations in North America.
595 | 
596 | Many geospatial operations require you transform data to a common CRS before conducting spatial analysis or mapping.
597 | 
598 | -   This could be an issue if you try to combine the census geospatial data with other geospatial data. But it is not an issue in this tutorial.
599 | 
600 | An in-depth discussion of CRSs is outside the scope of this workshop. See [Geocomputation with R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information.
601 | 
602 | ## Mapping `sf` Spatial Objects
603 | 
604 | We can use `sf::plot` to make a quick map of the geometry stored in an `sf` spatial object.
605 | 
606 | ```{r}
607 | # Draw only the shapes held in the geometry column (no attribute data)
608 | plot(pop2010geo[["geometry"]])
609 | ```
610 | 
611 | ### The Challenge of US maps
612 | 
613 | The vast geographic extent and non-contiguous nature of the USA makes it difficult to map.
614 | 
615 | Fortunately, tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas.
616 | 
617 | ```{r}
618 | 
619 | pop2010geo_shifted <- get_decennial(
620 |   geography = "state", variables = c(pop10 = "P001001"),
621 |   output = "wide", year = 2010,
622 |   geometry = TRUE,   # also download the state boundaries
623 |   shift_geo = TRUE   # NOTE(review): reportedly deprecated in newer tidycensus - confirm
624 | )
625 | 
626 | ## Shift Happens!
627 | plot(pop2010geo_shifted[["geometry"]])
628 | ```
629 | 
630 | ## Saving Spatial Objects
631 | 
632 | You can save any `sf` spatial data object to an [ESRI shapefile](https://en.wikipedia.org/wiki/Shapefile) using `st_write`
633 | 
634 | ```{r, eval=F}
635 | 
636 | st_write(obj = pop2010geo_shifted, dsn = here("data_out/usa_pop2010_shifted.shp"))  # save as a shapefile
637 | 
638 | ```
639 | 
640 | Now take a look at the output shapefile.
641 | 
642 | ```{r, eval=F}
643 | 
644 | # List the contents of data_out to confirm the shapefile was written
645 | list.files(here("data_out"))
646 | ```
647 | 
648 | ### ESRI Shapefiles
649 | 
650 | You can see from this output that an ESRI shapefile is actually a collection of files that all have the same prefix.
651 | 
652 | Shapefiles are the most common file format for geospatial data. So it's worthwhile to learn more about them if you will be working with census geographic data.
653 | 
654 | -   Check out the [Shapefile Wikipedia page](https://en.wikipedia.org/wiki/Shapefile) for more information.
655 | 
656 | ## Mapping Data Values
657 | 
658 | You can use the sf `plot` command to make a map that sets the color of the geometry by the data values
659 | 
660 | -   This type of map is called a `thematic map`.
661 | 
662 | -   When the features being plotted are areas (or polygons), it's called a `choropleth` map!
663 | 
664 | ```{r}
665 | # Select the column holding the variable values; plot() then shades each
666 | # area by that value, producing a thematic (choropleth) map.
667 | plot(pop2010geo_shifted["pop10"])
668 | 
669 | ```
670 | 
671 | ### `ggplot2` Map
672 | 
673 | `ggplot` knows how to map sf objects!
674 | 
675 | ```{r}
676 | 
677 | ggplot(data = pop2010geo_shifted, mapping = aes(fill = pop10)) +
678 |   geom_sf()  # the layer that tells ggplot geographic data are being drawn
679 | 
680 | ```
681 | 
682 | If you are familiar with `sf` objects and `ggplot` you can further customize your maps.
683 | 
684 | ```{r}
685 | 
686 | ggplot(data = pop2010geo_shifted, mapping = aes(fill = pop10)) +
687 |   geom_sf(color = NA) +        # What does color=NA do
688 |   coord_sf(crs = 3857) +       # Dynamically change the CRS
689 |   scale_fill_viridis_c(option = "viridis")  # Change the color palette;
690 |                                             # other options to try include
691 |                                             # plasma, magma, inferno, cividis
692 | 
693 | ```
694 | 
695 | ### Challenge 5
696 | 
697 | > In your **Census-Data-in-R-Challenges.Rmd** file, create a map of `Median Age by California County in 2010`. *Solutions are in the Census-Data-in-R-Solutions.Rmd file*
698 | 
699 | ### Fetch Census Data and Geometry for Multiple States or Counties
700 | 
701 | We can fetch Census data and **geometry** for more than one state or county with the same function call.
702 | 
703 | -   This is so much easier than any alternative approach!
704 | 
705 | -   It can be applied to any available geographic tabulation areas (eg states, counties, tracts, places).
706 | 
707 | Let's try it with Census Tracts!
708 | 
709 | Fetch tract population and geometry data for Bay Area Counties.
710 | 
711 | ```{r}
712 | 
713 | bay_counties <- c("Alameda", "Contra Costa", "Marin", "San Francisco",
714 |                   "Sonoma", "Napa", "Solano", "San Mateo", "Santa Clara")
715 | 
716 | bayarea_pop10 <- get_decennial(geography = "tract",
717 |                                variables = "P001001",   # total population
718 |                                year = 2010,
719 |                                state = "CA",
720 |                                county = bay_counties,   # several counties in one call
721 |                                geometry = TRUE)         # `TRUE`, not `T`, which can be reassigned
722 | 
723 | # Quick map
724 | plot(bayarea_pop10["value"])
725 | ```
726 | 
727 | > **Any Questions?**
728 | 
729 | # Fetching ACS data with `get_acs`
730 | 
731 | -   ACS data contains the most recent information about the American population.
732 | 
733 | -   We can use the tidycensus function `get_acs` to retrieve `ACS data` using code very similar to `get_decennial`.
734 | 
735 | **BUT** the workflow is more complex because:
736 | 
737 | 1.  ACS data has a lot more variables, and
738 | 
739 | 2.  ACS data are **sample data**, so each ACS variable that you retrieve with `tidycensus` will fetch both an **estimate** of the value and a **margin of error**.
740 | 
741 | ## ACS Data Products
742 | 
743 | The ACS has two primary data products - the ACS 1 year estimates and the 5 year estimates.
744 | 
745 | -   The `ACS 1 year estimates` are more current but have a larger margin of error and are not available for Census geographies with a population of \< 65,000.
746 | 
747 | -   The `ACS 5-year estimates` are more stable but represent a larger time period.
748 | 
749 | -   *The `ACS 3 year estimates` have been discontinued.*
750 | 
751 | ## Fetch metadata on ACS 5-Year Variables
752 | 
753 | Let's use the `load_variables` function to get a dataframe of all variables from the ACS 2016---2020 5-year dataset.
754 | 
755 | -   Note: we change the dataset value to `acs5` where before we used `sf1` to fetch info on the decennial census variables.
756 | 
757 | ```{r}
758 | 
759 | vars_acs2020 <- load_variables(year = 2020,      # end year 2016-2020 period
760 |                                dataset = "acs5", # the ACS data product
761 |                                cache = TRUE)     # Save locally for future use (`TRUE`, not reassignable `T`)
762 | 
763 | # how many variables?
764 | dim(vars_acs2020)
765 | ```
766 | 
767 | ### Exploring the ACS Variables
768 | 
769 | View the `vars_acs2020` dataframe to find the variable name for `median household income`.
770 | 
771 | ```{r, eval=FALSE}
772 | #View(vars_acs2020)
773 | ```
774 | 
775 | > *Question* Is the variable name for `total population` in the ACS 5 year 2020 data the same as it is in the 2010 census data?
776 | 
777 | ### Fetch ACS Data on Median Household Income
778 | 
779 | Let's fetch the median household income data for San Francisco County by census tract.
780 | 
781 | ```{r}
782 | 
783 | med_hhincome <- get_acs(
784 |   geography = "tract",
785 |   variables = "B19013_001",   # median household income
786 |   year = 2020,
787 |   state = "CA", county = "San Francisco",
788 |   geometry = TRUE             # get the geography too
789 | )
790 | ```
791 | 
792 | Take a look at the output
793 | 
794 | ```{r}
795 | head(med_hhincome, n = 6)  # first six rows
796 | ```
797 | 
798 | ### ACS Output
799 | 
800 | The census data returned by the `get_acs` function is a bit different from that returned by `get_decennial`.
801 | 
802 | -   What is the name of the variable containing the income data?
803 | 
804 | -   What is the name if we set `output="wide"`?
805 | 
806 | ```{r}
807 | med_hhincome_wide <- get_acs(
808 |   geography = "tract",
809 |   variables = "B19013_001",
810 |   year = 2020,
811 |   state = "CA",
812 |   county = "San Francisco",
813 |   geometry = TRUE,   # get the geography too
814 |   output = "wide")   # same request as before, but in wide layout
815 |                         
816 | # uncomment and run to view
817 | # head(med_hhincome_wide)
818 | ```
819 | 
820 | ### Map Median Household Income by tract
821 | 
822 | Use `sf::plot` to create a map of median household income in San Francisco.
823 | 
824 | ```{r}
825 | plot(med_hhincome["estimate"])  # shade tracts by the income estimate
826 | ```
827 | 
828 | *What do you think of that map?*
829 | 
830 | It's odd because San Francisco County is not the same as the city of San Francisco and what we want to map is the city.
831 | 
832 | #### Create a map with ggplot
833 | 
834 | We can use ggplot to zoom in on the city by setting the x axis limits to a narrower geographic range.
835 | 
836 | ```{r}
837 | ggplot(data = med_hhincome, mapping = aes(fill = estimate)) +
838 |   geom_sf() +
839 |   xlim(-122.55, -122.3)  # narrow the x axis range to zoom in on the city
840 |   
841 | ```
842 | 
843 | ### Question
844 | 
845 | > Why do you think we have NA values in the ACS estimates?
846 | 
847 | ### Fetching Multiple ACS-5 Variables
848 | 
849 | We can drill down into the ACS data by fetching data for subgroups, where available.
850 | 
851 | Let's fetch median household income by race.
852 | 
853 | First identify the variables of interest.
854 | 
855 | ```{r}
856 | 
857 | # ACS 2015—19 variable codes for median household income, keyed by the
858 | # race/ethnicity label ("B19013_001" is the code for all households)
859 | inc_by_race <- c(
860 |   White = "B19013H_001", Black = "B19013B_001",
861 |   Asian = "B19013D_001", Hispanic = "B19013I_001"
862 | )
863 | ```
864 | 
865 | Fetch census tract data for multiple variables at once.
866 | 
867 | ```{r}
868 | # Fetch the Data: one request covers every variable in inc_by_race
869 | alco_mhhinc_by_race <- get_acs(geography = "tract",
870 |                                variables = inc_by_race,
871 |                                year = 2019,
872 |                                state = "CA",
873 |                                county = "Alameda",
874 |                                geometry = TRUE)  # `TRUE`, not `T`, which can be reassigned
875 | ```
876 | 
877 | ## Facet Mapping
878 | 
879 | Facet maps are a way to create visualizations of `small multiples`, or subsets of the data in order to facilitate comparisons. Here, we use ggplot's `facet_wrap` function to make multiple maps of median household income by race for Alameda County.
880 | 
881 | ```{r}
882 | # Build one panel per variable, filling tracts by the income estimate
883 | medhhinc_facet_map <- ggplot(data = alco_mhhinc_by_race,
884 |                              mapping = aes(fill = estimate)) +
885 |   facet_wrap(~variable) +
886 |   geom_sf(color = NA) +   # why color=NA?
887 |   scale_fill_viridis_c(option = "magma")
888 | 
889 | # Display the map
890 | medhhinc_facet_map
891 | ```
892 | 
893 | ### Challenge 6
894 | 
895 | > In **Census-Data-in-R-Challenges.Rmd** file, Make a ggplot map of MEDIAN GROSS `RENT` in San Francisco County by tract using data from the ACS 2016---2020 5-year product. *Check Census-Data-in-R-Solutions.Rmd for answers, as needed.*
896 | 
897 | > **Any Questions?**
898 | 
899 | # Interactive Mapping
900 | 
901 | Interactive mapping gives the RStudio environment some of the functionality of desktop GIS.
902 | 
903 | There are a number of R packages that you can use, including:
904 | 
905 | -   [mapview](https://r-spatial.github.io/mapview/): quick interactive exploratory data viewing
906 | 
907 | -   [tmap](https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html): great static and interactive maps
908 | 
909 | -   [Leaflet](https://rstudio.github.io/leaflet/): highly customizable interactive maps
910 | 
911 | *All of these are based on the `Leaflet Javascript Library`.*
912 | 
913 | ### [mapview](https://r-spatial.github.io/mapview/)
914 | 
915 | Let's use `mapview` to make quick interactive maps of the median household income data
916 | 
917 | ```{r}
918 | 
919 | mapview(med_hhincome)  # interactive map of the tracts; no styling options given
920 | 
921 | ```
922 | 
923 | When passed the name of an `sf` object and no other options, `mapview` will:
924 | 
925 | -   display the geometry using a single color for the fill and for the stroke
926 | 
927 | -   display the feature ID `on hover`
928 | 
929 | -   display the data from the dataframe `on click`
930 | 
931 | #### Mapview Thematic Maps
932 | 
933 | The `zcol` argument will take a column name and color the features by the values in that column.
934 | 
935 | ```{r}
936 | 
937 | mapview(med_hhincome, zcol="estimate")  # color the tracts by the `estimate` column
938 | 
939 | ```
940 | 
941 | ### Challenge 7
942 | 
943 | > In the **Census-Data-in-R-Challenges.Rmd**, use `mapview` to create an interactive map of median household rent.
944 | 
945 | ## Determining what ACS Variables to use
946 | 
947 | ACS variables can be confusing.
948 | 
949 | Some ways to identify the best variables to explore:
950 | 
951 | -   Web search, especially Census web resources, can help.
952 | 
953 | -   The Census Reporter website (<https://censusreporter.org>) provides another tool for navigating topics, tables, and variable names.
954 | 
955 | -   The NHGIS website (nhgis.org) is a great way to browse variables of interest.
956 | 
957 | ## ACS Margins of Error (MOE)
958 | 
959 | We haven't talked about it but it may be important in your work with ACS data.
960 | 
961 | Math is needed to combine MOEs when you combine variables.
962 | 
963 | -   `tidycensus` includes some nice [functions](https://walker-data.com/tidycensus/articles/margins-of-error.html) for these calculations and a good overview of the topic.
964 | 
965 | # Summary
966 | 
967 | `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial**.
968 | 
969 | -   The **load_variables** function helps identify the names of census variables of interest.
970 | 
971 | Support for fetching `population estimates` and `migration flow` census data was recently added to tidycensus. You can read up on it on the [tidycensus documentation website](https://walker-data.com/tidycensus/articles/other-datasets.html)
972 | 
973 | Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMO way easier than any alternatives, **IF** you (1) know R, (2) know a bit about working with geographic data in R.
974 | 
975 | This approach is also scalable if you want multiple census variables for various locations and tabulation areas.
976 | 
977 | You can make publication or report ready maps with highly customizable `ggplot2` code or use the `sf::plot` command to make quick maps.
978 | 
979 | Interactive mapping greatly enhances your ability to do exploratory data analysis in RStudio.
980 | 
981 | ### References
982 | 
983 | Much of this tutorial is based on resources by Kyle Walker, author of `tidycensus`. See:
984 | 
985 | -   [tidycensus webpage](https://walker-data.com/tidycensus/index.html)
986 | -   [Analyzing the US Census with R](https://walker-data.com/census-r), an online book.
987 | 
988 | Related D-Lab Workshops
989 | 
990 | -   [R Fundamentals](https://github.com/dlab-berkeley/R-Fundamentals)
991 | -   [Geospatial Data in R, parts 1, 2, & 3](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals)
992 | 
993 | Great online resource for working with spatial data in R
994 | 
995 | -   [Geocomputation with R](https://geocompr.robinlovelace.net/)
996 | 
997 | ------------------------------------------------------------------------
998 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Census Data in R
 2 | 
 3 | This workshop provides an introduction to working with census data in R using the `tidycensus` package.
 4 | 
 5 | ## Description
 6 | 
 7 | Since 1790, the US Census has been THE source of data about American people, providing valuable insights to social scientists and humanists. Mapping these data by census geographies adds more value by allowing researchers to explore spatial trends and outliers. This workshop will introduce `tidycensus`, an important and powerful R package for streamlining census data workflows. Participants will learn how to download census tabular data for one or more geographic tabulation units and/or years, download the associated census geographic data, and use these data for analysis and mapping.
 8 | 
 9 | Specifically, we will:
10 | 
11 | -   Describe the primary Census data products
12 | -   Introduce the R `tidycensus` package for working with Census Data
13 | -   Use that package to fetch decennial and ACS census data
14 | -   Use that package to fetch census geographic boundary files
15 | -   Make maps of census data, symbolizing the color of those maps by the data values
16 | 
17 | ### Knowledge Requirements:
18 | 
19 | R experience equivalent to the D-Lab R Fundamentals workshop series is required to follow along with the tutorial. Basic knowledge of census data and geospatial data will be very helpful.
20 | 
21 | ### Tech Requirements:
22 | 
23 | Bring a laptop with R, RStudio, and the R packages listed below installed.
24 | 
25 | ## R Packages to install
26 | 
27 | Here are the R packages you will need for this workshop:
28 | 
29 | -   `tidyverse`
30 | -   `ggplot2`
31 | -   `sf`
32 | -   `tidycensus`
33 | -   `tigris`
34 | -   `mapview`
35 | -   `leaflet`
36 | 
37 | ## Is R not working on your laptop?
38 | 
39 | If you have a Berkeley CalNet ID, you can run these lessons on UC Berkeley's DataHub by clicking [![DataHub](https://img.shields.io/badge/launch-datahub-blue)](https://datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FCensus-Data-in-R&urlpath=rstudio%2F&branch=master). By using this link, you can save your work and come back to it at any time. When you want to return to your saved work, just go straight to DataHub (<https://datahub.berkeley.edu>), sign in, and click on the `Census-Data-in-R` folder.
40 | 
41 | If you don't have a CalNet ID, you are able to access the workshop by clicking [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Census-Data-in-R/HEAD?urlpath=rstudio). This link takes a moment to load, so patience is required!
42 | 
43 | ## Requesting a Census API key
44 | 
45 | The `tidycensus` package, and any R package that accesses the Census APIs, require you to first get a Census API key. If you don’t have one yet, get one now (it just takes a minute): <https://api.census.gov/data/key_signup.html>
46 | 
47 | ## Going Further with Geospatial Analysis
48 | 
49 | This workshop is mainly about using R to access US Census data in dynamic and efficient ways. We also explore data-centric mapping techniques to explore our queries over space. If geospatial topics interest you and you'd like to study more, we recommend these resources as good starting points:
50 | 
51 | -   [Geospatial Fundamentals in R](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals) - D-Lab workshop focusing on mapping and geospatial analysis in R.
52 | 
53 | -   [Leaflet for R](https://rstudio.github.io/leaflet/) - Well written introduction to mapping with the `leaflet` package in R.
54 | 
55 | -   [Geocomputation with R](https://geocompr.robinlovelace.net/) - Textbook (Lovelace, Nowosad, and Muenchow, 2019) is an excellent resource for getting up and running.
56 | 
57 | -   [R Shiny Tutorials](https://shiny.rstudio.com/tutorial/) - `shiny` is an R package that makes it easy to build interactive web apps straight from R, including maps!
58 | 
59 | ## Contributors
60 | 
61 | -   Patty Frontiera
62 | -   Irene Farah
63 | -   [Avery Richards](https://github.com/Averysaurus)
64 | 
65 | ------------------------------------------------------------------------
66 | 
67 | <div style="display:inline-block;vertical-align:middle;align:left">
68 |     <div style="font-size:larger">D-Lab @ University of California - Berkeley
69 |     </br>
70 |     <a href="https://dlab.berkeley.edu" target="_blank">https://dlab.berkeley.edu</a>
71 |     </br>
72 |     &nbsp;
73 |     </div>
74 | </div>
75 | 


--------------------------------------------------------------------------------
/Solutions/Census-Data-in-R-Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Solutions: R-Census-Data-Challenges'
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r setup, include=FALSE}
  7 | knitr::opts_chunk$set(echo = TRUE)
  8 | ```
  9 | 
 10 | ```{r, libraries}
 11 | 
 12 | # run this chunk if you have not loaded the necessary libraries yet.
 13 | if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
 14 | # (checking the namespace is enough: p_load below is called as pacman::p_load)
 15 | pacman::p_load(
 16 |   here,       # locate files
 17 |   tidyverse,  # data wrangling
 18 |   sf,         # geospatial data management
 19 |   mapview,    # mapping
 20 |   tigris, tidycensus  # census data
 21 | )
 22 | 
 23 | # These six libraries should be loaded in your environment now.
 24 | 
 25 | # Tigris options - used by tidycensus
 26 | # Cache retrieved geographic data locally
 27 | options(tigris_use_cache = TRUE)
 28 | ```
 29 | 
 30 | 
 31 | ```{r, challenge_1A_solution}
 32 | 
 33 | ## Total population in 2010, restricted to California
 34 | get_decennial(
 35 |   geography = "state", variables = "P001001",  # tabulation unit; variable of interest
 36 |   year = 2010,                                 # census year
 37 |   state = "CA")                                # keep only CA
 38 | 
 39 | ```
 40 | 
 41 | 
 42 | ```{r, challenge_1B_solution}
 43 | 
 44 | ## Total population in 2010 for California, Florida and Texas
 45 | get_decennial(
 46 |   geography = "state", variables = "P001001",  # tabulation unit; variable of interest
 47 |   year = 2010,                                 # census year
 48 |   state = c("CA", "FL", "TX"))                 # keep only CA, FL and TX
 49 | 
 50 |      
 51 | ```
 52 | 
 53 | 
 54 | 
 55 | ```{r, challenge_2A_solution}
 56 | 
 57 | ## 2010 population for two California counties: Alameda and San Francisco
 58 | get_decennial(
 59 |   geography = "county",    # census tabulation unit
 60 |   variables = "P001001",   # variable(s) of interest
 61 |   year = 2010,             # census year
 62 |   state = "CA",            # filter by state
 63 |   county = c("Alameda", "San Francisco"))  # filter by county
 64 | 
 65 | ```
 66 | ```{r, challenge_2B_solution}
 67 | 
 68 | ## Total 2010 population for the US as a whole
 69 | get_decennial(
 70 |   geography = "us", variables = "P001001",  # tabulation unit; variable of interest
 71 |   year = 2010)                              # census year
 72 | 
 73 | ```
 74 | 
 75 | ```{r, challenge_2C_solution}
 76 | 
 77 | ## Total 2010 population for each census region
 78 | get_decennial(
 79 |   geography = "region", variables = "P001001",  # tabulation unit; variable of interest
 80 |   year = 2010)                                  # census year
 81 | ```
 82 | 
 83 | ```{Variable Questions}
 84 | # Search vars2010
 85 | ## vars2010 <- load_variables(2010, dataset='sf1', cache=T)
 86 | 
 87 | - `Median Age`: "P013001"
 88 | 
 89 | - `Average Family Size`: "P037001"
 90 | 
 91 | - `Number of occupied housing units` : "H003002"
 92 | ```
 93 | 
 94 | ```{r, challenge_3_solution}
 95 | 
 96 | # Fetch Avg family size by CA County in 2010
 97 | ca_fam_size <- get_decennial(geography = "county",   # census tabulation unit
 98 |                              variables = "P037001",  # variable(s) of interest
 99 |                              state = "CA",           # filter by state, county,
100 |                              year = 2010)            # and year.
101 | 
102 | 
103 | # create a ggplot and keep it in a variable so it can be displayed below
104 | ca_fam_size_plot <- ca_fam_size %>%
105 |   ggplot(aes(x = value, y = reorder(NAME, value))) +
106 |   geom_point()
107 | 
108 | # display the ggplot (previously this line printed the data frame instead)
109 | ca_fam_size_plot
110 | 
111 | ```
112 | 
113 | 
114 | ```{r, challenge_4_solution}
115 | 
116 | # Load 2000 decennial census variables (`TRUE` spelled out; `T` can be reassigned)
117 | vars2000 <- load_variables(year = 2000, dataset = "sf1", cache = TRUE)
118 | 
119 | # Fetch Avg family size by CA County in 2000
120 | ca_fam_size2000 <- get_decennial(geography = "county",
121 |                                  variables = "P033001",
122 |                                  state = "CA",
123 |                                  year = 2000)
124 |   
125 | ```
126 | 
127 | 
128 | ```{r, challenge_5_solution}
129 | 
130 | ## Median age (P013001) by California county in 2010, with county geometry
131 | ca_med_age <- get_decennial(
132 |   geography = "county", variables = "P013001",
133 |   year = 2010,
134 |   state = "CA",
135 |   geometry = TRUE)
136 | 
137 | # map it with plot
138 | plot(ca_med_age["value"])
139 | 
140 | # map it with ggplot - setting CRS to 3310
141 | ggplot(data = ca_med_age,
142 |        mapping = aes(fill = value)) +
143 |   geom_sf(color = NA) +
144 |   coord_sf(crs = 3310) +
145 |   scale_fill_viridis_c(option = "viridis")
146 | 
147 | ```
148 | 
149 | ```{r, challenge_6_solution }
150 | 
151 | # Fetch the data
152 | # Median gross rent (B25064_001) for San Francisco County
153 | # using data from the ACS 2016—2020 5-year product.
154 | sf_medrent <- get_acs(geography = "tract",
155 |                       variables = c(median_rent = "B25064_001"),
156 |                       year = 2020,
157 |                       state = "CA",
158 |                       county = "San Francisco",
159 |                       geometry = TRUE)  # `TRUE`, not `T`, which can be reassigned
160 | 
161 | # Map it with plot - remove rows with NAs
162 | plot(sf_medrent[!is.na(sf_medrent$estimate), ]["estimate"])
163 | 
164 | # Map it with ggplot - remove rows with NAs
165 | sf_medrent %>%
166 |   drop_na(estimate) %>%
167 |   ggplot(aes(fill = estimate)) +
168 |   geom_sf(color = NA) +
169 |   coord_sf(crs = 26910) +  # CRS for Northern CA - UTM 10
170 |   scale_fill_viridis_c(option = "magma")
171 | 
172 | ```
173 | 
174 | 
175 | ```{r, challenge_7_solution }
176 | 
177 | # simple interactive map: no styling options, just the tract geometry
178 | mapview(sf_medrent)
179 | 
180 | # thematic (or choropleth) map of median rent, colored by the `estimate` column
181 | mapview(sf_medrent, zcol='estimate')
182 | 
183 | ```
184 | 
185 | 


--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/.DS_Store


--------------------------------------------------------------------------------
/data/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/.Rapp.history


--------------------------------------------------------------------------------
/data/census2010_vars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census2010_vars.png


--------------------------------------------------------------------------------
/data/census_data_by_prod_geo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_data_by_prod_geo.png


--------------------------------------------------------------------------------
/data/census_geo_hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_geo_hierarchy.png


--------------------------------------------------------------------------------
/data/census_geodata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_geodata.png


--------------------------------------------------------------------------------
/data/census_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_page.png


--------------------------------------------------------------------------------
/data/cenvar_lookup.csv:
--------------------------------------------------------------------------------
 1 | my_cen_var_names,my_cen_vars
 2 | citizenship_totpop,B05001_001E
 3 | citizenship_non_citizen,B05001_006E
 4 | entry_totpop,B05005_001E
 5 | entry_2010,B05005_002E
 6 | entry_2000_2009,B05005_007E
 7 | birthplace_totpop,B05007_001E
 8 | birthplace_europ,B05007_014E
 9 | birthplace_asian,B05007_027E
10 | birthplace_latinAmerica,B05007_040E
11 | birthplace_southAmerica,B05007_081E
12 | birthplace_other_nonUSA,B05007_094E
13 | birthplace_byage_totpop,B06001_001E
14 | birthplace_byage_fborn,B06001_049E
15 | poverty_totpop,B06012_001E
16 | below_pov,B06012_002E
17 | below_pov2,B06012_003E
18 | poverty_fborn_totpop,B06012_017E
19 | below_pov_fborn,B06012_018E
20 | below_pov2_fborn,B06012_019E
21 | health_native_totpop,B27020_002E
22 | health_native_noinsurance,B27020_006E
23 | health_fborn_nat_totpop,B27020_008E
24 | fborn_nohealth_naturalized,B27020_012E
25 | health_fborn_noncit_totpop,B27020_013E
26 | fborn_nohealth_noncitizen,B27020_017E


--------------------------------------------------------------------------------
/data/mapview_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/mapview_example.png


--------------------------------------------------------------------------------
/data/request_api_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/request_api_key.png


--------------------------------------------------------------------------------
/data/swd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/swd.png


--------------------------------------------------------------------------------
/data/tidycensus_articles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/tidycensus_articles.png


--------------------------------------------------------------------------------
/data_out/readme_data_out.txt:
--------------------------------------------------------------------------------
1 | Folder for output from tutorial
2 | 


--------------------------------------------------------------------------------
/install.R:
--------------------------------------------------------------------------------
1 | # Install the packages listed for the workshop.
2 | install.packages(c(
3 |   "tidyverse", "tidycensus",
4 |   "sf", "mapview",
5 |   "tigris", "here"
6 | ))
7 | 


--------------------------------------------------------------------------------
/previous_versions/Rcensus_data_maps-slides.Rmd:
--------------------------------------------------------------------------------
   1 | ---
   2 | title: "Census Data Wrangling and Mapping in R"
   3 | author: "Patty Frontiera"
   4 | date: "03/21/2019"
   5 | output: 
   6 |   ioslides_presentation
   7 | editor_options: 
   8 |   chunk_output_type: console
   9 | ---
  10 | 
  11 | ```{r setup, include=FALSE}
  12 | knitr::opts_chunk$set(echo = TRUE)
  13 | 
  14 | ```
  15 | 
  16 | # Getting Started
  17 | 
  18 | ## Setup
  19 | 
  20 | Welcome! While we're waiting:
  21 | 
  22 | * **Clone or download** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop)
  23 |     - If you downloaded the zipfile, **unzip it**.
  24 |     - Make a note of the folder in which the files reside. 
  25 |     
  26 |     
  27 | * Open **RStudio**
  28 | 
  29 | * Open a new **R script** file
  30 | 
  31 | ## Introduction
  32 | 
  33 | - About me
  34 | 
  35 | - About you
  36 |     - Your familiarity with US Census data
  37 |     - with geospatial data
  38 |     - with geospatial data in R
  39 | 
  40 | ## Outline
  41 | 
  42 | - Describe primary Census data products
  43 | 
  44 | - Introduce R packages for working with Census Data
  45 | 
  46 | - Use those packages to fetch census data
  47 | 
  48 | - Use those packages to fetch census data plus census geographic boundary files
  49 | 
  50 | - Make maps of census data
  51 | 
  52 | # Census Data Overview
  53 | 
  54 | ## US Census Data
  55 | 
  56 | The "nation's leading provider of quality data about its people and economy."
  57 | 
  58 | <img src="data/census_page.png" width="700px"></img>
  59 | 
  60 | Available at [www.census.gov](https://www.census.gov)
  61 | 
  62 | ## Primary Census Products
  63 | 
  64 | - Decennial Census
  65 | 
  66 | - American Community Survey (ACS)
  67 | 
  68 | ##  Decennial Census
  69 | 
  70 | Complete count of the population every 10 years since `1790` 
  71 | 
  72 | Includes data on 
  73 | 
  74 | - population, by age & race/ethnicity 
  75 | 
  76 | - housing, by occupancy & tenure (owned, rented)
  77 | 
  78 | ## American Community Survey (ACS)
  79 | 
  80 | - Annual survey of a sample of about 3 million households
  81 | 
  82 | - Provides estimates of demographic, social, economic & housing characteristics
  83 | 
  84 | - Includes margin of error values for the estimates.
  85 | 
  86 |  
  87 | ## Decennial Census* vs ACS Data
  88 | | Demographic*    | Social             | Economic          | Housing           |
  89 | |-----------------|--------------------|-------------------|-------------------|
  90 | | Sex             | Families           | Income            | Tenure*           |
  91 | | Age             | Education          | Benefits          | Occupancy*        |
  92 | | Race            | Marital Status     | Employment Status | Structure Type    |
  93 | | Hispanic Origin | Fertility          | Occupation        | Housing Value     |
  94 | |                 | Grandparents       | Industry          | Taxes & Insurance |
  95 | |                 | Veterans           | Commuting         | Utilities         |
  96 | |                 | Disability Status  | Place of Work     | Mortgage          |
  97 | |                 | Language at Home   | Health Insurance  | Monthly Rent      |
  98 | |                 | Citizenship        |                   |                   |
  99 | |                 | Mobility           |                   |                   |
 100 | 
 101 | 
 102 | 
 103 | ## Census Geographies
 104 | 
 105 | Census data are publicly available at one or more levels of geographic aggregation.
 106 | 
 107 | <img src="data/census_geo_hierarchy.png" height="400px"></img>
 108 | 
 109 | ## Census Data & Census Geographies
 110 | 
 111 | <img src="data/census_data_by_prod_geo.png" width="800px"></img>
 112 | 
 113 | ## ACS 5 Year Dataset RECOMMENDED
 114 | 
 115 | ACS 1 year and 5 year products are currently available 
 116 | 
 117 | - ACS 3 year no longer available
 118 | 
 119 | ACS 5 year data provides much better estimates, with lower margins of error
 120 | 
 121 | More data available for ACS 5 Year product
 122 | 
 123 | 
 124 | ## Census Data Workflow
 125 | 
 126 | Identify your 
 127 | 
 128 | - topic of interest
 129 | - year(s)
 130 | - geographic level of detail
 131 | - for what locations?
 132 | 
 133 | Then determine what specific tables and variables
 134 | are available - ACS or Decennial?
 135 | 
 136 | ## CAUTION
 137 | 
 138 | "If you want to measure change you can't change the measures!"
 139 | 
 140 | **Census tables, variables, geographies, and geographic boundaries change over time!**
 141 | 
 142 | Measuring change over time with census data is *its own thing*, complex and not covered by this workshop!
 143 | 
 144 | # R Packages
 145 | 
 146 | ## Packages for Working with Census Data
 147 | 
 148 | These are the ones we recommend and will use today.
 149 | 
 150 | - [tidycensus](https://walkerke.github.io/tidycensus) & [tigris](https://github.com/walkerke/tigris)
 151 | 
 152 | - [tidyverse](https://www.tidyverse.org/)
 153 |    
 154 | - [sf](https://r-spatial.github.io/sf/)
 155 | 
 156 | 
 157 | # tidycensus & tigris
 158 | 
 159 | ## [tidycensus](https://walkerke.github.io/tidycensus)
 160 | 
 161 | Functions for accessing census decennial and ACS 5 year datasets via Census APIs
 162 | 
 163 | - only a subset of datasets / years available
 164 | - requires a `Census API key` 
 165 | 
 166 | ## [tidycensus](https://walkerke.github.io/tidycensus)
 167 | 
 168 | Limited set of years available via `tidycensus`
 169 | 
 170 | - decennial census: 1990, 2000, and 2010
 171 | - ACS 5 yr: 2006-2010 through 2014-2018 are available. 
 172 | - Note: tidycensus refers to ACS 5year datasets by the endyear.
 173 | - Need to check availability of latest census data releases in `tidycensus`
 174 | 
 175 | ##  [tigris](https://github.com/walkerke/tigris)
 176 | 
 177 | Provides access to census geographic data files
 178 | 
 179 | - detailed TIGER/Line boundary files (e.g., shapefiles), or
 180 | - simplified Cartographic boundary files
 181 |     
 182 | Also provides access to census `feature data`,
 183 | 
 184 | - eg, rivers, roads, coastlands, landmarks, and more
 185 |     
 186 | 
 187 | Used by `tidycensus` to access state, county, tract, block group, block, and ZCTA boundaries.
 188 | 
 189 | - Use `tigris` directly to access other census geographic data.
 190 |     
 191 | ## tidycensus & tigris
 192 | 
 193 | Packages developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census websites and APIs in **R** and get that data in a useable format to analyze, plot, and map.
 194 | 
 195 | Check out his website to keep abreast of his great packages, blog posts, and tutorials.
 196 | 
 197 | - http://personal.tcu.edu/kylewalker/
 198 | 
 199 | - https://walkerke.github.io/
 200 | 
 201 | Walker also developed a new [DataCamp](https://www.datacamp.com) course: [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r)
 202 | 
 203 | - Highly recommended! First chapter free!
 204 | 
 205 | 
 206 | ## [tidyverse](https://www.tidyverse.org/)
 207 | 
 208 | A collection of R Packages for data science
 209 | - developed primarily by [Hadley Wickham](http://hadley.nz/), Chief Scientist at [RStudio](https://www.rstudio.com/).
 210 | 
 211 |  - `dplyr` and `tidyr` for reshaping data
 212 | 
 213 |  - `ggplot2` for plotting
 214 | 
 215 |  - `purrr`, `readr` and `tibble` for improved performance
 216 | 
 217 | These packages are used by `tidyverse` under the hood.
 218 | 
 219 | ## [sf](https://r-spatial.github.io/sf/)
 220 | 
 221 | Simple features for geospatial data objects and methods.
 222 | 
 223 | - Next generation R package for working with vector geospatial data
 224 |     - superseding the `sp` package
 225 |     
 226 | `sf` includes the functionality of the `sp`, `rgdal`, `rgeos` and `proj4` packages.
 227 | 
 228 | - but with improved performance, simplified command syntax, and easier workflows.
 229 | 
 230 | ## Alternatives to Accessing Census Data in R
 231 | 
 232 | You can write code to access the [Census APIs](https://www.census.gov/data/developers/data-sets.html) directly.
 233 | 
 234 | You can download Census data directly from:
 235 | 
 236 | - [American Factfinder](https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml) or 
 237 | - [NHGIS.org](https://www.nhgis.org/)
 238 | - [Social Explorer](https://www.socialexplorer.com/)
 239 |     - Subscription service but FREE for UCB community
 240 | 
 241 | You can download Census `geographic data` directly on the [census website](https://www.census.gov/geo/maps-data/)
 242 | 
 243 | 
 244 | # Tutorial Time!
 245 | 
 246 | ## Part 1
 247 | 
 248 | We will work through several exercises using `tidycensus` to fetch, wrangle and map census data.
 249 | 
 250 | ## Loading packages
 251 | 
 252 | Load the packages we will use today
 253 | 
 254 | ```{r, message=FALSE, warning=FALSE}
 255 | library(tidycensus)
 256 | library(tidyverse) 
 257 | library(tigris)
 258 | library(sf)
 259 | ```
 260 | 
 261 | If you are getting errors, try loading `dplyr` with `library(dplyr)` or reinstalling the `dplyr` package, as that has worked for some.
 262 | 
 263 | ## Install any packages that you do not have on your computer
 264 | 
 265 | Also install any dependencies.
 266 | 
 267 | ```{r, eval=FALSE}
 268 | # install.packages("tidyverse")
 269 | # install.packages("tidycensus")
 270 | # install.packages("sf")
 271 | ```
 272 | 
 273 | 
 274 | ## Census API Key
 275 | 
 276 | You need a census API key to programmatically fetch census data.
 277 | 
 278 | Get it here (pretty quick):
 279 | 
 280 | * (https://api.census.gov/data/key_signup.html)
 281 | 
 282 | For more info see:
 283 | 
 284 | * https://www.census.gov/data/developers/data-sets.html
 285 | 
 286 | ## Install your Census API Key
 287 | 
 288 | Use the tidycensus function **census_api_key** to make tidycensus use your key when it fetches data from the census.
 289 | 
 290 | ```{r, eval=F}
 291 | # Install your census api key - a long alphanumeric string, passed in quotes
 292 | census_api_key("THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS")
 293 | ```
 294 | 
 295 | ## Set working directory
 296 | 
 297 | Be sure to **clone or download & unzip** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop)
 298 | 
 299 | Then, set your working directory to this folder, e.g.,
 300 | 
 301 | * `setwd("~/Documents/Dlab/workshops/2019/rCensus_workshop")`
 302 | 
 303 | <img src="./data/swd.png" width="600px"></img>
 304 | 
 305 | # Fetching Decennial Census Data
 306 | 
 307 | ## Population Data
 308 | 
 309 | Let's start by fetching **population data** from the 2010 Census **for all states**
 310 | 
 311 | In order to fetch census data you need to identify the census **variables** that contain the data of interest.
 312 | 
 313 | ## Topics, Tables & Variables
 314 | 
 315 | Census data **variables** are organized in **tables**
 316 | 
 317 | Which are organized by **topic** or concept.
 318 | 
 319 | The tidycensus **load_variables** function can help with this step.
 320 | 
 321 | First, take a look at the function documentation.
 322 | ```{r, eval=F}
 323 | ?load_variables
 324 | ```
 325 | 
 326 | ## load_variables
 327 | 
 328 | Use `load_variables` to fetch all variables used in the 2010 census into a dataframe.
 329 | ```{r}
 330 | vars2010 <- load_variables(year=2010,        # Year or end year for ACS
 331 |                            dataset = 'sf1',  # 'sf1' for decennial or 'acs5'
 332 |                            cache = TRUE)     # Whether to save fetched data locally
 333 | ```
 334 | 
 335 | ## Decennial Census Variables
 336 | 
 337 | Let's take a look at and discuss the resultant dataframe.
 338 | 
 339 | - How many 2010 census variables are in the dataframe?
 340 | ```{r, eval=F}
 341 | View(vars2010)
 342 | ```
 343 | 
 344 | ## 2010 Decennial Census Tables
 345 | 
 346 | - Variables: 3,346
 347 | 
 348 | - Topics: Population, housing
 349 | 
 350 | - Tables: currently `333` - *that's a lot*!
 351 |     - 177 population tables (identified with a ‘‘P’’) available to the block level 
 352 |     - 58 housing tables (identified with an ‘‘H’’) available to the block level
 353 |     - 82 population tables (identified with a ‘‘PCT’’) available to the census tract level
 354 |     - 4 housing tables (identified with an “HCT”) available to the census tract level
 355 |     - 10 population tables (identified with a “PCO”) available to the county level 
 356 |     - plus 2 additional PCT tables
 357 | 
 358 | ## What Variable has the 2010 Total Population value? 
 359 | 
 360 | We can sort and filter the vars2010 dataframe to find it.
 361 | 
 362 | <img src="./data/census2010_vars.png" height="500px"></img>
 363 | 
 364 | ## get_decennial
 365 | 
 366 | We can use the tidycensus function **get_decennial** to fetch the 2010 census data for total population by state.
 367 | 
 368 | First, check the documentation for the function.
 369 | ```{r, eval=F}
 370 | ?get_decennial
 371 | ```
 372 | 
 373 | ## get_decennial
 374 | 
 375 | Fetch total population by state (**P001001**) from the 2010 census using `get_decennial`.
 376 | 
 377 | ```{r}
 378 | 
 379 | pop2010 <- get_decennial(geography = "state",   # census tabulation unit
 380 |                          variables = "P001001", # variable(s) of interest
 381 |                          year = 2010)           # census year
 382 |           
 383 | ```
 384 | 
 385 | ## View the Data
 386 | 
 387 | - How many rows and columns? 
 388 | 
 389 | - Do you see the expected number of states?
 390 | 
 391 | - What column contains the population counts?
 392 | 
 393 | - Do the data values seem to be right?
 394 | ```{r}
 395 | #pop2010
 396 | ```
 397 | 
 398 | ## Visualize results
 399 | 
 400 | We can visualize the data to get a quick overview of the distribution of data values.
 401 | 
 402 | It's a first step in exploratory data analysis and a last step in data communication.
 403 | 
 404 | `ggplot2` is the most commonly used R package for data visualization. 
 405 | 
 406 | - It is loaded when you load the `tidyverse` package.
 407 | 
 408 | Let's use it to visualize the population data.
 409 | 
 410 | ## Plot 2010 Population by state
 411 | 
 412 | Use `ggplot2` to create an ordered horizontal bar chart.
 413 | ```{r}
 414 | pop_plot<- ggplot(data=pop2010, aes(x=reorder(NAME,value), y=value/1000000)) + 
 415 |   geom_bar(stat="identity") + coord_flip() +
 416 |   theme_minimal() + 
 417 |   labs(title = "2010 US Population by State") +
 418 |   xlab("State") +
 419 |   ylab("in millions")
 420 | ```
 421 | 
 422 | ## Display the plot
 423 | 
 424 | ```{r, echo=F}
 425 | pop_plot
 426 | ```
 427 | 
 428 | ## Challenge
 429 | 
 430 | Fetch population data by state for 2000.
 431 | 
 432 | *Don't assume variable names are the same across years.* Check first!
 433 | 
 434 | ## Challenge Solution
 435 | 
 436 | Total Population in 2000
 437 | 
 438 | ```{r, eval = F, code_folding = "hide"}
 439 | # What is the variable name in 2000? (cache = TRUE keeps a local copy for reuse)
 440 | vars2000 <- load_variables(year = 2000, dataset = 'sf1', cache = TRUE)
 441 | 
 442 | # Take a look and search in the dataframe
 443 | View(vars2000)
 444 | 
 445 | # Fetch the 2000 pop data
 446 | pop2000 <- get_decennial(geography = "state", variables = "P001001", year = 2000)
 447 | 
 448 | # Take a look (plot if time)
 449 | pop2000
 450 | ```
 451 | 
 452 | ## Limiting by Area of Interest
 453 | 
 454 | In the previous example we retrieved population data for all states.
 455 | 
 456 | - This is the default behavior if you don't specify a subset.
 457 | 
 458 | - But you can limit the data to be retrieved by subunits like state.
 459 | 
 460 | ## Limit Areas of Interest
 461 | 
 462 | Let's fetch data for just 3 states.
 463 | 
 464 | ```{r}
 465 | state_pop2010 <- get_decennial(geography = "state", # census tabulation unit
 466 |                          variables = "P001001",     # variables of interest
 467 |                          year = 2010,               # census year
 468 |                          state=c("CA","OR","WA"))   # Filter by states of interest
 469 | 
 470 | ```
 471 | 
 472 | *Note we are referencing states by their abbreviation.*
 473 | 
 474 | ## View Results
 475 | ```{r}
 476 | state_pop2010
 477 | ```
 478 | 
 479 | ## Changing Census Tabulation unit
 480 | 
 481 | `get_decennial` accepts a number of different values for **tabulation unit**.
 482 | 
 483 | - Options include: `state`, `county`, `tract`, `block group`, `block`, and `ZCTA`.
 484 | 
 485 | Let's change the tabulation unit from `state` to `county`.
 486 | ```{r}
 487 | co_pop2010 <- get_decennial(geography = "county",   # census tabulation unit
 488 |                             variables = "P001001",  # variables of interest
 489 |                             year = 2010)
 490 | ```
 491 | 
 492 | ## Changing Census Tabulation unit
 493 | 
 494 | View the county data to see what was retrieved.
 495 | ```{r}
 496 | co_pop2010
 497 | ```
 498 | 
 499 | ## Challenge 
 500 | 
 501 | * Fetch population by **county** for just California
 502 | 
 503 | * Fetch population by **county** for Oregon & California
 504 | 
 505 | *Try it before you look ahead at solutions.*
 506 | 
 507 | ## Challenge Solution
 508 | ```{r}
 509 | ## Fetch population by **county** for just California
 510 | co_pop2010_ca <- get_decennial(geography = "county",   # census tabulation unit
 511 |                             variables = "P001001",  # variables of interest
 512 |                             year = 2010,
 513 |                             state=c('CA'))
 514 | #co_pop2010_ca
 515 | 
 516 | ## Fetch population by **county** for Oregon & California
 517 | co_pop2010_caor <- get_decennial(geography = "county",   # census tabulation unit
 518 |                                variables = "P001001",  # variables of interest
 519 |                                year = 2010,
 520 |                                state=c('CA','OR'))
 521 | co_pop2010_caor
 522 | 
 523 | ```
 524 | 
 525 | ## Challenge
 526 | 
 527 | * Fetch population by **tract** for all states.
 528 | 
 529 | * Fetch population by **tract** for California.
 530 | 
 531 | ## Challenge Solution
 532 | ```{r, eval=F}
 533 | ## Fetch population by **tract** for California.
 534 | cal_pop2010_tracts <- get_decennial(geography = "tract",   # census tabulation unit
 535 |                                  variables = "P001001",  # variables of interest
 536 |                                  year = 2010,
 537 |                                  state=c('CA'))
 538 | cal_pop2010_tracts
 539 | 
 540 | 
 541 | ## Fetch population by **tract** for all states.
 542 | pop2010_tracts <- get_decennial(geography = "tract",   # census tabulation unit
 543 |                                     variables = "P001001",  # variables of interest
 544 |                                     year = 2010)
 545 | 
 546 | pop2010_tracts  ## DOES THIS WORK?
 547 | ```
 548 | 
 549 | ## Fetching Census Tract Data
 550 | 
 551 | If you want census data at the tract level or below you **must** specify the state & county or counties.
 552 | ```{r,}
 553 | tract_pop2010 <- get_decennial(geography = "tract",   # census tabulation unit
 554 |                          variables = "P001001",       # variable of interest
 555 |                          year = 2010,                 # census year
 556 |                          state="CA",                  # limit to state of California
 557 |                          county=c("Alameda","Contra Costa"))  # and only these counties
 558 | ```
 559 | 
 560 | ## Fetching Census Tract Data
 561 | 
 562 | View the results! How many census tracts are in these 2 counties?
 563 | 
 564 | ```{r}
 565 | tract_pop2010
 566 | ```
 567 | 
 568 | ## Challenge
 569 | 
 570 | 1. Fetch population by **county** for Alameda County, California
 571 | 
 572 | 2. Fetch population by **tract** for the nine county Bay Area:
 573 | - Alameda, SF, Contra Costa, Marin County, Napa, 
 574 | - San Mateo, Santa Clara, Solano, Sonoma
 575 | 
 576 | Note: You can use names, abbreviations or FIPS codes for your `state` and `county`.
 577 | 
 578 | ```{r}
 579 | # County FIPS codes for the nine Bay Area counties, in vector order:
 580 | # Alameda, SF, Contra Costa, Marin County, Napa,
 581 | # San Mateo, Santa Clara, Solano, Sonoma
 582 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 583 | ```
 584 | 
 585 | ## Challenge Solution
 586 | 
 587 | ```{r}
 588 | #  population by **county** for Alameda County, California
 589 | alco_pop2010 <- get_decennial(geography = "county",   # census tabulation unit
 590 |                                  variables = "P001001",  # variables of interest
 591 |                                  year = 2010,
 592 |                                  state=c('CA'),
 593 |                                  county=c('Alameda County'))
 594 | #alco_pop2010
 595 | 
 596 | ```
 597 | 
 598 | ## Challenge Solution
 599 | 
 600 | Fetch population by **tract** for the nine county Bay Area
 601 | ```{r}
 602 | # County FIPS codes for the nine Bay Area counties, in vector order:
 603 | # Alameda, SF, Contra Costa, Marin County, Napa,
 604 | # San Mateo, Santa Clara, Solano, Sonoma
 605 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 606 | 
 607 | bayarea_pop2010_tract <- get_decennial(geography = "tract",   # census tabulation unit
 608 |                          variables = "P001001",       # variable of interest
 609 |                          year = 2010,                 # census year
 610 |                          state="CA",                  # limit to state of California
 611 |                          county=nine_counties)  # and only these counties
 612 | #bayarea_pop2010_tract
 613 | ```
 614 | 
 615 | 
 616 | ## RECAP & QUESTIONS
 617 | 
 618 | Fetch population by **tract** for the nine county Bay Area
 619 | ```{r, eval=F}
 620 | # County FIPS codes for the nine Bay Area counties, in vector order:
 621 | # Alameda, SF, Contra Costa, Marin County, Napa,
 622 | # San Mateo, Santa Clara, Solano, Sonoma
 623 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 624 | 
 625 | bayarea_pop2010 <- get_decennial(geography = "tract",   # census tabulation unit
 626 |                       variables = "P001001",            # variable of interest
 627 |                       year = 2010,                      # census year
 628 |                       state="CA",                       # limit to state of California
 629 |                      county=nine_counties)             # and only these counties
 630 | 
 631 | # View the data
 632 | bayarea_pop2010
 633 | ```
 634 | 
 635 | 
 636 | 
 637 | ## Fetching data for more than one census variable
 638 | 
 639 | What **three** things are new here?
 640 | ```{r}
 641 | #urban rural pop for 3 counties
 642 | ur_pop10 <- get_decennial(geography = "county",  # census tabulation unit
 643 |                            variables = c(urban="P002002",rural="P002005"),
 644 |                            year = 2010, 
 645 |                            summary_var = "P002001",  # The denominator
 646 |                            state='CA',
 647 |                            county=c("Napa","Sonoma","Mendocino"))
 648 | 
 649 | ```
 650 | 
 651 | ## Fetching data for more than one census variable
 652 | 
 653 | 1. You can specify more than one variable:
 654 | ```
 655 | variables = c("P002002","P002005")
 656 | ```
 657 | 
 658 | 2. You can rename the values in the output 'variable' column.
 659 | ```
 660 | variables = c(urban="P002002",rural="P002005")
 661 | ```
 662 | 
 663 | 3. You can identify a `summary_var` (a denominator - here, the total count of all people or households surveyed. Can be used for calculations like percent of total.) 
 664 | ```
 665 | summary_var = "P002001"
 666 | ```
 667 | 
 668 | ## Take a look at the results
 669 | ```{r}
 670 | ur_pop10
 671 | ```
 672 | 
 673 | ## Calculating Percents
 674 | 
 675 | The `summary_value` column comes in handy when you want to compute percent of total.
 676 | 
 677 | Here's one way to do it.
 678 | ```{r}
 679 | # Calculate the percent of population that is Urban or Rural
 680 | ur_pop10 <- ur_pop10 %>%
 681 |             mutate(pct = 100 * (value / summary_value))
 682 | 
 683 | ```
 684 | 
 685 | ## Calculating Percents
 686 | 
 687 | Let's take a look at the output
 688 | ```{r}
 689 | ur_pop10 # Take a look
 690 | ```
 691 | 
 692 | ## Plot it
 693 | 
 694 | Plots give us compact visual summaries of the data
 695 | ```{r}
 696 | myplot <- ggplot(data = ur_pop10, 
 697 |           mapping = aes(x = NAME, fill = variable, 
 698 |                      y = ifelse(test = variable == "urban", 
 699 |                                 yes = -pct, no = pct))) +
 700 |           geom_bar(stat = "identity") +
 701 |           scale_y_continuous(labels = abs, limits=c(-100,100)) +
 702 |           labs(title="Urban & Rural Population in Wine Country", 
 703 |                x="County", y = " Percent of Population", fill="") +
 704 |           coord_flip()
 705 | ```
 706 | *Don't worry if you don't get all the ggplot code now. It's here for reference.*
 707 | 
 708 | ## Plot it
 709 | ```{r}
 710 | myplot
 711 | ```
 712 | 
 713 | ## Fetch all the data in one table
 714 | 
 715 | This is often helpful **but** you need to keep track of the meaning of each variable.
 716 | ```{r}
 717 | alco_pop10 <- get_decennial(geography = "tract", # Census tabulation unit
 718 |                            table =  "P002",      # Table of urban & rural population counts
 719 |                            year = 2010,          # Decennial census year
 720 |                            state='CA',           # Filter state
 721 |                            county="Alameda")     # Filter county
 722 | 
 723 | ```
 724 | 
 725 | ## Take a look
 726 | ```{r}
 727 | unique(alco_pop10$variable) # What and how many unique vars in table?
 728 | 
 729 | head(alco_pop10,3)  # Take a look at output
 730 | ```
 731 | 
 732 | 
 733 | ## Output options
 734 | 
 735 | Let's try all three of these commands and then look at the output to see what's different.
 736 | 
 737 | ```{r, eval=F}
 738 | get_decennial(geography = "state", variables = "P001001",
 739 |               year = 2010)
 740 | 
 741 | get_decennial(geography = "state", variables = c(pop10="P001001"),
 742 |               year = 2010)
 743 | 
 744 | get_decennial(geography = "state", variables = c(pop10="P001001"),
 745 |               year = 2010, output="wide")
 746 | ```
 747 | 
 748 | ## Output options
 749 | 
 750 | ```{r}
 751 | head(get_decennial(geography = "state", variables = "P001001",
 752 |                    year = 2010), 2)
 753 | head(get_decennial(geography = "state", variables = c(pop10="P001001"),
 754 |                    year = 2010), 2)
 755 | head(get_decennial(geography = "state", variables = c(pop10="P001001"),
 756 |                    year = 2010, output="wide"), 2)
 757 | ```
 758 | 
 759 | 
 760 | ## Data Wrangling
 761 | 
 762 | Your R skills can help you reformat the data and make it more useable.
 763 | 
 764 | Let's fetch population data for 2010 & 2000 by state with **output=wide**.
 765 | 
 766 | - We will label the variables **pop00** and **pop10**.
 767 | 
 768 | Then we will combine these into one data frame.
 769 | 
 770 | ## Data Wrangling
 771 | 
 772 | Fetch pop by state from both the 2000 and 2010 census
 773 | ```{r}
 774 | pop2000 <- get_decennial(geography = "state",
 775 |                          variables = c(pop00="P001001"), 
 776 |                          year = 2000, output="wide")
 777 | 
 778 | pop2010 <- get_decennial(geography = "state",
 779 |                          variables = c(pop10="P001001"), 
 780 |                          year = 2010, output="wide")
 781 | 
 782 | ```
 783 | 
 784 | ## Merge population by state from both censuses
 785 | 
 786 | Save in a new dataframe with both columns
 787 | ```{r}
 788 | pop2000_2010 <- pop2000 %>% merge(pop2010, by="NAME") %>%
 789 |                              select(NAME, pop00, pop10)
 790 | 
 791 | head(pop2000_2010,3)
 792 | ```
 793 | 
 794 | ## Save the data
 795 | 
 796 | Use `write.csv` to save a data frame to a `CSV` file.
 797 | 
 798 | ```{r, eval=F}
 799 | write.csv(pop2000_2010, file="pop2000_2010.csv", row.names = FALSE)
 800 | ```
 801 | 
 802 | # TIME FOR QUESTIONS
 803 | 
 804 | 
 805 | # Part 2. Mapping
 806 | 
 807 | 
 808 | ## Mapping Census Data with `tidycensus`
 809 | 
 810 | You can fetch geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions
 811 | 
 812 | - Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs.
 813 | 
 814 | - Only a subset of data available via `tigris` can be accessed via `tidycensus`.
 815 | 
 816 | You can then use common mapping options like `plot`, `ggplot` and `tmap` to make maps.
 817 | 
 818 | ## Geometry Options
 819 | 
 820 | Before fetching geometry, we need to specify a few `tigris` options
 821 | 
 822 | - Set the `class` of returned data to be `sf` objects (not `sp`, the default)
 823 | 
 824 | - Set `tigris_use_cache` to TRUE
 825 | 
 826 | ```{r}
 827 | # Tigris options - used by tidycensus
 828 | options(tigris_class = "sf")      # SP is the default format returned by tigris
 829 | options(tigris_use_cache = TRUE)  # Save retrieved data locally
 830 | 
 831 | ```
 832 | 
 833 | Caching the data is important because it speeds things up if you often fetch census data for the same geographies over and over again.
 834 | 
 835 | ## tigris cache directory
 836 | 
 837 | You may want to use the geographic data downloaded by tigris in other applications.
 838 | 
 839 | To do this, you need to know where the files are saved locally.
 840 | 
 841 | You can also specify where tigris should save cached data.
 842 | ```{r, eval=F}
 843 | # Check the location of the tigris cached data
 844 | Sys.getenv('TIGRIS_CACHE_DIR') 
 845 | 
 846 | # Set it
 847 | tigris_cache_dir("~/Documents/gis_data/census")  # Folder for local data
 848 | 
 849 | # Check it again
 850 | Sys.getenv('TIGRIS_CACHE_DIR') 
 851 | ```
 852 | 
 853 | ## Fetch geographic boundary data with tidycensus
 854 | 
 855 | We fetch the geospatial data by setting **geometry=TRUE**.
 856 | 
 857 | ```{r}
 858 | pop2010geo <- get_decennial(geography = "state", 
 859 |                           variables = c(pop10="P001001"), 
 860 |                           year = 2010, 
 861 |                           output="wide", 
 862 |                           geometry=TRUE) # Fetch geometry with the data for mapping
 863 |  
 864 | ```
 865 | 
 866 | ## Take a look
 867 | 
 868 | Let's take a minute to discuss the format of an `sf` spatial object.
 869 | ```{r}
 870 | pop2010geo
 871 | ```
 872 | 
 873 | 
 874 | ## Geospatial Data in R
 875 | 
 876 | R `sf` objects include
 877 | 
 878 | - a dataframe with a geometry column named `geometry`
 879 | 
 880 |     - The geometry can be of type POINT, LINESTRING, POLYGON
 881 |     - or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON
 882 | 
 883 | - a `CRS` (coordinate reference system), specified by
 884 |     - epsg(SRID) code
 885 |     - proj4string
 886 |     
 887 | For a deeper understanding of the `sf` package and its functionality, we recommend our [Geospatial-Fundamentals-in-R-with-sf](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-R-with-sf) workshop.
 888 |     
 889 | ## Census Data Coordinate Reference System (CRS)
 890 | 
 891 | All census geographic data use the `NAD83` CRS, or coordinate reference system. `NAD83` stands for North American Datum of 1983. The geographic coordinates are longitude and latitude values encoded as decimal degrees.
 892 | 
 893 | `WGS84`, or [The World Geodetic System of 1984](https://en.wikipedia.org/wiki/World_Geodetic_System) is the most commonly used geographic CRS. The difference between points in these systems varies up to 1 meter in continental US.
 894 | 
 895 | Many geospatial operations require that you transform data to a common CRS before conducting spatial analysis or mapping.
 896 | 
 897 | An in-depth discussion of CRSs is outside the scope of this workshop. See [Geocomputation in R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information.
 898 | 
 899 | ## Mapping sf Spatial Objects
 900 | 
 901 | We can use `plot` to make a quick map of the geometry stored in an `sf` spatial object.
 902 | 
 903 | ```{r}
 904 | plot(pop2010geo$geometry)
 905 | ```
 906 | 
 907 | ## Question
 908 | 
 909 | What do you get if you plot the `sf` object without specifying `$geometry`?
 910 | 
 911 | 
 912 | ## The Challenge of US maps
 913 | 
 914 | The vast geographic extent and non-contiguous nature of the USA make it difficult to map.
 915 | 
 916 | ```{r, echo=F}
 917 | plot(pop2010geo$geometry) #view again
 918 | ```
 919 | 
 920 | ## Fetch geographic data with tidycensus, SHIFTED
 921 | 
 922 | tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas.
 923 | ```{r}
 924 | 
 925 | # Same state-level request, with Alaska and Hawaii repositioned for mapping
 926 | pop2010geo_shifted <- get_decennial(
 927 |   geography = "state",
 928 |   variables = c(pop10 = "P001001"),
 929 |   output = "wide", year = 2010,
 930 |   geometry = TRUE, shift_geo = TRUE)
 931 | 
 932 | ```
 933 | 
 934 | ## Shift Happens!
 935 | ```{r}
 936 | plot(pop2010geo_shifted$geometry)
 937 | ```
 938 | 
 939 | ## Save it
 940 | 
 941 | You can save `sf` data to a shapefile using `st_write`
 942 | 
 943 | ```{r, eval=F}
 944 | st_write(pop2010geo_shifted,"usa_2010_shifted.shp")
 945 | ```
 946 | 
 947 | ## Check your TIGRIS_CACHE_DIR to see it
 948 | 
 949 | ```{r, eval=F}
 950 | my_cache_dir <- Sys.getenv('TIGRIS_CACHE_DIR') 
 951 | 
 952 | dir(my_cache_dir) # What files are stored there?
 953 | ```
 954 | 
 955 | ## Mapping Data Values
 956 | 
 957 | ```{r}
 958 | plot(pop2010geo_shifted['pop10'])
 959 | ```
 960 | 
 961 | ## ggplot2 Maps
 962 | 
 963 | ```{r}
 964 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 
 965 |   geom_sf()
 966 | ```
 967 | 
 968 | ## ggplot2 Maps
 969 | 
 970 | Note the use of **geom_sf** which tells ggplot that spatial data objects are being mapped.
 971 | - this is a huge improvement!!
 972 | 
 973 | ```{r, include=F}
 974 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 
 975 |   geom_sf()
 976 | ```
 977 | 
 978 | ## Challenge 
 979 | 
 980 | Create a `map` of CA Population in 2010 by county
 981 | 
 982 | 
 983 | ## Challenge Solution
 984 | 
 985 | 2010 pop Data for California Counties
 986 | ```{r, eval=F}
 987 | 
 988 | # fetch it
 989 | cal_pop10 <- get_decennial(geography = "county",
 990 |                            variables = "P001001",
 991 |                            year = 2010,
 992 |                            state = "CA",
 993 |                            geometry = TRUE)
 994 | 
 995 | # map it (the challenge asks for a map, so the plot call is part of the solution)
 996 | plot(cal_pop10['value'])
 997 | ```
 998 | 
 999 | 
1000 | ## Fetch County data for more than one state
1001 | 
1002 | We can fetch both the census data and the **geometry** for more than one state!
1003 | 
1004 | - *this is so much easier than any alternative approach!*
1005 | ```{r}
1006 | # County-level 2010 population plus county boundaries for four western states
1007 | west_pop10 <- get_decennial(geography = "county",
1008 |                             variables = "P001001", year = 2010,
1009 |                             state = c("CA", "OR", "NV", "AZ"),
1010 |                             geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1011 | ```
1012 | 
1013 | ## Map it
1014 | 
1015 | These are just quick plots to make sure we got the right data!
1016 | ```{r}
1017 | plot(west_pop10['value'])
1018 | ```
1019 | 
1020 | ## Census Tract Data
1021 | 
1022 | Fetching the data for all `tracts` in one state.
1023 | 
1024 | - **but** you need to specify one or more counties.
1025 | ```{r}
1026 | # Fetch 2010 population and boundaries for the census tracts of Alameda County
1027 | alco_pop10 <- get_decennial(geography = "tract",
1028 |                             variables = "P001001",
1029 |                             year = 2010,
1030 |                             state = "CA",
1031 |                             county = "Alameda",
1032 |                             geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1033 | ```
1034 | 
1035 | ## Challenge
1036 | 
1037 | Fetch and map the 2010 population by census tract for Alameda and Contra Costa counties.
1038 | 
1039 | 
1040 | ## Challenge Solution
1041 | 
1042 | Fetch Tract population & geometry data for Alameda & Contra Costa Counties
1043 | 
1044 | ```{r}
1045 | 
1046 | # A vector of county names fetches the tracts of both counties in one call
1047 | alcc_pop10 <- get_decennial(geography = "tract",
1048 |                             variables = "P001001", year = 2010,
1049 |                             state = "CA",
1050 |                             county = c("Alameda", "Contra Costa"),
1051 |                             geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1052 | ```
1053 | 
1054 | ## Challenge Solution
1055 | 
1056 | Map it
1057 | ```{r}
1058 | plot(alcc_pop10['value'])
1059 | ```
1060 | 
1061 | 
1062 | ## More Complex Challenge (if time)
1063 | 
1064 | Fetch and map the percent of San Francisco properties by census tract that were coded as rented in the 2010 Census.
1065 | 
1066 | To start, identify the variables for the
1067 | 
1068 | - total number of housing units
1069 | 
1070 | - number of renter occupied units
1071 | 
1072 | ## Complex Challenge Solution
1073 | 
1074 | SF Rented Units, 2010 
1075 | ```{r, eval=F}
1076 | sf_rented <- get_decennial(geography = "tract",  # census tabulation unit
1077 |                            variables = "H004004",  # renter occupied units - the numerator
1078 |                            year = 2010,
1079 |                            summary_var = "H004001",  # total of table H004 - the denominator
1080 |                            state = "CA",
1081 |                            county = "San Francisco",
1082 |                            geometry = TRUE)
1083 | 
1084 | # Drop tracts with a zero denominator (avoids 0/0); tracts with no renters stay in as 0%
1085 | sf_pct_rented <- sf_rented %>% filter(summary_value > 0) %>%
1086 |                  mutate(pct = 100 * (value / summary_value))
1087 | plot(sf_pct_rented['pct'])
1088 | ```
1089 | 
1090 | # Questions?
1091 | 
1092 | # Part 3. ACS 5 year data
1093 | 
1094 | ## ACS Data with tidycensus
1095 | 
1096 | The tidycensus workflow for ACS data is similar to that used for decennial census data.
1097 | 
1098 | - But there are many more variables in the ACS.
1099 | 
1100 | Because the ACS contains **sample data**, each ACS variable of interest includes both an **estimate** of the value and a **margin of error**.
1101 | 
1102 | ## ACS 5 year
1103 | 
1104 | You can use the tidycensus **get_acs** function to retrieve data for the ACS 5 year products, beginning with the 2006 - 2010 dataset. 
1105 | 
1106 | The **default** end year for my version of `tidycensus` (as of April 9, 2020)  is **2018** for the 2014-2018 ACS 5 year dataset.
1107 | 
1108 |   
1109 | ## Fetch List of ACS 5 year Variables
1110 |   
1111 | Let's start by fetching ACS 5-year 2016 data on poverty (not all variables appear included in 2018 data yet). 
1112 | 
1113 | We want to explore the number of folks living below the poverty level by census tract.
1114 | 
1115 | First we need to find the variable name(s)!
1116 | 
1117 | ## Load ACS Table Vars
1118 | 
1119 | Load the ACS 2012-2016 5 year data variables into a dataframe.
1120 | 
1121 | - ACS 5-year datasets are referenced by `end year` in tidycensus!
1122 | 
1123 | Then take a look at the variable names, labels and concepts.
1124 | 
1125 | How many variables refer to the concept of poverty?
1126 | 
1127 | ```{r}
1128 | acs2016vars <- load_variables(year = 2016, dataset = "acs5", cache = TRUE)
1129 | if (interactive()) View(acs2016vars)  # the data viewer needs an interactive session; skipped when knitting
1130 | ```
1131 | 
1132 | ## ACS Tables and variables
1133 | 
1134 | Many thousands more than for decennial census!
1135 | 
1136 | See the documentation on the [census website](https://www.census.gov/programs-surveys/acs/guidance/which-data-tool/table-ids-explained.html)
1137 | 
1138 | Types of tables:
1139 | 
1140 | - `B` prefix = base tables
1141 | - `C` = collapsed tables
1142 | - `DP` = data profiles
1143 | - `S` = Subject tables
1144 | 
1145 | ## Census Reporter
1146 | 
1147 | ACS variables can  be confusing. 
1148 | 
1149 | The Census Reporter website (https://censusreporter.org) provides another tool for navigating topics, tables, and variable names.
1150 | 
1151 | Let's check it out to see what tables/variables we should use.
1152 | 
1153 | ## Filter the ACS Variables
1154 | 
1155 | In RStudio, view the dataframe **acs2016vars** and interactively filter the name column to display only the variables in the table **C17002** 
1156 | 
1157 | Take a look at the different variables in this table.
1158 | 
1159 | What variable(s) contain the estimate of the number of people living below poverty?
1160 | 
1161 | ## get_acs
1162 | 
1163 | Use the tidycensus `get_acs` function to fetch the poverty data for census tracts in San Francisco
1164 | ```{r, eval=F}
1165 | ?get_acs
1166 | ```
1167 | 
1168 | ## get_acs in action
1169 | 
1170 | Fetch the data in the table **C17002** that contain the counts of people living below 100% of the poverty line.
1171 | ```{r}
1172 | sf_poor <- get_acs(geography = "tract",
1173 |                    variables = c("C17002_002", "C17002_003"),  # poverty variables
1174 |                    year = 2016,  # end year of the 2012-2016 ACS 5-year dataset
1175 |                    state = "CA",
1176 |                    summary_var = "C17002_001",  # est. number of people - the denominator
1177 |                    county = "San Francisco",
1178 |                    geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1179 | ```
1180 | 
1181 | ## View output
1182 | 
1183 | Let's take a look at the output of `get_acs` and discuss how it differs from `get_decennial`.
1184 | 
1185 | ```{r, eval=F}
1186 | sf_poor
1187 | ```
1188 | 
1189 | ## Create Poverty Map, try 1
1190 | 
1191 | What are we mapping?
1192 | ```{r}
1193 | # What are we mapping?
1194 | plot(sf_poor['estimate'])
1195 | ```
1196 | 
1197 | ## Create Poverty Map, try 2
1198 | 
1199 | ```{r}
1200 | # Remove census tracts that have no people (a zero denominator for the later percent calculation)
1201 | sf_poor <- subset(sf_poor, summary_est > 0)
1202 | 
1203 | # What are we mapping? The tract-level estimates that remain after the filter
1204 | plot(sf_poor['estimate'])
1205 | ```
1206 | 
1207 | ## Calculating percents
1208 | 
1209 | Let's calculate the percent below poverty by tract.
1210 | 
1211 | ```{r}
1212 | sf_poor <- sf_poor %>%
1213 |   mutate(pct = 100 * (estimate / summary_est))
1214 | 
1215 | head(sf_poor, 3)
1216 | ```
1217 | 
1218 | 
1219 | ## Group by and sum
1220 | 
1221 | We want to group the data by the geometry and then sum the data values so that we have one value per geometry.
1222 | ```{r}
1223 | sf_poor_summed <- sf_poor %>%
1224 |   select(GEOID, estimate, pct, geometry) %>%
1225 |   group_by(GEOID) %>% # rows that share a GEOID are combined into one
1226 |   summarise(count_below_pov = sum(estimate), # adds the C17002_002 and C17002_003 estimates
1227 |             pct_below_pov = sum(pct)) # both percents use the same summary_est denominator
1228 | ```
1229 | 
1230 | ## Group by and sum
1231 | 
1232 | ```{r}
1233 | head(sf_poor_summed)
1234 | ```
1235 | 
1236 | ## Map Counts
1237 | 
1238 | Where are SF's poorest areas?
1239 | ```{r}
1240 | plot(sf_poor_summed['count_below_pov'])
1241 | 
1242 | ```
1243 | 
1244 | ## Map Percents
1245 | 
1246 | Where are SF's poorest areas?
1247 | ```{r}
1248 | plot(sf_poor_summed['pct_below_pov'])
1249 | 
1250 | ```
1251 | 
1252 | 
1253 | ## Challenge
1254 | 
1255 | The ACS **2013-2017** 5 year dataset was released Dec 6, 2018. 
1256 | 
1257 | Although my current version of `tidycensus` states that 2012-2016 is the latest ACS 5-year product, see if you can fetch & map the percent of people below poverty line in San Francisco using the **2013-2017** ACS 5-year data.
1258 | 
1259 | ## Challenge Solution
1260 | ```{r, eval=F}
1261 | sf_poor_2017 <- get_acs(geography = "tract",
1262 |                         variables = c("C17002_002", "C17002_003"),  # poverty variables
1263 |                         year = 2017,  # end year of the 2013-2017 ACS 5-year dataset
1264 |                         state = "CA",
1265 |                         summary_var = "C17002_001",  # est. number of people - the denominator
1266 |                         county = "San Francisco",
1267 |                         geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1268 | 
1269 | head(sf_poor_2017)
1270 | ```
1271 | 
1272 | 
1273 | ## Margins of Error (MOE)
1274 | 
1275 | We haven't talked about it but it may be important in your work with ACS data.
1276 | 
1277 | Math is needed to combine MOEs when you combine variables.
1278 | 
1279 | - tidycensus includes some nice [functions](https://walkerke.github.io/tidycensus/reference/index.html) for these calculations.
1280 | 
1281 | See this web page on how to handle [MOEs in tidycensus](https://walkerke.github.io/tidycensus/articles/margins-of-error.html)
1282 | 
1283 | # Questions?
1284 | 
1285 | 
1286 | # Maps with tmap - Demo
1287 | 
1288 | ## tmap
1289 | 
1290 | The `tmap` package is great for making both static and interactive maps. It turns R into a `GIS`.
1291 | 
1292 | Let's check it out with our last dataframe.
1293 | 
1294 | ## tmap
1295 | 
1296 | ```{r}
1297 | library(tmap)
1298 | tmap_mode("view") # set mode to interactive
1299 | # Store the map in an object; it is displayed later by printing poverty_map
1300 | poverty_map <- tm_shape(sf_poor_summed) +
1301 |                   tm_polygons(col="pct_below_pov", alpha=0.7)
1302 | ```
1303 | 
1304 | ## tmap
1305 | 
1306 | View the map - click on tracts
1307 | 
1308 | ```{r}
1309 | poverty_map
1310 | ```
1311 | 
1312 | ## tmap
1313 | 
1314 | There are a number of great tutorials online for working with `tmap`.
1315 | 
1316 | See the `References` at the end of this workshop document.
1317 | 
1318 | # Census Geographic Data Files
1319 | 
1320 | ## Census Geographic Data Files
1321 | 
1322 | **Cartographic Boundary** vs **Detailed TIGER/Line** data
1323 | 
1324 | By default, `tidycensus` downloads census **cartographic boundary** data.
1325 | 
1326 | - These are simplified geometries, clipped to coastlines.
1327 | 
1328 | In `get_acs` you can also request the more detailed census **TIGER/Line** data.
1329 | 
1330 | The cartographic boundary data is great for mapping but the detailed data is often better for analysis.
1331 | 
1332 | Let's check it out.
1333 | 
1334 | 
1335 | ## Fetch Cartographic Boundary Data
1336 | ```{r}
1337 | 
1338 | sf_poor_cb <- get_acs(
1339 |   geography = "tract",
1340 |   variables = c("C17002_002", "C17002_003"),  # poverty variables
1341 |   summary_var = "C17002_001",
1342 |   year = 2016,
1343 |   state = "CA", county = "San Francisco",
1344 |   geometry = TRUE,
1345 |   cb = TRUE)  # cartographic boundary geometry - THIS IS THE DEFAULT!
1346 | ```
1347 | 
1348 | ## Fetch Detailed TIGER/Line Geometry
1349 | ```{r}
1350 | 
1351 | sf_poor_tl <- get_acs(
1352 |   geography = "tract",
1353 |   variables = c("C17002_002", "C17002_003"),  # poverty variables
1354 |   summary_var = "C17002_001",
1355 |   year = 2016,
1356 |   state = "CA", county = "San Francisco",
1357 |   geometry = TRUE,
1358 |   cb = FALSE)  # request the detailed TIGER/Line geometry instead
1359 | ```
1360 | 
1361 | 
1362 | ## Visualize differences with Tmap
1363 | 
1364 | Zoom in to explore, especially around the coastline.
1365 | ```{r}
1366 | tm_shape(sf_poor_tl) + tm_borders() + # outlines of the TIGER/Line tracts
1367 | tm_shape(sf_poor_cb) + tm_borders(col="red") # cartographic boundary tracts drawn in red
1368 | 
1369 | ```
1370 | 
1371 | 
1372 | # Questions?
1373 | 
1374 | # Summary
1375 | 
1376 | ## Summary
1377 | 
1378 | - `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial**
1379 | 
1380 | - Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMHO way easier than any alternatives, **IF** you (1) know R, (2) know a bit about working with geographic data in R.
1381 | 
1382 | - This approach is also scalable if you want multiple census variables and geographies.
1383 | 
1384 | - If you just want to fetch the geographic data it may be easier to use the **tigris** package or download it directly from the census.
1385 | 
1386 | ## References
1387 | 
1388 | - [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r)
1389 | - [Geocomputation in R](https://geocompr.robinlovelace.net/)
1390 | - [Creating beautiful demographic maps with tidycensus and tmap packages](https://www.zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/)
1391 | 
1392 | ## Related D-Lab Workshops
1393 | 
1394 | - [R Fundamentals](https://github.com/dlab-berkeley/R-Fundamentals)
1395 | - [Geospatial Data in R, parts 1, 2, & 3](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-R-with-sf)
1396 | - [Web Maps in R with Leaflet](https://github.com/dlab-berkeley/Leaflet-Maps-in-R)
1397 | - [Geocoding & Mapping in R](https://github.com/dlab-berkeley/Geocoding-in-R)
1398 | 
1399 | # Extras for Enthusiasts
1400 | 
1401 | ## Scaling Up Example
1402 | 
1403 | In this example we show you how you can read in census variables of interest from a file into an R dataframe. You can then use that dataframe to fetch data for all those variables using `tidycensus`.
1404 | 
1405 | ```{r}
1406 | 
1407 | # Load the lookup table of census variables of interest (variable codes and readable names)
1408 | my_cenvar_df <- read.csv("data/cenvar_lookup.csv", strip.white = TRUE,
1409 |                          stringsAsFactors = FALSE)  # TRUE/FALSE spelled out: T and F can be reassigned
1410 | my_cenvar_df
1411 | ```
1412 | 
1413 | ## Fetch the ACS data
1414 | 
1415 | Fetch the ACS data for these variables for the 9 county bay area
1416 | 
1417 | ```{r}
1418 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")  # the 9 Bay Area counties
1419 | bay9_data <- get_acs(geography = "tract",
1420 |                      variables = my_cenvar_df$my_cen_vars,  # every variable code in the lookup table
1421 |                      year = 2016,
1422 |                      state = "CA",
1423 |                      county = nine_counties,
1424 |                      geometry = TRUE)  # TRUE, not T: `T` can be reassigned
1425 | 
1426 | bay9_data
1427 | ```
1428 | 
1429 | ## Reformat Output
1430 | 
1431 | 1. We only want to keep the estimate column for each variable of interest, plus the GEOID and geometry columns.
1432 | 
1433 | 2. We then want to make the data `wide` using the `spread` function. This will put each estimate variable in its own column.
1434 | ```{r}
1435 | bay9_data2 <- bay9_data %>%
1436 |   select("GEOID", "variable", "estimate") %>% # keep the tract id, the variable code and its estimate
1437 |   spread(key=variable, value=estimate) # long to wide: one column per variable code
1438 | ```
1439 | 
1440 | ## Take a look
1441 | ```{r}
1442 | bay9_data2
1443 | ```
1444 | 
1445 | ## Rename the columns
1446 | 
1447 | Use the dataframe of census variables to rename the columns so that they are self-describing.
1448 | ```{r}
1449 | # Rename by matching each variable code to its readable name, so the result does not depend on column order
1450 | bay9_data2 <- rename(bay9_data2, !!!setNames(my_cenvar_df$my_cen_vars, my_cenvar_df$my_cen_var_names))
1451 | ```
1452 | 
1453 | ## Take a look
1454 | ```{r}
1455 | 
1456 | bay9_data2
1457 | ```
1458 | 
1459 | 
1460 | ## Fetching data for multiple years
1461 | 
1462 | This requires the variable names to be the same across years!
1463 | ```{r, eval=FALSE}
1464 | # use purrr::map_df to get data for multiple years (must have same vars!)
1465 | pop90_10 <- map_df(c(1990, 2000, 2010), function(x) {
1466 |   # NOTE(review): confirm that "P001001" exists under this name in every year requested
1467 |   get_decennial(geography = "state", variables = c(totalpop = "P001001"),
1468 |                 sumfile = "sf1",  # get_decennial() calls this argument `sumfile`, not `dataset`
1469 |                 year = x) %>%
1470 |     mutate(year = x)  # tag each row with the census year it came from
1471 | })
1472 | 
1473 | # View output
1474 | head(pop90_10)
1475 | tail(pop90_10)
1476 | 
1477 | # Plot it: one bar per state and year, population in millions
1478 | pop90_10 %>% ggplot(aes(x = reorder(NAME, value), y = value / 1000000, fill = factor(year))) +
1479 |              geom_bar(stat = "identity", position = position_dodge()) + coord_flip()
1480 | 
1481 | ```
1482 | 
1483 | 
1484 | # Combining Census Data with Other Data
1485 | 
1486 | ## Area Weighted Interpolation
1487 | 
1488 | One of the strengths of the `sf` package is how relatively easy it is to reaggregate data from one geometry to another. This process is called areal interpolation.
1489 | 
1490 | Area weighted interpolation reaggregates the data based on the percent of area shared by input and output geometries.
1491 | 
1492 | ## Read in a Shapefile
1493 | ```{r, eval=F}
1494 | sfnhoods<- st_read("data/sfnhoods.shp")
1495 | head(sfnhoods)
1496 | plot(sfnhoods['nhood'])
1497 | ```
1498 | 
1499 | ##  Check the CRS
1500 | ```{r, eval=F}
1501 | st_crs(sfnhoods)
1502 | st_crs(sf_poor5) # NOTE(review): sf_poor5 is not created in the visible slides; presumably sf_poor_summed - confirm
1503 | ```
1504 | 
1505 | ## CRS transformation
1506 | ```{r, eval=F}
1507 | sf_poor5_4326 <- st_transform(sf_poor5, st_crs(sfnhoods))  # reproject the tract data to the neighborhoods' CRS
1508 | ```
1509 | 
1510 | ## Area Weighted Interpolation
1511 | 
1512 | Reaggregate percent of people below poverty from census tract to neighborhood polygons.
1513 | 
1514 | ```{r, eval=F}
1515 | sfhoods2 <- st_interpolate_aw(sf_poor5_4326[, "pct_below_pov"], sfnhoods,
1516 |                               extensive = FALSE)  # TRUE = area-weighted sum; FALSE = area-weighted average
1517 | ```
1518 | 
1519 | ## Map it
1520 | ```{r, eval=F}
1521 | par(mfrow=c(1,2))
1522 | plot(sf_poor5['pct_below_pov'])
1523 | plot(sfhoods2['pct_below_pov'])
1524 | par(mfrow=c(1,1))
1525 | ```
1526 | 
1527 | ## Map it with `tmap`
1528 | ```{r, eval=F}
1529 | tm_shape(sfhoods2) +
1530 |    tm_polygons(col="pct_below_pov")
1531 | ```
1532 | 
1533 | ## Combine the values
1534 | ```{r, eval=F}
1535 | head(sfhoods2)
1536 | sfnhoods$pct_below_pov <- sfhoods2$pct_below_pov
1537 | 
1538 | # map again - click on polygons and view data in popups
1539 | # to confirm the AWI output values
1540 | tm_shape(sfnhoods) +
1541 |   tm_polygons(col="pct_below_pov", 
1542 |     popup.vars = c("nhood", "pct_below_pov")
1543 |   )
1544 | ```
1545 | 
1546 | 


--------------------------------------------------------------------------------
/previous_versions/Rcensus_data_maps-tutorial.Rmd:
--------------------------------------------------------------------------------
   1 | ---
   2 | title: "Census Data Wrangling and Mapping in R"
   3 | author: "Patty Frontiera"
   4 | date: "03/21/2019"
   5 | output: 
   6 |   html_document:
   7 |       toc: true
   8 |       number_sections: true
   9 |       toc_float: true
  10 | ---
  11 | 
  12 | ```{r setup, include=FALSE}
  13 | knitr::opts_chunk$set(echo = TRUE)
  14 | 
  15 | ```
  16 | 
  17 | # Getting Started
  18 | 
  19 | ## Setup
  20 | 
  21 | Welcome! While we're waiting:
  22 | 
  23 | * **Clone or download** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop)
  24 |     - If you downloaded the zipfile, **unzip it**.
  25 |     - Make a note of the folder in which the files reside. 
  26 |     
  27 |     
  28 | * Open **RStudio**
  29 | 
  30 | * Open a new **R script** file
  31 | 
  32 | ## Introduction
  33 | 
  34 | - About me
  35 | 
  36 | - About you
  37 |     - Your familiarity with US Census data
  38 |     - with geospatial data
  39 |     - with geospatial data in R
  40 | 
  41 | ## Outline
  42 | 
  43 | - Describe primary Census data products
  44 | 
  45 | - Introduce R packages for working with Census Data
  46 | 
  47 | - Use those packages to fetch census data
  48 | 
  49 | - Use those packages to fetch census data plus census geographic boundary files
  50 | 
  51 | - Make maps of census data
  52 | 
  53 | # Census Data Overview
  54 | 
  55 | ## US Census Data
  56 | 
  57 | The "nation's leading provider of quality data about its people and economy."
  58 | 
  59 | <img src="data/census_page.png" width="700px"></img>
  60 | 
  61 | Available at [www.census.gov](https://www.census.gov)
  62 | 
  63 | ## Primary Census Products
  64 | 
  65 | - Decennial Census
  66 | 
  67 | - American Community Survey (ACS)
  68 | 
  69 | ##  Decennial Census
  70 | 
  71 | Complete count of the population every 10 years since `1790` 
  72 | 
  73 | Includes data on 
  74 | 
  75 | - population, by age & race/ethnicity 
  76 | 
  77 | - housing, by occupancy & tenure (owned, rented)
  78 | 
  79 | ## American Community Survey (ACS)
  80 | 
  81 | - Annual survey of a sample of about 3 million households
  82 | 
  83 | - Provides estimates of demographic, social, economic & housing characteristics
  84 | 
  85 | - Includes margin of error values for the estimates.
  86 | 
  87 |  
  88 | ## Decennial Census* vs ACS Data
  89 | | Demographic*    | Social             | Economic          | Housing           |
  90 | |-----------------|--------------------|-------------------|-------------------|
  91 | | Sex             | Families           | Income            | Tenure*           |
  92 | | Age             | Education          | Benefits          | Occupancy*        |
  93 | | Race            | Marital Status     | Employment Status | Structure Type    |
  94 | | Hispanic Origin | Fertility          | Occupation        | Housing Value     |
  95 | |                 | Grandparents       | Industry          | Taxes & Insurance |
  96 | |                 | Veterans           | Commuting         | Utilities         |
  97 | |                 | Disability Status  | Place of Work     | Mortgage          |
  98 | |                 | Language at Home   | Health Insurance  | Monthly Rent      |
  99 | |                 | Citizenship        |                   |                   |
 100 | |                 | Mobility           |                   |                   |
 101 | 
 102 | 
 103 | 
 104 | ## Census Geographies
 105 | 
 106 | Census data are publicly available at one or more levels of geographic aggregation.
 107 | 
 108 | <img src="data/census_geo_hierarchy.png" height="400px"></img>
 109 | 
 110 | ## Census Data & Census Geographies
 111 | 
 112 | <img src="data/census_data_by_prod_geo.png" width="800px"></img>
 113 | 
 114 | ## ACS 5 Year Dataset RECOMMENDED
 115 | 
 116 | ACS 1 year and 5 year products are currently available 
 117 | 
 118 | - ACS 3 year no longer available
 119 | 
 120 | ACS 5 year data provides much better estimates, with lower margins of error
 121 | 
 122 | More data available for ACS 5 Year product
 123 | 
 124 | 
 125 | ## Census Data Workflow
 126 | 
 127 | Identify your 
 128 | 
 129 | - topic of interest
 130 | - year(s)
 131 | - geographic level of detail
 132 | - for what locations?
 133 | 
 134 | Then determine what specific tables and variables
 135 | are available - ACS or Decennial?
 136 | 
 137 | ## CAUTION
 138 | 
 139 | "If you want to measure change you can't change the measures!"
 140 | 
 141 | **Census tables, variables, geographies, and geographic boundaries change over time!**
 142 | 
 143 | Measuring change over time with census data is *its own thing*, complex and not covered by this workshop!
 144 | 
 145 | # R Packages
 146 | 
 147 | ## Packages for Working with Census Data
 148 | 
 149 | These are the ones we recommend and will use today.
 150 | 
 151 | - [tidycensus](https://walkerke.github.io/tidycensus) & [tigris](https://github.com/walkerke/tigris)
 152 | 
 153 | - [tidyverse](https://www.tidyverse.org/)
 154 |    
 155 | - [sf](https://r-spatial.github.io/sf/)
 156 | 
 157 | 
 158 | # tidycensus & tigris
 159 | 
 160 | ## [tidycensus](https://walkerke.github.io/tidycensus)
 161 | 
 162 | Functions for accessing census decennial and ACS 5 year datasets via Census APIs
 163 | 
 164 | - only a subset of datasets / years available
 165 | - requires a `Census API key` 
 166 | 
 167 | ## [tidycensus](https://walkerke.github.io/tidycensus)
 168 | 
 169 | Limited set of years available via `tidycensus`
 170 | 
 171 | - decennial census: 1990, 2000, and 2010
 172 | - ACS 5 yr: 2006-2010 through 2012-2016 are available.
 173 | - Note: tidycensus refers to ACS 5-year datasets by the end year.
 174 | - 2013 - 2017 released [Dec 6, 2018](https://www.census.gov/programs-surveys/acs/news/data-releases/2017/release-schedule.html) by the Census. 
 175 |     - Need to check its availability in `tidycensus`.
 176 | 
 177 | ##  [tigris](https://github.com/walkerke/tigris)
 178 | 
 179 | Provides access to census geographic data files
 180 | 
 181 | - detailed TIGER/Line boundary files (e.g., shapefiles), or
 182 | - simplified Cartographic boundary files
 183 |     
 184 | Also provides access to census `feature data`,
 185 | 
 186 | - eg, rivers, roads, coastlands, landmarks, and more
 187 |     
 188 | 
 189 | Used by `tidycensus` to access state, county, tract, block group, block, and ZCTA boundaries.
 190 | 
 191 | - Use `tigris` directly to access other census geographic data.
 192 |     
 193 | ## tidycensus & tigris
 194 | 
 195 | Packages developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census websites and APIs in **R** and get that data in a useable format to analyze, plot, and map.
 196 | 
 197 | Check out his website to keep abreast of his great packages, blog posts and tutorials.
 198 | 
 199 | - http://personal.tcu.edu/kylewalker/
 200 | 
 201 | - https://walkerke.github.io/
 202 | 
 203 | Walker also developed a new [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r)
 204 | 
 205 | - Highly recommended! First chapter free!
 206 | 
 207 | 
 208 | ## [tidyverse](https://www.tidyverse.org/)
 209 | 
 210 | A collection of R Packages for data science
 211 | - developed primarily by [Hadley Wickham](http://hadley.nz/), Chief Scientist at [RStudio](https://www.rstudio.com/).
 212 | 
 213 |  - `dplyr` and `tidyr` for reshaping data
 214 | 
 215 |  - `ggplot2` for plotting
 216 | 
 217 |  - `purrr`, `readr` and `tibble` for improved performance
 218 | 
 219 | These packages are used by `tidyverse` under the hood.
 220 | 
 221 | ## [sf](https://r-spatial.github.io/sf/)
 222 | 
 223 | Simple features for geospatial data objects and methods.
 224 | 
 225 | - Next generation R package for working with vector geospatial data
 226 |     - will soon supersede the `sp` package
 227 |     
 228 | `sf` includes the functionality of the `sp`, `rgdal`, `rgeos` and `proj4` packages.
 229 | 
 230 | - but with improved performance, simplified command syntax and easier workflows.
 231 | 
 232 | ## Alternatives to Accessing Census Data in R
 233 | 
 234 | You can write code to access the [Census APIs](https://www.census.gov/data/developers/data-sets.html) directly.
 235 | 
 236 | You can download Census data directly from:
 237 | 
 238 | - [American Factfinder](https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml) or 
 239 | - [NHGIS.org](https://www.nhgis.org/)
 240 | - [Social Explorer](https://www.socialexplorer.com/)
 241 |     - Subscription service but FREE for UCB community
 242 | 
 243 | You can download Census `geographic data` directly on the [census website](https://www.census.gov/geo/maps-data/)
 244 | 
 245 | 
 246 | # Tutorial Time!
 247 | 
 248 | ## Part 1
 249 | 
 250 | We will work through several exercises using `tidycensus` to fetch, wrangle and map census data.
 251 | 
 252 | ## Loading packages
 253 | 
 254 | Load the packages we will use today
 255 | 
 256 | ```{r, message=FALSE, warning=FALSE}
 257 | library(tidycensus)
 258 | library(tidyverse) 
 259 | library(tigris)
 260 | library(sf)
 261 | ```
 262 | 
 263 | If you are getting errors try importing dplyr or reinstalling dplyr package as that has worked for some.
 264 | 
 265 | ## Install any packages that you do not have on your computer
 266 | 
 267 | Also install any dependencies.
 268 | 
 269 | ```{r, eval=FALSE}
 270 | # install.packages("tidyverse")
 271 | # install.packages("tidycensus")
 272 | # install.packages("sf")
 273 | ```
 274 | 
 275 | 
 276 | ## Census API Key
 277 | 
 278 | You need a census API key to programmatically fetch census data.
 279 | 
 280 | Get it here (pretty quick):
 281 | 
 282 | * (https://api.census.gov/data/key_signup.html)
 283 | 
 284 | For more info see:
 285 | 
 286 | * https://www.census.gov/data/developers/data-sets.html
 287 | 
 288 | ## Install your Census API Key
 289 | 
 290 | Use the tidycensus function **census_api_key** to make tidycensus use your key when it fetches data from the census.
 291 | 
 292 | ```{r, eval=F}
 293 | # Install your census api key - a long alphanumeric string, passed as a quoted character value
 294 | census_api_key("THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS")
 295 | ```
 296 | 
 297 | ## Set working directory
 298 | 
 299 | Be sure to **clone or download & unzip** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop)
 300 | 
 301 | * unzip if needed
 302 | 
 303 | Then, set your working directory to this folder, e.g.,
 304 | 
 305 | * `setwd("~/Documents/Dlab/workshops/2019/rCensus_workshop")`
 306 | 
 307 | <img src="./data/swd.png" width="600px"></img>
 308 | 
 309 | # Fetching Decennial Census Data
 310 | 
 311 | ## Population Data
 312 | 
 313 | Let's start by fetching **population data** from the 2010 Census **for all states**
 314 | 
 315 | In order to fetch census data you need to identify the census **variables** that contain the data of interest.
 316 | 
 317 | ## Topics, Tables & Variables
 318 | 
 319 | Census data **variables** are organized in **tables**
 320 | 
 321 | Which are organized by **topic** or concept.
 322 | 
 323 | The tidycensus **load_variables** function can help with this step.
 324 | 
 325 | First, take a look at the function documentation.
 326 | ```{r, eval=F}
 327 | ?load_variables
 328 | ```
 329 | 
 330 | ## load_variables
 331 | 
 332 | Use `load_variables` to fetch all variables used in the 2010 census into a dataframe.
 333 | ```{r}
 334 | vars2010 <- load_variables(year=2010,        # Year or end year for ACS
 335 |                            dataset = 'sf1',  # 'sf1' for decennial or 'acs5'
 336 |                            cache = TRUE)     # Whether to save fetched data locally
 337 | ```
 338 | 
 339 | ## Decennial Census Variables
 340 | 
 341 | Let's take a look at and discuss the resultant dataframe.
 342 | 
 343 | - How many 2010 census variables are in the dataframe?
 344 | ```{r, eval=F}
 345 | View(vars2010)
 346 | ```
 347 | 
 348 | ## 2010 Decennial Census Tables
 349 | 
 350 | - Variables: 3,346
 351 | 
 352 | - Topics: Population, housing
 353 | 
 354 | - Tables: currently `333` - *that's a lot*!
 355 |     - 177 population tables (identified with a “P”) available to the block level 
 356 |     - 58 housing tables (identified with an “H”) available to the block level
 357 |     - 82 population tables (identified with a “PCT”) available to the census tract level
 358 |     - 4 housing tables (identified with an “HCT”) available to the census tract level
 359 |     - 10 population tables (identified with a “PCO”) available to the county level 
 360 |     - plus 2 additional PCT tables
 361 | 
 362 | ## What Variable has the 2010 Total Population value? 
 363 | 
 364 | We can sort and filter the vars2010 dataframe to find it.
 365 | 
 366 | <img src="./data/census2010_vars.png" height="500px"></img>
 367 | 
 368 | ## get_decennial
 369 | 
 370 | We can use the tidycensus function **get_decennial** to fetch the 2010 census data for total population by state.
 371 | 
 372 | First, check the documentation for the function.
 373 | ```{r, eval=F}
 374 | ?get_decennial
 375 | ```
 376 | 
 377 | ## get_decennial
 378 | 
 379 | Fetch total population by state (**P001001**) from the 2010 census using `get_decennial`.
 380 | 
 381 | ```{r}
 382 | 
 383 | pop2010 <- get_decennial(geography = "state",   # census tabulation unit
 384 |                          variables = "P001001", # variable(s) of interest
 385 |                          year = 2010)           # census year
 386 |           
 387 | ```
 388 | 
 389 | ## View the Data
 390 | 
 391 | - How many rows and columns? 
 392 | 
 393 | - Do you see the expected number of states?
 394 | 
 395 | - What column contains the population counts?
 396 | 
 397 | - Do the data values seem to be right?
 398 | ```{r}
 399 | #pop2010
 400 | ```
 401 | 
 402 | ## Visualize results
 403 | 
 404 | We can visualize the data to get a quick overview of the distribution of data values.
 405 | 
 406 | It's a first step in exploratory data analysis and a last step in data communication.
 407 | 
 408 | `ggplot2` is the most commonly used R package for data visualization. 
 409 | 
 410 | - It is loaded when you load the `tidyverse` package.
 411 | 
 412 | Let's use it to visualize the population data.
 413 | 
 414 | ## Plot 2010 Population by state
 415 | 
 416 | Use `ggplot2` to create an ordered horizontal bar chart.
 417 | ```{r}
 418 | pop_plot <- ggplot(data = pop2010,                          # 2010 population rows fetched above
 419 |                    mapping = aes(x = reorder(NAME, value),  # order the names by population
 420 |                                  y = value / 1000000)) +    # counts expressed in millions
 421 |   geom_col() +                                              # bar length taken directly from y
 422 |   coord_flip() + theme_minimal() +                          # horizontal bars, minimal theme
 423 |   labs(title = "2010 US Population by State", x = "State", y = "in millions")
 424 | ```
 425 | 
 426 | ## Display the plot
 427 | 
 428 | ```{r, echo=F}
 429 | pop_plot
 430 | ```
 431 | 
 432 | ## Challenge
 433 | 
 434 | Fetch population data by state for 2000.
 435 | 
 436 | *Don't assume variable names are the same across years.* Check first!
 437 | 
 438 | ## Challenge Solution
 439 | 
 440 | Total Population in 2000
 441 | 
 442 | ```{r, eval = F, code_folding = "hide"}
 443 | # What is the variable name in 2000? Load the 2000 SF1 variable list (cached locally)
 444 | vars2000 <- load_variables(year = 2000, dataset = "sf1", cache = TRUE)
 445 | 
 446 | # Take a look and search in the dataframe
 447 | View(vars2000)
 448 | 
 449 | # Fetch the 2000 pop data
 450 | pop2000 <- get_decennial(geography = "state", variables = "P001001", year = 2000)
 451 | 
 452 | # Take a look (plot if time)
 453 | pop2000
 454 | ```
 455 | 
 456 | ## Limiting by Area of Interest
 457 | 
 458 | In the previous example we retrieved population data for all states.
 459 | 
 460 | - This is the default behavior if you don't specify a subset.
 461 | 
 462 | - But you can limit the data to be retrieved by subunits like state.
 463 | 
 464 | ## Limit Areas of Interest
 465 | 
 466 | Let's fetch data for just 3 states.
 467 | 
 468 | ```{r}
 469 | state_pop2010 <- get_decennial(geography = "state", # census tabulation unit
 470 |                          variables = "P001001",     # variables of interest
 471 |                          year = 2010,               # census year
 472 |                          state=c("CA","OR","WA"))   # Filter by states of interest
 473 | 
 474 | ```
 475 | 
 476 | *Note we are referencing states by their abbreviations.*
 477 | 
 478 | ## View Results
 479 | ```{r}
 480 | state_pop2010
 481 | ```
 482 | 
 483 | ## Changing Census Tabulation unit
 484 | 
 485 | `get_decennial` accepts a number of different values for **tabulation unit**.
 486 | 
 487 | - Options include: `state`, `county`, `tract`, `block group`, `block`, and `ZCTA`.
 488 | 
 489 | Let's change the tabulation unit from `state` to `county`.
 490 | ```{r}
 491 | co_pop2010 <- get_decennial(geography = "county",   # census tabulation unit
 492 |                             variables = "P001001",  # variables of interest
 493 |                             year = 2010)
 494 | ```
 495 | 
 496 | ## Changing Census Tabulation unit
 497 | 
 498 | View the county data to see what was retrieved.
 499 | ```{r}
 500 | co_pop2010
 501 | ```
 502 | 
 503 | ## Challenge 
 504 | 
 505 | * Fetch population by **county** for just California
 506 | 
 507 | * Fetch population by **county** for Oregon & California
 508 | 
 509 | *Try it before you look ahead at solutions.*
 510 | 
 511 | ## Challenge Solution
 512 | ```{r}
 513 | ## Fetch population by **county** for just California
 514 | co_pop2010_ca <- get_decennial(geography = "county",   # census tabulation unit
 515 |                             variables = "P001001",  # variables of interest
 516 |                             year = 2010,
 517 |                             state=c('CA'))
 518 | #co_pop2010_ca
 519 | 
 520 | ## Fetch population by **county** for Oregon & California
 521 | co_pop2010_caor <- get_decennial(geography = "county",   # census tabulation unit
 522 |                                variables = "P001001",  # variables of interest
 523 |                                year = 2010,
 524 |                                state=c('CA','OR'))
 525 | co_pop2010_caor
 526 | 
 527 | ```
 528 | 
 529 | ## Challenge
 530 | 
 531 | * Fetch population by **tract** for all states.
 532 | 
 533 | * Fetch population by **tract** for California.
 534 | 
 535 | ## Challenge Solution
 536 | ```{r, eval=F}
 537 | ## Fetch population by **tract** for California.
 538 | cal_pop2010_tracts <- get_decennial(geography = "tract",   # census tabulation unit
 539 |                                  variables = "P001001",  # variables of interest
 540 |                                  year = 2010,
 541 |                                  state=c('CA'))
 542 | cal_pop2010_tracts
 543 | 
 544 | 
 545 | ## Fetch population by **tract** for all states.
 546 | pop2010_tracts <- get_decennial(geography = "tract",   # census tabulation unit
 547 |                                     variables = "P001001",  # variables of interest
 548 |                                     year = 2010)
 549 | 
 550 | pop2010_tracts  ## DOES THIS WORK?
 551 | ```
 552 | 
 553 | ## Fetching Census Tract Data
 554 | 
 555 | If you want census data at the tract level or below you **must** specify the state & county or counties.
 556 | ```{r,}
 557 | tract_pop2010 <- get_decennial(geography = "tract",   # census tabulation unit
 558 |                          variables = "P001001",       # variable of interest
 559 |                          year = 2010,                 # census year
 560 |                          state="CA",                  # limit to state of California
 561 |                          county=c("Alameda","Contra Costa"))  # and only these counties
 562 | ```
 563 | 
 564 | ## Fetching Census Tract Data
 565 | 
 566 | View the results! How many census tracts are in these 2 counties?
 567 | 
 568 | ```{r}
 569 | tract_pop2010
 570 | ```
 571 | 
 572 | ## Challenge
 573 | 
 574 | 1. Fetch population by **county** for Alameda County, California
 575 | 
 576 | 2. Fetch population by **tract** for the nine county Bay Area:
 577 | - Alameda, San Francisco, Contra Costa, Marin, Napa, 
 578 | - San Mateo, Santa Clara, Solano, Sonoma
 579 | 
 580 | Note: You can use names, abbreviations or FIPS codes for your `state` and `county`.
 581 | 
 582 | ```{r}
 583 | # County FIPS codes for the nine Bay Area counties, in this order:
 584 | # Alameda, San Francisco, Contra Costa, Marin, Napa, 
 585 | # San Mateo, Santa Clara, Solano, Sonoma
 586 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 587 | ```
 588 | 
 589 | ## Challenge Solution
 590 | 
 591 | ```{r}
 592 | #  population by **county** for Alameda County, California
 593 | alco_pop2010 <- get_decennial(geography = "county",   # census tabulation unit
 594 |                                  variables = "P001001",  # variables of interest
 595 |                                  year = 2010,
 596 |                                  state=c('CA'),
 597 |                                  county=c('Alameda County'))
 598 | #alco_pop2010
 599 | 
 600 | ```
 601 | 
 602 | ## Challenge Solution
 603 | 
 604 | Fetch population by **tract** for the nine county Bay Area
 605 | ```{r}
 606 | # County FIPS codes for the nine Bay Area counties, in this order:
 607 | # Alameda, San Francisco, Contra Costa, Marin, Napa, 
 608 | # San Mateo, Santa Clara, Solano, Sonoma
 609 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 610 | 
 611 | bayarea_pop2010_tract <- get_decennial(geography = "tract",   # census tabulation unit
 612 |                          variables = "P001001",       # variable of interest
 613 |                          year = 2010,                 # census year
 614 |                          state="CA",                  # limit to state of California
 615 |                          county=nine_counties)  # and only these counties
 616 | #bayarea_pop2010_tract
 617 | ```
 618 | 
 619 | 
 620 | ## RECAP & QUESTIONS
 621 | 
 622 | Fetch population by **tract** for the nine county Bay Area
 623 | ```{r, eval=F}
 624 | # County FIPS codes for the nine Bay Area counties, in this order:
 625 | # Alameda, San Francisco, Contra Costa, Marin, Napa, 
 626 | # San Mateo, Santa Clara, Solano, Sonoma
 627 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
 628 | 
 629 | bayarea_pop2010 <- get_decennial(geography = "tract",   # census tabulation unit
 630 |                       variables = "P001001",            # variable of interest
 631 |                       year = 2010,                      # census year
 632 |                       state="CA",                       # limit to state of California
 633 |                      county=nine_counties)             # and only these counties
 634 | 
 635 | # View the data
 636 | bayarea_pop2010
 637 | ```
 638 | 
 639 | 
 640 | 
 641 | ## Fetching data for more than one census variable
 642 | 
 643 | What **three** things are new here?
 644 | ```{r}
 645 | #urban rural pop for 3 counties
 646 | ur_pop10 <- get_decennial(geography = "county",  # census tabulation unit
 647 |                            variables = c(urban="P002002",rural="P002005"),
 648 |                            year = 2010, 
 649 |                            summary_var = "P002001",  # The denominator
 650 |                            state='CA',
 651 |                            county=c("Napa","Sonoma","Mendocino"))
 652 | 
 653 | ```
 654 | 
 655 | ## Fetching data for more than one census variable
 656 | 
 657 | What `three` things are new here?
 658 | 
 659 | 1. You can specify more than one variable:
 660 | ```
 661 | variables = c("P002002","P002005")
 662 | ```
 663 | 
 664 | 2. You can name the output columns.
 665 | ```
 666 | variables = c(urban="P002002",rural="P002005")
 667 | ```
 668 | 
 669 | 3. You can identify a `summary_var`. 
 670 | ```
 671 | summary_var = "P002001"
 672 | ```
 673 | 
 674 | This value is the denominator - the total count of all people or households surveyed. The values in this column can be used as a denominator for other calculations like percent of total. 
 675 | 
 676 | ## Take a look at the results
 677 | ```{r}
 678 | ur_pop10
 679 | ```
 680 | 
 681 | ## Calculating Percents
 682 | 
 683 | The `summary_value` column comes in handy when you want to compute percent of total.
 684 | 
 685 | Here's one way to do it.
 686 | ```{r}
 687 | # Calculate the percent of population that is Urban or Rural
 688 | ur_pop10 <- ur_pop10 %>%
 689 |             mutate(pct = 100 * (value / summary_value))
 690 | 
 691 | ```
 692 | 
 693 | ## Calculating Percents
 694 | 
 695 | Let's take a look at the output
 696 | ```{r}
 697 | ur_pop10 # Take a look
 698 | ```
 699 | 
 700 | ## Plot it
 701 | 
 702 | Plots give us compact visual summaries of the data
 703 | ```{r}
 704 | myplot <- ggplot(data = ur_pop10, # diverging bar chart of urban vs rural percent per NAME
 705 |           mapping = aes(x = NAME, fill = variable, # fill colour distinguishes the two variables
 706 |                      y = ifelse(test = variable == "urban", 
 707 |                                 yes = -pct, no = pct))) + # urban pct is negated so it plots on the other side of zero
 708 |           geom_bar(stat = "identity") + # bar length is the y value itself, not a count
 709 |           scale_y_continuous(labels = abs, limits=c(-100,100)) + # abs() hides the minus sign on the axis labels
 710 |           labs(title="Urban & Rural Population in Wine Country", 
 711 |                x="County", y = " Percent of Population", fill="") +
 712 |           coord_flip() # flip so NAME runs along the vertical axis
 713 | ```
 714 | *Don't worry if you don't get all the ggplot code now. It's here for reference.*
 715 | 
 716 | ## Plot it
 717 | ```{r}
 718 | myplot
 719 | ```
 720 | 
 721 | ## Fetch all the data in one table
 722 | 
 723 | This is often helpful **but** you need to keep track of the meaning of each variable.
 724 | ```{r}
 725 | alco_pop10 <- get_decennial(geography = "tract", # Census tabulation unit
 726 |                            table =  "P002",      # Table of urban & rural population counts
 727 |                            year = 2010,          # Decennial census year
 728 |                            state='CA',           # Filter state
 729 |                            county="Alameda")     # Filter county
 730 | 
 731 | ```
 732 | 
 733 | ## Take a look
 734 | ```{r}
 735 | unique(alco_pop10$variable) # What and how many unique vars in table?
 736 | 
 737 | head(alco_pop10,3)  # Take a look at output
 738 | ```
 739 | 
 740 | 
 741 | ## Output options
 742 | 
 743 | Let's try all three of these commands and then look at the output to see what's different.
 744 | 
 745 | ```{r, eval=F}
 746 | get_decennial(geography = "state", variables = "P001001", year = 2010)
 747 | 
 748 | get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010)
 749 | 
 750 | get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010, 
 751 |               output="wide")  # wide output: the value column is named pop10, matching the 2010 data
 752 | ```
 753 | 
 754 | ## Output options
 755 | 
 756 | ```{r}
 757 | head(get_decennial(geography = "state", variables = "P001001", year = 2010),2)
 758 | head(get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010),2)
 759 | head(get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010, output="wide"), 2)  # wide: column named pop10
 760 | ```
 761 | 
 762 | 
 763 | ## Data Wrangling
 764 | 
 765 | Your R skills can help you reformat the data and make it more useable.
 766 | 
 767 | Let's fetch population data for 2010 & 2000 by state with **output=wide**.
 768 | 
 769 | - We will label the variables **pop00** and **pop10**.
 770 | 
 771 | Then we will combine these into one data frame.
 772 | 
 773 | ## Data Wrangling
 774 | 
 775 | Fetch pop by state from both the 2000 and 2010 census
 776 | ```{r}
 777 | pop2000 <- get_decennial(geography = "state", variables = c(pop00="P001001"), 
 778 |                          year = 2000, output="wide")
 779 | 
 780 | pop2010 <- get_decennial(geography = "state", variables = c(pop10="P001001"), 
 781 |                          year = 2010, output="wide")
 782 | 
 783 | ```
 784 | 
 785 | ## Merge population by state from both censuses
 786 | 
 787 | Save in a new dataframe with both columns
 788 | ```{r}
 789 | pop2000_2010 <- pop2000 %>% merge(pop2010, by="NAME") %>%
 790 |                              select(NAME, pop00, pop10)
 791 | 
 792 | head(pop2000_2010,3)
 793 | ```
 794 | 
 795 | ## Save the data
 796 | 
 797 | Use `write.csv` to save a data frame to a `CSV` file.
 798 | 
 799 | ```{r, eval=F}
 800 | write.csv(pop2000_2010, file="pop2000_2010.csv", row.names = FALSE)
 801 | ```
 802 | 
 803 | # QUESTIONS?
 804 | 
 805 | 
 806 | # Part 2. Mapping
 807 | 
 808 | 
 809 | ## Mapping Census Data with `tidycensus`
 810 | 
 811 | You can fetch geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions
 812 | 
 813 | - Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs.
 814 | 
 815 | - Only a subset of data available via `tigris` can be accessed via `tidycensus`.
 816 | 
 817 | You can then use common mapping functions like `plot`, `ggplot` and `tmap` to make maps.
 818 | 
 819 | ## Geometry Options
 820 | 
 821 | Before fetching geometry, we need to specify a few `tigris` options
 822 | 
 823 | - Set the `class` of returned data to be `sf` objects (not `sp`, the default)
 824 | 
 825 | - Set `tigris_use_cache` to TRUE
 826 | 
 827 | ```{r}
 828 | # Tigris options - used by tidycensus
 829 | options(tigris_class = "sf")      # SP is the default format returned by tigris
 830 | options(tigris_use_cache = TRUE)  # Save retrieved data locally
 831 | 
 832 | ```
 833 | 
 834 | Caching the data is important because it speeds things up if you often fetch census data for the same geographies over and over again.
 835 | 
 836 | ## tigris cache directory
 837 | 
 838 | You may want to use the geographic data downloaded by tigris in other applications.
 839 | 
 840 | To do this, you need to know where the files are saved locally.
 841 | 
 842 | You can also specify where tigris should save cached data.
 843 | ```{r, eval=F}
 844 | # Check the location of the tigris cached data
 845 | Sys.getenv('TIGRIS_CACHE_DIR') 
 846 | 
 847 | # Set it
 848 | tigris_cache_dir("~/Documents/gis_data/census")  # Folder for local data
 849 | 
 850 | # Check it again
 851 | Sys.getenv('TIGRIS_CACHE_DIR') 
 852 | ```
 853 | 
 854 | ## Fetch geographic boundary data with tidycensus
 855 | 
 856 | We fetch the geospatial data by setting **geometry=TRUE**.
 857 | 
 858 | ```{r}
 859 | pop2010geo <- get_decennial(geography = "state", 
 860 |                           variables = c(pop10="P001001"), 
 861 |                           year = 2010, 
 862 |                           output="wide", 
 863 |                           geometry=TRUE) # Fetch geometry with the data for mapping
 864 |  
 865 | ```
 866 | 
 867 | ## Take a look
 868 | 
 869 | Let's take a minute to discuss the format of an `sf` spatial object.
 870 | ```{r}
 871 | pop2010geo
 872 | ```
 873 | 
 874 | 
 875 | ## Geospatial Data in R
 876 | 
 877 | R `sf` objects include
 878 | 
 879 | - a dataframe with a geometry column named `geometry`
 880 | 
 881 |     - The geometry can be of type POINT, LINESTRING, POLYGON
 882 |     - or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON
 883 | 
 884 | - a `CRS` (coordinate reference system), specified by
 885 |     - epsg(SRID) code
 886 |     - proj4string
 887 |     
 888 | ## Census Data Coordinate Reference System (CRS)
 889 | 
 890 | All census geographic data use the `NAD83` CRS, or coordinate reference system.
 891 | 
 892 | `NAD83` stands for North American Datum of 1983. The geographic coordinates are longitude and latitude values encoded as decimal degrees.
 893 | 
 894 | `WGS84`, or [The World Geodetic System of 1984](https://en.wikipedia.org/wiki/World_Geodetic_System) is the most commonly used geographic CRS. The difference between points encoded in these two systems can vary, on average, up to 1 meter in the continental US.
 895 | 
 896 | Many geospatial operations require you transform data to a common CRS before conducting spatial analysis or mapping.  
 897 | 
 898 | As an in depth discussion of CRSs is outside the scope of this workshop, see [Geocomputation in R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information.
 899 | 
 900 | ## Mapping sf Spatial Objects
 901 | 
 902 | We can use `plot` to make a quick map of the geometry stored in an `sf` spatial object.
 903 | 
 904 | ```{r}
 905 | plot(pop2010geo$geometry)
 906 | ```
 907 | 
 908 | ## Question
 909 | 
 910 | What do you get if you plot the `sf` object without specifying `$geometry`?
 911 | 
 912 | 
 913 | ## The Challenge of US maps
 914 | 
 915 | The vast geographic extent and non-contiguous nature of the USA makes it difficult to map.
 916 | 
 917 | ```{r, echo=F}
 918 | plot(pop2010geo$geometry) #view again
 919 | ```
 920 | 
 921 | ## Fetch geographic data with tidycensus, SHIFTED
 922 | 
 923 | tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas.
 924 | ```{r}
 925 | 
 926 | pop2010geo_shifted <- get_decennial(geography = "state", 
 927 |                                     variables = c(pop10="P001001"), 
 928 |                                     output="wide",
 929 |                                     year = 2010, 
 930 |                                     geometry=TRUE, 
 931 |                                     shift_geo=TRUE)
 932 | 
 933 | ```
 934 | 
 935 | ## Shift Happens!
 936 | ```{r}
 937 | plot(pop2010geo_shifted$geometry)
 938 | ```
 939 | 
 940 | ## Save it
 941 | 
 942 | You can save `sf` data to a shapefile using `st_write`
 943 | 
 944 | ```{r, eval=F}
 945 | st_write(pop2010geo_shifted,"usa_2010_shifted.shp")
 946 | ```
 947 | 
 948 | ## Check your TIGRIS_CACHE_DIR to see it
 949 | 
 950 | ```{r, eval=F}
 951 | my_cache_dir <- Sys.getenv('TIGRIS_CACHE_DIR') 
 952 | 
 953 | dir(my_cache_dir) # What files stored there?
 954 | ```
 955 | 
 956 | ## Mapping Data Values
 957 | 
 958 | ```{r}
 959 | plot(pop2010geo_shifted['pop10'])
 960 | ```
 961 | 
 962 | ## ggplot2 Maps
 963 | 
 964 | ```{r}
 965 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 
 966 |   geom_sf()
 967 | ```
 968 | 
 969 | ## ggplot2 Maps
 970 | 
 971 | Note the use of **geom_sf** which tells ggplot that spatial data objects are being mapped.
 972 | - this is a huge improvement!!
 973 | 
 974 | ```{r}
 975 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 
 976 |   geom_sf()
 977 | ```
 978 | 
 979 | ## Challenge 
 980 | 
 981 | Create a `map` of CA Population in 2010 by county
 982 | 
 983 | 
 984 | ## Challenge Solution
 985 | 
 986 | 2010 pop Data for California Counties
 987 | ```{r, eval=F}
 988 | 
 989 | #fetch it
 990 | cal_pop10 <- get_decennial(geography = "county", 
 991 |                            variables = "P001001",
 992 |                            year = 2010, 
 993 |                            state='CA',
 994 |                            geometry=TRUE)
 995 | 
 996 | # map it
 997 | #plot(cal_pop10['value'])
 998 | ```
 999 | 
1000 | 
1001 | ## Fetch County data for more than one state
1002 | 
1003 | We can fetch both the census data and the **geometry** for more than one state!
1004 | 
1005 | - *this is so much easier than any alternative approach!*
1006 | ```{r}
1007 | west_pop10 <- get_decennial(geography = "county",              # census tabulation unit
1008 |                            variables = "P001001",               # variable of interest
1009 |                            year = 2010,                         # census year
1010 |                            state = c("CA", "OR", "NV", "AZ"),   # several states in one call
1011 |                            geometry = TRUE)                     # also fetch the county boundaries
1012 | ```
1013 | 
1014 | ## Map it
1015 | 
1016 | These are just quick plots to make sure we got the right data!
1017 | ```{r}
1018 | plot(west_pop10['value'])
1019 | ```
1020 | 
1021 | ## Census Tract Data
1022 | 
1023 | Fetching the data for all `tracts` in one state.
1024 | 
1025 | - **but** you need to specify one or more counties.
1026 | ```{r}
1027 | # Fetch tract data (values plus geometry) for Alameda County, CA
1028 | alco_pop10 <- get_decennial(geography = "tract",    # census tabulation unit
1029 |                            variables = "P001001",    # variable of interest
1030 |                            year = 2010,              # census year
1031 |                            state = "CA",             # filter state
1032 |                            county = "Alameda",       # filter county
1033 |                            geometry = TRUE)          # also fetch the tract boundaries
1034 | ```
1035 | 
1036 | ## Challenge
1037 | 
1038 | Fetch and map the 2010 population by census tract for Alameda and Contra Costa counties.
1039 | 
1040 | 
1041 | ## Challenge Solution
1042 | 
1043 | Fetch Tract population & geometry data for Alameda & Contra Costa Counties
1044 | 
1045 | ```{r}
1046 | 
1047 | alcc_pop10 <- get_decennial(geography = "tract",                  # census tabulation unit
1048 |                       variables = "P001001",                       # variable of interest
1049 |                       year = 2010,                                 # census year
1050 |                       state = "CA",                                # filter state
1051 |                       county = c("Alameda", "Contra Costa"),       # filter counties
1052 |                       geometry = TRUE)                             # also fetch the tract boundaries
1053 | ```
1054 | 
1055 | ## Challenge Solution
1056 | 
1057 | Map it
1058 | ```{r}
1059 | plot(alcc_pop10['value'])
1060 | ```
1061 | 
1062 | 
1063 | ## More Complex Challenge (if time)
1064 | 
1065 | Fetch and map the percent of San Francisco properties by census tract that were coded as rented in the 2010 Census.
1066 | 
1067 | To start, identify the variables for the
1068 | 
1069 | - total number of housing units 
1070 | 
1071 | - number of renter occupied units
1072 | 
1073 | ## Complex Challenge Solution
1074 | 
1075 | SF Rented Units, 2010 
1076 | ```{r, eval=F}
1077 | sf_rented <- get_decennial(geography = "tract",  # census tabulation unit
1078 |                            variables = "H004004",    # renter occupied units
1079 |                            year = 2010,              # census year
1080 |                            summary_var = "H004001",  # table H004 total (occupied housing units) - the denominator
1081 |                            state = "CA",
1082 |                            county = "San Francisco",
1083 |                            geometry = TRUE)          # also fetch the tract boundaries
1084 | # Drop tracts whose denominator is zero (no division by 0); tracts with 0 renters stay in as 0%
1085 | sf_pct_rented <- sf_rented[sf_rented$summary_value > 0,] %>%
1086 |                  mutate(pct = 100 * (value / summary_value))
1087 | 
1088 | plot(sf_pct_rented['pct'])
1089 | ```
1090 | 
1091 | # Questions?
1092 | 
1093 | # Part 3. ACS 5 year data
1094 | 
1095 | ## ACS Data with tidycensus
1096 | 
1097 | The tidycensus workflow for ACS data is similar to that used for decennial census data.
1098 | 
1099 | - But there are many more variables in the ACS.
1100 | 
1101 | Because the ACS contains **sample data**, each ACS variable of interest includes both an **estimate** of the value and a **margin of error**.
1102 | 
1103 | ## ACS 5 year
1104 | 
1105 | You can use the tidycensus **get_acs** function to retrieve data for the ACS 5 year products, beginning with the 2006 - 2010 dataset. 
1106 | 
1107 | The **default** end year for my version of `tidycensus` (as of Dec 4, 2018)  is **2016** for the 2012-2016 ACS 5 year dataset.
1108 | 
1109 |   
1110 | ## Fetch List of ACS 5 year Variables
1111 |   
1112 | Let's start by fetching ACS 5-year 2016 data on poverty. 
1113 | 
1114 | We want to explore the number of folks living below the poverty level by census tract.
1115 | 
1116 | First we need to find the variable name(s)!
1117 | 
1118 | ## Load ACS Table Vars
1119 | 
1120 | Load the ACS 2012-2016 5 year data variables into a dataframe.
1121 | 
1122 | - ACS 5 year datasets are referenced by `end year` in tidycensus!
1123 | 
1124 | Then take a look at the variable names, labels and concepts.
1125 | 
1126 | How many variables refer to the concept of poverty?
1127 | 
1128 | ```{r}
1129 | acs2016vars <- load_variables(year = 2016, dataset = "acs5", cache = TRUE)  # ACS 5-year variable list, end year 2016
1130 | #View(acs2016vars)
1131 | ```
1132 | 
1133 | ## ACS Tables and variables
1134 | 
1135 | Many hundreds (thousands?) more than for decennial census!
1136 | 
1137 | See the documentation on the [census website](https://www.census.gov/programs-surveys/acs/guidance/which-data-tool/table-ids-explained.html)
1138 | 
1139 | Types of tables:
1140 | 
1141 | - `B` prefix = base tables
1142 | - `C` = collapsed tables
1143 | - `DP` = data profiles
1144 | - `S` = Subject tables
1145 | 
1146 | ## Census Reporter
1147 | 
1148 | ACS variables can  be confusing. 
1149 | 
1150 | The Census Reporter website (https://censusreporter.org) provides another tool for navigating topics, tables, and variable names.
1151 | 
1152 | Let's check it out to see what tables/variables we should use.
1153 | 
1154 | ## Filter the ACS Variables
1155 | 
1156 | In RStudio, view the dataframe **acs2016vars** and interactively filter the name column to display only the variables in the table **C17002** 
1157 | 
1158 | Take a look at the different variables in this table.
1159 | 
1160 | What variable(s) contain the estimate of the number of people living below poverty?
1161 | 
1162 | ## get_acs
1163 | 
1164 | Use the tidycensus `get_acs` function to fetch the poverty data for census tracts in San Francisco
1165 | ```{r, eval=F}
1166 | ?get_acs
1167 | ```
1168 | 
1169 | ## get_acs in action
1170 | 
1171 | Fetch the data in the table **C17002** that contain the counts of people living below 100% of the poverty line.
1172 | ```{r}
1173 | sf_poor <- get_acs(geography = "tract",                         # census tabulation unit
1174 |                    variables = c("C17002_002", "C17002_003"),   # poverty variables
1175 |                    year = 2016,                                 # end year of the ACS 5-year dataset
1176 |                    state = "CA",
1177 |                    summary_var = "C17002_001",                  # Est of num people - denom
1178 |                    county = "San Francisco",
1179 |                    geometry = TRUE)                             # also fetch the tract boundaries
1180 | ```
1181 | 
1182 | ## View output
1183 | 
1184 | Let's take a look at the output of `get_acs` and discuss how it differs from `get_decennial`.
1185 | 
1186 | ```{r, eval=F}
1187 | sf_poor
1188 | ```
1189 | 
1190 | ## Create Poverty Map, try 1
1191 | 
1192 | What are we mapping?
1193 | ```{r}
1194 | # What are we mapping?
1195 | plot(sf_poor['estimate'])
1196 | ```
1197 | 
1198 | ## Create Poverty Map, try 2
1199 | 
1200 | ```{r}
1201 | # Drop tracts whose population estimate (the denominator) is zero or missing
1202 | sf_poor <- sf_poor[which(sf_poor$summary_est > 0), ]
1203 | 
1204 | # Plot the estimate column again, now without the empty tracts
1205 | plot(sf_poor["estimate"])
1206 | ```
1207 | 
1208 | ## Calculating percents
1209 | 
1210 | Let's calculate the percent below poverty by tract.
1211 | 
1212 | ```{r}
1213 | # Percent of the tract total (summary_est) that each poverty variable represents
1214 | sf_poor <- mutate(sf_poor, pct = estimate / summary_est * 100)
1215 | 
1216 | head(sf_poor, 3)
1217 | ```
1218 | 
1219 | 
1220 | ## Group by and sum
1221 | 
1222 | We want to group the data by tract (`GEOID`) and then sum the data values so that we have one row per tract geometry.
1223 | ```{r}
1224 | # One row per tract: add up the two poverty variables within each GEOID
1225 | sf_poor_summed <- sf_poor %>%
1226 |   select(GEOID, estimate, pct, geometry) %>%
1227 |   group_by(GEOID) %>%
1228 |   summarise(count_below_pov = sum(estimate), pct_below_pov = sum(pct))
1229 | ```
1230 | 
1231 | ## Group by and sum
1232 | 
1233 | ```{r}
1234 | head(sf_poor_summed)
1235 | ```
1236 | 
1237 | ## Map Counts
1238 | 
1239 | Where are SF's poorest areas?
1240 | ```{r}
1241 | plot(sf_poor_summed['count_below_pov'])
1242 | 
1243 | ```
1244 | 
1245 | ## Map Percents
1246 | 
1247 | Where are SF's poorest areas?
1248 | ```{r}
1249 | plot(sf_poor_summed['pct_below_pov'])
1250 | 
1251 | ```
1252 | 
1253 | 
1254 | ## Challenge
1255 | 
1256 | The ACS **2013-2017** 5 year dataset was released Dec 6, 2018. 
1257 | 
1258 | Although my current version of `tidycensus` states that 2012-2016 is the latest ACS 5-year product, see if you can fetch & map the percent of people below poverty line in San Francisco using the **2013-2017** ACS 5-year data.
1259 | 
1260 | ## Challenge Solution
1261 | ```{r, eval=F}
1262 | # Poverty counts for San Francisco tracts from the 2013-2017 ACS (year = 2017)
1263 | sf_poor_2017 <- get_acs(geography = "tract",
1264 |                    variables = c("C17002_002", "C17002_003"), # poverty variables
1265 |                    summary_var = "C17002_001", # Est of num people - denom
1266 |                    year = 2017,
1267 |                    state = "CA", county = "San Francisco",
1268 |                    geometry = TRUE) # spelled out: T is an ordinary, reassignable variable
1269 | 
1270 | head(sf_poor_2017)
1271 | ```
1272 | 
1273 | 
1274 | ## Margins of Error (MOE)
1275 | 
1276 | We haven't talked about margins of error, but they may be important in your work with ACS data.
1277 | 
1278 | Math is needed to combine MOEs when you combine variables.
1279 | 
1280 | - tidycensus includes some nice [functions](https://walkerke.github.io/tidycensus/reference/index.html) for these calculations.
1281 | 
1282 | See this web page on how to handle [MOEs in tidycensus](https://walkerke.github.io/tidycensus/articles/margins-of-error.html)
1283 | 
1284 | # Questions?
1285 | 
1286 | 
1287 | # Maps with tmap - Demo
1288 | 
1289 | ## tmap
1290 | 
1291 | The `tmap` package is great for making both static and interactive maps. It turns R into a `GIS`.
1292 | 
1293 | Let's check it out with our last dataframe.
1294 | 
1295 | ## tmap
1296 | 
1297 | ```{r}
1298 | library(tmap)
1299 | # "view" mode produces interactive maps
1300 | tmap_mode("view")
1301 | # Store the map in an object; printing that object is what draws it
1302 | poverty_map <- tm_shape(sf_poor_summed) + tm_polygons(col = "pct_below_pov")
1303 | ```
1304 | 
1305 | ## tmap
1306 | 
1307 | View the map - click on tracts
1308 | 
1309 | ```{r}
1310 | poverty_map
1311 | ```
1312 | 
1313 | ## tmap
1314 | 
1315 | There are a number of great tutorials online for working with `tmap`.
1316 | 
1317 | See the `References` at the end of this workshop document.
1318 | 
1319 | # Census Geographic Data Files
1320 | 
1321 | ## Census Geographic Data Files
1322 | 
1323 | **Cartographic Boundary** vs **Detailed TIGER/Line** data
1324 | 
1325 | By default, `tidycensus` downloads census **cartographic boundary** data.
1326 | 
1327 | - These are simplified geometries, clipped to coastlines.
1328 | 
1329 | In `get_acs` you can also request the more detailed census **TIGER/Line** data.
1330 | 
1331 | The cartographic boundary data is great for mapping but the detailed data is often better for analysis.
1332 | 
1333 | Let's check it out.
1334 | 
1335 | 
1336 | ## Fetch Cartographic Boundary Data
1337 | ```{r}
1338 | # Simplified cartographic boundary geometry is requested with cb = TRUE
1339 | sf_poor_cb <- get_acs(
1340 |   geography = "tract",
1341 |   variables = c("C17002_002", "C17002_003"), # poverty variables
1342 |   summary_var = "C17002_001",
1343 |   year = 2016,
1344 |   state = "CA",
1345 |   county = "San Francisco",
1346 |   geometry = TRUE, cb = TRUE) # THIS IS THE DEFAULT!
1347 | ```
1348 | 
1349 | ## Fetch Detailed TIGER/Line Geometry
1350 | ```{r}
1351 | # Detailed TIGER/Line geometry is requested with cb = FALSE
1352 | sf_poor_tl <- get_acs(
1353 |   geography = "tract",
1354 |   variables = c("C17002_002", "C17002_003"), # poverty variables
1355 |   summary_var = "C17002_001",
1356 |   year = 2016,
1357 |   state = "CA",
1358 |   county = "San Francisco",
1359 |   geometry = TRUE, cb = FALSE) # Fetching the TIGER/Line data
1360 | ```
1361 | 
1362 | 
1363 | ## Visualize differences with Tmap
1364 | 
1365 | Zoom in to explore, especially around the coastline.
1366 | ```{r}
1367 | # TIGER/Line outlines in the default colour, cartographic boundaries in red
1368 | tm_shape(sf_poor_tl) + tm_borders() +
1369 |   tm_shape(sf_poor_cb) + tm_borders(col = "red")
1370 | ```
1371 | 
1372 | 
1373 | # Questions?
1374 | 
1375 | # Summary
1376 | 
1377 | ## Summary
1378 | 
1379 | - `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial**
1380 | 
1381 | - Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMHO way easier than any alternatives, **IF** you (1) know R, and (2) know a bit about working with geographic data in R.
1382 | 
1383 | - This approach is also scalable if you want multiple census variables and geographies.
1384 | 
1385 | - If you just want to fetch the geographic data, it may be easier to use the **tigris** package or download it directly from the census.
1386 | 
1387 | 
1388 | ## References
1389 | 
1390 | - [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r)
1391 | - [Geocomputation in R](https://geocompr.robinlovelace.net/)
1392 | - [Creating beautiful demographic maps with tidycensus and tmap packages](https://www.zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/)
1393 | 
1394 | ## Related D-Lab Workshops
1395 | 
1396 | - R Fundamentals
1397 | - Geospatial Data in R, parts 1, 2, & 3
1398 | - Web Maps in R with Leaflet
1399 | - Geocoding & Mapping in R
1400 | 
1401 | # Extras for Enthusiasts
1402 | 
1403 | ## Scaling Up Example
1404 | 
1405 | In this example we show you how you can read in census variables of interest from a file into an R dataframe. You can then use that dataframe to fetch data for all those variables using `tidycensus`.
1406 | 
1407 | ```{r}
1408 | # Lookup table of the census variables of interest; the columns used
1409 | # later are my_cen_vars (variable codes) and my_cen_var_names (labels)
1410 | my_cenvar_df <- read.csv("data/cenvar_lookup.csv", strip.white = TRUE, stringsAsFactors = FALSE)
1411 | 
1412 | my_cenvar_df
1413 | ```
1414 | 
1415 | ## Fetch the ACS data
1416 | 
1417 | Fetch the ACS data for these variables for the 9 county bay area
1418 | 
1419 | ```{r}
1420 | # County FIPS codes for the nine-county Bay Area, all in California
1421 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097")
1422 | bay9_data <- get_acs(geography = "tract",
1423 |                      variables = my_cenvar_df$my_cen_vars,
1424 |                      year = 2016,
1425 |                      state = "CA", county = nine_counties,
1426 |                      geometry = TRUE) # spelled out: T is reassignable
1427 | 
1428 | bay9_data
1429 | ```
1430 | 
1431 | ## Reformat Output
1432 | 
1433 | 1. We only want to keep the estimate column for each variable of interest, plus the GEOID and geometry columns.
1434 | 
1435 | 2. We then want to make the data `wide` using the `spread` function. This will put each estimate variable in its own column.
1436 | ```{r}
1437 | # Keep GEOID, variable and estimate, then reshape from long to wide
1438 | bay9_data2 <- bay9_data %>% select(GEOID, variable, estimate) %>%
1439 |   spread(key = variable, value = estimate)
1440 | ```
1441 | 
1442 | ## Take a look
1443 | ```{r}
1444 | bay9_data2
1445 | ```
1446 | 
1447 | ## Rename the columns
1448 | 
1449 | Use the dataframe of census variables to rename the columns so that they are self-describing.
1450 | ```{r}
1451 | # Rename by matching variable codes, not by position: spread() orders the new columns by code, which need not be the lookup file's row order
1452 | colnames(bay9_data2)[match(my_cenvar_df$my_cen_vars, colnames(bay9_data2))] <- my_cenvar_df$my_cen_var_names
1453 | ```
1454 | 
1455 | ## Take a look
1456 | ```{r}
1457 | 
1458 | bay9_data2
1459 | ```
1460 | 
1461 | 
1462 | ## Fetching data for multiple years
1463 | 
1464 | This requires the variable names to be the same across years!
1465 | ```{r, eval=FALSE}
1466 | # Use purrr::map_df to stack one get_decennial() result per year (must have same vars!)
1467 | # 1990 is left out: tidycensus no longer serves the 1990 decennial endpoint
1468 | pop00_10 <- map_df(c(2000, 2010), function(x) {
1469 |   get_decennial(geography = "state", variables = c(totalpop = "P001001"),
1470 |                 sumfile = "sf1", # get_decennial()'s argument is `sumfile`, not `dataset`
1471 |                 year = x) %>%
1472 |     mutate(year = x) # tag each row with its census year
1473 | })
1474 | 
1475 | # View output
1476 | head(pop00_10)
1477 | tail(pop00_10)
1478 | 
1479 | # Plot it: one bar per state and year, population in millions
1480 | pop00_10 %>% ggplot(aes(x = reorder(NAME, value), y = value / 1000000, fill = factor(year))) +
1481 |              geom_bar(stat = "identity", position = position_dodge()) + coord_flip()
1482 | 
1483 | ```
1484 | 
1485 | 
1486 | # Combining Census Data with Other Data
1487 | 
1488 | ## Area Weighted Interpolation
1489 | 
1490 | One of the strengths of the `sf` package is how relatively easy it is to reaggregate data from one geometry to another. This process is called areal interpolation.
1491 | 
1492 | Area weighted interpolation reaggregates the data based on the percent of area shared by input and output geometries.
1493 | 
1494 | ## Read in a Shapefile
1495 | ```{r, eval=F}
1496 | sfnhoods<- st_read("data/sfnhoods.shp") # NOTE(review): sfnhoods.shp is not in the repository's data/ listing; confirm the file exists
1497 | head(sfnhoods)
1498 | plot(sfnhoods['nhood']) # map the polygons using the nhood column
1499 | ```
1500 | 
1501 | ##  Check the CRS
1502 | ```{r, eval=F}
1503 | st_crs(sfnhoods) # CRS of the neighborhood polygons
1504 | st_crs(sf_poor5) # NOTE(review): sf_poor5 is not created in the visible part of this document; presumably sf_poor_summed — confirm
1505 | ```
1506 | 
1507 | ## CRS transformation
1508 | ```{r, eval=F}
1509 | sf_poor5_4326 <- st_transform(x = sf_poor5, crs = st_crs(sfnhoods)) # reproject the tracts to the neighborhoods' CRS
1510 | ```
1511 | 
1512 | ## Area Weighted Interpolation
1513 | 
1514 | Reaggregate percent of people below poverty from census tract to neighborhood polygons.
1515 | 
1516 | ```{r, eval=F}
1517 | # extensive = FALSE gives an area-weighted average; TRUE would give an area-weighted sum
1518 | sfhoods2 <- st_interpolate_aw(sf_poor5_4326[, "pct_below_pov"], sfnhoods, extensive = FALSE)
1519 | ```
1520 | 
1521 | ## Map it
1522 | ```{r, eval=F}
1523 | par(mfrow=c(1,2)) # request a 1 x 2 grid of base plots
1524 | plot(sf_poor5['pct_below_pov']) # NOTE(review): plot.sf lays out its own legend, which may override mfrow — confirm the maps appear side by side
1525 | plot(sfhoods2['pct_below_pov'])
1526 | par(mfrow=c(1,1)) # back to a single plot per device
1527 | ```
1528 | 
1529 | ## Map it with `tmap`
1530 | ```{r, eval=F}
1531 | tm_shape(sfhoods2) +
1532 |    tm_polygons(col="pct_below_pov")
1533 | ```
1534 | 
1535 | ## Combine the values
1536 | ```{r, eval=F}
1537 | head(sfhoods2)
1538 | sfnhoods$pct_below_pov <- sfhoods2$pct_below_pov
1539 | # NOTE(review): the assignment above is positional; it assumes sfhoods2 has one row per sfnhoods polygon, in the same order — confirm
1540 | # map again - click on polygons and view data in popups
1541 | # to confirm the AWI output values
1542 | tm_shape(sfnhoods) +
1543 |   tm_polygons(col="pct_below_pov", 
1544 |     popup.vars = c("nhood", "pct_below_pov")
1545 |   )
1546 | ```
1547 | 
1548 | 


--------------------------------------------------------------------------------
/previous_versions/snippets_to_save_for_later.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "snippets_to_save_for_later"
 3 | output: html_document
 4 | date: '2022-04-01'
 5 | ---
 6 | 
 7 | ```{r setup, include=FALSE}
 8 | knitr::opts_chunk$set(echo = TRUE)
 9 | ```
10 | 
11 | ### [Leaflet](https://rstudio.github.io/leaflet/):
12 | 
13 | Leaflet is the `ggplot2` of interactive mapping. Leaflet in R follows a tidyverse convention, using pipes (%>%) to create layers in the mapping object. We can use leaflet to create interactive maps allowing for more flexibility in design and features we can create. With added complexity in the code, of course! 
14 |   
15 |   ```{r}
16 | 
17 | # Continuous color palette whose domain is the estimate column
18 | pal <- colorNumeric(
19 |   palette = "YlOrRd",
20 |   domain = med_hhincome$estimate
21 | )
22 | 
23 | # specify dataset - NOTE(review): med_hhincome is not created in this file; confirm where it comes from
24 | leaflet(med_hhincome) %>% 
25 |   addProviderTiles(providers$CartoDB.Positron) %>% 
26 |   # polygon styling: no stroke, colors taken from the palette
27 |   addPolygons(stroke = FALSE, smoothFactor = 0.2, fillOpacity = .5,
28 |               color = ~pal(estimate)) %>% 
29 |   # add a legend that uses the same palette and values
30 |   addLegend(pal = pal, values = ~estimate,
31 |             title = "Median Household Income",
32 |             labFormat = labelFormat(prefix = "$"),
33 |             position = "bottomleft")
34 | 
35 | 
36 | ```
37 | 
38 | ---
39 |   ## Appendix
40 |   
41 |   ### A More Complex Query
42 |   
43 |   Let's use the 2010 census data to map the percent of San Francisco (SF) properties that were rented.
44 | 
45 | To start, identify the variables for the
46 | 
47 | - Total number of housing units 
48 | 
49 | - Number of renter occupied units
50 | 
51 | ### Complete the query
52 | ```{r, eval=F}
53 | 
54 | sf_rented <- get_decennial(geography =  ,  # census tabulation unit
55 |                            variables =   , # number of households rented
56 |                            year =  , 
57 |                            summary_var = ,  # Total households
58 |                            state=,
59 |                            county=,
60 |                            geometry=)
61 | ```
62 | 
63 | > And here it is: SF Percent Rented Units, 2010
64 | 
65 | ```{r, eval=F}
66 | # Rented households per tract (value), with total households as the denominator (summary_value)
67 | sf_rented <- get_decennial(geography = "tract",  # census tabulation unit
68 |                            variables = "H004004", # number of households rented
69 |                            summary_var = "H004001",  # Total households
70 |                            year = 2010,
71 |                            state = "CA", county = "San Francisco",
72 |                            geometry = TRUE) # spelled out: T is reassignable
73 | 
74 | # take a look at the output
75 | head(sf_rented)
76 | ```
77 | 
78 | 
79 | ### Calculate Percent Rented
80 | 
81 | ```{r, eval=F}
82 | # Filter on the denominator: this avoids dividing by zero while keeping
83 | # tracts that have households but no renters (they are a genuine 0%)
84 | sf_pct_rented <- sf_rented[sf_rented$summary_value > 0, ] %>%
85 |                  mutate(pct = 100 * (value / summary_value))
86 | # Take a look
87 | head(sf_pct_rented)
88 | ```
89 | 
90 | ### Map the result
91 | ```{r, eval=F}
92 | plot(sf_pct_rented['pct'])
93 | ```
94 | 
95 | 


--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | r-4.0-2020-10-10
2 | 
3 | 


--------------------------------------------------------------------------------