├── .DS_Store ├── .Rhistory ├── .gitignore ├── Census-Data-in-R.Rproj ├── LICENSE ├── Lessons ├── Census-Data-in-R-Challenges.Rmd ├── Census-Data-in-R-Challenges.html ├── Census-Data-in-R-Slides.Rmd ├── Census-Data-in-R-Slides.html ├── Census-Data-in-R.Rmd └── Census-Data-in-R.html ├── README.md ├── Solutions ├── Census-Data-in-R-Solutions.Rmd └── Census-Data-in-R-Solutions.html ├── data ├── .DS_Store ├── .Rapp.history ├── census2010_vars.png ├── census_data_by_prod_geo.png ├── census_geo_hierarchy.png ├── census_geodata.png ├── census_page.png ├── cenvar_lookup.csv ├── mapview_example.png ├── request_api_key.png ├── swd.png └── tidycensus_articles.png ├── data_out └── readme_data_out.txt ├── install.R ├── previous_versions ├── Rcensus_data_maps-slides.Rmd ├── Rcensus_data_maps-slides.html ├── Rcensus_data_maps-tutorial.Rmd ├── Rcensus_data_maps-tutorial.html └── snippets_to_save_for_later.Rmd └── runtime.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/.DS_Store -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | year = 2010, 2 | geometry=TRUE, 3 | shift_geo=TRUE) 4 | ## Shift Happens! 
5 | plot(pop2010geo_shifted$geometry) 6 | st_write(pop2010geo_shifted, here("data_out/usa_pop2010_shifted.shp")) 7 | # Check to see if the data was written out to a shapefile 8 | dir(here("data_out")) 9 | ca_med_age <- get_decennial(geography = "county", 10 | variables = "P013001", 11 | year = 2010, 12 | state='CA', 13 | geometry=TRUE) 14 | # map it with plot 15 | plot(ca_med_age['value']) 16 | # map it with ggplot - setting CRS to 3310 17 | ca_med_age %>% 18 | ggplot(aes(fill = value)) + 19 | geom_sf(color=NA) + 20 | coord_sf(crs = 3310) + 21 | scale_fill_viridis_c(option = "viridis") 22 | vars_acs2019 <- load_variables(year=2019, # end year 2016-2020 period 23 | dataset = 'acs5', # the ACS data product 24 | cache = T) # Save locally for future access 25 | # how many variables? 26 | dim(vars_acs2019) 27 | vars_acs2019 <- load_variables(year=2020, # end year 2016-2020 period 28 | dataset = 'acs5', # the ACS data product 29 | cache = T) # Save locally for future access 30 | # how many variables? 
31 | dim(vars_acs2019) 32 | alco_mhhincome2 <- get_acs(geography='tract', 33 | variables=c(median_hhincome = "B19013_001"), 34 | year = 2015, 35 | state='CA', 36 | county='Alameda', 37 | geometry=TRUE 38 | ) 39 | plot(alco_mhhincome['estimate']) 40 | plot(alco_mhhincome2['estimate']) 41 | plot(alco_mhhincome['estimate']) 42 | plot(alco_mhhincome2['estimate']) 43 | ``` 44 | plot(alco_mhhincome['estimate']) 45 | plot(alco_mhhincome2['estimate']) 46 | ``` 47 | alco_mhhincome2 <- get_acs(geography='tract', 48 | variables=c(median_hhincome = "B19013_001"), 49 | year = 2015, 50 | state='CA', 51 | county='Alameda', 52 | geometry=TRUE 53 | ) 54 | plot(alco_mhhincome2['estimate']) 55 | ``` 56 | vars_acs2015 <- load_variables(year=2015, # end year 2016-2020 period 57 | dataset = 'acs5', # the ACS data product 58 | cache = T) # Save locally for future access 59 | View(vars_acs2015) 60 | alco_mhhincome2 <- get_acs(geography='tract', 61 | variables=c(median_hhincome = "B19019_001"), 62 | year = 2015, 63 | state='CA', 64 | county='Alameda', 65 | geometry=TRUE 66 | ) 67 | ``` 68 | alco_mhhincome2 <- get_acs(geography='tract', 69 | variables=c(median_hhincome = "B19019_001"), 70 | year = 2015, 71 | state='CA', 72 | county='Alameda', 73 | geometry=TRUE 74 | ) 75 | ``` 76 | B19013_001 77 | alco_mhhincome2 <- get_acs(geography='tract', 78 | variables=c(median_hhincome = "B19013_001"), 79 | year = 2015, 80 | state='CA', 81 | county='Alameda', 82 | geometry=TRUE 83 | ) 84 | alco_mhhincome2 <- get_acs(geography='tract', 85 | variables=c(median_hhincome = "B19019_001"), 86 | year = 2015, 87 | state='CA', 88 | county='Alameda', 89 | geometry=TRUE 90 | ) 91 | plot(alco_mhhincome['estimate']) 92 | alco_mhhincome <- get_acs(geography='tract', 93 | variables=c(median_hhincome = "B19013_001"), 94 | year = 2020, 95 | state='CA', 96 | county='Alameda', 97 | geometry=TRUE 98 | ) 99 | plot(alco_mhhincome['estimate']) 100 | plot(alco_mhhincome2['estimate']) 101 | ``` 102 | 
plot(alco_mhhincome['estimate']) 103 | plot(alco_mhhincome2['estimate']) 104 | ``` 105 | plot(alco_mhhincome['estimate']) 106 | plot(alco_mhhincome2['estimate']) 107 | ``` 108 | plot(alco_mhhincome['estimate']) 109 | plot(alco_mhhincome2['estimate']) 110 | ``` 111 | alco_mhhincome2015 <- get_acs(geography='tract', 112 | variables=c(median_hhincome = "B19019_001"), 113 | year = 2015, 114 | state='CA', 115 | county='Alameda', 116 | geometry=TRUE 117 | ) 118 | plot(alco_mhhincome2015['estimate']) 119 | head(alco_mhhincome) 120 | plot(alco_mhhincome['estimate']) 121 | alco_mhhincome2015 <- get_acs(geography='tract', 122 | variables=c(median_hhincome = "B19019_001"), 123 | year = 2015, 124 | state='CA', 125 | county='Alameda', 126 | geometry=TRUE 127 | ) 128 | plot(alco_mhhincome2015['estimate']) 129 | alco_mhhincome <- get_acs(geography='tract', 130 | variables=c(median_hhincome = "B19013_001"), 131 | year = 2020, 132 | state='CA', 133 | county='San Francisco', 134 | geometry=TRUE 135 | ) 136 | head(alco_mhhincome) 137 | plot(alco_mhhincome['estimate']) 138 | mapview(alco_mhhincome['estimate']) 139 | alco_mhhincome <- get_acs(geography='tract', 140 | variables=c(median_hhincome = "B19013_001", pop20='P001001'), 141 | year = 2020, 142 | state='CA', 143 | county='San Francisco', 144 | geometry=TRUE 145 | ) 146 | mapview(alco_mhhincome['estimate']) 147 | View(vars_acs2019) 148 | alco_mhhincome <- get_acs(geography='tract', 149 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 150 | year = 2020, 151 | state='CA', 152 | county='San Francisco', 153 | geometry=TRUE 154 | ) 155 | mapview(alco_mhhincome['estimate']) 156 | mapview(alco_mhhincome) 157 | head(alco_mhhincome) 158 | alco_mhhincome <- get_acs(geography='tract', 159 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 160 | year = 2020, 161 | state='CA', 162 | county='San Francisco', 163 | geometry=TRUE, 164 | wide=T 165 | ) 166 | head(alco_mhhincome) 167 | alco_mhhincome <- 
get_acs(geography='tract', 168 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 169 | year = 2020, 170 | state='CA', 171 | county='San Francisco', 172 | geometry=TRUE, 173 | output="wide" 174 | ) 175 | head(alco_mhhincome) 176 | mapview(alco_mhhincome) 177 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE']) 178 | mapview(alco_mhhincome[alco_mhhincome['median_hhincomeE'].isna(),]) 179 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 180 | What is the variable? 181 | ```{r} 182 | mapview(alco_mhhincome) 183 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 184 | What is the variable? 185 | ```{r} 186 | mapview(alco_mhhincome) 187 | mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 188 | What is the variable? 189 | ```{r} 190 | mapview(alco_mhhincome) 191 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 192 | What is the variable? 193 | ```{r} 194 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = NA) 195 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 196 | What is the variable? 
197 | ```{r} 198 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = black) 199 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 200 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = black) 201 | mapview(alco_mhhincome) + mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") 202 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome) 203 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, col.regions="white") 204 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20", col.regions="white") 205 | alco_mhhincome <- get_acs(geography='tract', 206 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 207 | year = 2020, 208 | state='CA', 209 | county='San Francisco', 210 | geometry=TRUE, 211 | output="wide" 212 | ) 213 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20", col.regions="white") 214 | What is the variable? 
215 | ```{r} 216 | #mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20") 217 | #mapview(alco_mhhincome[is.na(alco_mhhincome['median_hhincomeE']),]) 218 | mapview(alco_mhhincome, zcol="pop20") 219 | head(alco_mhhincome) 220 | mapview(alco_mhhincome, zcol="pop20E") 221 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "black") + mapview(alco_mhhincome, zcol="pop20E", col.regions="white") 222 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow") + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue") 223 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow") + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5) 224 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow",alpha.regions) + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5) 225 | mapview(alco_mhhincome['median_hhincomeE'],color = "cyan", col.regions = "yellow",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="blue", alpha.regions=0.5) 226 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "yellow",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="lightblue", alpha.regions=0.5) 227 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.5) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.5) 228 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.75) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.5) 229 | mapview(alco_mhhincome['median_hhincomeE'],col.regions = "white",alpha.regions=0.75) + mapview(alco_mhhincome, zcol="pop20E", col.regions="black", alpha.regions=0.25) 230 | alco_mhhincome <- get_acs(geography='tract', 231 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 232 | year = 
2020, 233 | state='CA', 234 | county='San Francisco', 235 | geometry=TRUE, # get the geography too 236 | output="wide" 237 | ) 238 | head(alco_mhhincome) 239 | alco_mhhincome <- get_acs(geography='tract', 240 | variables=c(median_hhincome = "B19013_001", pop20='B01003_001'), 241 | year = 2020, 242 | state='CA', 243 | county='San Francisco', 244 | geometry=TRUE # get the geography too 245 | ) 246 | head(alco_mhhincome) 247 | alco_mhhincome <- get_acs(geography='tract', 248 | variables="B19013_001", 249 | year = 2020, 250 | state='CA', 251 | county='San Francisco', 252 | geometry=TRUE # get the geography too 253 | ) 254 | head(alco_mhhincome) 255 | alco_mhhincome <- get_acs(geography='tract', 256 | variables="B19013_001", 257 | year = 2020, 258 | state='CA', 259 | county='San Francisco', 260 | geometry=TRUE, # get the geography too 261 | output="wide" 262 | ) 263 | head(alco_mhhincome) 264 | ```{r} 265 | plot(alco_mhhincome['estimate']) 266 | alco_mhhincome <- get_acs(geography='tract', 267 | variables="B19013_001", 268 | year = 2020, 269 | state='CA', 270 | county='San Francisco', 271 | geometry=TRUE, # get the geography too 272 | output="wide" 273 | ) 274 | ```{r} 275 | plot(alco_mhhincome['estimate']) 276 | alco_mhhincome <- get_acs(geography='tract', 277 | variables="B19013_001", 278 | year = 2020, 279 | state='CA', 280 | county='San Francisco', 281 | geometry=TRUE # get the geography too 282 | ) 283 | ```{r} 284 | head(alco_mhhincome) 285 | ``` 286 | ```{r} 287 | plot(alco_mhhincome['estimate']) 288 | alco_mhhincome['estimate'] %>% select(estimate != NA) %>% plot() 289 | ``` 290 | alco_mhhincome['estimate'] %>% select(estimate != NA) %>% plot(estimate) 291 | ``` 292 | alco_mhhincome['estimate'] %>% filter(na.rm(estimate)) %>% plot(estimate) 293 | ``` 294 | alco_mhhincome %>% filter(estimate, na.rm= TRUE %>% plot(estimate) 295 | ``` 296 | alco_mhhincome %>% filter(estimate, na.rm= TRUE) %>% plot(estimate) 297 | ``` 298 | alco_mhhincome %>% filter(estimate, na.rm= 
TRUE) #%>% plot(estimate) 299 | ``` 300 | alco_mhhincome %>% filter(! is.na(estimate)) #%>% plot(estimate) 301 | ``` 302 | alco_mhhincome %>% filter(! is.na(estimate)) #%>% plot(estimate) 303 | ``` 304 | alco_mhhincome %>% filter(! is.na(estimate)) %>% plot(estimate) 305 | ``` 306 | alco_mhhincome %>% filter(! is.na(estimate)) %>% plot(alco_mhhincome['estimate']) 307 | ``` 308 | ```{r} 309 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate']) 310 | plot(alco_mhhincome['estimate']) 311 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate']) 312 | plot(alco_mhhincome['estimate']) 313 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate']) 314 | plot(alco_mhhincome['estimate']) 315 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),]['estimate']) 316 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),][,'estimate']) 317 | plot(alco_mhhincome['estimate']) 318 | plot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),][,'estimate']) 319 | ggplot(alco_mhhincome, aes(fill = estimate)) + 320 | geom_sf() 321 | alco_mhhincome %>% filter(! 
is.na(estimate)) %>% 322 | ggplot(alco_mhhincome, aes(fill = estimate)) + 323 | geom_sf() 324 | ggplot(alco_mhhincome[!is.na(alco_mhhincome['estimate']),], aes(fill = estimate)) + 325 | geom_sf() 326 | ggplot(alco_mhhincome, aes(fill = estimate)) + 327 | geom_sf() + 328 | xlim(122.5, 122.35) 329 | ggplot(alco_mhhincome, aes(fill = estimate)) + 330 | geom_sf() + 331 | xlim(122.25, 122.35) 332 | ggplot(alco_mhhincome, aes(fill = estimate)) + 333 | geom_sf() + 334 | xlim(122.75, 122.35) 335 | ggplot(alco_mhhincome, aes(fill = estimate)) + 336 | geom_sf() + 337 | xlim(122.55, 122.35) 338 | alco_mhhincome <- get_acs(geography='tract', 339 | variables="B19013_001", 340 | year = 2020, 341 | state='CA', 342 | county='Alameda', 343 | geometry=TRUE # get the geography too 344 | ) 345 | ```{r} 346 | head(alco_mhhincome) 347 | ``` 348 | plot(alco_mhhincome['estimate']) 349 | ggplot(alco_mhhincome, aes(fill = estimate)) + 350 | geom_sf() 351 | ggplot(alco_mhhincome, aes(fill = estimate)) + 352 | geom_sf() 353 | ```{r} 354 | plot(med_hhincome['estimate']) 355 | med_hhincome <- get_acs(geography='tract', 356 | variables="B19013_001", 357 | year = 2020, 358 | state='CA', 359 | county='San Francisco', 360 | geometry=TRUE # get the geography too 361 | ) 362 | ```{r} 363 | head(med_hhincome) 364 | ``` 365 | ```{r} 366 | plot(med_hhincome['estimate']) 367 | ggplot(med_hhincome, aes(fill = estimate)) + 368 | geom_sf() 369 | ggplot(med_hhincome, aes(fill = estimate)) + 370 | geom_sf() + 371 | xlim(-122.55, -122.3) 372 | inc_by_race <- c(White = "B19013H_001", 373 | Black = "B19013B_001", 374 | Asian = "B19013D_001", 375 | Hispanic = "B19013I_001" ) 376 | alco_mhhinc_by_race <- get_acs(geography='tract', 377 | variables=inc_by_race, 378 | year = 2019, 379 | state='CA', 380 | county='Alameda', 381 | geometry=T ) 382 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 383 | ggplot(aes(fill = estimate)) + 384 | facet_wrap(~variable) + 385 | geom_sf(color=NA) + 386 | 
scale_fill_viridis_c(option="magma") 387 | # Display the map 388 | medhhinc_facet_map 389 | # 390 | ``` 391 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 392 | ggplot(aes(fill = estimate)) + 393 | facet_wrap(~variable) + 394 | geom_sf() 395 | #geom_sf(color=NA) + 396 | scale_fill_viridis_c(option="magma") 397 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 398 | ggplot(aes(fill = estimate)) + 399 | facet_wrap(~variable) + 400 | geom_sf() + 401 | #geom_sf(color=NA) + 402 | scale_fill_viridis_c(option="magma") 403 | # Display the map 404 | medhhinc_facet_map 405 | # 406 | ``` 407 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 408 | ggplot(aes(fill = estimate)) + 409 | facet_wrap(~variable) + 410 | geom_sf(color=NA) + # why color=NA? 411 | scale_fill_viridis_c(option="magma") 412 | # Display the map 413 | medhhinc_facet_map 414 | # 415 | ``` 416 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 417 | ggplot(aes(fill = estimate)) + 418 | facet_wrap(~variable) + 419 | geom_sf(color=NA) + # why color=NA? 420 | scale_fill_viridis_c(option="magma") 421 | # Display the map 422 | medhhinc_facet_map 423 | # 424 | ``` 425 | # Create the map 426 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 427 | ggplot(aes(fill = estimate)) + 428 | facet_wrap(~variable) + 429 | geom_sf(color=NA) + # why color=NA? 430 | scale_fill_viridis_c(option="plasma") 431 | # Display the map 432 | medhhinc_facet_map 433 | # Create the map 434 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 435 | ggplot(aes(fill = estimate)) + 436 | facet_wrap(~variable) + 437 | geom_sf(color=NA) + # why color=NA? 
438 | scale_fill_viridis_c(option="magma") 439 | # Display the map 440 | medhhinc_facet_map 441 | mapview(med_hhincome) 442 | mapview(med_hhincome, zcol="estimate") 443 | mapview(med_hhincome) 444 | # Create a color palette 445 | pal <- colorNumeric( 446 | palette = "YlOrRd", 447 | domain = med_hhincome$estimate 448 | ) 449 | # specify dataset 450 | leaflet(med_hhincome) %>% 451 | addProviderTiles(providers$CartoDB.Positron) %>% 452 | # adjust color palette and ploygon features. 453 | addPolygons(stroke = FALSE, smoothFactor = 0.2, fillOpacity = .5, 454 | color = ~pal(estimate)) %>% 455 | # add legend 456 | addLegend(pal = pal, values = ~estimate, 457 | title = "Median Household Income", 458 | labFormat = labelFormat(prefix = "$"), 459 | position = "bottomleft") 460 | sf_rented <- get_decennial(geography = "tract", # census tabulation unit 461 | variables = "H004004", #number of households rented 462 | year = 2010, 463 | summary_var = "H004001", # Total households 464 | state='CA', 465 | county='San Francisco', 466 | geometry=T) 467 | sf_pct_rented <- sf_rented[sf_rented$value > 0,] %>% 468 | mutate(pct = 100 * (value / summary_value)) 469 | # Take a look 470 | head(sf_pct_rented) 471 | ### Map the result 472 | ```{r, eval=F} 473 | plot(sf_pct_rented['pct']) 474 | ``` 475 | sf_medrent <- get_acs(geography="tract", 476 | variables=c(median_rent2019="B25064_001"), 477 | year =2019, 478 | state="CA", 479 | county=c("San Francisco"), 480 | geometry=T) 481 | plot(sf_medrent[!is.na(sf_medrent$estimate),]['estimate']) 482 | sf_medrent %>% 483 | drop_na(estimate) %>% 484 | ggplot(aes(fill = estimate)) + 485 | geom_sf(color=NA) + 486 | coord_sf(crs = 26910) + # CRS for Northern CA - UTM 10 487 | scale_fill_viridis_c(option = "magma") 488 | mapview(sf_medrent) 489 | mapview(sdf_medrent, zcol='estimate') 490 | ``` 491 | mapview(sdf_medrent, zcol='estimate') 492 | ``` 493 | mapview(sf_medrent, zcol='estimate') 494 | ``` 495 | View(sf_medrent) 496 | mapview(sf_medrent, 
zcol='moe') 497 | ?get_acs 498 | ?write_csv 499 | write_csv(state_pop, here('data_out/state_pop_2010.csv' ) 500 | ) 501 | ?write_csv 502 | write.csv(state_pop, here('data_out/state_pop_2010.csv') ) 503 | write_csv(state_pop, here('data_out/state_pop_2010.csv') ) 504 | # Uncomment this to install packages, if necessary. 505 | # install.packages(c("here", "tidyverse", "sf", "leaflet", "mapview", "tigris", "tidycensus")) 506 | library(here) 507 | library(tidyverse) 508 | library(sf) 509 | library(leaflet) 510 | library(mapview) 511 | library(tigris) 512 | library(tidycensus) 513 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /Census-Data-in-R.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: XeLaTeX 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 dlab-geo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Lessons/Census-Data-in-R-Challenges.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R-Census-Data-Maps-Challenges.Rmd" 3 | author: "Avery Richards" 4 | date: "3/21/2022" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | 14 | # install pacman for package management. 15 | if (!require("pacman")) install.packages("pacman") 16 | 17 | pacman::p_load( 18 | here, # locate files 19 | tidyverse, # data wrangling 20 | sf, # geospatial data management 21 | leaflet, # interactive mapping 22 | tigris, tidycensus # census data 23 | ) 24 | 25 | ``` 26 | 27 | These six libraries should be loaded in your environment now. 28 | 29 | ```{r} 30 | 31 | # If you run this chunk, output from the "here" function should be visible below. This is your local directory path. We can use this to import files later on. 32 | here() 33 | 34 | ``` 35 | 36 | *Solutions are available in the Solutions folder, as needed.* 37 | 38 | ### Challenge 1 39 | 40 | > Use the `get_decennial` function like we've seen above, but fill in the code arguments to fetch total population in 2010 just for California. 41 | 42 | ```{r, challenge_1_code_A } 43 | 44 | # add your code here. 
45 | 46 | 47 | ``` 48 | 49 | Alter the above code to fetch total pop in 2010 for CA, TX and FL 50 | ```{r, challenge_1_code_B } 51 | 52 | # add your code here. 53 | 54 | ``` 55 | 56 | 57 | ### Challenge 2 58 | 59 | > You can also filter tidycensus results by `county`. Alter the code below to fetch 2010 population for Alameda & San Francisco Counties. 60 | 61 | 62 | ```{r} 63 | 64 | get_decennial(geography = "county", # census tabulation unit 65 | variables = "P001001", # variable(s) of interest 66 | year = 2010, # census year 67 | state='CA', # Filter by state is CA 68 | county='Alameda') # Filter by county Alameda 69 | ``` 70 | 71 | 72 | 73 | ```{r} 74 | 75 | # add your code here. 76 | 77 | ``` 78 | 79 | What was the total population in the US in 2010? 80 | ```{r, challenge_2_code_B } 81 | 82 | # add your code here. 83 | 84 | ``` 85 | 86 | What census region had the largest population in the US in 2010? 87 | ```{r, challenge_2_code_C } 88 | 89 | # add your code here. 90 | 91 | ``` 92 | 93 | ## Challenge 3 94 | 95 | > Use the `get_decennial` function to fetch and plot an `Avg Family Size` variable by CA County in `2010`, and save the result as a dataframe named `ca_fam_size`. Once you've done that, plot the dataframe with the `ggplot` call below. __Hint: "P037001"__ 96 | 97 | ```{r, challenge_3A} 98 | 99 | # add your code here to create a ca_fam_size dataframe with get_decennial. 100 | 101 | ``` 102 | 103 | 104 | 105 | ```{r, challenge_3B} 106 | 107 | # uncomment and plot from the ca_fam_size dataframe. 108 | # ca_fam_size %>% 109 | # ggplot(aes(x = value, 110 | # y = reorder(NAME, value))) + 111 | # geom_point() 112 | 113 | ``` 114 | 115 | ## Challenge 4 116 | 117 | > Repeat the previous challenge with data from the `2000` decennial census. Don't assume variable names are the same across the 2000 and 2010 censuses. 118 | 119 | - Use `load_variables` to check! 
120 | 121 | 122 | ```{r, challenge_4} 123 | # Add your code below 124 | 125 | 126 | ``` 127 | 128 | 129 | ## Challenge 5 130 | 131 | Create a `map` of CA Median Age by county in 2010. 132 | 133 | ```{r, challenge_5A} 134 | 135 | # Add your code to fetch the data for the map. 136 | 137 | ``` 138 | 139 | 140 | ```{r, challenge_5B} 141 | 142 | # Map the data with the plot function 143 | 144 | 145 | ``` 146 | 147 | 148 | ```{r, challenge_5C} 149 | 150 | # Map the data with ggplot, setting the CRS to 3310 (CA Albers), the preferred default CRS for statewide maps of CA. 151 | 152 | ``` 153 | 154 | 155 | ## Challenge 6 156 | 157 | > Make a ggplot map of MEDIAN GROSS RENT (`"B25064_001"`) in San Francisco County by tract 158 | using data from the ACS 2016—2020 5-year dataset 159 | 160 | 161 | ```{r, challenge_6} 162 | 163 | # Add your code here. 164 | 165 | ``` 166 | 167 | ## Challenge 7 168 | 169 | >Use `mapview` to create an interactive map of SF median household rent. 170 | 171 | ```{r, challenge_7} 172 | 173 | # Use mapview to create an interactive map of median household rent (from challenge 6). 174 | # Your code here 175 | 176 | ``` 177 | 178 | > *Check Census-Data-in-R-Solutions.Rmd for answers, as needed.* 179 | -------------------------------------------------------------------------------- /Lessons/Census-Data-in-R-Slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Data Wrangling and Mapping in R" 3 | author: "Patty Frontiera, Irene Farah, Avery Richards" 4 | date: "4/1/2022" 5 | output: 6 | ioslides_presentation 7 | editor_options: 8 | chunk_output_type: console 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE) 13 | 14 | ``` 15 | 16 | # Getting Started 17 | 18 | ## Setup 19 | 20 | Welcome! While we're waiting: 21 | 22 | - Navigate to the workshop webpage: 23 | 24 | - Scroll down and read the `Readme` section. 
25 | 26 | - **Clone or download** the workshop files by clicking on the green `CODE` button. 27 | 28 | - If you download the zipfile, **unzip it**. 29 | 30 | - Make a note of the folder in which the workshop files reside. 31 | 32 | ## Introduction 33 | 34 | - About me 35 | 36 | - About you 37 | 38 | - Your familiarity with US Census data 39 | - With geospatial data 40 | - With geospatial data in R 41 | 42 | ## Outline 43 | 44 | - **Brief** overview of the primary US Census data products 45 | 46 | - Introduce R packages for working with census data 47 | 48 | - Use those packages to fetch census data 49 | 50 | - Use those packages to fetch census data plus census geographic boundary files 51 | 52 | - Make maps of census data 53 | 54 | # Census Data Overview 55 | 56 | ## US Census Bureau 57 | 58 | The "nation's leading provider of quality data about its people and economy." 59 | 60 | - 61 | 62 | ## Primary Census Products 63 | 64 | - Decennial Census 65 | 66 | - American Community Survey (ACS) 67 | 68 | ## Decennial Census 69 | 70 | Complete count of the population every 10 years since `1790` 71 | 72 | A snapshot of the American population in time, with an `April 1` reference date. 73 | 74 | Includes data on 75 | 76 | - `Population`: by sex, age, race/ethnicity, and family / household relationships 77 | 78 | - `Housing`: by occupancy (occupied, vacant), tenure (owned, rented), and group quarters 79 | 80 | From 1840 to 2000, additional questions were asked of a `sample` of the population. 81 | 82 | - See 83 | 84 | ## American Community Survey (ACS) 85 | 86 | Since 2005, the `American Community Survey (ACS)` has replaced the decennial census `sample` data questions. 87 | 88 | - Annual survey of a sample of about 3.5 million households released for 1, 3 or 5 year period. 
89 | 90 | - Provides `period estimates` of demographic, social, economic, and housing characteristics 91 | 92 | - Includes `margin of error` values for the estimates 93 | 94 | ## ACS Data Products 95 | 96 | ACS 1-year and 5-year estimates are currently available through 2020 97 | 98 | - New data is released at the end of the next year (e.g., 2020 data in Dec 2021) 99 | - But COVID is causing a [delay in the release dates](https://www.census.gov/programs-surveys/acs/news/data-releases/2020/release-schedule.html) and the `2020 data was just released!!` 100 | 101 | `ACS 3-year` estimates are no longer available (2008---2013) 102 | 103 | - More data tables are available for the ACS 5-year estimates than for the ACS 1 year or ACS 3 year estimates. 104 | 105 | See: [Census ACS: Guidance for Data Users](https://www.census.gov/programs-surveys/acs/guidance.html) 106 | 107 | ## [ACS Period Estimates](https://www.census.gov/programs-surveys/decennial-census/decade/2020/news/blog-posts.html) 108 | 109 | The `ACS 1 year estimates` include data from a sample of the population collected over a one year period. 110 | 111 | Five years of data are pooled together, weighted and processed as a whole dataset to create the `ACS 5 year estimates`. 112 | 113 | Use the `ACS 1 year estimates` when you want the `most current data` and are less concerned about precision (larger margins of error). However, the ACS 1 year estimates are only available for areas with large populations (65,000+) and for a subset of data tables. 114 | 115 | Use the `ACS 5 year estimates` when you want `more stability in the estimates`, more data tables, and smaller geographic tabulation units. But it can be tricky to interpret the data if the five year period is not stable (e.g., COVID and the 2016---2020 ACS 5 year estimates). 
116 | 117 | ## 118 | 119 | | Demographic\* | Social | Economic | Housing | 120 | |------------------|------------------|------------------|------------------| 121 | | Sex | Families | Income | Tenure\* | 122 | | Age | Education | Benefits | Occupancy\* | 123 | | Race | Marital Status | Employment Status | Group quarters\* | 124 | | Hispanic Origin | Fertility | Occupation | Housing Value | 125 | | Relationships | Grandparents | Industry | Taxes & Insurance | 126 | | | Veterans | Commuting | Utilities | 127 | | | Disability Status | Place of Work | Mortgage | 128 | | | Language at Home | Health Insurance | Monthly Rent | 129 | | | Citizenship | | Structure Type | 130 | | *\*decennial census* | Mobility | | | 131 | 132 | ## Census Geographies 133 | 134 | Census data is collected from individuals. The individual-level response data is called `microdata`. 135 | 136 | For privacy reasons, only a very limited subset of census microdata is publicly available as the [Public Use Microdata Samples (PUMS)](https://www.census.gov/programs-surveys/acs/microdata.html) data. 137 | 138 | Most census data is made publicly available only when aggregated to a geographic `tabulation unit`. 139 | 140 | - Tabulation units include states, counties, census tracts, block groups, blocks, etc. 141 | 142 | Not all census data is available for all geographic tabulation units. For example, only decennial census data are available at the block level. 143 | 144 | ## Census Geographic Tabulation Units 145 | 146 | 147 | 148 | ## Census Data and Census Geographies 149 | 150 | 151 | 152 | ## Census Data Workflow 153 | 154 | Identify your 155 | 156 | - `Topic of interest`, e.g., population by age, income, monthly rents, etc... 157 | - `Dataset`: Decennial Census or ACS 1-yr or ACS 5-yr? 158 | - `Year(s)`: for what time period? 159 | - `Geographic tabulation unit` of aggregation (county, tract, etc.) 
160 | - `Geographic filter` by state(s) or counties 161 | 162 | Then determine what specific census variables are available for your topic. 163 | 164 | ## CAUTION 165 | 166 | "If you want to measure change you can't change the measures!" 167 | 168 | **Census tables, variables, geographies, and geographic boundaries change over time!** 169 | 170 | Measuring change over time with census data is *its own thing*, complex, and not covered by this workshop! 171 | 172 | ## Getting Census Data 173 | 174 | Here are three of the primary websites from which you can directly download census data: 175 | 176 | - [data.census.gov](https://data.census.gov/cedsci) 177 | - [NHGIS.org](https://www.nhgis.org/) 178 | - [Social Explorer](https://www.socialexplorer.com/), a subscription web platform but FREE for UCB community 179 | 180 | You can download Census `geographic data` directly on the [Census website](https://www.census.gov/programs-surveys/geography/guidance/tiger-data-products-guide.html). 181 | 182 | ## Census APIs 183 | 184 | You can write code to fetch data from the [Census Web APIs](https://www.census.gov/data/developers/data-sets.html) 185 | 186 | - `API`: application programming interface 187 | 188 | - `Web API`: URLs can be formatted to make queries that return data 189 | 190 | Or you can leverage an existing R package to make this easier! 191 | 192 | - That's what we will do! 193 | 194 | *Only a subset of recent Census data products are available via APIs.* 195 | 196 | # R Packages for Working with Census Data 197 | 198 | ## R Packages for Working with Census Data 199 | 200 | These are the ones we recommend and will use today. 
201 | 202 | - [tidycensus](https://walker-data.com/tidycensus/) 203 | 204 | - [tidyverse](https://www.tidyverse.org/) 205 | 206 | - [sf](https://r-spatial.github.io/sf/) 207 | 208 | - [mapview](https://r-spatial.github.io/mapview/) 209 | 210 | ## [tidycensus](https://walker-data.com/tidycensus/) 211 | 212 | An R package with functions that make it easier to fetch decennial census and ACS data from the Census APIs. 213 | 214 | Only a limited set of Census data is available via `tidycensus`: 215 | 216 | - Decennial census: 1990, 2000, and 2010 217 | 218 | - ACS 1 yr: 2005 through 2019 219 | 220 | - ACS 5 yr: 2005---2009 through 2015---2019 221 | 222 | Actively maintained and expanding to include more census data products (see `tidycensus` website) 223 | 224 | ## About [tidycensus](https://walker-data.com/tidycensus/) 225 | 226 | Developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census APIs in **R** in a `tidy` format to analyze, plot, and map. 227 | 228 | Check out his [website](https://walkerke.github.io/) to keep abreast of his great packages, blog posts, and tutorials. 229 | 230 | And his new ebook [Analyzing the US Census with R](https://walker-data.com/census-r/), currently available to read online.
231 | 232 | ## [tidycensus tutorials](https://walker-data.com/tidycensus/articles/basic-usage.html) 233 | 234 | 235 | 236 | ## [tidyverse](https://www.tidyverse.org) 237 | 238 | The **tidyverse** package is an umbrella package that installs all the core tidyverse packages and makes them easier to manage and load in R, including: 239 | 240 | - `ggplot2`, for data visualization 241 | - `dplyr`, for data manipulation 242 | - `tidyr`, for data tidying 243 | - `readr`, for data import 244 | - `purrr`, for functional programming 245 | - `tibble`, for tibbles, a modern re-imagining of data frames 246 | - `stringr`, for strings 247 | - `forcats`, for factors 248 | 249 | ## [sf](https://r-spatial.github.io/sf/) package 250 | 251 | `Simple features` for geospatial data objects and methods. 252 | 253 | - The main R package for working with vector geospatial data 254 | - `vector`: locations represented as points, lines and polygons 255 | 256 | `sf` is loaded and used automatically by `tidycensus`. 257 | 258 | The online book [Geocomputation with R](https://geocompr.robinlovelace.net/) is a great resource for learning about the `sf` package and working with geospatial data in R. 259 | 260 | ## [mapview](https://r-spatial.github.io/mapview/) 261 | 262 | `mapview` provides functions to quickly and easily create interactive maps for data exploration. 263 | 264 | 265 | 266 | ## Requesting a Census API key 267 | 268 | Before you can fetch data from the Census APIs, you must have a free `Census API Key` 269 | 270 | Request one now if you don't have one yet! 271 | 272 | - [Census API key signup](https://api.census.gov/data/key_signup.html) 273 | 274 | # Hands-on Tutorial Time! 275 | 276 | ## Setup 277 | 278 | **Clone or download and unzip** the workshop files from: 279 | 280 | Then: 281 | 282 | 1. Open the folder with the workshop files 283 | 284 | 2. Double-click on the R Project file `Census-Data-in-R.Rproj` 285 | 286 | 3. This should open RStudio - with the `Files` panel displaying the workshop folder contents. 287 | 288 | 4.
Double-click on the file `Census-Data-in-R.Rmd` in the `Lessons` folder to follow along! 289 | 290 | - You can also open the file `Census-Data-in-R.html` to follow along in a web browser. 291 | -------------------------------------------------------------------------------- /Lessons/Census-Data-in-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Data Wrangling and Mapping in R" 3 | author: "Patty Frontiera" 4 | date: "04/01/2022" 5 | theme: readable 6 | output: 7 | html_document: 8 | toc: yes 9 | toc_float: yes 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | # Census Data and `tidycensus` 17 | 18 | In this tutorial, we will work through several exercises using the [tidycensus](https://walker-data.com/tidycensus/index.html) R package to fetch, wrangle, and map census data. 19 | 20 | The key `tidycensus` functions we will use today are: 21 | 22 | - `census_api_key`: makes your Census API key available to tidycensus 23 | 24 | - `load_variables`: retrieves a dataframe of available census data variables 25 | 26 | - `get_decennial`: fetch census data from recent decennial censuses - 2000, 2010 (and soon 2020) 27 | 28 | - `get_acs`: fetch census data from an ACS (American Community Survey) 1 or 5 year dataset, 2005 - 2020. 29 | 30 | # Setup 31 | 32 | Be sure to **clone** or **download and unzip** the workshop files from: 33 | 34 | Then: 35 | 36 | 1. Open the folder with the workshop files 37 | 38 | 2. Double-click on the R Project file `Census-Data-in-R.Rproj` 39 | 40 | 3. This should open RStudio - with the `Files` panel displaying the workshop folder contents. 41 | 42 | 4. Double-click on the file `Census-Data-in-R.Rmd` to follow along!
43 | 44 | *You can also click on the file `Census-Data-in-R.html` in the Files tab to open the workshop tutorial in a web browser.* 45 | 46 | ## Install packages 47 | 48 | > If you installed any of these packages a while ago (especially `tidycensus`), it's a good idea to install updates when you can (though not during the workshop as things can break!). 49 | 50 | ```{r} 51 | 52 | # Uncomment this to install packages, if necessary. 53 | # install.packages(c("here", "tidyverse", "sf", "leaflet", "mapview", "tigris", "tidycensus")) 54 | 55 | library(here) 56 | library(tidyverse) 57 | library(sf) 58 | library(leaflet) 59 | library(mapview) 60 | library(tigris) 61 | library(tidycensus) 62 | 63 | ``` 64 | 65 | These seven libraries should be loaded in your environment now. 66 | 67 | ```{r} 68 | 69 | # If you run this chunk, output from the "here" function should be visible below. This is your local directory path. We can use this to import files later on. 70 | here() 71 | 72 | ``` 73 | 74 | ## Census API Key 75 | 76 | You need a `Census API key` to programmatically fetch census data. 77 | 78 | - Get it here (pretty quickly): [https://api.census.gov/data/key_signup.html](https://api.census.gov/data/key_signup.html) 79 | 80 | - The key will be sent to your email and you will need to click to activate it. 81 | 82 | - Keep the email with the key open for use in this workshop. 83 | 84 | *For more info on all available Census APIs see: [Census Data API datasets](https://www.census.gov/data/developers/data-sets.html)* 85 | 86 | ### Add Your Census API Key 87 | 88 | To use your Census API Key in R 89 | 90 | 1. Copy and paste your Census API key from your email 91 | 92 | 2. Use the tidycensus function **census_api_key** to register your API key with tidycensus. **Don't forget to put quotes around the key!** 93 | 94 | ```{r, eval=FALSE} 95 | 96 | # Install your census api key - long alphanumeric string 97 | census_api_key("THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS") 98 | 99 | ``` 100 | 101 | Another way to add your Census API Key: 102 | 103 | I keep my key in a file so no one can see it.
One way to do this is by making a script that creates a variable holding the key, and then using the `source` function to run that script so the variable is available in your R environment. The code chunk below is an example of how you might do that: 104 | 105 | ```{r,eval=FALSE} 106 | 107 | # source (run) an r script that creates a variable with my key 108 | #source("/Users/pattyf/Documents/Dlab/workshops/keys/census_api_key.R") 109 | 110 | #print(my_census_api_key) 111 | 112 | # register the key 113 | census_api_key(key = my_census_api_key) 114 | ``` 115 | 116 | # Decennial Census Data 117 | 118 | ## The `get_decennial` function 119 | 120 | We start by fetching `total population` from the 2010 Census with tidycensus's `get_decennial` function. Let's first talk about the code. 121 | 122 | ```{r} 123 | pop2010 <- get_decennial(geography = "state", # census tabulation unit 124 | variables = "P001001", # variable(s) of interest 125 | year = 2010) # census year 126 | 127 | head(pop2010) 128 | ``` 129 | 130 | ## Fetching data for more than one Census variable 131 | 132 | We can pass a vector of census identifiers to the `variables` function argument if we want to get data for more than one variable. Below we add `P002002` for population in urban areas. 133 | 134 | ```{r} 135 | pop2010 <- get_decennial(geography="state", 136 | variables = c("P001001","P002002"), # variable(s) of interest 137 | year = 2010) # census year 138 | 139 | # take a look 140 | head(pop2010) 141 | ``` 142 | 143 | We can see the data for both variables if we sort the output by state name. 144 | 145 | ```{r} 146 | # Sort dataframe by state names (the NAME column) 147 | pop2010 %>% arrange(NAME) %>% head() 148 | ``` 149 | 150 | ## tidycensus returns `tidy` data 151 | 152 | By default, tidycensus returns data in a `tidy`, or `long` format that allows data for multiple variables to be contained within the `variable` and `value` columns.
153 | 154 | This is in contrast to untidy, or `wide` data where each variable is in its own column. 155 | 156 | tidycensus can return `wide` data if you add the parameter `output="wide"` to the function call. 157 | 158 | ```{r} 159 | 160 | # wide format 161 | pop2010w <- get_decennial(geography = "state", # census tabulation unit 162 | variables = c("P001001","P002002"), # variable(s) of interest 163 | year = 2010, # census year 164 | output="wide") # get output in wide format 165 | head(pop2010w) 166 | ``` 167 | 168 | ### The `GEOID` column 169 | 170 | The `GEOID` column is included in tidycensus output by default. 171 | 172 | This is a Census geographic identifier for the tabulation unit. 173 | 174 | The `GEOID` is sometimes called the Census `FIPS` code and for most tabulation units these are the same. 175 | 176 | The `GEOID` makes it possible to link Census demographic data to Census geographic data and make maps. We will do this in a bit. 177 | 178 | The `GEOID` is a text string and must be quoted. 179 | 180 | - Beware of GEOID leading zeros, since some software will remove these and convert GEOID values to numbers (rather than text strings). 181 | 182 | > Question: **What is the GEOID for California?** 183 | 184 | # Census Tabulation Units 185 | 186 | Public census data is typically aggregated by census geographies to protect privacy. 187 | 188 | These census geographies are called `Census tabulation units`. 189 | 190 | - Some of these are real administrative units like states and counties. 191 | - Others are statistical units created by the census, like census tracts and block groups. 192 | 193 | Some of the most common geographic tabulation units and their tidycensus function abbreviations are shown below, along with **required** and available filters that limit what data are returned.
194 | 195 | | Geography | Definition | Filter(s) | Used in tidycensus | 196 | |-----------------|-----------------|-----------------|---------------------| 197 | | "us" | United States | | get_acs(), get_decennial() | 198 | | "region" | Census region | | get_acs(), get_decennial() | 199 | | "state" | State or equivalent | state | get_acs(), get_decennial() | 200 | | "county" | County or equivalent | state, county | get_acs(), get_decennial() | 201 | | "place" | Census place | state | get_acs(), get_decennial() | 202 | | "tract" | Census tract | **state**, county | get_acs(), get_decennial() | 203 | | "block group" | Census block group | **state**, county | get_acs(), get_decennial() | 204 | | "block" | Census block | **state**, **county** | get_decennial() only! | 205 | 206 | ## `get_decennial` Tabulation Units and Filters 207 | 208 | Let's take a few minutes to practice fetching population data with the `get_decennial` function. 209 | 210 | - See `?get_decennial` for help 211 | 212 | ### Challenge 1 213 | 214 | > Open **Census-Data-in-R-Challenges.Rmd** and use the `get_decennial` function like we've seen above to fetch population data. *Solutions are available in the Solutions folder, as needed.* 215 | 216 | ### Changing the tabulation unit 217 | 218 | Let's fetch 2010 population data for CA counties 219 | 220 | **What changes in the code?** 221 | 222 | ```{r} 223 | 224 | get_decennial(geography = "county", # census tabulation unit 225 | variables = "P001001", # variable(s) of interest 226 | year = 2010, # census year 227 | state='CA') # Filter by state is CA 228 | ``` 229 | 230 | #### Questions 231 | 232 | - How do we specify the state of CA above? How else can we? 233 | - Can you fetch population data for all counties in the USA or do you need to have a `state=` filter? 
234 | 235 | ### Adding a county filter 236 | 237 | You can also filter tidycensus results by `county` 238 | 239 | ```{r} 240 | 241 | get_decennial(geography = "county", # census tabulation unit 242 | variables = "P001001", # variable(s) of interest 243 | year = 2010, # census year 244 | state='CA', # Filter by state is CA 245 | county='Alameda') # Filter by county Alameda 246 | ``` 247 | 248 | ### Challenge 2 249 | 250 | > In **Census-Data-in-R-Challenges.Rmd**, alter the code above to fetch 2010 population for Alameda & San Francisco Counties. Then try Challenge 2B & 2C. 251 | 252 | ## Visualizing Results 253 | 254 | We can visualize data to get a quick overview of the distribution of the values. 255 | 256 | It's a first step in exploratory data analysis and a last step in data communication. 257 | 258 | `ggplot2` is the most commonly used R package for data visualization. 259 | 260 | - It is loaded when you load the `tidyverse` package. 261 | 262 | Let's use it to visualize the population data. 263 | 264 | ### Plot 2010 Population by state 265 | 266 | Use `ggplot2` to create an ordered horizontal bar chart. 267 | 268 | ```{r} 269 | 270 | # create a plot. 271 | pop_plot <- ggplot(data=pop2010, 272 | # set aesthetic variables 273 | aes(x=value/1000000, y=reorder(NAME,value)) ) + 274 | # pick geometry 275 | geom_bar(stat="identity") + 276 | # add theme and titles. 277 | theme_minimal() + 278 | labs(title = "2010 US Population by State") + 279 | xlab("Population (in Millions)") + 280 | ylab("State") 281 | 282 | # display the plot. 283 | pop_plot 284 | 285 | ``` 286 | 287 | Developing your `ggplot2` knowledge can really enhance your data analysis skills. 288 | 289 | In combination with `tidycensus` it creates a powerful, reproducible data science workflow. 290 | 291 | # Identifying Census Variables 292 | 293 | In the code above we fetched data for total population in 2010 using the variable `"P001001"`. 
294 | 295 | That is not an obvious variable name, so how do we get those census data identifiers? 296 | 297 | We can use the tidycensus `load_variables` function for this. 298 | 299 | ## `load_variables` function 300 | 301 | For any census dataset like the decennial census or the ACS 1 or 5-year estimates, use the `load_variables` function to fetch all available variables and identifiers. 302 | 303 | Since these datasets have many, many variables, save the resulting dataframe to a variable and cache it locally so you do not need to repeatedly retrieve it over the web. 304 | 305 | ```{r} 306 | 307 | vars2010 <- load_variables(year=2010, # Year or end year for ACS-5yr 308 | dataset = 'sf1', # 'sf1' for decennial census 309 | cache = TRUE) # Save fetched data locally 310 | 311 | # How large is the output 312 | dim(vars2010) 313 | 314 | # Take a look with head or View 315 | head(vars2010) 316 | 317 | ``` 318 | 319 | ## 2010 Decennial Census Tables & Variables 320 | 321 | - Over 3,000 unique variables that describe population and housing characteristics 322 | 323 | - Organized in `333` Tables 324 | 325 | - 177 population tables (identified with a ''P'') available to the block level 326 | - 58 housing tables (identified with an ''H'') available to the block level 327 | - 82 population tables (identified with a ''PCT'') available to the census tract level 328 | - 4 housing tables (identified with an "HCT") available to the census tract level 329 | - 10 population tables (identified with a "PCO") available to the county level 330 | - plus 2 additional PCT tables 331 | 332 | See: 333 | 334 | #### What Variable Has the 2010 Total Population value? 335 | 336 | We know this from our previous code blocks, but let's find it for practice navigating the dataframe. 337 | 338 | - Let's sort and filter the `vars2010` dataframe to find it. 339 | 340 | #### *Questions:* 341 | 342 | What 2010 decennial census variable contains... 
343 | 344 | - `Median Age` 345 | 346 | - `Average Family Size` 347 | 348 | - `Number of occupied housing units` 349 | 350 | \*See **Census-Data-in-R-Solutions.Rmd** if needed (under Variable Questions) 351 | 352 | ### Challenge 3 353 | 354 | > Return to **Census-Data-in-R-Challenges.Rmd** and use the `get_decennial` function to fetch the `Avg Family Size` variable by CA County in `2010`, saving the result as a dataframe named `ca_fam_size`. Once you've done that, plot the dataframe with the `ggplot` call below. **Hint: "P037001"** 355 | 356 | ### Challenge 4 357 | 358 | > Repeat the previous challenge with data from the `2000` decennial census. Don't assume variable names are the same across the 2000 and 2010 censuses. 359 | 360 | > **Use `load_variables` to check the variable name!** 361 | 362 | # Census Tract Data 363 | 364 | Census tracts are the most commonly used census tabulation unit. 365 | 366 | Let's fetch population data, changing the census tabulation unit to *tract* 367 | 368 | > Because of the large number of census tracts, you **MUST** specify a state when requesting these data with tidycensus. 369 | 370 | ```{r} 371 | ## Fetch population by **tract** for California. 372 | ca_tract_pop2010 <- get_decennial(geography = "tract", # census tab unit 373 | variables = "P001001", # var of interest 374 | year = 2010, # census year 375 | state='CA') # State filter 376 | 377 | # How many tracts in CA 378 | dim(ca_tract_pop2010) 379 | 380 | # take a look 381 | head(ca_tract_pop2010) 382 | ``` 383 | 384 | ## Fetching Census Tract Data 385 | 386 | Census tract data can be quite large! 387 | 388 | Fortunately, you can also limit the results to one or more counties. 389 | 390 | ```{r,} 391 | 392 | tract_pop2010 <- get_decennial(geography = "tract", # census tabulation unit 393 | variables = "P001001", # variable of interest 394 | year = 2010, # census year - only one!
395 | state="CA", # limit to California 396 | county=c("Alameda","Contra Costa")) # & counties 397 | 398 | dim(tract_pop2010) 399 | 400 | ``` 401 | 402 | ## Customizing tidycensus output 403 | 404 | What **two** things are new here? 405 | 406 | ```{r} 407 | 408 | #urban and rural pop for 3 CA counties 409 | ur_pop10 <- get_decennial(geography = "county", # census tabulation unit 410 | variables = c(urban="P002002",rural="P002005"), 411 | year = 2010, 412 | summary_var = "P002001", # The denominator 413 | state='CA', 414 | county=c("Napa","Sonoma","Mendocino")) 415 | 416 | ``` 417 | 418 | #### When fetching census data... 419 | 420 | We have already specified more than one variable: 421 | 422 | variables = c("P002002","P002005") 423 | 424 | 1. You can also rename the values in the output 'variable' column. 425 | 426 | 427 | 428 | variables = c(urban="P002002",rural="P002005") 429 | 430 | 2. You can identify a `summary_var` (a denominator - here, the total count of all people or households surveyed. Can be used for calculations like percent of total.) 431 | 432 | 433 | 434 | summary_var = "P002001" 435 | 436 | #### Now let's take a look at the resultant dataframe 437 | 438 | ```{r} 439 | # take a look at the results 440 | ur_pop10 441 | ``` 442 | 443 | #### Calculating Percents 444 | 445 | The `summary_value` column comes in handy when you want to compute percent of total, for example: 446 | 447 | ```{r} 448 | 449 | # Calculate the percent of population that is Urban or Rural 450 | ur_pop10 <- ur_pop10 %>% 451 | mutate(pct = 100 * (value / summary_value)) 452 | 453 | # Take a look at the output. 454 | ur_pop10 455 | 456 | ``` 457 | 458 | A plot gives us compact visual summaries of the data. 
459 | 460 | ```{r} 461 | 462 | ## Plot it with ggplot2 463 | myplot <- ggplot(data = ur_pop10, 464 | mapping = aes(x = NAME, fill = variable, 465 | y = ifelse(test = variable == "urban", 466 | yes = -pct, no = pct))) + 467 | geom_bar(stat = "identity") + 468 | scale_y_continuous(labels = abs, limits=c(-100,100)) + 469 | labs(title="Urban & Rural Population in Wine Country", 470 | x="County", y = " Percent of Population", fill="") + 471 | coord_flip() 472 | 473 | myplot 474 | 475 | ``` 476 | 477 | Don't worry if you don't get all the ggplot code now. It's here for reference. 478 | 479 | - You may want to check out D-Lab's *R Data Visualization with ggplot* workshop! 480 | 481 | ## Data Wrangling to Combine Data from 2 Censuses 482 | 483 | You can use your R skills to reformat the data and make it more usable. 484 | 485 | Let's fetch population data for 2010 and 2000 by state. 486 | 487 | Then we will combine these into one data frame using the `dplyr::bind_rows` function (loaded with the `tidyverse`) 488 | 489 | ```{r} 490 | 491 | # Fetch 2000 population data by state 492 | pop2000 <- get_decennial(geography = "state", 493 | variables = c(pop2000="P001001"), 494 | year = 2000) 495 | 496 | # Fetch 2010 population data by state 497 | pop2010 <- get_decennial(geography = "state", 498 | variables = c(pop2010="P001001"), 499 | year = 2010) 500 | 501 | # Use tidyverse `bind_rows` function to combine the data for these years 502 | state_pop <- bind_rows(pop2000, pop2010) 503 | 504 | # Take a look with head or View 505 | state_pop %>% arrange(NAME) %>% head(10) 506 | ``` 507 | 508 | # Saving `tidycensus` output 509 | 510 | The data we fetch using `tidycensus` is stored in an R dataframe. 511 | 512 | We can use the `write.csv` or `write_csv` function to save the contents of a dataframe to a CSV file.
513 | 514 | ```{r} 515 | write_csv(state_pop, here('data_out/state_pop_2010.csv') ) 516 | ``` 517 | 518 | > **Any Questions?** 519 | 520 | # Mapping Census Data 521 | 522 | You can fetch census geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions 523 | 524 | - Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs. 525 | 526 | You can then use your favorite R mapping packages like `sf`, `ggplot`, `tmap`, `mapview` and `leaflet` to make maps. 527 | 528 | ## Geometry Options 529 | 530 | Before fetching census geographic data, we need to set the option `tigris_use_cache` to TRUE 531 | 532 | Caching saves data locally. This greatly speeds things up if you fetch the same census geographic data repeatedly. 533 | 534 | ```{r} 535 | 536 | # Tigris options - used by tidycensus 537 | # Cache retrieved geographic data locally 538 | options(tigris_use_cache = TRUE) 539 | 540 | ``` 541 | 542 | ## Fetch Geographic Boundary Data with `tidycensus` 543 | 544 | We fetch the census geographic data by setting **geometry=TRUE**. 545 | 546 | ```{r} 547 | 548 | pop2010geo <- get_decennial(geography = "state", 549 | variables = c(pop10="P001001"), 550 | year = 2010, 551 | output="wide", 552 | geometry=TRUE) # Fetch geometry data for mapping 553 | 554 | ``` 555 | 556 | ### Take a look 557 | 558 | Let's take a minute to discuss the format of an `sf` spatial object. 559 | 560 | ```{r} 561 | 562 | head(pop2010geo, 3) 563 | 564 | ``` 565 | 566 | # Geospatial Data in R 567 | 568 | The `tidycensus` package uses the R `sf` package to manage geospatial data. 
569 | 570 | R `sf` objects include: 571 | 572 | - a dataframe with a geometry column, labeled `geometry` 573 | 574 | - The geometry can be of type POINT, LINESTRING, POLYGON 575 | - or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON 576 | 577 | - a `CRS` (coordinate reference system), specified by 578 | 579 | - EPSG (SRID) code 580 | - proj4string 581 | 582 | For a deeper understanding of the `sf` package and its functionality, we recommend 583 | 584 | - our [R-Geospatial-Fundamentals](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals) workshop 585 | 586 | - the [Geocomputation with R ebook](https://geocompr.robinlovelace.net/) 587 | 588 | ## Census Data Coordinate Reference System (CRS) 589 | 590 | All geospatial data are referenced to the surface of the earth with a `CRS`, or `coordinate reference system`. Anyone working with geospatial data will need to develop an understanding of CRSs. 591 | 592 | Fortunately, many of us are familiar with longitude and latitude, which are geographic coordinates. But there are different versions of geographic CRSs. And there are also projected CRSs which transform longitude and latitude to a 2 dimensional surface for mapping & analysis. 593 | 594 | All census geographic data use the `NAD83` geographic CRS. `NAD83` stands for North American Datum of 1983. This CRS (or version of latitude and longitude) is best for locations in North America. 595 | 596 | Many geospatial operations require that you transform data to a common CRS before conducting spatial analysis or mapping. 597 | 598 | - This could be an issue if you try to combine the census geospatial data with other geospatial data. But it is not an issue in this tutorial. 599 | 600 | An in-depth discussion of CRSs is outside the scope of this workshop. See [Geocomputation with R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information.
601 | 602 | ## Mapping `sf` Spatial Objects 603 | 604 | We can use the `plot` method that `sf` provides to make a quick map of the geometry stored in an `sf` spatial object. 605 | 606 | ```{r} 607 | # plot the geometry column data 608 | plot(pop2010geo$geometry) 609 | ``` 610 | 611 | ### The Challenge of US maps 612 | 613 | The vast geographic extent and non-contiguous nature of the USA make it difficult to map. 614 | 615 | Fortunately, tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas. 616 | 617 | ```{r} 618 | 619 | pop2010geo_shifted <- get_decennial(geography = "state", 620 | variables = c(pop10="P001001"), 621 | output="wide", 622 | year = 2010, 623 | geometry=TRUE, 624 | shift_geo=TRUE) 625 | 626 | ## Shift Happens! 627 | plot(pop2010geo_shifted$geometry) 628 | ``` 629 | 630 | ## Saving Spatial Objects 631 | 632 | You can save any `sf` spatial data object to an [ESRI shapefile](https://en.wikipedia.org/wiki/Shapefile) using `st_write` 633 | 634 | ```{r, eval=F} 635 | 636 | st_write(pop2010geo_shifted, here("data_out/usa_pop2010_shifted.shp")) 637 | 638 | ``` 639 | 640 | Now take a look at the output shapefile. 641 | 642 | ```{r, eval=F} 643 | 644 | # Check to see if the data was written out to a shapefile 645 | dir(here("data_out")) 646 | ``` 647 | 648 | ### ESRI Shapefiles 649 | 650 | You can see from this output that an ESRI shapefile is actually a collection of files that all have the same prefix. 651 | 652 | Shapefiles are the most common file format for geospatial data. So it's worthwhile to learn more about them if you will be working with census geographic data. 653 | 654 | - Check out the [Shapefile Wikipedia page](https://en.wikipedia.org/wiki/Shapefile) for more information. 655 | 656 | ## Mapping Data Values 657 | 658 | You can use the sf `plot` command to make a map that sets the color of the geometry by the data values 659 | 660 | - This type of map is called a `thematic map`.
661 | 662 | - When the features being plotted are areas (or polygons), it's called a `choropleth` map! 663 | 664 | ```{r} 665 | # Name the column with the variable values to make 666 | # a thematic map, also called a choropleth map. 667 | plot(pop2010geo_shifted['pop10']) 668 | 669 | ``` 670 | 671 | ### `ggplot2` Map 672 | 673 | `ggplot` knows how to map sf objects! 674 | 675 | ```{r} 676 | 677 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 678 | geom_sf() # tells ggplot that geographic data are being plotted 679 | 680 | ``` 681 | 682 | If you are familiar with `sf` objects and `ggplot` you can further customize your maps. 683 | 684 | ```{r} 685 | 686 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 687 | geom_sf(color=NA) + # What does color=NA do 688 | coord_sf(crs = 3857) + # Dynamically change the CRS 689 | scale_fill_viridis_c(option = "viridis") # Change the color palette 690 | # Try different options, e.g. 691 | # plasma, magma, inferno, cividis 692 | 693 | ``` 694 | 695 | ### Challenge 5 696 | 697 | > In your **Census-Data-in-R-Challenges.Rmd** file, create a map of `Median Age by California County in 2010`. *Solutions are in the Census-Data-in-R-Solutions.Rmd file* 698 | 699 | ### Fetch Census Data and Geometry for Multiple States or Counties 700 | 701 | We can fetch Census data and **geometry** for more than one state or county with the same function call. 702 | 703 | - This is so much easier than any alternative approach! 704 | 705 | - It can be applied to any available geographic tabulation areas (e.g., states, counties, tracts, places). 706 | 707 | Let's try it with Census Tracts! 708 | 709 | Fetch tract population and geometry data for Bay Area Counties.
710 | 711 | ```{r} 712 | 713 | bay_counties <- c("Alameda", "Contra Costa", "Marin", "San Francisco", 714 | "Sonoma", "Napa","Solano", "San Mateo", "Santa Clara") 715 | 716 | bayarea_pop10 <- get_decennial(geography = "tract", 717 | variables = "P001001", 718 | year = 2010, 719 | state='CA', 720 | county=bay_counties, 721 | geometry=T) 722 | 723 | # Quick map 724 | plot(bayarea_pop10['value']) 725 | ``` 726 | 727 | > **Any Questions?** 728 | 729 | # Fetching ACS data with `get_acs` 730 | 731 | - ACS data contains the most recent information about the American population. 732 | 733 | - We can use the tidycensus function `get_acs` to retrieve `ACS data` using code very similar to `get_decennial`. 734 | 735 | **BUT** the workflow is more complex because: 736 | 737 | 1. ACS data has a lot more variables, and 738 | 739 | 2. ACS data are **sample data**, so each ACS variable that you retrieve with `tidycensus` will return both an **estimate** of the value and a **margin of error**. 740 | 741 | ## ACS Data Products 742 | 743 | The ACS has two primary data products - the ACS 1 year estimates and the 5 year estimates. 744 | 745 | - The `ACS 1 year estimates` are more current but have a larger margin of error and are not available for Census geographies with a population of \< 65,000. 746 | 747 | - The `ACS 5-year estimates` are more stable but represent a larger time period. 748 | 749 | - *The `ACS 3 year estimates` have been discontinued.* 750 | 751 | ## Fetch metadata on ACS 5-Year Variables 752 | 753 | Let's use the `load_variables` function to get a dataframe of all variables from the ACS 2016---2020 5-year dataset. 754 | 755 | - Note: we change the dataset value to `acs5` where before we used `sf1` to fetch info on the decennial census variables. 756 | 757 | ```{r} 758 | 759 | vars_acs2020 <- load_variables(year=2020, # end year 2016-2020 period 760 | dataset = 'acs5', # the ACS data product 761 | cache = T) # Save locally for future use 762 | 763 | # how many variables?
764 | dim(vars_acs2020) 765 | ``` 766 | 767 | ### Exploring the ACS Variables 768 | 769 | View the `vars_acs2020` dataframe to find the variable name for `median household income`. 770 | 771 | ```{r, eval=FALSE} 772 | #View(vars_acs2020) 773 | ``` 774 | 775 | > *Question* Is the variable name for `total population` in the ACS 5 year 2020 data the same as it is in the 2010 census data? 776 | 777 | ### Fetch ACS Data on Median Household Income 778 | 779 | Let's fetch the median household income data for San Francisco County by census tract. 780 | 781 | ```{r} 782 | 783 | med_hhincome <- get_acs(geography='tract', 784 | variables="B19013_001", 785 | year = 2020, 786 | state='CA', 787 | county='San Francisco', 788 | geometry=TRUE # get the geography too 789 | ) 790 | ``` 791 | 792 | Take a look at the output 793 | 794 | ```{r} 795 | head(med_hhincome) 796 | ``` 797 | 798 | ### ACS Output 799 | 800 | The census data returned by the `get_acs` function is a bit different from that returned by `get_decennial`. 801 | 802 | - What is the name of the variable containing the income data? 803 | 804 | - What is the name if we set `output="wide"`? 805 | 806 | ```{r} 807 | med_hhincome_wide <- get_acs(geography='tract', 808 | variables="B19013_001", 809 | year = 2020, 810 | state='CA', 811 | county='San Francisco', 812 | geometry=TRUE, # get the geography too 813 | output="wide" 814 | ) 815 | 816 | # uncomment and run to view 817 | # head(med_hhincome_wide) 818 | ``` 819 | 820 | ### Map Median Household Income by tract 821 | 822 | Use `sf::plot` to create a map of median household income in San Francisco. 823 | 824 | ```{r} 825 | plot(med_hhincome['estimate']) 826 | ``` 827 | 828 | *What do you think of that map?* 829 | 830 | It's odd because San Francisco County is not the same as the city of San Francisco and what we want to map is the city. 
831 | 832 | #### Create a map with ggplot 833 | 834 | We can use ggplot to zoom in on the city by setting the x axis limits to a narrower geographic range. 835 | 836 | ```{r} 837 | ggplot(med_hhincome, aes(fill = estimate)) + 838 | geom_sf() + 839 | xlim(-122.55, -122.3) 840 | 841 | ``` 842 | 843 | ### Question 844 | 845 | > Why do you think we have NA values in the ACS estimates? 846 | 847 | ### Fetching Multiple ACS-5 Variables 848 | 849 | We can drill down into the ACS data by fetching data for subgroups, where available. 850 | 851 | Let's fetch median household income by race. 852 | 853 | First identify the variables of interest. 854 | 855 | ```{r} 856 | 857 | # Median household income by race/ethnicity: Variables from ACS 2015—19 858 | # All households = "B19013_001", 859 | inc_by_race <- c(White = "B19013H_001", 860 | Black = "B19013B_001", 861 | Asian = "B19013D_001", 862 | Hispanic = "B19013I_001" ) 863 | ``` 864 | 865 | Fetch census tract data for multiple variables at once. 866 | 867 | ```{r} 868 | # Fetch the Data 869 | alco_mhhinc_by_race <- get_acs(geography='tract', 870 | variables=inc_by_race, 871 | year = 2019, 872 | state='CA', 873 | county='Alameda', 874 | geometry=T ) 875 | ``` 876 | 877 | ## Facet Mapping 878 | 879 | Facet maps are a way to create visualizations of `small multiples`, or subsets of the data in order to facilitate comparisons. Here, we use ggplot's `facet_wrap` function to make multiple maps of median household income by race for Alameda County. 880 | 881 | ```{r} 882 | # Create the map 883 | medhhinc_facet_map <- alco_mhhinc_by_race %>% 884 | ggplot(aes(fill = estimate)) + 885 | facet_wrap(~variable) + 886 | geom_sf(color=NA) + # why color=NA? 
887 | scale_fill_viridis_c(option="magma") 888 | 889 | # Display the map 890 | medhhinc_facet_map 891 | ``` 892 | 893 | ### Challenge 6 894 | 895 | > In the **Census-Data-in-R-Challenges.Rmd** file, make a ggplot map of MEDIAN GROSS `RENT` in San Francisco County by tract using data from the ACS 2016---2020 5-year product. *Check Census-Data-in-R-Solutions.Rmd for answers, as needed.* 896 | 897 | > **Any Questions?** 898 | 899 | # Interactive Mapping 900 | 901 | Interactive mapping gives the RStudio environment some of the functionality of desktop GIS. 902 | 903 | There are a number of R packages that you can use, including: 904 | 905 | - [mapview](https://r-spatial.github.io/mapview/): quick interactive exploratory data viewing 906 | 907 | - [tmap](https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html): great static and interactive maps 908 | 909 | - [Leaflet](https://rstudio.github.io/leaflet/): highly customizable interactive maps 910 | 911 | *All of these are based on the `Leaflet Javascript Library`.* 912 | 913 | ### [mapview](https://r-spatial.github.io/mapview/) 914 | 915 | Let's use `mapview` to make quick interactive maps of the median household income data. 916 | 917 | ```{r} 918 | 919 | mapview(med_hhincome) 920 | 921 | ``` 922 | 923 | When passed the name of an `sf` object and no other options, `mapview` will: 924 | 925 | - display the geometry using a single color for the fill and for the stroke 926 | 927 | - display the feature ID `on hover` 928 | 929 | - display the data from the dataframe `on click` 930 | 931 | #### Mapview Thematic Maps 932 | 933 | The `zcol` argument will take a column name and color the features by the values in that column. 934 | 935 | ```{r} 936 | 937 | mapview(med_hhincome, zcol="estimate") 938 | 939 | ``` 940 | 941 | ### Challenge 7 942 | 943 | > In the **Census-Data-in-R-Challenges.Rmd** file, use `mapview` to create an interactive map of median gross rent.
944 | 945 | ## Determining what ACS Variables to use 946 | 947 | ACS variables can be confusing. 948 | 949 | Some ways to identify the best variables to explore: 950 | 951 | - Web search, especially Census web resources, can help. 952 | 953 | - The Census Reporter website (<https://censusreporter.org>) provides another tool for navigating topics, tables, and variable names. 954 | 955 | - The NHGIS website (nhgis.org) is a great way to browse variables of interest. 956 | 957 | ## ACS Margins of Error (MOE) 958 | 959 | We haven't talked about margins of error, but they may be important in your work with ACS data. 960 | 961 | Math is needed to combine MOEs when you combine variables. 962 | 963 | - `tidycensus` includes some nice [functions](https://walker-data.com/tidycensus/articles/margins-of-error.html) for these calculations and a good overview of the topic. 964 | 965 | # Summary 966 | 967 | `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial**. 968 | 969 | - The **load_variables** function helps identify the names of census variables of interest. 970 | 971 | Support for fetching `population estimates` and `migration flow` census data was recently added to tidycensus. You can read up on it on the [tidycensus documentation website](https://walker-data.com/tidycensus/articles/other-datasets.html). 972 | 973 | Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMO way easier than any alternatives, **IF** you (1) know R and (2) know a bit about working with geographic data in R. 974 | 975 | This approach is also scalable if you want multiple census variables for various locations and tabulation areas. 976 | 977 | You can make publication or report ready maps with highly customizable `ggplot2` code or use the `sf::plot` command to make quick maps. 978 | 979 | Interactive mapping greatly enhances your ability to do exploratory data analysis in RStudio.
980 | 981 | ### References 982 | 983 | Much of this tutorial is based on resources by Kyle Walker, author of `tidycensus`. See: 984 | 985 | - [tidycensus webpage](https://walker-data.com/tidycensus/index.html) 986 | - [Analyzing the US Census with R](https://walker-data.com/census-r), an online book. 987 | 988 | Related D-Lab Workshops 989 | 990 | - [R Fundamentals](https://github.com/dlab-berkeley/R-Fundamentals) 991 | - [Geospatial Data in R, parts 1, 2, & 3](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals) 992 | 993 | Great online resource for working with spatial data in R 994 | 995 | - [Geocomputation with R](https://geocompr.robinlovelace.net/) 996 | 997 | ------------------------------------------------------------------------ 998 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Census Data in R 2 | 3 | This workshop provides an introduction to working with census data in R using the `tidycensus` package. 4 | 5 | ## Description 6 | 7 | Since 1790, the US Census has been THE source of data about American people, providing valuable insights to social scientists and humanists. Mapping these data by census geographies adds more value by allowing researchers to explore spatial trends and outliers. This workshop will introduce `tidycensus`, an important and powerful R package for streamlining census data workflows. Participants will learn how to download census tabular data for one or more geographic tabulation units and/or years, download the associated census geographic data, and use these data for analysis and mapping.
8 | 9 | Specifically, we will: 10 | 11 | - Describe the primary Census data products 12 | - Introduce the R `tidycensus` package for working with Census Data 13 | - Use that package to fetch decennial and ACS census data 14 | - Use that package to fetch census geographic boundary files 15 | - Make maps of census data, symbolizing the color of those maps by the data values 16 | 17 | ### Knowledge Requirements: 18 | 19 | R experience equivalent to the D-Lab R Fundamentals workshop series is required to follow along with the tutorial. Basic knowledge of census data and geospatial data will be very helpful. 20 | 21 | ### Tech Requirements: 22 | 23 | Bring a laptop with R, RStudio and the R packages listed below installed. 24 | 25 | ## R Packages to install 26 | 27 | Here are the R packages you will need for this workshop: 28 | 29 | - `tidyverse` 30 | - `ggplot2` 31 | - `sf` 32 | - `tidycensus` 33 | - `tigris` 34 | - `mapview` 35 | - `leaflet` 36 | 37 | ## Is R not working on your laptop? 38 | 39 | If you have a Berkeley CalNet ID, you can run these lessons on UC Berkeley's DataHub by clicking [![DataHub](https://img.shields.io/badge/launch-datahub-blue)](https://datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FCensus-Data-in-R&urlpath=rstudio%2F&branch=master). By using this link, you can save your work and come back to it at any time. When you want to return to your saved work, just go straight to DataHub (<https://datahub.berkeley.edu>), sign in, and click on the `Census-Data-in-R` folder. 40 | 41 | If you don't have a CalNet ID, you are able to access the workshop by clicking [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Census-Data-in-R/HEAD?urlpath=rstudio). This link takes a moment to load, so patience is required! 42 | 43 | ## Requesting a Census API key 44 | 45 | The `tidycensus` package, and any R package that accesses the Census APIs, require you to first get a Census API key.
If you don’t have one yet, get one now (it just takes a minute): <https://api.census.gov/data/key_signup.html> 46 | 47 | ## Going Further with Geospatial Analysis 48 | 49 | This workshop is mainly about using R to access US Census data in dynamic and efficient ways. We also explore data-centric mapping techniques to explore our queries over space. If geospatial topics interest you and you'd like to study more, we recommend these resources as good starting points: 50 | 51 | - [Geospatial Fundamentals in R](https://github.com/dlab-berkeley/R-Geospatial-Fundamentals) - D-Lab workshop focusing on mapping and geospatial analysis in R. 52 | 53 | - [Leaflet for R](https://rstudio.github.io/leaflet/) - Well written introduction to mapping with the `leaflet` package in R. 54 | 55 | - [Geocomputation with R](https://geocompr.robinlovelace.net/) - Textbook (Lovelace, Nowosad, and Muenchow, 2019) is an excellent resource for getting up and running. 56 | 57 | - [R Shiny Tutorials](https://shiny.rstudio.com/tutorial/) - `shiny` is an R package that makes it easy to build interactive web apps straight from R, including maps! 58 | 59 | ## Contributors 60 | 61 | - Patty Frontiera 62 | - Irene Farah 63 | - [Avery Richards](https://github.com/Averysaurus) 64 | 65 | ------------------------------------------------------------------------ 66 | 67 |
68 |
D-Lab @ University of California - Berkeley 69 |
70 | https://dlab.berkeley.edu 71 |
72 |   73 |
74 |
75 | -------------------------------------------------------------------------------- /Solutions/Census-Data-in-R-Solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Solutions: R-Census-Data-Challenges' 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r, libraries} 11 | 12 | # run this chunk if you have not loaded the necessary libraries yet. 13 | if (!require("pacman")) install.packages("pacman") 14 | 15 | pacman::p_load( 16 | here, # locate files 17 | tidyverse, # data wrangling 18 | sf, # geospatial data management 19 | mapview, # mapping 20 | tigris, tidycensus # census data 21 | ) 22 | 23 | # These six libraries should be loaded in your environment now. 24 | 25 | # Tigris options - used by tidycensus 26 | # Cache retrieved geographic data locally 27 | options(tigris_use_cache = TRUE) 28 | ``` 29 | 30 | 31 | ```{r, challenge_1A_solution} 32 | 33 | ## Fill in the code to fetch total population in 2010 just for California 34 | get_decennial(geography = "state", # census tabulation unit 35 | variables = "P001001", # variable(s) of interest 36 | year = 2010, # census year 37 | state="CA") # Filter by state is CA 38 | 39 | ``` 40 | 41 | 42 | ```{r, challenge_1B_solution} 43 | 44 | ## fetch total pop in 2010 for CA, TX and FL 45 | get_decennial(geography = "state", # census tabulation unit 46 | variables = "P001001", # variable(s) of interest 47 | year = 2010, # census year 48 | state=c("CA","FL","TX")) # Filter by state is CA, Fl or TX 49 | 50 | 51 | ``` 52 | 53 | 54 | 55 | ```{r, challenge_2A_solution} 56 | 57 | ## fetch 2010 population for Alameda & San Francisco Counties 58 | get_decennial(geography = "county", # census tabulation unit 59 | variables = "P001001", # variable(s) of interest 60 | year = 2010, # census year 61 | state='CA', # filter by state 62 | county=c('Alameda', 63 | 'San Francisco')) # filter by County 
64 | 65 | ``` 66 | ```{r, challenge_2B_solution} 67 | 68 | ## Fetch total population in 2010 for the US 69 | get_decennial(geography = "us", # census tabulation unit 70 | variables = "P001001", # variable(s) of interest 71 | year = 2010) # census year 72 | 73 | ``` 74 | 75 | ```{r, challenge_2C_solution} 76 | 77 | ## Fetch total population in 2010 by census region 78 | get_decennial(geography = "region", # census tabulation unit 79 | variables = "P001001", # variable(s) of interest 80 | year = 2010) # census year 81 | ``` 82 | 83 | ```{Variable Questions} 84 | # Search vars2010 85 | ## vars2010 <- load_variables(2010, dataset='sf1', cache=T) 86 | 87 | - `Median Age`: "P013001" 88 | 89 | - `Average Family Size`: "P037001" 90 | 91 | - `Number of occupied housing units` : "H003002" 92 | ``` 93 | 94 | ```{r, challenge_3_solution} 95 | 96 | # Fetch Avg family size by CA County in 2010 97 | ca_fam_size <- get_decennial(geography='county', # census tabulation unit 98 | variables="P037001", # variable(s) of interest 99 | state='CA', # filter by state, county, 100 | year=2010) # and year.
101 | 102 | 103 | # create a ggplot and store it so it can be displayed below 104 | ca_fam_size_plot <- ca_fam_size %>% 105 | ggplot(aes(x = value, y = reorder(NAME, value))) + 106 | geom_point() 107 | 108 | # display the ggplot (printing ca_fam_size would only show the data) 109 | ca_fam_size_plot 110 | 111 | ``` 112 | 113 | 114 | ```{r, challenge_4_solution} 115 | 116 | # Load 2000 decennial census variables 117 | vars2000 <- load_variables(year=2000, dataset = 'sf1', cache = TRUE) 118 | 119 | # Fetch Avg family size by CA County in 2000 120 | ca_fam_size2000 <- get_decennial(geography='county', 121 | variables='P033001', 122 | state='CA', 123 | year=2000) 124 | 125 | ``` 126 | 127 | 128 | ```{r, challenge_5_solution} 129 | 130 | ## Fetch data + geometry for CA Median Age by county in 2010 131 | ca_med_age <- get_decennial(geography = "county", 132 | variables = "P013001", 133 | year = 2010, 134 | state='CA', 135 | geometry=TRUE) 136 | 137 | # map it with plot 138 | plot(ca_med_age['value']) 139 | 140 | # map it with ggplot - setting CRS to 3310 141 | ca_med_age %>% 142 | ggplot(aes(fill = value)) + 143 | geom_sf(color=NA) + 144 | coord_sf(crs = 3310) + 145 | scale_fill_viridis_c(option = "viridis") 146 | 147 | ``` 148 | 149 | ```{r, challenge_6_solution } 150 | 151 | # Fetch the data 152 | # Median gross rent for San Francisco County 153 | # using data from the ACS 2016—2020 5-year product.
154 | sf_medrent <- get_acs(geography="tract", 155 | variables=c(median_rent="B25064_001"), 156 | year =2020, 157 | state="CA", 158 | county=c("San Francisco"), 159 | geometry=T) 160 | 161 | # Map it with plot - remove rows with NAs 162 | plot(sf_medrent[!is.na(sf_medrent$estimate),]['estimate']) 163 | 164 | # Map it with ggplot - remove rows with NAs 165 | sf_medrent %>% 166 | drop_na(estimate) %>% 167 | ggplot(aes(fill = estimate)) + 168 | geom_sf(color=NA) + 169 | coord_sf(crs = 26910) + # CRS for Northern CA - UTM 10 170 | scale_fill_viridis_c(option = "magma") 171 | 172 | ``` 173 | 174 | 175 | ```{r, challenge_7_solution } 176 | 177 | # simple interactive map 178 | mapview(sf_medrent) 179 | 180 | # thematic (or choropleth) map of median rent 181 | mapview(sf_medrent, zcol='estimate') 182 | 183 | ``` 184 | 185 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/.DS_Store -------------------------------------------------------------------------------- /data/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/.Rapp.history -------------------------------------------------------------------------------- /data/census2010_vars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census2010_vars.png -------------------------------------------------------------------------------- /data/census_data_by_prod_geo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_data_by_prod_geo.png -------------------------------------------------------------------------------- /data/census_geo_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_geo_hierarchy.png -------------------------------------------------------------------------------- /data/census_geodata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_geodata.png -------------------------------------------------------------------------------- /data/census_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/census_page.png -------------------------------------------------------------------------------- /data/cenvar_lookup.csv: -------------------------------------------------------------------------------- 1 | my_cen_var_names,my_cen_vars 2 | citizenship_totpop,B05001_001E 3 | citizenship_non_citizen,B05001_006E 4 | entry_totpop,B05005_001E 5 | entry_2010,B05005_002E 6 | entry_2000_2009,B05005_007E 7 | birthplace_totpop,B05007_001E 8 | birthplace_europ,B05007_014E 9 | birthplace_asian,B05007_027E 10 | birthplace_latinAmerica,B05007_040E 11 | birthplace_southAmerica,B05007_081E 12 | birthplace_other_nonUSA,B05007_094E 13 | birthplace_byage_totpop,B06001_001E 14 | birthplace_byage_fborn,B06001_049E 15 | poverty_totpop,B06012_001E 16 | below_pov,B06012_002E 17 | below_pov2,B06012_003E 18 | poverty_fborn_totpop,B06012_017E 19 | below_pov_fborn,B06012_018E 20 | below_pov2_fborn,B06012_019E 21 | 
health_native_totpop,B27020_002E 22 | health_native_noinsurance,B27020_006E 23 | health_fborn_nat_totpop,B27020_008E 24 | fborn_nohealth_naturalized,B27020_012E 25 | health_fborn_noncit_totpop,B27020_013E 26 | fborn_nohealth_noncitizen,B27020_017E -------------------------------------------------------------------------------- /data/mapview_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/mapview_example.png -------------------------------------------------------------------------------- /data/request_api_key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/request_api_key.png -------------------------------------------------------------------------------- /data/swd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/swd.png -------------------------------------------------------------------------------- /data/tidycensus_articles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-Census-Data-Legacy/aab9181a6802bc3b8adc17aff9dcb86cda707865/data/tidycensus_articles.png -------------------------------------------------------------------------------- /data_out/readme_data_out.txt: -------------------------------------------------------------------------------- 1 | Folder for output from tutorial 2 | -------------------------------------------------------------------------------- /install.R: -------------------------------------------------------------------------------- 1 | install.packages(c("tidyverse", 2 | "tidycensus", 3 | "sf", 
4 | "mapview", 5 | "tigris", 6 | "here")) 7 | -------------------------------------------------------------------------------- /previous_versions/Rcensus_data_maps-slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Data Wrangling and Mapping in R" 3 | author: "Patty Frontiera" 4 | date: "03/21/2019" 5 | output: 6 | ioslides_presentation 7 | editor_options: 8 | chunk_output_type: console 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE) 13 | 14 | ``` 15 | 16 | # Getting Started 17 | 18 | ## Setup 19 | 20 | Welcome! While we're waiting: 21 | 22 | * **Clone or download** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop) 23 | - If you downloaded the zipfile, **unzip it**. 24 | - Make a note of the folder in which the files reside. 25 | 26 | 27 | * Open **RStudio** 28 | 29 | * Open a new **R script** file 30 | 31 | ## Introduction 32 | 33 | - About me 34 | 35 | - About you 36 | - Your familiarity with US Census data 37 | - with geospatial data 38 | - with geospatial data in R 39 | 40 | ## Outline 41 | 42 | - Describe primary Census data products 43 | 44 | - Introduce R packages for working with Census Data 45 | 46 | - Use those packages to fetch census data 47 | 48 | - Use those packages to fetch census data plus census geograpic boundary files 49 | 50 | - Make maps of census data 51 | 52 | # Census Data Overview 53 | 54 | ## US Census Data 55 | 56 | The "nation's leading provider of quality data about its people and economy." 
57 | 58 | 59 | 60 | Available at [www.census.gov](www.census.gov) 61 | 62 | ## Primary Census Products 63 | 64 | - Decennial Census 65 | 66 | - American Community Survey (ACS) 67 | 68 | ## Decennial Census 69 | 70 | Complete count of the population every 10 years since `1790` 71 | 72 | Includes data on 73 | 74 | - population, by age & race/ethnicity 75 | 76 | - housing, by occupancy & tenure (owned, rented) 77 | 78 | ## American Community Survey (ACS) 79 | 80 | - Annual survey of a sample of about 3 million households 81 | 82 | - Provides estimates of demographic, social, economic & housing characteristics 83 | 84 | - Includes margin of error values for the estimates. 85 | 86 | 87 | ## Decennial Census* vs ACS Data 88 | | Demographic* | Social | Economic | Housing | 89 | |-----------------|--------------------|-------------------|-------------------| 90 | | Sex | Families | Income | Tenure* | 91 | | Age | Education | Benefits | Occupancy* | 92 | | Race | Marital Status | Employment Status | Structure Type | 93 | | Hispanic Origin | Fertility | Occupation | Housing Value | 94 | | | Grandparents | Industry | Taxes & Insurance | 95 | | | Veterans | Commuting | Utilities | 96 | | | Disability Status | Place of Work | Mortgage | 97 | | | Language at Home | Health Insurance | Monthly Rent | 98 | | | Citizenship | | | 99 | | | Mobility | | | 100 | 101 | 102 | 103 | ## Census Geographies 104 | 105 | Census data are publicly available at one or more levels of geographic aggregation. 
106 | 107 | 108 | 109 | ## Census Data & Census Geographies 110 | 111 | 112 | 113 | ## ACS 5 Year Dataset RECOMMENDED 114 | 115 | ACS 1 year and 5 year products are currently available 116 | 117 | - ACS 3 year no longer available 118 | 119 | ACS 5 year data provdes much better estimates, lower margins of error 120 | 121 | More data available for ACS 5 Year product 122 | 123 | 124 | ## Census Data Workflow 125 | 126 | Identify your 127 | 128 | - topic of interest 129 | - year(s) 130 | - geographic level of detail 131 | - for what locations? 132 | 133 | Then determine what specific tables and variables 134 | are available - ACS or Decennial? 135 | 136 | ## CAUTION 137 | 138 | "If you want to measure change you can't change the measures!" 139 | 140 | **Census tables, variables, geographies, and geographic boundaries change over time!** 141 | 142 | Measuring change over time with census data is *its own thing*, complex and not covered by this workshop! 143 | 144 | # R Packages 145 | 146 | ## Packages for Working with Census Data 147 | 148 | These are the ones we recommend and will use today. 149 | 150 | - [tidycensus](https://walkerke.github.io/tidycensus) & [tigris](https://github.com/walkerke/tigris) 151 | 152 | - [tidyverse](https://www.tidyverse.org/) 153 | 154 | - [sf](https://r-spatial.github.io/sf/) 155 | 156 | 157 | # tidycensus & tigris 158 | 159 | ## [tidycensus](https://walkerke.github.io/tidycensus) 160 | 161 | Functions for accessing census decennial and ACS 5 year datasets via Census APIs 162 | 163 | - only a subset of datasets / years available 164 | - requires a `Census API key` 165 | 166 | ## [tidycensus](https://walkerke.github.io/tidycensus) 167 | 168 | Limited set of years available via `tidycensus` 169 | 170 | - decennial census: 1990, 2000, and 2010 171 | - ACS 5 yr: 2006-2010 through 2014-2018 are available. 172 | - Note: tidycensus refers to ACS 5year datasets by the endyear. 
173 | - Need to check availability of latest census data releases in `tidycensus` 174 | 175 | ## [tigris](https://github.com/walkerke/tigris) 176 | 177 | Provides access to census geographic data files 178 | 179 | - detailed TIGER/Line boundary files (e.g., shapefiles), or 180 | - simplified Cartographic boundary files 181 | 182 | Also provides access to census `feature data`, 183 | 184 | - e.g., rivers, roads, coastlands, landmarks, and more 185 | 186 | 187 | Used by `tidycensus` to access state, county, tract, block group, block, and ZCTA boundaries. 188 | 189 | - Use `tigris` directly to access other census geographic data. 190 | 191 | ## tidycensus & tigris 192 | 193 | Packages developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census websites and APIs in **R** and get that data in a useable format to analyze, plot, and map. 194 | 195 | Check out his website to keep abreast of his great packages, blog posts, and tutorials. 196 | 197 | - http://personal.tcu.edu/kylewalker/ 198 | 199 | - https://walkerke.github.io/ 200 | 201 | Walker also developed a new [DataCamp](https://www.datacamp.com) course: [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r) 202 | 203 | - Highly recommended! First chapter free! 204 | 205 | 206 | ## [tidyverse](https://www.tidyverse.org/) 207 | 208 | A collection of R Packages for data science 209 | - developed primarily by [Hadley Wickham](http://hadley.nz/), Chief Scientist at [RStudio](https://www.rstudio.com/). 210 | 211 | - `dplyr` and `tidyr` for reshaping data 212 | 213 | - `ggplot2` for plotting 214 | 215 | - `purrr`, `readr` and `tibble` for improved performance 216 | 217 | These packages are used by `tidyverse` under the hood. 218 | 219 | ## [sf](https://r-spatial.github.io/sf/) 220 | 221 | Simple features for geospatial data objects and methods.
222 | 223 | - Next generation R package for working with vector geospatial data 224 | - superseding the `sp` package 225 | 226 | `sf` includes the functionality of the `sp`, `rgdal`, `rgeos` and `proj4` packages. 227 | 228 | - but with improved performance, simplified command syntax, and easier workflows. 229 | 230 | ## Alternatives to Accessing Census Data in R 231 | 232 | You can write code to access the [Census APIs](https://www.census.gov/data/developers/data-sets.html) directly. 233 | 234 | You can download Census data directly from: 235 | 236 | - [American Factfinder](https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml) or 237 | - [NHGIS.org](https://www.nhgis.org/) 238 | - [Social Explorer](https://www.socialexplorer.com/) 239 | - Subscription service but FREE for UCB community 240 | 241 | You can download Census `geographic data` directly on the [census website](https://www.census.gov/geo/maps-data/) 242 | 243 | 244 | # Tutorial Time! 245 | 246 | ## Part 1 247 | 248 | We will work through several exercises using `tidycensus` to fetch, wrangle and map census data. 249 | 250 | ## Loading packages 251 | 252 | Load the packages we will use today 253 | 254 | ```{r, message=FALSE, warning=FALSE} 255 | library(tidycensus) 256 | library(tidyverse) 257 | library(tigris) 258 | library(sf) 259 | ``` 260 | 261 | If you are getting errors, try loading `dplyr` or reinstalling the `dplyr` package, as that has worked for some. 262 | 263 | ## Install any packages that you do not have on your computer 264 | 265 | Also install any dependencies. 266 | 267 | ```{r, eval=FALSE} 268 | # install.packages("tidyverse") 269 | # install.packages("tidycensus") 270 | # install.packages("sf") 271 | ``` 272 | 273 | 274 | ## Census API Key 275 | 276 | You need a census API key to programmatically fetch census data.
277 | 278 | Get it here (pretty quick): 279 | 280 | * (https://api.census.gov/data/key_signup.html) 281 | 282 | For more info see: 283 | 284 | * https://www.census.gov/data/developers/data-sets.html 285 | 286 | ## Install your Census API Key 287 | 288 | Use the tidycensus function **census_api_key** to make tidycensus use your key when it fetches data from the census. 289 | 290 | ```{r, eval=F} 291 | # Install your census api key - long alphanumeric string 292 | census_api_key(THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS) 293 | ``` 294 | 295 | ## Set working directory 296 | 297 | Be sure to **Clone or downloaded & unzip** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop) 298 | 299 | Then, set your working directory this folder, e.g., 300 | 301 | * `setwd("~/Documents/Dlab/workshops/2019/rCensus_workshop")` 302 | 303 | 304 | 305 | # Fetching Decennial Census Data 306 | 307 | ## Population Data 308 | 309 | Let's start by fetching **population data** from the 2010 Census **for all states** 310 | 311 | In order to fetch census data you need to identify the census **variables** that contain the data of interest. 312 | 313 | ## Topics, Tables & Variables 314 | 315 | Census data **variables** are organized in **tables** 316 | 317 | Which are organized by **topic** or concept. 318 | 319 | The tidycensus **load_variables** function can help with this step. 320 | 321 | First, take a look at the function documentation. 322 | ```{r, eval=F} 323 | ?load_variables 324 | ``` 325 | 326 | ## load_variables 327 | 328 | Use `load_variables` to fetch all variables used in the 2010 census into a dataframe. 
329 | ```{r} 330 | vars2010 <- load_variables(year=2010, # Year or end year for ACS 331 | dataset = 'sf1', # 'sf1' for decennial or 'acs5' 332 | cache = TRUE) # Whether to save fetched data locally 333 | ``` 334 | 335 | ## Decennial Census Variables 336 | 337 | Let's take a look at and discuss the resultant dataframe. 338 | 339 | - How many 2010 census variables are in the dataframe? 340 | ```{r, eval=F} 341 | View(vars2010) 342 | ``` 343 | 344 | ## 2010 Decennial Census Tables 345 | 346 | - Variables: 3,346 347 | 348 | - Topics: Population, housing 349 | 350 | - Tables: currently `333` - *that's a lot*! 351 | - 177 population tables (identified with a “P”) available to the block level 352 | - 58 housing tables (identified with an “H”) available to the block level 353 | - 82 population tables (identified with a “PCT”) available to the census tract level 354 | - 4 housing tables (identified with an “HCT”) available to the census tract level 355 | - 10 population tables (identified with a “PCO”) available to the county level 356 | - plus 2 additional PCT tables 357 | 358 | ## What Variable has the 2010 Total Population value? 359 | 360 | We can sort and filter the vars2010 dataframe to find it. 361 | 362 | 363 | 364 | ## get_decennial 365 | 366 | We can use the tidycensus function **get_decennial** to fetch the 2010 census data for total population by state. 367 | 368 | First, check the documentation for the function. 369 | ```{r, eval=F} 370 | ?get_decennial 371 | ``` 372 | 373 | ## get_decennial 374 | 375 | Fetch total population by state (**P001001**) from the 2010 census using `get_decennial`. 376 | 377 | ```{r} 378 | 379 | pop2010 <- get_decennial(geography = "state", # census tabulation unit 380 | variables = "P001001", # variable(s) of interest 381 | year = 2010) # census year 382 | 383 | ``` 384 | 385 | ## View the Data 386 | 387 | - How many rows and columns? 388 | 389 | - Do you see the expected number of states? 
390 | 391 | - What column contains the population counts? 392 | 393 | - Do the data values seem to be right? 394 | ```{r} 395 | #pop2010 396 | ``` 397 | 398 | ## Visualize results 399 | 400 | We can visualize the data to get a quick overview of the distribution of data values. 401 | 402 | It's a first step in exploratory data analysis and a last step in data communication. 403 | 404 | `ggplot2` is the most commonly used R package for data visualization. 405 | 406 | - It is loaded when you load the `tidyverse` package. 407 | 408 | Let's use it to visualize the population data. 409 | 410 | ## Plot 2010 Population by state 411 | 412 | Use `ggplot2` to create an ordered horizontal bar chart. 413 | ```{r} 414 | pop_plot<- ggplot(data=pop2010, aes(x=reorder(NAME,value), y=value/1000000)) + 415 | geom_bar(stat="identity") + coord_flip() + 416 | theme_minimal() + 417 | labs(title = "2010 US Population by State") + 418 | xlab("State") + 419 | ylab("in millions") 420 | ``` 421 | 422 | ## Display the plot 423 | 424 | ```{r, echo=F} 425 | pop_plot 426 | ``` 427 | 428 | ## Challenge 429 | 430 | Fetch population data by state for 2000. 431 | 432 | *Don't assume variable names are the same across years.* Check first! 433 | 434 | ## Challenge Solution 435 | 436 | Total Population in 2000 437 | 438 | ```{r, eval = F, code_folding = "hide"} 439 | # What is the variable name in 2000? 440 | vars2000 <- load_variables(year=2000, dataset = 'sf1', cache = T) 441 | 442 | # Take a look and search in the dataframe 443 | View(vars2000) 444 | 445 | # Fetch the 2000 pop data 446 | pop2000 <- get_decennial(geography = "state", variables = "P001001", year = 2000) 447 | 448 | # Take a look (plot if time) 449 | pop2000 450 | ``` 451 | 452 | ## Limiting by Area of Interest 453 | 454 | In the previous example we retrieved population data for all states. 455 | 456 | - This is the default behavior if you don't specify a subset. 
457 | 458 | - But you can limit the data to be retrieved by subunits like state. 459 | 460 | ## Limit Areas of Interest 461 | 462 | Let's fetch data for just 3 states. 463 | 464 | ```{r} 465 | state_pop2010 <- get_decennial(geography = "state", # census tabulation unit 466 | variables = "P001001", # variables of interest 467 | year = 2010, # census year 468 | state=c("CA","OR","WA")) # Filter by states of interest 469 | 470 | ``` 471 | 472 | *Note we are referencing states by their abbreviation.* 473 | 474 | ## View Results 475 | ```{r} 476 | state_pop2010 477 | ``` 478 | 479 | ## Changing Census Tabulation unit 480 | 481 | `get_decennial` accepts a number of different values for **tabulation unit**. 482 | 483 | - Options include: `state`, `county`, `tract`, `block group`, `block`, and `ZCTA`. 484 | 485 | Let's change the tabulation unit from `state` to `county`. 486 | ```{r} 487 | co_pop2010 <- get_decennial(geography = "county", # census tabulation unit 488 | variables = "P001001", # variables of interest 489 | year = 2010) 490 | ``` 491 | 492 | ## Changing Census Tabulation unit 493 | 494 | View the county data to see what was retrieved. 
495 | ```{r} 496 | co_pop2010 497 | ``` 498 | 499 | ## Challenge 500 | 501 | * Fetch population by **county** for just California 502 | 503 | * Fetch population by **county** for Oregon & California 504 | 505 | *Try it before you look ahead at solutions.* 506 | 507 | ## Challenge Solution 508 | ```{r} 509 | ## Fetch population by **county** for just California 510 | co_pop2010_ca <- get_decennial(geography = "county", # census tabulation unit 511 | variables = "P001001", # variables of interest 512 | year = 2010, 513 | state=c('CA')) 514 | #co_pop2010_ca 515 | 516 | ## Fetch population by **county** for Oregon & California 517 | co_pop2010_caor <- get_decennial(geography = "county", # census tabulation unit 518 | variables = "P001001", # variables of interest 519 | year = 2010, 520 | state=c('CA','OR')) 521 | co_pop2010_caor 522 | 523 | ``` 524 | 525 | ## Challenge 526 | 527 | * Fetch population by **tract** for all states. 528 | 529 | * Fetch population by **tract** for California. 530 | 531 | ## Challenge Solution 532 | ```{r, eval=F} 533 | ## Fetch population by **tract** for California. 534 | cal_pop2010_tracts <- get_decennial(geography = "tract", # census tabulation unit 535 | variables = "P001001", # variables of interest 536 | year = 2010, 537 | state=c('CA')) 538 | cal_pop2010_tracts 539 | 540 | 541 | ## Fetch population by **tract** for all states. 542 | pop2010_tracts <- get_decennial(geography = "tract", # census tabulation unit 543 | variables = "P001001", # variables of interest 544 | year = 2010) 545 | 546 | pop2010_tracts ## DOES THIS WORK? 547 | ``` 548 | 549 | ## Fetching Census Tract Data 550 | 551 | If you want census data at the tract level or below you **must** specify the state & county or counties. 
552 | ```{r} 553 | tract_pop2010 <- get_decennial(geography = "tract", # census tabulation unit 554 | variables = "P001001", # variable of interest 555 | year = 2010, # census year 556 | state="CA", # limit to state of California 557 | county=c("Alameda","Contra Costa")) # and only these counties 558 | ``` 559 | 560 | ## Fetching Census Tract Data 561 | 562 | View the results! How many census tracts are in these 2 counties? 563 | 564 | ```{r} 565 | tract_pop2010 566 | ``` 567 | 568 | ## Challenge 569 | 570 | 1. Fetch population by **county** for Alameda County, California 571 | 572 | 2. Fetch population by **tract** for the nine county Bay Area: 573 | - Alameda, SF, Contra Costa, Marin County, Napa, 574 | - San Mateo, Santa Clara, Solano, Sonoma 575 | 576 | Note: You can use names, abbreviations or FIPS codes for your `state` and `county`. 577 | 578 | ```{r} 579 | # County FIPS codes, in this order, for 580 | # Alameda, SF, Contra Costa, Marin County, Napa, 581 | # San Mateo, Santa Clara, Solano, Sonoma 582 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 583 | ``` 584 | 585 | ## Challenge Solution 586 | 587 | ```{r} 588 | # population by **county** for Alameda County, California 589 | alco_pop2010 <- get_decennial(geography = "county", # census tabulation unit 590 | variables = "P001001", # variables of interest 591 | year = 2010, 592 | state=c('CA'), 593 | county=c('Alameda County')) 594 | #alco_pop2010 595 | 596 | ``` 597 | 598 | ## Challenge Solution 599 | 600 | Fetch population by **tract** for the nine county Bay Area 601 | ```{r} 602 | # County FIPS codes, in this order, for 603 | # Alameda, SF, Contra Costa, Marin County, Napa, 604 | # San Mateo, Santa Clara, Solano, Sonoma 605 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 606 | 607 | bayarea_pop2010_tract <- get_decennial(geography = "tract", # census tabulation unit 608 | variables = "P001001", # variable of interest 609 | year 
= 2010, # census year 610 | state="CA", # limit to state of California 611 | county=nine_counties) # and only these counties 612 | #bayarea_pop2010_tract 613 | ``` 614 | 615 | 616 | ## RECAP & QUESTIONS 617 | 618 | Fetch population by **tract** for the nine county Bay Area 619 | ```{r, eval=F} 620 | # County FIPS codes, in this order, for 621 | # Alameda, SF, Contra Costa, Marin County, Napa, 622 | # San Mateo, Santa Clara, Solano, Sonoma 623 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 624 | 625 | bayarea_pop2010 <- get_decennial(geography = "tract", # census tabulation unit 626 | variables = "P001001", # variable of interest 627 | year = 2010, # census year 628 | state="CA", # limit to state of California 629 | county=nine_counties) # and only these counties 630 | 631 | # View the data 632 | bayarea_pop2010 633 | ``` 634 | 635 | 636 | 637 | ## Fetching data for more than one census variable 638 | 639 | What **three** things are new here? 640 | ```{r} 641 | #urban rural pop for 3 counties 642 | ur_pop10 <- get_decennial(geography = "county", # census tabulation unit 643 | variables = c(urban="P002002",rural="P002005"), 644 | year = 2010, 645 | summary_var = "P002001", # The denominator 646 | state='CA', 647 | county=c("Napa","Sonoma","Mendocino")) 648 | 649 | ``` 650 | 651 | ## Fetching data for more than one census variable 652 | 653 | 1. You can specify more than one variable: 654 | ``` 655 | variables = c("P002002","P002005") 656 | ``` 657 | 658 | 2. You can rename the values in the output 'variable' column. 659 | ``` 660 | variables = c(urban="P002002",rural="P002005") 661 | ``` 662 | 663 | 3. You can identify a `summary_var` (a denominator - here, the total count of all people or households surveyed. Can be used for calculations like percent of total.) 
664 | ``` 665 | summary_var = "P002001" 666 | ``` 667 | 668 | ## Take a look at the results 669 | ```{r} 670 | ur_pop10 671 | ``` 672 | 673 | ## Calculating Percents 674 | 675 | The `summary_value` column comes in handy when you want to compute percent of total. 676 | 677 | Here's one way to do it. 678 | ```{r} 679 | # Calculate the percent of population that is Urban or Rural 680 | ur_pop10 <- ur_pop10 %>% 681 | mutate(pct = 100 * (value / summary_value)) 682 | 683 | ``` 684 | 685 | ## Calculating Percents 686 | 687 | Let's take a look at the output 688 | ```{r} 689 | ur_pop10 # Take a look 690 | ``` 691 | 692 | ## Plot it 693 | 694 | Plots give us compact visual summaries of the data 695 | ```{r} 696 | myplot <- ggplot(data = ur_pop10, 697 | mapping = aes(x = NAME, fill = variable, 698 | y = ifelse(test = variable == "urban", 699 | yes = -pct, no = pct))) + 700 | geom_bar(stat = "identity") + 701 | scale_y_continuous(labels = abs, limits=c(-100,100)) + 702 | labs(title="Urban & Rural Population in Wine Country", 703 | x="County", y = " Percent of Population", fill="") + 704 | coord_flip() 705 | ``` 706 | *Don't worry if you don't get all the ggplot code now. It's here for reference.* 707 | 708 | ## Plot it 709 | ```{r} 710 | myplot 711 | ``` 712 | 713 | ## Fetch all the data in one table 714 | 715 | This is often helpful **but** you need to keep track of the meaning of each variable. 716 | ```{r} 717 | alco_pop10 <- get_decennial(geography = "tract", # Census tabulation unit 718 | table = "P002", # Table of urban & rural population counts 719 | year = 2010, # Decennial census year 720 | state='CA', # Filter state 721 | county="Alameda") # Filter county 722 | 723 | ``` 724 | 725 | ## Take a look 726 | ```{r} 727 | unique(alco_pop10$variable) # What and how many unique vars in table? 
728 | 729 | head(alco_pop10,3) # Take a look at output 730 | ``` 731 | 732 | 733 | ## Output options 734 | 735 | Let's try all three of these commands and then look at the output to see what's different. 736 | 737 | ```{r, eval=F} 738 | get_decennial(geography = "state", variables = "P001001", 739 | year = 2010) 740 | 741 | get_decennial(geography = "state", variables = c(pop10="P001001"), 742 | year = 2010) 743 | 744 | get_decennial(geography = "state", variables = c(pop10="P001001"), 745 | year = 2010, output="wide") 746 | ``` 747 | 748 | ## Output options 749 | 750 | ```{r} 751 | head(get_decennial(geography = "state", variables = "P001001", 752 | year = 2010), 2) 753 | head(get_decennial(geography = "state", variables = c(pop10="P001001"), 754 | year = 2010), 2) 755 | head(get_decennial(geography = "state", variables = c(pop10="P001001"), 756 | year = 2010, output="wide"), 2) 757 | ``` 758 | 759 | 760 | ## Data Wrangling 761 | 762 | Your R skills can help you reformat the data and make it more usable. 763 | 764 | Let's fetch population data for 2010 & 2000 by state with **output=wide**. 765 | 766 | - We will label the variables **pop00** and **pop10**. 767 | 768 | Then we will combine these into one data frame. 769 | 770 | ## Data Wrangling 771 | 772 | Fetch pop by state from both the 2000 and 2010 census 773 | ```{r} 774 | pop2000 <- get_decennial(geography = "state", 775 | variables = c(pop00="P001001"), 776 | year = 2000, output="wide") 777 | 778 | pop2010 <- get_decennial(geography = "state", 779 | variables = c(pop10="P001001"), 780 | year = 2010, output="wide") 781 | 782 | ``` 783 | 784 | ## Merge population by state from both censuses 785 | 786 | Save in a new dataframe with both columns 787 | ```{r} 788 | pop2000_2010 <- pop2000 %>% merge(pop2010, by="NAME") %>% 789 | select(NAME, pop00, pop10) 790 | 791 | head(pop2000_2010,3) 792 | ``` 793 | 794 | ## Save the data 795 | 796 | Use `write.csv` to save a data frame to a `CSV` file. 
797 | 798 | ```{r, eval=F} 799 | write.csv(pop2000_2010, file="pop2000_2010.csv", row.names = FALSE) 800 | ``` 801 | 802 | # TIME FOR QUESTIONS 803 | 804 | 805 | # Part 2. Mapping 806 | 807 | 808 | ## Mapping Census Data with `tidycensus` 809 | 810 | You can fetch geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions 811 | 812 | - Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs. 813 | 814 | - Only a subset of data available via `tigris` can be accessed via `tidycensus`. 815 | 816 | You can then use common mapping options like `plot`, `ggplot` and `tmap` to make maps. 817 | 818 | ## Geometry Options 819 | 820 | Before fetching geometry, we need to specify a few `tigris` options 821 | 822 | - Set the `class` of returned data to be `sf` objects (not `sp`, the default) 823 | 824 | - Set `tigris_use_cache` to TRUE 825 | 826 | ```{r} 827 | # Tigris options - used by tidycensus 828 | options(tigris_class = "sf") # SP is the default format returned by tigris 829 | options(tigris_use_cache = TRUE) # Save retrieved data locally 830 | 831 | ``` 832 | 833 | Caching the data is important because it speeds things up if you often fetch census data for the same geographies over and over again. 834 | 835 | ## tigris cache directory 836 | 837 | You may want to use the geographic data downloaded by tigris in other applications. 838 | 839 | To do this, you need to know where the files are saved locally. 840 | 841 | You can also specify where tigris should save cached data. 842 | ```{r, eval=F} 843 | # Check the location of the tigris cached data 844 | Sys.getenv('TIGRIS_CACHE_DIR') 845 | 846 | # Set it 847 | tigris_cache_dir("~/Documents/gis_data/census") # Folder for local data 848 | 849 | # Check it again 850 | Sys.getenv('TIGRIS_CACHE_DIR') 851 | ``` 852 | 853 | ## Fetch geographic boundary data with tidycensus 854 | 855 | We fetch the geospatial data by setting **geometry=TRUE**. 
856 | 857 | ```{r} 858 | pop2010geo <- get_decennial(geography = "state", 859 | variables = c(pop10="P001001"), 860 | year = 2010, 861 | output="wide", 862 | geometry=TRUE) # Fetch geometry with the data for mapping 863 | 864 | ``` 865 | 866 | ## Take a look 867 | 868 | Let's take a minute to discuss the format of an `sf` spatial object. 869 | ```{r} 870 | pop2010geo 871 | ``` 872 | 873 | 874 | ## Geospatial Data in R 875 | 876 | R `sf` objects include 877 | 878 | - a dataframe with a geometry column, named `geometry` 879 | 880 | - The geometry can be of type POINT, LINESTRING, POLYGON 881 | - or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON 882 | 883 | - a `CRS` (coordinate reference system), specified by 884 | - epsg(SRID) code 885 | - proj4string 886 | 887 | For a deeper understanding of the `sf` package and its functionality, we recommend our [Geospatial-Fundamentals-in-R-with-sf](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-R-with-sf) workshop. 888 | 889 | ## Census Data Coordinate Reference System (CRS) 890 | 891 | All census geographic data use the `NAD83` CRS, or coordinate reference system. `NAD83` stands for North American Datum of 1983. The geographic coordinates are longitude and latitude values encoded as decimal degrees. 892 | 893 | `WGS84`, or [The World Geodetic System of 1984](https://en.wikipedia.org/wiki/World_Geodetic_System) is the most commonly used geographic CRS. The difference between points in these systems varies up to 1 meter in continental US. 894 | 895 | Many geospatial operations require you transform data to a common CRS before conducting spatial analysis or mapping. 896 | 897 | An in-depth discussion of CRSs is outside the scope of this workshop. See [Geocomputation in R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information. 898 | 899 | ## Mapping sf Spatial Objects 900 | 901 | We can use `plot` to make a quick map of the geometry stored in an `sf` spatial object. 
902 | 903 | ```{r} 904 | plot(pop2010geo$geometry) 905 | ``` 906 | 907 | ## Question 908 | 909 | What do you get if you plot the `sf` object without specifying "$geometry" 910 | 911 | 912 | ## The Challenge of US maps 913 | 914 | The vast geographic extent and non-contiguous nature of the USA makes it difficult to map. 915 | 916 | ```{r, echo=F} 917 | plot(pop2010geo$geometry) #view again 918 | ``` 919 | 920 | ## Fetch geographic data with tidycensus, SHIFTED 921 | 922 | tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas. 923 | ```{r} 924 | 925 | pop2010geo_shifted <- get_decennial(geography = "state", 926 | variables = c(pop10="P001001"), 927 | output="wide", 928 | year = 2010, 929 | geometry=TRUE, 930 | shift_geo=TRUE) 931 | 932 | ``` 933 | 934 | ## Shift Happens! 935 | ```{r} 936 | plot(pop2010geo_shifted$geometry) 937 | ``` 938 | 939 | ## Save it 940 | 941 | You can save `sf` data to a shapefile using `st_write` 942 | 943 | ```{r, eval=F} 944 | st_write(pop2010geo_shifted,"usa_2010_shifted.shp") 945 | ``` 946 | 947 | ## Check your TIGRIS_CACHE_DIR to see it 948 | 949 | ```{r, eval=F} 950 | my_cache_dir <- Sys.getenv('TIGRIS_CACHE_DIR') 951 | 952 | dir(my_cache_dir) # What files stored there? 953 | ``` 954 | 955 | ## Mapping Data Values 956 | 957 | ```{r} 958 | plot(pop2010geo_shifted['pop10']) 959 | ``` 960 | 961 | ## ggplot2 Maps 962 | 963 | ```{r} 964 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 965 | geom_sf() 966 | ``` 967 | 968 | ## ggplot2 Maps 969 | 970 | Note the use of **geom_sf** which tells ggplot that spatial data objects are being mapped. 971 | - this is a huge improvement!! 
972 | 973 | ```{r, include=F} 974 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 975 | geom_sf() 976 | ``` 977 | 978 | ## Challenge 979 | 980 | Create a `map` of CA Population in 2010 by county 981 | 982 | 983 | ## Challenge Solution 984 | 985 | 2010 pop Data for California Counties 986 | ```{r, eval=F} 987 | 988 | #fetch it 989 | cal_pop10 <- get_decennial(geography = "county", 990 | variables = "P001001", 991 | year = 2010, 992 | state='CA', 993 | geometry=TRUE) 994 | 995 | # map it 996 | #plot(cal_pop10['value']) 997 | ``` 998 | 999 | 1000 | ## Fetch County data for more than one state 1001 | 1002 | We can fetch both the census data and the **geometry** for more than one state! 1003 | 1004 | - *this is so much easier than any alternative approach!* 1005 | ```{r} 1006 | west_pop10 <- get_decennial(geography = "county", 1007 | variables = "P001001", 1008 | year = 2010, 1009 | state=c('CA','OR','NV',"AZ"), 1010 | geometry=T) 1011 | ``` 1012 | 1013 | ## Map it 1014 | 1015 | These are just quick plots to make sure we got the right data! 1016 | ```{r} 1017 | plot(west_pop10['value']) 1018 | ``` 1019 | 1020 | ## Census Tract Data 1021 | 1022 | Fetching the data for all `tracts` in one state. 1023 | 1024 | - **but** you need to specify one or more counties. 1025 | ```{r} 1026 | # Fetch tract data 1027 | alco_pop10 <- get_decennial(geography = "tract", 1028 | variables = "P001001", 1029 | year = 2010, 1030 | state='CA', 1031 | county='Alameda', 1032 | geometry=T) 1033 | ``` 1034 | 1035 | ## Challenge 1036 | 1037 | Fetch and map the 2010 population by census tract for Alameda and Contra Costa counties. 
1038 | 1039 | 1040 | ## Challenge Solution 1041 | 1042 | Fetch Tract population & geometry data for Alameda & Contra Costa Counties 1043 | 1044 | ```{r} 1045 | 1046 | alcc_pop10 <- get_decennial(geography = "tract", 1047 | variables = "P001001", 1048 | year = 2010, 1049 | state='CA', 1050 | county=c("Alameda","Contra Costa"), 1051 | geometry=TRUE) 1052 | ``` 1053 | 1054 | ## Challenge Solution 1055 | 1056 | Map it 1057 | ```{r} 1058 | plot(alcc_pop10['value']) 1059 | ``` 1060 | 1061 | 1062 | ## More Complex Challenge (if time) 1063 | 1064 | Fetch and map the percent of San Francisco properties by census tract that were coded as rented in the 2010 Census. 1065 | 1066 | To start, identify the variables for the 1067 | 1068 | - total number of occupied housing units 1069 | 1070 | - number of renter occupied units 1071 | 1072 | ## Complex Challenge Solution 1073 | 1074 | SF Rented Units, 2010 1075 | ```{r, eval=F} 1076 | sf_rented <- get_decennial(geography = "tract", # census tabulation unit 1077 | variables = "H004004", # renter occupied units 1078 | year = 2010, 1079 | summary_var = "H004001", # Total occupied housing units - the denominator 1080 | state='CA', 1081 | county='San Francisco', 1082 | geometry=TRUE) 1083 | 1084 | sf_pct_rented <- sf_rented[sf_rented$summary_value > 0,] %>% # drop tracts with a zero denominator, keep tracts with 0% rented 1085 | mutate(pct = 100 * (value / summary_value)) 1086 | 1087 | plot(sf_pct_rented['pct']) 1088 | ``` 1089 | 1090 | # Questions? 1091 | 1092 | # Part 3. ACS 5 year data 1093 | 1094 | ## ACS Data with tidycensus 1095 | 1096 | The tidycensus workflow for ACS data is similar to that used for decennial census data. 1097 | 1098 | - But there are many more variables in the ACS. 1099 | 1100 | Because the ACS contains **sample data**, each ACS variable of interest includes both an **estimate** of the value and a **margin of error**. 1101 | 1102 | ## ACS 5 year 1103 | 1104 | You can use the tidycensus **get_acs** function to retrieve data for the ACS 5 year products, beginning with the 2006 - 2010 dataset. 
1105 | 1106 | The **default** end year for my version of `tidycensus` (as of April 9, 2020) is **2018** for the 2014-2018 ACS 5 year dataset. 1107 | 1108 | 1109 | ## Fetch List of ACS 5 year Variables 1110 | 1111 | Let's start by fetching ACS 5-year 2016 data on poverty (not all variables appear to be included in 2018 data yet). 1112 | 1113 | We want to explore the number of folks living below the poverty level by census tract. 1114 | 1115 | First we need to find the variable name(s)! 1116 | 1117 | ## Load ACS Table Vars 1118 | 1119 | Load the ACS 2012-2016 5 year data variables into a dataframe. 1120 | 1121 | - ACS 5-year datasets are referenced by `end year` in tidycensus! 1122 | 1123 | Then take a look at the variable names, labels and concepts. 1124 | 1125 | How many variables refer to the concept of poverty? 1126 | 1127 | ```{r} 1128 | acs2016vars <- load_variables(year=2016, dataset = 'acs5', cache = TRUE) 1129 | if (interactive()) View(acs2016vars) # the data viewer is skipped when the document is knit non-interactively 1130 | ``` 1131 | 1132 | ## ACS Tables and variables 1133 | 1134 | Many thousands more than for decennial census! 1135 | 1136 | See the documentation on the [census website](https://www.census.gov/programs-surveys/acs/guidance/which-data-tool/table-ids-explained.html) 1137 | 1138 | Types of tables: 1139 | 1140 | - `B` prefix = base tables 1141 | - `C` = collapsed tables 1142 | - `DP` = data profiles 1143 | - `S` = Subject tables 1144 | 1145 | ## Census Reporter 1146 | 1147 | ACS variables can be confusing. 1148 | 1149 | The Census Reporter website (https://censusreporter.org) provides another tool for navigating topics, tables, and variable names. 1150 | 1151 | Let's check it out to see what tables/variables we should use. 1152 | 1153 | ## Filter the ACS Variables 1154 | 1155 | In RStudio, view the dataframe **acs2016vars** and interactively filter the name column to display only the variables in the table **C17002** 1156 | 1157 | Take a look at the different variables in this table. 
1158 | 1159 | What variable(s) contain the estimate of the number of people living below poverty? 1160 | 1161 | ## get_acs 1162 | 1163 | Use the tidycensus `get_acs` function to fetch the poverty data for census tracts in San Francisco 1164 | ```{r, eval=F} 1165 | ?get_acs 1166 | ``` 1167 | 1168 | ## get_acs in action 1169 | 1170 | Fetch the data in the table **C17002** that contain the counts of people living below 100% of the poverty line. 1171 | ```{r} 1172 | sf_poor <- get_acs(geography = "tract", 1173 | variables = c('C17002_002','C17002_003'), # poverty variables 1174 | year = 2016, 1175 | state="CA", 1176 | summary_var = "C17002_001", # Est of num people - denom 1177 | county="San Francisco", 1178 | geometry=T) 1179 | ``` 1180 | 1181 | ## View output 1182 | 1183 | Let's take a look at the output of `get_acs` and discuss how it differs from `get_decennial`. 1184 | 1185 | ```{r, eval=F} 1186 | sf_poor 1187 | ``` 1188 | 1189 | ## Create Poverty Map, try 1 1190 | 1191 | What are we mapping? 1192 | ```{r} 1193 | # What are we mapping? 1194 | plot(sf_poor['estimate']) 1195 | ``` 1196 | 1197 | ## Create Poverty Map, try 2 1198 | 1199 | ```{r} 1200 | # Remove census tracts that have no people! 1201 | sf_poor <- subset(sf_poor, summary_est > 0) 1202 | 1203 | # What are we mapping? 1204 | plot(sf_poor['estimate']) 1205 | ``` 1206 | 1207 | ## Calculating percents 1208 | 1209 | Let's calculate the percent below poverty by tract. 1210 | 1211 | ```{r} 1212 | sf_poor <- sf_poor %>% 1213 | mutate(pct = 100 * (estimate / summary_est)) 1214 | 1215 | head(sf_poor, 3) 1216 | ``` 1217 | 1218 | 1219 | ## Group by and sum 1220 | 1221 | We want to group the data by the geometry and then sum the data values so that we have one value per geometry. 
1222 | ```{r} 1223 | sf_poor_summed <- sf_poor %>% 1224 | select(GEOID, estimate, pct, geometry) %>% 1225 | group_by(GEOID) %>% 1226 | summarise(count_below_pov = sum(estimate), 1227 | pct_below_pov = sum(pct)) 1228 | ``` 1229 | 1230 | ## Group by and sum 1231 | 1232 | ```{r} 1233 | head(sf_poor_summed) 1234 | ``` 1235 | 1236 | ## Map Counts 1237 | 1238 | Where are SF's poorest areas? 1239 | ```{r} 1240 | plot(sf_poor_summed['count_below_pov']) 1241 | 1242 | ``` 1243 | 1244 | ## Map Percents 1245 | 1246 | Where are SF's poorest areas? 1247 | ```{r} 1248 | plot(sf_poor_summed['pct_below_pov']) 1249 | 1250 | ``` 1251 | 1252 | 1253 | ## Challenge 1254 | 1255 | The ACS **2013-2017** 5 year dataset was released Dec 6, 2018. 1256 | 1257 | Although my current version of `tidycensus` states that 2012-2016 is the latest ACS 5-year product, see if you can fetch & map the percent of people below poverty line in San Francisco using the **2013-2017** ACS 5-year data. 1258 | 1259 | ## Challenge Solution 1260 | ```{r, eval=F} 1261 | sf_poor_2017 <- get_acs(geography = "tract", 1262 | variables = c('C17002_002','C17002_003'), # poverty variables 1263 | year = 2017, 1264 | state="CA", 1265 | summary_var = "C17002_001", # Est of num people - denom 1266 | county="San Francisco", 1267 | geometry=T) 1268 | 1269 | head(sf_poor_2017) 1270 | ``` 1271 | 1272 | 1273 | ## Margins of Error (MOE) 1274 | 1275 | We haven't talked about it but it may be important in your work with ACS data. 1276 | 1277 | Math is needed to combine MOEs when you combine variables. 1278 | 1279 | - tidycensus includes some nice [functions](https://walkerke.github.io/tidycensus/reference/index.html) for these calculations. 1280 | 1281 | See this web page on how to handle [MOEs in tidycensus](https://walkerke.github.io/tidycensus/articles/margins-of-error.html) 1282 | 1283 | # Questions? 
1284 | 1285 | 1286 | # Maps with tmap - Demo 1287 | 1288 | ## tmap 1289 | 1290 | The `tmap` package is great for making both static and interactive maps. It turns R into a `GIS`. 1291 | 1292 | Let's check it out with our last dataframe. 1293 | 1294 | ## tmap 1295 | 1296 | ```{r} 1297 | library(tmap) 1298 | tmap_mode("view") # set mode to interactive 1299 | 1300 | poverty_map <- tm_shape(sf_poor_summed) + 1301 | tm_polygons(col="pct_below_pov", alpha=0.7) 1302 | ``` 1303 | 1304 | ## tmap 1305 | 1306 | View the map - click on tracts 1307 | 1308 | ```{r} 1309 | poverty_map 1310 | ``` 1311 | 1312 | ## tmap 1313 | 1314 | There are a number of great tutorials online for working with `tmap`. 1315 | 1316 | See the `References` at the end of this workshop document. 1317 | 1318 | # Census Geographic Data Files 1319 | 1320 | ## Census Geographic Data Files 1321 | 1322 | **Cartographic Boundary** vs **Detailed TIGER/Line** data 1323 | 1324 | By default, `tidycensus` downloads census **cartographic boundary** data. 1325 | 1326 | - These are simplified geometries, clipped to coastlines. 1327 | 1328 | In `get_acs` you can also request the more detailed census **TIGER/Line** data. 1329 | 1330 | The cartographic boundary data is great for mapping but the detailed data is often better for analysis. 1331 | 1332 | Let's check it out. 1333 | 1334 | 1335 | ## Fetch Cartographic Boundary Data 1336 | ```{r} 1337 | 1338 | sf_poor_cb <- get_acs(geography = "tract", 1339 | variables = c('C17002_002','C17002_003'), # poverty variables 1340 | summary_var = "C17002_001", 1341 | year = 2016, 1342 | state="CA", 1343 | county="San Francisco", 1344 | geometry=TRUE, 1345 | cb = TRUE) # THIS IS THE DEFAULT! 
1346 | ``` 1347 | 1348 | ## Fetch Detailed TIGER/Line Geometry 1349 | ```{r} 1350 | 1351 | sf_poor_tl <- get_acs(geography = "tract", 1352 | variables = c('C17002_002','C17002_003'), # poverty variables 1353 | summary_var = "C17002_001", 1354 | year = 2016, 1355 | state="CA", 1356 | county="San Francisco", 1357 | geometry=TRUE, 1358 | cb = FALSE) # Fetching the TIGER/Line data 1359 | ``` 1360 | 1361 | 1362 | ## Visualize differences with tmap 1363 | 1364 | Zoom in to explore, especially around the coastline. 1365 | ```{r} 1366 | tm_shape(sf_poor_tl) + tm_borders() + 1367 | tm_shape(sf_poor_cb) + tm_borders(col="red") 1368 | 1369 | ``` 1370 | 1371 | 1372 | # Questions? 1373 | 1374 | # Summary 1375 | 1376 | ## Summary 1377 | 1378 | - `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial** 1379 | 1380 | - Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMHO way easier than any alternatives, **IF** you (1) know R, (2) know a bit about working with geographic data in R. 1381 | 1382 | - This approach is also scalable if you want multiple census variables and geographies. 1383 | 1384 | - If you just want to fetch the geographic data it may be easier to use the **tigris** package or download it directly from the census. 
1385 | 1386 | ## References 1387 | 1388 | - [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r) 1389 | - [Geocomputation in R](https://geocompr.robinlovelace.net/) 1390 | - [Creating beautiful demographic maps with tidycensus and tmap packages](https://www.zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/) 1391 | 1392 | ## Related D-Lab Workshops 1393 | 1394 | - [R Fundamentals](https://github.com/dlab-berkeley/R-Fundamentals) 1395 | - [Geospatial Data in R, parts 1, 2, & 3](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-R-with-sf) 1396 | - [Web Maps in R with Leaflet](https://github.com/dlab-berkeley/Leaflet-Maps-in-R) 1397 | - [Geocoding & Mapping in R](https://github.com/dlab-berkeley/Geocoding-in-R) 1398 | 1399 | # Extras for Enthusiasts 1400 | 1401 | ## Scaling Up Example 1402 | 1403 | In this example we show you how you can read in census variables of interest from a file into an R dataframe. You can then use that dataframe to fetch data for all those variables using `tidycensus`. 1404 | 1405 | ```{r} 1406 | 1407 | # Load cenvar lookup table of vars of interest 1408 | my_cenvar_df <-read.csv("data/cenvar_lookup.csv", strip.white = T, stringsAsFactors = F) 1409 | 1410 | my_cenvar_df 1411 | ``` 1412 | 1413 | ## Fetch the ACS data 1414 | 1415 | Fetch the ACS data for these variables for the 9 county bay area 1416 | 1417 | ```{r} 1418 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 1419 | bay9_data <-get_acs(geography = "tract", 1420 | variables = my_cenvar_df$my_cen_vars, 1421 | year=2016, 1422 | state = "CA", 1423 | county = nine_counties, 1424 | geometry = T) 1425 | 1426 | bay9_data 1427 | ``` 1428 | 1429 | ## Reformat Ouput 1430 | 1431 | 1. We only want to keep the estimate column for each variable of interest, plus the GEOID and geometry columns. 1432 | 1433 | 2. 
We then want to make the data `wide` using the `spread` function. This will put each estimate variable in its own column. 1434 | ```{r} 1435 | bay9_data2 <- bay9_data %>% 1436 | select("GEOID", "variable", "estimate") %>% 1437 | spread(key=variable, value=estimate) 1438 | ``` 1439 | 1440 | ## Take a look 1441 | ```{r} 1442 | bay9_data2 1443 | ``` 1444 | 1445 | ## Rename the columns 1446 | 1447 | Use the dataframe of census variables to rename the columns so that they are self-describing. 1448 | ```{r} 1449 | colnames(bay9_data2)<-c("GEOID", my_cenvar_df$my_cen_var_names, "geometry") 1450 | 1451 | ``` 1452 | 1453 | ## Take a look 1454 | ```{r} 1455 | 1456 | bay9_data2 1457 | ``` 1458 | 1459 | 1460 | ## Fetching data for multiple years 1461 | 1462 | This requires the variable names to be the same across years! 1463 | ```{r, eval=FALSE} 1464 | # use purrr::map_df to get data for multiple years (must have same vars!) 1465 | pop90_10 <- map_df(c(1990, 2000, 2010), function(x) { 1466 | get_decennial(geography = "state", 1467 | variables = c(totalpop = "P001001"), 1468 | dataset = "sf1", 1469 | year = x) %>% 1470 | mutate(year = x) } 1471 | ) 1472 | 1473 | # View output 1474 | head(pop90_10) 1475 | tail(pop90_10) 1476 | 1477 | # Plot it 1478 | pop90_10 %>% ggplot(aes(x=reorder(NAME,value), y=value/1000000, fill=factor(year))) + 1479 | geom_bar(stat="identity", position=position_dodge()) + coord_flip() 1480 | 1481 | ``` 1482 | 1483 | 1484 | # Combining Census Data with Other Data 1485 | 1486 | ## Area Weighted Interpolation 1487 | 1488 | One of the strengths of the `sf` package is how relatively easy it is to reaggregate data from one geometry to another. This process is called areal interpolation. 1489 | 1490 | Area weighted interpolation reaggregates the data based on the percent of area shared by input and output geometries. 
1491 | 1492 | ## Read in a Shapefile 1493 | ```{r, eval=F} 1494 | sfnhoods<- st_read("data/sfnhoods.shp") 1495 | head(sfnhoods) 1496 | plot(sfnhoods['nhood']) 1497 | ``` 1498 | 1499 | ## Check the CRS 1500 | ```{r, eval=F} 1501 | st_crs(sfnhoods) 1502 | st_crs(sf_poor5) 1503 | ``` 1504 | 1505 | ## CRS transformation 1506 | ```{r, eval=F} 1507 | sf_poor5_4326 = st_transform(sf_poor5, st_crs(sfnhoods)) 1508 | ``` 1509 | 1510 | ## Area Weighted Interpolation 1511 | 1512 | Reaggregate percent of people below poverty from census tract to neighborhood polygons. 1513 | 1514 | ```{r, eval=F} 1515 | sfhoods2 = st_interpolate_aw(sf_poor5_4326[, "pct_below_pov"], sfnhoods, 1516 | extensive = F) # True= aw sum; False= aw avg 1517 | ``` 1518 | 1519 | ## Map it 1520 | ```{r, eval=F} 1521 | par(mfrow=c(1,2)) 1522 | plot(sf_poor5['pct_below_pov']) 1523 | plot(sfhoods2['pct_below_pov']) 1524 | par(mfrow=c(1,1)) 1525 | ``` 1526 | 1527 | ## Map it with `tmap` 1528 | ```{r, eval=F} 1529 | tm_shape(sfhoods2) + 1530 | tm_polygons(col="pct_below_pov") 1531 | ``` 1532 | 1533 | ## Combine the values 1534 | ```{r, eval=F} 1535 | head(sfhoods2) 1536 | sfnhoods$pct_below_pov <- sfhoods2$pct_below_pov 1537 | 1538 | # map again - click on polygons and view data in popups 1539 | # to confirm the AWI output values 1540 | tm_shape(sfnhoods) + 1541 | tm_polygons(col="pct_below_pov", 1542 | popup.vars = c("nhood", "pct_below_pov") 1543 | ) 1544 | ``` 1545 | 1546 | -------------------------------------------------------------------------------- /previous_versions/Rcensus_data_maps-tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Data Wrangling and Mapping in R" 3 | author: "Patty Frontiera" 4 | date: "03/21/2019" 5 | output: 6 | html_document: 7 | toc: true 8 | number_sections: true 9 | toc_float: true 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | 15 | ``` 16 | 17 | # Getting Started 
18 | 19 | ## Setup 20 | 21 | Welcome! While we're waiting: 22 | 23 | * **Clone or download** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop) 24 | - If you downloaded the zipfile, **unzip it**. 25 | - Make a note of the folder in which the files reside. 26 | 27 | 28 | * Open **RStudio** 29 | 30 | * Open a new **R script** file 31 | 32 | ## Introduction 33 | 34 | - About me 35 | 36 | - About you 37 | - Your familiarity with US Census data 38 | - with geospatial data 39 | - with geospatial data in R 40 | 41 | ## Outline 42 | 43 | - Describe primary Census data products 44 | 45 | - Introduce R packages for working with Census Data 46 | 47 | - Use those packages to fetch census data 48 | 49 | - Use those packages to fetch census data plus census geographic boundary files 50 | 51 | - Make maps of census data 52 | 53 | # Census Data Overview 54 | 55 | ## US Census Data 56 | 57 | The "nation's leading provider of quality data about its people and economy." 58 | 59 | 60 | 61 | Available at [www.census.gov](https://www.census.gov) 62 | 63 | ## Primary Census Products 64 | 65 | - Decennial Census 66 | 67 | - American Community Survey (ACS) 68 | 69 | ## Decennial Census 70 | 71 | Complete count of the population every 10 years since `1790` 72 | 73 | Includes data on 74 | 75 | - population, by age & race/ethnicity 76 | 77 | - housing, by occupancy & tenure (owned, rented) 78 | 79 | ## American Community Survey (ACS) 80 | 81 | - Annual survey of a sample of about 3 million households 82 | 83 | - Provides estimates of demographic, social, economic & housing characteristics 84 | 85 | - Includes margin of error values for the estimates. 
86 | 87 | 88 | ## Decennial Census* vs ACS Data 89 | | Demographic* | Social | Economic | Housing | 90 | |-----------------|--------------------|-------------------|-------------------| 91 | | Sex | Families | Income | Tenure* | 92 | | Age | Education | Benefits | Occupancy* | 93 | | Race | Marital Status | Employment Status | Structure Type | 94 | | Hispanic Origin | Fertility | Occupation | Housing Value | 95 | | | Grandparents | Industry | Taxes & Insurance | 96 | | | Veterans | Commuting | Utilities | 97 | | | Disability Status | Place of Work | Mortgage | 98 | | | Language at Home | Health Insurance | Monthly Rent | 99 | | | Citizenship | | | 100 | | | Mobility | | | 101 | 102 | 103 | 104 | ## Census Geographies 105 | 106 | Census data are publicly available at one or more levels of geographic aggregation. 107 | 108 | 109 | 110 | ## Census Data & Census Geographies 111 | 112 | 113 | 114 | ## ACS 5 Year Dataset RECOMMENDED 115 | 116 | ACS 1 year and 5 year products are currently available 117 | 118 | - ACS 3 year no longer available 119 | 120 | ACS 5 year data provides much better estimates, lower margins of error 121 | 122 | More data available for ACS 5 Year product 123 | 124 | 125 | ## Census Data Workflow 126 | 127 | Identify your 128 | 129 | - topic of interest 130 | - year(s) 131 | - geographic level of detail 132 | - for what locations? 133 | 134 | Then determine what specific tables and variables 135 | are available - ACS or Decennial? 136 | 137 | ## CAUTION 138 | 139 | "If you want to measure change you can't change the measures!" 140 | 141 | **Census tables, variables, geographies, and geographic boundaries change over time!** 142 | 143 | Measuring change over time with census data is *its own thing*, complex and not covered by this workshop! 144 | 145 | # R Packages 146 | 147 | ## Packages for Working with Census Data 148 | 149 | These are the ones we recommend and will use today. 
150 | 151 | - [tidycensus](https://walkerke.github.io/tidycensus) & [tigris](https://github.com/walkerke/tigris) 152 | 153 | - [tidyverse](https://www.tidyverse.org/) 154 | 155 | - [sf](https://r-spatial.github.io/sf/) 156 | 157 | 158 | # tidycensus & tigris 159 | 160 | ## [tidycensus](https://walkerke.github.io/tidycensus) 161 | 162 | Functions for accessing census decennial and ACS 5 year datasets via Census APIs 163 | 164 | - only a subset of datasets / years available 165 | - requires a `Census API key` 166 | 167 | ## [tidycensus](https://walkerke.github.io/tidycensus) 168 | 169 | Limited set of years available via `tidycensus` 170 | 171 | - decennial census: 1990, 2000, and 2010 172 | - ACS 5 yr: 2005-2010 through 2012-2016 are available. 173 | - Note: tidycensus referes to ACS 5year datasets by the endyear. 174 | - 2013 - 2017 released [Dec 6, 2018](https://www.census.gov/programs-surveys/acs/news/data-releases/2017/release-schedule.html) by the Census. 175 | - Need to check its availability in `tidycensus`. 176 | 177 | ## [tigris](https://github.com/walkerke/tigris) 178 | 179 | Provides access to census geographic data files 180 | 181 | - detailed TIGER/Line boundary files (e.g., shapefiles), or 182 | - simplified Cartographic boundary files 183 | 184 | Also provides access to census `feature data`, 185 | 186 | - eg, rivers, roads, coastlands, landmarks, and more 187 | 188 | 189 | Used by `tidycensus` to access state, county, tract, block group, block, and ZCTA boundaries. 190 | 191 | - Use `tigris` directly to access other census geographic data. 192 | 193 | ## tidycensus & tigris 194 | 195 | Packages developed by [Kyle Walker](https://walkerke.github.io/) to make it easier to fetch data from Census websites and APIs in **R** and get that data in a useable format to analyze, plot, and map. 196 | 197 | Check out his website to keep abreast of his great packages, blog posts and tutorials. 
198 | 199 | - http://personal.tcu.edu/kylewalker/ 200 | 201 | - https://walkerke.github.io/ 202 | 203 | Walker also developed a new [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r) 204 | 205 | - Highly recommended! First chapter free! 206 | 207 | 208 | ## [tidyverse](https://www.tidyverse.org/) 209 | 210 | A collection of R Packages for data science 211 | - developed primarily by [Hadley Wickham](http://hadley.nz/), Chief Scientist at [RStudio](https://www.rstudio.com/). 212 | 213 | - `dplyr` and `tidyr` for reshaping data 214 | 215 | - `ggplot2` for plotting 216 | 217 | - `purrr`, `readr` and `tibble` for improved performance 218 | 219 | These packages are used by `tidycensus` under the hood. 220 | 221 | ## [sf](https://r-spatial.github.io/sf/) 222 | 223 | Simple features for geospatial data objects and methods. 224 | 225 | - Next generation R package for working with vector geospatial data 226 | - will soon supersede the `sp` package 227 | 228 | `sf` includes the functionality of the `sp`, `rgdal`, `rgeos` and `proj4` packages. 229 | 230 | - but with improved performance, simplified command syntax and easier workflows. 231 | 232 | ## Alternatives to Accessing Census Data in R 233 | 234 | You can write code to access the [Census APIs](https://www.census.gov/data/developers/data-sets.html) directly. 235 | 236 | You can download Census data directly from: 237 | 238 | - [American Factfinder](https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml) or 239 | - [NHGIS.org](https://www.nhgis.org/) 240 | - [Social Explorer](https://www.socialexplorer.com/) 241 | - Subscription service but FREE for UCB community 242 | 243 | You can download Census `geographic data` directly on the [census website](https://www.census.gov/geo/maps-data/) 244 | 245 | 246 | # Tutorial Time! 
247 | 248 | ## Part 1 249 | 250 | We will work through several exercises using `tidycensus` to fetch, wrangle and map census data. 251 | 252 | ## Loading packages 253 | 254 | Load the packages we will use today 255 | 256 | ```{r, message=FALSE, warning=FALSE} 257 | library(tidycensus) 258 | library(tidyverse) 259 | library(tigris) 260 | library(sf) 261 | ``` 262 | 263 | If you are getting errors try importing dplyr or reinstalling dplyr package as that has worked for some. 264 | 265 | ## Install any packages that you do not have on your computer 266 | 267 | Also install any dependancies. 268 | 269 | ```{r, eval=FALSE} 270 | # install.packages("tidyverse") 271 | # install.packages("tidycensus") 272 | # install.packages("sf") 273 | ``` 274 | 275 | 276 | ## Census API Key 277 | 278 | You need a census API key to programmatically fetch census data. 279 | 280 | Get it here (pretty quick): 281 | 282 | * (https://api.census.gov/data/key_signup.html) 283 | 284 | For more info see: 285 | 286 | * https://www.census.gov/data/developers/data-sets.html 287 | 288 | ## Install your Census API Key 289 | 290 | Use the tidycensus function **census_api_key** to make tidycensus use your key when it fetches data from the census. 
291 | 292 | ```{r, eval=F} 293 | # Install your census api key - long alphanumeric string 294 | census_api_key(THE_BIG_LONG_ALPHANUMERIC_API_KEY_YOU_GOT_FROM_CENSUS) 295 | ``` 296 | 297 | ## Set working directory 298 | 299 | Be sure to **Clone or download & unzip** the workshop files from: [https://github.com/dlab-geo/rCensus_workshop](https://github.com/dlab-geo/rCensus_workshop) 300 | 301 | * unzip if needed 302 | 303 | Then, set your working directory to this folder, e.g., 304 | 305 | * `setwd("~/Documents/Dlab/workshops/2019/rCensus_workshop")` 306 | 307 | 308 | 309 | # Fetching Decennial Census Data 310 | 311 | ## Population Data 312 | 313 | Let's start by fetching **population data** from the 2010 Census **for all states** 314 | 315 | In order to fetch census data you need to identify the census **variables** that contain the data of interest. 316 | 317 | ## Topics, Tables & Variables 318 | 319 | Census data **variables** are organized in **tables** 320 | 321 | Which are organized by **topic** or concept. 322 | 323 | The tidycensus **load_variables** function can help with this step. 324 | 325 | First, take a look at the function documentation. 326 | ```{r, eval=F} 327 | ?load_variables 328 | ``` 329 | 330 | ## load_variables 331 | 332 | Use `load_variables` to fetch all variables used in the 2010 census into a dataframe. 333 | ```{r} 334 | vars2010 <- load_variables(year=2010, # Year or end year for ACS 335 | dataset = 'sf1', # 'sf1' for decennial or 'acs5' 336 | cache = TRUE) # Whether to save fetched data locally 337 | ``` 338 | 339 | ## Decennial Census Variables 340 | 341 | Let's take a look at and discuss the resultant dataframe. 342 | 343 | - How many 2010 census variables are in the dataframe? 344 | ```{r, eval=F} 345 | View(vars2010) 346 | ``` 347 | 348 | ## 2010 Decennial Census Tables 349 | 350 | - Variables: 3,346 351 | 352 | - Topics: Population, housing 353 | 354 | - Tables: currently `333` - *that's a lot*! 
355 | - 177 population tables (identified with a ‘‘P’’) available to the block level 356 | - 58 housing tables (identified with an ‘‘H’’) available to the block level 357 | - 82 population tables (identified with a ‘‘PCT’’) available to the census tract level 358 | - 4 housing tables (identified with an “HCT”) available to the census tract level 359 | - 10 population tables (identified with a “PCO”) available to the county level 360 | - plus 2 additional PCT tables 361 | 362 | ## What Variable has the 2010 Total Population value? 363 | 364 | We can sort and filter the vars2010 dataframe to find it. 365 | 366 | 367 | 368 | ## get_decennial 369 | 370 | We can use the tidycensus function **get_decennial** to fetch the 2010 census data for total population by state. 371 | 372 | First, check the documentation for the function. 373 | ```{r, eval=F} 374 | ?get_decennial 375 | ``` 376 | 377 | ## get_decennial 378 | 379 | Fetch total population by state (**P001001**) from the 2010 census using `get_decennial`. 380 | 381 | ```{r} 382 | 383 | pop2010 <- get_decennial(geography = "state", # census tabulation unit 384 | variables = "P001001", # variable(s) of interest 385 | year = 2010) # census year 386 | 387 | ``` 388 | 389 | ## View the Data 390 | 391 | - How many rows and columns? 392 | 393 | - Do you see the expected number of states? 394 | 395 | - What column contains the population counts? 396 | 397 | - Do the data values seem to be right? 398 | ```{r} 399 | #pop2010 400 | ``` 401 | 402 | ## Visualize results 403 | 404 | We can visualize the data to get a quick overview of the distribution of data values. 405 | 406 | It's a first step in exploratory data analysis and a last step in data communication. 407 | 408 | `ggplot2` is the most commonly used R package for data visualization. 409 | 410 | - It is loaded when you load the `tidyverse` package. 411 | 412 | Let's use it to visualize the population data. 
413 | 414 | ## Plot 2010 Population by state 415 | 416 | Use `ggplot2` to create an ordered horizontal bar chart. 417 | ```{r} 418 | pop_plot<- ggplot(data=pop2010, aes(x=reorder(NAME,value), y=value/1000000)) + 419 | geom_bar(stat="identity") + coord_flip() + 420 | theme_minimal() + 421 | labs(title = "2010 US Population by State") + 422 | xlab("State") + 423 | ylab("in millions") 424 | ``` 425 | 426 | ## Display the plot 427 | 428 | ```{r, echo=F} 429 | pop_plot 430 | ``` 431 | 432 | ## Challenge 433 | 434 | Fetch population data by state for 2000. 435 | 436 | *Don't assume variable names are the same across years.* Check first! 437 | 438 | ## Challenge Solution 439 | 440 | Total Population in 2000 441 | 442 | ```{r, eval = F, code_folding = "hide"} 443 | # What is the variable name in 2000? 444 | vars2000 <- load_variables(year=2000, dataset = 'sf1', cache = T) 445 | 446 | # Take a look and search in the dataframe 447 | View(vars2000) 448 | 449 | # Fetch the 2000 pop data 450 | pop2000 <- get_decennial(geography = "state", variables = "P001001", year = 2000) 451 | 452 | # Take a look (plot if time) 453 | pop2000 454 | ``` 455 | 456 | ## Limiting by Area of Interest 457 | 458 | In the previous example we retrieved population data for all states. 459 | 460 | - This is the default behavior if you don't specify a subset. 461 | 462 | - But you can limit the data to be retrieved by subunits like state. 463 | 464 | ## Limit Areas of Interest 465 | 466 | Let's fetch data for just 3 states. 
467 | 468 | ```{r} 469 | state_pop2010 <- get_decennial(geography = "state", # census tabulation unit 470 | variables = "P001001", # variables of interest 471 | year = 2010, # census year 472 | state=c("CA","OR","WA")) # Filter by states of interest 473 | 474 | ``` 475 | 476 | *Note we are referencing states by their abbrevation.* 477 | 478 | ## View Results 479 | ```{r} 480 | state_pop2010 481 | ``` 482 | 483 | ## Changing Census Tabulation unit 484 | 485 | `get_decennial` accepts a number of different values for **tabulation unit**. 486 | 487 | - Options include: `state`, `county`, `tract`, `block group`, `block`, and `ZCTA`. 488 | 489 | Let's change the tabulation unit from `state` to `county`. 490 | ```{r} 491 | co_pop2010 <- get_decennial(geography = "county", # census tabulation unit 492 | variables = "P001001", # variables of interest 493 | year = 2010) 494 | ``` 495 | 496 | ## Changing Census Tabulation unit 497 | 498 | View the county data to see what was retrieved. 499 | ```{r} 500 | co_pop2010 501 | ``` 502 | 503 | ## Challenge 504 | 505 | * Fetch population by **county** for just California 506 | 507 | * Fetch population by **county** for Oregon & California 508 | 509 | *Try it before you look ahead at solutions.* 510 | 511 | ## Challenge Solution 512 | ```{r} 513 | ## Fetch population by **county** for just California 514 | co_pop2010_ca <- get_decennial(geography = "county", # census tabulation unit 515 | variables = "P001001", # variables of interest 516 | year = 2010, 517 | state=c('CA')) 518 | #co_pop2010_ca 519 | 520 | ## Fetch population by **county** for Oregon & California 521 | co_pop2010_caor <- get_decennial(geography = "county", # census tabulation unit 522 | variables = "P001001", # variables of interest 523 | year = 2010, 524 | state=c('CA','OR')) 525 | co_pop2010_caor 526 | 527 | ``` 528 | 529 | ## Challenge 530 | 531 | * Fetch population by **tract** for all states. 532 | 533 | * Fetch population by **tract** for California. 
534 | 535 | ## Challenge Solution 536 | ```{r, eval=F} 537 | ## Fetch population by **tract** for California. 538 | cal_pop2010_tracts <- get_decennial(geography = "tract", # census tabulation unit 539 | variables = "P001001", # variables of interest 540 | year = 2010, 541 | state=c('CA')) 542 | cal_pop2010_tracts 543 | 544 | 545 | ## Fetch population by **tract** for all states. 546 | pop2010_tracts <- get_decennial(geography = "tract", # census tabulation unit 547 | variables = "P001001", # variables of interest 548 | year = 2010) 549 | 550 | pop2010_tracts ## DOES THIS WORK? 551 | ``` 552 | 553 | ## Fetching Census Tract Data 554 | 555 | If you want census data at the tract level or below you **must** specify the state & county or counties. 556 | ```{r,} 557 | tract_pop2010 <- get_decennial(geography = "tract", # census tabulation unit 558 | variables = "P001001", # variable of interest 559 | year = 2010, # census year 560 | state="CA", # limit to state of California 561 | county=c("Alameda","Contra Costa")) # and only these counties 562 | ``` 563 | 564 | ## Fetching Census Tract Data 565 | 566 | View the results! How many census tracts are in these 2 counties? 567 | 568 | ```{r} 569 | tract_pop2010 570 | ``` 571 | 572 | ## Challenge 573 | 574 | 1. Fetch population by **county** for Alameda County, California 575 | 576 | 2. Fetch population by **tract** for the nine county Bay Area: 577 | - Alameda, SF, Contra Costa, Marin County, Napa, 578 | - San Mateo, Santa Clara, Solano, Sonoma 579 | 580 | Note: You can use names, abbreviations or FIPs codes for your `state` and `county`. 
581 | 582 | ```{r} 583 | # County FIPs Codes for 584 | # Alameda, SF, Contra Costa, Marin County, Napa, 585 | # San Mateo, Santa Clara, Solano, Sonoma, santa cruz 586 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 587 | ``` 588 | 589 | ## Challenge Solution 590 | 591 | ```{r} 592 | # population by **county** for Alameda County, California 593 | alco_pop2010 <- get_decennial(geography = "county", # census tabulation unit 594 | variables = "P001001", # variables of interest 595 | year = 2010, 596 | state=c('CA'), 597 | county=c('Alameda County')) 598 | #alco_pop2010 599 | 600 | ``` 601 | 602 | ## Challenge Solution 603 | 604 | Fetch population by **tract** for the nine county Bay Area 605 | ```{r} 606 | # County FIPs Codes for 607 | # Alameda, SF, Contra Costa, Marin County, Napa, 608 | # San Mateo, Santa Clara, Solano, Sonoma, santa cruz 609 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 610 | 611 | bayarea_pop2010_tract <- get_decennial(geography = "tract", # census tabulation unit 612 | variables = "P001001", # variable of interest 613 | year = 2010, # census year 614 | state="CA", # limit to state of California 615 | county=nine_counties) # and only these counties 616 | #bayarea_pop2010_tract 617 | ``` 618 | 619 | 620 | ## RECAP & QUESTIONS 621 | 622 | Fetch population by **tract** for the nine county Bay Area 623 | ```{r, eval=F} 624 | # County FIPs Codes for 625 | # Alameda, SF, Contra Costa, Marin County, Napa, 626 | # San Mateo, Santa Clara, Solano, Sonoma, santa cruz 627 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") 628 | 629 | bayarea_pop2010 <- get_decennial(geography = "tract", # census tabulation unit 630 | variables = "P001001", # variable of interest 631 | year = 2010, # census year 632 | state="CA", # limit to state of California 633 | county=nine_counties) # and only these counties 634 | 635 | # View the data 636 | bayarea_pop2010 637 | ``` 
638 | 639 | 640 | 641 | ## Fetching data for more than one census variable 642 | 643 | What **three** things are new here? 644 | ```{r} 645 | #urban rural pop for 3 counties 646 | ur_pop10 <- get_decennial(geography = "county", # census tabulation unit 647 | variables = c(urban="P002002",rural="P002005"), 648 | year = 2010, 649 | summary_var = "P002001", # The denominator 650 | state='CA', 651 | county=c("Napa","Sonoma","Mendocino")) 652 | 653 | ``` 654 | 655 | ## Fetching data for more than one census variable 656 | 657 | What `three` things are new here? 658 | 659 | 1. You can specify more than one variable: 660 | ``` 661 | variables = c("P002002","P002005") 662 | ``` 663 | 664 | 2. You can name the output columns. 665 | ``` 666 | variables = c(urban="P002002",rural="P002005") 667 | ``` 668 | 669 | 3. You can identify a `summary_var`. 670 | ``` 671 | summary_var = "P002001" 672 | ``` 673 | 674 | This value is the denominator - the total count of all people or households surveyed. The values in this column can be used as a denominator for other calculations like percent of total. 675 | 676 | ## Take a look at the results 677 | ```{r} 678 | ur_pop10 679 | ``` 680 | 681 | ## Calculating Percents 682 | 683 | The `summary_value` column comes in handy when you want to compute percent of total. 684 | 685 | Here's one way to do it. 
686 | ```{r} 687 | # Calculate the percent of population that is Urban or Rural 688 | ur_pop10 <- ur_pop10 %>% 689 | mutate(pct = 100 * (value / summary_value)) 690 | 691 | ``` 692 | 693 | ## Calculating Percents 694 | 695 | Let's take a look at the output 696 | ```{r} 697 | ur_pop10 # Take a look 698 | ``` 699 | 700 | ## Plot it 701 | 702 | Plots give us compact visual summaries of the data 703 | ```{r} 704 | myplot <- ggplot(data = ur_pop10, 705 | mapping = aes(x = NAME, fill = variable, 706 | y = ifelse(test = variable == "urban", 707 | yes = -pct, no = pct))) + 708 | geom_bar(stat = "identity") + 709 | scale_y_continuous(labels = abs, limits=c(-100,100)) + 710 | labs(title="Urban & Rural Population in Wine Country", 711 | x="County", y = " Percent of Population", fill="") + 712 | coord_flip() 713 | ``` 714 | *Don't worry if you don't get all the ggplot code now. It's here for reference.* 715 | 716 | ## Plot it 717 | ```{r} 718 | myplot 719 | ``` 720 | 721 | ## Fetch all the data in one table 722 | 723 | This is often helpful **but** you need to keep tract of the meaning of each variable. 724 | ```{r} 725 | alco_pop10 <- get_decennial(geography = "tract", # Census tabulation unit 726 | table = "P002", # Table of urban & rural population counts 727 | year = 2010, # Decennial census year 728 | state='CA', # Filter state 729 | county="Alameda") # Filter county 730 | 731 | ``` 732 | 733 | ## Take a look 734 | ```{r} 735 | unique(alco_pop10$variable) # What and how many unique vars in table? 736 | 737 | head(alco_pop10,3) # Take a look at output 738 | ``` 739 | 740 | 741 | ## Output options 742 | 743 | Let's try all three of these commands and then look at the ouput to see what's different? 
744 | 745 | ```{r, eval=F} 746 | get_decennial(geography = "state", variables = "P001001", year = 2010) 747 | 748 | get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010) 749 | 750 | get_decennial(geography = "state", variables = c(pop00="P001001"), year = 2010, 751 | output="wide") 752 | ``` 753 | 754 | ## Output options 755 | 756 | ```{r} 757 | head(get_decennial(geography = "state", variables = "P001001", year = 2010),2) 758 | head(get_decennial(geography = "state", variables = c(pop10="P001001"), year = 2010),2) 759 | head(get_decennial(geography = "state", variables = c(pop00="P001001"), year = 2010, output="wide"), 2) 760 | ``` 761 | 762 | 763 | ## Data Wrangling 764 | 765 | Your R skills can help you reformat the data and make it more useable. 766 | 767 | Let's fetch population data for 2010 & 2000 by state with **output=wide**. 768 | 769 | - We will label the variables **pop00** and **pop10**. 770 | 771 | Then we will combine these into one data frame. 772 | 773 | ## Data Wrangling 774 | 775 | Fetch pop by state from both the 2000 and 2010 census 776 | ```{r} 777 | pop2000 <- get_decennial(geography = "state", variables = c(pop00="P001001"), 778 | year = 2000, output="wide") 779 | 780 | pop2010 <- get_decennial(geography = "state", variables = c(pop10="P001001"), 781 | year = 2010, output="wide") 782 | 783 | ``` 784 | 785 | ## Merge population by state from both censuses 786 | 787 | Save in a new dataframe with both columns 788 | ```{r} 789 | pop2000_2010 <- pop2000 %>% merge(pop2010, by="NAME") %>% 790 | select(NAME, pop00, pop10) 791 | 792 | head(pop2000_2010,3) 793 | ``` 794 | 795 | ## Save the data 796 | 797 | Use `write.csv` to save a data frame to a `CSV` file. 798 | 799 | ```{r, eval=F} 800 | write.csv(pop2000_2010, file="pop2000_2010.csv", row.names = FALSE) 801 | ``` 802 | 803 | # QUESTIONS? 804 | 805 | 806 | # Part 2. 
Mapping 807 | 808 | 809 | ## Mapping Census Data with `tidycensus` 810 | 811 | You can fetch geographic data by adding the parameter **geometry=TRUE** to `tidycensus` functions 812 | 813 | - Under the hood, tidycensus calls the `tigris` package to fetch data from the Census Geographic Data APIs. 814 | 815 | - Only a subset of data available via `tigris` can be accessed via `tidycensus`. 816 | 817 | You can then use common mapping functions like `plot`, `ggplot` and `tmap` to make maps. 818 | 819 | ## Geometry Options 820 | 821 | Before fetching geometry, we need to specify a few `tigris` options 822 | 823 | - Set the `class` of returned data to be `sf` objects (not `sp`, the default) 824 | 825 | - Set `tigris_use_cache` to TRUE 826 | 827 | ```{r} 828 | # Tigris options - used by tidycensus 829 | options(tigris_class = "sf") # SP is the default format returned by tigris 830 | options(tigris_use_cache = TRUE) # Save retrieved data locally 831 | 832 | ``` 833 | 834 | Caching the data is important because it speeds things up if you often fetch census data for the same geographies over and over again. 835 | 836 | ## tigris cache directory 837 | 838 | You may want to use the geographic data downloaded by tigris in other applications. 839 | 840 | To do this, you need to know where the files are saved locally. 841 | 842 | You can also specify where tigris should save cached data. 843 | ```{r, eval=F} 844 | # Check the location of the tigris cached data 845 | Sys.getenv('TIGRIS_CACHE_DIR') 846 | 847 | # Set it 848 | tigris_cache_dir("~/Documents/gis_data/census") # Folder for local data 849 | 850 | # Check it again 851 | Sys.getenv('TIGRIS_CACHE_DIR') 852 | ``` 853 | 854 | ## Fetch geographic boundary data with tidycensus 855 | 856 | We fetch the geospatial data by setting **geometry=TRUE**. 
857 | 858 | ```{r} 859 | pop2010geo <- get_decennial(geography = "state", 860 | variables = c(pop10="P001001"), 861 | year = 2010, 862 | output="wide", 863 | geometry=TRUE) # Fetch geometry with the data for mapping 864 | 865 | ``` 866 | 867 | ## Take a look 868 | 869 | Let's take a minute to discuss the format of an `sf` spatial object. 870 | ```{r} 871 | pop2010geo 872 | ``` 873 | 874 | 875 | ## Geospatial Data in R 876 | 877 | R `sf` objects include 878 | 879 | - a dataframe with a geometry column named `geometry` 880 | 881 | - The geometry can be of type POINT, LINESTRING, POLYGON 882 | - or, MULTIPOINT, MULTILINESTRING or MULTIPOLYGON 883 | 884 | - a `CRS` (coordinate reference system), specified by 885 | - epsg(SRID) code 886 | - proj4string 887 | 888 | ## Census Data Coordinate Reference System (CRS) 889 | 890 | All census geographic data use the `NAD83` CRS, or coordinate reference system. 891 | 892 | `NAD83` stands for North American Datum of 1983. The geographic coordinates are longitude and latitude values encoded as decimal degrees. 893 | 894 | `WGS84`, or [The World Geodetic System of 1984](https://en.wikipedia.org/wiki/World_Geodetic_System), is the most commonly used geographic CRS. The difference between points encoded in these two systems can vary, on average, up to 1 meter in the continental US. 895 | 896 | Many geospatial operations require you to transform data to a common CRS before conducting spatial analysis or mapping. 897 | 898 | As an in-depth discussion of CRSs is outside the scope of this workshop, see [Geocomputation in R](https://geocompr.robinlovelace.net/reproj-geo-data.html) for more information. 899 | 900 | ## Mapping sf Spatial Objects 901 | 902 | We can use `plot` to make a quick map of the geometry stored in an `sf` spatial object.
903 | 904 | ```{r} 905 | plot(pop2010geo$geometry) 906 | ``` 907 | 908 | ## Question 909 | 910 | What do you get if you plot the `sf` object without specifying "$geometry"? 911 | 912 | 913 | ## The Challenge of US maps 914 | 915 | The vast geographic extent and non-contiguous nature of the USA make it difficult to map. 916 | 917 | ```{r, echo=F} 918 | plot(pop2010geo$geometry) #view again 919 | ``` 920 | 921 | ## Fetch geographic data with tidycensus, SHIFTED 922 | 923 | tidycensus includes a `shift_geo` parameter to shift AK & HI to below Texas. 924 | ```{r} 925 | 926 | pop2010geo_shifted <- get_decennial(geography = "state", 927 | variables = c(pop10="P001001"), 928 | output="wide", 929 | year = 2010, 930 | geometry=TRUE, 931 | shift_geo=TRUE) 932 | 933 | ``` 934 | 935 | ## Shift Happens! 936 | ```{r} 937 | plot(pop2010geo_shifted$geometry) 938 | ``` 939 | 940 | ## Save it 941 | 942 | You can save `sf` data to a shapefile using `st_write` 943 | 944 | ```{r, eval=F} 945 | st_write(pop2010geo_shifted,"usa_2010_shifted.shp") 946 | ``` 947 | 948 | ## Check your TIGRIS_CACHE_DIR to see it 949 | 950 | ```{r, eval=F} 951 | my_cache_dir <- Sys.getenv('TIGRIS_CACHE_DIR') 952 | 953 | dir(my_cache_dir) # What files are stored there? 954 | ``` 955 | 956 | ## Mapping Data Values 957 | 958 | ```{r} 959 | plot(pop2010geo_shifted['pop10']) 960 | ``` 961 | 962 | ## ggplot2 Maps 963 | 964 | ```{r} 965 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 966 | geom_sf() 967 | ``` 968 | 969 | ## ggplot2 Maps 970 | 971 | Note the use of **geom_sf** which tells ggplot that spatial data objects are being mapped. 972 | - this is a huge improvement!!
973 | 974 | ```{r} 975 | ggplot(pop2010geo_shifted, aes(fill = pop10)) + 976 | geom_sf() 977 | ``` 978 | 979 | ## Challenge 980 | 981 | Create a `map` of CA Population in 2010 by county 982 | 983 | 984 | ## Challenge Solution 985 | 986 | 2010 pop Data for California Counties 987 | ```{r, eval=F} 988 | 989 | # fetch it 990 | cal_pop10 <- get_decennial(geography = "county", 991 | variables = "P001001", 992 | year = 2010, 993 | state='CA', 994 | geometry=TRUE) 995 | 996 | # map it (long output, so the population is in the `value` column) 997 | plot(cal_pop10['value']) 998 | ``` 999 | 1000 | 1001 | ## Fetch County data for more than one state 1002 | 1003 | We can fetch both the census data and the **geometry** for more than one state! 1004 | 1005 | - *this is so much easier than any alternative approach!* 1006 | ```{r} 1007 | west_pop10 <- get_decennial(geography = "county", 1008 | variables = "P001001", 1009 | year = 2010, 1010 | state=c('CA','OR','NV',"AZ"), # a vector of states is allowed 1011 | geometry=TRUE) 1012 | ``` 1013 | 1014 | ## Map it 1015 | 1016 | These are just quick plots to make sure we got the right data! 1017 | ```{r} 1018 | plot(west_pop10['value']) 1019 | ``` 1020 | 1021 | ## Census Tract Data 1022 | 1023 | Fetching the data for all `tracts` in one state. 1024 | 1025 | - **but** you need to specify one or more counties. 1026 | ```{r} 1027 | # Fetch tract data 1028 | alco_pop10 <- get_decennial(geography = "tract", 1029 | variables = "P001001", 1030 | year = 2010, 1031 | state='CA', 1032 | county='Alameda', 1033 | geometry=TRUE) 1034 | ``` 1035 | 1036 | ## Challenge 1037 | 1038 | Fetch and map the 2010 population by census tract for Alameda and Contra Costa counties.
1039 | 1040 | 1041 | ## Challenge Solution 1042 | 1043 | Fetch Tract population & geometry data for Alameda & Contra Costa Counties 1044 | 1045 | ```{r} 1046 | 1047 | alcc_pop10 <- get_decennial(geography = "tract", 1048 | variables = "P001001", 1049 | year = 2010, 1050 | state='CA', 1051 | county=c("Alameda","Contra Costa"), 1052 | geometry=TRUE) 1053 | ``` 1054 | 1055 | ## Challenge Solution 1056 | 1057 | Map it 1058 | ```{r} 1059 | plot(alcc_pop10['value']) 1060 | ``` 1061 | 1062 | 1063 | ## More Complex Challenge (if time) 1064 | 1065 | Fetch and map the percent of San Francisco properties by census tract that were coded as rented in the 2010 Census. 1066 | 1067 | To start, identify the variables for the 1068 | 1069 | - total number of housing units 1070 | 1071 | - number of renter occupied units 1072 | 1073 | ## Complex Challenge Solution 1074 | 1075 | SF Rented Units, 2010 1076 | ```{r, eval=F} 1077 | sf_rented <- get_decennial(geography = "tract", # census tabulation unit 1078 | variables = "H004004", # renter occupied units - the numerator 1079 | year = 2010, 1080 | summary_var = "H004001", # Total housing units - the denominator 1081 | state='CA', 1082 | county='San Francisco', 1083 | geometry=TRUE) 1084 | 1085 | sf_pct_rented <- sf_rented[sf_rented$summary_value > 0,] %>% # drop tracts with a zero denominator 1086 | mutate(pct = 100 * (value / summary_value)) 1087 | 1088 | plot(sf_pct_rented['pct']) 1089 | ``` 1090 | 1091 | # Questions? 1092 | 1093 | # Part 3. ACS 5 year data 1094 | 1095 | ## ACS Data with tidycensus 1096 | 1097 | The tidycensus workflow for ACS data is similar to that used for decennial census data. 1098 | 1099 | - But there are many more variables in the ACS. 1100 | 1101 | Because the ACS contains **sample data**, each ACS variable of interest includes both an **estimate** of the value and a **margin of error**. 1102 | 1103 | ## ACS 5 year 1104 | 1105 | You can use the tidycensus **get_acs** function to retrieve data for the ACS 5 year products, beginning with the 2005 - 2009 dataset.
1106 | 1107 | The **default** end year for my version of `tidycensus` (as of Dec 4, 2018) is **2016** for the 2012-2016 ACS 5 year dataset. 1108 | 1109 | 1110 | ## Fetch List of ACS 5 year Variables 1111 | 1112 | Let's start by fetching ACS 5-year 2016 data on poverty. 1113 | 1114 | We want to explore the number of folks living below the poverty level by census tract. 1115 | 1116 | First we need to find the variable name(s)! 1117 | 1118 | ## Load ACS Table Vars 1119 | 1120 | Load the ACS 2012-2016 5 year data variables into a dataframe. 1121 | 1122 | - ACS 5 year datasets are referenced by `end year` in tidycensus! 1123 | 1124 | Then take a look at the variable names, labels and concepts. 1125 | 1126 | How many variables refer to the concept of poverty? 1127 | 1128 | ```{r} 1129 | acs2016vars <- load_variables(year=2016, dataset = 'acs5', cache = TRUE) # end year of the 2012-2016 product; keep a local copy 1130 | #View(acs2016vars) 1131 | ``` 1132 | 1133 | ## ACS Tables and variables 1134 | 1135 | Many hundreds (thousands?) more than for decennial census! 1136 | 1137 | See the documentation on the [census website](https://www.census.gov/programs-surveys/acs/guidance/which-data-tool/table-ids-explained.html) 1138 | 1139 | Types of tables: 1140 | 1141 | - `B` prefix = base tables 1142 | - `C` = collapsed tables 1143 | - `DP` = data profiles 1144 | - `S` = Subject tables 1145 | 1146 | ## Census Reporter 1147 | 1148 | ACS variables can be confusing. 1149 | 1150 | The Census Reporter website (https://censusreporter.org) provides another tool for navigating topics, tables, and variable names. 1151 | 1152 | Let's check it out to see what tables/variables we should use. 1153 | 1154 | ## Filter the ACS Variables 1155 | 1156 | In RStudio, view the dataframe **acs2016vars** and interactively filter the name column to display only the variables in the table **C17002** 1157 | 1158 | Take a look at the different variables in this table. 1159 | 1160 | What variable(s) contain the estimate of the number of people living below poverty?
1161 | 1162 | ## get_acs 1163 | 1164 | Use the tidycensus `get_acs` function to fetch the poverty data for census tracts in San Francisco 1165 | ```{r, eval=F} 1166 | ?get_acs 1167 | ``` 1168 | 1169 | ## get_acs in action 1170 | 1171 | Fetch the data in the table **C17002** that contain the counts of people living below 100% of the poverty line. 1172 | ```{r} 1173 | sf_poor <- get_acs(geography = "tract", 1174 | variables = c('C17002_002','C17002_003'), # poverty variables 1175 | year = 2016, 1176 | state="CA", 1177 | summary_var = "C17002_001", # Est of num people - denom 1178 | county="San Francisco", 1179 | geometry=TRUE) 1180 | ``` 1181 | 1182 | ## View output 1183 | 1184 | Let's take a look at the output of `get_acs` and discuss how it differs from `get_decennial`. 1185 | 1186 | ```{r, eval=F} 1187 | sf_poor 1188 | ``` 1189 | 1190 | ## Create Poverty Map, try 1 1191 | 1192 | What are we mapping? 1193 | ```{r} 1194 | # What are we mapping? 1195 | plot(sf_poor['estimate']) 1196 | ``` 1197 | 1198 | ## Create Poverty Map, try 2 1199 | 1200 | ```{r} 1201 | # Remove census tracts that have no people! 1202 | sf_poor <- subset(sf_poor, summary_est > 0) 1203 | 1204 | # What are we mapping? 1205 | plot(sf_poor['estimate']) 1206 | ``` 1207 | 1208 | ## Calculating percents 1209 | 1210 | Let's calculate the percent below poverty by tract. 1211 | 1212 | ```{r} 1213 | sf_poor <- sf_poor %>% 1214 | mutate(pct = 100 * (estimate / summary_est)) # share of the tract total, per poverty variable 1215 | 1216 | head(sf_poor, 3) 1217 | ``` 1218 | 1219 | 1220 | ## Group by and sum 1221 | 1222 | We want to group the data by the geometry and then sum the data values so that we have one value per geometry.
1223 | ```{r} 1224 | sf_poor_summed <- sf_poor %>% 1225 | select(GEOID, estimate, pct, geometry) %>% 1226 | group_by(GEOID) %>% 1227 | summarise(count_below_pov = sum(estimate), 1228 | pct_below_pov = sum(pct)) # rows of a tract share one denominator, so the percents add 1229 | ``` 1230 | 1231 | ## Group by and sum 1232 | 1233 | ```{r} 1234 | head(sf_poor_summed) 1235 | ``` 1236 | 1237 | ## Map Counts 1238 | 1239 | Where are SF's poorest areas? 1240 | ```{r} 1241 | plot(sf_poor_summed['count_below_pov']) 1242 | 1243 | ``` 1244 | 1245 | ## Map Percents 1246 | 1247 | Where are SF's poorest areas? 1248 | ```{r} 1249 | plot(sf_poor_summed['pct_below_pov']) 1250 | 1251 | ``` 1252 | 1253 | 1254 | ## Challenge 1255 | 1256 | The ACS **2013-2017** 5 year dataset was released Dec 6, 2018. 1257 | 1258 | Although my current version of `tidycensus` states that 2012-2016 is the latest ACS 5-year product, see if you can fetch & map the percent of people below the poverty line in San Francisco using the **2013-2017** ACS 5-year data. 1259 | 1260 | ## Challenge Solution 1261 | ```{r, eval=F} 1262 | sf_poor_2017 <- get_acs(geography = "tract", 1263 | variables = c('C17002_002','C17002_003'), # poverty variables 1264 | year = 2017, 1265 | state="CA", 1266 | summary_var = "C17002_001", # Est of num people - denom 1267 | county="San Francisco", 1268 | geometry=TRUE) 1269 | 1270 | head(sf_poor_2017) 1271 | ``` 1272 | 1273 | 1274 | ## Margins of Error (MOE) 1275 | 1276 | We haven't talked about it but it may be important in your work with ACS data. 1277 | 1278 | Math is needed to combine MOEs when you combine variables. 1279 | 1280 | - tidycensus includes some nice [functions](https://walkerke.github.io/tidycensus/reference/index.html) for these calculations. 1281 | 1282 | See this web page on how to handle [MOEs in tidycensus](https://walkerke.github.io/tidycensus/articles/margins-of-error.html) 1283 | 1284 | # Questions?
1285 | 1286 | 1287 | # Maps with tmap - Demo 1288 | 1289 | ## tmap 1290 | 1291 | The `tmap` package is great for making both static and interactive maps. It turns R into a `GIS`. 1292 | 1293 | Let's check it out with our last dataframe. 1294 | 1295 | ## tmap 1296 | 1297 | ```{r} 1298 | library(tmap) 1299 | tmap_mode("view") # set mode to interactive 1300 | 1301 | poverty_map <- tm_shape(sf_poor_summed) + 1302 | tm_polygons(col="pct_below_pov") 1303 | ``` 1304 | 1305 | ## tmap 1306 | 1307 | View the map - click on tracts 1308 | 1309 | ```{r} 1310 | poverty_map 1311 | ``` 1312 | 1313 | ## tmap 1314 | 1315 | There are a number of great tutorials online for working with `tmap`. 1316 | 1317 | See the `References` at the end of this workshop document. 1318 | 1319 | # Census Geographic Data Files 1320 | 1321 | ## Census Geographic Data Files 1322 | 1323 | **Cartographic Boundary** vs **Detailed TIGER/Line** data 1324 | 1325 | By default, `tidycensus` downloads census **cartographic boundary** data. 1326 | 1327 | - These are simplified geometries, clipped to coastlines. 1328 | 1329 | In `get_acs` you can also request the more detailed census **TIGER/Line** data. 1330 | 1331 | The cartographic boundary data is great for mapping but the detailed data is often better for analysis. 1332 | 1333 | Let's check it out. 1334 | 1335 | 1336 | ## Fetch Cartographic Boundary Data 1337 | ```{r} 1338 | 1339 | sf_poor_cb <- get_acs(geography = "tract", 1340 | variables = c('C17002_002','C17002_003'), # poverty variables 1341 | summary_var = "C17002_001", 1342 | year = 2016, 1343 | state="CA", 1344 | county="San Francisco", 1345 | geometry=TRUE, 1346 | cb = TRUE) # THIS IS THE DEFAULT!
1347 | ``` 1348 | 1349 | ## Fetch Detailed TIGER/Line Geometry 1350 | ```{r} 1351 | 1352 | sf_poor_tl <- get_acs(geography = "tract", 1353 | variables = c('C17002_002','C17002_003'), # poverty variables 1354 | summary_var = "C17002_001", 1355 | year = 2016, 1356 | state="CA", 1357 | county="San Francisco", 1358 | geometry=TRUE, 1359 | cb = FALSE) # Fetching the TIGER/Line data 1360 | ``` 1361 | 1362 | 1363 | ## Visualize differences with tmap 1364 | 1365 | Zoom in to explore, especially around the coastline. 1366 | ```{r} 1367 | tm_shape(sf_poor_tl) + tm_borders() + 1368 | tm_shape(sf_poor_cb) + tm_borders(col="red") 1369 | 1370 | ``` 1371 | 1372 | 1373 | # Questions? 1374 | 1375 | # Summary 1376 | 1377 | ## Summary 1378 | 1379 | - `tidycensus` offers two key functions for fetching census tabular and geographic data: **get_acs** and **get_decennial** 1380 | 1381 | - Using `tidycensus` to fetch the tabular data or both tabular and geographic data is IMHO way easier than any alternatives, **IF** you (1) know R, (2) know a bit about working with geographic data in R. 1382 | 1383 | - This approach is also scalable if you want multiple census variables and geographies. 1384 | 1385 | - If you just want to fetch the geographic data it may be easier to use the **tigris** package or download it directly from the census.
1386 | 1387 | 1388 | ## References 1389 | 1390 | - [DataCamp](https://www.datacamp.com) course [Analyzing US Census Data in R!](https://www.datacamp.com/courses/analyzing-us-census-data-in-r) 1391 | - [Geocomputation in R](https://geocompr.robinlovelace.net/) 1392 | - [Creating beautiful demographic maps with tidycensus and tmap packages](https://www.zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/) 1393 | 1394 | ## Related D-Lab Workshops 1395 | 1396 | - R Fundamentals 1397 | - Geospatial Data in R, parts 1, 2, & 3 1398 | - Web Maps in R with Leaflet 1399 | - Geocoding & Mapping in R 1400 | 1401 | # Extras for Enthusiasts 1402 | 1403 | ## Scaling Up Example 1404 | 1405 | In this example we show you how you can read in census variables of interest from a file into an R dataframe. You can then use that dataframe to fetch data for all those variables using `tidycensus`. 1406 | 1407 | ```{r} 1408 | 1409 | # Load cenvar lookup table of vars of interest 1410 | my_cenvar_df <- read.csv("data/cenvar_lookup.csv", strip.white = TRUE, stringsAsFactors = FALSE) 1411 | 1412 | my_cenvar_df 1413 | ``` 1414 | 1415 | ## Fetch the ACS data 1416 | 1417 | Fetch the ACS data for these variables for the 9 county bay area 1418 | 1419 | ```{r} 1420 | nine_counties <- c("001", "075", "013", "041", "055", "081", "085", "095", "097") # county codes, passed to `county` below 1421 | bay9_data <- get_acs(geography = "tract", 1422 | variables = my_cenvar_df$my_cen_vars, # variable codes come from the lookup table 1423 | year=2016, 1424 | state = "CA", 1425 | county = nine_counties, 1426 | geometry = TRUE) 1427 | 1428 | bay9_data 1429 | ``` 1430 | 1431 | ## Reformat Output 1432 | 1433 | 1. We only want to keep the estimate column for each variable of interest, plus the GEOID and geometry columns. 1434 | 1435 | 2. We then want to make the data `wide` using the `spread` function. This will put each estimate variable in its own column.
1436 | ```{r} 1437 | bay9_data2 <- bay9_data %>% 1438 | select("GEOID", "variable", "estimate") %>% 1439 | spread(key=variable, value=estimate) # one column per census variable 1440 | ``` 1441 | 1442 | ## Take a look 1443 | ```{r} 1444 | bay9_data2 1445 | ``` 1446 | 1447 | ## Rename the columns 1448 | 1449 | Use the dataframe of census variables to rename the columns so that they are self-describing. 1450 | ```{r} 1451 | colnames(bay9_data2) <- c("GEOID", my_cenvar_df$my_cen_var_names, "geometry") # NOTE(review): positional rename - assumes the lookup rows are in the same order as the spread columns; confirm 1452 | 1453 | ``` 1454 | 1455 | ## Take a look 1456 | ```{r} 1457 | 1458 | bay9_data2 1459 | ``` 1460 | 1461 | 1462 | ## Fetching data for multiple years 1463 | 1464 | This requires variable names to be the same across years! 1465 | ```{r, eval=FALSE} 1466 | # use purrr::map_df to get data for multiple years (must have same vars!) 1467 | pop90_10 <- map_df(c(1990, 2000, 2010), function(x) { 1468 | get_decennial(geography = "state", 1469 | variables = c(totalpop = "P001001"), 1470 | sumfile = "sf1", 1471 | year = x) %>% 1472 | mutate(year = x) } # tag each result with its census year before the results are row-bound 1473 | ) 1474 | 1475 | # View output 1476 | head(pop90_10) 1477 | tail(pop90_10) 1478 | 1479 | # Plot it 1480 | pop90_10 %>% ggplot(aes(x=reorder(NAME,value), y=value/1000000, fill=factor(year))) + 1481 | geom_bar(stat="identity", position=position_dodge()) + coord_flip() 1482 | 1483 | ``` 1484 | 1485 | 1486 | # Combining Census Data with Other Data 1487 | 1488 | ## Area Weighted Interpolation 1489 | 1490 | One of the strengths of the `sf` package is how relatively easy it is to reaggregate data from one geometry to another. This process is called areal interpolation. 1491 | 1492 | Area weighted interpolation reaggregates the data based on the percent of area shared by input and output geometries.
1493 | 1494 | ## Read in a Shapefile 1495 | ```{r, eval=F} 1496 | sfnhoods <- st_read("data/sfnhoods.shp") 1497 | head(sfnhoods) 1498 | plot(sfnhoods['nhood']) 1499 | ``` 1500 | 1501 | ## Check the CRS 1502 | ```{r, eval=F} 1503 | st_crs(sfnhoods) 1504 | st_crs(sf_poor5) # NOTE(review): sf_poor5 is not created in the visible part of this document; sf_poor_summed has the pct_below_pov column - confirm 1505 | ``` 1506 | 1507 | ## CRS transformation 1508 | ```{r, eval=F} 1509 | sf_poor5_4326 <- st_transform(sf_poor5, st_crs(sfnhoods)) # put the tracts in the same CRS as the neighborhoods 1510 | ``` 1511 | 1512 | ## Area Weighted Interpolation 1513 | 1514 | Reaggregate percent of people below poverty from census tract to neighborhood polygons. 1515 | 1516 | ```{r, eval=F} 1517 | sfhoods2 <- st_interpolate_aw(sf_poor5_4326[, "pct_below_pov"], sfnhoods, 1518 | extensive = FALSE) # TRUE = aw sum; FALSE = aw avg 1519 | ``` 1520 | 1521 | ## Map it 1522 | ```{r, eval=F} 1523 | par(mfrow=c(1,2)) 1524 | plot(sf_poor5['pct_below_pov']) 1525 | plot(sfhoods2['pct_below_pov']) 1526 | par(mfrow=c(1,1)) 1527 | ``` 1528 | 1529 | ## Map it with `tmap` 1530 | ```{r, eval=F} 1531 | tm_shape(sfhoods2) + 1532 | tm_polygons(col="pct_below_pov") 1533 | ``` 1534 | 1535 | ## Combine the values 1536 | ```{r, eval=F} 1537 | head(sfhoods2) 1538 | sfnhoods$pct_below_pov <- sfhoods2$pct_below_pov 1539 | 1540 | # map again - click on polygons and view data in popups 1541 | # to confirm the AWI output values 1542 | tm_shape(sfnhoods) + 1543 | tm_polygons(col="pct_below_pov", 1544 | popup.vars = c("nhood", "pct_below_pov") 1545 | ) 1546 | ``` 1547 | 1548 | -------------------------------------------------------------------------------- /previous_versions/snippets_to_save_for_later.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "snippets_to_save_for_later" 3 | output: html_document 4 | date: '2022-04-01' 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = TRUE) 9 | ``` 10 | 11 | ### [Leaflet](https://rstudio.github.io/leaflet/): 12 | 13 | Leaflet is the `ggplot2` of interactive mapping.
Leaflet in R follows a tidyverse convention, using pipes (%>%) to create layers in the mapping object. We can use leaflet to create interactive maps allowing for more flexibility in design and features we can create. With added complexity in the code, of course! 14 | 15 | ```{r} 16 | 17 | # Create a numeric color palette whose domain is the income estimates 18 | pal <- colorNumeric( 19 | palette = "YlOrRd", 20 | domain = med_hhincome$estimate 21 | ) 22 | 23 | # specify the dataset, then add a basemap layer 24 | leaflet(med_hhincome) %>% 25 | addProviderTiles(providers$CartoDB.Positron) %>% 26 | # adjust color palette and polygon features: no outlines, fill colored by the estimate 27 | addPolygons(stroke = FALSE, smoothFactor = 0.2, fillOpacity = .5, 28 | color = ~pal(estimate)) %>% 29 | # add a legend that reuses the palette, with "$"-prefixed labels 30 | addLegend(pal = pal, values = ~estimate, 31 | title = "Median Household Income", 32 | labFormat = labelFormat(prefix = "$"), 33 | position = "bottomleft") 34 | 35 | 36 | ``` 37 | 38 | --- 39 | ## Appendix 40 | 41 | ### A More Complex Query 42 | 43 | Let's use the 2010 census data to map the percent of San Francisco (SF) properties that were rented.
44 | 45 | To start, identify the variables for the 46 | 47 | - Total number of housing units 48 | 49 | - Number of renter occupied units 50 | 51 | ### Complete the query 52 | ```{r, eval=F} 53 | 54 | sf_rented <- get_decennial(geography = , # census tabulation unit 55 | variables = , # number of households rented 56 | year = , 57 | summary_var = , # Total households 58 | state=, 59 | county=, 60 | geometry=) 61 | ``` 62 | 63 | > And here it is SF Percent Rented Units, 2010 64 | 65 | ```{r, eval=F} 66 | sf_rented <- get_decennial(geography = "tract", # census tabulation unit 67 | variables = "H004004", # number of households rented 68 | year = 2010, 69 | summary_var = "H004001", # Total households 70 | state='CA', 71 | county='San Francisco', 72 | geometry=TRUE) 73 | 74 | # take a look at the output 75 | head(sf_rented) 76 | ``` 77 | 78 | 79 | ### Calculate Percent Rented 80 | 81 | ```{r, eval=F} 82 | sf_pct_rented <- sf_rented[sf_rented$summary_value > 0,] %>% # drop tracts with a zero denominator 83 | mutate(pct = 100 * (value / summary_value)) 84 | 85 | # Take a look 86 | head(sf_pct_rented) 87 | 88 | ``` 89 | 90 | ### Map the result 91 | ```{r, eval=F} 92 | plot(sf_pct_rented['pct']) 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | r-4.0-2020-10-10 2 | 3 | --------------------------------------------------------------------------------