├── pyproject.toml ├── LICENSE ├── README.md ├── .gitignore ├── crosswalks ├── judicial_districts │ ├── population_by_district_acs2018_5yr.csv │ ├── COUNTY_DISTRICT_README.md │ ├── population_by_district.ipynb │ └── race_by_district.ipynb └── zip_to_zcta │ ├── ZIP_ZCTA_README.md │ └── build_crosswalk.ipynb ├── chicago_2010pop_by_2020policedistricts.csv ├── generalized └── GENERALIZED_README.md ├── BOUNDARIES.md └── requirements.txt /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "acs-aggregate" 3 | version = "0.1.0" 4 | description = "Tools to help aggregate American Community Survey data to non-Census geographies" 5 | authors = ["Joe Germuska "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.9" 10 | pandas = "^1.4.2" 11 | requests = "^2.24.0" 12 | cenpy = {version = "^1.0.0", extras = ["python-Levenshtein"]} 13 | python-Levenshtein = "^0.12.0" 14 | census-data-aggregator = "^0.0.6" 15 | 16 | [tool.poetry.dev-dependencies] 17 | jupyterlab = "^2.2.10" 18 | folium = "^0.11.0" 19 | 20 | [build-system] 21 | requires = ["poetry>=0.12"] 22 | build-backend = "poetry.masonry.api" 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Census Reporter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # acs-aggregate 2 | 3 | Tools to help aggregate American Community Survey data to non-Census ("custom") geographies. 4 | 5 | A common problem for journalists and other analysts is wishing that the Census Bureau tabulated American Community Survey (ACS) data for locally meaningful geographies, such as neighborhoods, wards, or police districts. This project aims to make that as easy as possible, while acknowledging that there are some wrinkles. 6 | 7 | * See the `crosswalks` directory for crosswalks for specific geography types. 8 | * See `generalized` for examples of a general method (with a worked example). 9 | * see BOUNDARIES.md for a randomly assembled list of available GIS data which might be the kinds of things for which people would want to use this. 
10 | 11 | The longer term goal is to make this as automated as possible, but we're still getting a sense of the problem. We welcome discussion, or even just expressions of interest and votes of confidence. 12 | 13 | To read: [Target‐Density Weighting Interpolation and Uncertainty Evaluation for Temporal Analysis of Census Data](https://onlinelibrary.wiley.com/doi/full/10.1111/j.1538-4632.2007.00706.x), which may provide insights on whether these methods are well-designed. 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | work 2 | crosswalks/zip_to_zcta/tl_2019_us_zcta510.zip 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /crosswalks/judicial_districts/population_by_district_acs2018_5yr.csv: -------------------------------------------------------------------------------- 1 | state,district,total_pop 2 | Alabama,Middle,1151252 3 | Alabama,Northern,2870454 4 | Alabama,Southern,842974 5 | Alaska,Alaska,738516 6 | Arizona,Arizona,6946685 7 | Arkansas,Eastern,1639567 8 | Arkansas,Western,1351104 9 | California,Central,19354238 10 | California,Eastern,7993050 11 | California,Northern,8318423 12 | California,Southern,3483049 13 | Colorado,Colorado,5531141 14 | Connecticut,Connecticut,3581504 15 | Delaware,Delaware,949495 16 | District of Columbia,District of Columbia,684498 17 | Florida,Middle,11853354 18 | Florida,Northern,1840687 19 | Florida,Southern,6904098 20 | Georgia,Middle,2008351 21 | Georgia,Northern,6716197 22 | Georgia,Southern,1572936 23 | Hawaii,Hawaii,1422029 24 | Idaho,Idaho,1687809 25 | Illinois,Central,2220390 26 | Illinois,Northern,9340002 27 | Illinois,Southern,1261105 28 | Indiana,Northern,2589824 29 | Indiana,Southern,4047602 30 | Iowa,Northern,1325394 31 | Iowa,Southern,1807105 32 | Kansas,Kansas,2908776 33 | Kentucky,Eastern,2202959 34 | Kentucky,Western,2237245 35 | Louisiana,Eastern,1674111 36 | Louisiana,Middle,829642 37 | Louisiana,Western,2159863 38 | Maine,Maine,1332813 39 | Maryland,Maryland,6003435 40 | Massachusetts,Massachusetts,6830193 41 | Michigan,Eastern,6461168 42 | Michigan,Western,3496320 43 | Minnesota,Minnesota,5527358 44 | Mississippi,northern,1114229 45 | Mississippi,southern,1874533 46 | Missouri,Eastern,2927578 47 | Missouri,Western,3162484 48 | Montana,Montana,1041732 49 | Nebraska,Nebraska,1904760 50 | Nevada,Nevada,2922849 51 | New Hampshire,New Hampshire,1343622 52 | New Jersey,New Jersey,8881845 53 | New Mexico,New Mexico,2092434 54 | New York,Eastern,8217826 55 | New York,Northern,3396820 56 | New York,Southern,5209255 57 | New York,Western,2794552 58 | North Carolina,Eastern,3999741 59 | North Carolina,Middle,2952469 60 | North Carolina,Western,3203414 61 | North Dakota,North Dakota,752201 62 | Ohio,Northern,5733949 63 | Ohio,Southern,5907930 64 | Oklahoma,Eastern,748060 65 | Oklahoma,Northern,1060029 66 | Oklahoma,Western,2110048 67 | Oregon,Oregon,4081943 68 | Pennsylvania,Eastern,5723256 69 | Pennsylvania,Middle,3325682 70 | Pennsylvania,Western,3742243 71 | Puerto Rico,Puerto Rico,3386941 72 | Rhode Island,Rhode Island,1056611 73 | South Carolina,South Carolina,4955925 74 | South Dakota,South Dakota,864289 75 | Tennessee,Eastern,2601077 76 | Tennessee,Middle,2478940 77 | Tennessee,Western,1571072 78 | Texas,Eastern,3923823 79 | Texas,Northern,7252194 80 | Texas,Southern,9611091 81 | Texas,Western,7098087 82 | Utah,Utah,3045350 83 | Vermont,Vermont,624977 84 | Virginia,Eastern,6097466 85 | Virginia,Western,2316308 86 | Washington,Eastern,1584162 87 | Washington,Western,5710174 88 | West 
Virginia,Northern,869001 89 | West Virginia,Southern,960053 90 | Wisconsin,Eastern,3405147 91 | Wisconsin,Western,2373247 92 | Wyoming,Wyoming,581836 93 | -------------------------------------------------------------------------------- /chicago_2010pop_by_2020policedistricts.csv: -------------------------------------------------------------------------------- 1 | dist_num,P003001,P003002,P003003,P003004,P003005,P003006,P003007,P003008,P005001,P005002,P005003,P005004,P005005,P005006,P005007,P005008,P005009,P005010,P005011,P005012,P005013,P005014,P005015,P005016,P005017 2 | 1,62781,35208,13657,138,10835,29,1088,1826,62781,59015,32952,13452,95,10790,22,157,1547,3766,2256,205,43,45,7,931,279 3 | 2,95439,19189,66577,187,5867,13,963,2643,95439,92197,17747,65993,138,5837,13,202,2267,3242,1442,584,49,30,0,761,376 4 | 3,75235,1654,71508,182,318,4,332,1237,75235,74112,1472,71010,169,312,4,98,1047,1123,182,498,13,6,0,234,190 5 | 4,123575,26117,77303,689,311,31,16378,2746,123575,88194,9925,76399,223,258,29,138,1222,35381,16192,904,466,53,2,16240,1524 6 | 5,74396,1629,70429,166,41,10,1202,919,74396,71872,843,70064,118,38,8,55,746,2524,786,365,48,3,2,1147,173 7 | 6,90841,446,88938,182,61,7,269,938,90841,89927,312,88525,164,61,6,60,799,914,134,413,18,0,1,209,139 8 | 7,71071,511,69202,149,62,9,435,703,71071,69904,262,68787,136,56,8,49,606,1167,249,415,13,6,1,386,97 9 | 8,247373,118778,53462,1632,2168,61,64904,6368,247373,107519,51491,52219,247,2001,18,231,1312,139854,67287,1243,1385,167,43,64673,5056 10 | 9,165201,65820,19860,1153,26106,35,47743,4484,165201,70591,24307,19044,173,25894,16,146,1011,94610,41513,816,980,212,19,47597,3473 11 | 10,118093,38171,40080,1099,321,16,35543,2863,118093,44652,4300,39440,144,239,3,121,405,73441,33871,640,955,82,13,35422,2458 12 | 11,70474,4673,60385,174,356,18,3782,1086,70474,62356,1664,59671,73,329,4,46,569,8118,3009,714,101,27,14,3736,517 13 | 12,127869,69537,23781,839,8290,94,21374,3954,127869,85540,51775,23039,185,8148,49,273,2071,42329,17762,742,654,142,45,21101,1883 14 | 14,117738,75162,9448,828,3779,68,23759,4694,117738,63699,49809,8027,189,3604,33,236,1801,54039,25353,1421,639,175,35,23523,2893 15 | 15,59458,1571,55861,144,187,7,1041,647,59458,57193,959,55468,107,174,5,24,456,2265,612,393,37,13,2,1017,191 16 | 16,199476,162554,2438,854,10920,99,17142,5469,199476,154042,138043,2022,299,10711,44,266,2657,45434,24511,416,555,209,55,16876,2812 17 | 17,144096,84601,5625,1024,17692,85,27794,7275,144096,81864,55743,4782,321,17373,48,409,3188,62232,28858,843,703,319,37,27385,4087 18 | 18,117041,92302,10875,153,9837,49,1455,2370,117041,111235,88418,10681,108,9775,39,237,1977,5806,3884,194,45,62,10,1218,393 19 | 19,200786,161709,13753,558,12426,85,7031,5224,200786,180761,150551,13305,308,12277,67,341,3912,20025,11158,448,250,149,18,6690,1312 20 | 20,91279,57451,10230,510,12938,34,6922,3194,91279,74760,49420,9909,223,12792,27,251,2138,16519,8031,321,287,146,7,6671,1056 21 | 22,101941,36750,62510,165,366,27,686,1437,101941,98623,34863,62094,112,352,25,82,1095,3318,1887,416,53,14,2,604,342 22 | 24,141038,73653,26057,780,20954,72,13795,5727,141038,111292,60488,25322,351,20796,45,466,3824,29746,13165,735,429,158,27,13329,1903 23 | 25,200391,85347,35629,1731,3329,160,66851,7344,200391,67386,29371,33033,214,3086,44,339,1299,133005,55976,2596,1517,243,116,66512,6045 24 | 31,24266,21756,129,43,1227,6,744,361,24266,22036,20452,116,23,1205,3,23,214,2230,1304,13,20,22,3,721,147 25 | -------------------------------------------------------------------------------- 
/crosswalks/judicial_districts/COUNTY_DISTRICT_README.md: -------------------------------------------------------------------------------- 1 | # US Federal Court District to US County Crosswalk 2 | 3 | As part of our work with the [SCALES](https://scales-okn.org/) project, we set out to create a crosswalk which would support analysis of Census and other data by US Federal Court District. 4 | 5 | The districts are established by statute, specifically, [Title 28, United States Code, Chapter 5](https://www.law.cornell.edu/uscode/text/28/part-I/chapter-5). Generally, each US county is in exactly one District. There are a few special cases such as with waterways around New York City, a Federal Correctional Institution in North Carolina, and the like, but those are disregarded for the purposes of creating this cross-reference. 6 | 7 | Some Federal Court Districts are split into "Divisions" by the statute. There are also cases of local (non-statutory) rules creating Divisions in Federal Court Districts. Statutory divisions are included in this data set -- if a District has Divisions, each county is in exactly one. Local-rule Divisions are **not included** in this data. 8 | 9 | ## The data file 10 | 11 | The [crosswalk](county_district_xref.csv) is a UTF-8 encoded CSV file with one row per US county or county-equivalent. Each row has the following columns, each of which should be treated as text, even though some have only digits: 12 | 13 | * geoid - a US Census Bureau [GEOID](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html). Each value is unique in this file. 14 | * state_fips - a two-digit [state FIPS code](https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code#FIPS_state_codes) 15 | * county_fips - a three-digit [county FIPS code](https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county) 16 | * state - the state's name, in text 17 | * county - the county or county-equivalent's name 18 | * district - the name of the district, or the state name if the state is not divided into districts 19 | * division - the name of the division, or blank if the district does not have statutory divisions 20 | * statute_url - a link to the Cornell LII version of the statute for the given state, in case one wants to validate/review the data 21 | 22 | We welcome input from people with expertise about whether there's a more systematic way to represent the districts and divisions, such as numeric or coded identifiers. 23 | 24 | (We've since learned about [a GIS file of districts](https://hifld-geoplatform.opendata.arcgis.com/datasets/us-district-court-jurisdictions) which includes identifiers. In the future we may either use that file to create the crosswalk, or at least integrate its IDs to make it easier to create maps based on aggregated data. See also [this nice interactive javascript map](https://observablehq.com/@caged/the-united-states-courts-of-appeals-and-district-courts) of the districts and counties...) 25 | 26 | ## Using this crosswalk 27 | 28 | This repository includes two notebooks demonstrating how you can use the crosswalk with python code to aggregate ACS data by Judicial District: 29 | 30 | * [population_by_district.ipynb](population_by_district.ipynb) - a simple case to get the estimated total population for each district. 
If you just want that data, download [population_by_district_acs2018_5yr.csv](population_by_district_acs2018_5yr.csv) 31 | * [race_by_district.ipynb](race_by_district.ipynb) - a more involved example which also shows how to account for the aggregated margin of error, and how to test the reliability of the aggregates. 32 | 33 | Of course, you don't have to use python to use the crosswalk, but it's our working language, so it was easiest to use for demonstration. We'll gladly link to examples using `R` or other tools. 34 | 35 | ## More on the method 36 | 37 | The crosswalk here is not purely created by code. We were able to match most of the counties with a scraper (available in this [Google Colab notebook](https://colab.research.google.com/drive/1ghrzwtNhwlN6E3GBH8N5zqP9cAOPOGd0#scrollTo=LtDXNodX4KO9)), but at a certain point, it didn't seem worth working through formatting peculiarities, misspellings in the statute, and annoying nuances of regular expressions. 38 | 39 | We are particularly grateful to Mary Catherine Talbott, University of Richmond Law Student, Class of 2022, for careful review of the cities of Virginia, which, while treated as "county-equivalents" by the Census, are not specifically enumerated in the statute. 40 | -------------------------------------------------------------------------------- /generalized/GENERALIZED_README.md: -------------------------------------------------------------------------------- 1 | # Towards a Generalized Tool for Aggregating ACS Data for non-Census Geographies 2 | 3 | A common problem for journalists and other analysts is wishing that the Census Bureau tabulated American Community Survey (ACS) data for locally meaningful geographies, such as neighborhoods, wards, or police districts. 4 | 5 | The [Jupyter notebook](notebook.ipynb) in this directory provides working python code which does this, and demonstrates its use to create datasets of ACS estimates for Chicago Police districts. An obvious next step would be to factor the code out of a notebook into a reusable library. 6 | 7 | It would be even more convenient to provide this as a web-hosted service, but we have some concerns about the system resources needed to support whatever requests people might bring. Still, it's something to consider. 8 | 9 | 10 | ## Method 11 | 12 | Without access to individual Census responses, the only way to obtain Census data for custom geographies is to map Census geographies to your custom geographies and add up the figures. This is relatively straightforward, unless the Census geographies are split between two or more custom geographies. 13 | 14 | While census blocks vary in size, [more than half of them are smaller than 0.1 sq. miles](http://proximityone.com/geo_blocks.htm). Smaller census blocks are less likely to cross boundaries of custom geographies, and since they also, generally, have smaller populations, the inaccuracy introduced by treating them as if they were not split is usually tolerable. 15 | 16 | However, block-level data is only provided for the Decennial Census. The ACS, which is released every year, and which also includes many topics not covered by the Decennial Census, uses the *block group* as its smallest geography. While small, block groups still contain dozens of blocks or more, increasing the likelihood of distorting the data if block groups are simply assigned to a single custom geography when they are, in reality, split between two or more.
17 | 18 | To address this issue, when a block group is split, we allocate its data to each segment in proportion to the population of that segment, based on the block-level population of the most recent Decennial Census. (For certain data, using the block-level housing unit count is more appropriate.) A minimal code sketch of this allocation step appears at the end of this document. 19 | 20 | It's still an open question whether it would be better to use block groups or tracts as the building block of aggregate data. Block groups, being smaller, might seem to provide closer alignment. However, because the ACS is a survey, smaller geographies tend to have larger margins of error, especially for small sub-populations. ACS census tract level data may reduce some of that uncertainty. For now, this library supports using either. 21 | 22 | 23 | ## Status 24 | 25 | Currently, there's a [Jupyter notebook](https://github.com/censusreporter/acs-aggregate/blob/master/notebook.ipynb) which explains the origin of the project and demonstrates the basic method. If you have a "block assignment" file, you can use it now to pull ACS data for your custom geographies. 26 | 27 | Next steps: 28 | 29 | * Package the code in the notebook into a library 30 | * Make it easier to create a "block assignment file" 31 | * Address the caveats below 32 | 33 | ### Caveats and limitations 34 | 35 | * Right now, margin of error is simply disregarded. It would not be too hard to aggregate the margin of error as part of the process, but it somewhat clutters things up by doubling the number of columns. At some point, I think I'd like to add an option to include aggregated MOE. 36 | 37 | * The library is not equipped to aggregate median values such as "median household income". Folks at the LA Times have [done work in this area](https://github.com/datadesk/census-data-aggregator#approximating-medians), but applying it requires a slightly different API than the library currently uses. It's something I'd like to work on, though. [This post to the ACS Data Community](https://acsdatacommunity.prb.org/discussion-forum/f/forum/898/allocating-median-household-income-across-census-boundaries/2290#2290) recommends a more nuanced approach, and cautions against problems that come from assuming a symmetric income distribution. 38 | 39 | * The library is not equipped to aggregate percent values. The ACS Subject tables and Data Profile tables have a mix of "total" and "percent" variables. It's probably possible to aggregate percentages, but I'm not clear on the method. 40 | 41 | * Making a block assignment file is a lot of work, if not just out of reach, for most people. In a project which re-ignited this one, [John Keefe reported out cases](https://johnkeefe.net/chicago-race-and-ethnicity-data-by-police-district) where blocks themselves were split by custom geographies. Other approaches simply assign blocks to whatever custom geography contains their centroid, which should be automatable. A near-term goal is to support creating the block assignment files using centroid assignment. I'd imagined trying to make a web tool to help with the review/assignment, but I'm not sure it's worth the considerable effort, especially for ACS data, which is always imprecise by its survey nature.
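
### A sketch of the allocation step

To make the method above concrete, here is a minimal sketch of the proportional-allocation step. This is not the notebook's actual code: the file names and columns (`block_geoid`, `custom_geoid`, `pop2010`, `bg_geoid`) are assumptions standing in for your own block assignment file and ACS extract.

```python
import pandas as pd

# Hypothetical inputs: a block assignment file (block -> custom geography, with the
# 2010 population of each block) and a table of ACS estimates by block group.
blocks = pd.read_csv("block_assignment.csv", dtype={"block_geoid": str, "custom_geoid": str})
acs_bg = pd.read_csv("acs_by_block_group.csv", dtype={"bg_geoid": str})

# A block's block group is the first 12 characters of its 15-character GEOID.
blocks["bg_geoid"] = blocks["block_geoid"].str[:12]

# Population of each (block group, custom geography) segment, and that segment's
# share of its block group's total population.
seg = blocks.groupby(["bg_geoid", "custom_geoid"], as_index=False)["pop2010"].sum()
seg["weight"] = seg["pop2010"] / seg.groupby("bg_geoid")["pop2010"].transform("sum")

# Allocate every estimate column in proportion to the segment weights, then sum
# the allocated pieces within each custom geography.
estimate_cols = [c for c in acs_bg.columns if c != "bg_geoid"]
merged = seg.merge(acs_bg, on="bg_geoid", how="left")
allocated = merged[estimate_cols].multiply(merged["weight"], axis=0)
allocated["custom_geoid"] = merged["custom_geoid"]
result = allocated.groupby("custom_geoid").sum()
```

If you also want aggregated margins of error (see the caveats above), the `census-data-aggregator` package's `approximate_sum()` implements the Census Bureau's root-sum-of-squares approximation for summed estimates, though combining it with the weighting above would take some care.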
42 | -------------------------------------------------------------------------------- /crosswalks/zip_to_zcta/ZIP_ZCTA_README.md: -------------------------------------------------------------------------------- 1 | # ZIP Code to ZCTA Crosswalk 2 | 3 | While Census Reporter refers to ZIP codes in its interface, it's actually the case that American Community Survey (ACS) data is not available by ZIP code. Instead, the geography we call ZIP Code is a ZIP Code Tabulation Area, or ZCTA. 4 | 5 | While folks commonly think of them as geographic areas, ZIP Codes actually identify a post office which handles delivering the mail to its final destination. While, in many cases, there's an implicit area that contains all of the addresses in that ZIP code, there are other ZIP codes where that doesn't work. There are ZIP codes which are only used for PO Boxes, and others which collect all the mail for a large business or organization that then handles the final delivery. In both of these cases, there's no straightforward way to draw them as an area on a map. 6 | 7 | The key issue is that there are thousands of ZIP codes for which there is no corresponding ZCTA. And while the ACS data is tabulated by ZCTA, not ZIP code, there are other data sources which are at the ZIP Code level, not the ZCTA level, including Census programs like ZIP Code Business Patterns, or ZBP (part of the [County Business Patterns](https://www.census.gov/programs-surveys/cbp.html) program). 8 | 9 | For a project where we wanted to integrate data from the ACS and the ZBP, we needed to come up with [a crosswalk assigning each ZIP Code to a ZCTA](zip_zcta_xref.csv). Since it was a bit of work, we wanted to share it with others who might need it. But, again, ZIP Codes change, so this file may go out of date. So we are also sharing the code and method in case you need to do it again with updated data files. 10 | 11 | ## The data file 12 | 13 | `zip_zcta_xref.csv` provides a crosswalk between ZIP Codes and ZCTAs. It was created using [build_crosswalk.ipynb](build_crosswalk.ipynb), a reimplementation of the method described below. 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
| column | datatype | notes |
| --- | --- | --- |
| zip_code | text | Source: Census Gazetteer, GeoNames or ZIP Code Business Patterns |
| zcta | text | the best available ZCTA for the ZIP Code, or null in a small number of cases |
| source | text | the original source of the ZIP Code in our data processing pipeline. You can probably ignore this. |
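
If you just need to roll ZIP-level figures up to ZCTAs so they can sit alongside ACS data, the join is a simple pandas merge. This is a minimal sketch, not part of our pipeline; the crosswalk columns are as documented above, while the ZBP file name and its `zip` and `est` columns are hypothetical stand-ins for your own data.

```python
import pandas as pd

# The crosswalk documented above: one row per ZIP Code.
xref = pd.read_csv("zip_zcta_xref.csv", dtype=str)

# Hypothetical ZIP-level input, e.g. an extract of ZIP Code Business Patterns
# with a ZIP Code column (`zip`) and an establishment count (`est`).
zbp = pd.read_csv("zbp_totals_by_zip.csv", dtype={"zip": str})

merged = zbp.merge(xref, left_on="zip", right_on="zip_code", how="left")

# A small number of ZIP Codes have no ZCTA in the crosswalk; decide how to handle them.
print(merged["zcta"].isna().sum(), "ZIP Codes could not be assigned to a ZCTA")

# Roll the ZIP-level figures up to ZCTAs, ready to join against ZCTA-level ACS data.
by_zcta = merged.dropna(subset=["zcta"]).groupby("zcta", as_index=False)["est"].sum()
```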
38 | 39 | ## Our method 40 | 41 | Getting data about USPS ZIP codes is not exactly straightforward. The USPS does not provide a simple, free list. We used [a dataset](https://download.geonames.org/export/dump/US.zip) from [GeoNames](https://www.geonames.org/) as our master list. For a comprehensive list of ZCTAs, we used [the ZCTA file](https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip) from the [2017 Gazetteer files](https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.2017.html), although for this version, we use the Census's ZCTA ESRI Shapefile, which is just the Gazetteer plus geographic boundaries. 42 | 43 | To the GeoNames master list, we added ZIP Codes in the ZIP Code Business Patterns data which weren't already in GeoNames. (See what we mean about it being hard without an authoritative master list?) 44 | 45 | We began with the assumption that any ZIP Code which had a corresponding ZCTA (same 5-digit identifier) should be treated as the same as that ZCTA. This is not strictly true: the Census Bureau acknowledges that their method for assigning ZCTAs sometimes results in addresses being placed in a ZCTA that differs from the address's ZIP code. However, we didn't see any way we could feasibly deal with that issue. 46 | 47 | After comparing the ZIP and ZCTA master lists, we identified nearly 8,000 ZIP Codes which do not have matching ZCTAs. 48 | 49 | The GeoNames dataset includes a geocode (latitude/longitude) for each ZIP code. It's not clear how those geocodes were assigned, so this is a leap of faith, but it's the best we had to go-on. We use GIS software to try locating non-ZCTA ZIP Codes in a ZCTA. It’s difficult to estimate what distortion might be introduced by this approach. 50 | 51 | After the GIS analysis, we were left with about 100 ZIP Codes which were in GeoNames, and so had a geocode, but were not located in any ZCTA. There were also a few dozen ZIP codes which appeared in the ZBP dataset but which were not in either GeoNames or the ZCTA gazetteer. These were put to a manual review process. 52 | 53 | The process was thus: we loaded the ZIP Codes which weren't ZCTAs into [this Google Sheets document](https://docs.google.com/spreadsheets/d/1sbf-15PzHTnT6CsUMKcVnmhoHx-wKZ_PR-f_1WS5l5A/edit#gid=1978067583). We added a couple of columns to do Google Map searches: `point map url`, based on the latitude and longitude, if we had them, and `zip map url`, searching Google Maps for the ZIP Code. We included ZIP Codes which were matched to ZCTAs by geocoding (as above), in case we wanted to spot check any of them, but we were focused on those which had no ZCTA. 54 | 55 | For each row, the reviewer would load the point map. If the result was implausible, like many which came up in water, then the reviewer would load the zip code map. 56 | 57 | For plausible maps, the reviewer would right-click on the map to bring up the Google Maps context menu, and choose "What's Here?" If that gave further information that had a ZIP Code, we used it, otherwise we tried "What's Here" for very nearby points on the map until a ZIP Code was found. 58 | 59 | That ZIP Code was placed in the `result` column, which would then generate a link in the `census_reporter_check` column for that row. Clicking on that link would try to open the Census Reporter page for the ZCTA that was entered in the "result" column. If loading the page on Census Reporter errored, that was a sign that the ZIP Code found was not a ZCTA. 
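
For reference, the geocode-based assignment described above (placing each orphan ZIP Code's point inside a ZCTA polygon) can be reproduced with geopandas, which requirements.txt already pins. This is a sketch under assumptions, not our original workflow: the input file, its column names, and the shapefile attribute `ZCTA5CE10` should all be verified against your own copies of the data.

```python
import geopandas as gpd
import pandas as pd

# Hypothetical extract of ZIP Codes with no matching ZCTA, with GeoNames coordinates.
zips = pd.read_csv("unmatched_zips.csv", dtype={"zip_code": str})  # zip_code, latitude, longitude

# TIGER/Line ZCTA shapefile (the 2019 edition is referenced in this repo's .gitignore),
# extracted locally. ZCTA5CE10 is assumed to be the ZCTA identifier attribute in that vintage.
zctas = gpd.read_file("tl_2019_us_zcta510/tl_2019_us_zcta510.shp")

points = gpd.GeoDataFrame(
    zips,
    geometry=gpd.points_from_xy(zips["longitude"], zips["latitude"]),
    crs="EPSG:4326",
)

# Assign each ZIP point to the ZCTA polygon that contains it.
joined = gpd.sjoin(points, zctas.to_crs("EPSG:4326"), how="left", predicate="within")
geocoded = joined[["zip_code", "ZCTA5CE10"]].rename(columns={"ZCTA5CE10": "zcta"})

# Rows where `zcta` is null are the leftovers that went on to the manual review above.
```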
60 | 61 | For ZIPs which could not be resolved to ZCTAs, we used a `notes` column to provide more information. 62 | 63 | After this process, only three ZIP Codes remained unresolved, and for each, plausible explanations were documented in the ‘notes’ column of the Google Sheets document. In the rebuilt process, documented here, we also turned up two new ZIP Codes from the ZBP data which haven't been manually reviewed. 64 | 65 | We're very grateful to Caroline Dudlak, Medill '21, for her human review efforts. 66 | 67 | 68 | ## More info 69 | 70 | * [US Census: ZIP Code Tabulation Areas (ZCTAs)](https://www.census.gov/programs-surveys/geography/guidance/geo-areas/zctas.html) 71 | * [What is the difference between ZIP code "boundaries" and ZCTA areas?](http://gis.washington.edu/phurvitz/zip_or_zcta/index.html) by Phil Hurvitz, University of Washington 72 | * [HUD USPS ZIP Code Crosswalk Files](https://www.huduser.gov/portal/datasets/usps_crosswalk.html) — we found this after we created our own, but it looks like this is regularly updated, and ZIP Codes change fairly frequently, so it is probably a better resource than ours. Registration is required, so that is a bit of friction. 73 | * HRSA [ZIP Code to ZCTA Crosswalk](https://data.hrsa.gov/DataDownload/GeoCareNavigator/ZIP%20Code%20to%20ZCTA%20Crosswalk.xlsx) (linked from [Health Center Program GeoCare Navigator](https://geocarenavigator.hrsa.gov/)) — this is a directly downloadable Excel file provided by the US Dept. of Health and Human Services, and it seems to be updated frequently. 74 | * [ZIP Codes by Area and District codes](https://postalpro.usps.com/ZIP_Locale_Detail), provided by the USPS, may be the most authoritative source. This is another we found after we did our work, and this one requires synthesizing data from different worksheets in an Excel file, but for many data users, that will be straightforward. 75 | -------------------------------------------------------------------------------- /BOUNDARIES.md: -------------------------------------------------------------------------------- 1 | # Sources of Boundary Information 2 | 3 | Often people who want to use a tool like `acs-aggregate` have a bootstrapping problem: where are the GIS files for the areas for which they want to aggregate data? 4 | 5 | While this will probably be hard to keep up to date, and perhaps should be in some form other than markdown in this repository, let's take a stab at building a list. 6 | 7 | Since we started this, Census Reporter launched [a tool](https://censusreporter.org/2020/) which uses this kind of data, and which may list some options which are not included here. 8 | 9 | *Since this is about aggregating Census data, let's stipulate that it should only include polygon data, not points and lines. And, only boundaries relevant to the United States and its territories. 
We'll organize it by state (or state-like.)* 10 | 11 | ## US - United States 12 | 13 | * [Home Owners' Loan Corporation (HOLC) "Redlining" maps](https://dsl.richmond.edu/panorama/redlining/#text=downloads) (also available for many specific cities) 14 | * [US District Court Jurisdictions](https://hifld-geoplatform.opendata.arcgis.com/datasets/us-district-court-jurisdictions) (see also [`COUNTY_DISTRICT_README.md`](crosswalks/judicial_districts/COUNTY_DISTRICT_README.md)) 15 | 16 | ## AL - Alabama 17 | 18 | * [Birmingham Neighborhoods](https://data.birminghamal.gov/dataset/gis-mapping-files/resource/bb378880-fdbb-40a2-89ef-27582adef3bc) 19 | * [Huntsville City Council Districts](https://gis-huntsvilleal.opendata.arcgis.com/datasets/city-council-districts/explore) 20 | 21 | ## CA - California 22 | 23 | * [Los Angeles County, CA Neighborhoods](https://apps.gis.ucla.edu/geodata/dataset/los-angeles-county-neighborhoods) 24 | * [Oakland Community Police Beats](https://data.oaklandca.gov/dataset/Oakland-Community-Police-Beats/tp8r-5gzs) 25 | * [Oakland Council Districts](https://data.oaklandca.gov/City-Government/City-of-Oakland-Council-Districts/g7vb-tiyh) 26 | * [Sacramento, CA City Council Districts](https://data.cityofsacramento.org/datasets/28bd505c8e674a49ba5f782d0d806033_0/about) 27 | * [San Francisco, CA Supervisor Districts (2012 Redistricting)](https://data.sfgov.org/Geographic-Locations-and-Boundaries/Current-Supervisor-Districts/keex-zmn4) 28 | 29 | ## CO - Colorado 30 | 31 | * [Various state-level files](https://demography.dola.colorado.gov/gis/gis-data/) including hospital, library, water, fire protection districts and more 32 | * [Denver "Statistical Neighborhoods"](https://www.denvergov.org/opendata/dataset/city-and-county-of-denver-statistical-neighborhoods) 33 | 34 | 35 | ## DC - Washington, DC 36 | 37 | * [Washington, DC Neighborhood Clusters](https://opendata.dc.gov/datasets/f6c703ebe2534fc3800609a07bad8f5b_17) 38 | 39 | ## FL - Florida 40 | 41 | * [Orlando, FL Neighborhoods](https://orl.hub.arcgis.com/datasets/orlando-political-neighborhoods/explore?location=28.481107%2C-81.342842%2C11.31) 42 | 43 | ## GA - Georgia 44 | 45 | * [Atlanta Neighborhoods](https://dpcd-coaplangis.opendata.arcgis.com/datasets/neighborhood/) 46 | * [Atlanta Neighborhood Planning Units](https://dpcd-coaplangis.opendata.arcgis.com/datasets/npu) (NPUs) 47 | 48 | 49 | ## HI - Hawaii 50 | 51 | * [Honolulu "Realtor Neighborhoods"](https://honolulu-cchnl.opendata.arcgis.com/datasets/neighborhoods-realtor-neighborhoods/) 52 | 53 | ## IL - Illinois 54 | 55 | * [Chicago Neighborhoods](https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Neighborhoods/bbvz-uum9) 56 | * [Chicago Community Areas](https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Community-Areas-current-/cauq-8yn6) 57 | * [Chicago Police Districts](https://data.cityofchicago.org/Public-Safety/Boundaries-Police-Districts/4dt9-88ua) 58 | * [Chicago City Council Wards](https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Wards-2015-/sp34-6z76) 59 | 60 | ## IN - Indiana 61 | 62 | * [Bloomington City Neighborhood Associations](https://data.bloomington.in.gov/dataset/city-neigbhorhoods-gis-data) 63 | 64 | 65 | ## MA - Massachusetts 66 | 67 | * [Boston Neighborhoods](https://data.boston.gov/dataset/boston-neighborhoods) 68 | * [Cambridge CDD (Community Development Department) Neighborhoods](https://www.cambridgema.gov/GIS/gisdatadictionary/Boundary/BOUNDARY_CDDNeighborhoods) 69 | 70 | ## 
MD - Maryland 71 | 72 | * [Baltimore Neighborhoods](https://data.baltimorecity.gov/datasets/baltimore::neighborhoods/about) 73 | 74 | 75 | ## MI - Michigan 76 | 77 | * [Detroit neighborhood boundaries](https://data.detroitmi.gov/datasets/neighborhoods) 78 | 79 | ## MN - Minnesota 80 | 81 | * [Minneapolis, MN Neighborhoods](https://opendata.minneapolismn.gov/datasets/cityoflakes::minneapolis-neighborhoods/about) 82 | 83 | ## MO - Missouri 84 | 85 | * [Kansas City neighborhood boundaries](https://data.kcmo.org/Neighborhoods/Kansas-City-Neighborhood-Boundaries/q45j-ejyk) 86 | * [St. Louis city shapefile data](https://www.stlouis-mo.gov/data/formats/format.cfm?id=21) (includes ward and neighborhood boundaries as well as other divisions) 87 | 88 | ## NY - New York 89 | 90 | * [New York City Community Districts](https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4) 91 | * [New York City Council Districts](https://data.cityofnewyork.us/City-Government/City-Council-Districts/yusd-j4xi) 92 | * [New York City Election Districts](https://data.cityofnewyork.us/City-Government/Election-Districts/h2n3-98hq) 93 | * [New York City Neighborhood Tabulation Areas (NTAs)](https://data.cityofnewyork.us/City-Government/NTA-map/d3qk-pfyz) (note that an [NTA-census tract crosswalk](https://www1.nyc.gov/assets/planning/download/office/data-maps/nyc-population/census2010/nyc2010census_tabulation_equiv.xlsx) is available) 94 | * [New York City School Districts](https://data.cityofnewyork.us/Education/School-Districts/r8nu-ymqj) (to the Census, NYC is one big district) 95 | * [Syracuse, NY Common Council Districts](https://data.syrgov.net/datasets/881e71dbbea84de28b3fb0b840bc2067_0/explore?location=43.035052%2C-76.139450%2C13.41) 96 | 97 | ## OH - Ohio 98 | 99 | * [Cleveland City Neighborhoods (Statistical Planning Areas)](https://planning.clevelandohio.gov/maps/OpenData.php) 100 | 101 | ## OK - Oklahoma 102 | 103 | * [Oklahoma City Neighborhood Associations](https://data.okc.gov/portal/page/viewer?datasetName=Neighborhood%20Associations&view=map) 104 | * [Oklahoma City City Council Wards](https://data.okc.gov/portal/page/viewer?datasetName=City%20Council%20Wards&view=map) 105 | 106 | ## OR - Oregon 107 | 108 | * [Portland Neighborhood Association Boundaries](https://hub.arcgis.com/datasets/1ef75e34b8504ab9b14bef0c26cade2c_3) 109 | 110 | ## PA - Pennsylvania 111 | 112 | * [Philadelphia Neighborhoods](https://www.opendataphilly.org/dataset/philadelphia-neighborhoods) 113 | * [Pittsburgh Neighborhoods](https://data.wprdc.org/dataset/neighborhoods2) 114 | 115 | ## RI - Rhode Island 116 | 117 | * [Providence Neighborhoods](https://pvdgis.maps.arcgis.com/home/item.html?id=368395369304497090ddb33f5636da87) 118 | * [Providence Wards](https://pvdgis.maps.arcgis.com/home/item.html?id=36468e873abd482ba89aa58be9613ce0) 119 | 120 | ## TX - Texas 121 | 122 | * [Houston "Super Neighborhoods"](https://cohgis-mycity.opendata.arcgis.com/datasets/c3bfee99cbc14a899e4a603ee73203ee_3/) 123 | 124 | ## VA - Virginia 125 | 126 | * [Richmond, VA neighborhoods](https://data.richmondgov.com/Unique-and-Inclusive-Neighborhoods/Neighborhoods/e9k6-65id) 127 | 128 | ## WA - Washington 129 | 130 | * [Seattle Community Reporting Areas](http://data-seattlecitygis.opendata.arcgis.com/datasets/community-reporting-areas) 131 | * [Seattle "City Clerk" Neighborhoods](http://data-seattlecitygis.opendata.arcgis.com/datasets/city-clerk-neighborhoods) 132 | * [Seattle Council 
Districts](http://data-seattlecitygis.opendata.arcgis.com/datasets/council-districts) 133 | * [Spokane, WA Neighborhoods](https://data-spokane.opendata.arcgis.com/datasets/neighborhood-1/explore) 134 | 135 | --- 136 | If you aren't finding what you're looking for above, here are some other resources which haven't been fully explored yet: 137 | 138 | 139 | * While it may or may not be current, the [GitHub repo](https://github.com/codeforgermany/click_that_hood/tree/main/public/data) for [Click That Hood](http://click-that-hood.com/) is worth a look if you don't find what you're looking for here -- and it goes far beyond the US as well. 140 | * The Big Ten Academic Alliance has a [Geoportal](https://geo.btaa.org/) with links to [geodata for a number of US municipalities](https://geo.btaa.org/?f%5Bdc_subject_sm%5D%5B%5D=Municipalities+geospatial+data) 141 | * For a while, Zillow offered neighborhood maps that they pulled together for their service. They no longer provide it, but I came across [this site](https://mapcruzin.com/free-download-neighborhood-boundary-shapefiles.htm) which seems to have archived them. They're organized by state, but for many states, there are only neighborhoods for a single city. 142 | * [Koordinates.com](https://Koordinates.com) is a geospatial data management platform which has aggregated GIS data from diverse sources. At the time of this writing, [a search for 'neighborhood'](https://koordinates.com/search/?q=neighborhood) gets well over 500 hits. 143 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4; python_version >= "3.7" \ 2 | --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 \ 3 | --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 4 | attrs==21.4.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7" \ 5 | --hash=sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4 \ 6 | --hash=sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd 7 | beautifulsoup4==4.11.1; python_full_version >= "3.6.0" and python_version >= "3.7" \ 8 | --hash=sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30 \ 9 | --hash=sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693 10 | cenpy==1.0.0.post4; python_version >= "3.5" \ 11 | --hash=sha256:43d24ffbff6d1c2879a05499f2ac0776f10803524d466614cdec2cd3c9e9ff20 12 | census-data-aggregator==0.0.6 \ 13 | --hash=sha256:4443165f9e9fc00becb346e7af58868b4a0f80c77b3ca8eb1f468e35bf920f52 \ 14 | --hash=sha256:cf527a1378aebe688584f5828a403778f73be5a74b2f04b9a13edc65c57db49e 15 | certifi==2021.10.8; python_version >= "3.8" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.8" \ 16 | --hash=sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569 \ 17 | --hash=sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872 18 | charset-normalizer==2.0.12; python_full_version >= "3.6.0" and python_version >= "3.7" \ 19 | --hash=sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597 \ 20 | --hash=sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df 21 | click-plugins==1.1.1; python_version >= "3.7" \ 22 | --hash=sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b \ 23 | 
--hash=sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8 24 | click==8.1.3; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version < "4" and python_version >= "3.7" \ 25 | --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 \ 26 | --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e 27 | cligj==0.7.2; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version < "4" and python_version >= "3.7" \ 28 | --hash=sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df \ 29 | --hash=sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27 30 | colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and platform_system == "Windows" or platform_system == "Windows" and python_version >= "3.7" and python_full_version >= "3.5.0" \ 31 | --hash=sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2 \ 32 | --hash=sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b 33 | fiona==1.8.21; python_version >= "3.7" \ 34 | --hash=sha256:39c656421e25b4d0d73d0b6acdcbf9848e71f3d9b74f44c27d2d516d463409ae \ 35 | --hash=sha256:43b1d2e45506e56cf3a9f59ba5d6f7981f3f75f4725d1e6cb9a33ba856371ebd \ 36 | --hash=sha256:315e186cb880a8128e110312eb92f5956bbc54d7152af999d3483b463758d6f9 \ 37 | --hash=sha256:9fb2407623c4f44732a33b3f056f8c58c54152b51f0324bf8f10945e711eb549 \ 38 | --hash=sha256:b69054ed810eb7339d7effa88589afca48003206d7627d0b0b149715fc3fde41 \ 39 | --hash=sha256:11532ccfda1073d3f5f558e4bb78d45b268e8680fd6e14993a394c564ddbd069 \ 40 | --hash=sha256:3789523c811809a6e2e170cf9c437631f959f4c7a868f024081612d30afab468 \ 41 | --hash=sha256:085f18d943097ac3396f3f9664ac1ae04ad0ff272f54829f03442187f01b6116 \ 42 | --hash=sha256:388acc9fa07ba7858d508dfe826d4b04d813818bced16c4049de19cc7ca322ef \ 43 | --hash=sha256:40b4eaf5b88407421d6c9e707520abd2ff16d7cd43efb59cd398aa41d2de332c \ 44 | --hash=sha256:3a0edca2a7a070db405d71187214a43d2333a57b4097544a3fcc282066a58bfc 45 | fuzzywuzzy==0.18.0; python_version >= "3.5" \ 46 | --hash=sha256:928244b28db720d1e0ee7587acf660ea49d7e4c632569cad4f1cd7e68a5f0993 \ 47 | --hash=sha256:45016e92264780e58972dca1b3d939ac864b78437422beecebb3095f8efd00e8 48 | geopandas==0.10.2; python_version >= "3.7" \ 49 | --hash=sha256:1722853464441b603d9be3d35baf8bde43831424a891e82a8545eb8997b65d6c \ 50 | --hash=sha256:efbf47e70732e25c3727222019c92b39b2e0a66ebe4fe379fbe1aa43a2a871db 51 | idna==3.3; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7" \ 52 | --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff \ 53 | --hash=sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d 54 | jinja2==3.1.2; python_version >= "3.7" \ 55 | --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61 \ 56 | --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 57 | libpysal==4.6.2; python_version >= "3.7" \ 58 | --hash=sha256:dfb30f4ad8c882492571120487b246fbad19370bc9bb2bbc77c89d0fcddb0792 \ 59 | --hash=sha256:8a4c4651394aefc6332f2fb1f38336c559e50dc89f977bfaa3d8541610eaa634 60 | markupsafe==2.1.1; python_version >= "3.7" \ 61 | --hash=sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812 \ 62 | --hash=sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a \ 63 | 
--hash=sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e \ 64 | --hash=sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5 \ 65 | --hash=sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4 \ 66 | --hash=sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f \ 67 | --hash=sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e \ 68 | --hash=sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933 \ 69 | --hash=sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6 \ 70 | --hash=sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417 \ 71 | --hash=sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02 \ 72 | --hash=sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a \ 73 | --hash=sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37 \ 74 | --hash=sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980 \ 75 | --hash=sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a \ 76 | --hash=sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3 \ 77 | --hash=sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a \ 78 | --hash=sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff \ 79 | --hash=sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a \ 80 | --hash=sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452 \ 81 | --hash=sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003 \ 82 | --hash=sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1 \ 83 | --hash=sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601 \ 84 | --hash=sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925 \ 85 | --hash=sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f \ 86 | --hash=sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88 \ 87 | --hash=sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63 \ 88 | --hash=sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1 \ 89 | --hash=sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7 \ 90 | --hash=sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a \ 91 | --hash=sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f \ 92 | --hash=sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6 \ 93 | --hash=sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77 \ 94 | --hash=sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603 \ 95 | --hash=sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7 \ 96 | --hash=sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135 \ 97 | --hash=sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96 \ 98 | --hash=sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c \ 99 | --hash=sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247 \ 100 | --hash=sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b 101 | munch==2.5.0; python_version >= "3.7" \ 102 | --hash=sha256:6f44af89a2ce4ed04ff8de41f70b226b984db10a91dcc7b9ac2efc1c77022fdd \ 103 | --hash=sha256:2d735f6f24d4dba3417fa448cae40c6e896ec1fdab6cdb5e6510999758a4dbd2 104 | numpy==1.22.3 \ 105 | 
--hash=sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75 \ 106 | --hash=sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab \ 107 | --hash=sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e \ 108 | --hash=sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4 \ 109 | --hash=sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430 \ 110 | --hash=sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4 \ 111 | --hash=sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce \ 112 | --hash=sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe \ 113 | --hash=sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5 \ 114 | --hash=sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1 \ 115 | --hash=sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62 \ 116 | --hash=sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676 \ 117 | --hash=sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123 \ 118 | --hash=sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802 \ 119 | --hash=sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d \ 120 | --hash=sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168 \ 121 | --hash=sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa \ 122 | --hash=sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a \ 123 | --hash=sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f \ 124 | --hash=sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18 125 | packaging==21.3; python_version >= "3.7" \ 126 | --hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 \ 127 | --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb 128 | pandas==1.4.2; python_version >= "3.8" \ 129 | --hash=sha256:be67c782c4f1b1f24c2f16a157e12c2693fd510f8df18e3287c77f33d124ed07 \ 130 | --hash=sha256:5a206afa84ed20e07603f50d22b5f0db3fb556486d8c2462d8bc364831a4b417 \ 131 | --hash=sha256:0010771bd9223f7afe5f051eb47c4a49534345dfa144f2f5470b27189a4dd3b5 \ 132 | --hash=sha256:3228198333dd13c90b6434ddf61aa6d57deaca98cf7b654f4ad68a2db84f8cfe \ 133 | --hash=sha256:5b79af3a69e5175c6fa7b4e046b21a646c8b74e92c6581a9d825687d92071b51 \ 134 | --hash=sha256:5586cc95692564b441f4747c47c8a9746792e87b40a4680a2feb7794defb1ce3 \ 135 | --hash=sha256:061609334a8182ab500a90fe66d46f6f387de62d3a9cb9aa7e62e3146c712167 \ 136 | --hash=sha256:b8134651258bce418cb79c71adeff0a44090c98d955f6953168ba16cc285d9f7 \ 137 | --hash=sha256:df82739e00bb6daf4bba4479a40f38c718b598a84654cbd8bb498fd6b0aa8c16 \ 138 | --hash=sha256:385c52e85aaa8ea6a4c600a9b2821181a51f8be0aee3af6f2dcb41dafc4fc1d0 \ 139 | --hash=sha256:295872bf1a09758aba199992c3ecde455f01caf32266d50abc1a073e828a7b9d \ 140 | --hash=sha256:95c1e422ced0199cf4a34385ff124b69412c4bc912011ce895582bee620dfcaa \ 141 | --hash=sha256:5c54ea4ef3823108cd4ec7fb27ccba4c3a775e0f83e39c5e17f5094cb17748bc \ 142 | --hash=sha256:c072c7f06b9242c855ed8021ff970c0e8f8b10b35e2640c657d2a541c5950f59 \ 143 | --hash=sha256:f549097993744ff8c41b5e8f2f0d3cbfaabe89b4ae32c8c08ead6cc535b80139 \ 144 | --hash=sha256:ff08a14ef21d94cdf18eef7c569d66f2e24e0bc89350bcd7d243dd804e3b5eb2 \ 145 | --hash=sha256:8c5bf555b6b0075294b73965adaafb39cf71c312e38c5935c93d78f41c19828a \ 146 | 
--hash=sha256:51649ef604a945f781105a6d2ecf88db7da0f4868ac5d45c51cb66081c4d9c73 \ 147 | --hash=sha256:d0d4f13e4be7ce89d7057a786023c461dd9370040bdb5efa0a7fe76b556867a0 \ 148 | --hash=sha256:09d8be7dd9e1c4c98224c4dfe8abd60d145d934e9fc1f5f411266308ae683e6a \ 149 | --hash=sha256:92bc1fc585f1463ca827b45535957815b7deb218c549b7c18402c322c7549a12 150 | pyparsing==3.0.8; python_full_version >= "3.6.8" and python_version >= "3.7" \ 151 | --hash=sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06 \ 152 | --hash=sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954 153 | pyproj==3.3.1; python_version >= "3.8" \ 154 | --hash=sha256:473961faef7a9fd723c5d432f65220ea6ab3854e606bf84b4d409a75a4261c78 \ 155 | --hash=sha256:2fef9c1e339f25c57f6ae0558b5ab1bbdf7994529a30d8d7504fc6302ea51c03 \ 156 | --hash=sha256:140fa649fedd04f680a39f8ad339799a55cb1c49f6a84e1b32b97e49646647aa \ 157 | --hash=sha256:b59c08aea13ee428cf8a919212d55c036cc94784805ed77c8f31a4d1f541058c \ 158 | --hash=sha256:1adc9ccd1bf04998493b6a2e87e60656c75ab790653b36cfe351e9ef214828ed \ 159 | --hash=sha256:42eea10afc750fccd1c5c4ba56de29ab791ab4d83c1f7db72705566282ac5396 \ 160 | --hash=sha256:531ea36519fa7b581466d4b6ab32f66ae4dadd9499d726352f71ee5e19c3d1c5 \ 161 | --hash=sha256:67025e37598a6bbed2c9c6c9e4c911f6dd39315d3e1148ead935a5c4d64309d5 \ 162 | --hash=sha256:aed1a3c0cd4182425f91b48d5db39f459bc2fe0d88017ead6425a1bc85faee33 \ 163 | --hash=sha256:3cc4771403db54494e1e55bca8e6d33cde322f8cf0ed39f1557ff109c66d2cd1 \ 164 | --hash=sha256:c99f7b5757a28040a2dd4a28c9805fdf13eef79a796f4a566ab5cb362d10630d \ 165 | --hash=sha256:5dac03d4338a4c8bd0f69144c527474f517b4cbd7d2d8c532cd8937799723248 \ 166 | --hash=sha256:56b0f9ee2c5b2520b18db30a393a7b86130cf527ddbb8c96e7f3c837474a9d79 \ 167 | --hash=sha256:5f92d8f6514516124abb714dce912b20867831162cfff9fae2678ef07b6fcf0f \ 168 | --hash=sha256:1ef1bfbe2dcc558c7a98e2f1836abdcd630390f3160724a6f4f5c818b2be0ad5 \ 169 | --hash=sha256:5ca5f32b56210429b367ca4f9a57ffe67975c487af82e179a24370879a3daf68 \ 170 | --hash=sha256:aba199704c824fb84ab64927e7bc9ef71e603e483130ec0f7e09e97259b8f61f \ 171 | --hash=sha256:120d45ed73144c65e9677dc73ba8a531c495d179dd9f9f0471ac5acc02d7ac4b \ 172 | --hash=sha256:52efb681647dfac185cc655a709bc0caaf910031a0390f816f5fc8ce150cbedc \ 173 | --hash=sha256:5ab0d6e38fda7c13726afacaf62e9f9dd858089d67910471758afd9cb24e0ecd \ 174 | --hash=sha256:45487942c19c5a8b09c91964ea3201f4e094518e34743cae373889a36e3d9260 \ 175 | --hash=sha256:797ad5655d484feac14b0fbb4a4efeaac0cf780a223046e2465494c767fd1c3b \ 176 | --hash=sha256:b3d8e14d91cc95fb3dbc03a9d0588ac58326803eefa5bbb0978d109de3304fbe 177 | python-dateutil==2.8.2; python_version >= "3.8" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.8" \ 178 | --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ 179 | --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 180 | python-levenshtein==0.12.2 \ 181 | --hash=sha256:dc2395fbd148a1ab31090dd113c366695934b9e85fe5a4b2a032745efd0346f6 182 | pytz==2022.1; python_version >= "3.8" \ 183 | --hash=sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c \ 184 | --hash=sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7 185 | requests==2.27.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.6.0") \ 186 | --hash=sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d \ 187 | 
--hash=sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61 188 | rtree==1.0.0; python_version >= "3.7" \ 189 | --hash=sha256:757bbf9ca38c241e34812a646f16ffda2cabd535bcd815041b83fe091df7a85c \ 190 | --hash=sha256:fe3954a51d691d3938cbac42ac97f4acacbea8ea622a375df901318a5c4ab0e9 \ 191 | --hash=sha256:24185f39b277aaca0566284858de02edc80dc7b120233be38fcf3b4c7d2e72dc \ 192 | --hash=sha256:b2110fb8675bf809bba431a1876ba76ca5dde829a4de40aa7851941452a01278 \ 193 | --hash=sha256:b0256ed9c27037892bcb7167e7f5c469ee7c5de38c5a895145e33c320584babe \ 194 | --hash=sha256:7f2c0bd3e7d4b68cc27ab605b18487440427d5febba5f4b747b694f9de601c6f \ 195 | --hash=sha256:c2b14f7603576b73a5e0fd2e35394db08c5ca3cfa41e4c8530128d91e5e43dd3 \ 196 | --hash=sha256:973ce22ee8bafa44b3df24c6bf78012e534e1f36103e0bbfbb193ec48e9be22a \ 197 | --hash=sha256:55b771e62b1e391a44776ef9f906944796213cc3cb48ffd6b22493684c68a859 \ 198 | --hash=sha256:0475b2e7fe813c427ceb21e57c22f8b4b7fee6e5966db8a200688163d4853f14 \ 199 | --hash=sha256:e436d8da7527655fd0512dd6a5218f604a3806849f3981ec0ca64930dc19b7f2 \ 200 | --hash=sha256:8d18efe4e69f6b7daee9aaced21e0218786209d55235c909c78dbc5c12368790 \ 201 | --hash=sha256:728cf9b774ed6f120f2ed072082431c14af8243d477656b5b7dc1ff855fe7786 \ 202 | --hash=sha256:3e28303d84f8b5509e26db7c2aa533692a6112a430cc955a7a7e6d899c9d5996 \ 203 | --hash=sha256:062439d3a33d95281445960af76b6189b987cda0803fdc1818e31b68bce989d1 \ 204 | --hash=sha256:0ab0dccff665389329f8d2e623131a1af3ab82b6de570f8c494a429c129f3e65 \ 205 | --hash=sha256:44df5adc12841b94adcbc4e5aaada248e98a4dc2017c8c7060f9a782ef63e050 \ 206 | --hash=sha256:29a1a4452e334eaf3299c8b95f137a2ccafbccfd856041f612ec933eeafb2cf5 \ 207 | --hash=sha256:efdaf7137303af7a85ddd224bacdb27f9f7ece99e0dec627c900e12f22cdefd0 \ 208 | --hash=sha256:264e3b255a1fc6aaa2ddbcedfc15ac40578433f6b35a0c7aaba026215d91d8c3 \ 209 | --hash=sha256:26b2275ebc738cb6a0473c15d80fdfe820ef319015009f8f0789e586552cf411 \ 210 | --hash=sha256:825c1f74a84e9857657c04503c4c50b9f170114183fa2db9211a5d8650cf1ffa \ 211 | --hash=sha256:a91d7b514210ae93029c2a7ed83b2595ca73de5e08a9d87fcdf3a784a7b3ef54 \ 212 | --hash=sha256:0ffaa03d1f7e8291de7cd8a11f92e10579f145dc3a08cd46a9eea65cc7b42173 \ 213 | --hash=sha256:4f2f93c997de551a1a0fa4065e713270ad9a509aeeb143c5b46f332c0759f314 \ 214 | --hash=sha256:a48f46dbb6ab0cb135a43d90529e1fa09a6dd80149a34844f2adf8414b4ab71a \ 215 | --hash=sha256:171aa361b3542bf1e47bdee54c611644bb33d35502e2ceea57ac89cf35330554 \ 216 | --hash=sha256:bc18d4df3edb3b889b177ba39238770afdb5787fb803677c3aadea42a6931485 \ 217 | --hash=sha256:bc6e7384684a260eb2f04fcac64ca5ffe28876132a11d1a883db2a5db8becb64 \ 218 | --hash=sha256:d0483482121346b093b9a42518d40f921adf445915b7aea307eb26768c839682 219 | scipy==1.6.1; python_version >= "3.7" \ 220 | --hash=sha256:a15a1f3fc0abff33e792d6049161b7795909b40b97c6cc2934ed54384017ab76 \ 221 | --hash=sha256:e79570979ccdc3d165456dd62041d9556fb9733b86b4b6d818af7a0afc15f092 \ 222 | --hash=sha256:a423533c55fec61456dedee7b6ee7dce0bb6bfa395424ea374d25afa262be261 \ 223 | --hash=sha256:33d6b7df40d197bdd3049d64e8e680227151673465e5d85723b3b8f6b15a6ced \ 224 | --hash=sha256:6725e3fbb47da428794f243864f2297462e9ee448297c93ed1dcbc44335feb78 \ 225 | --hash=sha256:5fa9c6530b1661f1370bcd332a1e62ca7881785cc0f80c0d559b636567fab63c \ 226 | --hash=sha256:bd50daf727f7c195e26f27467c85ce653d41df4358a25b32434a50d8870fc519 \ 227 | --hash=sha256:f46dd15335e8a320b0fb4685f58b7471702234cba8bb3442b69a3e1dc329c345 \ 228 | 
--hash=sha256:0e5b0ccf63155d90da576edd2768b66fb276446c371b73841e3503be1d63fb5d \ 229 | --hash=sha256:2481efbb3740977e3c831edfd0bd9867be26387cacf24eb5e366a6a374d3d00d \ 230 | --hash=sha256:68cb4c424112cd4be886b4d979c5497fba190714085f46b8ae67a5e4416c32b4 \ 231 | --hash=sha256:5f331eeed0297232d2e6eea51b54e8278ed8bb10b099f69c44e2558c090d06bf \ 232 | --hash=sha256:0c8a51d33556bf70367452d4d601d1742c0e806cd0194785914daf19775f0e67 \ 233 | --hash=sha256:83bf7c16245c15bc58ee76c5418e46ea1811edcc2e2b03041b804e46084ab627 \ 234 | --hash=sha256:794e768cc5f779736593046c9714e0f3a5940bc6dcc1dba885ad64cbfb28e9f0 \ 235 | --hash=sha256:5da5471aed911fe7e52b86bf9ea32fb55ae93e2f0fac66c32e58897cfb02fa07 \ 236 | --hash=sha256:8e403a337749ed40af60e537cc4d4c03febddcc56cd26e774c9b1b600a70d3e4 \ 237 | --hash=sha256:a5193a098ae9f29af283dcf0041f762601faf2e595c0db1da929875b7570353f \ 238 | --hash=sha256:c4fceb864890b6168e79b0e714c585dbe2fd4222768ee90bc1aa0f8218691b11 239 | shapely==1.8.2; python_version >= "3.7" \ 240 | --hash=sha256:7c9e3400b716c51ba43eea1678c28272580114e009b6c78cdd00c44df3e325fa \ 241 | --hash=sha256:ce0b5c5f7acbccf98b3460eecaa40e9b18272b2a734f74fcddf1d7696e047e95 \ 242 | --hash=sha256:3a40bf497b57a6625b83996aed10ce2233bca0e5471b8af771b186d681433ac5 \ 243 | --hash=sha256:6bdc7728f1e5df430d8c588661f79f1eed4a2728c8b689e12707cfec217f68f8 \ 244 | --hash=sha256:a60861b5ca2c488ebcdc706eca94d325c26d1567921c74acc83df5e6913590c7 \ 245 | --hash=sha256:840be3f27a1152851c54b968f2e12d718c9f13b7acd51c482e58a70f60f29e31 \ 246 | --hash=sha256:c60f3758212ec480675b820b13035dda8af8f7cc560d2cc67999b2717fb8faef \ 247 | --hash=sha256:56413f7d32c70b63f239eb0865b24c0c61029e38757de456cc4ab3c416559a0b \ 248 | --hash=sha256:256bdf8080bb7bb504d47b2c76919ecebab9708cc1b26266b3ec32b42448f642 \ 249 | --hash=sha256:c0a0d7752b145343838bd36ed09382d85f5befe426832d7384c5b051c147acbd \ 250 | --hash=sha256:62056e64b12b6d483d79f8e34bf058d2fe734d51c9227c1713705399434eff3b \ 251 | --hash=sha256:8e3ed52a081da58eb4a885c157c594876633dbd4eb283f13ba5bf39c82322d76 \ 252 | --hash=sha256:7c8eda45085ccdd7f9805ea4a93fdd5eb0b6039a61d5f0cefb960487e6dc17a1 \ 253 | --hash=sha256:beee3949ddf381735049cfa6532fb234d5d20a5be910c4f2fb7c7295fd7960e3 \ 254 | --hash=sha256:e07b0bd2a0e61a8afd4d1c1bd23f3550b711f01274ffb53de99358fd781eefd8 \ 255 | --hash=sha256:78966332a89813b237de357a03f612fd451a871fe6e26c12b6b71645fe8eee39 \ 256 | --hash=sha256:8fe641f1f61b3d43dd61b5a85d2ef023e6e19bf8f204a5160a1cb1ec645cbc09 \ 257 | --hash=sha256:cec89a5617c0137f4678282e983c3d63bf838fb00cdf318cc555b4d8409f7130 \ 258 | --hash=sha256:68c8e18dc9dc8a198c3addc8c9596f64137101f566f04b96ecfca0b214cb8b12 \ 259 | --hash=sha256:f12695662c3ad1e6031b3de98f191963d0f09de6d1a4988acd907405644032ba \ 260 | --hash=sha256:15a856fbb588ad5d042784e00918c662902776452008c771ecba2ff615cd197a \ 261 | --hash=sha256:d74de394684d66e25e780b0359fda85be7766af85940fa2dfad728b1a815c71f \ 262 | --hash=sha256:d3f3fac625690f01f35af665649e993f15f924e740b5c0ac0376900655815521 \ 263 | --hash=sha256:1d95842cc6bbbeab673061b63e70b07be9a375c15a60f4098f8fbd29f43af1b4 \ 264 | --hash=sha256:a58e1f362f2091743e5e13212f5d5d16251a4bb63dd0ed587c652d3be9620d3a \ 265 | --hash=sha256:5254240eefc44139ab0d128faf671635d8bdd9c23955ee063d4d6b8f20073ae0 \ 266 | --hash=sha256:75042e8039c79dd01f102bb288beace9dc2f49fc44a2dea875f9b697aa8cd30d \ 267 | --hash=sha256:0c0fd457ce477b1dced507a72f1e2084c9191bfcb8a1e09886990ebd02acf024 \ 268 | --hash=sha256:6fcb28836ae93809de1dde73c03c9c24bab0ba2b2bf419ddb2aeb72c96d110e9 \ 269 | 
--hash=sha256:44d2832c1b706bf43101fda92831a083467cc4b4923a7ed17319ab599c1025d8 \ 270 | --hash=sha256:137f1369630408024a62ff79a437a5657e6c5b76b9cd352dde704b425acdb298 \ 271 | --hash=sha256:2e02da2e988e74d61f15c720f9f613fab51942aae2dfeacdcb78eadece00e1f3 \ 272 | --hash=sha256:3423299254deec075e79fb7dc7909d702104e4167149de7f45510c3a6342eeea \ 273 | --hash=sha256:572af9d5006fd5e3213e37ee548912b0341fb26724d6dc8a4e3950c10197ebb6 274 | six==1.16.0; python_version >= "3.8" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.8" \ 275 | --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 \ 276 | --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 277 | soupsieve==2.3.2.post1; python_full_version >= "3.6.0" and python_version >= "3.7" \ 278 | --hash=sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759 \ 279 | --hash=sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d 280 | urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.7" \ 281 | --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 \ 282 | --hash=sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e 283 | -------------------------------------------------------------------------------- /crosswalks/judicial_districts/population_by_district.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Computing Total Population of each Judicial District\n", 8 | "\n", 9 | "This notebook demonstrates using the `county_district_xref.csv` crosswalk to aggregate data from the American Community Survey by Federal Judicial District." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import cenpy # https://pypi.org/project/cenpy/ \n", 20 | "import census_data_aggregator # https://pypi.org/project/census-data-aggregator/" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "First, let's get the population for all counties from the most recent ACS" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "acs = cenpy.products.APIConnection('ACSDT5Y2018')\n", 37 | "county_pop = acs.query(cols=['GEO_ID','B01003_001E','B01003_001M'],geo_unit='county')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
\n", 49 | "\n", 62 | "\n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | "
GEO_IDB01003_001EB01003_001Mstatecounty
00500000US2815147086-55555555528151
10500000US2811112028-55555555528111
20500000US280198321-55555555528019
30500000US2805723480-55555555528057
40500000US2801510129-55555555528015
\n", 116 | "
" 117 | ], 118 | "text/plain": [ 119 | " GEO_ID B01003_001E B01003_001M state county\n", 120 | "0 0500000US28151 47086 -555555555 28 151\n", 121 | "1 0500000US28111 12028 -555555555 28 111\n", 122 | "2 0500000US28019 8321 -555555555 28 019\n", 123 | "3 0500000US28057 23480 -555555555 28 057\n", 124 | "4 0500000US28015 10129 -555555555 28 015" 125 | ] 126 | }, 127 | "execution_count": 3, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "county_pop.head()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "We wanted to be responsible and handle the margin of error correctly, but our sanity check above shows a number of `-555555555` values. According to the Census Bureau's [Notes on ACS 5-Year Data\n", 141 | "](https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html), \n", 142 | "> A '*****' entry in the margin of error column indicates that the estimate is controlled. A statistical test for sampling variability is not appropriate.\n", 143 | "\n", 144 | "That `*****` would be in the \"margin of error annotation\" column, which we didn't get, but the corresponding value for the \"margin of error estimate\" (`B01003_001M`) is `-555555555` so... in short, maybe we don't need to deal with the MOE? Let's take another look:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | "
GEO_IDB01003_001Estatecounty
B01003_001M
-5555555553104310431043104
1002222
1022222
1044444
1052222
...............
903333
922222
952222
962222
971111
\n", 266 | "

64 rows × 4 columns

\n", 267 | "
" 268 | ], 269 | "text/plain": [ 270 | " GEO_ID B01003_001E state county\n", 271 | "B01003_001M \n", 272 | "-555555555 3104 3104 3104 3104\n", 273 | "100 2 2 2 2\n", 274 | "102 2 2 2 2\n", 275 | "104 4 4 4 4\n", 276 | "105 2 2 2 2\n", 277 | "... ... ... ... ...\n", 278 | "90 3 3 3 3\n", 279 | "92 2 2 2 2\n", 280 | "95 2 2 2 2\n", 281 | "96 2 2 2 2\n", 282 | "97 1 1 1 1\n", 283 | "\n", 284 | "[64 rows x 4 columns]" 285 | ] 286 | }, 287 | "execution_count": 4, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "county_pop.groupby('B01003_001M').count()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Yep, almost all of the rows indicate that we shouldn't or can't take the margin of error into account, so now all we need to do is sum the values." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 5, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "county_pop = county_pop.rename(columns={'B01003_001E': 'total_pop'}).set_index('GEO_ID')\n", 310 | "\n", 311 | "xref = pd.read_csv('county_district_xref.csv',index_col='geoid', usecols=['geoid','state', 'district'])\n", 312 | "joined = xref.join(county_pop[['total_pop']].astype(int))" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 6, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | "
total_pop
statedistrict
AlabamaMiddle1151252
Northern2870454
Southern842974
AlaskaAlaska738516
ArizonaArizona6946685
.........
West VirginiaNorthern869001
Southern960053
WisconsinEastern3405147
Western2373247
WyomingWyoming581836
\n", 404 | "

91 rows × 1 columns

\n", 405 | "
" 406 | ], 407 | "text/plain": [ 408 | " total_pop\n", 409 | "state district \n", 410 | "Alabama Middle 1151252\n", 411 | " Northern 2870454\n", 412 | " Southern 842974\n", 413 | "Alaska Alaska 738516\n", 414 | "Arizona Arizona 6946685\n", 415 | "... ...\n", 416 | "West Virginia Northern 869001\n", 417 | " Southern 960053\n", 418 | "Wisconsin Eastern 3405147\n", 419 | " Western 2373247\n", 420 | "Wyoming Wyoming 581836\n", 421 | "\n", 422 | "[91 rows x 1 columns]" 423 | ] 424 | }, 425 | "execution_count": 6, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "joined.groupby(['state', 'district'])[['total_pop']].sum()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 7, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "joined.groupby(['state', 'district'])[['total_pop']].sum().to_csv('population_by_district_acs2018_5yr.csv')" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 55, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/plain": [ 451 | "total_moe 567.2\n", 452 | "nh_white_moe 2622.7\n", 453 | "nh_black_moe 4705.5\n", 454 | "nh_amerind_moe 1834.2\n", 455 | "nh_asian_moe 4833.5\n", 456 | "nh_nhpi_moe 2184.8\n", 457 | "nh_some_other_moe 3311.7\n", 458 | "nh_twoplus_moe 6888.9\n", 459 | "hispanic_moe 2001.3\n", 460 | "dtype: float64" 461 | ] 462 | }, 463 | "execution_count": 55, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "race_by_district_base[moe_cols.values()].max()" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 10, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 494 | "\n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 
766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | "
GEO_IDB03002_001EB03002_003EB03002_004EB03002_005EB03002_006EB03002_007EB03002_008EB03002_009EB03002_012E...B03002_003MB03002_004MB03002_005MB03002_006MB03002_007MB03002_008MB03002_009MB03002_012Mstatecounty
40500000US28015101296511350160008229...20121102020201103528015
50500000US2804321278116839205135000115140...242171632424249312528043
70500000US28041137149896349653000107162...41084320202095-55555555528041
80500000US280933578717068171181524002811281...2714018352727135-55555555528093
100500000US280113259210579208694121300189701...2770521122727125-55555555528011
..................................................................
32080500000US19067158581459327490224270183467...1579110324015112-55555555519067
32120500000US19087199261758544951522915347948...256641691425104-55555555519087
32130500000US19145153631402026010114700323512...1511782571515105-55555555519145
32140500000US191559350382251133525761911719017122...8193126179188285-55555555519155
32160500000US1902120260122265513212597111405107...125741421411265-55555555519021
\n", 788 | "

2069 rows × 21 columns

\n", 789 | "
" 790 | ], 791 | "text/plain": [ 792 | " GEO_ID B03002_001E B03002_003E B03002_004E B03002_005E \\\n", 793 | "4 0500000US28015 10129 6511 3501 6 \n", 794 | "5 0500000US28043 21278 11683 9205 135 \n", 795 | "7 0500000US28041 13714 9896 3496 53 \n", 796 | "8 0500000US28093 35787 17068 17118 15 \n", 797 | "10 0500000US28011 32592 10579 20869 41 \n", 798 | "... ... ... ... ... ... \n", 799 | "3208 0500000US19067 15858 14593 274 90 \n", 800 | "3212 0500000US19087 19926 17585 449 51 \n", 801 | "3213 0500000US19145 15363 14020 260 101 \n", 802 | "3214 0500000US19155 93503 82251 1335 257 \n", 803 | "3216 0500000US19021 20260 12226 551 3 \n", 804 | "\n", 805 | " B03002_006E B03002_007E B03002_008E B03002_009E B03002_012E ... \\\n", 806 | "4 0 0 0 82 29 ... \n", 807 | "5 0 0 0 115 140 ... \n", 808 | "7 0 0 0 107 162 ... \n", 809 | "8 24 0 0 281 1281 ... \n", 810 | "10 213 0 0 189 701 ... \n", 811 | "... ... ... ... ... ... ... \n", 812 | "3208 224 27 0 183 467 ... \n", 813 | "3212 522 9 15 347 948 ... \n", 814 | "3213 147 0 0 323 512 ... \n", 815 | "3214 619 11 7 1901 7122 ... \n", 816 | "3216 2125 97 11 140 5107 ... \n", 817 | "\n", 818 | " B03002_003M B03002_004M B03002_005M B03002_006M B03002_007M \\\n", 819 | "4 20 121 10 20 20 \n", 820 | "5 24 217 163 24 24 \n", 821 | "7 4 108 43 20 20 \n", 822 | "8 27 140 18 35 27 \n", 823 | "10 27 70 52 112 27 \n", 824 | "... ... ... ... ... ... \n", 825 | "3208 15 79 110 32 40 \n", 826 | "3212 25 66 41 69 14 \n", 827 | "3213 15 117 82 57 15 \n", 828 | "3214 8 193 126 179 18 \n", 829 | "3216 12 57 4 142 141 \n", 830 | "\n", 831 | " B03002_008M B03002_009M B03002_012M state county \n", 832 | "4 20 110 35 28 015 \n", 833 | "5 24 93 125 28 043 \n", 834 | "7 20 95 -555555555 28 041 \n", 835 | "8 27 135 -555555555 28 093 \n", 836 | "10 27 125 -555555555 28 011 \n", 837 | "... ... ... ... ... ... 
\n", 838 | "3208 15 112 -555555555 19 067 \n", 839 | "3212 25 104 -555555555 19 087 \n", 840 | "3213 15 105 -555555555 19 145 \n", 841 | "3214 8 285 -555555555 19 155 \n", 842 | "3216 12 65 -555555555 19 021 \n", 843 | "\n", 844 | "[2069 rows x 21 columns]" 845 | ] 846 | }, 847 | "execution_count": 10, 848 | "metadata": {}, 849 | "output_type": "execute_result" 850 | } 851 | ], 852 | "source": [ 853 | "county_race[(county_race['B03002_004M'] > 100) \n", 854 | " | (county_race['B03002_006M'] > 100) \n", 855 | " | (county_race['B03002_008M'] > 100) \n", 856 | " | (county_race['B03002_009M'] > 100) ]" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [] 865 | } 866 | ], 867 | "metadata": { 868 | "kernelspec": { 869 | "display_name": "Python 3", 870 | "language": "python", 871 | "name": "python3" 872 | }, 873 | "language_info": { 874 | "codemirror_mode": { 875 | "name": "ipython", 876 | "version": 3 877 | }, 878 | "file_extension": ".py", 879 | "mimetype": "text/x-python", 880 | "name": "python", 881 | "nbconvert_exporter": "python", 882 | "pygments_lexer": "ipython3", 883 | "version": "3.7.6" 884 | } 885 | }, 886 | "nbformat": 4, 887 | "nbformat_minor": 4 888 | } 889 | -------------------------------------------------------------------------------- /crosswalks/zip_to_zcta/build_crosswalk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import geopandas as gpd\n", 11 | "import json\n", 12 | "import urllib.request\n", 13 | "from pathlib import Path" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Build a master list of ZIP Codes\n", 21 | "\n", 22 | "ZIP Codes change frequently, so this is challenging, and they aren't authoritatively documented in any public resource we know about. \n", 23 | "We'll merge together two sources, GeoNames, and a ZIP Code Business Patters (ZBP) dataset, to get the biggest list of potential ZIPs we'd need to map to a ZCTA. \n", 24 | "\n", 25 | "## GeoNames\n", 26 | "\n", 27 | "The good thing about GeoNames is that each ZIP is assigned a latitude/longitude. It's not clear how those were assigned, which is a liability for this entire process, but we'll hope that they are accurate and, for ZIPs that are not ZCTAs, we'll try to locate the GeoNames coordinate in a ZCTA geometry (below)." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | "
countryzipcitystatestusabcountycounty_fipscommunitycommunity_codelatitudelongitudeaccuracysource
0US99553AkutanAlaskaAKAleutians East013NaNNaN54.1430-165.78541.0geonames
1US99571Cold BayAlaskaAKAleutians East013NaNNaN55.1858-162.72111.0geonames
2US99583False PassAlaskaAKAleutians East013NaNNaN54.8542-163.41131.0geonames
3US99612King CoveAlaskaAKAleutians East013NaNNaN55.0628-162.30561.0geonames
4US99661Sand PointAlaskaAKAleutians East013NaNNaN55.3192-160.49141.0geonames
\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " country zip city state stusab county county_fips \\\n", 158 | "0 US 99553 Akutan Alaska AK Aleutians East 013 \n", 159 | "1 US 99571 Cold Bay Alaska AK Aleutians East 013 \n", 160 | "2 US 99583 False Pass Alaska AK Aleutians East 013 \n", 161 | "3 US 99612 King Cove Alaska AK Aleutians East 013 \n", 162 | "4 US 99661 Sand Point Alaska AK Aleutians East 013 \n", 163 | "\n", 164 | " community community_code latitude longitude accuracy source \n", 165 | "0 NaN NaN 54.1430 -165.7854 1.0 geonames \n", 166 | "1 NaN NaN 55.1858 -162.7211 1.0 geonames \n", 167 | "2 NaN NaN 54.8542 -163.4113 1.0 geonames \n", 168 | "3 NaN NaN 55.0628 -162.3056 1.0 geonames \n", 169 | "4 NaN NaN 55.3192 -160.4914 1.0 geonames " 170 | ] 171 | }, 172 | "execution_count": 2, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "gn = pd.read_csv('geonames_us_zips.csv', dtype={\n", 179 | " 'zip': 'object',\n", 180 | " 'county_fips': 'object'\n", 181 | "})\n", 182 | "gn['source'] = 'geonames'\n", 183 | "\n", 184 | "# We know that GeoNames includes military and diplomatic ZIP Codes and ZIP Codes in the Marshall Islands, none of which have ZCTAs. \n", 185 | "# drop those now so we can avoid the trouble. We'll include other US Island Area postal codes, too, in case we run this with a new file.\n", 186 | "# Puerto Rico DOES have ZCTAs\n", 187 | "NON_ZCTA_POSTAL_ABBRS = ['AS', 'GU', 'MP', 'VI', 'FM', 'MH', 'PW', 'AA', 'AE', 'AP']\n", 188 | "gn = gn[(gn['stusab'].notna()) & (~gn['stusab'].isin(NON_ZCTA_POSTAL_ABBRS))]\n", 189 | "gn.head()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## ZIP Code Business Patterns\n", 197 | "\n", 198 | "The Census Bureau's ZIP Code Business Patterns was the original dataset we wanted to integrate with other data collected at the ZBP level. \n", 199 | "We'll get a bit of data from that program to give us a list of ZIP Codes that \"matter\". The specific query doesn't matter much. We set the `NAICS2017` and `EMPSZES` predicates to values indicating summary statistics, so that we only get back one row per ZIP. \n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 3, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/html": [ 210 | "
\n", 211 | "\n", 224 | "\n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | "
NAMEZIPCODEESTABNAICS2017EMPSZES
0ZIP 01001 (Agawam, MA)0100147300001
1ZIP 01002 (Amherst, MA)0100253900001
2ZIP 01007 (Belchertown, MA)0100722200001
3ZIP 01550 (Southbridge, MA)0155031600001
4ZIP 01003 (Amherst, MA)010032000001
\n", 278 | "
" 279 | ], 280 | "text/plain": [ 281 | " NAME ZIPCODE ESTAB NAICS2017 EMPSZES\n", 282 | "0 ZIP 01001 (Agawam, MA) 01001 473 00 001\n", 283 | "1 ZIP 01002 (Amherst, MA) 01002 539 00 001\n", 284 | "2 ZIP 01007 (Belchertown, MA) 01007 222 00 001\n", 285 | "3 ZIP 01550 (Southbridge, MA) 01550 316 00 001\n", 286 | "4 ZIP 01003 (Amherst, MA) 01003 20 00 001" 287 | ] 288 | }, 289 | "execution_count": 3, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "request = urllib.request.urlopen('https://api.census.gov/data/2018/zbp?get=NAME,ZIPCODE,ESTAB&NAICS2017=00&EMPSZES=001')\n", 296 | "data = request.read() \n", 297 | "raw_zbp_data = json.loads(data.decode(request.info().get_content_charset()))\n", 298 | "zbp = pd.DataFrame(data=raw_zbp_data[1:],columns=raw_zbp_data[0])\n", 299 | "zbp.head()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 4, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# we don't need all this data\n", 309 | "# from GeoNames, we'll use 'zip', 'city', 'stusab', 'latitude', 'longitude' -- for context, and to position ZIPs in ZCTAs\n", 310 | "# from ZBP so we'll only merge the ZIPCODE and NAME -- for context\n", 311 | "master_zip = gn[['zip', 'city', 'stusab', 'latitude', 'longitude', 'source']].merge(zbp[['ZIPCODE', 'NAME']],left_on='zip', right_on='ZIPCODE',how='outer')\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 5, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "master_zip = master_zip.rename(columns={\n", 321 | " 'zip': 'geonames_zip',\n", 322 | " 'ZIPCODE': 'zbp_zip',\n", 323 | " 'NAME': 'zbp_title'\n", 324 | "})\n", 325 | "master_zip['zip_code'] = master_zip.apply(lambda x: x['geonames_zip'] if not pd.isnull(x['geonames_zip']) else x['zbp_zip'],axis=1)\n", 326 | "master_zip['source'] = master_zip['source'].fillna('zbp')" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "## ZCTAs\n", 334 | "\n", 335 | "The TIGER ZCTA shapefile provides us with a master list of ZCTAs and their geometries (boundaries). This requires `tl_2019_us_zcta510.zip`, a 500MB shapefile, which is larger than we can store in GitHub.\n", 336 | "\n", 337 | "This code will download it if it's not available, or you can get it from https://www2.census.gov/geo/tiger/TIGER2019/ZCTA5/" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 6, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "tl_2019_us_zcta510.zip is available for use\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "p = Path('tl_2019_us_zcta510.zip')\n", 355 | "p.exists()\n", 356 | "if not p.exists():\n", 357 | " print(f\"{p.resolve()} not found. Downloading\")\n", 358 | " urllib.request.urlretrieve('https://www2.census.gov/geo/tiger/TIGER2019/ZCTA5/tl_2019_us_zcta510.zip',p.resolve())\n", 359 | "else:\n", 360 | " print(f\"{p} is available for use\")" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 7, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/html": [ 371 | "
\n", 372 | "\n", 385 | "\n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | "
ZCTA5CE10GEOID10CLASSFP10MTFCC10FUNCSTAT10ALAND10AWATER10INTPTLAT10INTPTLON10geometry
04345143451B5G6350S63484186157689+41.3183010-083.6174935POLYGON ((-83.70873 41.32733, -83.70815 41.327...
14345243452B5G6350S12152230413721730+41.5157923-082.9809454POLYGON ((-83.08698 41.53780, -83.08256 41.537...
24345643456B5G6350S93209751003775+41.6318300-082.8393923MULTIPOLYGON (((-82.83558 41.71082, -82.83515 ...
34345743457B5G6350S480046810+41.2673301-083.4274872POLYGON ((-83.49650 41.25371, -83.48382 41.253...
44345843458B5G6350S257381639915+41.5304461-083.2133648POLYGON ((-83.22229 41.53102, -83.22228 41.532...
\n", 469 | "
" 470 | ], 471 | "text/plain": [ 472 | " ZCTA5CE10 GEOID10 CLASSFP10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 \\\n", 473 | "0 43451 43451 B5 G6350 S 63484186 157689 \n", 474 | "1 43452 43452 B5 G6350 S 121522304 13721730 \n", 475 | "2 43456 43456 B5 G6350 S 9320975 1003775 \n", 476 | "3 43457 43457 B5 G6350 S 48004681 0 \n", 477 | "4 43458 43458 B5 G6350 S 2573816 39915 \n", 478 | "\n", 479 | " INTPTLAT10 INTPTLON10 \\\n", 480 | "0 +41.3183010 -083.6174935 \n", 481 | "1 +41.5157923 -082.9809454 \n", 482 | "2 +41.6318300 -082.8393923 \n", 483 | "3 +41.2673301 -083.4274872 \n", 484 | "4 +41.5304461 -083.2133648 \n", 485 | "\n", 486 | " geometry \n", 487 | "0 POLYGON ((-83.70873 41.32733, -83.70815 41.327... \n", 488 | "1 POLYGON ((-83.08698 41.53780, -83.08256 41.537... \n", 489 | "2 MULTIPOLYGON (((-82.83558 41.71082, -82.83515 ... \n", 490 | "3 POLYGON ((-83.49650 41.25371, -83.48382 41.253... \n", 491 | "4 POLYGON ((-83.22229 41.53102, -83.22228 41.532... " 492 | ] 493 | }, 494 | "execution_count": 7, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "zcta_geo = gpd.read_file('zip://tl_2019_us_zcta510.zip')\n", 501 | "zcta_geo.head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 8, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "# update the `master_zip` data frame with all ZCTAs that match ZIP codes\n", 511 | "mz_w_zcta = master_zip.merge(zcta_geo.rename(columns={'ZCTA5CE10': 'zcta'})[['zcta']],left_on='zip_code', right_on='zcta', how='outer')\n", 512 | "\n", 513 | "# get rid of some of the columns we've been dragging along, and re-order\n", 514 | "mz_w_zcta = mz_w_zcta[['zip_code', 'zcta', 'geonames_zip', 'zbp_zip', 'city', 'stusab', 'zbp_title', 'latitude', 'longitude', 'source']]\n", 515 | "mz_w_zcta['source'] = mz_w_zcta['source'].fillna('tiger')" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "## Handle ZIPs with no ZCTA\n", 523 | "\n", 524 | "How many are there?" 
525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 9, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "ZIPs with no ZCTA: 7987\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 541 | "print(f\"ZIPs with no ZCTA: {len(mz_w_zcta[pd.isnull(mz_w_zcta['zcta'])])}\")" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "# Create a GeoDataFrame for the ZIP Codes which don't yet have ZCTAs but which do have lat/lon\n", 551 | "temp = mz_w_zcta[pd.isnull(mz_w_zcta['zcta'])][['zip_code', 'latitude', 'longitude']].dropna() # no point in keeping null lat/lng\n", 552 | "zip_wo_zcta_gdf = gpd.GeoDataFrame(temp,geometry=gpd.points_from_xy(temp['longitude'],temp['latitude']), \n", 553 | " crs=\"EPSG:4269\") # projection wasn't actually specified but this is a good bet" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 11, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "# Create a new dataframe which adds ZCTAs for ZIP Codes which can be located within some ZCTA\n", 563 | "# only keep the useful columns from zcta_geo\n", 564 | "geo_joined = gpd.sjoin(zip_wo_zcta_gdf,zcta_geo[['ZCTA5CE10', 'geometry']],how='inner',op='intersects')" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 12, 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "data": { 574 | "text/html": [ 575 | "
\n", 576 | "\n", 589 | "\n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
zip_codelatitudelongitudegeometryindex_rightZCTA5CE10
209950961.2181-149.9003POINT (-149.90030 61.21810)1945999501
249951461.2181-149.9003POINT (-149.90030 61.21810)1945999501
309952061.2181-149.9003POINT (-149.90030 61.21810)1945999501
319952161.2181-149.9003POINT (-149.90030 61.21810)1945999501
329952261.2181-149.9003POINT (-149.90030 61.21810)1945999501
\n", 649 | "
" 650 | ], 651 | "text/plain": [ 652 | " zip_code latitude longitude geometry index_right \\\n", 653 | "20 99509 61.2181 -149.9003 POINT (-149.90030 61.21810) 19459 \n", 654 | "24 99514 61.2181 -149.9003 POINT (-149.90030 61.21810) 19459 \n", 655 | "30 99520 61.2181 -149.9003 POINT (-149.90030 61.21810) 19459 \n", 656 | "31 99521 61.2181 -149.9003 POINT (-149.90030 61.21810) 19459 \n", 657 | "32 99522 61.2181 -149.9003 POINT (-149.90030 61.21810) 19459 \n", 658 | "\n", 659 | " ZCTA5CE10 \n", 660 | "20 99501 \n", 661 | "24 99501 \n", 662 | "30 99501 \n", 663 | "31 99501 \n", 664 | "32 99501 " 665 | ] 666 | }, 667 | "execution_count": 12, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "geo_joined.head()" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 13, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "# update the zcta column with values we found by geocoding\n", 683 | "mz_w_zcta = mz_w_zcta.set_index('zip_code')\n", 684 | "mz_w_zcta['zcta'].update(geo_joined.set_index('zip_code')['ZCTA5CE10'])" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 14, 690 | "metadata": {}, 691 | "outputs": [ 692 | { 693 | "name": "stdout", 694 | "output_type": "stream", 695 | "text": [ 696 | "Still need 110\n" 697 | ] 698 | } 699 | ], 700 | "source": [ 701 | "# what's left?\n", 702 | "print(f\"Still need {len(mz_w_zcta[pd.isnull(mz_w_zcta['zcta'])])}\")" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "## Manual review\n", 710 | "\n", 711 | "At a certain point, one runs out of technical strategies. We enlisted a student to manually review the remaining unmatched ZIP Codes. The list that student worked with was shorter than our `still_null` here, so even after including these manual updates, this process will leave ZIP Codes not in any ZCTA. See [ZIP_ZCTA_README.md]() for more details on the method.\n", 712 | "\n" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 15, 718 | "metadata": {}, 719 | "outputs": [ 720 | { 721 | "data": { 722 | "text/html": [ 723 | "
\n", 724 | "\n", 737 | "\n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | "
zipzcta
00212302215
10220402203
20220602203
30221702108
40228302111
\n", 773 | "
" 774 | ], 775 | "text/plain": [ 776 | " zip zcta\n", 777 | "0 02123 02215\n", 778 | "1 02204 02203\n", 779 | "2 02206 02203\n", 780 | "3 02217 02108\n", 781 | "4 02283 02111" 782 | ] 783 | }, 784 | "execution_count": 15, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "# Load in the key columns from the manual review process\n", 791 | "manual = pd.read_csv('zcta_review.csv',\n", 792 | " dtype={'zip': 'object', 'result': 'object'},\n", 793 | " usecols=['zip','result']).rename(\n", 794 | " columns={ 'result': 'zcta' }\n", 795 | " ).dropna() # drop rows which didn't get a ZCTA\n", 796 | "manual.head()" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": 16, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "# We'll raise errors if anything in manual tries to overwrite something which \n", 806 | "# is not null, since the manual review was based off of a slightly different \n", 807 | "# starting dataset. It would probably be fine to just let them go, or to use\n", 808 | "# overwrite=False to silently ignore manual values if mw_w_zcta already has something\n", 809 | "mz_w_zcta.update(manual.set_index('zip'),errors='raise')" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 17, 815 | "metadata": {}, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/html": [ 820 | "
\n", 821 | "\n", 834 | "\n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | "
zctageonames_zipzbp_zipcitystusabzbp_titlelatitudelongitudesource
zip_code
96718NaN9671896718Hawaii National ParkHIZIP 96718 (Hawaii National Park, HI)19.5935-155.4380geonames
04737NaN04737NaNClayton LakeMENaN46.6109-69.5223geonames
89023NaN8902389023MercuryNVZIP 89023 (Mercury, NV)36.6605-115.9945geonames
72405NaNNaN72405NaNNaNZIP 72405 (Jonesboro, AR)NaNNaNzbp
89437NaNNaN89437NaNNaNZIP 89437 (Sparks, NV)NaNNaNzbp
99999NaNNaN99999NaNNaNZIP 99999 (Unclassified)NaNNaNzbp
\n", 936 | "
" 937 | ], 938 | "text/plain": [ 939 | " zcta geonames_zip zbp_zip city stusab \\\n", 940 | "zip_code \n", 941 | "96718 NaN 96718 96718 Hawaii National Park HI \n", 942 | "04737 NaN 04737 NaN Clayton Lake ME \n", 943 | "89023 NaN 89023 89023 Mercury NV \n", 944 | "72405 NaN NaN 72405 NaN NaN \n", 945 | "89437 NaN NaN 89437 NaN NaN \n", 946 | "99999 NaN NaN 99999 NaN NaN \n", 947 | "\n", 948 | " zbp_title latitude longitude source \n", 949 | "zip_code \n", 950 | "96718 ZIP 96718 (Hawaii National Park, HI) 19.5935 -155.4380 geonames \n", 951 | "04737 NaN 46.6109 -69.5223 geonames \n", 952 | "89023 ZIP 89023 (Mercury, NV) 36.6605 -115.9945 geonames \n", 953 | "72405 ZIP 72405 (Jonesboro, AR) NaN NaN zbp \n", 954 | "89437 ZIP 89437 (Sparks, NV) NaN NaN zbp \n", 955 | "99999 ZIP 99999 (Unclassified) NaN NaN zbp " 956 | ] 957 | }, 958 | "execution_count": 17, 959 | "metadata": {}, 960 | "output_type": "execute_result" 961 | } 962 | ], 963 | "source": [ 964 | "# what's left?\n", 965 | "mz_w_zcta[pd.isnull(mz_w_zcta['zcta'])]" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": {}, 971 | "source": [ 972 | "## This will have to do!\n", 973 | "\n", 974 | "The three geonames addresses were ones our student reviewed and found good reasons for them not having ZCTAs.\n", 975 | "\n", 976 | "ZIP Code 99999 isn't real, and maybe we should have just dropped it above!\n", 977 | "\n", 978 | "[72405](https://about.usps.com/newsroom/local-releases/ar/2019/0603-new-jonesboro-zip-code.htm) and [89437](https://www.kolotv.com/content/news/Tahoe-Reno-Industrial-Center-to-get-its-own-zip-code-497853001.html) are both quite new, so may get ZCTAs in an upcoming update, or could be added to the manual review file in a future update." 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": 18, 984 | "metadata": {}, 985 | "outputs": [], 986 | "source": [ 987 | "# just keep the columns we care about\n", 988 | "# If we were working more on this, we might somehow save the \"authority\" or \"source\" so that we would have some idea about where\n", 989 | "# we got the ZIP Codes\n", 990 | "temp = mz_w_zcta.reset_index()[['zip_code','zcta','source']] \n", 991 | "\n", 992 | "# some null zip codes got in here from the ZCTA Shapefile. Why aren't those in GeoNames or ZIP Code Business Patterns?\n", 993 | "# Who can know? 
But logically, if it's a ZCTA, then we assume that it has a matching ZIP Code\n", 994 | "temp['zip_code'] = temp['zip_code'].fillna(temp['zcta'])\n" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": 19, 1000 | "metadata": {}, 1001 | "outputs": [], 1002 | "source": [ 1003 | "temp.to_csv('zip_zcta_xref.csv',index=False)" 1004 | ] 1005 | } 1006 | ], 1007 | "metadata": { 1008 | "kernelspec": { 1009 | "display_name": "Python 3", 1010 | "language": "python", 1011 | "name": "python3" 1012 | }, 1013 | "language_info": { 1014 | "codemirror_mode": { 1015 | "name": "ipython", 1016 | "version": 3 1017 | }, 1018 | "file_extension": ".py", 1019 | "mimetype": "text/x-python", 1020 | "name": "python", 1021 | "nbconvert_exporter": "python", 1022 | "pygments_lexer": "ipython3", 1023 | "version": "3.7.6" 1024 | } 1025 | }, 1026 | "nbformat": 4, 1027 | "nbformat_minor": 4 1028 | } 1029 | -------------------------------------------------------------------------------- /crosswalks/judicial_districts/race_by_district.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Race by District (with Margin of Error)\n", 8 | "\n", 9 | "This workbook demonstrates how to aggregate ACS data where some estimates may be less reliable, typically because they are for small subgroups.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import cenpy # https://pypi.org/project/cenpy/ \n", 20 | "import census_data_aggregator # https://pypi.org/project/census-data-aggregator/" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
label
B03002_001EEstimate!!Total
B03002_002EEstimate!!Total!!Not Hispanic or Latino
B03002_003EEstimate!!Total!!Not Hispanic or Latino!!White alone
B03002_004EEstimate!!Total!!Not Hispanic or Latino!!Black or African American alone
B03002_005EEstimate!!Total!!Not Hispanic or Latino!!American Indian and Alaska Native alone
B03002_006EEstimate!!Total!!Not Hispanic or Latino!!Asian alone
B03002_007EEstimate!!Total!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
B03002_008EEstimate!!Total!!Not Hispanic or Latino!!Some other race alone
B03002_009EEstimate!!Total!!Not Hispanic or Latino!!Two or more races
B03002_010EEstimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race
B03002_011EEstimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and three or more races
B03002_012EEstimate!!Total!!Hispanic or Latino
B03002_013EEstimate!!Total!!Hispanic or Latino!!White alone
B03002_014EEstimate!!Total!!Hispanic or Latino!!Black or African American alone
B03002_015EEstimate!!Total!!Hispanic or Latino!!American Indian and Alaska Native alone
B03002_016EEstimate!!Total!!Hispanic or Latino!!Asian alone
B03002_017EEstimate!!Total!!Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
B03002_018EEstimate!!Total!!Hispanic or Latino!!Some other race alone
B03002_019EEstimate!!Total!!Hispanic or Latino!!Two or more races
B03002_020EEstimate!!Total!!Hispanic or Latino!!Two or more races!!Two races including Some other race
B03002_021EEstimate!!Total!!Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and three or more races
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " label\n", 143 | "B03002_001E Estimate!!Total\n", 144 | "B03002_002E Estimate!!Total!!Not Hispanic or Latino\n", 145 | "B03002_003E Estimate!!Total!!Not Hispanic or Latino!!White alone\n", 146 | "B03002_004E Estimate!!Total!!Not Hispanic or Latino!!Black or African American alone\n", 147 | "B03002_005E Estimate!!Total!!Not Hispanic or Latino!!American Indian and Alaska Native alone\n", 148 | "B03002_006E Estimate!!Total!!Not Hispanic or Latino!!Asian alone\n", 149 | "B03002_007E Estimate!!Total!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone\n", 150 | "B03002_008E Estimate!!Total!!Not Hispanic or Latino!!Some other race alone\n", 151 | "B03002_009E Estimate!!Total!!Not Hispanic or Latino!!Two or more races\n", 152 | "B03002_010E Estimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race\n", 153 | "B03002_011E Estimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and three or more races\n", 154 | "B03002_012E Estimate!!Total!!Hispanic or Latino\n", 155 | "B03002_013E Estimate!!Total!!Hispanic or Latino!!White alone\n", 156 | "B03002_014E Estimate!!Total!!Hispanic or Latino!!Black or African American alone\n", 157 | "B03002_015E Estimate!!Total!!Hispanic or Latino!!American Indian and Alaska Native alone\n", 158 | "B03002_016E Estimate!!Total!!Hispanic or Latino!!Asian alone\n", 159 | "B03002_017E Estimate!!Total!!Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone\n", 160 | "B03002_018E Estimate!!Total!!Hispanic or Latino!!Some other race alone\n", 161 | "B03002_019E Estimate!!Total!!Hispanic or Latino!!Two or more races\n", 162 | "B03002_020E Estimate!!Total!!Hispanic or Latino!!Two or more races!!Two races including Some other race\n", 163 | "B03002_021E Estimate!!Total!!Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and three or more races" 164 | ] 165 | }, 166 | "execution_count": 2, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "acs = cenpy.products.APIConnection('ACSDT5Y2018')\n", 173 | "\n", 174 | "# Refresh our memory on the variable codes for various columns in the race tables\n", 175 | "pd.set_option('display.max_colwidth',None)\n", 176 | "acs.varslike('B03002_*')[['label']].sort_index()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 3, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# make something to help us use friendlier names for the columns\n", 186 | "# Use an ordered dict to ensure that things between estimate and MOE cols stay in sync\n", 187 | "from collections import OrderedDict\n", 188 | "race_cols = OrderedDict([\n", 189 | " ('B03002_001E', 'total'),\n", 190 | " ('B03002_003E', 'nh_white'),\n", 191 | " ('B03002_004E', 'nh_black'),\n", 192 | " ('B03002_005E', 'nh_amerind'),\n", 193 | " ('B03002_006E', 'nh_asian'),\n", 194 | " ('B03002_007E', 'nh_nhpi'),\n", 195 | " ('B03002_008E', 'nh_some_other'),\n", 196 | " ('B03002_009E', 'nh_twoplus'),\n", 197 | " ('B03002_012E', 'hispanic')\n", 198 | "])\n", 199 | "moe_cols = OrderedDict((k.replace('E','M'),v+\"_moe\") for k,v in race_cols.items())\n", 200 | "query_cols = ['GEO_ID'] + list(race_cols.keys()) + list(moe_cols.keys())\n", 201 | "county_race = acs.query(query_cols,'county')\n", 202 | "for k in query_cols[1:]: # cenpy doesn't cast estimates to integer so we have to handle that.\n", 203 | " county_race[k] = 
county_race[k].astype(int)\n", 204 | "county_race = county_race.rename(columns=race_cols).rename(columns=moe_cols)\n", 205 | "\n", 206 | "# a Margin of Error value of -555555555 \"indicates that the estimate is controlled. \n", 207 | "# A statistical test for sampling variability is not appropriate.\"\n", 208 | "# The math doesn't work with that value, so replace those with 0\n", 209 | "county_race = county_race.replace(-555555555,0) \n", 210 | "county_race = county_race.drop(['state', 'county'], axis='columns') # API gives us those but we don't need them" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 4, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Join our crosswalk to the ACS data\n", 220 | "xref = pd.read_csv('county_district_xref.csv',index_col='geoid', usecols=['geoid','state', 'district'])\n", 221 | "joined = xref.join(county_race.set_index('GEO_ID'))# xref\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 5, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "# a helper function so that we can sum more than one estimate/moe pair in a given data frame\n", 231 | "def sum_with_moe(df, *column_pairs):\n", 232 | " \"\"\"Given a data frame and a list of one or more tuples representing estimate/error pairs,\n", 233 | " return a dictionary where each key is one of the values from column pairs and the corresponding\n", 234 | " value is the approximate sum, or approximate error for the sum.\n", 235 | " \"\"\"\n", 236 | " result = {}\n", 237 | " for est,err in column_pairs:\n", 238 | " tuples = [tuple(x) for x in df[[est,err]].to_numpy()]\n", 239 | " est_sum, err_sum = census_data_aggregator.approximate_sum(*tuples)\n", 240 | " result[est] = est_sum\n", 241 | " result[err] = err_sum\n", 242 | " return result\n", 243 | "\n", 244 | "def compute_single_cv(est,moe):\n", 245 | " se = moe/1.645 # assumes normal distribution\n", 246 | " cv = se/est*100\n", 247 | " return cv\n", 248 | "\n", 249 | "def compute_cvs(df, *column_pairs):\n", 250 | " \"\"\"Given a data frame and a list of one or more tuples representing estimate/error pairs,\n", 251 | " return a new DataFrame where each column represents the CV for one of the pairs.\n", 252 | " Columns in the new DataFrame will be named by appending \"_cv\" to the first value\n", 253 | " in each column_pair.\n", 254 | " \"\"\"\n", 255 | " cvs = []\n", 256 | " for est,moe in column_pairs:\n", 257 | " cv = df[[est,moe]].apply(lambda x: compute_single_cv(x[est],x[moe]),axis=1)\n", 258 | " cv.name = f\"{est}_cv\"\n", 259 | " cvs.append(cv)\n", 260 | " return pd.concat(cvs,axis=1)\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 6, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/html": [ 271 | "
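A note on the `1.645` above: published ACS margins of error are at the 90 percent confidence level, so dividing by 1.645 converts an MOE back into a standard error before taking the ratio. As a quick sanity check of the formula (using rounded nh_asian figures for Alabama's Middle District, taken from the district sums shown below):

```python
# CV = (MOE / 1.645) / estimate * 100; the smaller the CV, the more reliable the estimate.
est, moe = 18388, 533   # approximate nh_asian estimate and MOE for Alabama's Middle District
print(round((moe / 1.645) / est * 100, 2))   # about 1.76 (percent), i.e. quite reliable
```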
\n", 272 | "\n", 285 | "\n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "
statedistricttotaltotal_moenh_whitenh_white_moenh_blacknh_black_moenh_amerindnh_amerind_moenh_asiannh_asian_moenh_nhpinh_nhpi_moenh_some_othernh_some_other_moenh_twoplusnh_twoplus_moehispanichispanic_moe
0AlabamaMiddle11512520.0684038454.93822061223.03308396.818388533.3205143.71551479.4207991337.340757148.2
1AlabamaNorthern28704540.01999982585.56284661964.212469846.8350081034.51169223.04155918.7505062237.2138699193.9
2AlabamaSouthern8429740.0512710370.52750651002.87466537.910540583.5147103.01797566.0115591097.323690294.9
3AlaskaAlaska738516564.0450754677.222817730.11035061448.545617998.28544395.91459515.6546331880.251186257.9
4ArizonaArizona69466850.038258861206.22866142527.92719461834.22224772081.612523561.291771290.81547503919.121633120.0
\n", 429 | "
" 430 | ], 431 | "text/plain": [ 432 | " state district total total_moe nh_white nh_white_moe nh_black \\\n", 433 | "0 Alabama Middle 1151252 0.0 684038 454.9 382206 \n", 434 | "1 Alabama Northern 2870454 0.0 1999982 585.5 628466 \n", 435 | "2 Alabama Southern 842974 0.0 512710 370.5 275065 \n", 436 | "3 Alaska Alaska 738516 564.0 450754 677.2 22817 \n", 437 | "4 Arizona Arizona 6946685 0.0 3825886 1206.2 286614 \n", 438 | "\n", 439 | " nh_black_moe nh_amerind nh_amerind_moe nh_asian nh_asian_moe nh_nhpi \\\n", 440 | "0 1223.0 3308 396.8 18388 533.3 205 \n", 441 | "1 1964.2 12469 846.8 35008 1034.5 1169 \n", 442 | "2 1002.8 7466 537.9 10540 583.5 147 \n", 443 | "3 730.1 103506 1448.5 45617 998.2 8544 \n", 444 | "4 2527.9 271946 1834.2 222477 2081.6 12523 \n", 445 | "\n", 446 | " nh_nhpi_moe nh_some_other nh_some_other_moe nh_twoplus nh_twoplus_moe \\\n", 447 | "0 143.7 1551 479.4 20799 1337.3 \n", 448 | "1 223.0 4155 918.7 50506 2237.2 \n", 449 | "2 103.0 1797 566.0 11559 1097.3 \n", 450 | "3 395.9 1459 515.6 54633 1880.2 \n", 451 | "4 561.2 9177 1290.8 154750 3919.1 \n", 452 | "\n", 453 | " hispanic hispanic_moe \n", 454 | "0 40757 148.2 \n", 455 | "1 138699 193.9 \n", 456 | "2 23690 294.9 \n", 457 | "3 51186 257.9 \n", 458 | "4 2163312 0.0 " 459 | ] 460 | }, 461 | "execution_count": 6, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "# sum the counties\n", 468 | "sums = []\n", 469 | "\n", 470 | "for (state, district), df in joined.groupby(['state', 'district']):\n", 471 | " tuples = zip(race_cols.values(), moe_cols.values()) # we've renamed the columns to the values of those dicts\n", 472 | " d = sum_with_moe(df, *tuples)\n", 473 | " d['state'] = state\n", 474 | " d['district'] = district\n", 475 | " sums.append(d)\n", 476 | "\n", 477 | "race_by_district_base = pd.DataFrame(sums) \n", 478 | "\n", 479 | "cols = list(race_by_district_base.columns) # for review purposes, it will be nice to have our grouping values at the front\n", 480 | "cols.remove('state') # so take them out\n", 481 | "cols.remove('district')\n", 482 | "cols = ['state', 'district'] + cols # put them where we want them\n", 483 | "race_by_district_base = race_by_district_base[cols]\n", 484 | "pd.options.display.float_format = '{:.1f}'.format\n", 485 | "race_by_district_base.head() # how does that look?" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "## Now what?\n", 493 | "\n", 494 | "Having aggregated margins of error enables two things: to test whether any given estimate is \"reliable\", and to test whether any two values are *significantly* different. \n", 495 | "\n", 496 | "For now, we'll defer checking for \"significant difference,\" since I didn't feel like fishing around for pairs to compare. I'll just say that the LA Times DataDesk team has a python library which encapsulates the [statistical difference test](https://github.com/datadesk/census-error-analyzer#test-statistical-difference), so you might want to use that instead of re-implementing it. \n", 497 | "\n", 498 | "\n", 499 | "Testing reliability involves computing the Coefficient of Variation (CV). 
There are no hard and fast rules, but, as documented in this [Tufts GIS tutorial](http://sites.tufts.edu/gis/files/2013/11/Amercian-Community-Survey_Margin-of-error-tutorial.pdf), here are two rules of thumb about how to proceed with a given CV.\n",
    500 | "\n",
    501 | "| Source | High reliability | Medium \"be careful\" | Low \"use extreme caution\" |\n",
    502 | "| --- | --- | --- | --- |\n",
    503 | "| Census Bureau | CV <15% | CV 15-30% | CV >30% |\n",
    504 | "| ESRI | CV <12% | CV 12-40% | CV >40% |\n"
    505 |    ]
    506 |   },
\n", 534 | "\n", 547 | "\n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | "
statedistricttotal_cvnh_white_cvnh_black_cvnh_amerind_cvnh_asian_cvnh_nhpi_cvnh_some_other_cvnh_twoplus_cvhispanic_cv
0AlabamaMiddle0.00.00.27.31.842.618.83.90.2
1AlabamaNorthern0.00.00.24.11.811.613.42.70.1
2AlabamaSouthern0.00.00.24.43.442.619.15.80.8
3AlaskaAlaska0.00.11.90.91.32.821.52.10.3
4ArizonaArizona0.00.00.50.40.62.78.61.50.0
\n", 637 | "
" 638 | ], 639 | "text/plain": [ 640 | " state district total_cv nh_white_cv nh_black_cv nh_amerind_cv \\\n", 641 | "0 Alabama Middle 0.0 0.0 0.2 7.3 \n", 642 | "1 Alabama Northern 0.0 0.0 0.2 4.1 \n", 643 | "2 Alabama Southern 0.0 0.0 0.2 4.4 \n", 644 | "3 Alaska Alaska 0.0 0.1 1.9 0.9 \n", 645 | "4 Arizona Arizona 0.0 0.0 0.5 0.4 \n", 646 | "\n", 647 | " nh_asian_cv nh_nhpi_cv nh_some_other_cv nh_twoplus_cv hispanic_cv \n", 648 | "0 1.8 42.6 18.8 3.9 0.2 \n", 649 | "1 1.8 11.6 13.4 2.7 0.1 \n", 650 | "2 3.4 42.6 19.1 5.8 0.8 \n", 651 | "3 1.3 2.8 21.5 2.1 0.3 \n", 652 | "4 0.6 2.7 8.6 1.5 0.0 " 653 | ] 654 | }, 655 | "execution_count": 7, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "tuples = zip(race_cols.values(), moe_cols.values()) # again, get pairs of column names for estimate/moe\n", 662 | "race_district_cvs = compute_cvs(race_by_district_base,*tuples)\n", 663 | "race_district_cvs.insert(0,'state',race_by_district_base['state']) # the indexes will be aligned, so we can just\n", 664 | "race_district_cvs.insert(1,'district',race_by_district_base['district']) # insert the group labels\n", 665 | "race_district_cvs.head() " 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | " ## What have we got\n", 680 | " \n", 681 | "Typically, you'd probably just consult the CV matrix for specific values before you went too far using them, but for our purposes, let's iterate through and see where we should take care. You'll see that the most common cases of caution are for populations which tend to be small -- \"Native Hawaiian/Pacific Islander\" (except in Hawaii) and \"Some other race\" (which is most often used by Latinos, and so is often quite small among non-hispanic populations)" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 8, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "Reviewing reliability of aggregated race by district\n", 694 | "\n", 695 | "Alabama Middle\n", 696 | " nh_nhpi_cv - 42.6 - low reliability - use extreme caution\n", 697 | " nh_some_other_cv - 18.8 - med reliability - use caution\n", 698 | "\n", 699 | "Alabama Northern\n", 700 | " No warnings\n", 701 | "\n", 702 | "Alabama Southern\n", 703 | " nh_nhpi_cv - 42.6 - low reliability - use extreme caution\n", 704 | " nh_some_other_cv - 19.1 - med reliability - use caution\n", 705 | "\n", 706 | "Alaska\n", 707 | " nh_some_other_cv - 21.5 - med reliability - use caution\n", 708 | "\n", 709 | "Arizona\n", 710 | " No warnings\n", 711 | "\n", 712 | "Arkansas Eastern\n", 713 | " nh_nhpi_cv - 28.5 - med reliability - use caution\n", 714 | " nh_some_other_cv - 22.4 - med reliability - use caution\n", 715 | "\n", 716 | "Arkansas Western\n", 717 | " nh_some_other_cv - 18.3 - med reliability - use caution\n", 718 | "\n", 719 | "California Central\n", 720 | " No warnings\n", 721 | "\n", 722 | "California Eastern\n", 723 | " No warnings\n", 724 | "\n", 725 | "California Northern\n", 726 | " No warnings\n", 727 | "\n", 728 | "California Southern\n", 729 | " No warnings\n", 730 | "\n", 731 | "Colorado\n", 732 | " No warnings\n", 733 | "\n", 734 | "Connecticut\n", 735 | " nh_nhpi_cv - 18.2 - med reliability - use caution\n", 736 | "\n", 737 | "Delaware\n", 738 | " nh_nhpi_cv - 18.7 - med 
reliability - use caution\n", 739 | "\n", 740 | "District of Columbia\n", 741 | " nh_nhpi_cv - 16.3 - med reliability - use caution\n", 742 | "\n", 743 | "Florida Middle\n", 744 | " No warnings\n", 745 | "\n", 746 | "Florida Northern\n", 747 | " No warnings\n", 748 | "\n", 749 | "Florida Southern\n", 750 | " No warnings\n", 751 | "\n", 752 | "Georgia Middle\n", 753 | " nh_nhpi_cv - 17.6 - med reliability - use caution\n", 754 | "\n", 755 | "Georgia Northern\n", 756 | " nh_nhpi_cv - 16.2 - med reliability - use caution\n", 757 | "\n", 758 | "Georgia Southern\n", 759 | " No warnings\n", 760 | "\n", 761 | "Hawaii\n", 762 | " No warnings\n", 763 | "\n", 764 | "Idaho\n", 765 | " nh_some_other_cv - 16.7 - med reliability - use caution\n", 766 | "\n", 767 | "Illinois Central\n", 768 | " nh_nhpi_cv - 20.0 - med reliability - use caution\n", 769 | "\n", 770 | "Illinois Northern\n", 771 | " No warnings\n", 772 | "\n", 773 | "Illinois Southern\n", 774 | " nh_nhpi_cv - 27.0 - med reliability - use caution\n", 775 | " nh_some_other_cv - 18.1 - med reliability - use caution\n", 776 | "\n", 777 | "Indiana Northern\n", 778 | " No warnings\n", 779 | "\n", 780 | "Indiana Southern\n", 781 | " No warnings\n", 782 | "\n", 783 | "Iowa Northern\n", 784 | " nh_some_other_cv - 23.6 - med reliability - use caution\n", 785 | "\n", 786 | "Iowa Southern\n", 787 | " nh_some_other_cv - 18.5 - med reliability - use caution\n", 788 | "\n", 789 | "Kansas\n", 790 | " No warnings\n", 791 | "\n", 792 | "Kentucky Eastern\n", 793 | " nh_nhpi_cv - 16.8 - med reliability - use caution\n", 794 | " nh_some_other_cv - 16.4 - med reliability - use caution\n", 795 | "\n", 796 | "Kentucky Western\n", 797 | " No warnings\n", 798 | "\n", 799 | "Louisiana Eastern\n", 800 | " nh_nhpi_cv - 28.1 - med reliability - use caution\n", 801 | "\n", 802 | "Louisiana Middle\n", 803 | " nh_nhpi_cv - 24.8 - med reliability - use caution\n", 804 | " nh_some_other_cv - 26.6 - med reliability - use caution\n", 805 | "\n", 806 | "Louisiana Western\n", 807 | " No warnings\n", 808 | "\n", 809 | "Maine\n", 810 | " nh_nhpi_cv - 22.9 - med reliability - use caution\n", 811 | " nh_some_other_cv - 18.4 - med reliability - use caution\n", 812 | "\n", 813 | "Maryland\n", 814 | " No warnings\n", 815 | "\n", 816 | "Massachusetts\n", 817 | " No warnings\n", 818 | "\n", 819 | "Michigan Eastern\n", 820 | " No warnings\n", 821 | "\n", 822 | "Michigan Western\n", 823 | " No warnings\n", 824 | "\n", 825 | "Minnesota\n", 826 | " No warnings\n", 827 | "\n", 828 | "Mississippi northern\n", 829 | " nh_nhpi_cv - 30.0 - med reliability - use caution\n", 830 | " nh_some_other_cv - 24.4 - med reliability - use caution\n", 831 | "\n", 832 | "Mississippi southern\n", 833 | " nh_nhpi_cv - 28.8 - med reliability - use caution\n", 834 | " nh_some_other_cv - 16.1 - med reliability - use caution\n", 835 | "\n", 836 | "Missouri Eastern\n", 837 | " No warnings\n", 838 | "\n", 839 | "Missouri Western\n", 840 | " No warnings\n", 841 | "\n", 842 | "Montana\n", 843 | " nh_some_other_cv - 25.0 - med reliability - use caution\n", 844 | "\n", 845 | "Nebraska\n", 846 | " No warnings\n", 847 | "\n", 848 | "Nevada\n", 849 | " No warnings\n", 850 | "\n", 851 | "New Hampshire\n", 852 | " nh_nhpi_cv - 20.7 - med reliability - use caution\n", 853 | " nh_some_other_cv - 18.0 - med reliability - use caution\n", 854 | "\n", 855 | "New Jersey\n", 856 | " No warnings\n", 857 | "\n", 858 | "New Mexico\n", 859 | " No warnings\n", 860 | "\n", 861 | "New York Eastern\n", 862 | " No warnings\n", 863 | "\n", 864 
| "New York Northern\n", 865 | " nh_nhpi_cv - 15.1 - med reliability - use caution\n", 866 | "\n", 867 | "New York Southern\n", 868 | " No warnings\n", 869 | "\n", 870 | "New York Western\n", 871 | " No warnings\n", 872 | "\n", 873 | "North Carolina Eastern\n", 874 | " No warnings\n", 875 | "\n", 876 | "North Carolina Middle\n", 877 | " No warnings\n", 878 | "\n", 879 | "North Carolina Western\n", 880 | " No warnings\n", 881 | "\n", 882 | "North Dakota\n", 883 | " nh_nhpi_cv - 24.4 - med reliability - use caution\n", 884 | " nh_some_other_cv - 30.5 - low reliability - use extreme caution\n", 885 | "\n", 886 | "Ohio Northern\n", 887 | " No warnings\n", 888 | "\n", 889 | "Ohio Southern\n", 890 | " No warnings\n", 891 | "\n", 892 | "Oklahoma Eastern\n", 893 | " nh_some_other_cv - 16.7 - med reliability - use caution\n", 894 | "\n", 895 | "Oklahoma Northern\n", 896 | " No warnings\n", 897 | "\n", 898 | "Oklahoma Western\n", 899 | " No warnings\n", 900 | "\n", 901 | "Oregon\n", 902 | " No warnings\n", 903 | "\n", 904 | "Pennsylvania Eastern\n", 905 | " No warnings\n", 906 | "\n", 907 | "Pennsylvania Middle\n", 908 | " nh_nhpi_cv - 17.8 - med reliability - use caution\n", 909 | "\n", 910 | "Pennsylvania Western\n", 911 | " No warnings\n", 912 | "\n", 913 | "Puerto Rico\n", 914 | " nh_amerind_cv - 53.5 - low reliability - use extreme caution\n", 915 | " nh_asian_cv - 17.3 - med reliability - use caution\n", 916 | " nh_nhpi_cv - 66.0 - low reliability - use extreme caution\n", 917 | "\n", 918 | "Rhode Island\n", 919 | " nh_nhpi_cv - 22.4 - med reliability - use caution\n", 920 | "\n", 921 | "South Carolina\n", 922 | " No warnings\n", 923 | "\n", 924 | "South Dakota\n", 925 | " nh_nhpi_cv - 23.9 - med reliability - use caution\n", 926 | " nh_some_other_cv - 33.3 - low reliability - use extreme caution\n", 927 | "\n", 928 | "Tennessee Eastern\n", 929 | " nh_nhpi_cv - 18.3 - med reliability - use caution\n", 930 | " nh_some_other_cv - 15.6 - med reliability - use caution\n", 931 | "\n", 932 | "Tennessee Middle\n", 933 | " No warnings\n", 934 | "\n", 935 | "Tennessee Western\n", 936 | " nh_nhpi_cv - 23.7 - med reliability - use caution\n", 937 | " nh_some_other_cv - 18.4 - med reliability - use caution\n", 938 | "\n", 939 | "Texas Eastern\n", 940 | " No warnings\n", 941 | "\n", 942 | "Texas Northern\n", 943 | " No warnings\n", 944 | "\n", 945 | "Texas Southern\n", 946 | " No warnings\n", 947 | "\n", 948 | "Texas Western\n", 949 | " No warnings\n", 950 | "\n", 951 | "Utah\n", 952 | " No warnings\n", 953 | "\n", 954 | "Vermont\n", 955 | " nh_nhpi_cv - 25.5 - med reliability - use caution\n", 956 | " nh_some_other_cv - 17.9 - med reliability - use caution\n", 957 | "\n", 958 | "Virginia Eastern\n", 959 | " No warnings\n", 960 | "\n", 961 | "Virginia Western\n", 962 | " nh_nhpi_cv - 18.4 - med reliability - use caution\n", 963 | "\n", 964 | "Washington Eastern\n", 965 | " nh_some_other_cv - 20.6 - med reliability - use caution\n", 966 | "\n", 967 | "Washington Western\n", 968 | " No warnings\n", 969 | "\n", 970 | "West Virginia Northern\n", 971 | " nh_nhpi_cv - 46.7 - low reliability - use extreme caution\n", 972 | " nh_some_other_cv - 18.6 - med reliability - use caution\n", 973 | "\n", 974 | "West Virginia Southern\n", 975 | " nh_nhpi_cv - 30.5 - low reliability - use extreme caution\n", 976 | " nh_some_other_cv - 22.1 - med reliability - use caution\n", 977 | "\n", 978 | "Wisconsin Eastern\n", 979 | " No warnings\n", 980 | "\n", 981 | "Wisconsin Western\n", 982 | " nh_nhpi_cv - 23.7 - med reliability - 
use caution\n", 983 | "\n", 984 | "Wyoming\n", 985 | " nh_nhpi_cv - 32.5 - low reliability - use extreme caution\n", 986 | " nh_some_other_cv - 34.3 - low reliability - use extreme caution\n", 987 | "\n" 988 | ] 989 | } 990 | ], 991 | "source": [ 992 | "print(\"Reviewing reliability of aggregated race by district\\n\")\n", 993 | "for idx, row in race_district_cvs.iterrows():\n", 994 | " warnings = []\n", 995 | " for col in race_district_cvs.columns[2:]: # iterate all the non-label columns\n", 996 | " if row[col] > 30:\n", 997 | " warnings.append(f\"{col:>17} - {row[col]:.1f} - low reliability - use extreme caution\")\n", 998 | " elif row[col] > 15:\n", 999 | " warnings.append(f\"{col:>17} - {row[col]:.1f} - med reliability - use caution\")\n", 1000 | " if row['state'] == row['district']: # simplify for single-district states\n", 1001 | " print(f\"{row['state']}\")\n", 1002 | " else:\n", 1003 | " print(f\"{row['state']} {row['district']}\")\n", 1004 | " if len(warnings) == 0:\n", 1005 | " print(\" No warnings\")\n", 1006 | " else:\n", 1007 | " for w in warnings:\n", 1008 | " print(f\" {w}\")\n", 1009 | " print(\"\")\n", 1010 | " " 1011 | ] 1012 | } 1013 | ], 1014 | "metadata": { 1015 | "kernelspec": { 1016 | "display_name": "Python 3", 1017 | "language": "python", 1018 | "name": "python3" 1019 | }, 1020 | "language_info": { 1021 | "codemirror_mode": { 1022 | "name": "ipython", 1023 | "version": 3 1024 | }, 1025 | "file_extension": ".py", 1026 | "mimetype": "text/x-python", 1027 | "name": "python", 1028 | "nbconvert_exporter": "python", 1029 | "pygments_lexer": "ipython3", 1030 | "version": "3.7.6" 1031 | } 1032 | }, 1033 | "nbformat": 4, 1034 | "nbformat_minor": 4 1035 | } 1036 | --------------------------------------------------------------------------------