├── census_parquet ├── __init__.py ├── tests │ ├── __init__.py │ ├── general_checks.py │ └── test_polygons_to_points.py ├── download_blocks.sh ├── download_boundaries.sh ├── download_population_stats.sh ├── cli.py ├── generate_synthetic_people.py ├── process_boundaries.py └── process_blocks.py ├── .github ├── pull_request_template.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature-proposal.md └── workflows │ └── pypi-publish.yml ├── setup.cfg ├── .gitignore ├── setup.py ├── CHANGELOG.md ├── LICENSE.txt ├── RELEASE.md ├── README.md └── CODE_OF_CONDUCT.md /census_parquet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /census_parquet/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license = MIT 3 | license_files = LICENSE.txt 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | -------------------------------------------------------------------------------- /census_parquet/download_blocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -w 0.5 -c -r -np -nH -nv -e robots=off -R "index.html*" --cut-dirs=3 https://www2.census.gov/geo/tiger/TIGER2020/TABBLOCK20/ 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # MacOS Files 2 | .DS_Store 3 | 4 | # Distribution / packaging 5 | build/ 6 | dist/ 7 | census_parquet.egg-info/ 8 | 9 | #Outputs and Downloads 10 | population_stats/ 11 | outputs/ 12 | boundary_outputs/ 13 | census_boundaries/ 14 | TABBLOCK20/ 15 | *.parquet -------------------------------------------------------------------------------- /census_parquet/download_boundaries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p census_boundaries 3 | cd census_boundaries 4 | wget https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_all_500k.zip 5 | unzip cb_2020_us_all_500k.zip 6 | rm cb_2020_us_all_500k.zip 7 | wget https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_nation_5m.zip -------------------------------------------------------------------------------- /census_parquet/tests/general_checks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import geopandas as gpd 3 | import dask_geopandas 4 | import pandas as pd 5 | 6 | def general_output_checks_pd(input_gdf: gpd.GeoDataFrame, output_df: pd.DataFrame): 7 | assert isinstance(output_df, pd.DataFrame) 8 | 9 | def general_output_checks_dask(input_gdf: dask_geopandas.GeoDataFrame, output: str): 10 | a = dask_geopandas.read_parquet(output) 11 | assert a.known_divisions 12 | 13 | -------------------------------------------------------------------------------- /census_parquet/download_population_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -w 0.5 -r -np -nH -nv -e robots=off -R "index.html*" --cut-dirs=4 https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/ 3 | mkdir -p population_stats 4 | find 01-Redistricting_File--PL_94-171 -name '*.pl.zip' -exec mv {} ./population_stats \; 5 | find 
./population_stats -name '*.pl.zip' -execdir unzip {} \; 6 | 7 | cd population_stats 8 | wget https://www2.census.gov/programs-surveys/decennial/rdo/about/2020-census-program/Phase3/SupportMaterials/2020_PLSummaryFile_FieldNames.xlsx 9 | cd - 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='census-parquet', 5 | version='0.0.10', 6 | packages=['census_parquet'], 7 | description='Tools for generating Parquet files from US Census 2020', 8 | author='makepath', 9 | url='https://github.com/makepath/census-parquet', 10 | entry_points={ 11 | 'console_scripts': ['run_census_parquet=census_parquet.cli:start', 'run_synthetic_people=census_parquet.cli:synthetic_people'] 12 | }, 13 | install_requires=[ 14 | 'click', 15 | 'dask_geopandas', 16 | 'openpyxl', 17 | 'pyarrow', 18 | ], 19 | package_data={ 20 | 'census_parquet': ['*.sh'], 21 | }, 22 | ) 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots to help explain your problem. 18 | 19 | **Desktop (please complete the following information):** 20 | - OS: [e.g. iOS] 21 | - Browser [e.g. chrome, safari] 22 | - Version [e.g. 22] 23 | 24 | **Smartphone (please complete the following information):** 25 | - Device: [e.g. iPhone6] 26 | - OS: [e.g. iOS8.1] 27 | - Browser [e.g. stock browser, safari] 28 | - Version [e.g. 
22] 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Census-Parquet Changelog 2 | ----------- 3 | ### Version 0.0.9 - 8 July 2022 4 | - Fixing parquet fail at stage 5 (#6) 5 | - Adding Race/ethnicity columns (#7) 6 | - Saving outputs to Geoparquet (#8) 7 | - Balancing partition sizes 8 | 9 | ### Version 0.0.8 - 4 January 2022 10 | - Better use of dask and general code improvements (#3) 11 | 12 | ### Version 0.0.7 - 20 September 2021 13 | - Added wget for nation outline 14 | 15 | ### Version 0.0.6 - 16 September 2021 16 | - Ensure shell scripts in both sdist and wheels 17 | 18 | ### Version 0.0.5 - 16 September 2021 19 | - Added .sh to manifest 20 | 21 | ### Version 0.0.4 - 16 September 2021 22 | - Use click to run all 5 scripts in order 23 | 24 | ### Version 0.0.3 - 15 September 2021 25 | - Small tweak to setup.py 26 | 27 | ### Version 0.0.2 - 15 September 2021 28 | - Created tagged release for PyPI 29 | 30 | ### Version 0.0.1 - 15 September 2021 31 | - Initial release of processing scripts 32 | -------------------------------------------------------------------------------- /census_parquet/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | from subprocess import run 4 | import sys 5 | 6 | from . 
@click.command()
def start():
    """Download US 2020 Census Data and convert to parquet files.

    Runs the five pipeline stages in order: download boundaries,
    population stats and blocks, then process boundaries and blocks.
    ``check=True`` makes a failed download script abort the pipeline
    (CalledProcessError) instead of silently continuing into the
    processing stages with missing data.
    """
    # Directory of the installed census_parquet package, where the
    # bundled download_*.sh scripts live (shipped via package_data).
    module_path = sys.modules['census_parquet'].__path__[0]

    click.echo('Stage 1: Download boundaries')
    run(os.path.join(module_path, 'download_boundaries.sh'), check=True)

    click.echo('Stage 2: Download population stats')
    run(os.path.join(module_path, 'download_population_stats.sh'), check=True)

    click.echo('Stage 3: Download blocks')
    run(os.path.join(module_path, 'download_blocks.sh'), check=True)

    click.echo('Stage 4: Process boundaries')
    process_boundaries.main()

    click.echo('Stage 5: Process blocks')
    process_blocks.main()


@click.command()
def synthetic_people():
    """Generate a point for each person within the census data."""
    click.echo('Generating Synthetic People')
    generate_synthetic_people.main()
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /census_parquet/tests/test_polygons_to_points.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import geopandas as gpd 4 | import pandas as pd 5 | import dask_geopandas 6 | from pygeos import convex_hull, Geometry, to_shapely 7 | from census_parquet.generate_synthetic_people import polygons_to_points 8 | from census_parquet.tests.general_checks import ( 9 | general_output_checks_dask, 10 | general_output_checks_pd, 11 | ) 12 | 13 | def create_test_gdf(): 14 | gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).to_crs(3857).drop(columns=["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]) 15 | gdf = gdf.iloc[[20, 23]] 16 | gdf["GEOID"] = gdf.index.values 17 | gdf["POP"] = 100*np.ones((2,),dtype=int) 18 | gdf["P0010003"] = 10*np.ones((2,),dtype=int) 19 | gdf["P0010004"] = 10*np.ones((2,),dtype=int) 20 | gdf["P0010005"] = 10*np.ones((2,),dtype=int) 21 | gdf["P0010006"] = 10*np.ones((2,),dtype=int) 22 | gdf["P0010007"] = 30*np.ones((2,),dtype=int) 23 | gdf["P0010008"] = 10*np.ones((2,),dtype=int) 24 | gdf["P0010009"] = 20*np.ones((2,),dtype=int) 25 | return gdf 26 | 27 | def test_polygons_to_points_gpd(): 28 | gdf = create_test_gdf() 29 | out = polygons_to_points(gdf) 30 | general_output_checks_pd(gdf, out) 31 | 32 | 33 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature proposal 3 | about: Suggest an idea 4 | title: '' 5 | labels: proposal 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Author of Proposal:** 11 | ## Reason or Problem 12 | Describe what the need for this new feature is or what problem this new feature will address. 13 | ## Proposal 14 | Description of the new feature, how it will be used, what it will fix, etc. 15 | 16 | **Design:** 17 | Include description of this feature's design with enough detail for those who are familiar enough with this project to understand the feature and how it could be implmented. This section should get into specifics of how the feature will be designed and implemented. 18 | 19 | **Usage:** 20 | Detailed instructions for this feature's use. 21 | 22 | **Value:** What value does the implementation of this new feature bring to census-parquet. 23 | ## Stakeholders and Impacts 24 | Who are the stakeholders in this update? Will you be implementing this new feature or will someone else? What is the potential impact of implementing this new feature? Specifically, what are some other components would be impacted? 25 | ## Drawbacks 26 | What are potential reasons why this feature should not be implemented? 27 | ## Alternatives 28 | Describe other solutions or features you have considered when coming up with this proposal. 29 | 30 | ## Unresolved Questions 31 | What are parts of this feature's design that are undecided. 32 | ## Additional Notes or Context 33 | Anything elses that is important to know for the implementation of this new feature. 
34 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | ## Release process 2 | 3 | ### Preparation 4 | - Create a new branch containing the following changes: 5 | - Update version number in setup.py 6 | - Update CHANGELOG.md with new version number and list of changes extracted from `git log`. 7 | - Commit changes 8 | - Check you can build the `sdist` and binary `wheel` using 9 | ```bash 10 | python setup.py sdist bdist_wheel 11 | ``` 12 | You may need to `pip install wheel` first. It should create a `.tar.gz` file and a `.whl` file in the `dist` directory. Check these are OK manually. 13 | - `git push` the branch. 14 | - Submit the branch as a PR to the `master` branch. 15 | - If the CI passes OK, merge the PR. 16 | 17 | ### Tag release 18 | - To sign the release you need a GPG key registered with your github account. See 19 | https://docs.github.com/en/authentication/managing-commit-signature-verification 20 | - Create new tag, with the correct version number, using: 21 | ```bash 22 | git tag -a v0.1.2 -s -m "Version 0.1.2" 23 | git push --tags 24 | ``` 25 | 26 | ### PyPI packages 27 | - These are automatically built and uploaded to PyPI via a github action when a new tag is pushed to the github repo. 28 | - Check that both an sdist (`.tar.gz` file) and wheel (`.whl` file) are available on PyPI at https://pypi.org/project/census-parquet 29 | - Check you can install the new version in a new virtual environment using `pip install census-parquet`. 30 | 31 | ### github release notes 32 | - Convert the tag into a release on github: 33 | - On the right-hand side of the github repo, click on `Releases`. 34 | - Click on `Draft a new release`. 35 | - Select the correct tag, and enter the title and description by copying and pasting from the CHANGELOG.md. 36 | - Click `Publish release`. 
name: Publish package to PyPI
on:
  push:
    tags:
      - '*'
jobs:
  publish:
    # The ubuntu-18.04 hosted runner has been retired by GitHub Actions;
    # use the rolling image so the workflow can still be scheduled.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Set up Python 3.9
        uses: actions/setup-python@v1
        with:
          # Was 3.8, contradicting the step name above; keep them in sync.
          python-version: 3.9
      - name: Get release version
        run: |
          echo "CHANGELOG_VERSION=$(cat CHANGELOG.md | grep -oP '(?<=###\s)(.*)(?=\s\-)' | head -n 1 | sed 's/Version\s/v/')" >> $GITHUB_ENV
          echo "TAG_VERSION=`echo $(git describe --tags --abbrev=0)`" >> $GITHUB_ENV
      - name: Check changelog release version
        if: ${{ env.TAG_VERSION != env.CHANGELOG_VERSION }}
        run: |
          echo "CHANGELOG_VERSION($CHANGELOG_VERSION) is different from TAG_VERSION($TAG_VERSION)"
          exit 1
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pyct \
                      wheel \
                      setuptools
      - name: Get all git tags
        run: git fetch --tags -f
      - name: Build package
        run: |
          python setup.py sdist bdist_wheel
      - name: Get package size
        run: echo "PKG_SIZE=$(find dist -maxdepth 1 -regex '.*gz' | xargs stat --format='%s')" >> $GITHUB_ENV
      - name: Check package size
        if: ${{ env.PKG_SIZE > 1e+8 }}
        run: |
          echo "PKG_SIZE($PKG_SIZE bytes) is greater than 100MB"
          exit 1
      - name: Publish package
        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@master
        with:
          skip_existing: true
          password: ${{ secrets.PYPI_API_TOKEN }}
Census Data](https://www.census.gov/programs-surveys/decennial-census/decade/2020/2020-census-main.html). 3 | 4 | 5 | ## Installation 6 | 7 | To use the data download shell script files first install [wget](https://en.wikipedia.org/wiki/Wget). 8 | 9 | To install the census-parquet package use 10 | ``` 11 | pip install census-parquet 12 | ``` 13 | 14 | This will also install the required Python dependencies which are: 15 | 1. [click](https://github.com/pallets/click) 16 | 2. [dask](https://docs.dask.org/en/latest/install.html) 17 | 3. [dask_geopandas](https://github.com/geopandas/dask-geopandas) 18 | 4. [geopandas](https://geopandas.org/getting_started/install.html) 19 | 5. [numpy](https://numpy.org/install/) 20 | 6. [openpyxl](https://openpyxl.readthedocs.io/en/stable/#installation) 21 | 7. [pandas](https://pandas.pydata.org/docs/getting_started/install.html) 22 | 8. [pyarrow](https://arrow.apache.org/docs/python/install.html) 23 | 24 | ## Usage 25 | To run the census-parquet code simply use 26 | ``` 27 | run_census_parquet 28 | ``` 29 | 30 | This runs the following scripts in order: 31 | 1. `download_boundaries.sh` - This script downloads the Census Boundary data needed to run `process_boundaries.py` 32 | 2. `download_population_stats.sh` - This script downloads population stat data needed for process_blocks.py 33 | 3. `download_blocks.sh` - This script downloads the Census Block data needed to run process_blocks.py 34 | 4. `process_boundaries.py` - This script processes the Census Boundary data and creates parquet files. The parquet files will be output into a `boundary_outputs` folder. 35 | 5. `process_blocks.py` - This script processes Census Block data and creates parquet files. The final combined parquet file will have the name `tl_2020_FULL_tabblock20.parquet`. 
def polygons_to_points(gdf):
    """Convert census-block polygons into one synthetic point per person.

    For every block in *gdf* (indexed by GEOID, with a ``POP`` total and
    the P0010003..P0010009 race/ethnicity counts), draw ``POP`` uniformly
    random points inside the block's geometry via rejection sampling and
    label each point with a race code.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        One partition of the census-blocks table.  Assumes the race
        counts P0010003..P0010009 sum to POP for each row -- TODO confirm
        against the upstream census processing; a mismatch would make the
        per-row DataFrame construction fail on unequal column lengths.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` and ``R`` (race code), indexed by GEOID.
        Empty (but correctly typed) when *gdf* has no populated blocks.
    """
    rng = np.random.default_rng()  # previously created but never used
    # Correctly-typed empty result, shared by both early-return paths.
    empty = pd.DataFrame(
        {"x": pd.Series(dtype="float64"),
         "y": pd.Series(dtype="float64"),
         "GEOID": pd.Series(dtype="object"),
         "R": pd.Series(dtype="object")}
    ).set_index("GEOID")
    if gdf.empty:
        # The old code emitted a fake person at (0, 0) with GEOID "00"
        # here, polluting the output for every empty partition.
        return empty
    keep_points = []
    for index, row in gdf.iterrows():
        pop = int(row["POP"])
        if pop == 0:
            continue
        # Race/ethnicity labels from the 2020 P.L. redistricting counts.
        races = (["w"] * int(row["P0010003"])
                 + ["b"] * int(row["P0010004"])
                 + ["n"] * int(row["P0010005"])
                 + ["a"] * int(row["P0010006"])
                 + ["hpi"] * int(row["P0010007"])
                 + ["o"] * int(row["P0010008"])
                 + ["m"] * int(row["P0010009"]))
        # Rejection sampling: draw inside the bounding box, keep points
        # inside the polygon, and enlarge the draw until we have >= pop.
        # The bounds are loop-invariant, so compute them once.
        x_min, y_min, x_max, y_max = row["geometry"].bounds
        len_within = 0
        it = 1
        while len_within < pop:
            xs = rng.uniform(x_min, x_max, pop * it)
            ys = rng.uniform(y_min, y_max, pop * it)
            gdf_points = gpd.GeoSeries(gpd.points_from_xy(xs, ys), crs=3857)
            within_points = gdf_points.clip(row["geometry"])
            len_within = len(within_points)
            it += 1
        keep_points.append(pd.DataFrame(
            {
                "x": within_points.iloc[:pop].x,
                "y": within_points.iloc[:pop].y,
                "GEOID": [index] * pop,
                "R": races,
            }
        ))
    if not keep_points:
        # Every block in this partition had POP == 0; the old code then
        # hit pd.concat([]) which raises ValueError.
        return empty
    return pd.concat(keep_points, ignore_index=True).set_index("GEOID")
dask_geopandas.read_parquet("outputs/census_blocks_pops.parquet", calculate_divisions=True) 63 | meta_df = pd.DataFrame( 64 | {"x": [float(0.0)], 65 | "y": [float(0.0)], 66 | "GEOID": ["00"], 67 | "R":[" "]} 68 | ).set_index("GEOID") 69 | with ProgressBar(): 70 | blocks_ddf.map_partitions(polygons_to_points, 71 | meta=meta_df, 72 | ).to_parquet("outputs/synthetic_people.parquet", write_metadata_file=True) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /census_parquet/process_boundaries.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | 4 | import dask 5 | import geopandas 6 | import pandas as pd 7 | 8 | 9 | warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*") 10 | 11 | DTYPES = { 12 | "AFFGEOID": "string", 13 | "AFFGEOID20": "string", 14 | "AIANNHCE": "int", 15 | "AIANNHNS": "int", 16 | "ALAND": "int", 17 | "ALAND20": "int", 18 | "AWATER": "int", 19 | "AWATER20": "int", 20 | "ANRCFP": "int", 21 | "ANRCNS": "int", 22 | "BLKGRPCE": "category", 23 | "CBSAFP": "int", 24 | "CD116FP": "int", 25 | "CDSESSN": "int", 26 | "CNECTAFP": "category", 27 | "CONCTYFP": "int", 28 | "CONCTYNS": "int", 29 | "COUNTYNS": "string", 30 | "COUNTYFP": "category", 31 | "COUNTYFP20": "category", 32 | "COUSUBFP": "category", 33 | "COUSUBNS": "string", 34 | # "CSAFP": pd.Int64Dtype(), # can't astype object -> Int64 35 | "DIVISIONCE": "int", 36 | "ELSDLEA": "int", 37 | "GEOID": "string", 38 | "GEOID20": "string", 39 | "LSAD": "category", 40 | "LSAD20": "category", 41 | "LSY": "category", 42 | "METDIVFP": "int", 43 | "NAME": "string", 44 | "NAME20": "string", 45 | "NAMELSAD": "string", 46 | "NAMELSAD20": "string", 47 | "NAMELSADCO": "category", 48 | "NCTADVFP": "int", 49 | "NECTAFP": "int", 50 | "PARTFLG": "category", 51 | "PLACEFP": "int", 52 | "PLACENS": "int", 53 | "REGIONCE": "int", 54 
| "SCSDLEA": "int", 55 | "SLDLST": "string", 56 | "SLDUST": "string", 57 | "STATE_NAME": "category", 58 | "STATEFP": "category", 59 | "STATEFP20": "category", 60 | "STATENS": "int", 61 | "STUSPS": "category", 62 | "SUBMCDFP": "int", 63 | "SUBMCDNS": "int", 64 | "TBLKGPCE": "category", 65 | "TRACTCE": "int", 66 | "TTRACTCE": "category", 67 | "TRSUBCE": "int", 68 | "TRSUBNS": "int", 69 | "UNSDLEA": "int", 70 | "VTDI20": "category", 71 | "VTDST20": "string", 72 | } 73 | 74 | 75 | def process_boundary_file(path: Path) -> Path: 76 | print(f"Started {path}") 77 | gdf = geopandas.read_file(path, driver="SHP") 78 | gdf = gdf.astype({k: DTYPES[k] for k in set(gdf.columns) & set(DTYPES)}) 79 | if "CSAFP" in gdf.columns: 80 | gdf["CSAFP"] = gdf["CSAFP"].astype("float64").astype(pd.Int64Dtype()) 81 | 82 | output = Path(path).parent / "boundary_outputs" / path.with_suffix(".parquet").name 83 | output.parent.mkdir(parents=True, exist_ok=True) 84 | gdf.to_parquet(output, index=False) 85 | print(f"Finished {output}") 86 | return output 87 | 88 | 89 | def main(): 90 | files = list(Path("census_boundaries").glob("*.zip")) 91 | print(f"Found {len(files)} files") 92 | results = [dask.delayed(process_boundary_file)(file) for file in files] 93 | dask.compute(results) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Citizen Code of Conduct 2 | 3 | ## 1. Purpose 4 | 5 | A primary goal of census-parquet is to be inclusive to the largest number of contributors, with the most varied and diverse backgrounds possible. As such, we are committed to providing a friendly, safe and welcoming environment for all, regardless of gender, sexual orientation, ability, ethnicity, socioeconomic status, and religion (or lack thereof). 
6 | 7 | This code of conduct outlines our expectations for all those who participate in our community, as well as the consequences for unacceptable behavior. 8 | 9 | We invite all those who participate in census-parquet to help us create safe and positive experiences for everyone. 10 | 11 | ## 2. Open [Source/Culture/Tech] Citizenship 12 | 13 | A supplemental goal of this Code of Conduct is to increase open [source/culture/tech] citizenship by encouraging participants to recognize and strengthen the relationships between our actions and their effects on our community. 14 | 15 | Communities mirror the societies in which they exist and positive action is essential to counteract the many forms of inequality and abuses of power that exist in society. 16 | 17 | If you see someone who is making an extra effort to ensure our community is welcoming, friendly, and encourages all participants to contribute to the fullest extent, we want to know. 18 | 19 | ## 3. Expected Behavior 20 | 21 | The following behaviors are expected and requested of all community members: 22 | 23 | * Participate in an authentic and active way. In doing so, you contribute to the health and longevity of this community. 24 | * Exercise consideration and respect in your speech and actions. 25 | * Attempt collaboration before conflict. 26 | * Refrain from demeaning, discriminatory, or harassing behavior and speech. 27 | * Be mindful of your surroundings and of your fellow participants. Alert community leaders if you notice a dangerous situation, someone in distress, or violations of this Code of Conduct, even if they seem inconsequential. 28 | * Remember that community event venues may be shared with members of the public; please be respectful to all patrons of these locations. 29 | 30 | ## 4. 
Unacceptable Behavior 31 | 32 | The following behaviors are considered harassment and are unacceptable within our community: 33 | 34 | * Violence, threats of violence or violent language directed against another person. 35 | * Sexist, racist, homophobic, transphobic, ableist or otherwise discriminatory jokes and language. 36 | * Posting or displaying sexually explicit or violent material. 37 | * Posting or threatening to post other people's personally identifying information ("doxing"). 38 | * Personal insults, particularly those related to gender, sexual orientation, race, religion, or disability. 39 | * Inappropriate photography or recording. 40 | * Inappropriate physical contact. You should have someone's consent before touching them. 41 | * Unwelcome sexual attention. This includes sexualized comments or jokes, inappropriate touching, groping, and unwelcomed sexual advances. 42 | * Deliberate intimidation, stalking or following (online or in-person). 43 | * Advocating for, or encouraging, any of the above behavior. 44 | * Sustained disruption of community events, including talks and presentations. 45 | 46 | ## 5. Weapons Policy 47 | 48 | No weapons will be allowed at census-parquet events, community spaces, or in other spaces covered by the scope of this Code of Conduct. Weapons include but are not limited to guns, explosives (including fireworks), and large knives such as those used for hunting or display, as well as any other item used for the purpose of causing injury or harm to others. Anyone seen in possession of one of these items will be asked to leave immediately, and will only be allowed to return without the weapon. Community members are further expected to comply with all state and local laws on this matter. 49 | 50 | ## 6. Consequences of Unacceptable Behavior 51 | 52 | Unacceptable behavior from any community member, including sponsors and those with decision-making authority, will not be tolerated. 
53 | 54 | Anyone asked to stop unacceptable behavior is expected to comply immediately. 55 | 56 | If a community member engages in unacceptable behavior, the community organizers may take any action they deem appropriate, up to and including a temporary ban or permanent expulsion from the community without warning (and without refund in the case of a paid event). 57 | 58 | ## 7. Reporting Guidelines 59 | 60 | If you are subject to or witness unacceptable behavior, or have any other concerns, please notify a community organizer as soon as possible. 61 | 62 | Additionally, community organizers are available to help community members engage with local law enforcement or to otherwise help those experiencing unacceptable behavior feel safe. In the context of in-person events, organizers will also provide escorts as desired by the person experiencing distress. 63 | 64 | ## 8. Addressing Grievances 65 | 66 | If you feel you have been falsely or unfairly accused of violating this Code of Conduct, you should notify makepath with a concise description of your grievance. Your grievance will be handled in accordance with our existing governing policies. 67 | 68 | 69 | 70 | ## 9. Scope 71 | 72 | We expect all community participants (contributors, paid or otherwise; sponsors; and other guests) to abide by this Code of Conduct in all community venues--online and in-person--as well as in all one-on-one communications pertaining to community business. 73 | 74 | This code of conduct and its related procedures also applies to unacceptable behavior occurring outside the scope of community activities when such behavior has the potential to adversely affect the safety and well-being of community members. 75 | 76 | ## 10. Contact info 77 | 78 | Brendan Collins (brendan@makepath.com) 79 | 80 | ## 11. 
License and attribution 81 | 82 | The Citizen Code of Conduct is distributed by [Stumptown Syndicate](http://stumptownsyndicate.org) under a [Creative Commons Attribution-ShareAlike license](http://creativecommons.org/licenses/by-sa/3.0/). 83 | 84 | Portions of text derived from the [Django Code of Conduct](https://www.djangoproject.com/conduct/) and the [Geek Feminism Anti-Harassment Policy](http://geekfeminism.wikia.com/wiki/Conference_anti-harassment/Policy). 85 | 86 | _Revision 2.3. Posted 6 March 2017._ 87 | 88 | _Revision 2.2. Posted 4 February 2016._ 89 | 90 | _Revision 2.1. Posted 23 June 2014._ 91 | 92 | _Revision 2.0, adopted by the [Stumptown Syndicate](http://stumptownsyndicate.org) board on 10 January 2013. Posted 17 March 2013._ 93 | -------------------------------------------------------------------------------- /census_parquet/process_blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processing Census Blocks. 3 | 4 | We create two logical tables: 5 | 6 | 1. Geometries only 7 | 2. Populations only 8 | 9 | This is driven by the Census Bureau not providing population statistics 10 | for territories (yet?). 
11 | """ 12 | from pathlib import Path 13 | import warnings 14 | 15 | import dask 16 | import dask.dataframe as dd 17 | import dask_geopandas 18 | from dask.diagnostics import ProgressBar 19 | import geopandas 20 | import pandas as pd 21 | warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*") 22 | 23 | 24 | statelookup = { 25 | "01": "AL", 26 | "02": "AK", 27 | "04": "AZ", 28 | "05": "AR", 29 | "06": "CA", 30 | "08": "CO", 31 | "09": "CT", 32 | "10": "DE", 33 | "11": "DC", 34 | "12": "FL", 35 | "13": "GA", 36 | "15": "HI", 37 | "16": "ID", 38 | "17": "IL", 39 | "18": "IN", 40 | "19": "IA", 41 | "20": "KS", 42 | "21": "KY", 43 | "22": "LA", 44 | "23": "ME", 45 | "24": "MD", 46 | "25": "MA", 47 | "26": "MI", 48 | "27": "MN", 49 | "28": "MS", 50 | "29": "MO", 51 | "30": "MT", 52 | "31": "NE", 53 | "32": "NV", 54 | "33": "NH", 55 | "34": "NJ", 56 | "35": "NM", 57 | "36": "NY", 58 | "37": "NC", 59 | "38": "ND", 60 | "39": "OH", 61 | "40": "OK", 62 | "41": "OR", 63 | "42": "PA", 64 | "44": "RI", 65 | "45": "SC", 66 | "46": "SD", 67 | "47": "TN", 68 | "48": "TX", 69 | "49": "UT", 70 | "50": "VT", 71 | "51": "VA", 72 | "53": "WA", 73 | "54": "WV", 74 | "55": "WI", 75 | "56": "WY", 76 | "72": "PR", 77 | } 78 | SUMMARY_TABLE = "./population_stats/2020_PLSummaryFile_FieldNames.xlsx" 79 | 80 | 81 | def process_pop(file): 82 | FIPS = file.stem.split("_")[2] 83 | ABBR = statelookup[FIPS] 84 | 85 | root = Path("population_stats") 86 | state_1 = root / (ABBR.lower() + "000012020.pl") 87 | state_geo = root / (ABBR.lower() + "geo2020.pl") 88 | 89 | seg_1_header_df = pd.read_excel( 90 | SUMMARY_TABLE, sheet_name="2020 P.L. Segment 1 Fields" 91 | ) 92 | 93 | geo_header_df = pd.read_excel( 94 | SUMMARY_TABLE, sheet_name="2020 P.L. 
Geoheader Fields" 95 | ) 96 | 97 | seg_1_df = pd.read_csv( 98 | state_1, 99 | encoding="latin-1", 100 | delimiter="|", 101 | names=seg_1_header_df.columns.to_list(), 102 | low_memory=False, 103 | ).drop(columns=["STUSAB"]) 104 | 105 | geo_df = pd.read_csv( 106 | state_geo, 107 | encoding="latin-1", 108 | delimiter="|", 109 | names=geo_header_df.columns.to_list(), 110 | low_memory=False, 111 | ) 112 | geo_df = geo_df[geo_df["SUMLEV"] == 750] 113 | 114 | block_df = pd.merge( 115 | left=geo_df[["LOGRECNO", "GEOID", "STUSAB"]], 116 | right=seg_1_df, 117 | how="left", 118 | on="LOGRECNO", 119 | ).drop(columns=["LOGRECNO", "CHARITER", "STUSAB", "FILEID", "CIFSN"]) 120 | block_df["GEOID"] = block_df["GEOID"].str.replace("7500000US", "") 121 | block_df = block_df.set_index("GEOID").sort_index() 122 | 123 | assert block_df.index.is_unique 124 | return block_df 125 | 126 | 127 | def process_geo(file): 128 | dtypes = { 129 | "STATEFP": "int", 130 | "COUNTYFP": "int", 131 | "TRACTCE": "int", 132 | "BLOCKCE": "int", 133 | "HOUSING": "int", 134 | "POP": "int" 135 | } 136 | 137 | gdf = ( 138 | geopandas.read_file(file, driver="SHP") 139 | .drop(columns=["MTFCC20", "UR20", "UACE20", "UATYPE20", "FUNCSTAT20", "NAME20"]) 140 | .rename(columns=lambda x: x.rstrip("20")) 141 | .astype(dtypes) 142 | .set_index("GEOID") 143 | ) 144 | gdf["INTPTLON"] = pd.to_numeric(gdf["INTPTLON"]) 145 | gdf["INTPTLAT"] = pd.to_numeric(gdf["INTPTLAT"]) 146 | gdf = gdf.replace([None],0) 147 | 148 | return gdf 149 | 150 | 151 | def process(file): 152 | geo = process_geo(file) 153 | FIPS = file.stem.split("_")[2] 154 | 155 | if FIPS in statelookup: 156 | pop = process_pop(file) 157 | result = pd.merge(geo, pop) 158 | assert len(result) == len(geo) 159 | else: 160 | pop = None 161 | 162 | return file, geo, pop 163 | 164 | def process_pop_geo(file): 165 | geo = process_geo(file) 166 | FIPS = file.stem.split("_")[2] 167 | block_ddf = dask_geopandas.from_geopandas(geo, npartitions=1) 168 | output_geo = 
Path(f"tmp/geo/{file.stem.split('_')[2]}.parquet") 169 | output_geo.parent.mkdir(parents=True, exist_ok=True) 170 | block_ddf.to_parquet(output_geo) 171 | if FIPS in statelookup: 172 | pop = process_pop(file) 173 | pop_ddf = dd.from_pandas(pop, npartitions=1) 174 | output_pop = Path(f"tmp/pop/{FIPS}.parquet") 175 | output_pop.parent.mkdir(parents=True, exist_ok=True) 176 | pop_ddf.to_parquet(output_pop) 177 | result = pd.merge(geo,pop,left_index=True,right_index=True) 178 | result = result[['POP', 179 | 'P0010003', 180 | 'P0010004', 181 | 'P0010005', 182 | 'P0010006', 183 | 'P0010007', 184 | 'P0010008', 185 | 'P0010009', 186 | 'geometry'] 187 | ] 188 | result = result.to_crs(3857) 189 | result = dask_geopandas.from_geopandas(result,npartitions=1) 190 | assert len(result) == len(geo) 191 | output = Path(f"tmp/comb/{file.stem.split('_')[2]}.parquet") 192 | output.parent.mkdir(parents=True,exist_ok=True) 193 | result.to_parquet(output) 194 | else: 195 | pop = None 196 | return output, output_pop, output_geo 197 | 198 | def main(): 199 | files = list(Path("TABBLOCK20").glob("*.zip")) 200 | 201 | combs = [dask.delayed(process_pop_geo)(file) 202 | for file in files 203 | if file.stem.split("_")[2] in statelookup 204 | ] 205 | 206 | print("combining geo and pops") 207 | with ProgressBar(): 208 | outs = dask.compute(*combs) 209 | 210 | comb_files = [x[0] for x in outs] 211 | pop_files = [x[1] for x in outs] 212 | geo_files = [x[2] for x in outs] 213 | pop = dd.concat([dd.read_parquet(f) for f in sorted(pop_files)]) 214 | geo = dd.concat([dask_geopandas.read_parquet(f) for f in sorted(geo_files)]) 215 | 216 | comb = dd.concat([dask_geopandas.read_parquet(f) for f in sorted(comb_files)]) 217 | 218 | Path("outputs").mkdir(exist_ok=True) 219 | print("repartitioning combined files into like sizes") 220 | with ProgressBar(): 221 | comb = comb.repartition(partition_size="10MB") 222 | 223 | print("spatial partitioning combined files") 224 | with ProgressBar(): 225 | 
comb.calculate_spatial_partitions() 226 | 227 | print("finalizing census blocks and population data") 228 | with ProgressBar(): 229 | comb.to_parquet("outputs/census_blocks_pops.parquet", write_metadata_file=True) 230 | 231 | print("finalizing population files") 232 | with ProgressBar(): 233 | pop.to_parquet("outputs/census_population.parquet", write_metadata_file=True) 234 | 235 | print("computing spatial partitions for geo files") 236 | with ProgressBar(): 237 | geo.calculate_spatial_partitions() 238 | 239 | print("finalizing geo files") 240 | with ProgressBar(): 241 | geo.to_parquet("outputs/census_blocks_geo.parquet", write_metadata_file=True) 242 | 243 | print("validating") 244 | a = dd.read_parquet("outputs/census_population.parquet", calculate_divisions=True) 245 | assert a.known_divisions 246 | 247 | b = dask_geopandas.read_parquet("outputs/census_blocks_geo.parquet", calculate_divisions=True) 248 | assert b.known_divisions 249 | 250 | c = dask_geopandas.read_parquet("outputs/census_blocks_pops.parquet", calculate_divisions=True) 251 | assert c.known_divisions 252 | 253 | print("complete") 254 | 255 | if __name__ == "__main__": 256 | main() 257 | --------------------------------------------------------------------------------