├── census_parquet ├── __init__.py ├── tests │ ├── __init__.py │ ├── general_checks.py │ └── test_polygons_to_points.py ├── download_blocks.sh ├── download_boundaries.sh ├── download_population_stats.sh ├── cli.py ├── generate_synthetic_people.py ├── process_boundaries.py └── process_blocks.py ├── .github ├── pull_request_template.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature-proposal.md └── workflows │ └── pypi-publish.yml ├── setup.cfg ├── .gitignore ├── setup.py ├── CHANGELOG.md ├── LICENSE.txt ├── RELEASE.md ├── README.md └── CODE_OF_CONDUCT.md /census_parquet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /census_parquet/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license = MIT 3 | license_files = LICENSE.txt 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | -------------------------------------------------------------------------------- /census_parquet/download_blocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -w 0.5 -c -r -np -nH -nv -e robots=off -R "index.html*" --cut-dirs=3 https://www2.census.gov/geo/tiger/TIGER2020/TABBLOCK20/ 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # MacOS Files 2 | .DS_Store 3 | 4 | # Distribution / packaging 5 | build/ 6 | dist/ 7 | census_parquet.egg-info/ 8 | 9 | #Outputs and Downloads 10 | population_stats/ 11 | outputs/ 12 | boundary_outputs/ 13 | census_boundaries/ 14 | TABBLOCK20/ 15 | *.parquet -------------------------------------------------------------------------------- /census_parquet/download_boundaries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p census_boundaries 3 | cd census_boundaries 4 | wget https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_all_500k.zip 5 | unzip cb_2020_us_all_500k.zip 6 | rm cb_2020_us_all_500k.zip 7 | wget https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_nation_5m.zip -------------------------------------------------------------------------------- /census_parquet/tests/general_checks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import geopandas as gpd 3 | import dask_geopandas 4 | import pandas as pd 5 | 6 | def general_output_checks_pd(input_gdf: gpd.GeoDataFrame, output_df: pd.DataFrame): 7 | assert isinstance(output_df, pd.DataFrame) 8 | 9 | def general_output_checks_dask(input_gdf: dask_geopandas.GeoDataFrame, output: str): 10 | a = dask_geopandas.read_parquet(output) 11 | assert a.known_divisions 12 | 13 | -------------------------------------------------------------------------------- /census_parquet/download_population_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -w 0.5 -r -np -nH -nv -e robots=off -R "index.html*" --cut-dirs=4 https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/ 3 | mkdir -p population_stats 4 | find 01-Redistricting_File--PL_94-171 -name '*.pl.zip' -exec mv {} ./population_stats \; 5 | find 
./population_stats -name '*.pl.zip' -execdir unzip {} \; 6 | 7 | cd population_stats 8 | wget https://www2.census.gov/programs-surveys/decennial/rdo/about/2020-census-program/Phase3/SupportMaterials/2020_PLSummaryFile_FieldNames.xlsx 9 | cd - 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='census-parquet', 5 | version='0.0.10', 6 | packages=['census_parquet'], 7 | description='Tools for generating Parquet files from US Census 2020', 8 | author='makepath', 9 | url='https://github.com/makepath/census-parquet', 10 | entry_points={ 11 | 'console_scripts': ['run_census_parquet=census_parquet.cli:start', 'run_synthetic_people=census_parquet.cli:synthetic_people'] 12 | }, 13 | install_requires=[ 14 | 'click', 15 | 'dask_geopandas', 16 | 'openpyxl', 17 | 'pyarrow', 18 | ], 19 | package_data={ 20 | 'census_parquet': ['*.sh'], 21 | }, 22 | ) 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots to help explain your problem. 18 | 19 | **Desktop (please complete the following information):** 20 | - OS: [e.g. iOS] 21 | - Browser [e.g. chrome, safari] 22 | - Version [e.g. 22] 23 | 24 | **Smartphone (please complete the following information):** 25 | - Device: [e.g. iPhone6] 26 | - OS: [e.g. iOS8.1] 27 | - Browser [e.g. stock browser, safari] 28 | - Version [e.g. 
22] 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Census-Parquet Changelog 2 | ----------- 3 | ### Version 0.0.9 - 8 July 2022 4 | - Fixing parquet fail at stage 5 (#6) 5 | - Adding Race/ethnicity columns (#7) 6 | - Saving outputs to Geoparquet (#8) 7 | - Balancing partition sizes 8 | 9 | ### Version 0.0.8 - 4 January 2022 10 | - Better use of dask and general code improvements (#3) 11 | 12 | ### Version 0.0.7 - 20 September 2021 13 | - Added wget for nation outline 14 | 15 | ### Version 0.0.6 - 16 September 2021 16 | - Ensure shell scripts in both sdist and wheels 17 | 18 | ### Version 0.0.5 - 16 September 2021 19 | - Added .sh to manifest 20 | 21 | ### Version 0.0.4 - 16 September 2021 22 | - Use click to run all 5 scripts in order 23 | 24 | ### Version 0.0.3 - 15 September 2021 25 | - Small tweak to setup.py 26 | 27 | ### Version 0.0.2 - 15 September 2021 28 | - Created tagged release for PyPI 29 | 30 | ### Version 0.0.1 - 15 September 2021 31 | - Initial release of processing scripts 32 | -------------------------------------------------------------------------------- /census_parquet/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | from subprocess import run 4 | import sys 5 | 6 | from . 
@click.command()
def start():
    """Download US 2020 Census Data and convert to parquet files.

    Runs the five pipeline stages in order: download boundaries,
    population stats and blocks, then process boundaries and blocks.
    ``check=True`` makes a failed download script abort the pipeline
    (CalledProcessError) instead of silently continuing into the
    processing stages with missing data.
    """
    # Directory of the installed census_parquet package, where the
    # bundled download_*.sh scripts live (shipped via package_data).
    module_path = sys.modules['census_parquet'].__path__[0]

    click.echo('Stage 1: Download boundaries')
    run(os.path.join(module_path, 'download_boundaries.sh'), check=True)

    click.echo('Stage 2: Download population stats')
    run(os.path.join(module_path, 'download_population_stats.sh'), check=True)

    click.echo('Stage 3: Download blocks')
    run(os.path.join(module_path, 'download_blocks.sh'), check=True)

    click.echo('Stage 4: Process boundaries')
    process_boundaries.main()

    click.echo('Stage 5: Process blocks')
    process_blocks.main()


@click.command()
def synthetic_people():
    """Generate a point for each person within the census data."""
    click.echo('Generating Synthetic People')
    generate_synthetic_people.main()
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /census_parquet/tests/test_polygons_to_points.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import geopandas as gpd 4 | import pandas as pd 5 | import dask_geopandas 6 | from pygeos import convex_hull, Geometry, to_shapely 7 | from census_parquet.generate_synthetic_people import polygons_to_points 8 | from census_parquet.tests.general_checks import ( 9 | general_output_checks_dask, 10 | general_output_checks_pd, 11 | ) 12 | 13 | def create_test_gdf(): 14 | gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).to_crs(3857).drop(columns=["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]) 15 | gdf = gdf.iloc[[20, 23]] 16 | gdf["GEOID"] = gdf.index.values 17 | gdf["POP"] = 100*np.ones((2,),dtype=int) 18 | gdf["P0010003"] = 10*np.ones((2,),dtype=int) 19 | gdf["P0010004"] = 10*np.ones((2,),dtype=int) 20 | gdf["P0010005"] = 10*np.ones((2,),dtype=int) 21 | gdf["P0010006"] = 10*np.ones((2,),dtype=int) 22 | gdf["P0010007"] = 30*np.ones((2,),dtype=int) 23 | gdf["P0010008"] = 10*np.ones((2,),dtype=int) 24 | gdf["P0010009"] = 20*np.ones((2,),dtype=int) 25 | return gdf 26 | 27 | def test_polygons_to_points_gpd(): 28 | gdf = create_test_gdf() 29 | out = polygons_to_points(gdf) 30 | general_output_checks_pd(gdf, out) 31 | 32 | 33 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature proposal 3 | about: Suggest an idea 4 | title: '' 5 | labels: proposal 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Author of Proposal:** 11 | ## Reason or Problem 12 | Describe what the need for this new feature is or what problem this new feature will address. 13 | ## Proposal 14 | Description of the new feature, how it will be used, what it will fix, etc. 15 | 16 | **Design:** 17 | Include description of this feature's design with enough detail for those who are familiar enough with this project to understand the feature and how it could be implmented. This section should get into specifics of how the feature will be designed and implemented. 18 | 19 | **Usage:** 20 | Detailed instructions for this feature's use. 21 | 22 | **Value:** What value does the implementation of this new feature bring to census-parquet. 23 | ## Stakeholders and Impacts 24 | Who are the stakeholders in this update? Will you be implementing this new feature or will someone else? What is the potential impact of implementing this new feature? Specifically, what are some other components would be impacted? 25 | ## Drawbacks 26 | What are potential reasons why this feature should not be implemented? 27 | ## Alternatives 28 | Describe other solutions or features you have considered when coming up with this proposal. 29 | 30 | ## Unresolved Questions 31 | What are parts of this feature's design that are undecided. 32 | ## Additional Notes or Context 33 | Anything elses that is important to know for the implementation of this new feature. 
34 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | ## Release process 2 | 3 | ### Preparation 4 | - Create a new branch containing the following changes: 5 | - Update version number in setup.py 6 | - Update CHANGELOG.md with new version number and list of changes extracted from `git log`. 7 | - Commit changes 8 | - Check you can build the `sdist` and binary `wheel` using 9 | ```bash 10 | python setup.py sdist bdist_wheel 11 | ``` 12 | You may need to `pip install wheel` first. It should create a `.tar.gz` file and a `.whl` file in the `dist` directory. Check these are OK manually. 13 | - `git push` the branch. 14 | - Submit the branch as a PR to the `master` branch. 15 | - If the CI passes OK, merge the PR. 16 | 17 | ### Tag release 18 | - To sign the release you need a GPG key registered with your github account. See 19 | https://docs.github.com/en/authentication/managing-commit-signature-verification 20 | - Create new tag, with the correct version number, using: 21 | ```bash 22 | git tag -a v0.1.2 -s -m "Version 0.1.2" 23 | git push --tags 24 | ``` 25 | 26 | ### PyPI packages 27 | - These are automatically built and uploaded to PyPI via a github action when a new tag is pushed to the github repo. 28 | - Check that both an sdist (`.tar.gz` file) and wheel (`.whl` file) are available on PyPI at https://pypi.org/project/census-parquet 29 | - Check you can install the new version in a new virtual environment using `pip install census-parquet`. 30 | 31 | ### github release notes 32 | - Convert the tag into a release on github: 33 | - On the right-hand side of the github repo, click on `Releases`. 34 | - Click on `Draft a new release`. 35 | - Select the correct tag, and enter the title and description by copying and pasting from the CHANGELOG.md. 36 | - Click `Publish release`. 
name: Publish package to PyPI
on:
  push:
    tags:
      - '*'
jobs:
  publish:
    # The ubuntu-18.04 hosted runner has been retired by GitHub Actions;
    # use the rolling image so the workflow can still be scheduled.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Set up Python 3.9
        uses: actions/setup-python@v1
        with:
          # Was 3.8, contradicting the step name above; keep them in sync.
          python-version: 3.9
      - name: Get release version
        run: |
          echo "CHANGELOG_VERSION=$(cat CHANGELOG.md | grep -oP '(?<=###\s)(.*)(?=\s\-)' | head -n 1 | sed 's/Version\s/v/')" >> $GITHUB_ENV
          echo "TAG_VERSION=`echo $(git describe --tags --abbrev=0)`" >> $GITHUB_ENV
      - name: Check changelog release version
        if: ${{ env.TAG_VERSION != env.CHANGELOG_VERSION }}
        run: |
          echo "CHANGELOG_VERSION($CHANGELOG_VERSION) is different from TAG_VERSION($TAG_VERSION)"
          exit 1
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pyct \
                      wheel \
                      setuptools
      - name: Get all git tags
        run: git fetch --tags -f
      - name: Build package
        run: |
          python setup.py sdist bdist_wheel
      - name: Get package size
        run: echo "PKG_SIZE=$(find dist -maxdepth 1 -regex '.*gz' | xargs stat --format='%s')" >> $GITHUB_ENV
      - name: Check package size
        if: ${{ env.PKG_SIZE > 1e+8 }}
        run: |
          echo "PKG_SIZE($PKG_SIZE bytes) is greater than 100MB"
          exit 1
      - name: Publish package
        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@master
        with:
          skip_existing: true
          password: ${{ secrets.PYPI_API_TOKEN }}
Census Data](https://www.census.gov/programs-surveys/decennial-census/decade/2020/2020-census-main.html). 3 | 4 | 5 | ## Installation 6 | 7 | To use the data download shell script files first install [wget](https://en.wikipedia.org/wiki/Wget). 8 | 9 | To install the census-parquet package use 10 | ``` 11 | pip install census-parquet 12 | ``` 13 | 14 | This will also install the required Python dependencies which are: 15 | 1. [click](https://github.com/pallets/click) 16 | 2. [dask](https://docs.dask.org/en/latest/install.html) 17 | 3. [dask_geopandas](https://github.com/geopandas/dask-geopandas) 18 | 4. [geopandas](https://geopandas.org/getting_started/install.html) 19 | 5. [numpy](https://numpy.org/install/) 20 | 6. [openpyxl](https://openpyxl.readthedocs.io/en/stable/#installation) 21 | 7. [pandas](https://pandas.pydata.org/docs/getting_started/install.html) 22 | 8. [pyarrow](https://arrow.apache.org/docs/python/install.html) 23 | 24 | ## Usage 25 | To run the census-parquet code simply use 26 | ``` 27 | run_census_parquet 28 | ``` 29 | 30 | This runs the following scripts in order: 31 | 1. `download_boundaries.sh` - This script downloads the Census Boundary data needed to run `process_boundaries.py` 32 | 2. `download_population_stats.sh` - This script downloads population stat data needed for process_blocks.py 33 | 3. `download_blocks.sh` - This script downloads the Census Block data needed to run process_blocks.py 34 | 4. `process_boundaries.py` - This script processes the Census Boundary data and creates parquet files. The parquet files will be output into a `boundary_outputs` folder. 35 | 5. `process_blocks.py` - This script processes Census Block data and creates parquet files. The final combined parquet file will have the name `tl_2020_FULL_tabblock20.parquet`. 
def polygons_to_points(gdf):
    """Convert census-block polygons into one synthetic point per person.

    For every block in *gdf* (indexed by GEOID, with a ``POP`` total and
    the P0010003..P0010009 race/ethnicity counts), draw ``POP`` uniformly
    random points inside the block's geometry via rejection sampling and
    label each point with a race code.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        One partition of the census-blocks table.  Assumes the race
        counts P0010003..P0010009 sum to POP for each row -- TODO confirm
        against the upstream census processing; a mismatch would make the
        per-row DataFrame construction fail on unequal column lengths.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` and ``R`` (race code), indexed by GEOID.
        Empty (but correctly typed) when *gdf* has no populated blocks.
    """
    rng = np.random.default_rng()  # previously created but never used
    # Correctly-typed empty result, shared by both early-return paths.
    empty = pd.DataFrame(
        {"x": pd.Series(dtype="float64"),
         "y": pd.Series(dtype="float64"),
         "GEOID": pd.Series(dtype="object"),
         "R": pd.Series(dtype="object")}
    ).set_index("GEOID")
    if gdf.empty:
        # The old code emitted a fake person at (0, 0) with GEOID "00"
        # here, polluting the output for every empty partition.
        return empty
    keep_points = []
    for index, row in gdf.iterrows():
        pop = int(row["POP"])
        if pop == 0:
            continue
        # Race/ethnicity labels from the 2020 P.L. redistricting counts.
        races = (["w"] * int(row["P0010003"])
                 + ["b"] * int(row["P0010004"])
                 + ["n"] * int(row["P0010005"])
                 + ["a"] * int(row["P0010006"])
                 + ["hpi"] * int(row["P0010007"])
                 + ["o"] * int(row["P0010008"])
                 + ["m"] * int(row["P0010009"]))
        # Rejection sampling: draw inside the bounding box, keep points
        # inside the polygon, and enlarge the draw until we have >= pop.
        # The bounds are loop-invariant, so compute them once.
        x_min, y_min, x_max, y_max = row["geometry"].bounds
        len_within = 0
        it = 1
        while len_within < pop:
            xs = rng.uniform(x_min, x_max, pop * it)
            ys = rng.uniform(y_min, y_max, pop * it)
            gdf_points = gpd.GeoSeries(gpd.points_from_xy(xs, ys), crs=3857)
            within_points = gdf_points.clip(row["geometry"])
            len_within = len(within_points)
            it += 1
        keep_points.append(pd.DataFrame(
            {
                "x": within_points.iloc[:pop].x,
                "y": within_points.iloc[:pop].y,
                "GEOID": [index] * pop,
                "R": races,
            }
        ))
    if not keep_points:
        # Every block in this partition had POP == 0; the old code then
        # hit pd.concat([]) which raises ValueError.
        return empty
    return pd.concat(keep_points, ignore_index=True).set_index("GEOID")
dask_geopandas.read_parquet("outputs/census_blocks_pops.parquet", calculate_divisions=True) 63 | meta_df = pd.DataFrame( 64 | {"x": [float(0.0)], 65 | "y": [float(0.0)], 66 | "GEOID": ["00"], 67 | "R":[" "]} 68 | ).set_index("GEOID") 69 | with ProgressBar(): 70 | blocks_ddf.map_partitions(polygons_to_points, 71 | meta=meta_df, 72 | ).to_parquet("outputs/synthetic_people.parquet", write_metadata_file=True) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /census_parquet/process_boundaries.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | 4 | import dask 5 | import geopandas 6 | import pandas as pd 7 | 8 | 9 | warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*") 10 | 11 | DTYPES = { 12 | "AFFGEOID": "string", 13 | "AFFGEOID20": "string", 14 | "AIANNHCE": "int", 15 | "AIANNHNS": "int", 16 | "ALAND": "int", 17 | "ALAND20": "int", 18 | "AWATER": "int", 19 | "AWATER20": "int", 20 | "ANRCFP": "int", 21 | "ANRCNS": "int", 22 | "BLKGRPCE": "category", 23 | "CBSAFP": "int", 24 | "CD116FP": "int", 25 | "CDSESSN": "int", 26 | "CNECTAFP": "category", 27 | "CONCTYFP": "int", 28 | "CONCTYNS": "int", 29 | "COUNTYNS": "string", 30 | "COUNTYFP": "category", 31 | "COUNTYFP20": "category", 32 | "COUSUBFP": "category", 33 | "COUSUBNS": "string", 34 | # "CSAFP": pd.Int64Dtype(), # can't astype object -> Int64 35 | "DIVISIONCE": "int", 36 | "ELSDLEA": "int", 37 | "GEOID": "string", 38 | "GEOID20": "string", 39 | "LSAD": "category", 40 | "LSAD20": "category", 41 | "LSY": "category", 42 | "METDIVFP": "int", 43 | "NAME": "string", 44 | "NAME20": "string", 45 | "NAMELSAD": "string", 46 | "NAMELSAD20": "string", 47 | "NAMELSADCO": "category", 48 | "NCTADVFP": "int", 49 | "NECTAFP": "int", 50 | "PARTFLG": "category", 51 | "PLACEFP": "int", 52 | "PLACENS": "int", 53 | "REGIONCE": "int", 54 
| "SCSDLEA": "int", 55 | "SLDLST": "string", 56 | "SLDUST": "string", 57 | "STATE_NAME": "category", 58 | "STATEFP": "category", 59 | "STATEFP20": "category", 60 | "STATENS": "int", 61 | "STUSPS": "category", 62 | "SUBMCDFP": "int", 63 | "SUBMCDNS": "int", 64 | "TBLKGPCE": "category", 65 | "TRACTCE": "int", 66 | "TTRACTCE": "category", 67 | "TRSUBCE": "int", 68 | "TRSUBNS": "int", 69 | "UNSDLEA": "int", 70 | "VTDI20": "category", 71 | "VTDST20": "string", 72 | } 73 | 74 | 75 | def process_boundary_file(path: Path) -> Path: 76 | print(f"Started {path}") 77 | gdf = geopandas.read_file(path, driver="SHP") 78 | gdf = gdf.astype({k: DTYPES[k] for k in set(gdf.columns) & set(DTYPES)}) 79 | if "CSAFP" in gdf.columns: 80 | gdf["CSAFP"] = gdf["CSAFP"].astype("float64").astype(pd.Int64Dtype()) 81 | 82 | output = Path(path).parent / "boundary_outputs" / path.with_suffix(".parquet").name 83 | output.parent.mkdir(parents=True, exist_ok=True) 84 | gdf.to_parquet(output, index=False) 85 | print(f"Finished {output}") 86 | return output 87 | 88 | 89 | def main(): 90 | files = list(Path("census_boundaries").glob("*.zip")) 91 | print(f"Found {len(files)} files") 92 | results = [dask.delayed(process_boundary_file)(file) for file in files] 93 | dask.compute(results) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Citizen Code of Conduct 2 | 3 | ## 1. Purpose 4 | 5 | A primary goal of census-parquet is to be inclusive to the largest number of contributors, with the most varied and diverse backgrounds possible. As such, we are committed to providing a friendly, safe and welcoming environment for all, regardless of gender, sexual orientation, ability, ethnicity, socioeconomic status, and religion (or lack thereof). 
6 | 7 | This code of conduct outlines our expectations for all those who participate in our community, as well as the consequences for unacceptable behavior. 8 | 9 | We invite all those who participate in census-parquet to help us create safe and positive experiences for everyone. 10 | 11 | ## 2. Open [Source/Culture/Tech] Citizenship 12 | 13 | A supplemental goal of this Code of Conduct is to increase open [source/culture/tech] citizenship by encouraging participants to recognize and strengthen the relationships between our actions and their effects on our community. 14 | 15 | Communities mirror the societies in which they exist and positive action is essential to counteract the many forms of inequality and abuses of power that exist in society. 16 | 17 | If you see someone who is making an extra effort to ensure our community is welcoming, friendly, and encourages all participants to contribute to the fullest extent, we want to know. 18 | 19 | ## 3. Expected Behavior 20 | 21 | The following behaviors are expected and requested of all community members: 22 | 23 | * Participate in an authentic and active way. In doing so, you contribute to the health and longevity of this community. 24 | * Exercise consideration and respect in your speech and actions. 25 | * Attempt collaboration before conflict. 26 | * Refrain from demeaning, discriminatory, or harassing behavior and speech. 27 | * Be mindful of your surroundings and of your fellow participants. Alert community leaders if you notice a dangerous situation, someone in distress, or violations of this Code of Conduct, even if they seem inconsequential. 28 | * Remember that community event venues may be shared with members of the public; please be respectful to all patrons of these locations. 29 | 30 | ## 4. 
Unacceptable Behavior 31 | 32 | The following behaviors are considered harassment and are unacceptable within our community: 33 | 34 | * Violence, threats of violence or violent language directed against another person. 35 | * Sexist, racist, homophobic, transphobic, ableist or otherwise discriminatory jokes and language. 36 | * Posting or displaying sexually explicit or violent material. 37 | * Posting or threatening to post other people's personally identifying information ("doxing"). 38 | * Personal insults, particularly those related to gender, sexual orientation, race, religion, or disability. 39 | * Inappropriate photography or recording. 40 | * Inappropriate physical contact. You should have someone's consent before touching them. 41 | * Unwelcome sexual attention. This includes sexualized comments or jokes, inappropriate touching, groping, and unwelcomed sexual advances. 42 | * Deliberate intimidation, stalking or following (online or in-person). 43 | * Advocating for, or encouraging, any of the above behavior. 44 | * Sustained disruption of community events, including talks and presentations. 45 | 46 | ## 5. Weapons Policy 47 | 48 | No weapons will be allowed at census-parquet events, community spaces, or in other spaces covered by the scope of this Code of Conduct. Weapons include but are not limited to guns, explosives (including fireworks), and large knives such as those used for hunting or display, as well as any other item used for the purpose of causing injury or harm to others. Anyone seen in possession of one of these items will be asked to leave immediately, and will only be allowed to return without the weapon. Community members are further expected to comply with all state and local laws on this matter. 49 | 50 | ## 6. Consequences of Unacceptable Behavior 51 | 52 | Unacceptable behavior from any community member, including sponsors and those with decision-making authority, will not be tolerated. 
53 | 54 | Anyone asked to stop unacceptable behavior is expected to comply immediately. 55 | 56 | If a community member engages in unacceptable behavior, the community organizers may take any action they deem appropriate, up to and including a temporary ban or permanent expulsion from the community without warning (and without refund in the case of a paid event). 57 | 58 | ## 7. Reporting Guidelines 59 | 60 | If you are subject to or witness unacceptable behavior, or have any other concerns, please notify a community organizer as soon as possible. 61 | 62 | Additionally, community organizers are available to help community members engage with local law enforcement or to otherwise help those experiencing unacceptable behavior feel safe. In the context of in-person events, organizers will also provide escorts as desired by the person experiencing distress. 63 | 64 | ## 8. Addressing Grievances 65 | 66 | If you feel you have been falsely or unfairly accused of violating this Code of Conduct, you should notify makepath with a concise description of your grievance. Your grievance will be handled in accordance with our existing governing policies. 67 | 68 | 69 | 70 | ## 9. Scope 71 | 72 | We expect all community participants (contributors, paid or otherwise; sponsors; and other guests) to abide by this Code of Conduct in all community venues--online and in-person--as well as in all one-on-one communications pertaining to community business. 73 | 74 | This code of conduct and its related procedures also applies to unacceptable behavior occurring outside the scope of community activities when such behavior has the potential to adversely affect the safety and well-being of community members. 75 | 76 | ## 10. Contact info 77 | 78 | Brendan Collins (brendan@makepath.com) 79 | 80 | ## 11. 
License and attribution 81 | 82 | The Citizen Code of Conduct is distributed by [Stumptown Syndicate](http://stumptownsyndicate.org) under a [Creative Commons Attribution-ShareAlike license](http://creativecommons.org/licenses/by-sa/3.0/). 83 | 84 | Portions of text derived from the [Django Code of Conduct](https://www.djangoproject.com/conduct/) and the [Geek Feminism Anti-Harassment Policy](http://geekfeminism.wikia.com/wiki/Conference_anti-harassment/Policy). 85 | 86 | _Revision 2.3. Posted 6 March 2017._ 87 | 88 | _Revision 2.2. Posted 4 February 2016._ 89 | 90 | _Revision 2.1. Posted 23 June 2014._ 91 | 92 | _Revision 2.0, adopted by the [Stumptown Syndicate](http://stumptownsyndicate.org) board on 10 January 2013. Posted 17 March 2013._ 93 | -------------------------------------------------------------------------------- /census_parquet/process_blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processing Census Blocks. 3 | 4 | We create two logical tables: 5 | 6 | 1. Geometries only 7 | 2. Populations only 8 | 9 | This is driven by the Census Bureau not providing population statistics 10 | for territories (yet?). 
11 | """ 12 | from pathlib import Path 13 | import warnings 14 | 15 | import dask 16 | import dask.dataframe as dd 17 | import dask_geopandas 18 | from dask.diagnostics import ProgressBar 19 | import geopandas 20 | import pandas as pd 21 | warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*") 22 | 23 | 24 | statelookup = { 25 | "01": "AL", 26 | "02": "AK", 27 | "04": "AZ", 28 | "05": "AR", 29 | "06": "CA", 30 | "08": "CO", 31 | "09": "CT", 32 | "10": "DE", 33 | "11": "DC", 34 | "12": "FL", 35 | "13": "GA", 36 | "15": "HI", 37 | "16": "ID", 38 | "17": "IL", 39 | "18": "IN", 40 | "19": "IA", 41 | "20": "KS", 42 | "21": "KY", 43 | "22": "LA", 44 | "23": "ME", 45 | "24": "MD", 46 | "25": "MA", 47 | "26": "MI", 48 | "27": "MN", 49 | "28": "MS", 50 | "29": "MO", 51 | "30": "MT", 52 | "31": "NE", 53 | "32": "NV", 54 | "33": "NH", 55 | "34": "NJ", 56 | "35": "NM", 57 | "36": "NY", 58 | "37": "NC", 59 | "38": "ND", 60 | "39": "OH", 61 | "40": "OK", 62 | "41": "OR", 63 | "42": "PA", 64 | "44": "RI", 65 | "45": "SC", 66 | "46": "SD", 67 | "47": "TN", 68 | "48": "TX", 69 | "49": "UT", 70 | "50": "VT", 71 | "51": "VA", 72 | "53": "WA", 73 | "54": "WV", 74 | "55": "WI", 75 | "56": "WY", 76 | "72": "PR", 77 | } 78 | SUMMARY_TABLE = "./population_stats/2020_PLSummaryFile_FieldNames.xlsx" 79 | 80 | 81 | def process_pop(file): 82 | FIPS = file.stem.split("_")[2] 83 | ABBR = statelookup[FIPS] 84 | 85 | root = Path("population_stats") 86 | state_1 = root / (ABBR.lower() + "000012020.pl") 87 | state_geo = root / (ABBR.lower() + "geo2020.pl") 88 | 89 | seg_1_header_df = pd.read_excel( 90 | SUMMARY_TABLE, sheet_name="2020 P.L. Segment 1 Fields" 91 | ) 92 | 93 | geo_header_df = pd.read_excel( 94 | SUMMARY_TABLE, sheet_name="2020 P.L. 
Geoheader Fields" 95 | ) 96 | 97 | seg_1_df = pd.read_csv( 98 | state_1, 99 | encoding="latin-1", 100 | delimiter="|", 101 | names=seg_1_header_df.columns.to_list(), 102 | low_memory=False, 103 | ).drop(columns=["STUSAB"]) 104 | 105 | geo_df = pd.read_csv( 106 | state_geo, 107 | encoding="latin-1", 108 | delimiter="|", 109 | names=geo_header_df.columns.to_list(), 110 | low_memory=False, 111 | ) 112 | geo_df = geo_df[geo_df["SUMLEV"] == 750] 113 | 114 | block_df = pd.merge( 115 | left=geo_df[["LOGRECNO", "GEOID", "STUSAB"]], 116 | right=seg_1_df, 117 | how="left", 118 | on="LOGRECNO", 119 | ).drop(columns=["LOGRECNO", "CHARITER", "STUSAB", "FILEID", "CIFSN"]) 120 | block_df["GEOID"] = block_df["GEOID"].str.replace("7500000US", "") 121 | block_df = block_df.set_index("GEOID").sort_index() 122 | 123 | assert block_df.index.is_unique 124 | return block_df 125 | 126 | 127 | def process_geo(file): 128 | dtypes = { 129 | "STATEFP": "int", 130 | "COUNTYFP": "int", 131 | "TRACTCE": "int", 132 | "BLOCKCE": "int", 133 | "HOUSING": "int", 134 | "POP": "int" 135 | } 136 | 137 | gdf = ( 138 | geopandas.read_file(file, driver="SHP") 139 | .drop(columns=["MTFCC20", "UR20", "UACE20", "UATYPE20", "FUNCSTAT20", "NAME20"]) 140 | .rename(columns=lambda x: x.rstrip("20")) 141 | .astype(dtypes) 142 | .set_index("GEOID") 143 | ) 144 | gdf["INTPTLON"] = pd.to_numeric(gdf["INTPTLON"]) 145 | gdf["INTPTLAT"] = pd.to_numeric(gdf["INTPTLAT"]) 146 | gdf = gdf.replace([None],0) 147 | 148 | return gdf 149 | 150 | 151 | def process(file): 152 | geo = process_geo(file) 153 | FIPS = file.stem.split("_")[2] 154 | 155 | if FIPS in statelookup: 156 | pop = process_pop(file) 157 | result = pd.merge(geo, pop) 158 | assert len(result) == len(geo) 159 | else: 160 | pop = None 161 | 162 | return file, geo, pop 163 | 164 | def process_pop_geo(file): 165 | geo = process_geo(file) 166 | FIPS = file.stem.split("_")[2] 167 | block_ddf = dask_geopandas.from_geopandas(geo, npartitions=1) 168 | output_geo = 
Path(f"tmp/geo/{file.stem.split('_')[2]}.parquet") 169 | output_geo.parent.mkdir(parents=True, exist_ok=True) 170 | block_ddf.to_parquet(output_geo) 171 | if FIPS in statelookup: 172 | pop = process_pop(file) 173 | pop_ddf = dd.from_pandas(pop, npartitions=1) 174 | output_pop = Path(f"tmp/pop/{FIPS}.parquet") 175 | output_pop.parent.mkdir(parents=True, exist_ok=True) 176 | pop_ddf.to_parquet(output_pop) 177 | result = pd.merge(geo,pop,left_index=True,right_index=True) 178 | result = result[['POP', 179 | 'P0010003', 180 | 'P0010004', 181 | 'P0010005', 182 | 'P0010006', 183 | 'P0010007', 184 | 'P0010008', 185 | 'P0010009', 186 | 'geometry'] 187 | ] 188 | result = result.to_crs(3857) 189 | result = dask_geopandas.from_geopandas(result,npartitions=1) 190 | assert len(result) == len(geo) 191 | output = Path(f"tmp/comb/{file.stem.split('_')[2]}.parquet") 192 | output.parent.mkdir(parents=True,exist_ok=True) 193 | result.to_parquet(output) 194 | else: 195 | pop = None 196 | return output, output_pop, output_geo 197 | 198 | def main(): 199 | files = list(Path("TABBLOCK20").glob("*.zip")) 200 | 201 | combs = [dask.delayed(process_pop_geo)(file) 202 | for file in files 203 | if file.stem.split("_")[2] in statelookup 204 | ] 205 | 206 | print("combining geo and pops") 207 | with ProgressBar(): 208 | outs = dask.compute(*combs) 209 | 210 | comb_files = [x[0] for x in outs] 211 | pop_files = [x[1] for x in outs] 212 | geo_files = [x[2] for x in outs] 213 | pop = dd.concat([dd.read_parquet(f) for f in sorted(pop_files)]) 214 | geo = dd.concat([dask_geopandas.read_parquet(f) for f in sorted(geo_files)]) 215 | 216 | comb = dd.concat([dask_geopandas.read_parquet(f) for f in sorted(comb_files)]) 217 | 218 | Path("outputs").mkdir(exist_ok=True) 219 | print("repartitioning combined files into like sizes") 220 | with ProgressBar(): 221 | comb = comb.repartition(partition_size="10MB") 222 | 223 | print("spatial partitioning combined files") 224 | with ProgressBar(): 225 | 
comb.calculate_spatial_partitions() 226 | 227 | print("finalizing census blocks and population data") 228 | with ProgressBar(): 229 | comb.to_parquet("outputs/census_blocks_pops.parquet", write_metadata_file=True) 230 | 231 | print("finalizing population files") 232 | with ProgressBar(): 233 | pop.to_parquet("outputs/census_population.parquet", write_metadata_file=True) 234 | 235 | print("computing spatial partitions for geo files") 236 | with ProgressBar(): 237 | geo.calculate_spatial_partitions() 238 | 239 | print("finalizing geo files") 240 | with ProgressBar(): 241 | geo.to_parquet("outputs/census_blocks_geo.parquet", write_metadata_file=True) 242 | 243 | print("validating") 244 | a = dd.read_parquet("outputs/census_population.parquet", calculate_divisions=True) 245 | assert a.known_divisions 246 | 247 | b = dask_geopandas.read_parquet("outputs/census_blocks_geo.parquet", calculate_divisions=True) 248 | assert b.known_divisions 249 | 250 | c = dask_geopandas.read_parquet("outputs/census_blocks_pops.parquet", calculate_divisions=True) 251 | assert c.known_divisions 252 | 253 | print("complete") 254 | 255 | if __name__ == "__main__": 256 | main() 257 | --------------------------------------------------------------------------------