├── .gitattributes ├── .github ├── release.yml └── workflows │ ├── build_docs.yml │ ├── unittests.yml │ └── upload_package.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── ci ├── 311.yml ├── 312.yml ├── 313-dev.yml └── 313.yml ├── codecov.yml ├── docs ├── .nojekyll ├── Makefile ├── _static │ ├── images │ │ ├── notebooks_census_to_hexgrid_25_1.png │ │ ├── pysal_favicon.ico │ │ ├── tobler2.png │ │ └── tobler3.png │ ├── pysal-styles.css │ └── references.bib ├── api.rst ├── apimyst.md ├── conf.py ├── figs │ ├── nsf_logo.jpg │ ├── raster_lattice_example.png │ ├── tobler_long.png │ ├── tobler_long.svg │ ├── toy_census_tracts_example.png │ ├── toy_census_tracts_example_old.png │ └── waldo_travel.jpg ├── index.rst ├── installation.rst ├── notebooks │ ├── 01_interpolation_methods_overview.nblink │ ├── 02_areal_interpolation_example.nblink │ ├── 03_areal_interpolation_details.nblink │ ├── binary_dasymetric.nblink │ ├── census_to_hexgrid.nblink │ └── extract_urban_areas.nblink ├── references.rst └── tutorial.rst ├── environment.yml ├── notebooks ├── 01_interpolation_methods_overview.ipynb ├── 02_areal_interpolation_example.ipynb ├── 03_areal_interpolation_details.ipynb ├── 04_area_interpolate_dask.ipynb ├── area_interpolate_perf.ipynb ├── binary_dasymetric.ipynb ├── census_to_hexgrid.ipynb ├── extract_urban_areas.ipynb └── h3fill.ipynb ├── pyproject.toml └── tobler ├── __init__.py ├── area_weighted ├── __init__.py ├── area_interpolate.py ├── area_interpolate_dask.py └── area_join.py ├── dasymetric ├── __init__.py ├── masked_area_interpolate.py └── raster_tools.py ├── model ├── __init__.py └── glm.py ├── pycno ├── __init__.py └── pycno.py ├── tests ├── test_area_interpolators.py ├── test_area_join.py ├── test_dasymetric.py ├── test_model.py ├── test_pycno.py └── test_utils.py └── util ├── __init__.py └── util.py /.gitattributes: -------------------------------------------------------------------------------- 1 | tobler/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | labels: 4 | - ignore-for-release 5 | authors: 6 | - dependabot 7 | categories: 8 | - title: Bug Fixes 9 | labels: 10 | - bug 11 | - title: Enhancements 12 | labels: 13 | - enhancement 14 | - title: Other Changes 15 | labels: 16 | - "*" 17 | -------------------------------------------------------------------------------- /.github/workflows/build_docs.yml: -------------------------------------------------------------------------------- 1 | name: Build Docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | docs: 9 | name: Build & Push Docs 10 | runs-on: ${{ matrix.os }} 11 | timeout-minutes: 90 12 | strategy: 13 | matrix: 14 | os: ['ubuntu-latest'] 15 | environment-file: [ci/312.yml] 16 | experimental: [false] 17 | defaults: 18 | run: 19 | shell: bash -l {0} 20 | 21 | steps: 22 | - name: Checkout repo 23 | uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 # Fetch all history for all branches and tags. 26 | 27 | - name: Setup micromamba 28 | uses: mamba-org/setup-micromamba@v1 29 | with: 30 | environment-file: ${{ matrix.environment-file }} 31 | micromamba-version: 'latest' 32 | 33 | - name: Install 34 | run: pip install -e . 
--no-deps --force-reinstall 35 | 36 | - name: Make Docs 37 | run: cd docs; make html 38 | 39 | - name: Commit Docs 40 | run: | 41 | git clone https://github.com/ammaraskar/sphinx-action-test.git --branch gh-pages --single-branch gh-pages 42 | cp -r docs/_build/html/* gh-pages/ 43 | cd gh-pages 44 | git config --local user.email "action@github.com" 45 | git config --local user.name "GitHub Action" 46 | git add . 47 | git commit -m "Update documentation" -a || true 48 | # The above command will fail if no changes were present, 49 | # so we ignore the return code. 50 | 51 | - name: Push to gh-pages 52 | uses: ad-m/github-push-action@master 53 | with: 54 | branch: gh-pages 55 | directory: gh-pages 56 | github_token: ${{ secrets.GITHUB_TOKEN }} 57 | force: true -------------------------------------------------------------------------------- /.github/workflows/unittests.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | branches: 9 | - '*' 10 | schedule: 11 | - cron: '59 23 * * *' 12 | workflow_dispatch: 13 | inputs: 14 | version: 15 | description: Manual CI Run 16 | default: test 17 | required: false 18 | 19 | jobs: 20 | tests: 21 | name: ${{ matrix.os }}, ${{ matrix.environment-file }} 22 | runs-on: ${{ matrix.os }} 23 | timeout-minutes: 30 24 | strategy: 25 | matrix: 26 | os: [ubuntu-latest] 27 | environment-file: [ 28 | ci/311.yml, 29 | ci/312.yml, 30 | ci/313.yml, 31 | ci/313-dev.yml, 32 | ] 33 | include: 34 | - environment-file: ci/312.yml 35 | os: macos-13 # Intel 36 | - environment-file: ci/312.yml 37 | os: macos-14 # Apple Silicon 38 | - environment-file: ci/312.yml 39 | os: windows-latest 40 | fail-fast: false 41 | 42 | defaults: 43 | run: 44 | shell: bash -l {0} 45 | 46 | steps: 47 | - name: checkout repo 48 | uses: actions/checkout@v4 49 | with: 50 | fetch-depth: 0 # Fetch all history for all branches and tags. 51 | 52 | - name: setup micromamba 53 | uses: mamba-org/setup-micromamba@v1 54 | with: 55 | environment-file: ${{ matrix.environment-file }} 56 | micromamba-version: 'latest' 57 | 58 | - name: environment info 59 | run: | 60 | micromamba info 61 | micromamba list 62 | 63 | - name: spatial versions 64 | run: 'python -c "import geopandas; geopandas.show_versions();"' 65 | 66 | - name: install package 67 | run: 'pip install -e . --no-deps' 68 | 69 | - name: Download test files 70 | run: | 71 | python -c ' 72 | import libpysal 73 | 74 | libpysal.examples.fetch_all() 75 | ' 76 | 77 | - name: run tests 78 | run: | 79 | pytest tobler \ 80 | -v \ 81 | -r a \ 82 | -n auto \ 83 | --color yes \ 84 | --cov tobler \ 85 | --cov-append \ 86 | --cov-report term-missing \ 87 | --cov-report xml . 88 | 89 | - name: codecov 90 | uses: codecov/codecov-action@v4 91 | with: 92 | token: ${{ secrets.CODECOV_TOKEN }} 93 | 94 | - name: Generate and publish the report 95 | if: | 96 | failure() 97 | && steps.status.outcome == 'failure' 98 | && github.event_name == 'schedule' 99 | && github.repository_owner == 'pysal' 100 | uses: xarray-contrib/issue-from-pytest-log@v1 101 | with: 102 | log-path: pytest-log.jsonl 103 | -------------------------------------------------------------------------------- /.github/workflows/upload_package.yml: -------------------------------------------------------------------------------- 1 | name: Release & Publish 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/tags 6 | tags: 7 | - "v*" # Push events to matching v*, i.e. 
v1.0, v20.15.10 8 | workflow_dispatch: 9 | inputs: 10 | version: 11 | description: Manual Release 12 | default: test 13 | required: false 14 | 15 | 16 | jobs: 17 | build: 18 | name: Create release & publish to PyPI 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repo 22 | uses: actions/checkout@v3 23 | 24 | - name: Set up python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.x" 28 | 29 | - name: Install Dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install --upgrade build twine 33 | python -m build 34 | twine check --strict dist/* 35 | 36 | - name: Create Release Notes 37 | uses: actions/github-script@v6 38 | with: 39 | github-token: ${{secrets.GITHUB_TOKEN}} 40 | script: | 41 | await github.request(`POST /repos/${{ github.repository }}/releases`, { 42 | tag_name: "${{ github.ref }}", 43 | generate_release_notes: true 44 | }); 45 | 46 | - name: Publish distribution 📦 to PyPI 47 | uses: pypa/gh-action-pypi-publish@release/v1 48 | with: 49 | user: __token__ 50 | password: ${{ secrets.PYPI_PASSWORD }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # Pycharm 94 | .idea 95 | .vscode 96 | 97 | nlcd_2011.tif 98 | 99 | docs/generated/ 100 | docs/apidocs/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright 2018 pysal-spopt developers 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | [![Continuous Integration](https://github.com/pysal/tobler/actions/workflows/unittests.yml/badge.svg)](https://github.com/pysal/tobler/actions/workflows/unittests.yml) 6 | [![codecov](https://codecov.io/gh/pysal/tobler/branch/main/graph/badge.svg?token=XO4SilfBEb)](https://codecov.io/gh/pysal/tobler) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/tobler) 7 | ![PyPI](https://img.shields.io/pypi/v/tobler) 8 | ![Conda (channel only)](https://img.shields.io/conda/vn/conda-forge/tobler) 9 | ![GitHub commits since latest release (branch)](https://img.shields.io/github/commits-since/pysal/tobler/latest) 10 | [![DOI](https://zenodo.org/badge/202220824.svg)](https://zenodo.org/badge/latestdoi/202220824) 11 | 12 | # PySAL `tobler` 13 | 14 | `tobler` is a Python package for areal interpolation, dasymetric mapping, change of support, and small area estimation. It provides a suite of tools with a simple interface for transferring data from one polygonal representation to another. Common examples include standardizing census data from different time periods to a single representation (i.e. to overcome boundary changes in successive years), or the conversion of data collected at different spatial scales into shared units of analysis (e.g. converting zip code and neighborhood data into a regular grid). `tobler` is part of the [PySAL](https://pysal.org) family of packages for spatial data science and provides highly performant implementations of basic and advanced interpolation methods, leveraging [`shapely`](https://shapely.readthedocs.io/en/latest/) to optimize for multicore architecture. The package name is an homage to the legendary quantitative geographer [Waldo Tobler](https://en.wikipedia.org/wiki/Waldo_R._Tobler), a pioneer in geographic interpolation methods, spatial analysis, and computational social science. 15 | 16 | ![DC tracts to hexgrid](docs/_static/images/notebooks_census_to_hexgrid_25_1.png) 17 | 18 | ## Interpolation Methods 19 | 20 | `tobler` provides functionality for three families of spatial interpolation methods. The utility of each technique depends on the context of the problem and varies according to e.g. data availability, the properties of the interpolated variable, and the resolution of source and target geometries. For a further explanation of different interpolation techniques, please explore some of the field's [background literature](https://pysal.org/tobler/references.html). 21 | 22 | ### Area Weighted 23 | 24 | Areal interpolation uses the area of overlapping geometries to apportion variables. This is the simplest method, with no additional data requirements aside from input and output geometries; however, it is also the most susceptible to the [modifiable areal unit problem](https://en.wikipedia.org/wiki/Modifiable_areal_unit_problem). 25 | 26 | ### Dasymetric 27 | 28 | Dasymetric interpolation uses auxiliary data to improve estimation, for example 29 | by constraining the areal interpolation to areas that are known to be inhabited. Formally, `tobler` adopts a binary dasymetric approach, using auxiliary data to define which land is available or unavailable for interpolation. 
The package can incorporate additional sources such as 30 | 31 | * raster data such as satellite imagery that define land types 32 | * vector features such as roads or water bodies that define habitable or uninhabitable land 33 | 34 | either (or both) of which may be used to help ensure that variables from the source geometries are not allocated to inappropriate areas of the target geometries. Naturally, dasymetric approaches are sensitive to the quality of ancillary data and the underlying assumptions used to guide the estimation. 35 | 36 | ### Model-based 37 | 38 | Model-based interpolation uses [spatial] statistical models to estimate a relationship between the target variable and a set of covariates such as physical features, administrative designations, or demographic and architectural characteristics. Model-based approaches offer the ability to incorporate the richest set of additional data, but they can also be difficult to wield in practice because the true relationship between variables is never known. By definition, some formal assumptions of regression models are violated because the target variable is always predicted using data from a different spatial scale than the one at which the model was estimated. 39 | 40 | ### Extensions 41 | 42 | `tobler` is under active development and will continue to incorporate emerging interpolation methods as they are introduced to the field. We welcome any and all contributions, and if you would like to propose an additional method for adoption, please raise an issue for discussion or open a new pull request! 43 | 44 | ![Charleston zipcodes to tracts](docs/_static/images/tobler3.png) 45 | 46 | ![raster example](docs/figs/raster_lattice_example.png) 47 | 48 | ## Installation 49 | 50 | ```bash 51 | $ conda env create -f environment.yml 52 | $ conda activate tobler 53 | $ pip install -e . --no-deps 54 | 55 | ``` 56 | 57 | ## Contribute 58 | 59 | PySAL-tobler is under active development and contributors are welcome. 60 | 61 | If you have any suggestion, feature request, or bug report, please open a new [issue](https://github.com/pysal/tobler/issues) on GitHub. To submit patches, please follow the PySAL development [guidelines](http://pysal.readthedocs.io/en/latest/developers/index.html) and open a [pull request](https://github.com/pysal/tobler). Once your changes get merged, you’ll automatically be added to the [Contributors List](https://github.com/pysal/tobler/graphs/contributors). 62 | 63 | ## License 64 | 65 | The project is licensed under the [BSD license](https://github.com/pysal/tobler/blob/main/LICENSE.txt). 
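## Example

A minimal sketch of the area-weighted workflow described above, interpolating hypothetical census-tract columns onto an H3 hexgrid. The functions are those listed in the API reference (`area_interpolate`, `h3fy`); the file path and column names are placeholders, and the `extensive_variables`/`intensive_variables` keywords are shown as commonly used, so check `docs/api.rst` (or `help(area_interpolate)`) for the exact signature.

```python
import geopandas
from tobler.area_weighted import area_interpolate
from tobler.util import h3fy

# Source polygons carrying the data to transfer (placeholder path and columns).
tracts = geopandas.read_file("tracts.shp").to_crs(6569)  # use a projected CRS

# Build a target geometry: h3fy generates a hexagonal grid covering the source.
hexes = h3fy(tracts, resolution=7)

# Area-weighted transfer: counts (e.g. total population) are passed as
# extensive variables, while rates and densities are passed as intensive variables.
estimates = area_interpolate(
    tracts,
    hexes,
    extensive_variables=["population"],
    intensive_variables=["median_income"],
)
```

The dasymetric and pycnophylactic interpolators follow the same source/target pattern, with additional arguments for their auxiliary inputs; see the notebooks in `notebooks/` for worked examples.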
66 | 67 | ## Funding 68 | 69 | 70 | 71 | Award #1733705 [Neighborhoods in Space-Time Contexts](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1733705&HistoricalAwards=false) 72 | 73 | Award #1831615 [Scalable Geospatial Analytics for Social Science Research](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1831615) 74 | -------------------------------------------------------------------------------- /ci/311.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - jupyterlab 7 | - dask 8 | - dask-geopandas 9 | - numpy 10 | - geopandas 11 | - pandas 12 | - numpy 13 | - rasterio 14 | - rasterstats 15 | - statsmodels 16 | - scikit-learn 17 | - scipy 18 | - libpysal 19 | - tqdm 20 | - codecov 21 | - pytest 22 | - pytest-mpl 23 | - pytest-cov 24 | - pytest-xdist 25 | - coverage 26 | - twine 27 | - pip 28 | - h3-py 29 | - joblib 30 | - astropy 31 | - mapclassify 32 | -------------------------------------------------------------------------------- /ci/312.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.12 6 | - dask 7 | - dask-geopandas 8 | - numpy 9 | - geopandas 10 | - pandas 11 | - rasterio 12 | - rasterstats 13 | - statsmodels 14 | - scikit-learn 15 | - scipy 16 | - libpysal 17 | - tqdm 18 | - codecov 19 | - pytest-xdist 20 | - coverage 21 | - pytest 22 | - pytest-mpl 23 | - pytest-cov 24 | - twine 25 | - pip 26 | - h3-py<4 27 | - mapclassify 28 | - sphinx>=1.4.3 29 | - sphinxcontrib-bibtex 30 | - sphinx_bootstrap_theme 31 | - numpydoc 32 | - nbsphinx 33 | - joblib 34 | - astropy 35 | - myst-parser 36 | - sphinx-autodoc2 37 | - nbsphinx-link 38 | - linkify-it-py 39 | - pip: 40 | - nbsphinx-link>=1.3.1 # not on conda yet -------------------------------------------------------------------------------- /ci/313-dev.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.13 6 | - dask 7 | - numpy 8 | - pandas 9 | - rasterio 10 | - rasterstats 11 | - statsmodels 12 | - scipy 13 | - libpysal 14 | - tqdm 15 | - codecov 16 | - pytest-xdist 17 | - coverage 18 | - pytest 19 | - pytest-mpl 20 | - pytest-cov 21 | - twine 22 | - h3-py 23 | - mapclassify 24 | - sphinx>=1.4.3 25 | - sphinxcontrib-bibtex 26 | - sphinx_bootstrap_theme 27 | - linkify-it-py 28 | - numpydoc 29 | - nbsphinx 30 | - joblib 31 | - astropy 32 | - pip 33 | - pip: 34 | # dev versions of packages 35 | - --pre --index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple --extra-index-url https://pypi.org/simple 36 | - scikit-learn 37 | - scipy 38 | - git+https://github.com/geopandas/geopandas.git@main 39 | - git+https://github.com/geopandas/dask-geopandas.git@main 40 | - shapely 41 | -------------------------------------------------------------------------------- /ci/313.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.13 6 | - dask 7 | - dask-geopandas 8 | - jupyterlab 9 | - numpy 10 | - geopandas 11 | - pandas 12 | - numpy 13 | - rasterio 14 | - rasterstats 15 | - statsmodels 16 | - scikit-learn 17 | - scipy 18 | - libpysal 19 | - tqdm 20 | - codecov 21 | - pytest 22 | - pytest-mpl 23 | - pytest-cov 24 | - pytest-xdist 25 | - coverage 26 | - twine 27 | - pip 28 | - 
h3-py 29 | - joblib 30 | - astropy 31 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | after_n_builds: 7 4 | coverage: 5 | range: 50..95 6 | round: nearest 7 | precision: 1 8 | status: 9 | project: 10 | default: 11 | threshold: 15% 12 | patch: 13 | default: 14 | threshold: 20% 15 | target: 60% 16 | comment: 17 | layout: "reach, diff, files" 18 | behavior: once 19 | after_n_builds: 7 20 | require_changes: true 21 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/.nojekyll -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = PACKAGE_NAME 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | github: 23 | @make html 24 | 25 | sync: 26 | @rsync -avh _build/html/ ../docs/ --delete 27 | @rsync -avh figs/ ../docs/figs/ --delete 28 | @make clean 29 | touch ../docs/.nojekyll 30 | 31 | clean: 32 | rm -rf $(BUILDDIR)/* 33 | rm -rf auto_examples/ 34 | rm -rf generated/ 35 | rm -rf apidocs/ 36 | 37 | -------------------------------------------------------------------------------- /docs/_static/images/notebooks_census_to_hexgrid_25_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/_static/images/notebooks_census_to_hexgrid_25_1.png -------------------------------------------------------------------------------- /docs/_static/images/pysal_favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/_static/images/pysal_favicon.ico -------------------------------------------------------------------------------- /docs/_static/images/tobler2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/_static/images/tobler2.png -------------------------------------------------------------------------------- /docs/_static/images/tobler3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/_static/images/tobler3.png -------------------------------------------------------------------------------- /docs/_static/pysal-styles.css: -------------------------------------------------------------------------------- 1 | /* 
Make thumbnails with equal heights */ 2 | @media only screen and (min-width : 481px) { 3 | .row.equal-height { 4 | display: flex; 5 | flex-wrap: wrap; 6 | } 7 | .row.equal-height > [class*='col-'] { 8 | display: flex; 9 | flex-direction: column; 10 | } 11 | .row.equal-height.row:after, 12 | .row.equal-height.row:before { 13 | display: flex; 14 | } 15 | 16 | .row.equal-height > [class*='col-'] > .thumbnail, 17 | .row.equal-height > [class*='col-'] > .thumbnail > .caption { 18 | display: flex; 19 | flex: 1 0 auto; 20 | flex-direction: column; 21 | } 22 | .row.equal-height > [class*='col-'] > .thumbnail > .caption > .flex-text { 23 | flex-grow: 1; 24 | } 25 | .row.equal-height > [class*='col-'] > .thumbnail > img { 26 | width: 100%; 27 | height: 200px; /* force image's height */ 28 | 29 | /* force image fit inside it's "box" */ 30 | -webkit-object-fit: cover; 31 | -moz-object-fit: cover; 32 | -ms-object-fit: cover; 33 | -o-object-fit: cover; 34 | object-fit: cover; 35 | } 36 | } 37 | 38 | .row.extra-bottom-padding{ 39 | margin-bottom: 20px; 40 | } 41 | 42 | 43 | .topnavicons { 44 | margin-left: 10% !important; 45 | } 46 | 47 | .topnavicons li { 48 | margin-left: 0px !important; 49 | min-width: 100px; 50 | text-align: center; 51 | } 52 | 53 | .topnavicons .thumbnail { 54 | margin-right: 10px; 55 | border: none; 56 | box-shadow: none; 57 | text-align: center; 58 | font-size: 85%; 59 | font-weight: bold; 60 | line-height: 10px; 61 | height: 100px; 62 | } 63 | 64 | .topnavicons .thumbnail img { 65 | display: block; 66 | margin-left: auto; 67 | margin-right: auto; 68 | } 69 | 70 | 71 | /* Table with a scrollbar */ 72 | .bodycontainer { max-height: 600px; width: 100%; margin: 0; overflow-y: auto; } 73 | .table-scrollable { margin: 0; padding: 0; } 74 | 75 | .label { 76 | color: #222222; 77 | font-size: 100%; 78 | } -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api_ref: 2 | 3 | .. currentmodule:: tobler 4 | 5 | API reference 6 | ============= 7 | 8 | .. _data_api: 9 | 10 | Area Weighted 11 | -------------- 12 | Area weighted approaches use the area of overlap between the source and target geometries to weight the 13 | variables being assigned to the target 14 | 15 | .. currentmodule:: tobler.area_weighted 16 | 17 | .. autosummary:: 18 | :toctree: generated/ 19 | 20 | area_interpolate 21 | area_join 22 | 23 | 24 | Dasymetric 25 | ----------------------- 26 | 27 | Dasymetric approaches use auxiliary data in addition to use the area of overlap between the source and target geometries to weight the 28 | variables being assigned to the target 29 | 30 | .. currentmodule:: tobler.dasymetric 31 | 32 | .. autosummary:: 33 | :toctree: generated/ 34 | 35 | extract_raster_features 36 | masked_area_interpolate 37 | 38 | 39 | Model 40 | --------- 41 | Model based approaches use additional spatial data, such as a land cover raster, to estimate the relationships between population 42 | and the auxiliary data. It then uses that model to predict population levels at different scales 43 | 44 | .. currentmodule:: tobler.model 45 | 46 | .. autosummary:: 47 | :toctree: generated/ 48 | 49 | glm 50 | 51 | Pycnophylactic 52 | ------------------ 53 | Pycnophylactic interpolation is based on `Tobler's technique `_ 54 | for generating smooth, volume-preserving contour maps 55 | 56 | .. currentmodule:: tobler.pycno 57 | 58 | .. 
autosummary:: 59 | :toctree: generated/ 60 | 61 | pycno_interpolate 62 | 63 | Util 64 | --------- 65 | Utility Functions 66 | 67 | .. currentmodule:: tobler.util 68 | 69 | .. autosummary:: 70 | :toctree: generated/ 71 | 72 | h3fy -------------------------------------------------------------------------------- /docs/apimyst.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | This is a beta version of the documentation written in [MyST](https://myst-parser.readthedocs.io/en/latest/index.html) with [autodoc2](https://sphinx-autodoc2.readthedocs.io/en/latest/quickstart.html) 4 | 5 | ## Area Weighted Interpolation 6 | 7 | Area weighted approaches use the area of overlap between the source and target geometries to weight the variables being assigned to the target. 8 | 9 | ```{autodoc2-summary} tobler.area_weighted.area_interpolate.area_interpolate 10 | ``` 11 | 12 | ```{autodoc2-summary} tobler.area_weighted.area_join.area_join 13 | ``` 14 | 15 | ## Dasymetric Interpolation 16 | 17 | Dasymetric approaches use auxiliary data in addition to use the area of overlap 18 | between the source and target geometries to weight the variables being assigned 19 | to the target. 20 | 21 | ```{autodoc2-summary} tobler.dasymetric.raster_tools.extract_raster_features 22 | ``` 23 | 24 | ```{autodoc2-summary} tobler.dasymetric.masked_area_interpolate.masked_area_interpolate 25 | ``` 26 | 27 | ## Pycnophylactic Interpolation 28 | 29 | Pycnophylactic interpolation is based on 30 | [Tobler's technique](https://www.tandfonline.com/doi/abs/10.1080/01621459.1979.10481647) 31 | for generating smooth, volume-preserving contour maps. 32 | 33 | ```{autodoc2-summary} tobler.pycno.pycno.pycno_interpolate 34 | ``` 35 | 36 | ## Utility Functions 37 | 38 | ```{autodoc2-summary} tobler.util.util.h3fy 39 | ``` 40 | 41 | ```{autodoc2-summary} tobler.util.util.circumradius 42 | ``` -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # giddy documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Jun 6 15:54:22 2018. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import sphinx_bootstrap_theme 20 | import tobler 21 | 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | # 27 | # needs_sphinx = '1.0' 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 
31 | extensions = [ #'sphinx_gallery.gen_gallery', 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.autosummary", 34 | "sphinx.ext.viewcode", 35 | "sphinxcontrib.bibtex", 36 | "sphinx.ext.mathjax", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.intersphinx", 39 | "numpydoc", 40 | "matplotlib.sphinxext.plot_directive", 41 | "nbsphinx", 42 | "myst_parser", 43 | "autodoc2", 44 | "nbsphinx_link"] 45 | 46 | myst_enable_extensions = [ 47 | "amsmath", 48 | "attrs_inline", 49 | "colon_fence", 50 | "deflist", 51 | "dollarmath", 52 | "fieldlist", 53 | "html_admonition", 54 | "html_image", 55 | "linkify", 56 | "replacements", 57 | "smartquotes", 58 | "strikethrough", 59 | "substitution", 60 | "tasklist", 61 | ] 62 | 63 | autodoc2_packages = [ 64 | "../tobler", 65 | ] 66 | autodoc2_render_plugin = "myst" 67 | 68 | autodoc2_module_all_regexes = [ 69 | r"tobler\..*", 70 | ] 71 | 72 | # sphinx_gallery_conf = { 73 | # # path to your examples scripts 74 | # 'examples_dirs': '../examples', 75 | # # path where to save gallery generated examples 76 | # 'gallery_dirs': 'auto_examples', 77 | # 'backreferences_dir': False, 78 | # } 79 | 80 | 81 | # Add any paths that contain templates here, relative to this directory. 82 | templates_path = ["_templates"] 83 | 84 | # The suffix(es) of source filenames. 85 | # You can specify multiple suffix as a list of string: 86 | # 87 | {'.rst': 'restructuredtext', '.md': 'markdown'}# The master toctree document. 88 | master_doc = "index" 89 | 90 | # General information about the project. 91 | project = "tobler" 92 | copyright = "2018-, pysal developers" 93 | author = "pysal developers" 94 | 95 | # The version info for the project you're documenting, acts as replacement for 96 | # |version| and |release|, also used in various other places throughout the 97 | # built documents. 98 | # 99 | # The full version. 100 | version = tobler.__version__ 101 | release = tobler.__version__ 102 | 103 | # The language for content autogenerated by Sphinx. Refer to documentation 104 | # for a list of supported languages. 105 | # 106 | # This is also used if you do content translation via gettext catalogs. 107 | # Usually you set "language" from the command line for these cases. 108 | language = 'en' 109 | 110 | # List of patterns, relative to source directory, that match files and 111 | # directories to ignore when looking for source files. 112 | # This patterns also effect to html_static_path and html_extra_path 113 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "tests/*"] 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = "sphinx" 117 | 118 | # If true, `todo` and `todoList` produce output, else they produce nothing. 119 | todo_include_todos = False 120 | 121 | # Bib Variables 122 | bibtex_default_style = 'alpha' 123 | bibtex_reference_style ="author_year" 124 | bibtex_bibfiles = ["_static/references.bib"] 125 | 126 | 127 | # -- Options for HTML output ---------------------------------------------- 128 | 129 | # The theme to use for HTML and HTML Help pages. See the documentation for 130 | # a list of builtin themes. 131 | # 132 | # html_theme = 'alabaster' 133 | html_theme = "bootstrap" 134 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 135 | html_title = "%s v%s Manual" % (project, version) 136 | 137 | # (Optional) Logo. Should be small enough to fit the navbar (ideally 24x24). 138 | # Path should be relative to the ``_static`` files directory. 
139 | # html_logo = "_static/images/CGS_logo.jpg" 140 | # html_logo = "_static/images/CGS_logo_green.png" 141 | html_logo = "figs/tobler_long.svg" 142 | html_favicon = "figs/tobler.ico" 143 | 144 | 145 | # Theme options are theme-specific and customize the look and feel of a theme 146 | # further. For a list of options available for each theme, see the 147 | # documentation. 148 | # 149 | html_theme_options = { 150 | # Navigation bar title. (Default: ``project`` value) 151 | "navbar_title": "tobler", 152 | # Render the next and previous page links in navbar. (Default: true) 153 | "navbar_sidebarrel": False, 154 | # Render the current pages TOC in the navbar. (Default: true) 155 | #'navbar_pagenav': True, 156 | #'navbar_pagenav': False, 157 | # No sidebar 158 | "nosidebar": True, 159 | # Tab name for the current pages TOC. (Default: "Page") 160 | #'navbar_pagenav_name': "Page", 161 | # Global TOC depth for "site" navbar tab. (Default: 1) 162 | # Switching to -1 shows all levels. 163 | "globaltoc_depth": 2, 164 | # Include hidden TOCs in Site navbar? 165 | # 166 | # Note: If this is "false", you cannot have mixed ``:hidden:`` and 167 | # non-hidden ``toctree`` directives in the same page, or else the build 168 | # will break. 169 | # 170 | # Values: "true" (default) or "false" 171 | "globaltoc_includehidden": "true", 172 | # HTML navbar class (Default: "navbar") to attach to
element. 173 | # For black navbar, do "navbar navbar-inverse" 174 | #'navbar_class': "navbar navbar-inverse", 175 | # Fix navigation bar to top of page? 176 | # Values: "true" (default) or "false" 177 | "navbar_fixed_top": "true", 178 | # Location of link to source. 179 | # Options are "nav" (default), "footer" or anything else to exclude. 180 | "source_link_position": "footer", 181 | # Bootswatch (http://bootswatch.com/) theme. 182 | # 183 | # Options are nothing (default) or the name of a valid theme 184 | # such as "amelia" or "cosmo", "yeti", "flatly". 185 | "bootswatch_theme": "yeti", 186 | # Choose Bootstrap version. 187 | # Values: "3" (default) or "2" (in quotes) 188 | "bootstrap_version": "3", 189 | "navbar_links": [ 190 | # ("Gallery", "auto_examples/index"), 191 | ("Installation", "installation"), 192 | ("Tutorial", "tutorial"), 193 | ("API", "api"), 194 | ("References", "references"), 195 | ], 196 | } 197 | 198 | # Add any paths that contain custom static files (such as style sheets) here, 199 | # relative to this directory. They are copied after the builtin static files, 200 | # so a file named "default.css" will overwrite the builtin "default.css". 201 | html_static_path = ["_static"] 202 | 203 | # Custom sidebar templates, maps document names to template names. 204 | # html_sidebars = {} 205 | # html_sidebars = {'sidebar': ['localtoc.html', 'sourcelink.html', 'searchbox.html']} 206 | 207 | # -- Options for HTMLHelp output ------------------------------------------ 208 | 209 | # Output file base name for HTML help builder. 210 | htmlhelp_basename = "toblerdoc" 211 | 212 | 213 | # -- Options for LaTeX output --------------------------------------------- 214 | 215 | latex_elements = { 216 | # The paper size ('letterpaper' or 'a4paper'). 217 | # 218 | # 'papersize': 'letterpaper', 219 | # The font size ('10pt', '11pt' or '12pt'). 220 | # 221 | # 'pointsize': '10pt', 222 | # Additional stuff for the LaTeX preamble. 223 | # 224 | # 'preamble': '', 225 | # Latex figure (float) alignment 226 | # 227 | # 'figure_align': 'htbp', 228 | } 229 | 230 | # Grouping the document tree into LaTeX files. List of tuples 231 | # (source start file, target name, title, 232 | # author, documentclass [howto, manual, or own class]). 233 | latex_documents = [ 234 | ( 235 | master_doc, 236 | "toblerdoc.tex", 237 | u"tobler Documentation", 238 | u"tobler developers", 239 | "manual", 240 | ) 241 | ] 242 | 243 | 244 | # -- Options for manual page output --------------------------------------- 245 | 246 | # One entry per manual page. List of tuples 247 | # (source start file, name, description, authors, manual section). 248 | man_pages = [(master_doc, "tobler", u"tobler Documentation", [author], 1)] 249 | 250 | 251 | # -- Options for Texinfo output ------------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. 
List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ( 258 | master_doc, 259 | "tobler", 260 | u"tobler Documentation", 261 | author, 262 | "tobler", 263 | "One line description of project.", 264 | "Miscellaneous", 265 | ) 266 | ] 267 | 268 | # ----------------------------------------------------------------------------- 269 | # Napoleon configuration 270 | # ----------------------------------------------------------------------------- 271 | # numpydoc_show_class_members = True 272 | # numpydoc_class_members_toctree = False 273 | # 274 | # napoleon_use_ivar = True 275 | 276 | # ----------------------------------------------------------------------------- 277 | # Autosummary 278 | # ----------------------------------------------------------------------------- 279 | 280 | # Generate the API documentation when building 281 | autosummary_generate = True 282 | numpydoc_show_class_members = False 283 | class_members_toctree = True 284 | numpydoc_show_inherited_class_members = True 285 | numpydoc_use_plots = True 286 | numpydoc_xref_param_type=True 287 | # display the source code for Plot directive 288 | plot_include_source = True 289 | 290 | 291 | # Example configuration for intersphinx: refer to the Python standard library. 292 | intersphinx_mapping = { 293 | 'python': ('https://docs.python.org/3/', None), 294 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), 295 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 296 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 297 | 'geopandas': ('https://geopandas.readthedocs.io/en/latest/', None), 298 | 'sklearn': ('https://scikit-learn.org/stable/', None), 299 | 'giddy': ('https://giddy.readthedocs.io/en/latest/', None), 300 | 'libpysal': ('https://pysal.org/libpysal/', None), 301 | 'esda': ('https://pysal.org/esda/', None), 302 | 'region': ('https://region.readthedocs.io/en/latest/', None), 303 | 'hdbscan': ('https://hdbscan.readthedocs.io/en/latest/', None) 304 | 305 | } 306 | 307 | numpydoc_xref_ignore = {'type', 'optional', 'default', 'shape', 'fitted', 'instance', 308 | 'cluster', 'of', 'or', 'if', 'using', 'otherwise', 'required', 309 | 'from'} 310 | 311 | -------------------------------------------------------------------------------- /docs/figs/nsf_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/nsf_logo.jpg -------------------------------------------------------------------------------- /docs/figs/raster_lattice_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/raster_lattice_example.png -------------------------------------------------------------------------------- /docs/figs/tobler_long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/tobler_long.png -------------------------------------------------------------------------------- /docs/figs/tobler_long.svg: -------------------------------------------------------------------------------- 1 | 2 | 21 | 23 | 29 | 35 | 36 | 56 | 58 | 59 | 61 | image/svg+xml 62 | 64 | 65 | 66 | 67 | 68 | 72 | 418 | TOBLER 427 | 431 | 432 | 433 | 
-------------------------------------------------------------------------------- /docs/figs/toy_census_tracts_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/toy_census_tracts_example.png -------------------------------------------------------------------------------- /docs/figs/toy_census_tracts_example_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/toy_census_tracts_example_old.png -------------------------------------------------------------------------------- /docs/figs/waldo_travel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/waldo_travel.jpg -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: figs/tobler_long.png 2 | :align: center 3 | :height: 200px 4 | :width: 425px 5 | :alt: tobler 6 | 7 | 8 | .. raw:: html 9 | 10 | 38 | 39 | 40 | 41 | 42 | 43 | Tobler 44 | ``````````````````````````````````````````````````````````` 45 | 46 | :code:`tobler` is a library for areal interpolation and dasymetric mapping. The name is an homage to the legendary geographer `Waldo Tobler`_. It is a subpackage of `PySAL`_ (Python Spatial Analysis Library), and is under active `development`_. 47 | 48 | 49 | 50 | .. toctree:: 51 | :hidden: 52 | :maxdepth: 3 53 | :caption: Contents: 54 | 55 | Installation 56 | Tutorial 57 | API 58 | References 59 | 60 | 61 | 62 | .. _PySAL: https://pysal.org 63 | .. _development: https://github.com/pysal/tobler 64 | .. _Waldo Tobler: https://en.wikipedia.org/wiki/Waldo_R._Tobler 65 | 66 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. Installation 2 | 3 | .. highlight:: rst 4 | 5 | .. role:: python(code) 6 | :language: python 7 | 8 | 9 | Installation 10 | =============== 11 | 12 | tobler supports Python `3.11`_ and newer (versions 3.11, 3.12, and 3.13 are currently tested in CI). Please make sure that you are 13 | operating in a supported Python environment. 14 | 15 | Installing a released version 16 | ------------------------------ 17 | ``tobler`` is available on both conda and pip, and can be installed with either 18 | 19 | .. code-block:: bash 20 | 21 | conda install -c conda-forge tobler 22 | 23 | or 24 | 25 | .. code-block:: bash 26 | 27 | pip install tobler 28 | 29 | 30 | Installing a development version from source 31 | ---------------------------------------------- 32 | For working with a development version, we recommend `anaconda`_. To get started, clone this repository or download it manually, then ``cd`` into the directory and run the following commands: 33 | 34 | .. code-block:: bash 35 | 36 | conda env create -f environment.yml 37 | conda activate tobler 38 | pip install -e . --no-deps 39 | 40 | You can also `fork`_ the `pysal/tobler`_ repo and create a local clone of 41 | your fork. By making changes 42 | to your local clone and submitting a pull request to `pysal/tobler`_, you can 43 | contribute to tobler's development. 44 | 45 | .. _3.11: https://docs.python.org/3.11/ 46 | 47 | .. 
_Python Package Index: https://pypi.org/pysal/tobler/ 48 | .. _pysal/tobler: https://github.com/pysal/tobler 49 | .. _fork: https://help.github.com/articles/fork-a-repo/ 50 | .. _anaconda: https://www.anaconda.com/download/ 51 | -------------------------------------------------------------------------------- /docs/notebooks/01_interpolation_methods_overview.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/01_interpolation_methods_overview.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/notebooks/02_areal_interpolation_example.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/02_areal_interpolation_example.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/notebooks/03_areal_interpolation_details.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/03_areal_interpolation_details.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/notebooks/binary_dasymetric.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/binary_dasymetric.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/notebooks/census_to_hexgrid.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/census_to_hexgrid.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/notebooks/extract_urban_areas.nblink: -------------------------------------------------------------------------------- 1 | { 2 | "path": "../../notebooks/extract_urban_areas.ipynb" 3 | } 4 | -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | .. reference for the docs 2 | 3 | References 4 | ========== 5 | 6 | .. bibliography:: _static/references.bib 7 | :all: 8 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tobler Tutorial 2 | =============== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | :caption: Contents: 7 | 8 | notebooks/01_interpolation_methods_overview.ipynb 9 | notebooks/02_areal_interpolation_example.ipynb 10 | notebooks/03_areal_interpolation_details.ipynb 11 | notebooks/census_to_hexgrid.ipynb 12 | notebooks/extract_urban_areas 13 | notebooks/binary_dasymetric 14 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: tobler 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - dask-geopandas 6 | - dask 7 | - jupyterlab 8 | - numpy 9 | - geopandas >=0.13 10 | - pandas 11 | - numpy 12 | - rasterio 13 | - rasterstats 14 | - statsmodels 15 | - scikit-learn 16 | - scipy 17 | - libpysal 18 | - tqdm 19 | - pip 20 | - mapclassify 21 | - descartes 22 | - joblib 23 | -------------------------------------------------------------------------------- /notebooks/04_area_interpolate_dask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e3f2586a-5b6a-4d46-b6e8-1991ae3bec6f", 6 | "metadata": {}, 7 | "source": [ 8 | "# (Distributed) areal interpolation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "00f875bd-2714-4551-b10c-1ef3f514478d", 14 | "metadata": {}, 15 | "source": [ 16 | "In this notebook, we compare the single-core version in `tobler.area_weighted.area_interpolate` with the distributed version in `tobler.area_weighted.area_interpolate_dask`. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "id": "4084f715-3989-4424-943a-2a4066a8bcf2", 23 | "metadata": { 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "os.environ['USE_PYGEOS'] = '1'\n", 30 | "\n", 31 | "import pandas\n", 32 | "import geopandas\n", 33 | "import dask_geopandas\n", 34 | "import tobler\n", 35 | "from libpysal.examples import load_example\n", 36 | "import numpy as np\n", 37 | "\n", 38 | "from dask.distributed import Client, LocalCluster" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "d16a2e15-866b-407d-b65d-54a675aefbd7", 44 | "metadata": {}, 45 | "source": [ 46 | "## Setup" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "080369e7-f3d4-41c6-a629-12ed458eb743", 52 | "metadata": {}, 53 | "source": [ 54 | "Load example data from `pysal`:" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "id": "cb395dc5-67f2-462e-a1cf-919c8e6d0ae8", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "c1 = load_example('Charleston1')\n", 65 | "c2 = load_example('Charleston2')\n", 66 | "\n", 67 | "crs = 6569 # https://epsg.io/6569\n", 68 | "\n", 69 | "tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n", 70 | "zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "1d11c1d7-6435-40cb-a4d4-851f63eccf01", 76 | "metadata": {}, 77 | "source": [ 78 | "We make up a categorical variable with four classes distributed randomly across the dataset:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "id": "3543702f-5e8a-4336-a14d-19a4eeb77b1b", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "rng = np.random.default_rng(seed=42)\n", 91 | "\n", 92 | "tracts['rando'] = pandas.Series(\n", 93 | " rng.integers(0, 4, len(tracts)), 
dtype='category'\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae", 100 | "metadata": {}, 101 | "source": [ 102 | "We will set up a local Dask cluster so you can follow the computations on the dashboard (`http://localhost:8787` by default):" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 2, 108 | "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "client = Client(LocalCluster(n_workers=8))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880", 120 | "metadata": {}, 121 | "source": [ 122 | "Finally, for Dask, we need to provide `dask_geopandas.GeoDataFrame` objects with spatial partitions and categorical variables properly set up:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "id": "a31a1a91-4071-40e2-a21f-7e035d734976", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "dtracts = (\n", 133 | " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n", 134 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", 135 | ")\n", 136 | "\n", 137 | "dzips = (\n", 138 | " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n", 139 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", 140 | ")" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "54f986ec-ea46-479e-aed8-5edeeaf16fda", 146 | "metadata": {}, 147 | "source": [ 148 | "---\n", 149 | "\n", 150 | "**IMPORTANT** - At this point, only *categorical* variables are implemented, so those are what we will test.\n", 151 | "\n", 152 | "---" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6", 158 | "metadata": { 159 | "tags": [] 160 | }, 161 | "source": [ 162 | "## Correctness" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "92dafb11-ec94-43c2-baec-2a5e2a0b380d", 168 | "metadata": {}, 169 | "source": [ 170 | "- Single core" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "id": "4d4cde6d-73c1-4197-86ed-131724e21296", 177 | "metadata": { 178 | "tags": [] 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 183 | " tracts, zip_codes, categorical_variables=['rando']\n", 184 | ")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "2982d8dc-c1e9-4927-8643-9900b1b09890", 190 | "metadata": {}, 191 | "source": [ 192 | "- Dask" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5", 199 | "metadata": { 200 | "tags": [] 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 205 | " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", 206 | ").compute()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "5e19b8dd-505f-4dc1-ba85-9fd825e59b43", 212 | "metadata": {}, 213 | "source": [ 214 | "And we can compare both results are the same:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "id": "81de5e35-f3b6-4567-86b1-36d98583dca0", 221 | "metadata": { 222 | "tags": [] 223 | }, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "rando_0 4.188295e-08\n", 229 | "rando_1 5.328575e-08\n", 230 | "rando_2 5.396667e-08\n", 
231 | "rando_3 2.935173e-08\n", 232 | "dtype: float64" 233 | ] 234 | }, 235 | "execution_count": 8, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "a = (\n", 242 | " cat_dk\n", 243 | " .set_index('ZIP')\n", 244 | " .reindex(zip_codes['ZIP'].values)\n", 245 | " .drop(columns='geometry')\n", 246 | ")\n", 247 | "\n", 248 | "b = (\n", 249 | " cat_sc\n", 250 | " .drop(columns='geometry')\n", 251 | " [['rando_0', 'rando_1', 'rando_2', 'rando_3']]\n", 252 | ")\n", 253 | "b.index = a.index\n", 254 | "\n", 255 | "(a - b).max()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "e2e04df1-3331-449c-b74c-e910239c3067", 261 | "metadata": {}, 262 | "source": [ 263 | "The differences in the estimates for the proportions of each area start at the 8th decimal, and thus likely rounding errors derived from the different approaches used to compute the interpolation (the single core does it in one-shot, while Dask computes parts and brings them together later with a sum)." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "1debbdf4-892f-4fda-834a-0403595794ef", 269 | "metadata": { 270 | "tags": [] 271 | }, 272 | "source": [ 273 | "## Performance\n", 274 | "\n", 275 | "---\n", 276 | "\n", 277 | "**NOTE** - Timings below do _not_ include computation time required for spatial shuffling and partitioning (which can be substantial with large datasets), or converting from `geopandas`. These are \"sunk costs\" that'll only make this approach preferable with large datasets, although they can be computed once and the result stored in disk efficiently (e.g., as Parquet files). Having said that, when \"larger\" is large enough is not very large in modern terms: from a handful of thousand observations the gains will be substantial if several cores/workers are available.\n", 278 | "\n", 279 | "---" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "e5242c13-c4cd-46e2-9131-ec1734bcc142", 285 | "metadata": {}, 286 | "source": [ 287 | "We can now time the example above:\n" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8", 294 | "metadata": { 295 | "tags": [] 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "85 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "%%timeit\n", 308 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 309 | " tracts, zip_codes, categorical_variables=['rando']\n", 310 | ")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 13, 316 | "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e", 317 | "metadata": { 318 | "tags": [] 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "1.41 s ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "%%timeit\n", 331 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 332 | " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", 333 | ").compute()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "a124ee86-c527-4386-be8d-2dc833270fd9", 339 | "metadata": {}, 340 | "source": [ 341 | "This is notably slower (about 5x!). 
For such a small dataset, the overhead in distributing computations and collecting them overcomes any gains in parallelism.\n", 342 | "\n", 343 | "Now we can artificially increase the size of the datasets by concatenating them several times and re-computing (this time we only time one execution):" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 17, 349 | "id": "5f56d579-0022-45c2-845c-f351bf96ed01", 350 | "metadata": { 351 | "tags": [] 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "40x increase | N. tracts: 4680 | N. ZIPs: 1680\n" 359 | ] 360 | }, 361 | { 362 | "name": "stderr", 363 | "output_type": "stream", 364 | "text": [ 365 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n", 366 | "This may cause some slowdown.\n", 367 | "Consider scattering data ahead of time and using futures.\n", 368 | " warnings.warn(\n", 369 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n", 370 | "This may cause some slowdown.\n", 371 | "Consider scattering data ahead of time and using futures.\n", 372 | " warnings.warn(\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "sizeup = 40\n", 378 | "tracts_lrg = pandas.concat([tracts] * sizeup)\n", 379 | "zips_lrg = pandas.concat([zip_codes] * sizeup)\n", 380 | "print(\n", 381 | " f'{sizeup}x increase | N. tracts: {len(tracts_lrg)} | N. ZIPs: {len(zips_lrg)}'\n", 382 | ")\n", 383 | "\n", 384 | "dtracts_lrg = (\n", 385 | " dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=500)\n", 386 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", 387 | ")\n", 388 | "\n", 389 | "dzips_lrg = (\n", 390 | " dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=500)\n", 391 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", 392 | ")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "e5187109-ba95-4b5f-b373-2ec4745d0289", 398 | "metadata": {}, 399 | "source": [ 400 | "And re-compute the timings:" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "c0da372a-f791-47fb-ade0-317a1cf6ff9c", 406 | "metadata": { 407 | "jp-MarkdownHeadingCollapsed": true, 408 | "tags": [] 409 | }, 410 | "source": [ 411 | "---\n", 412 | "\n", 413 | "### 10x" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 14, 419 | "id": "620cf9ab-7b9e-4458-809c-c7a73d13f26c", 420 | "metadata": { 421 | "tags": [] 422 | }, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "Computing for a sizeup of 10x\n", 429 | "CPU times: user 7.21 s, sys: 11.3 ms, total: 7.23 s\n", 430 | "Wall time: 6.95 s\n" 431 | ] 432 | } 433 | ], 434 | "source": [ 435 | "%%time\n", 436 | "print(f'Computing for a sizeup of {sizeup}x')\n", 437 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 438 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", 439 | ")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 16, 445 | "id": "c615b27a-e004-429b-a0c5-e4b237516f9f", 446 | "metadata": { 447 | "tags": [] 448 | }, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "Computing for a sizeup of 10x\n", 455 | "CPU times: user 548 ms, sys: 18 ms, total: 566 ms\n", 456 | "Wall time: 3.56 s\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | 
"%%time\n", 462 | "print(f'Computing for a sizeup of {sizeup}x')\n", 463 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 464 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", 465 | ").compute()" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "id": "cc13af25-e97e-4b34-bb1f-bb946c15748e", 471 | "metadata": { 472 | "jp-MarkdownHeadingCollapsed": true, 473 | "tags": [] 474 | }, 475 | "source": [ 476 | "---\n", 477 | "\n", 478 | "### 20x" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 18, 484 | "id": "8dbb40d4-4b3b-446d-9d1b-99462a122d6e", 485 | "metadata": { 486 | "tags": [] 487 | }, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "Computing for a sizeup of 20x\n", 494 | "CPU times: user 28.6 s, sys: 26.1 ms, total: 28.7 s\n", 495 | "Wall time: 27.6 s\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "%%time\n", 501 | "print(f'Computing for a sizeup of {sizeup}x')\n", 502 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 503 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", 504 | ")" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 24, 510 | "id": "f2ca1394-5f8d-428f-a61c-87beb8778322", 511 | "metadata": { 512 | "tags": [] 513 | }, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Computing for a sizeup of 20x\n" 520 | ] 521 | }, 522 | { 523 | "name": "stderr", 524 | "output_type": "stream", 525 | "text": [ 526 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 16.77 MiB.\n", 527 | "This may cause some slowdown.\n", 528 | "Consider scattering data ahead of time and using futures.\n", 529 | " warnings.warn(\n" 530 | ] 531 | }, 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "CPU times: user 1.32 s, sys: 65.3 ms, total: 1.38 s\n", 537 | "Wall time: 9.86 s\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "%%time\n", 543 | "print(f'Computing for a sizeup of {sizeup}x')\n", 544 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 545 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", 546 | ").compute()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "id": "335b34b4-9fea-48a6-b38b-8b1a5d755ca1", 552 | "metadata": { 553 | "jp-MarkdownHeadingCollapsed": true, 554 | "tags": [] 555 | }, 556 | "source": [ 557 | "---\n", 558 | "\n", 559 | "### 30x" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 26, 565 | "id": "1598ce3f-d21e-4a60-9619-ee5b1eb4932f", 566 | "metadata": { 567 | "tags": [] 568 | }, 569 | "outputs": [ 570 | { 571 | "name": "stdout", 572 | "output_type": "stream", 573 | "text": [ 574 | "Computing for a sizeup of 30x\n", 575 | "CPU times: user 1min 4s, sys: 176 ms, total: 1min 4s\n", 576 | "Wall time: 1min 1s\n" 577 | ] 578 | } 579 | ], 580 | "source": [ 581 | "%%time\n", 582 | "print(f'Computing for a sizeup of {sizeup}x')\n", 583 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 584 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", 585 | ")" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 7, 591 | "id": "224ffbca-7690-4b20-bad2-efbf042623a9", 592 | "metadata": { 593 | "tags": [] 594 | }, 595 | "outputs": [ 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "Computing for a sizeup of 30x\n" 601 | ] 602 
| }, 603 | { 604 | "name": "stderr", 605 | "output_type": "stream", 606 | "text": [ 607 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 25.14 MiB.\n", 608 | "This may cause some slowdown.\n", 609 | "Consider scattering data ahead of time and using futures.\n", 610 | " warnings.warn(\n" 611 | ] 612 | }, 613 | { 614 | "name": "stdout", 615 | "output_type": "stream", 616 | "text": [ 617 | "CPU times: user 1.91 s, sys: 58.8 ms, total: 1.97 s\n", 618 | "Wall time: 14.6 s\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "%%time\n", 624 | "print(f'Computing for a sizeup of {sizeup}x')\n", 625 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 626 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", 627 | ").compute()" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "id": "b004834f-c5ce-4f92-be9a-364a07c7996b", 633 | "metadata": { 634 | "tags": [] 635 | }, 636 | "source": [ 637 | "---\n", 638 | "\n", 639 | "### 40x" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 17, 645 | "id": "b6b9d06a-9034-4c39-b3a9-92fc6408d5c6", 646 | "metadata": { 647 | "tags": [] 648 | }, 649 | "outputs": [ 650 | { 651 | "name": "stdout", 652 | "output_type": "stream", 653 | "text": [ 654 | "Computing for a sizeup of 40x\n", 655 | "CPU times: user 2min 2s, sys: 1.71 s, total: 2min 3s\n", 656 | "Wall time: 1min 53s\n" 657 | ] 658 | } 659 | ], 660 | "source": [ 661 | "%%time\n", 662 | "print(f'Computing for a sizeup of {sizeup}x')\n", 663 | "cat_sc = tobler.area_weighted.area_interpolate(\n", 664 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", 665 | ")" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 18, 671 | "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28", 672 | "metadata": { 673 | "tags": [] 674 | }, 675 | "outputs": [ 676 | { 677 | "name": "stdout", 678 | "output_type": "stream", 679 | "text": [ 680 | "Computing for a sizeup of 40x\n" 681 | ] 682 | }, 683 | { 684 | "name": "stderr", 685 | "output_type": "stream", 686 | "text": [ 687 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 33.52 MiB.\n", 688 | "This may cause some slowdown.\n", 689 | "Consider scattering data ahead of time and using futures.\n", 690 | " warnings.warn(\n" 691 | ] 692 | }, 693 | { 694 | "name": "stdout", 695 | "output_type": "stream", 696 | "text": [ 697 | "CPU times: user 6.99 s, sys: 512 ms, total: 7.5 s\n", 698 | "Wall time: 30.5 s\n" 699 | ] 700 | } 701 | ], 702 | "source": [ 703 | "%%time\n", 704 | "print(f'Computing for a sizeup of {sizeup}x')\n", 705 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", 706 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", 707 | ").compute()" 708 | ] 709 | } 710 | ], 711 | "metadata": { 712 | "kernelspec": { 713 | "display_name": "tobler", 714 | "language": "python", 715 | "name": "tobler" 716 | }, 717 | "language_info": { 718 | "codemirror_mode": { 719 | "name": "ipython", 720 | "version": 3 721 | }, 722 | "file_extension": ".py", 723 | "mimetype": "text/x-python", 724 | "name": "python", 725 | "nbconvert_exporter": "python", 726 | "pygments_lexer": "ipython3", 727 | "version": "3.11.4" 728 | } 729 | }, 730 | "nbformat": 4, 731 | "nbformat_minor": 5 732 | } 733 | -------------------------------------------------------------------------------- /notebooks/area_interpolate_perf.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `numba` tests\n", 8 | "\n", 9 | "This notebook documents and serves as a scratchpad for exploring `numba`-based acceleration on areal interpolation.\n", 10 | "\n", 11 | "**NOTE** - To be removed/relocated once/if functionality is merged\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | "**IMPORTANT**\n", 16 | "\n", 17 | "As of Dec. 17th'20, the multi-core implementation requires the versions in `main` for `pygeos` and `geopandas`. On a working environment with the latest released versions (as the `gds_env:5.0`), this can be achieved by:\n", 18 | "\n", 19 | "```shell\n", 20 | "pip install --no-deps git+https://github.com/pygeos/pygeos.git\n", 21 | "pip install --no-deps git+https://github.com/geopandas/geopandas.git\n", 22 | "```\n", 23 | "\n", 24 | "---" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from tobler.area_weighted.area_interpolate import _area_tables_binning, _area_tables_binning_parallel\n", 34 | "import geopandas, pandas\n", 35 | "\n", 36 | "summary = lambda src, tgt: print(\n", 37 | " f\"Transfer {src.shape[0]} polygons into {tgt.shape[0]}\"\n", 38 | ")\n", 39 | "\n", 40 | "def down_load(p):\n", 41 | " fn = f\"/home/jovyan/{p.split('/')[0]}\"\n", 42 | " try:\n", 43 | " return geopandas.read_file(fn)\n", 44 | " except:\n", 45 | " ! wget $p -O $fn\n", 46 | " return geopandas.read_file(fn)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Data setup" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "- Minimal problem" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Transfer 628 polygons into 628\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "p = (\"https://geographicdata.science/book/_downloads/\"\\\n", 78 | " \"f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg\")\n", 79 | "src = down_load(p).rename(lambda i: 'i'+str(i))\n", 80 | "\n", 81 | "p = (\"https://geographicdata.science/book/_downloads/\"\\\n", 82 | " \"d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg\")\n", 83 | "tgt = down_load(p).rename(lambda i: 'i'+str(i)).to_crs(src.crs)\n", 84 | "\n", 85 | "w, s, e, n = tgt.total_bounds\n", 86 | "#src = src.cx[w:e, s:n]\n", 87 | "summary(src, tgt)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "- Slightly larger problem" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Transfer 3140 polygons into 2512\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Tracts\n", 112 | "p = \"https://ndownloader.figshare.com/files/20460645\"\n", 113 | "src = down_load(p)\n", 114 | "src = pandas.concat([src]*5)\n", 115 | "\n", 116 | "# Precincts\n", 117 | "p = \"https://ndownloader.figshare.com/files/20460549\"\n", 118 | "tgt = down_load(p).to_crs(src.crs)\n", 119 | "tgt = pandas.concat([tgt]*4)\n", 120 | "summary(src, tgt)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Correctness" 128 | ] 129 | }, 130 | { 131 | "cell_type": 
"code", 132 | "execution_count": 4, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "0" 139 | ] 140 | }, 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "cross2 = _area_tables_binning_parallel(src, tgt, n_jobs=1)\n", 148 | "cross = _area_tables_binning(src, tgt, 'auto')\n", 149 | "(cross != cross2).sum()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Performance" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "Results with all observations in first dataset:" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 5, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "2.22 s ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "%timeit cross = _area_tables_binning(src, tgt, 'auto')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "2.22 s ± 25.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "%timeit cross2 = _area_tables_binning_parallel(src, tgt, n_jobs=1)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 7, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "756 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "%timeit cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=-1)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "---" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Results with second dataset:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 5, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "CPU times: user 47.5 s, sys: 15.8 ms, total: 47.5 s\n", 241 | "Wall time: 47.6 s\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "%time cross = _area_tables_binning(src, tgt, 'auto')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 8, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "CPU times: user 46.8 s, sys: 108 ms, total: 46.9 s\n", 259 | "Wall time: 46.9 s\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "%time cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=1)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 6, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "CPU times: user 1.86 s, sys: 488 ms, total: 2.35 s\n", 277 | "Wall time: 9.61 s\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "%time cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=-1)" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | 
"name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.7.8" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 4 307 | } 308 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [project] 8 | name = "tobler" 9 | dynamic = ["version"] 10 | authors = [ 11 | { name = "Eli Knaap", email = "ek@knaaptime.com" }, 12 | { name = "Serge Rey", email = "sjsrey@gmail.com" }, 13 | ] 14 | maintainers = [{ name = "pysal contributors" }] 15 | license = { text = "BSD 3-Clause" } 16 | description = "Tobler is a Python library for areal interpolation." 17 | keywords = [ 18 | "dasymetric mapping, spatial analysis, interpolation, change of support" 19 | ] 20 | readme = { text = """\ 21 | Spatial interpolation, Dasymetric Mapping, & Change of Support" 22 | 23 | """, content-type = "text/x-rst" } 24 | classifiers = [ 25 | "Programming Language :: Python :: 3", 26 | "License :: OSI Approved :: BSD License", 27 | "Operating System :: OS Independent", 28 | "Intended Audience :: Science/Research", 29 | "Topic :: Scientific/Engineering :: GIS", 30 | ] 31 | requires-python = ">=3.9" 32 | dependencies = [ 33 | "numpy", 34 | "pandas", 35 | "geopandas >=0.13", 36 | "rasterio", 37 | "scipy", 38 | "statsmodels", 39 | "rasterstats", 40 | "libpysal", 41 | "tqdm", 42 | "joblib" 43 | ] 44 | 45 | [project.urls] 46 | Home = "https://github.com/pysal/tobler/" 47 | Repository = "https://github.com/pysal/tobler" 48 | 49 | [project.optional-dependencies] 50 | dev = ["pre-commit"] 51 | docs = [ 52 | "nbsphinx", 53 | "numpydoc", 54 | "pandoc", 55 | "sphinx", 56 | "sphinxcontrib-bibtex", 57 | "sphinx_bootstrap_theme", 58 | "mkdocs-jupyter", 59 | "myst-parser" 60 | ] 61 | tests = [ 62 | "codecov", 63 | "coverage", 64 | "pytest", 65 | "pytest-mpl", 66 | "pytest-cov", 67 | "pytest-xdist", 68 | "watermark", 69 | "h3", 70 | "astropy" 71 | ] 72 | 73 | [tool.setuptools.packages.find] 74 | include = ["tobler", "tobler.*"] 75 | 76 | [tool.black] 77 | line-length = 88 78 | 79 | [tool.ruff] 80 | line-length = 88 81 | select = ["E", "F", "W", "I", "UP", "N", "B", "A", "C4", "SIM", "ARG"] 82 | target-version = "py39" 83 | ignore = [ 84 | "B006", 85 | "B008", 86 | "B009", 87 | "B010", 88 | "C408", 89 | "E731", 90 | "F401", 91 | "F403", 92 | "N803", 93 | "N806", 94 | "N999", 95 | "UP007" 96 | ] 97 | exclude = ["tobler/tests/*", "docs/*"] 98 | 99 | [tool.coverage.run] 100 | source = ["./tobler"] 101 | 102 | [tool.coverage.report] 103 | exclude_lines = [ 104 | "if self.debug:", 105 | "pragma: no cover", 106 | "raise NotImplementedError", 107 | "except ModuleNotFoundError:", 108 | "except ImportError", 109 | ] 110 | ignore_errors = true 111 | omit = ["tobler/tests/*", "docs/conf.py"] 112 | -------------------------------------------------------------------------------- /tobler/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`tobler` --- A library for spatial interpolation 3 | ================================================= 4 | 5 | """ 6 | import contextlib 7 | from importlib.metadata import PackageNotFoundError, 
version 8 | 9 | from . import area_weighted, dasymetric, model, pycno, util 10 | 11 | with contextlib.suppress(PackageNotFoundError): 12 | __version__ = version("tobler") 13 | -------------------------------------------------------------------------------- /tobler/area_weighted/__init__.py: -------------------------------------------------------------------------------- 1 | from .area_interpolate import area_interpolate 2 | from .area_interpolate import _area_tables_binning 3 | from .area_join import area_join 4 | from .area_interpolate_dask import area_interpolate_dask 5 | 6 | __all__ = [area_interpolate, area_join, area_interpolate_dask] 7 | -------------------------------------------------------------------------------- /tobler/area_weighted/area_interpolate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Area Weighted Interpolation 3 | 4 | """ 5 | 6 | import os 7 | 8 | import geopandas as gpd 9 | import numpy as np 10 | import pandas as pd 11 | from scipy.sparse import coo_matrix, diags 12 | 13 | from tobler.util.util import _check_crs, _inf_check, _nan_check 14 | 15 | __all__ = ["area_interpolate"] 16 | 17 | 18 | def _chunk_dfs(geoms_to_chunk, geoms_full, n_jobs): 19 | chunk_size = geoms_to_chunk.shape[0] // n_jobs + 1 20 | for i in range(n_jobs): 21 | start = i * chunk_size 22 | yield geoms_to_chunk.iloc[start : start + chunk_size], geoms_full 23 | 24 | 25 | def _index_n_query(geoms1, geoms2): 26 | # Pick largest for STRTree, query the smallest 27 | if geoms1.shape[0] > geoms2.shape[0]: 28 | large = geoms1 29 | small = geoms2 30 | else: 31 | large = geoms2 32 | small = geoms1 33 | # Build tree + query 34 | qry_polyIDs, tree_polyIDs = large.sindex.query(small, predicate="intersects") 35 | # Remap IDs to global 36 | large_global_ids = large.iloc[tree_polyIDs].index.values 37 | small_global_ids = small.iloc[qry_polyIDs].index.values 38 | # Return always global IDs for geoms1, geoms2 39 | if geoms1.shape[0] > geoms2.shape[0]: 40 | return np.array([large_global_ids, small_global_ids]).T 41 | else: 42 | return np.array([small_global_ids, large_global_ids]).T 43 | 44 | 45 | def _chunk_polys(id_pairs, geoms_left, geoms_right, n_jobs): 46 | chunk_size = id_pairs.shape[0] // n_jobs + 1 47 | for i in range(n_jobs): 48 | start = i * chunk_size 49 | chunk1 = geoms_left.array[id_pairs[start : start + chunk_size, 0]] 50 | chunk2 = geoms_right.array[id_pairs[start : start + chunk_size, 1]] 51 | yield chunk1, chunk2 52 | 53 | 54 | def _intersect_area_on_chunk(geoms1, geoms2): 55 | areas = geoms1.intersection(geoms2).area 56 | return areas 57 | 58 | 59 | def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1): 60 | """Construct area allocation and source-target correspondence tables using 61 | a parallel spatial indexing approach 62 | ... 63 | 64 | NOTE: currently, the largest df is chunked and the other one is shipped in 65 | full to each core; within each process, the spatial index is built for the 66 | largest set of geometries, and the other one used for `query` 67 | 68 | Parameters 69 | ---------- 70 | source_df : geopandas.GeoDataFrame 71 | GeoDataFrame containing input data and polygons 72 | target_df : geopandas.GeoDataFramee 73 | GeoDataFrame defining the output geometries 74 | n_jobs : int 75 | [Optional. Default=-1] Number of processes to run in parallel. 
If -1, 76 | this is set to the number of CPUs available 77 | 78 | Returns 79 | ------- 80 | tables : scipy.sparse.csr_matrix 81 | 82 | """ 83 | from joblib import Parallel, delayed, parallel_backend 84 | 85 | if _check_crs(source_df, target_df): 86 | pass 87 | else: 88 | return None 89 | if n_jobs == -1: 90 | n_jobs = os.cpu_count() 91 | 92 | df1 = source_df.copy() 93 | df2 = target_df.copy() 94 | 95 | # Chunk the largest, ship the smallest in full 96 | if df1.shape[0] > df2.shape[1]: 97 | to_chunk = df1 98 | df_full = df2 99 | else: 100 | to_chunk = df2 101 | df_full = df1 102 | 103 | # Spatial index query 104 | ## Reindex on positional IDs 105 | to_workers = _chunk_dfs( 106 | gpd.GeoSeries(to_chunk.geometry.values, crs=to_chunk.crs), 107 | gpd.GeoSeries(df_full.geometry.values, crs=df_full.crs), 108 | n_jobs, 109 | ) 110 | 111 | with parallel_backend("loky", inner_max_num_threads=1): 112 | worker_out = Parallel(n_jobs=n_jobs)( 113 | delayed(_index_n_query)(*chunk_pair) for chunk_pair in to_workers 114 | ) 115 | 116 | ids_src, ids_tgt = np.concatenate(worker_out).T 117 | 118 | # Intersection + area calculation 119 | chunks_to_intersection = _chunk_polys( 120 | np.vstack([ids_src, ids_tgt]).T, df1.geometry, df2.geometry, n_jobs 121 | ) 122 | with parallel_backend("loky", inner_max_num_threads=1): 123 | worker_out = Parallel(n_jobs=n_jobs)( 124 | delayed(_intersect_area_on_chunk)(*chunk_pair) 125 | for chunk_pair in chunks_to_intersection 126 | ) 127 | areas = np.concatenate(worker_out) 128 | 129 | # Build CSR table 130 | table = coo_matrix( 131 | ( 132 | areas, 133 | (ids_src, ids_tgt), 134 | ), 135 | shape=(df1.shape[0], df2.shape[0]), 136 | dtype=np.float32, 137 | ) 138 | table = table.tocsr() 139 | return table 140 | 141 | 142 | def _area_tables_binning(source_df, target_df, spatial_index): 143 | """Construct area allocation and source-target correspondence tables using a spatial indexing approach 144 | ... 145 | 146 | NOTE: this currently relies on Geopandas' spatial index machinery 147 | 148 | Parameters 149 | ---------- 150 | source_df : geopandas.GeoDataFrame 151 | GeoDataFrame containing input data and polygons 152 | target_df : geopandas.GeoDataFramee 153 | GeoDataFrame defining the output geometries 154 | spatial_index : str 155 | Spatial index to use to build the allocation of area from source to 156 | target tables. It currently support the following values: 157 | - "source": build the spatial index on `source_df` 158 | - "target": build the spatial index on `target_df` 159 | - "auto": attempts to guess the most efficient alternative. 160 | Currently, this option uses the largest table to build the 161 | index, and performs a `bulk_query` on the shorter table. 162 | 163 | Returns 164 | ------- 165 | tables : scipy.sparse.csr_matrix 166 | 167 | """ 168 | if _check_crs(source_df, target_df): 169 | pass 170 | else: 171 | return None 172 | 173 | df1 = source_df.copy() 174 | df2 = target_df.copy() 175 | 176 | # it is generally more performant to use the longer df as spatial index 177 | if spatial_index == "auto": 178 | if df1.shape[0] > df2.shape[0]: 179 | spatial_index = "source" 180 | else: 181 | spatial_index = "target" 182 | 183 | if spatial_index == "source": 184 | ids_tgt, ids_src = df1.sindex.query(df2.geometry, predicate="intersects") 185 | elif spatial_index == "target": 186 | ids_src, ids_tgt = df2.sindex.query(df1.geometry, predicate="intersects") 187 | else: 188 | raise ValueError( 189 | f"'{spatial_index}' is not a valid option. Use 'auto', 'source' or 'target'." 
190 | ) 191 | 192 | areas = df1.geometry.values[ids_src].intersection(df2.geometry.values[ids_tgt]).area 193 | 194 | table = coo_matrix( 195 | ( 196 | areas, 197 | (ids_src, ids_tgt), 198 | ), 199 | shape=(df1.shape[0], df2.shape[0]), 200 | dtype=np.float32, 201 | ) 202 | 203 | table = table.tocsr() 204 | 205 | return table 206 | 207 | 208 | def area_interpolate( 209 | source_df, 210 | target_df, 211 | extensive_variables=None, 212 | intensive_variables=None, 213 | table=None, 214 | allocate_total=True, 215 | spatial_index="auto", 216 | n_jobs=1, 217 | categorical_variables=None, 218 | categorical_frequency=True, 219 | ): 220 | """ 221 | Area interpolation for extensive, intensive and categorical variables. 222 | 223 | Parameters 224 | ---------- 225 | source_df : geopandas.GeoDataFrame 226 | 227 | target_df : geopandas.GeoDataFrame 228 | 229 | extensive_variables : list 230 | [Optional. Default=None] Columns in dataframes for extensive variables 231 | 232 | intensive_variables : list 233 | [Optional. Default=None] Columns in dataframes for intensive variables 234 | 235 | table : scipy.sparse.csr_matrix 236 | [Optional. Default=None] Area allocation source-target correspondence 237 | table. If not provided, it will be built from `source_df` and 238 | `target_df` using `tobler.area_interpolate._area_tables_binning` 239 | 240 | allocate_total : boolean 241 | [Optional. Default=True] True if total value of source area should be 242 | allocated. False if denominator is area of i. Note that the two cases 243 | would be identical when the area of the source polygon is exhausted by 244 | intersections. See Notes for more details. 245 | 246 | spatial_index : str 247 | [Optional. Default="auto"] Spatial index to use to build the 248 | allocation of area from source to target tables. It currently support 249 | the following values: 250 | 251 | - "source": build the spatial index on `source_df` 252 | - "target": build the spatial index on `target_df` 253 | - "auto": attempts to guess the most efficient alternative. 254 | 255 | Currently, this option uses the largest table to build the 256 | index, and performs a `bulk_query` on the shorter table. 257 | This argument is ignored if n_jobs>1 (or n_jobs=-1). 258 | 259 | n_jobs : int 260 | [Optional. Default=1] Number of processes to run in parallel to 261 | generate the area allocation. If -1, this is set to the number of CPUs 262 | available. If `table` is passed, this is ignored. 263 | 264 | categorical_variables : list 265 | [Optional. Default=None] Columns in dataframes for categorical variables 266 | 267 | categorical_frequency : Boolean 268 | [Optional. Default=True] If True, `estimates` returns the frequency of each 269 | value in a categorical variable in every polygon of `target_df` (proportion of 270 | area). If False, `estimates` contains the area in every polygon of `target_df` 271 | that is occupied by each value of the categorical 272 | 273 | Returns 274 | ------- 275 | estimates : geopandas.GeoDataFrame 276 | new geodataframe with interpolated variables as columns and target_df geometry 277 | as output geometry 278 | 279 | Notes 280 | ----- 281 | The assumption is both dataframes have the same coordinate reference system. 282 | For an extensive variable, the estimate at target polygon j (default case) is: 283 | 284 | .. 
math:: 285 | v_j = \\sum_i v_i w_{i,j} 286 | 287 | w_{i,j} = a_{i,j} / \\sum_k a_{i,k} 288 | 289 | If the area of the source polygon is not exhausted by intersections with 290 | target polygons and there is reason to not allocate the complete value of 291 | an extensive attribute, then setting allocate_total=False will use the 292 | following weights: 293 | 294 | $$v_j = \\sum_i v_i w_{i,j}$$ 295 | 296 | $$w_{i,j} = a_{i,j} / a_i$$ 297 | 298 | where a_i is the total area of source polygon i. 299 | For an intensive variable, the estimate at target polygon j is: 300 | 301 | $$v_j = \\sum_i v_i w_{i,j}$$ 302 | 303 | $$w_{i,j} = a_{i,j} / \\sum_k a_{k,j}$$ 304 | 305 | For categorical variables, the estimate returns ratio of presence of each 306 | unique category. 307 | """ 308 | source_df = source_df.copy() 309 | target_df = target_df.copy() 310 | 311 | if _check_crs(source_df, target_df): 312 | pass 313 | else: 314 | return None 315 | 316 | if table is None: 317 | if n_jobs == 1: 318 | table = _area_tables_binning(source_df, target_df, spatial_index) 319 | else: 320 | table = _area_tables_binning_parallel(source_df, target_df, n_jobs=n_jobs) 321 | 322 | dfs = [] 323 | extensive = [] 324 | if extensive_variables: 325 | den = source_df.area.values 326 | if allocate_total: 327 | den = np.asarray(table.sum(axis=1)) 328 | den = den + (den == 0) 329 | den = 1.0 / den 330 | n = den.shape[0] 331 | den = den.reshape((n,)) 332 | den = diags([den], [0]) 333 | weights = den.dot(table) # row standardize table 334 | 335 | for variable in extensive_variables: 336 | vals = _nan_check(source_df, variable) 337 | vals = _inf_check(source_df, variable) 338 | estimates = diags([vals], [0]).dot(weights) 339 | estimates = estimates.sum(axis=0) 340 | extensive.append(estimates.tolist()[0]) 341 | 342 | extensive = np.asarray(extensive) 343 | extensive = np.array(extensive) 344 | extensive = pd.DataFrame(extensive.T, columns=extensive_variables) 345 | 346 | intensive = [] 347 | if intensive_variables: 348 | area = np.asarray(table.sum(axis=0)) 349 | den = 1.0 / (area + (area == 0)) 350 | n, k = den.shape 351 | den = den.reshape((k,)) 352 | den = diags([den], [0]) 353 | weights = table.dot(den) 354 | 355 | for variable in intensive_variables: 356 | vals = _nan_check(source_df, variable) 357 | vals = _inf_check(source_df, variable) 358 | n = vals.shape[0] 359 | vals = vals.reshape((n,)) 360 | estimates = diags([vals], [0]) 361 | estimates = estimates.dot(weights).sum(axis=0) 362 | intensive.append(estimates.tolist()[0]) 363 | 364 | intensive = np.asarray(intensive) 365 | intensive = pd.DataFrame(intensive.T, columns=intensive_variables) 366 | 367 | if categorical_variables: 368 | categorical = {} 369 | for variable in categorical_variables: 370 | unique = source_df[variable].unique() 371 | for value in unique: 372 | mask = source_df[variable] == value 373 | categorical[f"{variable}_{value}"] = np.asarray( 374 | table[mask.to_numpy()].sum(axis=0) 375 | )[0] 376 | 377 | categorical = pd.DataFrame(categorical) 378 | if categorical_frequency is True: 379 | categorical = categorical.div(target_df.area.values, axis="rows") 380 | 381 | if extensive_variables: 382 | dfs.append(extensive) 383 | if intensive_variables: 384 | dfs.append(intensive) 385 | if categorical_variables: 386 | dfs.append(categorical) 387 | 388 | df = pd.concat(dfs, axis=1) 389 | df["geometry"] = target_df[target_df.geometry.name].reset_index(drop=True) 390 | df = gpd.GeoDataFrame(df.replace(np.inf, np.nan)) 391 | 392 | return df.set_index(target_df.index) 
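# ---------------------------------------------------------------------------
# Editor's sketch (added for illustration; NOT part of the original tobler
# source). It shows one way `area_interpolate` above might be called, using
# two hypothetical toy squares so the expected numbers can be checked by hand
# against the weight formulas in the Notes section. Only the function
# signature documented above comes from the source; the data, column names,
# and expected values are assumptions for this example. `gpd` is already
# imported at the top of this module; the block is guarded by `__main__` so
# importing the module stays side-effect free.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from shapely.geometry import box

    # two source squares carrying an extensive count and a categorical label
    src = gpd.GeoDataFrame(
        {"pop": [100, 300], "land_use": ["res", "com"]},
        geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)],
        crs="EPSG:3857",
    )
    # one target square overlapping half of each source polygon
    tgt = gpd.GeoDataFrame(geometry=[box(0.5, 0, 1.5, 1)], crs="EPSG:3857")

    est = area_interpolate(
        src,
        tgt,
        extensive_variables=["pop"],         # w_ij = a_ij / a_i -> 0.5*100 + 0.5*300 = 200
        categorical_variables=["land_use"],  # area shares -> land_use_res = land_use_com = 0.5
        allocate_total=False,                # with the default (True) each source's full
    )                                        # total is allocated, giving 400 instead
    print(est[["pop", "land_use_res", "land_use_com"]])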
393 | -------------------------------------------------------------------------------- /tobler/area_weighted/area_interpolate_dask.py: -------------------------------------------------------------------------------- 1 | """ 2 | Area Weighted Interpolation, out-of-core and parallel through Dask 3 | """ 4 | 5 | import geopandas 6 | import numpy as np 7 | import pandas 8 | 9 | from .area_interpolate import area_interpolate 10 | 11 | __all__ = ['area_interpolate_dask'] 12 | 13 | def area_interpolate_dask( 14 | source_dgdf, 15 | target_dgdf, 16 | id_col, 17 | extensive_variables=None, 18 | intensive_variables=None, 19 | categorical_variables=None, 20 | categorical_frequency=True, 21 | ): 22 | """ 23 | Out-of-core and parallel area interpolation for categorical variables. 24 | 25 | Parameters 26 | ---------- 27 | source_dgdf : dask_geopandas.GeoDataFrame 28 | Dask-geopandas GeoDataFrame 29 | IMPORTANT: the table needs to be spatially shuffled and with spatial partitions. 30 | This is required so only overlapping partitions are checked for interpolation. See 31 | more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html 32 | target_dgdf : dask_geopandas.GeoDataFrame 33 | Dask-geopandas GeoDataFrame 34 | IMPORTANT: the table needs to be spatially shuffled and with spatial partitions. 35 | This is required so only overlapping partitions are checked for interpolation. See 36 | more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html 37 | id_col : str 38 | Name of the column in `target_dgdf` with unique IDs to be used in output table 39 | extensive_variables : list 40 | [Optional. Default=None] Columns in `source_dgdf` for extensive variables. 41 | IMPORTANT: currently NOT implemented. 42 | intensive_variables : list 43 | [Optional. Default=None] Columns in `source_dgdf` for intensive variables 44 | IMPORTANT: currently NOT implemented. 45 | categorical_variables : list 46 | [Optional. Default=None] Columns in `source_dgdf` for categorical variables 47 | IMPORTANT: categorical variables must be of type `'category[known]'`. This is so 48 | all categories are known ahead of time and Dask can run lazily. 49 | categorical_frequency : Boolean 50 | [Optional. Default=True] If True, `estimates` returns the frequency of each 51 | value in a categorical variable in every polygon of `target_df` (proportion of 52 | area). If False, `estimates` contains the area in every polygon of `target_df` 53 | that is occupied by each value of the categorical 54 | 55 | 56 | Returns 57 | ------- 58 | estimates : dask_geopandas.GeoDataFrame 59 | new dask-geopandas geodataframe with interpolated variables and `id_col` as 60 | columns and target_df geometry as output geometry 61 | 62 | """ 63 | try: 64 | import dask_geopandas 65 | from dask.base import tokenize 66 | from dask.highlevelgraph import HighLevelGraph 67 | except ImportError: 68 | raise ImportError( 69 | "Area interpolation with Dask requires `dask` and " 70 | "`dask_geopandas` installed to run. Please install them " 71 | "before importing this functionality." 72 | ) 73 | 74 | if intensive_variables is not None: 75 | raise NotImplementedError( 76 | ( 77 | "Dask-based interpolation of intensive variables is " 78 | "not implemented yet. Please remove intensive variables to " 79 | "be able to run the rest." 
80 | ) 81 | ) 82 | if extensive_variables is not None: 83 | raise NotImplementedError( 84 | ( 85 | "Dask-based interpolation of extensive variables is " 86 | "not implemented yet. Please remove intensive variables to " 87 | "be able to run the rest." 88 | ) 89 | ) 90 | # Categoricals must be Dask's known categorical 91 | if categorical_variables is not None: 92 | category_vars = [] 93 | for cat_var in categorical_variables: 94 | var_names = [f"{cat_var}_{c}" for c in source_dgdf[cat_var].cat.categories] 95 | category_vars.extend(var_names) 96 | else: 97 | category_vars = None 98 | # Build tasks by joining pairs of chunks from left/right 99 | dsk = {} 100 | new_spatial_partitions = [] 101 | parts = geopandas.sjoin( 102 | source_dgdf.spatial_partitions.to_frame("geometry"), 103 | target_dgdf.spatial_partitions.to_frame("geometry"), 104 | how="inner", 105 | predicate="intersects", 106 | ) 107 | parts_left = np.asarray(parts.index) 108 | parts_right = np.asarray(parts["index_right"].values) 109 | name = "area_interpolate-" + tokenize(target_dgdf, source_dgdf) 110 | for i, (l, r) in enumerate(zip(parts_left, parts_right)): 111 | dsk[(name, i)] = ( 112 | id_area_interpolate, 113 | (source_dgdf._name, l), 114 | (target_dgdf._name, r), 115 | id_col, 116 | extensive_variables, 117 | intensive_variables, 118 | None, 119 | True, 120 | "auto", 121 | 1, 122 | categorical_variables, 123 | category_vars, 124 | ) 125 | lr = source_dgdf.spatial_partitions.iloc[l] 126 | rr = target_dgdf.spatial_partitions.iloc[r] 127 | extent = lr.intersection(rr) 128 | new_spatial_partitions.append(extent) 129 | # Create geometries for new spatial partitions 130 | new_spatial_partitions = geopandas.GeoSeries( 131 | data=new_spatial_partitions, crs=source_dgdf.crs 132 | ) 133 | # Build Dask graph 134 | graph = HighLevelGraph.from_collections( 135 | name, dsk, dependencies=[source_dgdf, target_dgdf] 136 | ) 137 | # Get metadata for the outcome table 138 | meta = id_area_interpolate( 139 | source_dgdf._meta, 140 | target_dgdf._meta, 141 | id_col, 142 | extensive_variables=extensive_variables, 143 | intensive_variables=intensive_variables, 144 | table=None, 145 | allocate_total=True, 146 | spatial_index="auto", 147 | n_jobs=1, 148 | categorical_variables=categorical_variables, 149 | category_vars=category_vars, 150 | ) 151 | # Build output table 152 | transferred = dask_geopandas.GeoDataFrame( 153 | graph, name, meta, [None] * (len(dsk) + 1), new_spatial_partitions 154 | ) 155 | # Merge chunks 156 | out = target_dgdf[[id_col, "geometry"]] 157 | ## Extensive --> Not implemented (DAB: the below does not match single-core) 158 | """ 159 | if extensive_variables is not None: 160 | out_extensive = ( 161 | transferred 162 | .groupby(id_col) 163 | [extensive_variables] 164 | .agg({v: 'sum' for v in extensive_variables}) 165 | ) 166 | out = out.join(out_extensive, on=id_col) 167 | """ 168 | ## Intensive --> Weight by area of the chunk (Not implemented) 169 | ## Categorical --> Add up proportions 170 | if categorical_variables is not None: 171 | out_categorical = ( 172 | transferred[category_vars] 173 | .astype(float) 174 | .groupby(transferred[id_col]) 175 | .agg({v: "sum" for v in category_vars}) 176 | ) 177 | out = out.join(out_categorical, on=id_col) 178 | if categorical_frequency is True: 179 | cols = out_categorical.columns.tolist() 180 | out[cols] = out[cols].div(out.area, axis="index") 181 | return out 182 | 183 | 184 | def id_area_interpolate( 185 | source_df, 186 | target_df, 187 | id_col, 188 | extensive_variables=None, 189 
| intensive_variables=None, 190 | table=None, 191 | allocate_total=True, 192 | spatial_index="auto", 193 | n_jobs=1, 194 | categorical_variables=None, 195 | category_vars=None, 196 | ): 197 | """ 198 | Light wrapper around single-core area interpolation to be run on distributed workers 199 | 200 | Parameters 201 | ---------- 202 | source_df : geopandas.GeoDataFrame 203 | target_df : geopandas.GeoDataFrame 204 | id_col : str 205 | Name of the column in `target_dgdf` with unique IDs to be used in output table 206 | extensive_variables : list 207 | [Optional. Default=None] Columns in dataframes for extensive variables 208 | intensive_variables : list 209 | [Optional. Default=None] Columns in dataframes for intensive variables 210 | table : scipy.sparse.csr_matrix 211 | [Optional. Default=None] Area allocation source-target correspondence 212 | table. If not provided, it will be built from `source_df` and 213 | `target_df` using `tobler.area_interpolate._area_tables_binning` 214 | allocate_total : boolean 215 | [Optional. Default=True] True if total value of source area should be 216 | allocated. False if denominator is area of i. Note that the two cases 217 | would be identical when the area of the source polygon is exhausted by 218 | intersections. See Notes for more details. 219 | spatial_index : str 220 | [Optional. Default="auto"] Spatial index to use to build the 221 | allocation of area from source to target tables. It currently support 222 | the following values: 223 | - "source": build the spatial index on `source_df` 224 | - "target": build the spatial index on `target_df` 225 | - "auto": attempts to guess the most efficient alternative. 226 | Currently, this option uses the largest table to build the 227 | index, and performs a `bulk_query` on the shorter table. 228 | This argument is ignored if n_jobs>1 (or n_jobs=-1). 229 | n_jobs : int 230 | [Optional. Default=1] Number of processes to run in parallel to 231 | generate the area allocation. If -1, this is set to the number of CPUs 232 | available. If `table` is passed, this is ignored. 233 | categorical_variables : list 234 | [Optional. Default=None] Columns in dataframes for categorical variables 235 | categories : list 236 | [Optional. 
Default=None] Full list of category names in the format 237 | `f'{var_name}_{cat_name}'` 238 | 239 | Returns 240 | ------- 241 | estimates : geopandas.GeoDataFrame 242 | new geodaraframe with interpolated variables as columns and target_df geometry 243 | as output geometry 244 | 245 | """ 246 | estimates = area_interpolate( 247 | source_df, 248 | target_df, 249 | extensive_variables=extensive_variables, 250 | intensive_variables=intensive_variables, 251 | table=table, 252 | allocate_total=allocate_total, 253 | spatial_index=spatial_index, 254 | n_jobs=n_jobs, 255 | categorical_variables=categorical_variables, 256 | categorical_frequency=False, 257 | ) 258 | estimates[id_col] = target_df[id_col].values 259 | 260 | if categorical_variables is not None: 261 | category_vars_to_add = [] 262 | for category_var in category_vars: 263 | if category_var not in estimates.columns: 264 | category_vars_to_add.append(category_var) 265 | estimates = estimates.join( 266 | pandas.DataFrame(index=estimates.index, columns=category_vars_to_add) 267 | ) 268 | return estimates 269 | -------------------------------------------------------------------------------- /tobler/area_weighted/area_join.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | 5 | __author__ = "Martin Fleischmann " 6 | 7 | __all__ = ["area_join"] 8 | 9 | 10 | def area_join(source_df, target_df, variables): 11 | """ 12 | Join variables from source_df based on the largest intersection. In case of a tie it picks the first one. 13 | 14 | Parameters 15 | ---------- 16 | source_df : geopandas.GeoDataFrame 17 | GeoDataFrame containing source values 18 | target_df : geopandas.GeoDataFrame 19 | GeoDataFrame containing source values 20 | variables : string or list-like 21 | column(s) in source_df dataframe for variable(s) to be joined 22 | 23 | Returns 24 | ------- 25 | joined : geopandas.GeoDataFrame 26 | target_df GeoDataFrame with joined variables as additional columns 27 | 28 | """ 29 | if not pd.api.types.is_list_like(variables): 30 | variables = [variables] 31 | 32 | for v in variables: 33 | if v in target_df.columns: 34 | raise ValueError(f"Column '{v}' already present in target_df.") 35 | 36 | target_df = target_df.copy() 37 | target_ix, source_ix = source_df.sindex.query( 38 | target_df.geometry, predicate="intersects" 39 | ) 40 | areas = ( 41 | target_df.geometry.values[target_ix] 42 | .intersection(source_df.geometry.values[source_ix]) 43 | .area 44 | ) 45 | 46 | main = [] 47 | for i in range(len(target_df)): # vectorise this loop? 48 | mask = target_ix == i 49 | if np.any(mask): 50 | main.append(source_ix[mask][np.argmax(areas[mask])]) 51 | else: 52 | main.append(np.nan) 53 | 54 | main = np.array(main, dtype=float) 55 | mask = ~np.isnan(main) 56 | 57 | for v in variables: 58 | arr = np.empty(len(main), dtype=object) 59 | arr[mask] = source_df[v].values[main[mask].astype(int)] 60 | try: 61 | arr = arr.astype(source_df[v].dtype) 62 | except TypeError: 63 | warnings.warn( 64 | f"Cannot preserve dtype of '{v}'. 
Falling back to `dtype=object`.", 65 | ) 66 | target_df[v] = arr 67 | 68 | return target_df 69 | -------------------------------------------------------------------------------- /tobler/dasymetric/__init__.py: -------------------------------------------------------------------------------- 1 | from .masked_area_interpolate import masked_area_interpolate 2 | from .raster_tools import extract_raster_features, _fast_append_profile_in_gdf 3 | 4 | __all__ = ["masked_area_interpolate"] -------------------------------------------------------------------------------- /tobler/dasymetric/masked_area_interpolate.py: -------------------------------------------------------------------------------- 1 | from warnings import warn 2 | 3 | import geopandas as gpd 4 | 5 | from ..area_weighted import area_interpolate 6 | from .raster_tools import extract_raster_features 7 | 8 | __all__ = ["masked_area_interpolate"] 9 | 10 | 11 | def masked_area_interpolate( 12 | source_df, 13 | target_df, 14 | raster, 15 | pixel_values, 16 | extensive_variables=None, 17 | intensive_variables=None, 18 | categorical_variables=None, 19 | allocate_total=True, 20 | nodata=255, 21 | n_jobs=-1, 22 | codes=None, 23 | ): 24 | """Interpolate data between two polygonal datasets using an auxiliary raster to mask out uninhabited land. 25 | 26 | Parameters 27 | ---------- 28 | source_df : geopandas.GeoDataFrame 29 | source data to be converted to another geometric representation. 30 | target_df : geopandas.GeoDataFrame 31 | target geometries that will form the new representation of the input data 32 | raster : str 33 | path to raster file that contains ancillary data 34 | pixel_values : list of ints 35 | list of pixel values that should be considered part of the mask. For example if 36 | using data from NLCD Land Cover Database , a common 37 | input might be [21,22,23,24], which match the "developed" land types in that dataset 38 | extensive_variables : list 39 | Columns of the input dataframe containing extensive variables to interpolate 40 | intensive_variables : list 41 | Columns of the input dataframe containing intensive variables to interpolate 42 | categorical_variables : list 43 | [Optional. Default=None] Columns in dataframes for categorical variables 44 | allocate_total : bool 45 | whether to allocate the total from the source geometries (the default is True). 46 | nodata : int 47 | value in raster that indicates null or missing values. Default is 255 48 | n_jobs : int 49 | [Optional. Default=-1] Number of processes to run in parallel to 50 | generate the area allocation. If -1, this is set to the number of CPUs 51 | available. 52 | 53 | 54 | Returns 55 | ------- 56 | geopandas.GeoDataFrame 57 | GeoDataFrame with geometries matching the target_df and extensive and intensive 58 | variables as the columns 59 | 60 | """ 61 | if codes: 62 | warn( 63 | "The `codes` keyword is deprecated and will be removed shortly. Please use `pixel_values` instead" 64 | ) 65 | pixel_values = codes 66 | source_df = source_df.copy() 67 | assert not any( 68 | source_df.index.duplicated() 69 | ), "The index of the source_df cannot contain duplicates." 
70 | 71 | # create a vector mask from the raster data 72 | raster_mask = extract_raster_features( 73 | source_df, raster, pixel_values, nodata, n_jobs, collapse_values=True 74 | ) 75 | # create a column in the source_df to dissolve on 76 | idx_name = source_df.index.name if source_df.index.name else "idx" 77 | source_df[idx_name] = source_df.index 78 | 79 | # clip source_df by its mask (overlay/dissolve is faster than gpd.clip here) 80 | source_df = gpd.overlay( 81 | source_df, raster_mask.to_crs(source_df.crs), how="intersection" 82 | ).dissolve(idx_name) 83 | 84 | # continue with standard areal interpolation using the clipped source 85 | interpolation = area_interpolate( 86 | source_df, 87 | target_df.copy(), 88 | extensive_variables=extensive_variables, 89 | intensive_variables=intensive_variables, 90 | n_jobs=n_jobs, 91 | categorical_variables=categorical_variables, 92 | allocate_total=allocate_total, 93 | ) 94 | return interpolation 95 | -------------------------------------------------------------------------------- /tobler/dasymetric/raster_tools.py: -------------------------------------------------------------------------------- 1 | """tools for working with rasters.""" 2 | 3 | import ast 4 | import multiprocessing 5 | import warnings 6 | 7 | import geopandas as gpd 8 | import numpy as np 9 | import pandas as pd 10 | import rasterio as rio 11 | import rasterstats as rs 12 | from joblib import Parallel, delayed 13 | from packaging.version import Version 14 | from rasterio import features 15 | from rasterio.mask import mask 16 | from shapely.geometry import shape 17 | 18 | from ..util.util import _check_presence_of_crs 19 | 20 | GPD_10 = Version(gpd.__version__) >= Version("1.0.0dev") 21 | 22 | __all__ = ["extract_raster_features"] 23 | 24 | 25 | def _chunk_dfs(geoms_to_chunk, n_jobs): 26 | chunk_size = geoms_to_chunk.shape[0] // n_jobs + 1 27 | for i in range(n_jobs): 28 | start = i * chunk_size 29 | yield geoms_to_chunk.iloc[start : start + chunk_size] 30 | 31 | 32 | def _parse_geom(geom_str): 33 | return shape(ast.literal_eval(geom_str)) 34 | 35 | 36 | def _apply_parser(df): 37 | return df.apply(_parse_geom) 38 | 39 | 40 | def _fast_append_profile_in_gdf(geodataframe, raster_path, force_crs_match=True): 41 | """Append categorical zonal statistics (counts by pixel type) as columns to an input geodataframe. 42 | 43 | geodataframe : geopandas.GeoDataFrame 44 | geodataframe that has overlay with the raster. If some polygon do not overlay the raster, 45 | consider a preprocessing step using the function subset_gdf_polygons_from_raster. 46 | raster_path : str 47 | path to the raster image. 48 | force_crs_match : bool, Default is True. 49 | Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to 50 | the CRS of the raster file. It is recommended to let this argument as True. 51 | 52 | Notes 53 | ----- 54 | The generated geodataframe will input the value 0 for each Type that is not present in the raster 55 | for each polygon. 56 | """ 57 | 58 | _check_presence_of_crs(geodataframe) 59 | if force_crs_match: 60 | with rio.open(raster_path) as raster: 61 | geodataframe = geodataframe.to_crs(crs=raster.crs.data) 62 | else: 63 | warnings.warn( 64 | "The GeoDataFrame is not being reprojected. The clipping might be being performing on unmatching polygon to the raster." 
65 | ) 66 | 67 | zonal_gjson = rs.zonal_stats( 68 | geodataframe, raster_path, prefix="Type_", geojson_out=True, categorical=True 69 | ) 70 | 71 | zonal_ppt_gdf = gpd.GeoDataFrame.from_features(zonal_gjson) 72 | 73 | return zonal_ppt_gdf 74 | 75 | 76 | def extract_raster_features( 77 | gdf, raster_path, pixel_values=None, nodata=255, n_jobs=-1, collapse_values=False 78 | ): 79 | """Generate a geodataframe from raster data by polygonizing contiguous pixels with the same value using rasterio's features module. 80 | 81 | Parameters 82 | ---------- 83 | gdf : geopandas.GeoDataFrame 84 | geodataframe defining the area of interest. The input raster will be 85 | clipped to the extent of the geodataframe 86 | raster_path : str 87 | path to raster file, such as downloaded from 88 | pixel_values : list-like, optional 89 | subset of pixel values to extract, by default None. If None, this function 90 | may generate a very large geodataframe 91 | nodata : int, optional 92 | pixel value denoting "no data" in input raster 93 | n_jobs : int 94 | [Optional. Default=-1] Number of processes to run in parallel. If -1, 95 | this is set to the number of CPUs available 96 | collapse_values : bool, optional 97 | If True, multiple values passed to the pixel_values argument are treated 98 | as a single type. I.e. polygons will be generated from any contiguous collection 99 | of values from pixel_types, instead of unique polygons generated for each value 100 | This can dramatically reduce the complexity of the resulting geodataframe a 101 | fewer polygons are required to represent the study area. 102 | 103 | Returns 104 | ------- 105 | geopandas.GeoDataFrame 106 | geodataframe whose rows are the zones extracted by the rasterio.features module. 107 | The geometry of each zone is the boundary of a contiguous group of pixels with 108 | the same value; the `value` column contains the pixel value of each zone. 
109 | """ 110 | if n_jobs == -1: 111 | n_jobs = multiprocessing.cpu_count() 112 | with rio.open(raster_path) as src: 113 | raster_crs = src.crs.to_dict() 114 | gdf = gdf.to_crs(raster_crs) 115 | if GPD_10: 116 | geomask = [gdf.union_all().__geo_interface__] 117 | else: 118 | geomask = [gdf.unary_union.__geo_interface__] 119 | 120 | out_image, out_transform = mask( 121 | src, geomask, nodata=nodata, crop=True 122 | ) # clip to AoI using a vector layer 123 | 124 | if pixel_values: 125 | if collapse_values: 126 | out_image = np.where( 127 | np.isin(out_image, pixel_values), pixel_values[0], out_image 128 | ) # replace values to generate fewer polys 129 | pixel_values = np.isin( 130 | out_image, pixel_values 131 | ) # only include requested pixels 132 | 133 | shapes = list( 134 | features.shapes(out_image, mask=pixel_values, transform=out_transform) 135 | ) # convert regions to polygons 136 | res = list(zip(*shapes)) 137 | geoms = pd.Series(res[0], name="geometry").astype(str) 138 | pieces = _chunk_dfs(geoms, n_jobs) 139 | geoms = pd.concat( 140 | Parallel(n_jobs=n_jobs)(delayed(_apply_parser)(i) for i in pieces) 141 | ) 142 | geoms = gpd.GeoSeries(geoms).buffer(0) # we sometimes get self-intersecting rings 143 | vals = pd.Series(res[1], name="value") 144 | gdf = gpd.GeoDataFrame(vals, geometry=geoms, crs=raster_crs) 145 | if collapse_values: 146 | gdf = gdf.drop(columns=["value"]) # values col is misleading in this case 147 | 148 | return gdf 149 | -------------------------------------------------------------------------------- /tobler/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .glm import glm 2 | 3 | __all__ = ['glm'] -------------------------------------------------------------------------------- /tobler/model/glm.py: -------------------------------------------------------------------------------- 1 | """Model-based methods for areal interpolation.""" 2 | 3 | import numpy as np 4 | import statsmodels.formula.api as smf 5 | from statsmodels.genmod.families import Gaussian, NegativeBinomial, Poisson 6 | 7 | from ..dasymetric import _fast_append_profile_in_gdf 8 | from ..util.util import _check_presence_of_crs 9 | 10 | __all__ = ["glm"] 11 | 12 | 13 | def glm( 14 | source_df=None, 15 | target_df=None, 16 | raster="nlcd_2011", 17 | raster_codes=None, 18 | variable=None, 19 | formula=None, 20 | likelihood="poisson", 21 | force_crs_match=True, 22 | return_model=False, 23 | ): 24 | """Train a generalized linear model to predict polygon attributes based on the collection of pixel values they contain. 25 | 26 | Parameters 27 | ---------- 28 | source_df : geopandas.GeoDataFrame, required 29 | geodataframe containing source original data to be represented by another geometry 30 | target_df : geopandas.GeoDataFrame, required 31 | geodataframe containing target boundaries that will be used to represent the source data 32 | raster : str, required (default="nlcd_2011") 33 | path to raster file that will be used to input data to the regression model. 34 | i.e. a coefficients refer to the relationship between pixel counts and population counts. 35 | Defaults to 2011 NLCD 36 | raster_codes : list, required (default =[21, 22, 23, 24, 41, 42, 52]) 37 | list of integers that represent different types of raster cells. If no formula is given, 38 | the model will be fit from a linear combination of the logged count of each cell type 39 | listed here. 
Defaults to [21, 22, 23, 24, 41, 42, 52] which 40 | are informative land type cells from the NLCD 41 | variable : str, required 42 | name of the variable (column) to be modeled from the `source_df` 43 | formula : str, optional 44 | patsy-style model formula that specifies the model. Raster codes should be prefixed with 45 | "Type_", e.g. `"n_total_pop ~ -1 + np.log1p(Type_21) + np.log1p(Type_22)` 46 | likelihood : str, {'poisson', 'gaussian', 'neg_binomial'} (default = "poisson") 47 | the likelihood function used in the model 48 | force_crs_match : bool 49 | whether to coerce geodataframe and raster to the same CRS 50 | return model : bool 51 | whether to return the fitted model in addition to the interpolated geodataframe. 52 | If true, this will return (geodataframe, model) 53 | 54 | Returns 55 | -------- 56 | interpolated : geopandas.GeoDataFrame 57 | a new geopandas dataframe with boundaries from `target_df` and modeled attribute 58 | data from the `source_df`. If `return_model` is true, the function will also return 59 | the fitted regression model for further diagnostics 60 | 61 | 62 | """ 63 | source_df = source_df.copy() 64 | target_df = target_df.copy() 65 | _check_presence_of_crs(source_df) 66 | liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial} 67 | 68 | if likelihood not in liks.keys(): 69 | raise ValueError(f"likelihood must one of {liks.keys()}") 70 | 71 | if not raster_codes: 72 | raster_codes = [21, 22, 23, 24, 41, 42, 52] 73 | raster_codes = ["Type_" + str(i) for i in raster_codes] 74 | 75 | if not formula: 76 | formula = ( 77 | variable 78 | + "~ -1 +" 79 | + "+".join(["np.log1p(" + code + ")" for code in raster_codes]) 80 | ) 81 | 82 | profiled_df = _fast_append_profile_in_gdf( 83 | source_df[[source_df.geometry.name, variable]], raster, force_crs_match 84 | ) 85 | 86 | results = smf.glm(formula, data=profiled_df, family=liks[likelihood]()).fit() 87 | 88 | out = target_df[[target_df.geometry.name]] 89 | temp = _fast_append_profile_in_gdf( 90 | out[[out.geometry.name]], raster, force_crs_match 91 | ) 92 | 93 | out[variable] = results.predict(temp.drop(columns=[temp.geometry.name]).fillna(0)) 94 | 95 | if return_model: 96 | return out, results 97 | 98 | return out 99 | -------------------------------------------------------------------------------- /tobler/pycno/__init__.py: -------------------------------------------------------------------------------- 1 | from .pycno import pycno_interpolate 2 | 3 | __all__ = ['pycno_interpolate'] 4 | -------------------------------------------------------------------------------- /tobler/pycno/pycno.py: -------------------------------------------------------------------------------- 1 | """Pycnophylactic Interpolation (contributed by @danlewis85).""" 2 | # https://github.com/danlewis85/pycno/ 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import rasterio 7 | from numpy import ( 8 | absolute, 9 | apply_along_axis, 10 | asarray, 11 | convolve, 12 | copy, 13 | nan, 14 | nanmax, 15 | nanmean, 16 | nansum, 17 | pad, 18 | power, 19 | round, 20 | unique, 21 | ) 22 | from numpy.ma import masked_invalid, masked_where 23 | from pandas import DataFrame 24 | from rasterio.features import rasterize 25 | 26 | __all__ = ["pycno_interpolate"] 27 | 28 | 29 | def pycno( 30 | gdf, value_field, cellsize, r=0.2, handle_null=True, converge=3, verbose=True 31 | ): 32 | """Returns a smooth pycnophylactic interpolation raster for a given geodataframe 33 | 34 | Args: 35 | gdf (geopandas.geodataframe.GeoDataFrame): Input 
GeoDataFrame. 36 | value_field (str): Field name of values to be used to produce pycnophylactic surface 37 | cellsize (int): Pixel size of raster in planar units (i.e. metres, feet) 38 | r (float, optional): Relaxation parameter, default of 0.2 is generally fine. 39 | handle_null (boolean, optional): Changes how nodata values are smoothed. Default True. 40 | converge (int, optional): Index for stopping value, default 3 is generally fine. 41 | verbose (boolean, optional): Print out progress at each iteration. 42 | 43 | Returns: 44 | Numpy Array: Smooth pycnophylactic interpolation. 45 | Rasterio geotransform 46 | GeoPandas crs 47 | """ 48 | # set nodata value 49 | nodata = -9999 50 | 51 | # work out raster rows and columns based on gdf extent and cellsize 52 | xmin, ymin, xmax, ymax = gdf.total_bounds 53 | xres = int((xmax - xmin) / cellsize) 54 | yres = int((ymax - ymin) / cellsize) 55 | 56 | # Work out transform so that we rasterize the area where the data are! 57 | trans = rasterio.Affine.from_gdal(xmin, cellsize, 0, ymax, 0, -cellsize) 58 | 59 | # First make a zone array 60 | # NB using index values as ids can often be too large/alphanumeric. Limit is int32 polygon features. 61 | # create a generator of geom, index pairs to use in rasterizing 62 | shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.index)) 63 | # burn the features into a raster array 64 | feature_array = rasterize( 65 | shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans 66 | ) 67 | 68 | # Get cell counts per index value (feature) 69 | unique, count = np.unique(feature_array, return_counts=True) 70 | cellcounts = asarray((unique, count)).T 71 | # Lose the nodata counts 72 | cellcounts = cellcounts[cellcounts[:, 0] != nodata, :] 73 | # Adjust value totals by cells 74 | # Make cell counts dataframe 75 | celldf = DataFrame(cellcounts[:, 1], index=cellcounts[:, 0], columns=["cellcount"]) 76 | # Merge cell counts 77 | gdf = gdf.merge(celldf, how="left", left_index=True, right_index=True) 78 | # Calculate cell values 79 | gdf["cellvalues"] = gdf[value_field] / gdf["cellcount"] 80 | 81 | # create a generator of geom, cellvalue pairs to use in rasterizing 82 | shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.cellvalues)) 83 | # Now burn the initial value raster 84 | value_array = rasterize( 85 | shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans 86 | ) 87 | 88 | # Set nodata in value array to np.nan 89 | value_array[value_array == -9999] = nan 90 | 91 | # Set stopper value based on converge parameter 92 | stopper = nanmax(value_array) * power(10.0, -converge) 93 | 94 | # The basic numpy convolve function doesn't handle nulls. 95 | def smooth2D(data): 96 | # Create function that calls a 1 dimensionsal smoother. 97 | s1d = lambda s: convolve(s, [0.5, 0.0, 0.5], mode="same") 98 | # pad the data array with the mean value 99 | padarray = pad(data, 1, "constant", constant_values=nanmean(data)) 100 | # make nodata mask 101 | mask = masked_invalid(padarray).mask 102 | # set nodata as zero to avoid eroding the raster 103 | padarray[mask] = 0.0 104 | # Apply the convolution along each axis of the data and average 105 | padarray = ( 106 | apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray) 107 | ) / 2 108 | # Reinstate nodata 109 | padarray[mask] = nan 110 | return padarray[1:-1, 1:-1] 111 | 112 | # The convolution function from astropy handles nulls. 
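Before the astropy-based variant below, it may help to see what the `[0.5, 0.0, 0.5]` kernel in `smooth2D` computes: each cell is replaced by the mean of its row neighbours and, in a second pass, its column neighbours, and the two passes are averaged. A self-contained toy sketch of the same idea (no padding or nodata handling, toy values only):

```python
import numpy as np

def neighbour_average(data):
    """Mean of left/right and up/down neighbours, mirroring smooth2D's kernel."""
    s1d = lambda s: np.convolve(s, [0.5, 0.0, 0.5], mode="same")
    rows = np.apply_along_axis(s1d, 1, data)
    cols = np.apply_along_axis(s1d, 0, data)
    return (rows + cols) / 2

toy = np.array(
    [[0.0, 0.0, 0.0],
     [0.0, 9.0, 0.0],
     [0.0, 0.0, 0.0]]
)
# the central spike is redistributed onto its four edge neighbours
print(neighbour_average(toy))
```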
113 | def astroSmooth2d(data): 114 | try: 115 | from astropy.convolution import convolve as astro_convolve 116 | except (ImportError, ModuleNotFoundError) as err: 117 | raise ImportError( 118 | "Pycnophylactic interpolation with handle_null=True " 119 | "requires the astropy package" 120 | ) from err 121 | s1d = lambda s: astro_convolve(s, [0.5, 0, 0.5]) 122 | # pad the data array with the mean value 123 | padarray = pad(data, 1, "constant", constant_values=nanmean(data)) 124 | # Apply the convolution along each axis of the data and average 125 | padarray = ( 126 | apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray) 127 | ) / 2 128 | return padarray[1:-1, 1:-1] 129 | 130 | def correct2Da(data): 131 | for idx, val in gdf[value_field].items(): 132 | # Create zone mask from feature_array 133 | mask = masked_where(feature_array == idx, feature_array).mask 134 | # Work out the correction factor 135 | correct = (val - nansum(data[mask])) / mask.sum() 136 | # Apply correction 137 | data[mask] += correct 138 | 139 | return data 140 | 141 | def correct2Dm(data): 142 | for idx, val in gdf[value_field].items(): 143 | # Create zone mask from feature_array 144 | mask = masked_where(feature_array == idx, feature_array).mask 145 | # Work out the correction factor 146 | correct = val / nansum(data[mask]) 147 | if correct != 0.0: 148 | # Apply correction 149 | data[mask] *= correct 150 | 151 | return data 152 | 153 | while True: 154 | # Store the current iteration 155 | old = copy(value_array) 156 | 157 | # Smooth the value_array 158 | if handle_null: 159 | sm = astroSmooth2d(value_array) 160 | else: 161 | sm = smooth2D(value_array) 162 | 163 | # Relaxation to prevent overcompensation in the smoothing step 164 | value_array = value_array * r + (1.0 - r) * sm 165 | 166 | # Perform correction 167 | value_array = correct2Da(value_array) 168 | 169 | # Reset any negative values to zero. 170 | value_array[value_array < 0] = 0.0 171 | 172 | # Perform correction 173 | value_array = correct2Dm(value_array) 174 | 175 | if verbose: 176 | print( 177 | "Maximum Change: " 178 | + str(round(nanmax(absolute(old - value_array)), 4)) 179 | + " - will stop at " 180 | + str(round(stopper, 4)) 181 | ) 182 | 183 | if nanmax(absolute(old - value_array)) < stopper: 184 | break 185 | 186 | return (value_array, trans, gdf.crs) 187 | 188 | 189 | def save_pycno(pycno_array, transform, crs, filestring, driver="GTiff"): 190 | """Saves a numpy array as a raster, largely a helper function for pycno 191 | Args: 192 | pycno_array (numpy array): 2D numpy array of pycnophylactic surface 193 | transform (rasterio geotransform): Relevant transform from pycno() 194 | crs (GeoPandas crs): Coordinate reference system of GeoDataFrame used in pycno() 195 | filestring (str): File path to save raster 196 | driver (str, optional): Format for output raster, default: geoTiff. 
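The `correct2Da`/`correct2Dm` pair used in the loop above is what makes the surface pycnophylactic: after every smoothing pass, the cells falling in each source zone are nudged, first additively and then multiplicatively, so that they again sum to the zone's observed value. A toy sketch of both corrections for a single zone (numbers invented):

```python
import numpy as np

zone_cells = np.array([2.0, 3.0, 4.0])  # smoothed cell values inside one zone
zone_total = 12.0                       # the zone's observed value

# additive correction (correct2Da): spread the deficit evenly over the zone
additive = zone_cells + (zone_total - zone_cells.sum()) / zone_cells.size
print(additive, additive.sum())          # [3. 4. 5.] 12.0

# multiplicative correction (correct2Dm): rescale to match the total exactly,
# which keeps any zeros produced by the non-negativity clamp at zero
multiplicative = zone_cells * (zone_total / zone_cells.sum())
print(multiplicative, multiplicative.sum())  # approx [2.67 4. 5.33] 12.0
```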
197 | Returns: 198 | None 199 | """ 200 | import rasterio 201 | 202 | # Save raster 203 | new_dataset = rasterio.open( 204 | filestring, 205 | "w", 206 | driver=driver, 207 | height=pycno_array.shape[0], 208 | width=pycno_array.shape[1], 209 | count=1, 210 | dtype="float64", 211 | crs=crs, 212 | transform=transform, 213 | ) 214 | new_dataset.write(pycno_array.astype("float64"), 1) 215 | new_dataset.close() 216 | 217 | return None 218 | 219 | 220 | def extract_values(pycno_array, gdf, transform, fieldname="Estimate"): 221 | """Extract raster value sums according to a provided polygon geodataframe 222 | Args: 223 | pycno_array (numpy array): 2D numpy array of pycnophylactic surface. 224 | gdf (geopandas.geodataframe.GeoDataFrame): Target GeoDataFrame. 225 | transform (rasterio geotransform): Relevant transform from pycno() 226 | fieldname (str, optional): New gdf field to save estimates in. Default name: 'Estimate'. 227 | Returns: 228 | geopandas.geodataframe.GeoDataFrame: Target GeoDataFrame with appended estimates. 229 | """ 230 | from numpy import nansum 231 | from rasterio.features import geometry_mask 232 | 233 | estimates = [] 234 | # Iterate through geodataframe and extract values 235 | for idx, geom in gdf["geometry"].items(): 236 | mask = geometry_mask( 237 | [geom], pycno_array.shape, transform=transform, invert=True 238 | ) 239 | estimates.append(nansum(pycno_array[mask])) 240 | out = pd.Series(estimates, index=gdf.index) 241 | return out 242 | 243 | 244 | def pycno_interpolate( 245 | source_df, 246 | target_df, 247 | variables, 248 | cellsize, 249 | r=0.2, 250 | handle_null=True, 251 | converge=3, 252 | verbose=False, 253 | ): 254 | """Pycnophylactic Inerpolation. 255 | 256 | Parameters 257 | ---------- 258 | source_df : geopandas.GeoDataFrame (required) 259 | geodataframe with polygon geometries and data to transfer 260 | target_df : geopandas.GeoDataFrame (required) 261 | geodataframe with polygon geometries to receive new data 262 | variables : list 263 | columns on the source_df containing data to transfer 264 | cellsize : int 265 | Pixel size of intermediate raster in planar units (i.e. metres, feet) 266 | r : float, optional 267 | Relaxation parameter, default of 0.2 is generally fine 268 | handle_null : bool, optional 269 | Changes how nodata values are smoothed. Default True. 270 | converge : int, optional 271 | Index for stopping value, default 3 is generally fine. 272 | verbose : bool, optional 273 | Print out progress at each iteration. 274 | 275 | Returns 276 | ------- 277 | geopandas.GeoDataFrame 278 | new geodataframe with interpolated variables as columns and target_df geometry 279 | as output geometry 280 | 281 | Notes 282 | ----- 283 | The formula is based on Tobler, W. R. (1979). Smooth pycnophylactic interpolation for geographical regions. Journal of the American Statistical Association, 74(367), 519–529. https://doi.org/10.1080/01621459.1979.10481647 284 | 285 | Original implementation written by @danlewis85 at 286 | and based in part on the R pycno package by Chris Brusndon () 287 | 288 | References: :cite:`tobler_smooth_1979` 289 | """ 290 | assert source_df.crs.equals( 291 | target_df.crs 292 | ), "source_df CRS and target_df CRS are not the same. 
Reproject into consistent systems before proceeding" 293 | output_vars = target_df.copy()[[target_df.geometry.name]] 294 | for variable in variables: 295 | pyc, trans, _ = pycno( 296 | source_df, 297 | variable, 298 | cellsize=cellsize, 299 | r=r, 300 | handle_null=handle_null, 301 | converge=converge, 302 | verbose=verbose, 303 | ) 304 | vals = extract_values(pyc, target_df, transform=trans) 305 | output_vars[variable] = vals 306 | 307 | return output_vars 308 | -------------------------------------------------------------------------------- /tobler/tests/test_area_interpolators.py: -------------------------------------------------------------------------------- 1 | """test interpolation functions.""" 2 | 3 | import geopandas 4 | import dask_geopandas 5 | 6 | from libpysal.examples import load_example 7 | from numpy.testing import assert_almost_equal 8 | from tobler.area_weighted import area_interpolate 9 | from tobler.area_weighted import area_interpolate_dask 10 | from tobler.area_weighted.area_interpolate import _area_tables_binning 11 | from geopandas.testing import assert_geodataframe_equal 12 | import pytest 13 | 14 | 15 | def datasets(): 16 | sac1 = load_example("Sacramento1") 17 | sac2 = load_example("Sacramento2") 18 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 19 | sac1 = sac1.to_crs(sac1.estimate_utm_crs()) 20 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp")) 21 | sac2 = sac2.to_crs(sac1.crs) 22 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT 23 | categories = ["cat", "dog", "donkey", "wombat", "capybara"] 24 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[: len(sac1)] 25 | 26 | return sac1, sac2 27 | 28 | 29 | def test_area_interpolate_singlecore(): 30 | sac1, sac2 = datasets() 31 | area = area_interpolate( 32 | source_df=sac1, 33 | target_df=sac2, 34 | extensive_variables=["TOT_POP"], 35 | intensive_variables=["pct_poverty"], 36 | categorical_variables=["animal"], 37 | n_jobs=1, 38 | ) 39 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 40 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 41 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) 42 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) 43 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0) 44 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) 45 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) 46 | 47 | 48 | def test_area_interpolate_extensive(): 49 | sac1, sac2 = datasets() 50 | area = area_interpolate( 51 | source_df=sac1.to_crs(4326), # trigger warning once 52 | target_df=sac2.to_crs(4326), 53 | extensive_variables=["TOT_POP"], 54 | n_jobs=1, 55 | ) 56 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 57 | 58 | 59 | def test_area_interpolate_intensive(): 60 | sac1, sac2 = datasets() 61 | area = area_interpolate( 62 | source_df=sac1, 63 | target_df=sac2, 64 | intensive_variables=["pct_poverty"], 65 | n_jobs=1, 66 | ) 67 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 68 | 69 | 70 | def test_area_interpolate_categorical(): 71 | sac1, sac2 = datasets() 72 | area = area_interpolate( 73 | source_df=sac1, 74 | target_df=sac2, 75 | extensive_variables=["TOT_POP"], 76 | intensive_variables=["pct_poverty"], 77 | categorical_variables=["animal"], 78 | n_jobs=1, 79 | ) 80 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) 81 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) 82 | assert_almost_equal(area.animal_donkey.sum(), 22, 
decimal=0) 83 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) 84 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) 85 | 86 | 87 | @pytest.mark.xfail(reason="dask_geopandas is broken with dask-expr backend") 88 | def test_area_interpolate_categorical_dask(): 89 | sac1, sac2 = datasets() 90 | sac1["animal"] = sac1["animal"].astype("category") 91 | dsac1 = dask_geopandas.from_geopandas(sac1, npartitions=2).spatial_shuffle( 92 | by="hilbert", shuffle="tasks" 93 | ) 94 | dsac2 = dask_geopandas.from_geopandas(sac2, npartitions=2).spatial_shuffle( 95 | by="hilbert", shuffle="tasks" 96 | ) 97 | area = area_interpolate_dask( 98 | source_dgdf=dsac1, 99 | target_dgdf=dsac2, 100 | id_col="ZIP", 101 | categorical_variables=["animal"], 102 | ).compute() 103 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) 104 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) 105 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0) 106 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) 107 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) 108 | 109 | 110 | def test_area_interpolate_custom_index(): 111 | sac1, sac2 = datasets() 112 | sac1.index = sac1.index * 2 113 | sac2.index = sac2.index * 13 114 | area = area_interpolate( 115 | source_df=sac1, 116 | target_df=sac2, 117 | extensive_variables=["TOT_POP"], 118 | intensive_variables=["pct_poverty"], 119 | categorical_variables=["animal"], 120 | ) 121 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 122 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 123 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) 124 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) 125 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0) 126 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) 127 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) 128 | assert not area.isna().any().any() 129 | 130 | 131 | def test_area_interpolate_sindex_options(): 132 | sac1, sac2 = datasets() 133 | auto = area_interpolate( 134 | source_df=sac1, 135 | target_df=sac2, 136 | extensive_variables=["TOT_POP"], 137 | intensive_variables=["pct_poverty"], 138 | ) 139 | source = area_interpolate( 140 | source_df=sac1, 141 | target_df=sac2, 142 | extensive_variables=["TOT_POP"], 143 | intensive_variables=["pct_poverty"], 144 | spatial_index="source", 145 | ) 146 | target = area_interpolate( 147 | source_df=sac1, 148 | target_df=sac2, 149 | extensive_variables=["TOT_POP"], 150 | intensive_variables=["pct_poverty"], 151 | spatial_index="target", 152 | ) 153 | 154 | assert_geodataframe_equal(auto, source) 155 | assert_geodataframe_equal(auto, target) 156 | 157 | with pytest.raises(ValueError): 158 | area_interpolate( 159 | source_df=sac1, 160 | target_df=sac2, 161 | extensive_variables=["TOT_POP"], 162 | intensive_variables=["pct_poverty"], 163 | spatial_index="non-existent", 164 | ) 165 | 166 | 167 | def test_area_interpolate_parallel(): 168 | sac1, sac2 = datasets() 169 | area = area_interpolate( 170 | source_df=sac1, 171 | target_df=sac2, 172 | extensive_variables=["TOT_POP"], 173 | intensive_variables=["pct_poverty"], 174 | n_jobs=-1, 175 | ) 176 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 177 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 178 | 179 | 180 | def test_area_tables_binning(): 181 | sac1, sac2 = datasets() 182 | sac1 = sac1.to_crs(4326) 183 | sac2 = sac2.to_crs(4326) 184 | 185 | auto = _area_tables_binning( 186 
| source_df=sac1, target_df=sac2, spatial_index="auto" 187 | ) 188 | source = _area_tables_binning( 189 | source_df=sac1, target_df=sac2, spatial_index="source" 190 | ) 191 | target = _area_tables_binning( 192 | source_df=sac1, target_df=sac2, spatial_index="target" 193 | ) 194 | 195 | assert (auto != source).sum() == 0 196 | assert (auto != target).sum() == 0 197 | 198 | assert auto.sum() == pytest.approx(1.3879647) 199 | assert auto.mean() == pytest.approx(2.7552649e-05) 200 | 201 | assert (auto[5][0].toarray() > 0).sum() == 7 202 | 203 | 204 | def test_passed_table(): 205 | sac1, sac2 = datasets() 206 | csr = _area_tables_binning(source_df=sac1, target_df=sac2, spatial_index="auto") 207 | 208 | area = area_interpolate( 209 | source_df=sac1, 210 | target_df=sac2, 211 | extensive_variables=["TOT_POP"], 212 | intensive_variables=["pct_poverty"], 213 | table=csr, 214 | ) 215 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 216 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 217 | 218 | dok = csr.todok() 219 | 220 | area = area_interpolate( 221 | source_df=sac1, 222 | target_df=sac2, 223 | extensive_variables=["TOT_POP"], 224 | intensive_variables=["pct_poverty"], 225 | table=dok, 226 | ) 227 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) 228 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) 229 | -------------------------------------------------------------------------------- /tobler/tests/test_area_join.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | import numpy as np 3 | from shapely.geometry import Point 4 | 5 | import pytest 6 | 7 | from tobler.area_weighted import area_join 8 | 9 | 10 | class TestAreaJoin: 11 | def setup_method(self): 12 | self.grid = gpd.points_from_xy( 13 | np.repeat(np.linspace(1, 10, 10), 10), np.tile(np.linspace(1, 10, 10), 10) 14 | ).buffer(0.5, cap_style=3) 15 | self.source = gpd.GeoDataFrame( 16 | { 17 | "floats": np.linspace(1, 10, 100), 18 | "ints": np.linspace(1, 100, 100, dtype="int"), 19 | "strings": np.array(["darribas", "is", "the", "king"] * 25), 20 | }, 21 | geometry=self.grid, 22 | ) 23 | 24 | self.target = gpd.GeoDataFrame(geometry=self.grid.translate(xoff=2.2, yoff=0.2)) 25 | 26 | def test_area_join_float(self): 27 | result = area_join(self.source, self.target, "floats") 28 | assert (result.columns == ["geometry", "floats"]).all() 29 | np.testing.assert_almost_equal(result.floats.mean(), 6.409, 3) 30 | assert result.floats.dtype == float 31 | assert result.floats.isna().sum() == 20 32 | 33 | def test_area_join_ints(self): 34 | with pytest.warns(UserWarning, match="Cannot preserve dtype of"): 35 | result = area_join(self.source, self.target, "ints") 36 | 37 | assert (result.columns == ["geometry", "ints"]).all() 38 | np.testing.assert_almost_equal(result.ints.mean(), 60.5, 3) 39 | assert result.ints.dtype == object 40 | assert type(result.ints.iloc[0]) == int 41 | assert result.ints.isna().sum() == 20 42 | 43 | def test_area_join_strings(self): 44 | result = area_join(self.source, self.target, "strings") 45 | assert (result.columns == ["geometry", "strings"]).all() 46 | assert result.strings.dtype == object 47 | assert type(result.strings.iloc[0]) == str 48 | assert result.strings.isna().sum() == 20 49 | 50 | def test_area_join_array(self): 51 | with pytest.warns(UserWarning, match="Cannot preserve dtype of"): 52 | result = area_join(self.source, self.target, ["floats", "ints", "strings"]) 53 | 54 | assert (result.columns == 
["geometry", "floats", "ints", "strings"]).all() 55 | np.testing.assert_almost_equal(result.floats.mean(), 6.409, 3) 56 | assert result.floats.dtype == float 57 | assert result.floats.isna().sum() == 20 58 | np.testing.assert_almost_equal(result.ints.mean(), 60.5, 3) 59 | assert result.ints.dtype == object 60 | assert type(result.ints.iloc[0]) == int 61 | assert result.ints.isna().sum() == 20 62 | assert result.strings.dtype == object 63 | assert type(result.strings.iloc[0]) == str 64 | assert result.strings.isna().sum() == 20 65 | 66 | def test_area_join_error(self): 67 | target = self.target 68 | target["floats"] = 0 69 | with pytest.raises(ValueError, match="Column 'floats'"): 70 | area_join(self.source, target, "floats") 71 | -------------------------------------------------------------------------------- /tobler/tests/test_dasymetric.py: -------------------------------------------------------------------------------- 1 | """test interpolation functions.""" 2 | import geopandas 3 | 4 | from libpysal.examples import load_example 5 | from tobler.dasymetric import masked_area_interpolate 6 | 7 | 8 | def datasets(): 9 | sac1 = load_example("Sacramento1") 10 | sac2 = load_example("Sacramento2") 11 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 12 | sac1 = sac1.to_crs(sac1.estimate_utm_crs()) 13 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp")) 14 | sac2 = sac2.to_crs(sac2.estimate_utm_crs()) 15 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT 16 | categories = ["cat", "dog", "donkey", "wombat", "capybara"] 17 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[ 18 | : len(sac1) 19 | ] 20 | return sac1, sac2 21 | 22 | 23 | def test_masked_area_interpolate(): 24 | sac1, sac2 = datasets() 25 | masked = masked_area_interpolate( 26 | source_df=sac1, 27 | target_df=sac2, 28 | extensive_variables=["TOT_POP"], 29 | intensive_variables=["pct_poverty"], 30 | raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif", 31 | pixel_values=[21, 22, 23, 24], 32 | ) 33 | assert masked.TOT_POP.sum().round(0) == sac1.TOT_POP.sum() 34 | assert masked.pct_poverty.sum() > 2000 35 | -------------------------------------------------------------------------------- /tobler/tests/test_model.py: -------------------------------------------------------------------------------- 1 | """test interpolation functions.""" 2 | import geopandas 3 | 4 | from libpysal.examples import load_example 5 | 6 | from tobler.model import glm 7 | 8 | 9 | def datasets(): 10 | sac1 = load_example("Sacramento1") 11 | sac2 = load_example("Sacramento2") 12 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 13 | sac1 = sac1.to_crs(sac1.estimate_utm_crs()) 14 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp")) 15 | sac2 = sac2.to_crs(sac2.estimate_utm_crs()) 16 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT 17 | categories = ["cat", "dog", "donkey", "wombat", "capybara"] 18 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[ 19 | : len(sac1) 20 | ] 21 | 22 | return sac1, sac2 23 | 24 | 25 | def test_glm_poisson(): 26 | sac1, sac2 = datasets() 27 | glm_poisson = glm( 28 | source_df=sac2, target_df=sac1, variable="POP2001", raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif", 29 | ) 30 | assert glm_poisson.POP2001.sum() > 1469000 31 | -------------------------------------------------------------------------------- /tobler/tests/test_pycno.py: 
-------------------------------------------------------------------------------- 1 | """test interpolation functions.""" 2 | import geopandas 3 | 4 | from libpysal.examples import load_example 5 | from numpy.testing import assert_almost_equal 6 | from tobler.pycno import pycno_interpolate 7 | 8 | 9 | def datasets(): 10 | sac1 = load_example("Sacramento1") 11 | sac2 = load_example("Sacramento2") 12 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 13 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp")) 14 | sac1 = sac1.to_crs(sac1.estimate_utm_crs()) 15 | sac2 = sac2.to_crs(sac1.crs) 16 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT 17 | 18 | return sac1, sac2 19 | 20 | 21 | def test_pycno_interpolate(): 22 | sac1, sac2 = datasets() 23 | pyc = pycno_interpolate( 24 | source_df=sac1, target_df=sac2, variables=["TOT_POP"], cellsize=500 25 | ) 26 | assert_almost_equal(pyc.TOT_POP.sum(), 1794618.503, decimal=1) 27 | 28 | def test_custom_index(): 29 | sac1, sac2 = datasets() 30 | sac2 = sac2.set_index("ZIP") 31 | pyc = pycno_interpolate( 32 | source_df=sac1, target_df=sac2, variables=["TOT_POP"], cellsize=500 33 | ) 34 | assert_almost_equal(pyc.TOT_POP.sum(), 1794618.503, decimal=1) -------------------------------------------------------------------------------- /tobler/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """test utility functions.""" 2 | 3 | import platform 4 | 5 | import geopandas 6 | import pytest 7 | from libpysal.examples import load_example 8 | from numpy.testing import assert_almost_equal 9 | 10 | from tobler.util import h3fy 11 | 12 | 13 | def test_h3fy(): 14 | sac1 = load_example("Sacramento1") 15 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 16 | sac_hex = h3fy(sac1, return_geoms=True) 17 | assert sac_hex.shape == (364, 1) 18 | 19 | 20 | def test_h3fy_nogeoms(): 21 | sac1 = load_example("Sacramento1") 22 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 23 | sac_hex = h3fy(sac1, return_geoms=False) 24 | assert len(sac_hex) == 364 25 | 26 | 27 | def test_h3fy_nocrs(): 28 | sac1 = load_example("Sacramento1") 29 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 30 | sac1.crs = None 31 | try: 32 | sac_hex = h3fy(sac1, return_geoms=True) 33 | except ValueError: 34 | pass 35 | 36 | 37 | def test_h3fy_diff_crs(): 38 | sac1 = load_example("Sacramento1") 39 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 40 | sac1 = sac1.to_crs(32710) 41 | sac_hex = h3fy(sac1) 42 | assert sac_hex.shape == (364, 1) 43 | assert sac_hex.crs.to_string() == "EPSG:32710" 44 | 45 | 46 | def test_h3fy_clip(): 47 | sac1 = load_example("Sacramento1") 48 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 49 | sac_hex = h3fy(sac1, clip=True) 50 | sac_hex = sac_hex.to_crs(sac_hex.estimate_utm_crs()) 51 | assert_almost_equal( 52 | sac_hex.area.sum(), 13131736346.537422, decimal=0 53 | ) 54 | 55 | def test_h3fy_clip_buffer(): 56 | sac1 = load_example("Sacramento1") 57 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp")) 58 | sac_hex = h3fy(sac1, clip=True, buffer=True) 59 | sac_hex = sac_hex.to_crs(sac_hex.estimate_utm_crs()) 60 | sac1 = sac1.to_crs(sac_hex.estimate_utm_crs()) 61 | assert_almost_equal( 62 | sac_hex.area.sum(), sac1.area.sum(), decimal=-8 63 | ) 64 | 65 | @pytest.mark.skipif(platform.system() == "Windows", reason='Unknown precision error on Windows. 
See #174 for details') 66 | def test_h3_multipoly(): 67 | va = geopandas.read_file(load_example("virginia").get_path("virginia.shp")) 68 | va = va.to_crs(va.estimate_utm_crs()) 69 | 70 | va = h3fy(va) 71 | assert_almost_equal(va.area.sum(), 102888497504.47836, decimal=0) 72 | -------------------------------------------------------------------------------- /tobler/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | 3 | __all__ = ['h3fy', 'circumradius'] -------------------------------------------------------------------------------- /tobler/util/util.py: -------------------------------------------------------------------------------- 1 | """Useful functions to support tobler's interpolation methods.""" 2 | 3 | from warnings import warn 4 | 5 | import geopandas 6 | import numpy as np 7 | import pandas 8 | import shapely 9 | from packaging.version import Version 10 | from shapely.geometry import Polygon 11 | 12 | GPD_10 = Version(geopandas.__version__) >= Version("1.0.0dev") 13 | 14 | __all__ = ["h3fy", "circumradius"] 15 | 16 | 17 | def circumradius(resolution): 18 | """Find the circumradius of an h3 hexagon at given resolution. 19 | 20 | Parameters 21 | ---------- 22 | resolution : int 23 | h3 grid resolution 24 | 25 | Returns 26 | ------- 27 | circumradius : float 28 | circumradius in meters 29 | """ 30 | try: 31 | import h3 32 | except ImportError: 33 | raise ImportError( 34 | "This function requires the `h3` library. " 35 | "You can install it with `conda install h3-py` or " 36 | "`pip install h3`" 37 | ) 38 | if Version(h3.__version__) < Version("4.0"): 39 | return h3.edge_length(resolution, "m") 40 | return h3.average_hexagon_edge_length(resolution, "m") 41 | 42 | 43 | def _check_crs(source_df, target_df): 44 | """check if crs is identical""" 45 | if not (source_df.crs == target_df.crs): 46 | print("Source and target dataframes have different crs. Please correct.") 47 | return False 48 | return True 49 | 50 | 51 | def _nan_check(df, column): 52 | """Check if variable has nan values. 53 | 54 | Warn and replace nan with 0.0. 55 | """ 56 | values = df[column].values 57 | if np.any(np.isnan(values)) or np.any(np.isinf(values)): 58 | wherenan = np.isnan(values) 59 | values[wherenan] = 0.0 60 | warn(f"nan values in variable: {column}, replacing with 0") 61 | return values 62 | 63 | 64 | def _inf_check(df, column): 65 | """Check if variable has nan values. 66 | 67 | Warn and replace inf with 0.0. 68 | """ 69 | values = df[column].values 70 | if np.any(np.isinf(values)): 71 | wherenan = np.isinf(values) 72 | values[wherenan] = 0.0 73 | warn(f"inf values in variable: {column}, replacing with 0") 74 | return values 75 | 76 | 77 | def _check_presence_of_crs(geoinput): 78 | """check if there is crs in the polygon/geodataframe""" 79 | if geoinput.crs is None: 80 | raise KeyError("Geodataframe must have a CRS set before using this function.") 81 | 82 | 83 | def h3fy(source, resolution=6, clip=False, buffer=False, return_geoms=True): 84 | """Generate a hexgrid geodataframe that covers the face of a source geodataframe. 85 | 86 | Parameters 87 | ---------- 88 | source : geopandas.GeoDataFrame 89 | GeoDataFrame to transform into a hexagonal grid 90 | resolution : int, optional (default is 6) 91 | resolution of output h3 hexgrid. 92 | See for more information 93 | clip : bool, optional (default is False) 94 | if True, hexagons are clipped by the boundary of the source gdf. 
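As a quick illustration of `circumradius` above (and of the buffering distance `h3fy` derives from it), a hedged sketch that assumes the optional `h3` dependency is installed; the printed values come from the installed h3 library's edge-length tables:

```python
from tobler.util import circumradius

# average hexagon circumradius (== edge length for a regular hexagon), in metres
for res in (5, 6, 7, 8):
    print(res, round(circumradius(res), 1))

# h3fy(source, resolution=res, buffer=True) buffers the source by this distance
# before filling it with hexagons, so boundary hexes fully cover the study area
```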
Otherwise, 95 | heaxgons along the boundary will be left intact. 96 | buffer : bool, optional (default is False) 97 | if True, force hexagons to completely fill the interior of the source area. 98 | if False, (h3 default) may result in empty areas within the source area. 99 | return_geoms: bool, optional (default is True) 100 | whether to generate hexagon geometries as a geodataframe or simply return 101 | hex ids as a pandas.Series 102 | 103 | Returns 104 | ------- 105 | pandas.Series or geopandas.GeoDataFrame 106 | if `return_geoms` is True, a geopandas.GeoDataFrame whose rows comprise a hexagonal h3 grid (indexed on h3 hex id). 107 | if `return_geoms` is False, a pandas.Series of h3 hexagon ids 108 | """ 109 | try: 110 | import h3 111 | except ImportError as err: 112 | raise ImportError( 113 | "This function requires the `h3` library. " 114 | "You can install it with `conda install h3-py` or " 115 | "`pip install h3`" 116 | ) from err 117 | # h3 hexes only work on polygons, not multipolygons 118 | if source.crs is None: 119 | raise ValueError( 120 | "source geodataframe must have a valid CRS set before using this function" 121 | ) 122 | 123 | orig_crs = source.crs 124 | clipper = source 125 | 126 | if source.crs.is_geographic: 127 | if buffer: # if CRS is geographic but user wants a buffer, we need to estimate 128 | warn( 129 | "The source geodataframe is stored in a geographic CRS. Falling back to estimated UTM zone " 130 | "to generate desired buffer. If this produces unexpected results, reproject the input data " 131 | "prior to using this function" 132 | ) 133 | source = ( 134 | source.to_crs(source.estimate_utm_crs()) 135 | .buffer(circumradius(resolution)) 136 | .to_crs(4326) 137 | ) 138 | 139 | else: # if CRS is projected, we need lat/long 140 | crs_units = source.crs.to_dict()["units"] 141 | if buffer: # we can only convert between units we know 142 | if not crs_units in ["m", "us-ft"]: 143 | raise ValueError( 144 | f"The CRS of source geodataframe uses an unknown measurement unit: `{crs_units}`. " 145 | "The `buffer` argument requires either a geographic CRS or a projected one measured " 146 | "in meters or feet (U.S.)" 147 | ) 148 | clipper = source.to_crs(4326) 149 | distance = circumradius(resolution) 150 | if crs_units == "ft-us": 151 | distance = distance * 3.281 152 | source = source.buffer(distance).to_crs(4326) 153 | else: 154 | source = source.to_crs(4326) 155 | 156 | if GPD_10: 157 | source_unary = shapely.force_2d(source.union_all()) 158 | else: 159 | source_unary = shapely.force_2d(source.unary_union) 160 | 161 | if type(source_unary) == Polygon: 162 | hexagons = _to_hex( 163 | source_unary, resolution=resolution, return_geoms=return_geoms 164 | ) 165 | else: 166 | output = [] 167 | for geom in source_unary.geoms: 168 | hexes = _to_hex(geom, resolution=resolution, return_geoms=return_geoms) 169 | output.append(hexes) 170 | hexagons = pandas.concat(output) 171 | 172 | if return_geoms and clip: 173 | hexagons = geopandas.clip(hexagons, clipper) 174 | 175 | if return_geoms and not hexagons.crs.equals(orig_crs): 176 | hexagons = hexagons.to_crs(orig_crs) 177 | 178 | return hexagons 179 | 180 | 181 | def _to_hex(source, resolution=6, return_geoms=True, buffer=True): 182 | """Generate a hexgrid geodataframe that covers the face of a source geometry. 
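With `h3fy` fully listed above, a hedged end-to-end sketch in the spirit of the census-to-hexgrid notebook — the example dataset mirrors the test suite, while the resolution and column choice are illustrative:

```python
import geopandas as gpd
from libpysal.examples import load_example
from tobler.util import h3fy
from tobler.area_weighted import area_interpolate

# Sacramento tracts from the libpysal example data, as in the tests above
sac = gpd.read_file(load_example("Sacramento1").get_path("sacramentot2.shp"))
sac = sac.to_crs(sac.estimate_utm_crs())

# hexagonal target geometries covering the tracts; clip=True trims boundary hexes
hexes = h3fy(sac, resolution=7, clip=True)

# transfer a count variable onto the hex grid with areal weighting
hex_pop = area_interpolate(
    source_df=sac, target_df=hexes, extensive_variables=["TOT_POP"]
)
print(hex_pop.TOT_POP.sum(), sac.TOT_POP.sum())  # totals should be close
```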
183 | 184 | Parameters 185 | ---------- 186 | source : geometry 187 | geometry to transform into a hexagonal grid (needs to support __geo_interface__) 188 | resolution : int, optional (default is 6) 189 | resolution of output h3 hexgrid. 190 | See for more information 191 | return_geoms: bool, optional (default is True) 192 | whether to generate hexagon geometries as a geodataframe or simply return 193 | hex ids as a pandas.Series 194 | 195 | Returns 196 | ------- 197 | pandas.Series or geopandas.GeoDataFrame 198 | if `return_geoms` is True, a geopandas.GeoDataFrame whose rows comprise a hexagonal h3 grid (indexed on h3 hex id). 199 | if `return_geoms` is False, a pandas.Series of h3 hexagon ids 200 | """ 201 | try: 202 | import h3 203 | except ImportError as err: 204 | raise ImportError( 205 | "This function requires the `h3` library. " 206 | "You can install it with `conda install h3-py` or " 207 | "`pip install h3`" 208 | ) from err 209 | 210 | if Version(h3.__version__) > Version("4.0"): 211 | polyfill = h3.geo_to_cells 212 | kwargs = {} 213 | else: 214 | polyfill = h3.polyfill 215 | kwargs = dict(geo_json_conformant=True) 216 | 217 | hexids = pandas.Series( 218 | list(polyfill(source.__geo_interface__, resolution, **kwargs)), 219 | name="hex_id", 220 | ) 221 | 222 | if not return_geoms: 223 | return hexids 224 | 225 | if Version(h3.__version__) > Version("4.0"): 226 | polys = hexids.apply( 227 | lambda hex_id: shapely.geometry.shape(h3.cells_to_geo([hex_id])), 228 | ) 229 | else: 230 | polys = hexids.apply( 231 | lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)), 232 | ) 233 | 234 | hexs = geopandas.GeoDataFrame(hexids, geometry=polys.values, crs=4326).set_index( 235 | "hex_id" 236 | ) 237 | 238 | return hexs 239 | --------------------------------------------------------------------------------
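`_to_hex` above bridges the h3 v3 and v4 APIs when converting cell ids to geometries; the same dispatch can be reused if you fetch ids with `h3fy(..., return_geoms=False)` and need polygons later. A minimal sketch that mirrors that logic (assumes `h3` is installed):

```python
import shapely
from packaging.version import Version
from shapely.geometry import Polygon

import h3


def hex_to_polygon(hex_id):
    """Return a shapely Polygon for one h3 cell id, following _to_hex's dispatch."""
    if Version(h3.__version__) > Version("4.0"):
        # h3 v4: cells_to_geo returns a GeoJSON-like mapping
        return shapely.geometry.shape(h3.cells_to_geo([hex_id]))
    # h3 v3: the boundary comes back as a coordinate ring
    return Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True))

# ids produced by h3fy(source, return_geoms=False) can then be mapped back to
# geometries with hexids.apply(hex_to_polygon)
```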