├── .coveragerc ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── release.yml │ └── tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── ci └── envs │ ├── 310-minimal.yaml │ ├── 310-no-optional-deps.yaml │ ├── 311-latest-no-expr.yaml │ ├── 311-latest.yaml │ ├── 312-dev.yaml │ └── 312-latest.yaml ├── dask_geopandas ├── __init__.py ├── _expr.py ├── _version.py ├── backends.py ├── clip.py ├── core.py ├── expr.py ├── geohash.py ├── hilbert_distance.py ├── io │ ├── __init__.py │ ├── arrow.py │ ├── file.py │ └── parquet.py ├── morton_distance.py ├── sjoin.py └── tests │ ├── __init__.py │ ├── conftest.py │ ├── data │ ├── README.md │ ├── naturalearth_cities │ │ ├── naturalearth_cities.VERSION.txt │ │ ├── naturalearth_cities.cpg │ │ ├── naturalearth_cities.dbf │ │ ├── naturalearth_cities.prj │ │ ├── naturalearth_cities.shp │ │ └── naturalearth_cities.shx │ └── naturalearth_lowres │ │ ├── naturalearth_lowres.cpg │ │ ├── naturalearth_lowres.dbf │ │ ├── naturalearth_lowres.prj │ │ ├── naturalearth_lowres.shp │ │ └── naturalearth_lowres.shx │ ├── io │ ├── __init__.py │ ├── conftest.py │ ├── test_arrow.py │ ├── test_backend_integration.py │ ├── test_file.py │ └── test_parquet.py │ ├── test_clip.py │ ├── test_core.py │ ├── test_distributed.py │ ├── test_geohash.py │ ├── test_hilbert_distance.py │ ├── test_morton_distance.py │ ├── test_sjoin.py │ └── test_spatial_partitioning.py ├── doc ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── binary_geo-difference.svg │ ├── binary_geo-intersection.svg │ ├── binary_geo-symm_diff.svg │ ├── binary_geo-union.svg │ ├── binary_op-01.svg │ ├── binary_op-02.svg │ ├── binary_op-03.svg │ └── custom.css │ ├── api.rst │ ├── changelog.rst │ ├── conf.py │ ├── docs │ └── reference │ │ ├── geodataframe.rst │ │ ├── geoseries.rst │ │ ├── io.rst │ │ └── tools.rst │ ├── getting_started.md │ ├── guide.md │ ├── guide │ ├── basic-intro.ipynb │ ├── dissolve.ipynb │ └── spatial-partitioning.ipynb │ ├── index.md │ ├── installation.md │ └── parquet.md ├── pyproject.toml ├── readthedocs.yml ├── requirements-dev.txt ├── setup.cfg ├── setup.py └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | dask_geopandas/tests/*.py 4 | */_version.py 5 | source = 6 | dask_geopandas 7 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | dask_geopandas/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | # Check for updates to GitHub Actions every week 8 | interval: "weekly" 9 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish dask-geopandas to PyPI / GitHub 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # just build the sdist & wheel, skip release 7 | tags: 8 | - "v*" 9 | pull_request: # also build on PRs touching this file 10 | paths: 11 | - ".github/workflows/release.yml" 12 | - 
"MANIFEST.in" 13 | - "pyproject.toml" 14 | - "setup.py" 15 | 16 | jobs: 17 | build: 18 | name: Build dask-geopandas 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | - name: Checkout source 23 | uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 26 | 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: "3.x" 31 | 32 | - name: Build a binary wheel and a source tarball 33 | run: | 34 | python -m pip install --upgrade pip build 35 | python -m build 36 | 37 | - uses: actions/upload-artifact@v4 38 | with: 39 | path: ./dist/* 40 | retention-days: 5 41 | 42 | publish: 43 | name: Publish dask-geopandas to PyPI 44 | needs: [build] 45 | runs-on: ubuntu-latest 46 | # release on every tag 47 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') 48 | 49 | steps: 50 | - uses: actions/download-artifact@v4 51 | with: 52 | name: artifact 53 | path: dist 54 | 55 | - name: Publish distribution to PyPI 56 | uses: pypa/gh-action-pypi-publish@release/v1 57 | with: 58 | user: __token__ 59 | password: ${{ secrets.PYPI_API_TOKEN }} 60 | 61 | - name: Create GitHub Release 62 | id: create_release 63 | uses: actions/create-release@v1 64 | env: 65 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 66 | with: 67 | tag_name: ${{ github.ref }} 68 | release_name: ${{ github.ref }} 69 | draft: false 70 | prerelease: false 71 | 72 | - name: Get Asset name 73 | run: | 74 | export PKG=$(ls dist/ | grep tar) 75 | set -- $PKG 76 | echo "name=$1" >> $GITHUB_ENV 77 | 78 | - name: Upload Release Asset (sdist) to GitHub 79 | id: upload-release-asset 80 | uses: actions/upload-release-asset@v1 81 | env: 82 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 83 | with: 84 | upload_url: ${{ steps.create_release.outputs.upload_url }} 85 | asset_path: dist/${{ env.name }} 86 | asset_name: ${{ env.name }} 87 | asset_content_type: application/zip 88 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: "0 0 * * *" 10 | 11 | jobs: 12 | Linting: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: actions/setup-python@v5 18 | - uses: pre-commit/action@v3.0.1 19 | 20 | Test: 21 | needs: Linting 22 | name: ${{ matrix.os }}, ${{ matrix.env }} 23 | runs-on: ${{ matrix.os }} 24 | defaults: 25 | run: 26 | shell: bash -l {0} 27 | continue-on-error: true 28 | strategy: 29 | matrix: 30 | os: [ubuntu-latest] 31 | env: 32 | - ci/envs/310-minimal.yaml 33 | - ci/envs/310-no-optional-deps.yaml 34 | - ci/envs/311-latest.yaml 35 | - ci/envs/312-latest.yaml 36 | 37 | include: 38 | - env: ci/envs/311-latest.yaml 39 | os: macos-latest 40 | - env: ci/envs/311-latest.yaml 41 | os: windows-latest 42 | - env: ci/envs/312-dev.yaml 43 | os: ubuntu-latest 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | 48 | - name: Setup Conda 49 | uses: conda-incubator/setup-miniconda@v3 50 | with: 51 | environment-file: ${{ matrix.env }} 52 | miniforge-version: latest 53 | miniforge-variant: Miniforge3 54 | use-mamba: true 55 | 56 | - name: Check and Log Environment 57 | run: | 58 | python -V 59 | python -c "import geopandas; geopandas.show_versions();" 60 | conda info 61 | conda list 62 | 63 | - name: Test 64 | run: | 65 | pytest -v -r a 
--color=yes --cov=dask_geopandas --cov-append --cov-report term-missing --cov-report xml . 66 | 67 | - uses: codecov/codecov-action@v5 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .hypothesis 2 | *.py[cod] 3 | __pycache__/ 4 | *.egg-info 5 | dask-worker-space/ 6 | docs/build 7 | build/ 8 | dist/ 9 | .idea/ 10 | log.* 11 | log 12 | .pytest_cache/ 13 | .coverage 14 | .DS_Store 15 | *.swp 16 | *.swo 17 | .cache/ 18 | .ipynb_checkpoints 19 | .vscode/ 20 | 21 | coverage.xml 22 | 23 | doc/source/docs/reference/api -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: false 3 | autoupdate_schedule: quarterly 4 | 5 | files: 'dask_geopandas\/' 6 | repos: 7 | - repo: https://github.com/psf/black 8 | rev: 24.2.0 9 | hooks: 10 | - id: black 11 | language_version: python3 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | rev: "v0.4.4" 14 | hooks: 15 | - id: ruff 16 | name: sort imports with ruff 17 | args: [--select, I, --fix] 18 | - id: ruff 19 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 0.5.0 (upcoming) 5 | ------------------------ 6 | 7 | Deprecations and compatibility notes: 8 | 9 | - The deprecated `geom_almost_equals` method has been removed. Use `geom_equals_exact` instead. 10 | 11 | 12 | Version 0.4.3 (January, 2025) 13 | ----------------------------- 14 | 15 | Packaging: 16 | 17 | - `dask>=2025.1.0` is now required. 18 | - `python>=3.10` is now required. 19 | 20 | Bug fixes: 21 | 22 | - Fixed `GeoDataFrame.drop` returning a `GeoDataFrame` 23 | instead of a `DataFrame`, when dropping the geometry 24 | column (#321). 25 | 26 | Version 0.4.2 (September 24, 2024) 27 | ---------------------------------- 28 | 29 | Bug fixes: 30 | 31 | - Ensure `read_file()` produces a correct empty meta object, avoiding later 32 | errors in `spatial_shuffle()` (#302). 33 | - Fix in `sjoin()` to work with GeoDataFrames after a `spatial_shuffle()` (#303). 34 | 35 | Packaging: 36 | 37 | - `distributed` was dropped as a required dependency, only depending on 38 | `dask[dataframe]` (#258). 39 | 40 | 41 | Version 0.4.1 (June 25, 2024) 42 | ----------------------------- 43 | 44 | Bug fixes: 45 | 46 | - Allow to run dask-geopandas with recent dask versions without using query 47 | planning (without dask-expr being installed). 48 | 49 | Packaging: 50 | 51 | - The `dask` dependency was updated to `dask[dataframe]` in pyproject.toml (when 52 | installing from source or binary wheels from PyPI). This ensures dask-expr 53 | gets installed automatically for recent versions of dask. 54 | 55 | Version 0.4.0 (June 24, 2024) 56 | ----------------------------- 57 | 58 | Enhancements: 59 | 60 | - Added preliminary support for dask's new query planning (dask >= 2024.3.0) (#285). 61 | - Added support for using dask-geopandas with distributed's P2P shuffle (this 62 | requires the latest distributed>=2024.6.0 to work) (#295). 63 | - Added new `from_wkb()` and `from_wkt()` functions to convert a dask Series of 64 | WKB or WKT values into a dask-geopandas GeoSeries (#293). 
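  For example (illustrative; the ``crs`` keyword is assumed to mirror the geopandas equivalent):

  ```python
  import dask.dataframe as dd
  import pandas as pd
  import dask_geopandas

  # a dask Series of WKT strings (hypothetical data)
  wkt = dd.from_pandas(pd.Series(["POINT (0 0)", "POINT (1 1)"]), npartitions=2)
  geoms = dask_geopandas.from_wkt(wkt, crs="EPSG:4326")  # dask-geopandas GeoSeries
  ```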
65 | 66 | Notes on dependencies: 67 | 68 | - Removed support for PyGEOS, now requiring Shapely >= 2 (#280). 69 | - Updated minimum supported versions of dependencies, now requiring Python 3.9, 70 | GeoPandas 0.12, numpy 1.23 and dask/distributed 2022.06.0. 71 | 72 | Version 0.3.1 (April 28, 2023) 73 | ------------------------------ 74 | 75 | Bug fixes: 76 | 77 | - Compatibility with dask >= 2023.4 and changes regarding ``use_nullable_dtypes`` 78 | keyword (#242). 79 | - Ensure ``spatial_partitions`` are preserved when serialized deserialized 80 | with pickle (#237). 81 | 82 | Version 0.3.0 (January 23, 2023) 83 | -------------------------------- 84 | 85 | Enhancements: 86 | 87 | - Dask-GeoPandas is now compatible with Shapely 2.0 (and if this version is 88 | installed, no longer requires PyGEOS) 89 | 90 | Bug fixes: 91 | 92 | - Compatibility with dask >= 2022.12 for ``read_parquet()`` (#230) and for 93 | ``dissolve()`` (#229) 94 | - Fix the ``spatial_partitions`` of the result of ``sjoin()`` (#216) 95 | 96 | Version 0.2.0 (July 1, 2022) 97 | ---------------------------- 98 | 99 | Enhancements: 100 | 101 | - Optionally skip spatial bounds in ``read_parquet`` (#203) 102 | 103 | Bug fixes: 104 | 105 | - Don't put ``GeoSeries`` in ``map_partitions`` kwarg (#205) 106 | 107 | Version 0.1.3 (June 21, 2021) 108 | ----------------------------- 109 | 110 | Compatibility: 111 | 112 | - MAINT: use ``predicate`` instead of ``op`` in ``sjoin`` (#204) 113 | 114 | Version 0.1.2 (June 20, 2021) 115 | ----------------------------- 116 | 117 | Bug fixes: 118 | 119 | - Update ``to_parquet`` to handle custom schema (to fix writing partitions with all missing data) (#201) 120 | 121 | Version 0.1.1 (June 19, 2021) 122 | ----------------------------- 123 | 124 | Bug fixes: 125 | 126 | - Compat with dask 2022.06.0: fix schema inference in ``to_parquet`` (#199) 127 | - Remove custom ``__dask_postcompute__`` (#191) 128 | - BUG: persist ``spatial_partitions`` information in ``persist()`` (#192) 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Dask-geopandas is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | Also for general information on how to contribute to GeoPandas projects see https://geopandas.org/en/latest/community/contributing.html. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, Dask Developers 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask_geopandas *.py 2 | recursive-include dask_geopandas *.yaml 3 | 4 | include versioneer.py 5 | include setup.py 6 | include README.rst 7 | include LICENSE 8 | include dask_geopandas/_version.py 9 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | dask-geopandas |conda| |pypi| |docs| |gitter| 2 | ============================================= 3 | 4 | Parallel GeoPandas with Dask 5 | 6 | Dask-GeoPandas is a project merging the geospatial capabilities of GeoPandas 7 | and scalability of Dask. GeoPandas is an open source project designed to make working with geospatial data in Python easier. GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. 8 | Dask provides advanced parallelism and distributed out-of-core computation with a dask.dataframe module designed to scale 9 | pandas. Since GeoPandas is an extension to the pandas DataFrame, the same way Dask scales pandas can also be applied to GeoPandas. 10 | 11 | This project is a bridge between Dask and GeoPandas and offers geospatial capabilities of GeoPandas backed by Dask. 12 | 13 | Documentation 14 | ------------- 15 | 16 | See the documentation on https://dask-geopandas.readthedocs.io/en/latest/ 17 | 18 | Installation 19 | ------------ 20 | 21 | This package depends on Shapely, GeoPandas and Dask. 22 | 23 | One way to install all required dependencies is to use the ``conda`` package manager to 24 | create a new environment: 25 | 26 | :: 27 | 28 | conda create -n geo_env 29 | conda activate geo_env 30 | conda config --env --add channels conda-forge 31 | conda config --env --set channel_priority strict 32 | conda install dask-geopandas 33 | 34 | 35 | 36 | Example 37 | ------- 38 | 39 | Given a GeoPandas dataframe 40 | 41 | .. code-block:: python 42 | 43 | import geopandas 44 | df = geopandas.read_file('...') 45 | 46 | We can repartition it into a Dask-GeoPandas dataframe: 47 | 48 | .. code-block:: python 49 | 50 | import dask_geopandas 51 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 52 | 53 | The familiar spatial attributes and methods of GeoPandas are also available 54 | and will be computed in parallel: 55 | 56 | .. code-block:: python 57 | 58 | ddf.geometry.area.compute() 59 | ddf.within(polygon) 60 | 61 | 62 | .. |pypi| image:: https://img.shields.io/pypi/v/dask-geopandas.svg 63 | :target: https://pypi.python.org/pypi/dask-geopandas/ 64 | 65 | .. 
|conda| image:: https://img.shields.io/conda/vn/conda-forge/dask-geopandas.svg 66 | :target: https://anaconda.org/conda-forge/dask-geopandas 67 | :alt: Conda Version 68 | 69 | .. |docs| image:: https://readthedocs.org/projects/dask-geopandas/badge/?version=latest 70 | :target: https://dask-geopandas.readthedocs.io/en/latest/?badge=latest 71 | :alt: Documentation Status 72 | 73 | .. |gitter| image:: https://badges.gitter.im/geopandas/geopandas.svg 74 | :target: https://gitter.im/geopandas/geopandas 75 | :alt: Gitter 76 | -------------------------------------------------------------------------------- /ci/envs/310-minimal.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.10 7 | - numpy=1.24 8 | - dask=2025.1.0 9 | - distributed=2025.1.0 10 | - geopandas=0.14.3 11 | - pandas=2.0.0 12 | - shapely=2.0 13 | - pyproj=3.4 14 | - packaging 15 | # test dependencies 16 | - pytest 17 | - pytest-cov 18 | - hilbertcurve 19 | - pygeohash 20 | # optional dependencies 21 | - pyarrow 22 | - pip 23 | - pip: 24 | - pymorton 25 | -------------------------------------------------------------------------------- /ci/envs/310-no-optional-deps.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.10 7 | - dask 8 | - geopandas 9 | - pyproj 10 | - packaging 11 | # test dependencies 12 | - pytest 13 | - pytest-cov 14 | -------------------------------------------------------------------------------- /ci/envs/311-latest-no-expr.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.11 7 | - dask-core 8 | - geopandas 9 | - pyproj=3.4 10 | - packaging 11 | # test dependencies 12 | - pytest 13 | - pytest-cov 14 | - hilbertcurve 15 | - s3fs 16 | - moto<5 # <5 pin because of https://github.com/dask/dask/issues/10869 17 | - flask # needed for moto server 18 | # optional dependencies 19 | - pyarrow 20 | - pyogrio>=0.4 21 | - pygeohash 22 | - pip 23 | - pip: 24 | - pymorton 25 | -------------------------------------------------------------------------------- /ci/envs/311-latest.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.11 7 | - dask 8 | - distributed 9 | - geopandas 10 | - pyproj=3.4 11 | - packaging 12 | # test dependencies 13 | - pytest 14 | - pytest-cov 15 | - hilbertcurve 16 | - s3fs 17 | - moto<5 # <5 pin because of https://github.com/dask/dask/issues/10869 18 | - flask # needed for moto server 19 | # optional dependencies 20 | - pyarrow 21 | - pyogrio>=0.4 22 | - pygeohash 23 | - pip 24 | - pip: 25 | - pymorton 26 | -------------------------------------------------------------------------------- /ci/envs/312-dev.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.12 7 | - distributed 8 | - pandas 9 | - geos # for shapely main 10 | - fiona 11 | - pyproj 12 | - fsspec 13 | - packaging 14 | # test dependencies 15 | - pytest 16 | - pytest-cov 17 | - hilbertcurve 18 | - s3fs 19 | - moto<5 # <5 pin because of 
https://github.com/dask/dask/issues/10869 20 | - flask # needed for moto server 21 | # optional dependencies 22 | - pyarrow 23 | - pyogrio 24 | - pygeohash 25 | - pip 26 | - pip: 27 | - pymorton 28 | - git+https://github.com/shapely/shapely.git@main 29 | - git+https://github.com/geopandas/geopandas.git@main 30 | - git+https://github.com/dask/dask.git@main 31 | -------------------------------------------------------------------------------- /ci/envs/312-latest.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - python=3.12 7 | - dask 8 | - distributed 9 | - geopandas 10 | - shapely >= 2 11 | - pyproj 12 | - packaging 13 | # test dependencies 14 | - pytest 15 | - pytest-cov 16 | - hilbertcurve 17 | - s3fs 18 | - moto<5 # <5 pin because of https://github.com/dask/dask/issues/10869 19 | - flask # needed for moto server 20 | # optional dependencies 21 | - pyarrow 22 | - pyogrio 23 | - pygeohash 24 | - pip 25 | - pip: 26 | - pymorton 27 | -------------------------------------------------------------------------------- /dask_geopandas/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | 3 | from .expr import ( 4 | points_from_xy, 5 | from_wkt, 6 | from_wkb, 7 | GeoDataFrame, 8 | GeoSeries, 9 | from_geopandas, 10 | from_dask_dataframe, 11 | ) 12 | from .io.file import read_file 13 | from .io.parquet import read_parquet, to_parquet 14 | from .io.arrow import read_feather, to_feather 15 | from .clip import clip 16 | from .sjoin import sjoin 17 | from . import backends as _ # needed to register dispatch functions with dask 18 | 19 | 20 | __version__ = get_versions()["version"] 21 | del get_versions 22 | 23 | __all__ = [ 24 | "GeoDataFrame", 25 | "GeoSeries", 26 | "clip", 27 | "from_dask_dataframe", 28 | "from_geopandas", 29 | "from_wkb", 30 | "from_wkt", 31 | "points_from_xy", 32 | "read_feather", 33 | "read_file", 34 | "read_parquet", 35 | "sjoin", 36 | "to_feather", 37 | "to_parquet", 38 | ] 39 | 40 | from . 
import _version 41 | 42 | __version__ = _version.get_versions()["version"] 43 | -------------------------------------------------------------------------------- /dask_geopandas/_expr.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import dask.dataframe.dask_expr as dx 4 | 5 | import geopandas 6 | 7 | 8 | def _drop(df: geopandas.GeoDataFrame, columns, errors): 9 | return df.drop(columns=columns, errors=errors) 10 | 11 | 12 | def _validate_axis(axis=0, none_is_zero: bool = True) -> None | Literal[0, 1]: 13 | if axis not in (0, 1, "index", "columns", None): 14 | raise ValueError(f"No axis named {axis}") 15 | # convert to numeric axis 16 | numeric_axis: dict[str | None, Literal[0, 1]] = {"index": 0, "columns": 1} 17 | if none_is_zero: 18 | numeric_axis[None] = 0 19 | 20 | return numeric_axis.get(axis, axis) 21 | 22 | 23 | class Drop(dx.expr.Drop): 24 | operation = staticmethod(_drop) 25 | -------------------------------------------------------------------------------- /dask_geopandas/backends.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from packaging.version import Version 3 | 4 | import pandas as pd 5 | 6 | import dask 7 | from dask.base import normalize_token 8 | from dask.dataframe.backends import _nonempty_index, meta_nonempty_dataframe 9 | from dask.dataframe.core import get_parallel_type 10 | from dask.dataframe.dispatch import make_meta_dispatch, pyarrow_schema_dispatch 11 | from dask.dataframe.extensions import make_array_nonempty, make_scalar 12 | from dask.dataframe.utils import meta_nonempty 13 | 14 | import geopandas 15 | import shapely.geometry 16 | from geopandas.array import GeometryArray, GeometryDtype, from_shapely 17 | from shapely.geometry.base import BaseGeometry 18 | 19 | from .expr import GeoDataFrame, GeoSeries 20 | 21 | get_parallel_type.register(geopandas.GeoDataFrame, lambda _: GeoDataFrame) 22 | get_parallel_type.register(geopandas.GeoSeries, lambda _: GeoSeries) 23 | 24 | 25 | @make_meta_dispatch.register(BaseGeometry) 26 | def make_meta_shapely_geometry(x, index=None): 27 | return x 28 | 29 | 30 | @make_array_nonempty.register(GeometryDtype) 31 | def _(dtype): 32 | return from_shapely( 33 | [shapely.geometry.LineString([(i, i), (i, i + 1)]) for i in range(2)] 34 | ) 35 | 36 | 37 | @make_scalar.register(GeometryDtype.type) 38 | def _(x): 39 | return shapely.geometry.Point(0, 0) 40 | 41 | 42 | @meta_nonempty.register(geopandas.GeoSeries) 43 | def _nonempty_geoseries(x, idx=None): 44 | if idx is None: 45 | idx = _nonempty_index(x.index) 46 | data = make_array_nonempty(x.dtype) 47 | return geopandas.GeoSeries(data, name=x.name, crs=x.crs) 48 | 49 | 50 | @meta_nonempty.register(geopandas.GeoDataFrame) 51 | def _nonempty_geodataframe(x): 52 | df = meta_nonempty_dataframe(x) 53 | return geopandas.GeoDataFrame(df, geometry=x._geometry_column_name, crs=x.crs) 54 | 55 | 56 | @make_meta_dispatch.register((geopandas.GeoSeries, geopandas.GeoDataFrame)) 57 | def make_meta_geodataframe(df, index=None): 58 | return df.head(0) 59 | 60 | 61 | @normalize_token.register(GeometryArray) 62 | def tokenize_geometryarray(x): 63 | # TODO if we can find an efficient hashing function (eg hashing integer 64 | # pointers on the C level?), we could replace this random uuid 65 | return uuid.uuid4().hex 66 | 67 | 68 | @pyarrow_schema_dispatch.register((geopandas.GeoDataFrame,)) 69 | def get_pyarrow_schema_geopandas(obj): 70 | import pyarrow as pa 71 | 72 | df = 
pd.DataFrame(obj.copy()) 73 | for col in obj.columns[obj.dtypes == "geometry"]: 74 | df[col] = obj[col].to_wkb() 75 | return pa.Schema.from_pandas(df) 76 | 77 | 78 | if Version(dask.__version__) >= Version("2023.6.1"): 79 | from dask.dataframe.dispatch import ( 80 | from_pyarrow_table_dispatch, 81 | to_pyarrow_table_dispatch, 82 | ) 83 | 84 | @to_pyarrow_table_dispatch.register((geopandas.GeoDataFrame,)) 85 | def get_pyarrow_table_from_geopandas(obj, **kwargs): 86 | # `kwargs` must be supported by `pyarrow.Table.from_pandas` 87 | import pyarrow as pa 88 | 89 | if Version(geopandas.__version__).major < 1: 90 | return pa.Table.from_pandas(obj.to_wkb(), **kwargs) 91 | else: 92 | # TODO handle kwargs? 93 | return pa.table(obj.to_arrow()) 94 | 95 | @from_pyarrow_table_dispatch.register((geopandas.GeoDataFrame,)) 96 | def get_geopandas_geodataframe_from_pyarrow(meta, table, **kwargs): 97 | # `kwargs` must be supported by `pyarrow.Table.to_pandas` 98 | if Version(geopandas.__version__).major < 1: 99 | df = table.to_pandas(**kwargs) 100 | 101 | for col in meta.columns[meta.dtypes == "geometry"]: 102 | df[col] = geopandas.GeoSeries.from_wkb(df[col], crs=meta[col].crs) 103 | 104 | return df 105 | 106 | else: 107 | # TODO handle kwargs? 108 | return geopandas.GeoDataFrame.from_arrow(table) 109 | -------------------------------------------------------------------------------- /dask_geopandas/clip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dask.base import tokenize 4 | from dask.dataframe import from_graph 5 | from dask.highlevelgraph import HighLevelGraph 6 | from dask.utils import derived_from 7 | 8 | import geopandas 9 | 10 | 11 | @derived_from(geopandas.tools) 12 | def clip(gdf, mask, keep_geom_type=False): 13 | from dask_geopandas import GeoDataFrame, GeoSeries 14 | 15 | if isinstance(mask, (GeoDataFrame, GeoSeries)): 16 | raise NotImplementedError("Mask cannot be a Dask GeoDataFrame or GeoSeries.") 17 | 18 | if gdf.spatial_partitions is None: 19 | return gdf.map_partitions( 20 | lambda partition: geopandas.clip( 21 | gdf=partition, mask=mask, keep_geom_type=keep_geom_type 22 | ), 23 | token="clip", 24 | meta=gdf._meta, 25 | ) 26 | 27 | new_spatial_partitions = geopandas.clip( 28 | gdf=gdf.spatial_partitions, 29 | mask=mask, 30 | # keep_geom_type is always false for clipping the spatial partitions 31 | # otherwise we'd be falsely creating new partition(s) 32 | keep_geom_type=False, 33 | ) 34 | intersecting_partitions = np.asarray(new_spatial_partitions.index) 35 | 36 | name = f"clip-{tokenize(gdf, mask, keep_geom_type)}" 37 | dsk = { 38 | (name, i): (geopandas.clip, (gdf._name, part), mask, keep_geom_type) 39 | for i, part in enumerate(intersecting_partitions) 40 | } 41 | divisions = [None] * (len(dsk) + 1) 42 | graph = HighLevelGraph.from_collections(name, dsk, dependencies=[gdf]) 43 | 44 | result = from_graph(graph, gdf._meta, tuple(divisions), dsk.keys(), "clip") 45 | 46 | result.spatial_partitions = new_spatial_partitions 47 | return result 48 | -------------------------------------------------------------------------------- /dask_geopandas/core.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from .expr import * # noqa: F403 4 | 5 | warnings.warn( 6 | "dask_geopandas.core is deprecated and will be removed in a future version.", 7 | category=FutureWarning, 8 | stacklevel=1, 9 | ) 10 | 
-------------------------------------------------------------------------------- /dask_geopandas/geohash.py: -------------------------------------------------------------------------------- 1 | """ 2 | Geohash implementation 3 | 4 | The code is originally based on the neathgeohash package, 5 | Copyright (c) 2020 Marek Dwulit, MIT License 6 | (https://pypi.org/project/neathgeohash/#description). 7 | The vectorized implementation for quantization and bit interleaving is in turn based on, 8 | "Geohash in Golang Assembly" blog (https://mmcloughlin.com/posts/geohash-assembly). 9 | 10 | """ 11 | 12 | import warnings 13 | 14 | import numpy as np 15 | import pandas as pd 16 | 17 | 18 | def _geohash(gdf, as_string, precision): 19 | """ 20 | Calculate geohash based on the middle points of the geometry bounds 21 | for a given precision 22 | 23 | Parameters 24 | ---------- 25 | gdf : GeoDataFrame 26 | as_string : bool 27 | to return string or int Geohash 28 | precision : int 29 | precision of the string Geohash 30 | 31 | 32 | Returns 33 | --------- 34 | type : pandas.Series 35 | Series containing geohash 36 | """ 37 | with warnings.catch_warnings(): 38 | warnings.filterwarnings( 39 | "ignore", "GeoSeries.isna() previously returned True", UserWarning 40 | ) 41 | if gdf.is_empty.any() | gdf.geometry.isna().any(): 42 | raise ValueError( 43 | "Geohash cannot be computed on a GeoSeries with empty or " 44 | "missing geometries.", 45 | ) 46 | 47 | # Calculate bounds 48 | bounds = gdf.bounds.to_numpy() 49 | # Calculate mid points based on bounds 50 | x_mids, y_mids = _calculate_mid_points(bounds) 51 | # Create pairs of x and y midpoints 52 | coords = np.array([y_mids, x_mids]).T 53 | # Encode coords with Geohash 54 | geohash = encode_geohash(coords, as_string, precision) 55 | 56 | return pd.Series(geohash, index=gdf.index, name="geohash") 57 | 58 | 59 | def _calculate_mid_points(bounds): 60 | """ 61 | Calculate middle points based on the geometry bounds 62 | 63 | Parameters 64 | ---------- 65 | bounds : array_like 66 | array containing xmin, ymin, xmax, ymax 67 | 68 | Returns 69 | --------- 70 | x_mids : mid points of x values 71 | y_mids : mid points of y values 72 | """ 73 | 74 | # Calculate mid points for x and y bound coords 75 | x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0 76 | y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0 77 | 78 | return x_mids, y_mids 79 | 80 | 81 | def encode_geohash(coords, as_string, precision): 82 | """ 83 | Calculate geohash based on coordinates for a 84 | given precision 85 | 86 | Parameters 87 | ---------- 88 | coords : array_like of shape (n, 2) 89 | array of [x, y] pairs 90 | as_string : bool 91 | to return string or int Geohash 92 | precision : int 93 | precision of the string Geohash 94 | Returns 95 | --------- 96 | geohash: array containing either int or string 97 | geohashes for each mid point 98 | """ 99 | 100 | quantized_coords = _quantize_points(coords) 101 | int_geohash = _encode_into_uint64(quantized_coords) 102 | 103 | if not as_string: 104 | return int_geohash 105 | 106 | gs_uint8_mat = _encode_base32(int_geohash) 107 | str_geohash = _encode_unicode(gs_uint8_mat, precision) 108 | 109 | return str_geohash 110 | 111 | 112 | def _quantize_points(coords): 113 | """ 114 | Quantize coordinates by mapping onto 115 | unit intervals [0, 1] and multiplying by 2^32. 
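    For example (illustrative): a point at latitude 0, longitude 0 is first
    shifted to (90, 180) and then scaled, giving (2**31, 2**31) before the
    floor is taken.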
116 | 117 | Parameters 118 | ---------- 119 | coords : array_like of shape (n, 2) 120 | array of [x, y] pairs 121 | coordinate pairs 122 | 123 | Returns 124 | --------- 125 | array_like of shape (n, 2) 126 | """ 127 | 128 | _q = np.array([(2.0**32 / 180, 0), (0, 2.0**32 / (180 * 2))], dtype="float64") 129 | 130 | quantized_coords = coords + np.array([90, 180]) 131 | quantized_coords = np.dot(quantized_coords, _q) 132 | quantized_coords = np.floor(quantized_coords) 133 | 134 | return quantized_coords 135 | 136 | 137 | def _encode_into_uint64(quantized_coords): 138 | """ 139 | 140 | Encode quantized coordinates into uint64 141 | using both spreading and interleaving bits 142 | 143 | Implementation based on "Geohash in Golang Assembly" 144 | blog (https://mmcloughlin.com/posts/geohash-assembly) 145 | 146 | Parameters 147 | ---------- 148 | quantized_coords : array_like of shape (n, 2) 149 | array of quantized coordinate pairs 150 | 151 | Returns 152 | --------- 153 | array_like of shape (n, 2) 154 | coordinate pairs encoded to uint64 values 155 | quantized coordinate pairs 156 | """ 157 | 158 | # spread out 32 bits of x into 64 bits, where the bits occupy even bit positions. 159 | x = quantized_coords.astype(np.uint64) 160 | x = x.reshape(-1, 2) 161 | x = (x | (x << 16)) & 0x0000FFFF0000FFFF 162 | x = (x | (x << 8)) & 0x00FF00FF00FF00FF 163 | x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F 164 | x = (x | (x << 2)) & 0x3333333333333333 165 | x = (x | (x << 1)) & 0x5555555555555555 166 | 167 | # Dot 168 | __s1 = np.array([(1, 0), (0, 2)], dtype=np.uint64) 169 | x = x @ __s1 170 | # Interleave x and y bits so that x and y occupy even and odd bit levels 171 | x = x[:, 0] | x[:, 1] 172 | x = x >> 4 173 | 174 | return x 175 | 176 | 177 | def _encode_base32(encoded_uint64): 178 | """ 179 | Encode quantized coordinates into base32 pairs. 180 | Encoding starts at the highest bit, consuming 5 bits for each character precision. 181 | This means encoding happens 12 times for the 12 character precision or 60 bits. 
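    For example (illustrative): a 5-bit group equal to 10 maps to the
    character "b" and a group equal to 31 maps to "z" in the geohash base32
    alphabet "0123456789bcdefghjkmnpqrstuvwxyz" (the mapping itself is
    applied by the ``replacement`` table in ``_encode_unicode``).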
182 | 183 | Implementation is based on "Geohash in Golang Assembly" 184 | blog (https://mmcloughlin.com/posts/geohash-assembly) 185 | 186 | Parameters 187 | ---------- 188 | g_uint64 : array_like 189 | coordinate pairs encoded to uint64 values 190 | 191 | Returns 192 | --------- 193 | array_like of shape (n, 12) 194 | with base 32 values as 8-bit unasigned integer 195 | """ 196 | # Define 32 bit mask 197 | mask = np.uint64(0x1F).flatten() # equivalent to 32-1 198 | # Return array for each character 199 | c11 = (encoded_uint64 >> 0) & mask 200 | c10 = (encoded_uint64 >> 5) & mask 201 | c9 = (encoded_uint64 >> 10) & mask 202 | c8 = (encoded_uint64 >> 15) & mask 203 | c7 = (encoded_uint64 >> 20) & mask 204 | c6 = (encoded_uint64 >> 25) & mask 205 | c5 = (encoded_uint64 >> 30) & mask 206 | c4 = (encoded_uint64 >> 35) & mask 207 | c3 = (encoded_uint64 >> 40) & mask 208 | c2 = (encoded_uint64 >> 45) & mask 209 | c1 = (encoded_uint64 >> 50) & mask 210 | c0 = (encoded_uint64 >> 55) & mask 211 | 212 | # Stack each array vertically 213 | return np.column_stack((c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11)).astype( 214 | "uint8" 215 | ) 216 | 217 | 218 | def _encode_unicode(encoded_base32, precision): 219 | """ 220 | Encode base32 pairs into geohash bytes with an option to return 221 | the geohash in unicode format 222 | 223 | Parameters 224 | ---------- 225 | encoded_base32 : array_like 226 | coordinate pairs 227 | p : int 228 | precision of the Geohash 229 | 230 | Returns 231 | --------- 232 | array_like of shape (n, precision) 233 | containing geohash for a given precision 234 | """ 235 | 236 | # Define replacement values 237 | replacement = np.array( 238 | [ 239 | 48, 240 | 49, 241 | 50, 242 | 51, 243 | 52, 244 | 53, 245 | 54, 246 | 55, 247 | 56, 248 | 57, 249 | 98, 250 | 99, 251 | 100, 252 | 101, 253 | 102, 254 | 103, 255 | 104, 256 | 106, 257 | 107, 258 | 109, 259 | 110, 260 | 112, 261 | 113, 262 | 114, 263 | 115, 264 | 116, 265 | 117, 266 | 118, 267 | 119, 268 | 120, 269 | 121, 270 | 122, 271 | ], 272 | dtype="uint8", 273 | ) 274 | 275 | encoded_base32 = replacement[encoded_base32] 276 | 277 | encoded_base32 = encoded_base32.view(np.dtype("|S12")) 278 | return encoded_base32.flatten().astype(f"U{precision}") 279 | -------------------------------------------------------------------------------- /dask_geopandas/hilbert_distance.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def _hilbert_distance(gdf, total_bounds=None, level=16): 8 | """ 9 | Calculate the distance along a Hilbert curve. 10 | 11 | The distances are calculated for the midpoints of the geometries in the 12 | GeoDataFrame. 13 | 14 | Parameters 15 | ---------- 16 | gdf : GeoDataFrame 17 | total_bounds : 4-element array 18 | Total bounds of geometries - array 19 | level : int (1 - 16), default 16 20 | Determines the precision of the curve (points on the curve will 21 | have coordinates in the range [0, 2^level - 1]). 
22 | 23 | Returns 24 | --------- 25 | Pandas Series containing distances along the Hilbert curve 26 | 27 | """ 28 | with warnings.catch_warnings(): 29 | warnings.filterwarnings( 30 | "ignore", "GeoSeries.isna() previously returned True", UserWarning 31 | ) 32 | if gdf.is_empty.any() | gdf.geometry.isna().any(): 33 | raise ValueError( 34 | "Hilbert distance cannot be computed on a GeoSeries with empty or " 35 | "missing geometries.", 36 | ) 37 | # Calculate bounds as numpy array 38 | bounds = gdf.bounds.to_numpy() 39 | 40 | # Calculate discrete coords based on total bounds and bounds 41 | x, y = _continuous_to_discrete_coords(bounds, level, total_bounds) 42 | # Compute distance along hilbert curve 43 | distances = _encode(level, x, y) 44 | 45 | return pd.Series(distances, index=gdf.index, name="hilbert_distance") 46 | 47 | 48 | def _continuous_to_discrete_coords(bounds, level, total_bounds): 49 | """ 50 | Calculates mid points & ranges of geoms and returns 51 | as discrete coords 52 | 53 | Parameters 54 | ---------- 55 | 56 | bounds : Bounds of each geometry - array 57 | 58 | p : The number of iterations used in constructing the Hilbert curve 59 | 60 | total_bounds : Total bounds of geometries - array 61 | 62 | Returns 63 | --------- 64 | Discrete two-dimensional numpy array 65 | Two-dimensional array Array of hilbert distances for each geom 66 | 67 | """ 68 | # Hilbert Side length 69 | side_length = (2**level) - 1 70 | 71 | # Calculate mid points for x and y bound coords - returns array 72 | x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0 73 | y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0 74 | 75 | # Calculate x and y range of total bound coords - returns array 76 | if total_bounds is None: 77 | total_bounds = np.array( 78 | (np.nanmin(x_mids), np.nanmin(y_mids), np.nanmax(x_mids), np.nanmax(y_mids)) 79 | ) 80 | 81 | xmin, ymin, xmax, ymax = total_bounds 82 | 83 | # Transform continuous value to discrete integer for each dimension 84 | x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length) 85 | y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length) 86 | 87 | return x_int, y_int 88 | 89 | 90 | def _continuous_to_discrete(vals, val_range, n): 91 | """ 92 | Convert a continuous one-dimensional array to discrete integer values 93 | based their ranges 94 | 95 | Parameters 96 | ---------- 97 | vals : Array of continuous values 98 | 99 | val_range : Tuple containing range of continuous values 100 | 101 | n : Number of discrete values 102 | 103 | Returns 104 | --------- 105 | One-dimensional array of discrete ints 106 | 107 | """ 108 | width = val_range[1] - val_range[0] 109 | res = (vals - val_range[0]) * (n / width) 110 | 111 | np.clip(res, 0, n, out=res) 112 | return res.astype(np.uint32) 113 | 114 | 115 | # Fast Hilbert curve algorithm by http://threadlocalmutex.com/ 116 | # From C++ https://github.com/rawrunprotected/hilbert_curves 117 | # (public domain) 118 | 119 | 120 | MAX_LEVEL = 16 121 | 122 | 123 | def _interleave(x): 124 | x = (x | (x << 8)) & 0x00FF00FF 125 | x = (x | (x << 4)) & 0x0F0F0F0F 126 | x = (x | (x << 2)) & 0x33333333 127 | x = (x | (x << 1)) & 0x55555555 128 | return x 129 | 130 | 131 | def _encode(level, x, y): 132 | 133 | x = np.asarray(x, dtype="uint32") 134 | y = np.asarray(y, dtype="uint32") 135 | 136 | if level > MAX_LEVEL: 137 | raise ValueError("Level out of range") 138 | 139 | x = x << (16 - level) 140 | y = y << (16 - level) 141 | 142 | # Initial prefix scan round, prime with x and y 143 | a = x ^ y 144 | b = 0xFFFF ^ a 145 | c = 0xFFFF ^ (x | 
y) 146 | d = x & (y ^ 0xFFFF) 147 | 148 | A = a | (b >> 1) 149 | B = (a >> 1) ^ a 150 | C = ((c >> 1) ^ (b & (d >> 1))) ^ c 151 | D = ((a & (c >> 1)) ^ (d >> 1)) ^ d 152 | 153 | a = A.copy() 154 | b = B.copy() 155 | c = C.copy() 156 | d = D.copy() 157 | 158 | A = (a & (a >> 2)) ^ (b & (b >> 2)) 159 | B = (a & (b >> 2)) ^ (b & ((a ^ b) >> 2)) 160 | C ^= (a & (c >> 2)) ^ (b & (d >> 2)) 161 | D ^= (b & (c >> 2)) ^ ((a ^ b) & (d >> 2)) 162 | 163 | a = A.copy() 164 | b = B.copy() 165 | c = C.copy() 166 | d = D.copy() 167 | 168 | A = (a & (a >> 4)) ^ (b & (b >> 4)) 169 | B = (a & (b >> 4)) ^ (b & ((a ^ b) >> 4)) 170 | C ^= (a & (c >> 4)) ^ (b & (d >> 4)) 171 | D ^= (b & (c >> 4)) ^ ((a ^ b) & (d >> 4)) 172 | 173 | # Final round and projection 174 | a = A.copy() 175 | b = B.copy() 176 | c = C.copy() 177 | d = D.copy() 178 | 179 | C ^= (a & (c >> 8)) ^ (b & (d >> 8)) 180 | D ^= (b & (c >> 8)) ^ ((a ^ b) & (d >> 8)) 181 | 182 | # Undo transformation prefix scan 183 | a = C ^ (C >> 1) 184 | b = D ^ (D >> 1) 185 | 186 | # Recover index bits 187 | i0 = x ^ y 188 | i1 = b | (0xFFFF ^ (i0 | a)) 189 | 190 | return ((_interleave(i1) << 1) | _interleave(i0)) >> (32 - 2 * level) 191 | -------------------------------------------------------------------------------- /dask_geopandas/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/io/__init__.py -------------------------------------------------------------------------------- /dask_geopandas/io/arrow.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import math 4 | from packaging.version import Version 5 | from typing import TYPE_CHECKING 6 | 7 | import pandas as pd 8 | from fsspec.core import get_fs_token_paths 9 | 10 | import dask 11 | from dask.base import compute_as_if_collection, tokenize 12 | from dask.dataframe import Scalar, from_graph 13 | from dask.highlevelgraph import HighLevelGraph 14 | from dask.layers import DataFrameIOLayer 15 | from dask.utils import apply, natural_sort_key 16 | 17 | import geopandas 18 | import shapely.geometry 19 | 20 | DASK_2022_12_0_PLUS = Version(dask.__version__) >= Version("2022.12.0") 21 | DASK_2023_04_0 = Version(dask.__version__) >= Version("2023.4.0") 22 | 23 | 24 | if TYPE_CHECKING: 25 | import pyarrow 26 | 27 | 28 | def _update_meta_to_geodataframe(meta, schema_metadata): 29 | """ 30 | Convert meta to a GeoDataFrame and update with potential GEO metadata 31 | """ 32 | if schema_metadata and b"geo" in schema_metadata: 33 | geo_meta = json.loads(schema_metadata[b"geo"]) 34 | geometry_column_name = geo_meta["primary_column"] 35 | crs = geo_meta["columns"][geometry_column_name].get("crs", "OGC:CRS84") 36 | geometry_columns = geo_meta["columns"] 37 | else: 38 | # TODO we could allow the user to pass those explicitly if not 39 | # stored in the metadata 40 | raise ValueError( 41 | "Missing geo metadata in the Parquet/Feather file. " 42 | "Use dask.dataframe.read_parquet/pandas.read_feather() instead." 
43 | ) 44 | 45 | # Update meta to be a GeoDataFrame 46 | meta = geopandas.GeoDataFrame(meta, geometry=geometry_column_name, crs=crs) 47 | for col, item in geometry_columns.items(): 48 | if not col == meta._geometry_column_name: 49 | meta[col] = geopandas.GeoSeries(meta[col], crs=item.get("crs", "OGC:CRS84")) 50 | 51 | return meta 52 | 53 | 54 | def _get_partition_bounds(schema_metadata): 55 | """ 56 | Get the partition bounds, if available, for the dataset fragment. 57 | """ 58 | if not (schema_metadata and b"geo" in schema_metadata): 59 | return None 60 | 61 | metadata = json.loads(schema_metadata[b"geo"].decode("utf-8")) 62 | 63 | # for now only check the primary column (TODO generalize this to follow 64 | # the logic of geopandas to fallback to other geometry columns) 65 | geometry = metadata["primary_column"] 66 | bbox = metadata["columns"][geometry].get("bbox", None) 67 | if bbox is None or all(math.isnan(val) for val in bbox): 68 | return None 69 | return shapely.geometry.box(*bbox) 70 | 71 | 72 | class ArrowDatasetEngine: 73 | """ 74 | Custom IO engine based on pyarrow.dataset. 75 | 76 | This is designed after dask's ArrowDatasetEngine for Parquet IO (but simpler 77 | with less options, and not dealing with a legacy engine) and ArrowORCEngine 78 | for ORC IO (but using pyarrow.dataset for the read_metadata discovery). 79 | """ 80 | 81 | file_format: str 82 | 83 | @classmethod 84 | def read_metadata(cls, fs, paths, columns, filters, index): 85 | import pyarrow.dataset as ds 86 | from pyarrow.parquet import _filters_to_expression 87 | 88 | # dataset discovery 89 | if len(paths) == 1: 90 | # list of 1 directory path is not supported 91 | paths = paths[0] 92 | dataset = ds.dataset( 93 | paths, partitioning="hive", filesystem=fs, format=cls.file_format 94 | ) 95 | 96 | # Get all (filtered) fragments 97 | if filters is not None: 98 | filter = _filters_to_expression(filters) 99 | else: 100 | filter = None 101 | 102 | fragments = list(dataset.get_fragments(filter=filter)) 103 | 104 | # numeric rather than glob ordering 105 | # TODO how does this handle different partitioned directories? 
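        # Note: natural_sort_key orders paths numerically rather than
        # lexicographically, e.g. "part.2.feather" sorts before
        # "part.10.feather", so partitions keep the order in which
        # to_feather wrote them.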
106 | fragments = sorted(fragments, key=lambda f: natural_sort_key(f.path)) 107 | 108 | # TODO potential splitting / aggregating of fragments 109 | 110 | # Create dask meta 111 | schema = dataset.schema 112 | # TODO add support for `categories`keyword 113 | meta = schema.empty_table().to_pandas() 114 | 115 | if index: 116 | meta = meta.set_index(index) 117 | 118 | if columns is not None: 119 | ex = set(columns) - set(meta.columns) 120 | if ex: 121 | raise ValueError( 122 | f"Requested columns {ex} not in schema {set(meta.columns)}" 123 | ) 124 | meta = meta[columns] 125 | 126 | return fragments, meta, schema, filter 127 | 128 | @classmethod 129 | def _arrow_table_to_pandas( 130 | cls, arrow_table: "pyarrow.Table", categories, **kwargs 131 | ) -> pd.DataFrame: 132 | _kwargs = kwargs.get("arrow_to_pandas", {}) 133 | _kwargs.update({"use_threads": False, "ignore_metadata": False}) 134 | 135 | return arrow_table.to_pandas(categories=categories, **_kwargs) 136 | 137 | @classmethod 138 | def read_partition(cls, fs, fragment, schema, columns, filter, **kwargs): 139 | table = fragment.to_table( 140 | schema=schema, columns=columns, filter=filter, use_threads=False 141 | ) 142 | df = cls._arrow_table_to_pandas(table, None) 143 | return df 144 | 145 | @classmethod 146 | def write_partition(cls, df, path, fs, filename, **kwargs): 147 | from pyarrow import feather 148 | 149 | table = cls._pandas_to_arrow_table(df, preserve_index=None) 150 | # TODO using the datasets API could automatically support partitioning 151 | # on columns 152 | with fs.open(fs.sep.join([path, filename]), "wb") as f: 153 | feather.write_feather(table, f) 154 | 155 | 156 | class GeoDatasetEngine: 157 | """ 158 | Mixin to combine with an IO Engine (the custom engine defined above for 159 | Feather IO, or dask's engine for Parquet IO) that holds the custom logic 160 | for geospatial data: overriding the arrow <-> pandas conversions to ensure 161 | we read/write GeoDataFrames. 162 | 163 | """ 164 | 165 | @classmethod 166 | def _arrow_table_to_pandas( 167 | cls, arrow_table: "pyarrow.Table", categories, **kwargs 168 | ) -> pd.DataFrame: 169 | from geopandas.io.arrow import _arrow_to_geopandas 170 | 171 | _kwargs = kwargs.get("arrow_to_pandas", {}) 172 | _kwargs.update({"use_threads": False, "ignore_metadata": False}) 173 | 174 | # TODO support additional keywords 175 | try: 176 | return _arrow_to_geopandas(arrow_table) 177 | except ValueError as err: 178 | # when no geometry column is selected, the above will error. 179 | # We want to fallback to reading it as a plain dask object, because 180 | # the column selection can be an automatic pushdown (eg `ddf['col']`) 181 | # TODO more robust detection of when to fall back? 
182 | if "No geometry columns are included" in str(err): 183 | return super()._arrow_table_to_pandas( 184 | arrow_table, categories=categories, **kwargs 185 | ) 186 | # when there are no columns, we also fall back (the dataset might 187 | # have no files, and so we don't want to raise a confusing error 188 | # about no geometry column) 189 | elif not arrow_table.schema.names: 190 | return super()._arrow_table_to_pandas( 191 | arrow_table, categories=categories, **kwargs 192 | ) 193 | else: 194 | raise 195 | 196 | @classmethod 197 | def _pandas_to_arrow_table( 198 | cls, df: pd.DataFrame, preserve_index=False, schema=None, **kwargs 199 | ) -> "pyarrow.Table": 200 | from geopandas.io.arrow import _geopandas_to_arrow 201 | 202 | table = _geopandas_to_arrow(df, index=preserve_index) 203 | 204 | if schema is not None: 205 | if not table.schema.equals(schema): 206 | # table.schema.metadata contains the "geo" metadata, so 207 | # ensure to preserve this in the cast operation 208 | if table.schema.metadata and not schema.metadata: 209 | schema = schema.with_metadata(table.schema.metadata) 210 | table = table.cast(schema) 211 | 212 | return table 213 | 214 | 215 | class FeatherDatasetEngine(GeoDatasetEngine, ArrowDatasetEngine): 216 | file_format = "feather" 217 | 218 | 219 | class FeatherFunctionWrapper: 220 | """ 221 | Feather Function-Wrapper Class 222 | Reads Feather data from disk to produce a partition. 223 | """ 224 | 225 | def __init__(self, engine, fs, columns, filter, schema, index): 226 | self.engine = engine 227 | self.fs = fs 228 | self.columns = columns 229 | self.filter = filter 230 | self.schema = schema 231 | self.index = index 232 | 233 | def project_columns(self, columns): 234 | """Return a new FeatherFunctionWrapper object with 235 | a sub-column projection. 236 | """ 237 | if columns == self.columns: 238 | return self 239 | func = copy.deepcopy(self) 240 | func.columns = columns 241 | return func 242 | 243 | def __call__(self, parts): 244 | _df = self.engine.read_partition( 245 | self.fs, parts, self.schema, self.columns, self.filter 246 | ) 247 | if self.index: 248 | _df.set_index(self.index, inplace=True) 249 | return _df 250 | 251 | 252 | def read_feather( 253 | path, 254 | columns=None, 255 | filters=None, 256 | index=None, 257 | storage_options=None, 258 | ): 259 | """Read a Feather dataset into a Dask-GeoPandas DataFrame. 260 | 261 | Parameters 262 | ---------- 263 | path: str or list(str) 264 | Source directory for data, or path(s) to individual Feather files. 265 | Paths can be a full URL with protocol specifier, and may include 266 | glob character if a single string. 267 | columns: None or list(str) 268 | Columns to load. If None, loads all. 269 | filters : list (of list) of tuples or pyarrow.dataset.Expression, default None 270 | Row-wise filter to apply while reading the dataset. Can be specified 271 | as a ``pyarrow.dataset.Expression`` object or using a list of tuples 272 | notation, like ``[[('col1', '==', 0), ...], ...]``. The filter will 273 | be applied both at the partition level, this is to prevent the loading 274 | of some files, as at the file level to filter the actual rows. 275 | 276 | For the list of tuples format, predicates can be expressed in disjunctive 277 | normal form (DNF). This means that the innermost tuple describes a single 278 | column predicate. These inner predicates are combined with an AND 279 | conjunction into a larger predicate. The outer-most list then combines all 280 | of the combined filters with an OR disjunction. 
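
        For example (the column names here are purely illustrative),
        ``[[("year", ">", 2000), ("country", "==", "US")]]`` keeps rows that
        match both conditions, while
        ``[[("year", ">", 2000)], [("country", "==", "US")]]`` keeps rows that
        match either one.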
281 | 282 | Predicates can also be expressed as a List[Tuple]. These are evaluated 283 | as an AND conjunction. To express OR in predictates, one must use the 284 | List[List[Tuple]] notation. 285 | index : str, list or False, default None 286 | Field name(s) to use as the output frame index. By default will be 287 | inferred from the pandas metadata (if present in the files). Use False 288 | to read all fields as columns. 289 | storage_options : dict, default None 290 | Key/value pairs to be passed on to the file-system backend, if any 291 | (inferred from the path, such as "s3://..."). 292 | Please see ``fsspec`` for more details. 293 | 294 | Returns 295 | ------- 296 | dask_geopandas.GeoDataFrame (even if there is only one column) 297 | 298 | """ 299 | if index is False: 300 | raise NotImplementedError("Specifying index=False is not yet implemented") 301 | 302 | # Get engine 303 | engine = FeatherDatasetEngine 304 | 305 | # Process file path(s) 306 | storage_options = storage_options or {} 307 | fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options) 308 | paths = sorted(paths, key=natural_sort_key) # numeric rather than glob ordering 309 | 310 | # Let backend engine generate a list of parts from the dataset metadata 311 | parts, meta, schema, filter = engine.read_metadata( 312 | fs, 313 | paths, 314 | columns, 315 | filters, 316 | index, 317 | ) 318 | 319 | # Update meta to be a GeoDataFrame 320 | meta = _update_meta_to_geodataframe(meta, schema.metadata) 321 | 322 | # Construct spatial partitioning information, if available 323 | spatial_partitions = geopandas.GeoSeries( 324 | [_get_partition_bounds(frag.physical_schema.metadata) for frag in parts], 325 | crs=meta.crs, 326 | ) 327 | if spatial_partitions.isna().any(): 328 | spatial_partitions = None 329 | 330 | # Construct and return a Blockwise layer 331 | label = "read-feather-" 332 | output_name = label + tokenize(path, columns, filters, index) 333 | layer = DataFrameIOLayer( 334 | output_name, 335 | columns, 336 | parts, 337 | FeatherFunctionWrapper(engine, fs, columns, filter, schema, index), 338 | label=label, 339 | ) 340 | graph = HighLevelGraph({output_name: layer}, {output_name: set()}) 341 | result = from_graph( 342 | graph, 343 | meta, 344 | [None] * (len(parts) + 1), 345 | [(output_name, i) for i in range(len(parts))], 346 | "read_feather", 347 | ) 348 | 349 | result.spatial_partitions = spatial_partitions 350 | return result 351 | 352 | 353 | def to_feather( 354 | df, 355 | path, 356 | write_index=True, 357 | storage_options=None, 358 | compute=True, 359 | compute_kwargs=None, 360 | ): 361 | """Store Dask.dataframe to Feather files 362 | 363 | Notes 364 | ----- 365 | Each partition will be written to a separate file. 366 | 367 | Parameters 368 | ---------- 369 | df : dask_geopandas.GeoDataFrame 370 | path : string or pathlib.Path 371 | Destination directory for data. Prepend with protocol like ``s3://`` 372 | or ``hdfs://`` for remote data. 373 | write_index : boolean, default True 374 | Whether or not to write the index. Defaults to True. 375 | storage_options : dict, default None 376 | Key/value pairs to be passed on to the file-system backend, if any 377 | (inferred from the path, such as "s3://..."). 378 | Please see ``fsspec`` for more details. 379 | compute : bool, default True 380 | If True (default) then the result is computed immediately. If False 381 | then a ``dask.delayed`` object is returned for future computation. 
382 | compute_kwargs : dict, default True 383 | Options to be passed in to the compute method 384 | 385 | See Also 386 | -------- 387 | dask_geopandas.read_feather: Read Feather data to dask.dataframe 388 | """ 389 | # based on the to_orc function from dask 390 | 391 | # Get engine 392 | engine = FeatherDatasetEngine 393 | 394 | # Process file path 395 | storage_options = storage_options or {} 396 | fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) 397 | # Trim any protocol information from the path before forwarding 398 | path = fs._strip_protocol(path) 399 | 400 | if not write_index: 401 | # Not writing index - might as well drop it 402 | df = df.reset_index(drop=True) 403 | 404 | # Use df.npartitions to define file-name list 405 | fs.mkdirs(path, exist_ok=True) 406 | filenames = [f"part.{i}.feather" for i in range(df.npartitions)] 407 | 408 | # Construct IO graph 409 | dsk = {} 410 | name = "to-feather-" + tokenize(df, fs, path, write_index, storage_options) 411 | part_tasks = [] 412 | for d, filename in enumerate(filenames): 413 | dsk[(name, d)] = ( 414 | apply, 415 | engine.write_partition, 416 | [ 417 | (df._name, d), 418 | path, 419 | fs, 420 | filename, 421 | ], 422 | ) 423 | part_tasks.append((name, d)) 424 | dsk[name] = (lambda x: None, part_tasks) 425 | graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df]) 426 | 427 | # Compute or return future 428 | if compute: 429 | if compute_kwargs is None: 430 | compute_kwargs = dict() 431 | from dask_geopandas import GeoDataFrame 432 | 433 | return compute_as_if_collection( 434 | GeoDataFrame, graph, part_tasks, **compute_kwargs 435 | ) 436 | return Scalar(graph, name, "") 437 | -------------------------------------------------------------------------------- /dask_geopandas/io/file.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | from pandas import RangeIndex 4 | 5 | from dask.base import tokenize 6 | from dask.dataframe import from_graph 7 | from dask.highlevelgraph import HighLevelGraph 8 | 9 | 10 | class FileFunctionWrapper: 11 | """ 12 | GDAL File reader Function-Wrapper Class 13 | 14 | Reads data from disk to produce a partition (given row subset to read). 15 | """ 16 | 17 | def __init__(self, layer, columns): 18 | self.layer = layer 19 | self.columns = columns 20 | self.read_geometry = True 21 | if columns is not None and "geometry" not in columns: 22 | self.read_geometry = False 23 | 24 | def project_columns(self, columns): 25 | """Return a new FileFunctionWrapper object with 26 | a sub-column projection. 27 | """ 28 | if columns == self.columns: 29 | return self 30 | return FileFunctionWrapper(self.layer, columns) 31 | 32 | def __call__(self, part): 33 | path, row_offset, batch_size = part 34 | 35 | import pyogrio 36 | 37 | df = pyogrio.read_dataframe( 38 | path, 39 | layer=self.layer, 40 | columns=self.columns, 41 | read_geometry=self.read_geometry, 42 | skip_features=row_offset, 43 | max_features=batch_size, 44 | ) 45 | df.index = RangeIndex(row_offset, row_offset + batch_size) 46 | return df 47 | 48 | 49 | def read_file( 50 | path, npartitions=None, chunksize=None, layer=None, columns=None, **kwargs 51 | ): 52 | """ 53 | Read a GIS file into a Dask GeoDataFrame. 54 | 55 | This function requires `pyogrio `__. 56 | 57 | Parameters 58 | ---------- 59 | path : str 60 | The absolute or relative path to the file or URL to 61 | be opened. 62 | npartitions : int, optional 63 | The number of partitions to create. 
Either this or `chunksize` should 64 | be specified. 65 | chunksize : int, optional 66 | The number of rows per partition to use. Either this or `npartitions` 67 | should be specified. 68 | layer : int or str, optional (default: first layer) 69 | If an integer is provided, it corresponds to the index of the layer 70 | with the data source. If a string is provided, it must match the name 71 | of the layer in the data source. Defaults to first layer in data source. 72 | columns : list-like, optional (default: all columns) 73 | List of column names to import from the data source. Column names must 74 | exactly match the names in the data source, and will be returned in 75 | the order they occur in the data source. To avoid reading any columns, 76 | pass an empty list-like. 77 | 78 | """ 79 | try: 80 | import pyogrio 81 | except ImportError as err: 82 | raise ImportError( 83 | "The 'read_file' function requires the 'pyogrio' package, but it is " 84 | "not installed or does not import correctly." 85 | f"\nImporting pyogrio resulted in: {err}" 86 | ) 87 | 88 | from dask.layers import DataFrameIOLayer 89 | 90 | # TODO smart inference for a good default partition size ? 91 | if (npartitions is None) == (chunksize is None): 92 | raise ValueError("Exactly one of npartitions and chunksize must be specified.") 93 | 94 | if "skip_features" in kwargs or "max_features" in kwargs: 95 | # TODO we currently use those keywords already for reading in each 96 | # partition (we would need to take those into account for determining 97 | # the part start/ends) 98 | raise ValueError( 99 | "The 'skip_features'/'max_feature' keywords are not yet supported" 100 | ) 101 | if kwargs: 102 | raise ValueError("Additional pyogrio keywords are not yet supported") 103 | 104 | total_size = pyogrio.read_info(path, layer=layer)["features"] 105 | 106 | if chunksize is None: 107 | chunksize = int(ceil(total_size / npartitions)) 108 | 109 | # TODO this could be inferred from read_info ? 
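    # Illustrative arithmetic, not part of the original code: the
    # naturalearth_lowres test layer has 177 features (its divisions in the
    # test suite are (0, 45, 90, 135, 176)), so npartitions=4 gives
    # chunksize = ceil(177 / 4) = 45, and the parts loop below produces row
    # ranges 0-44, 45-89, 90-134 and 135-176, i.e. three partitions of
    # 45 rows and a final partition of 42 rows.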
110 | read_geometry = True 111 | if columns is not None and "geometry" not in columns: 112 | read_geometry = False 113 | meta = pyogrio.read_dataframe( 114 | path, layer=layer, columns=columns, read_geometry=read_geometry, max_features=5 115 | ).head(0) 116 | 117 | # Define parts 118 | parts = [] 119 | row_offset = 0 120 | divs = [row_offset] 121 | 122 | while row_offset < total_size: 123 | batch_size = min(chunksize, total_size - row_offset) 124 | parts.append((path, row_offset, batch_size)) 125 | row_offset += batch_size 126 | divs.append(row_offset) 127 | # Set the last division value to be the largest index value in the last partition 128 | divs[-1] = divs[-1] - 1 129 | 130 | # Create Blockwise layer 131 | label = "read-file-" 132 | output_name = label + tokenize(path, chunksize, layer, columns) 133 | layer = DataFrameIOLayer( 134 | output_name, 135 | columns, 136 | parts, 137 | FileFunctionWrapper(layer, columns), 138 | label=label, 139 | ) 140 | graph = HighLevelGraph({output_name: layer}, {output_name: set()}) 141 | 142 | result = from_graph( 143 | graph, 144 | meta, 145 | divs, 146 | [(output_name, i) for i in range(len(divs) - 1)], 147 | "read_file", 148 | ) 149 | return result 150 | -------------------------------------------------------------------------------- /dask_geopandas/io/parquet.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import dask.dataframe as dd 4 | 5 | import geopandas 6 | 7 | from .arrow import ( 8 | GeoDatasetEngine, 9 | _get_partition_bounds, 10 | _update_meta_to_geodataframe, 11 | ) 12 | 13 | try: 14 | # pyarrow is imported here, but is an optional dependency 15 | from dask.dataframe.io.parquet.arrow import ( 16 | ArrowDatasetEngine as DaskArrowDatasetEngine, 17 | ) 18 | except ImportError: 19 | DaskArrowDatasetEngine = object 20 | 21 | 22 | def _get_partition_bounds_parquet(part, fs): 23 | """ 24 | Based on the part information gathered by dask, get the partition bounds 25 | if available. 26 | 27 | """ 28 | from pyarrow.parquet import ParquetFile 29 | 30 | # read the metadata from the actual file (this is again file IO, but 31 | # we can't rely on the schema metadata, because this is only the 32 | # metadata of the first piece) 33 | pq_metadata = None 34 | if "piece" in part: 35 | path = part["piece"][0] 36 | if isinstance(path, str): 37 | with fs.open(path, "rb") as f: 38 | pq_metadata = ParquetFile(f).metadata 39 | if pq_metadata is None: 40 | return None 41 | 42 | return _get_partition_bounds(pq_metadata.metadata) 43 | 44 | 45 | class GeoArrowEngine(GeoDatasetEngine, DaskArrowDatasetEngine): 46 | """ 47 | Engine for reading geospatial Parquet datasets. Subclasses dask's 48 | ArrowEngine for Parquet, but overriding some methods to ensure we 49 | correctly read/write GeoDataFrames. 50 | 51 | """ 52 | 53 | @classmethod 54 | def _update_meta(cls, meta, schema): 55 | """ 56 | Convert meta to a GeoDataFrame and update with potential GEO metadata 57 | """ 58 | return _update_meta_to_geodataframe(meta, schema.metadata) 59 | 60 | @classmethod 61 | def _create_dd_meta(cls, dataset_info): 62 | meta = super()._create_dd_meta(dataset_info) 63 | schema = dataset_info["schema"] 64 | if not schema.names and not schema.metadata: 65 | if len(list(dataset_info["ds"].get_fragments())) == 0: 66 | raise ValueError( 67 | "No dataset parts discovered. 
Use dask.dataframe.read_parquet " 68 | "to read it as an empty DataFrame" 69 | ) 70 | meta = cls._update_meta(meta, schema) 71 | 72 | if dataset_info["kwargs"].get("gather_spatial_partitions", True): 73 | fs = dataset_info["fs"] 74 | parts, _, _ = cls._construct_collection_plan(dataset_info) 75 | regions = geopandas.GeoSeries( 76 | [_get_partition_bounds_parquet(part, fs) for part in parts], 77 | crs=meta.crs, 78 | ) 79 | if regions.notna().all(): 80 | # a bit hacky, but this allows us to get this passed through 81 | meta.attrs["spatial_partitions"] = regions 82 | 83 | return meta 84 | 85 | 86 | to_parquet = partial(dd.to_parquet, engine=GeoArrowEngine) 87 | to_parquet.__doc__ = dd.to_parquet.__doc__ 88 | 89 | 90 | def read_parquet(*args, **kwargs): 91 | from dask.dataframe import read_parquet 92 | 93 | result = read_parquet(*args, engine=GeoArrowEngine, **kwargs) 94 | # check if spatial partitioning information was stored 95 | spatial_partitions = result._meta.attrs.get("spatial_partitions", None) 96 | 97 | result = dd.from_graph( 98 | result.dask, 99 | result._meta, 100 | result.divisions, 101 | result.__dask_keys__(), 102 | "read_parquet", 103 | ) 104 | 105 | result.spatial_partitions = spatial_partitions 106 | return result 107 | 108 | 109 | read_parquet.__doc__ = dd.read_parquet.__doc__ 110 | -------------------------------------------------------------------------------- /dask_geopandas/morton_distance.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pandas as pd 4 | 5 | from dask_geopandas.hilbert_distance import _continuous_to_discrete_coords 6 | 7 | 8 | def _morton_distance(gdf, total_bounds, level): 9 | """ 10 | Calculate distance of geometries along Morton curve 11 | 12 | The Morton curve is also known as Z-order https://en.wikipedia.org/wiki/Z-order_curve 13 | 14 | Parameters 15 | ---------- 16 | gdf : GeoDataFrame 17 | total_bounds : array_like 18 | array containing xmin, ymin, xmax, ymax 19 | level : int (1 - 16) 20 | Determines the precision of the Morton curve. 
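    For illustration: a geometry whose discretized mid-point coordinates are
    x = 3 (binary 011) and y = 5 (binary 101) is assigned the Z-order code
    0b100111 = 39, since ``_distances_from_coordinates`` below interleaves the
    x bits into the even bit positions and the y bits into the odd bit positions.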
21 | 22 | Returns 23 | ------- 24 | type : pandas.Series 25 | Series containing distances from Morton curve 26 | 27 | """ 28 | with warnings.catch_warnings(): 29 | warnings.filterwarnings( 30 | "ignore", "GeoSeries.isna() previously returned True", UserWarning 31 | ) 32 | if gdf.is_empty.any() | gdf.geometry.isna().any(): 33 | raise ValueError( 34 | "Morton distance cannot be computed on a GeoSeries with empty or " 35 | "missing geometries.", 36 | ) 37 | # Calculate bounds as numpy array 38 | bounds = gdf.bounds.to_numpy() 39 | # Calculate discrete coords based on total bounds and bounds 40 | x_int, y_int = _continuous_to_discrete_coords(bounds, level, total_bounds) 41 | # Calculate distance from morton curve 42 | distances = _distances_from_coordinates(x_int, y_int) 43 | 44 | return pd.Series(distances, index=gdf.index, name="morton_distance") 45 | 46 | 47 | def _distances_from_coordinates(x, y): 48 | """ 49 | Calculate distances from geometry mid-points along Morton curve 50 | 51 | Parameters 52 | ---------- 53 | x, y : array_like 54 | x, y coordinate pairs based on mid-points of geoms 55 | 56 | Returns 57 | ------- 58 | type : int 59 | Integer distances from Morton curve 60 | """ 61 | 62 | return _part1by1(x) | (_part1by1(y) << 1) 63 | 64 | 65 | def _part1by1(n): 66 | """ 67 | Interleave bits by ninary magic numbers 68 | 69 | Based on #http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN 70 | 71 | Parameters 72 | ---------- 73 | n : np.array 74 | X or Y coordinates 75 | 76 | Returns 77 | ------- 78 | n : int 79 | Interleaved bits 80 | """ 81 | n &= 0x0000FFFF 82 | n = (n | (n << 8)) & 0x00FF00FF 83 | n = (n | (n << 4)) & 0x0F0F0F0F 84 | n = (n | (n << 2)) & 0x33333333 85 | n = (n | (n << 1)) & 0x55555555 86 | 87 | return n 88 | -------------------------------------------------------------------------------- /dask_geopandas/sjoin.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | 5 | from dask.base import tokenize 6 | from dask.dataframe import from_graph 7 | from dask.highlevelgraph import HighLevelGraph 8 | 9 | import geopandas 10 | 11 | from .expr import from_geopandas 12 | 13 | 14 | def sjoin(left, right, how="inner", predicate="intersects", **kwargs): 15 | """ 16 | Spatial join of two GeoDataFrames. 17 | 18 | Parameters 19 | ---------- 20 | left, right : geopandas or dask_geopandas GeoDataFrames 21 | If a geopandas.GeoDataFrame is passed, it is considered as a 22 | dask_geopandas.GeoDataFrame with 1 partition (without spatial 23 | partitioning information). 24 | how : string, default 'inner' 25 | The type of join. Currently only 'inner' is supported. 26 | predicate : string, default 'intersects' 27 | Binary predicate how to match corresponding rows of the left and right 28 | GeoDataFrame. Possible values: 'contains', 'contains_properly', 29 | 'covered_by', 'covers', 'crosses', 'intersects', 'overlaps', 30 | 'touches', 'within'. 31 | 32 | Returns 33 | ------- 34 | dask_geopandas.GeoDataFrame 35 | 36 | Notes 37 | ----- 38 | If both the left and right GeoDataFrame have spatial partitioning 39 | information available (the ``spatial_partitions`` attribute is set), 40 | the output partitions are determined based on intersection of the 41 | spatial partitions. In all other cases, the output partitions are 42 | all combinations (cartesian/cross product) of all input partition 43 | of the left and right GeoDataFrame. 
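    For example, a left frame with 3 partitions joined to a right frame with
    4 partitions and no spatial partitioning information yields 3 x 4 = 12
    partition-pair joins, whereas with known spatial partitions only the pairs
    whose partition extents actually intersect are evaluated.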
44 | """ 45 | if "op" in kwargs: 46 | predicate = kwargs.pop("op") 47 | deprecation_message = ( 48 | "The `op` parameter is deprecated and will be removed" 49 | " in a future release. Please use the `predicate` parameter" 50 | " instead." 51 | ) 52 | warnings.warn(deprecation_message, FutureWarning, stacklevel=2) 53 | if how != "inner": 54 | raise NotImplementedError("Only how='inner' is supported right now") 55 | 56 | if isinstance(left, geopandas.GeoDataFrame): 57 | left = from_geopandas(left, npartitions=1) 58 | if isinstance(right, geopandas.GeoDataFrame): 59 | right = from_geopandas(right, npartitions=1) 60 | 61 | # We call optimize on the inputs to ensure that any optimizations 62 | # done by dask-expr (which might change the expression, and thus the 63 | # name of the DataFrame) *before* we build the HighLevelGraph. 64 | # https://github.com/dask/dask-expr/issues/1129 65 | left = left.optimize() 66 | right = right.optimize() 67 | 68 | name = "sjoin-" + tokenize(left, right, how, predicate) 69 | meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate) 70 | 71 | if left.spatial_partitions is not None and right.spatial_partitions is not None: 72 | # Spatial partitions are known -> use them to trim down the list of 73 | # partitions that need to be joined 74 | parts = geopandas.sjoin( 75 | left.spatial_partitions.to_frame("geometry"), 76 | right.spatial_partitions.to_frame("geometry"), 77 | how="inner", 78 | predicate="intersects", 79 | ) 80 | parts_left = np.asarray(parts.index).tolist() 81 | parts_right = np.asarray(parts["index_right"].values).tolist() 82 | using_spatial_partitions = True 83 | else: 84 | # Unknown spatial partitions -> full cartesian (cross) product of all 85 | # combinations of the partitions of the left and right dataframe 86 | n_left = left.npartitions 87 | n_right = right.npartitions 88 | parts_left = np.repeat(np.arange(n_left), n_right) 89 | parts_right = np.tile(np.arange(n_right), n_left) 90 | using_spatial_partitions = False 91 | 92 | dsk = {} 93 | new_spatial_partitions = [] 94 | for i, (part_left, part_right) in enumerate(zip(parts_left, parts_right)): 95 | dsk[(name, i)] = ( 96 | geopandas.sjoin, 97 | (left._name, part_left), 98 | (right._name, part_right), 99 | how, 100 | predicate, 101 | ) 102 | # TODO preserve spatial partitions of the output if only left has spatial 103 | # partitions 104 | if using_spatial_partitions: 105 | lr = left.spatial_partitions.iloc[part_left] 106 | rr = right.spatial_partitions.iloc[part_right] 107 | # extent = lr.intersection(rr).buffer(buffer).intersection(lr.union(rr)) 108 | extent = lr.intersection(rr) 109 | new_spatial_partitions.append(extent) 110 | 111 | divisions = [None] * (len(dsk) + 1) 112 | graph = HighLevelGraph.from_collections(name, dsk, dependencies=[left, right]) 113 | if using_spatial_partitions: 114 | new_spatial_partitions = geopandas.GeoSeries( 115 | data=new_spatial_partitions, crs=left.crs 116 | ) 117 | else: 118 | new_spatial_partitions = None 119 | 120 | result = from_graph(graph, meta, divisions, dsk.keys(), "sjoin") 121 | result.spatial_partitions = new_spatial_partitions 122 | return result 123 | -------------------------------------------------------------------------------- /dask_geopandas/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/__init__.py 
-------------------------------------------------------------------------------- /dask_geopandas/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from packaging.version import Version 3 | 4 | import dask 5 | 6 | import geopandas 7 | 8 | import pytest 9 | 10 | # TODO update version once geopandas has a proper tag for 1.0 11 | GEOPANDAS_GE_10 = (Version(geopandas.__version__) >= Version("0.14.0+70")) and ( 12 | Version(geopandas.__version__) < Version("0.14.1") 13 | ) 14 | 15 | 16 | # TODO Disable usage of pyarrow strings until the expected results in the tests 17 | # are updated to use those as well 18 | dask.config.set({"dataframe.convert-string": False}) 19 | 20 | 21 | # Datasets used in our tests 22 | 23 | _HERE = os.path.abspath(os.path.dirname(__file__)) 24 | _TEST_DATA_DIR = os.path.join(_HERE, "data") 25 | _NATURALEARTH_CITIES = os.path.join( 26 | _TEST_DATA_DIR, "naturalearth_cities", "naturalearth_cities.shp" 27 | ) 28 | _NATURALEARTH_LOWRES = os.path.join( 29 | _TEST_DATA_DIR, "naturalearth_lowres", "naturalearth_lowres.shp" 30 | ) 31 | 32 | 33 | @pytest.fixture(scope="session") 34 | def naturalearth_lowres() -> str: 35 | # skip if data missing, unless on github actions 36 | if os.path.isfile(_NATURALEARTH_LOWRES) or os.getenv("GITHUB_ACTIONS"): 37 | return _NATURALEARTH_LOWRES 38 | else: 39 | pytest.skip("Naturalearth lowres dataset not found") 40 | 41 | 42 | @pytest.fixture(scope="session") 43 | def naturalearth_cities() -> str: 44 | # skip if data missing, unless on github actions 45 | if os.path.isfile(_NATURALEARTH_CITIES) or os.getenv("GITHUB_ACTIONS"): 46 | return _NATURALEARTH_CITIES 47 | else: 48 | pytest.skip("Naturalearth cities dataset not found") 49 | -------------------------------------------------------------------------------- /dask_geopandas/tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Datasets previously included with geopandas 2 | 3 | - `'naturalearth_cities'`: capital cities, based on http://www.naturalearthdata.com/downloads/10m-cultural-vectors/110m-populated-places/ 4 | - `'naturalearth_lowres'`: country boundaries, based on http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/ 5 | 6 | -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.VERSION.txt: -------------------------------------------------------------------------------- 1 | 2.0.0 -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.cpg: -------------------------------------------------------------------------------- 1 | ISO-8859-1 -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.dbf -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.prj: -------------------------------------------------------------------------------- 1 | 
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]] -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.shp -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_cities/naturalearth_cities.shx -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.cpg: -------------------------------------------------------------------------------- 1 | ISO-8859-1 -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.dbf -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.prj: -------------------------------------------------------------------------------- 1 | GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]] -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.shp -------------------------------------------------------------------------------- /dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/data/naturalearth_lowres/naturalearth_lowres.shx -------------------------------------------------------------------------------- /dask_geopandas/tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geopandas/dask-geopandas/d60b432f21361516948a70b36ab22b6486c97622/dask_geopandas/tests/io/__init__.py -------------------------------------------------------------------------------- /dask_geopandas/tests/io/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shlex 4 | import subprocess 5 | import sys 6 | import time 7 | from contextlib import contextmanager 8 | 9 | import pytest 10 | 11 | 12 | @contextmanager 13 | def 
ensure_safe_environment_variables(): 14 | """ 15 | Get a context manager to safely set environment variables 16 | 17 | All changes will be undone on close, hence environment variables set 18 | within this contextmanager will neither persist nor change global state. 19 | """ 20 | saved_environ = dict(os.environ) 21 | try: 22 | yield 23 | finally: 24 | os.environ.clear() 25 | os.environ.update(saved_environ) 26 | 27 | 28 | @pytest.fixture(scope="session") 29 | def s3_server(): 30 | """ 31 | Fixture for mocking S3 interaction. 32 | 33 | Sets up moto server in separate process 34 | """ 35 | pytest.importorskip("s3fs") 36 | pytest.importorskip("boto3") 37 | pytest.importorskip("moto", minversion="1.3.14") 38 | pytest.importorskip("flask") # server mode needs flask too 39 | requests = pytest.importorskip("requests") 40 | logging.getLogger("requests").disabled = True 41 | 42 | endpoint_url = "http://127.0.0.1:5555/" 43 | 44 | with ensure_safe_environment_variables(): 45 | os.environ["AWS_ACCESS_KEY_ID"] = "testing" 46 | os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" 47 | os.environ["AWS_SECURITY_TOKEN"] = "testing" 48 | os.environ["AWS_SESSION_TOKEN"] = "testing" 49 | 50 | # Launching moto in server mode, i.e., as a separate process 51 | # with an S3 endpoint on localhost 52 | 53 | # pipe to null to avoid logging in terminal 54 | proc = subprocess.Popen( 55 | shlex.split("moto_server s3 -p 5555"), 56 | stdout=subprocess.DEVNULL, 57 | ) 58 | 59 | timeout = 5 60 | while True: 61 | try: 62 | # OK to go once server is accepting connections 63 | r = requests.get(endpoint_url) 64 | if r.ok: 65 | break 66 | except Exception: 67 | pass 68 | timeout -= 0.1 69 | time.sleep(0.1) 70 | assert timeout > 0, "Timed out waiting for moto server" 71 | yield endpoint_url 72 | 73 | # shut down external process 74 | proc.terminate() 75 | try: 76 | proc.wait(timeout=3) 77 | except subprocess.TimeoutExpired: 78 | proc.kill() 79 | if sys.platform == "win32": 80 | # belt & braces 81 | subprocess.call("TASKKILL /F /PID {pid} /T".format(pid=proc.pid)) 82 | 83 | 84 | @pytest.fixture 85 | def s3_storage_options(): 86 | return {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}} 87 | 88 | 89 | @pytest.fixture() 90 | def s3_resource(s3_server): 91 | """ 92 | Sets up S3 bucket 'geopandas-test'. 
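    The roundtrip tests below use it together with ``s3_storage_options``,
    e.g. writing with ``ddf.to_parquet("s3://geopandas-test/dataset.parquet",
    storage_options=s3_storage_options)`` and reading the result back via
    ``dask_geopandas.read_parquet`` with the same storage options.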
93 | """ 94 | endpoint_url = s3_server 95 | 96 | import boto3 97 | import s3fs 98 | 99 | bucket = "geopandas-test" 100 | client = boto3.client("s3", endpoint_url=endpoint_url) 101 | 102 | client.create_bucket(Bucket=bucket, ACL="public-read-write") 103 | 104 | fs = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_url}) 105 | s3fs.S3FileSystem.clear_instance_cache() 106 | fs.invalidate_cache() 107 | 108 | try: 109 | yield fs, endpoint_url 110 | finally: 111 | fs.rm(bucket, recursive=True) 112 | -------------------------------------------------------------------------------- /dask_geopandas/tests/io/test_arrow.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | 3 | import geopandas 4 | 5 | import dask_geopandas 6 | 7 | import pytest 8 | from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal 9 | from pandas.testing import assert_index_equal 10 | 11 | pa = pytest.importorskip("pyarrow") 12 | ds = pytest.importorskip("pyarrow.dataset") 13 | 14 | 15 | pytestmark = pytest.mark.filterwarnings( 16 | "ignore:this is an initial implementation:UserWarning" 17 | ) 18 | 19 | 20 | def test_read(tmp_path, naturalearth_lowres): 21 | df = geopandas.read_file(naturalearth_lowres) 22 | 23 | # writing a partitioned dataset with geopandas (to not rely on roundtrip) 24 | basedir = tmp_path / "dataset" 25 | basedir.mkdir() 26 | df.iloc[:100].to_feather(basedir / "data.0.feather") 27 | df.iloc[100:].to_feather(basedir / "data.1.feather") 28 | 29 | result = dask_geopandas.read_feather(basedir) 30 | assert isinstance(result, dask_geopandas.GeoDataFrame) 31 | assert result.npartitions == 2 32 | assert result.crs == df.crs 33 | assert result.spatial_partitions is not None 34 | # TODO this reset_index should not be necessary 35 | result_gpd = result.compute().reset_index(drop=True) 36 | assert_geodataframe_equal(result_gpd, df) 37 | 38 | 39 | def test_write(tmp_path, naturalearth_lowres): 40 | df = geopandas.read_file(naturalearth_lowres) 41 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 42 | 43 | basedir = tmp_path / "dataset" 44 | ddf.to_feather(basedir) 45 | 46 | # each partition (4) is written as a feather file 47 | paths = list(basedir.glob("*.feather")) 48 | assert len(paths) == 4 49 | 50 | # each individual file is a valid feather file 51 | result_part0 = geopandas.read_feather(basedir / "part.0.feather") 52 | result_part0.index.name = None 53 | assert_geodataframe_equal(result_part0, df.iloc[:45]) 54 | 55 | # TODO geopandas doesn't actually support this for "feather" format 56 | # # the written dataset is also readable by plain geopandas 57 | # result_gpd = geopandas.read_feather(basedir) 58 | # # the dataset written by dask has "__null_dask_index__" index column name 59 | # result_gpd.index.name = None 60 | # assert_geodataframe_equal(result_gpd, df) 61 | 62 | 63 | @pytest.mark.xfail # https://github.com/dask/dask/issues/8022 64 | def test_write_delayed(tmp_path, naturalearth_lowres): 65 | df = geopandas.read_file(naturalearth_lowres) 66 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 67 | 68 | basedir = tmp_path / "dataset" 69 | dataset = ddf.to_feather(basedir, compute=False) 70 | dataset.compute() 71 | result = dask_geopandas.read_feather(basedir) 72 | assert result.npartitions == 4 73 | # TODO this reset_index should not be necessary 74 | result_gpd = result.compute().reset_index(drop=True) 75 | assert_geodataframe_equal(result_gpd, df) 76 | 77 | 78 | def 
test_roundtrip(tmp_path, naturalearth_lowres): 79 | # basic roundtrip 80 | df = geopandas.read_file(naturalearth_lowres) 81 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 82 | 83 | basedir = tmp_path / "dataset" 84 | ddf.to_feather(basedir) 85 | 86 | # reading back gives identical GeoDataFrame 87 | result = dask_geopandas.read_feather(basedir) 88 | assert result.npartitions == 4 89 | assert result.crs == df.crs 90 | # TODO this reset_index should not be necessary 91 | result_gpd = result.compute().reset_index(drop=True) 92 | assert_geodataframe_equal(result_gpd, df) 93 | # reading back also populates the spatial partitioning property 94 | ddf.calculate_spatial_partitions() 95 | assert_geoseries_equal(result.spatial_partitions, ddf.spatial_partitions.envelope) 96 | 97 | 98 | def test_roundtrip_s3(s3_resource, s3_storage_options, naturalearth_lowres): 99 | fs, endpoint_url = s3_resource 100 | 101 | # basic roundtrip to S3 102 | df = geopandas.read_file(naturalearth_lowres) 103 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 104 | 105 | uri = "s3://geopandas-test/dataset.feather" 106 | ddf.to_feather(uri, storage_options=s3_storage_options) 107 | 108 | # reading back gives identical GeoDataFrame 109 | result = dask_geopandas.read_feather(uri, storage_options=s3_storage_options) 110 | assert result.npartitions == 4 111 | assert_geodataframe_equal(result.compute().reset_index(drop=True), df) 112 | # reading back correctly sets the CRS in meta 113 | assert result.crs == df.crs 114 | # reading back also populates the spatial partitioning property 115 | assert result.spatial_partitions is not None 116 | 117 | 118 | def test_column_selection_push_down(tmp_path, naturalearth_lowres): 119 | # set up dataset 120 | df = geopandas.read_file(naturalearth_lowres) 121 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 122 | basedir = tmp_path / "dataset" 123 | # TODO awaiting a `to_feather` implementation 124 | # ddf.to_feather(basedir) 125 | basedir.mkdir() 126 | for i, part in enumerate(ddf.partitions): 127 | part.compute().to_feather(basedir / f"part.{i}.feather") 128 | 129 | ddf = dask_geopandas.read_feather(basedir) 130 | 131 | # selecting columns including geometry column still gives GeoDataFrame 132 | ddf_subset = ddf[["pop_est", "geometry"]] 133 | assert type(ddf_subset) is dask_geopandas.GeoDataFrame 134 | # and also preserves the spatial partitioning information 135 | assert ddf_subset.spatial_partitions is not None 136 | 137 | # selecting a single non-geometry column on the dataframe should work 138 | s = ddf["pop_est"] 139 | assert type(s) is dd.Series 140 | assert s.max().compute() == df["pop_est"].max() 141 | 142 | 143 | def test_missing_metadata(tmp_path, naturalearth_lowres): 144 | df = geopandas.read_file(naturalearth_lowres) 145 | path = tmp_path / "test.feather" 146 | 147 | # convert to DataFrame with wkb -> writing to feather will have only pandas metadata 148 | df = df.to_wkb() 149 | df.to_feather(path) 150 | 151 | with pytest.raises(ValueError, match="Missing geo metadata"): 152 | dask_geopandas.read_feather(path) 153 | 154 | # remove metadata completely 155 | from pyarrow import feather 156 | 157 | table = feather.read_table(path) 158 | feather.write_feather(table.replace_schema_metadata(), path) 159 | 160 | with pytest.raises(ValueError, match="Missing geo metadata"): 161 | dask_geopandas.read_feather(path) 162 | 163 | 164 | @pytest.mark.parametrize( 165 | "filter", [[("continent", "=", "Africa")], ds.field("continent") == "Africa"] 166 | ) 167 | def 
test_filters(tmp_path, naturalearth_lowres, filter): 168 | # set up dataset 169 | df = geopandas.read_file(naturalearth_lowres) 170 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 171 | basedir = tmp_path / "dataset" 172 | ddf.to_feather(basedir) 173 | 174 | # specifying filters argument 175 | result = dask_geopandas.read_feather(basedir, filters=filter) 176 | assert result.npartitions == 4 177 | 178 | result_gpd = result.compute().reset_index(drop=True) 179 | expected = df[df["continent"] == "Africa"].reset_index(drop=True) 180 | assert_geodataframe_equal(result_gpd, expected) 181 | 182 | 183 | def test_index(tmp_path, naturalearth_lowres): 184 | # set up dataset 185 | df = geopandas.read_file(naturalearth_lowres) 186 | # get meaningful index by shuffling (hilbert distance) 187 | df = dask_geopandas.from_geopandas(df, npartitions=2).spatial_shuffle().compute() 188 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 189 | 190 | # roundtrip preserves the index by default 191 | basedir = tmp_path / "dataset" 192 | ddf.to_feather(basedir) 193 | result = dask_geopandas.read_feather(basedir) 194 | assert "hilbert_distance" not in result.columns 195 | assert result.index.name == "hilbert_distance" 196 | assert_index_equal(result.index.compute(), df.index) 197 | 198 | # TODO not setting the index 199 | with pytest.raises(NotImplementedError): 200 | result = dask_geopandas.read_feather(basedir, index=False) 201 | # assert "hilbert_distance" in result.columns 202 | # assert result.index.name is None 203 | 204 | # setting specific columns as the index 205 | result = dask_geopandas.read_feather(basedir, index="iso_a3") 206 | assert "iso_a3" not in result.columns 207 | assert result.index.name == "iso_a3" 208 | assert_geodataframe_equal(result.compute(), df.set_index("iso_a3")) 209 | 210 | # not writing the index 211 | basedir = tmp_path / "dataset" 212 | ddf.to_feather(basedir, write_index=False) 213 | result = dask_geopandas.read_feather(basedir) 214 | assert "hilbert_distance" not in result.columns 215 | assert result.index.name is None 216 | assert result.index.compute()[0] == 0 217 | 218 | 219 | def test_read_meta_is_empty(tmp_path, naturalearth_lowres): 220 | df = geopandas.read_file(naturalearth_lowres) 221 | 222 | basedir = tmp_path / "dataset" 223 | basedir.mkdir() 224 | df.iloc[:100].to_feather(basedir / "data.0.feather") 225 | df.iloc[100:].to_feather(basedir / "data.1.feather") 226 | 227 | result = dask_geopandas.read_feather(basedir) 228 | assert len(result._meta) == 0 229 | -------------------------------------------------------------------------------- /dask_geopandas/tests/io/test_backend_integration.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | 3 | import dask_geopandas 4 | 5 | import pytest 6 | from geopandas.testing import assert_geodataframe_equal 7 | 8 | try: 9 | import pyogrio # noqa: F401 10 | 11 | PYOGRIO = True 12 | except ImportError: 13 | PYOGRIO = False 14 | 15 | BACKENDS = ["arrow", "file", "parquet"] 16 | 17 | 18 | @pytest.fixture(params=BACKENDS) 19 | def backend(request): 20 | param = request.param 21 | if not PYOGRIO and param == "file": 22 | pytest.skip("Unable to import pyogrio for file backend") 23 | return param 24 | 25 | 26 | def from_arrow_backend(path, tmp_path, npartitions): 27 | df = geopandas.read_file(path) 28 | basedir = tmp_path / "dataset" 29 | basedir.mkdir() 30 | ddf = dask_geopandas.from_geopandas(df, npartitions=npartitions) 31 | for i, part in enumerate(ddf.partitions): 32 
| part.compute().to_feather(basedir / f"data.{i}.feather") 33 | return dask_geopandas.read_feather(basedir) 34 | 35 | 36 | def from_file_backend(path, tmp_path, npartitions): 37 | return dask_geopandas.read_file(path, npartitions=npartitions) 38 | 39 | 40 | def from_parquet_backend(path, tmp_path, npartitions): 41 | ddf = dask_geopandas.from_geopandas( 42 | geopandas.read_file(path), npartitions=npartitions 43 | ) 44 | basedir = tmp_path / "dataset" 45 | ddf.to_parquet(basedir) 46 | return dask_geopandas.read_parquet(basedir) 47 | 48 | 49 | def get_from_backend(backend, data_path, tmp_path, npartitions=4): 50 | if backend == "arrow": 51 | ddf = from_arrow_backend(data_path, tmp_path, npartitions) 52 | elif backend == "file": 53 | ddf = from_file_backend(data_path, tmp_path, npartitions) 54 | elif backend == "parquet": 55 | ddf = from_parquet_backend(data_path, tmp_path, npartitions) 56 | else: 57 | raise ValueError() 58 | return ddf 59 | 60 | 61 | def test_spatial_shuffle_integration(backend, naturalearth_lowres, tmp_path): 62 | ddf = get_from_backend(backend, naturalearth_lowres, tmp_path) 63 | new_idx = ddf.hilbert_distance() 64 | expected = ddf.compute().set_index(new_idx.compute()) 65 | 66 | result = ddf.spatial_shuffle() 67 | # Sort because the index is shuffled 68 | assert_geodataframe_equal(result.compute().sort_index(), expected.sort_index()) 69 | -------------------------------------------------------------------------------- /dask_geopandas/tests/io/test_file.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pandas as pd 4 | 5 | import dask.dataframe as dd 6 | 7 | import geopandas 8 | from shapely.geometry import Polygon 9 | 10 | import dask_geopandas 11 | 12 | import pytest 13 | from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal 14 | from pandas.testing import assert_frame_equal, assert_series_equal 15 | 16 | pytest.importorskip("pyogrio") 17 | 18 | 19 | def test_read_file(naturalearth_lowres): 20 | path = naturalearth_lowres 21 | df = geopandas.read_file(path) 22 | result = dask_geopandas.read_file(path, npartitions=4) 23 | assert isinstance(result, dask_geopandas.GeoDataFrame) 24 | assert result.npartitions == 4 25 | assert result.crs == df.crs 26 | assert_geodataframe_equal(result.compute(), df) 27 | 28 | result = dask_geopandas.read_file(path, chunksize=100) 29 | assert isinstance(result, dask_geopandas.GeoDataFrame) 30 | assert result.npartitions == 2 31 | assert result.crs == df.crs 32 | assert_geodataframe_equal(result.compute(), df) 33 | 34 | msg = "Exactly one of npartitions and chunksize must be specified" 35 | with pytest.raises(ValueError, match=msg): 36 | dask_geopandas.read_file(path) 37 | with pytest.raises(ValueError, match=msg): 38 | dask_geopandas.read_file(path, npartitions=4, chunksize=100) 39 | 40 | 41 | def test_read_file_divisions(naturalearth_lowres): 42 | path = naturalearth_lowres 43 | result = dask_geopandas.read_file(path, npartitions=4) 44 | assert result.known_divisions 45 | assert result.index.divisions == (0, 45, 90, 135, 176) 46 | assert result.divisions == (0, 45, 90, 135, 176) 47 | 48 | 49 | def test_read_file_index(naturalearth_lowres): 50 | path = naturalearth_lowres 51 | df = geopandas.read_file(path) 52 | result = dask_geopandas.read_file(path, npartitions=4) 53 | assert (result.index.compute() == pd.RangeIndex(0, len(df))).all() 54 | 55 | 56 | def test_read_file_columns(naturalearth_lowres): 57 | path = naturalearth_lowres 58 | df = 
geopandas.read_file(path) 59 | 60 | # explicit column selection 61 | result = dask_geopandas.read_file( 62 | path, npartitions=4, columns=["pop_est", "geometry"] 63 | ) 64 | assert isinstance(result, dask_geopandas.GeoDataFrame) 65 | assert result.npartitions == 4 66 | assert result.crs == df.crs 67 | assert len(result.columns) == 2 68 | assert_geodataframe_equal(result.compute(), df[["pop_est", "geometry"]]) 69 | # only selecting non-geometry column 70 | result = dask_geopandas.read_file(path, npartitions=4, columns=["pop_est"]) 71 | assert type(result) == dd.DataFrame 72 | assert len(result.columns) == 1 73 | assert result.npartitions == 4 74 | assert_frame_equal(result.compute(), df[["pop_est"]]) 75 | 76 | # column selection through getitem 77 | ddf = dask_geopandas.read_file(path, npartitions=4) 78 | result = ddf[["pop_est", "geometry"]] 79 | assert isinstance(result, dask_geopandas.GeoDataFrame) 80 | assert result.npartitions == 4 81 | assert result.crs == df.crs 82 | assert_geodataframe_equal(result.compute(), df[["pop_est", "geometry"]]) 83 | 84 | # only select non-geometry column 85 | result = ddf["pop_est"] 86 | assert isinstance(result, dd.Series) 87 | assert_series_equal(result.compute(), df["pop_est"]) 88 | 89 | # only select geometry column 90 | result = ddf["geometry"] 91 | assert isinstance(result, dask_geopandas.GeoSeries) 92 | assert_geoseries_equal(result.compute(), df["geometry"]) 93 | 94 | 95 | def test_read_file_meta_is_empty(naturalearth_lowres): 96 | path = naturalearth_lowres 97 | result = dask_geopandas.read_file(path, npartitions=4) 98 | assert len(result._meta) == 0 99 | 100 | 101 | def test_read_file_layer(tmp_path): 102 | df_points = geopandas.GeoDataFrame( 103 | { 104 | "col": [1, 2, 3, 4], 105 | "geometry": geopandas.points_from_xy([1, 2, 3, 4], [2, 3, 4, 1]), 106 | }, 107 | crs=4326, 108 | ) 109 | df_polygons = geopandas.GeoDataFrame( 110 | { 111 | "col": [5, 6, 7, 8], 112 | "geometry": [ 113 | Polygon([(random.random(), random.random()) for i in range(3)]) 114 | for _ in range(4) 115 | ], 116 | }, 117 | crs=4326, 118 | ) 119 | 120 | path = tmp_path / "test_layers.gpkg" 121 | df_points.to_file(path, layer="points") 122 | df_polygons.to_file(path, layer="polygons") 123 | 124 | ddf_points = dask_geopandas.read_file(path, npartitions=2, layer="points") 125 | assert_geodataframe_equal(ddf_points.compute(), df_points) 126 | ddf_polygons = dask_geopandas.read_file(path, npartitions=2, layer="polygons") 127 | assert_geodataframe_equal(ddf_polygons.compute(), df_polygons) 128 | -------------------------------------------------------------------------------- /dask_geopandas/tests/io/test_parquet.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import dask.dataframe as dd 4 | 5 | import geopandas 6 | import shapely 7 | 8 | import dask_geopandas 9 | 10 | import pytest 11 | from geopandas.testing import assert_geodataframe_equal 12 | from pandas.testing import assert_series_equal 13 | 14 | pa = pytest.importorskip("pyarrow") 15 | 16 | 17 | pytestmark = pytest.mark.filterwarnings( 18 | "ignore:this is an initial implementation:UserWarning" 19 | ) 20 | 21 | 22 | def test_parquet_roundtrip(tmp_path, naturalearth_lowres): 23 | # basic roundtrip 24 | df = geopandas.read_file(naturalearth_lowres) 25 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 26 | 27 | basedir = tmp_path / "dataset" 28 | ddf.to_parquet(basedir) 29 | 30 | # each partition (4) is written as parquet file 31 | paths = 
list(basedir.glob("*.parquet")) 32 | assert len(paths) == 4 33 | 34 | # reading back gives identical GeoDataFrame 35 | result = dask_geopandas.read_parquet(basedir) 36 | assert result.npartitions == 4 37 | assert_geodataframe_equal(result.compute(), df) 38 | # reading back correctly sets the CRS in meta 39 | assert result.crs == df.crs 40 | # reading back also populates the spatial partitioning property 41 | assert result.spatial_partitions is not None 42 | assert result.spatial_partitions.crs == df.crs 43 | 44 | # the written dataset is also readable by plain geopandas 45 | result_gpd = geopandas.read_parquet(basedir) 46 | # the dataset written by dask has "__null_dask_index__" index column name 47 | result_gpd.index.name = None 48 | assert_geodataframe_equal(result_gpd, df) 49 | 50 | result_part0 = geopandas.read_parquet(basedir / "part.0.parquet") 51 | result_part0.index.name = None 52 | assert_geodataframe_equal(result_part0, df.iloc[:45]) 53 | 54 | 55 | def test_roundtrip_geometry_column_name(tmp_path, naturalearth_lowres): 56 | # basic roundtrip with different geometry column name 57 | df = geopandas.read_file(naturalearth_lowres) 58 | df = df.rename_geometry("geom") 59 | 60 | # geopandas -> dask-geopandas roundtrip 61 | path = tmp_path / "data.parquet" 62 | df.to_parquet(path) 63 | result = dask_geopandas.read_parquet(path) 64 | assert isinstance(result, dask_geopandas.GeoDataFrame) 65 | assert result.geometry.name == "geom" 66 | assert result.crs == df.crs 67 | assert result.spatial_partitions is not None 68 | assert_geodataframe_equal(result.compute(), df) 69 | 70 | # dask-geopandas -> dask-geopandas roundtrip 71 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 72 | assert ddf.geometry.name == "geom" 73 | basedir = tmp_path / "dataset" 74 | ddf.to_parquet(basedir) 75 | 76 | result = dask_geopandas.read_parquet(basedir) 77 | assert isinstance(result, dask_geopandas.GeoDataFrame) 78 | assert result.geometry.name == "geom" 79 | assert result.crs == df.crs 80 | assert result.spatial_partitions is not None 81 | assert_geodataframe_equal(result.compute(), df) 82 | 83 | 84 | def test_roundtrip_multiple_geometry_columns(tmp_path, naturalearth_lowres): 85 | # basic roundtrip with different geometry column name 86 | df = geopandas.read_file(naturalearth_lowres) 87 | df["geometry2"] = df.geometry.representative_point().to_crs("EPSG:3857") 88 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 89 | 90 | basedir = tmp_path / "dataset" 91 | ddf.to_parquet(basedir) 92 | 93 | result = dask_geopandas.read_parquet(basedir) 94 | assert isinstance(result, dask_geopandas.GeoDataFrame) 95 | assert result.crs == df.crs 96 | assert result.spatial_partitions is not None 97 | assert_geodataframe_equal(result.compute(), df) 98 | 99 | # ensure the geometry2 column is also considered as geometry in meta 100 | assert_series_equal(result.dtypes, df.dtypes) 101 | assert isinstance(result["geometry2"], dask_geopandas.GeoSeries) 102 | assert result["geometry"].crs == "EPSG:4326" 103 | assert result["geometry2"].crs == "EPSG:3857" 104 | 105 | 106 | def test_column_selection_push_down(tmp_path, naturalearth_lowres): 107 | # set up dataset 108 | df = geopandas.read_file(naturalearth_lowres) 109 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 110 | basedir = tmp_path / "dataset" 111 | ddf.to_parquet(basedir) 112 | 113 | ddf = dask_geopandas.read_parquet(basedir) 114 | 115 | # selecting columns including geometry column still gives GeoDataFrame 116 | ddf_subset = ddf[["pop_est", "geometry"]] 117 | 
assert type(ddf_subset) is dask_geopandas.GeoDataFrame 118 | # and also preserves the spatial partitioning information 119 | assert ddf_subset.spatial_partitions is not None 120 | 121 | # selecting a single non-geometry column on the dataframe should work 122 | s = ddf["pop_est"] 123 | assert type(s) is dd.Series 124 | assert s.max().compute() == df["pop_est"].max() 125 | 126 | 127 | def test_parquet_roundtrip_s3(s3_resource, s3_storage_options, naturalearth_lowres): 128 | fs, endpoint_url = s3_resource 129 | 130 | # basic roundtrip 131 | df = geopandas.read_file(naturalearth_lowres) 132 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 133 | 134 | uri = "s3://geopandas-test/dataset.parquet" 135 | ddf.to_parquet(uri, storage_options=s3_storage_options) 136 | 137 | # reading back gives identical GeoDataFrame 138 | result = dask_geopandas.read_parquet(uri, storage_options=s3_storage_options) 139 | assert result.npartitions == 4 140 | assert_geodataframe_equal(result.compute(), df) 141 | # reading back correctly sets the CRS in meta 142 | assert result.crs == df.crs 143 | # reading back also populates the spatial partitioning property 144 | assert result.spatial_partitions is not None 145 | 146 | 147 | def test_parquet_empty_partitions(tmp_path, naturalearth_lowres): 148 | df = geopandas.read_file(naturalearth_lowres) 149 | # Creating filtered dask dataframe with at least one empty partition 150 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 151 | ddf_filtered = ddf[ddf["pop_est"] > 1_000_000_000] 152 | assert (ddf_filtered.map_partitions(len).compute() == 0).any() 153 | 154 | basedir = tmp_path / "dataset" 155 | # TODO don't write metadata file as that fails with empty partitions on 156 | # inferring the schema 157 | ddf_filtered.to_parquet(basedir, write_metadata_file=False) 158 | 159 | result = dask_geopandas.read_parquet(basedir) 160 | assert_geodataframe_equal(result.compute(), df[df["pop_est"] > 1_000_000_000]) 161 | # once one partition has no spatial extent, we don't restore the spatial partitions 162 | assert result.spatial_partitions is None 163 | 164 | 165 | def test_parquet_partitions_with_all_missing_strings(tmp_path): 166 | df = geopandas.GeoDataFrame( 167 | {"col": ["a", "b", None, None]}, 168 | geometry=geopandas.points_from_xy([0, 1, 2, 3], [0, 1, 2, 3]), 169 | ) 170 | # Creating filtered dask dataframe with at least one empty partition 171 | ddf = dask_geopandas.from_geopandas(df, npartitions=2) 172 | 173 | basedir = tmp_path / "dataset" 174 | ddf.to_parquet(basedir) 175 | 176 | result = dask_geopandas.read_parquet(basedir) 177 | assert_geodataframe_equal(result.compute(), df) 178 | 179 | 180 | def test_parquet_empty_dataset(tmp_path): 181 | # ensure informative error message if there are no parts (otherwise 182 | # will raise in not finding any geo metadata) 183 | with pytest.raises(ValueError, match="No dataset parts discovered"): 184 | dask_geopandas.read_parquet(tmp_path / "data.*.parquet") 185 | 186 | 187 | @pytest.mark.parametrize("write_metadata_file", [True, False]) 188 | def test_parquet_partition_on(tmp_path, naturalearth_lowres, write_metadata_file): 189 | df = geopandas.read_file(naturalearth_lowres) 190 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 191 | 192 | # Writing a partitioned dataset based on one of the attribute columns 193 | basedir = tmp_path / "naturalearth_lowres_by_continent.parquet" 194 | ddf.to_parquet( 195 | basedir, partition_on="continent", write_metadata_file=write_metadata_file 196 | ) 197 | 198 | # Check for one 
of the partitions that the file is present and is correct 199 | n_files = 10 if write_metadata_file else 8 # 8 continents + 2 metadata files 200 | assert len(list(basedir.iterdir())) == n_files 201 | assert (basedir / "continent=Africa").exists() 202 | result_africa = geopandas.read_parquet(basedir / "continent=Africa") 203 | expected = df[df["continent"] == "Africa"].drop(columns=["continent"]) 204 | result_africa.index.name = None 205 | assert_geodataframe_equal(result_africa, expected) 206 | 207 | # Check roundtrip 208 | result = dask_geopandas.read_parquet(basedir) 209 | assert result.npartitions >= 8 210 | assert result.spatial_partitions is not None 211 | expected = df.copy() 212 | expected["continent"] = expected["continent"].astype("category") 213 | assert_geodataframe_equal(result.compute(), expected, check_like=True) 214 | 215 | 216 | def test_no_gather_spatial_partitions(tmp_path, naturalearth_lowres): 217 | # basic roundtrip 218 | df = geopandas.read_file(naturalearth_lowres) 219 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 220 | 221 | basedir = tmp_path / "dataset" 222 | ddf.to_parquet(basedir) 223 | 224 | result = dask_geopandas.read_parquet(basedir, gather_spatial_partitions=False) 225 | assert result.spatial_partitions is None 226 | assert result.crs == df.crs 227 | 228 | 229 | def test_read_parquet_default_crs(tmp_path): 230 | pyproj = pytest.importorskip("pyproj") 231 | import pyarrow.parquet as pq 232 | 233 | from geopandas.io.arrow import _geopandas_to_arrow 234 | 235 | gdf = geopandas.GeoDataFrame(geometry=[shapely.box(0, 0, 10, 10)]) 236 | gdf["other_geom"] = gdf["geometry"].centroid 237 | table = _geopandas_to_arrow(gdf) 238 | # update the geo metadata to strip 'crs' entry 239 | metadata = table.schema.metadata 240 | geo_metadata = json.loads(metadata[b"geo"].decode("utf-8")) 241 | del geo_metadata["columns"]["geometry"]["crs"] 242 | del geo_metadata["columns"]["other_geom"]["crs"] 243 | metadata.update({b"geo": json.dumps(geo_metadata).encode("utf-8")}) 244 | table = table.replace_schema_metadata(metadata) 245 | filename = str(tmp_path / "test.parquet") 246 | pq.write_table(table, filename) 247 | 248 | result = dask_geopandas.read_parquet(filename) 249 | assert result.crs.equals(pyproj.CRS("OGC:CRS84")) 250 | assert result["other_geom"].crs.equals(pyproj.CRS("OGC:CRS84")) 251 | 252 | 253 | def test_read_parquet_meta_is_empty(tmp_path, naturalearth_lowres): 254 | # basic roundtrip 255 | df = geopandas.read_file(naturalearth_lowres) 256 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 257 | 258 | basedir = tmp_path / "dataset" 259 | ddf.to_parquet(basedir) 260 | 261 | result = dask_geopandas.read_parquet(basedir) 262 | assert len(result._meta) == 0 263 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_clip.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | 3 | import dask_geopandas 4 | 5 | from .test_core import geodf_points # noqa: F401 6 | 7 | import pytest 8 | from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal 9 | 10 | 11 | def test_clip(naturalearth_lowres, naturalearth_cities): 12 | cities = geopandas.read_file(naturalearth_cities) 13 | dask_obj = dask_geopandas.from_geopandas(cities, npartitions=4) 14 | dask_obj.calculate_spatial_partitions() 15 | mask = geopandas.read_file(naturalearth_lowres).query("continent == 'Africa'") 16 | expected = geopandas.clip(cities, mask) 17 | clipped = 
dask_geopandas.clip(dask_obj, mask) 18 | 19 | assert isinstance(clipped.spatial_partitions, geopandas.GeoSeries) 20 | 21 | result = clipped.compute() 22 | assert_geodataframe_equal(expected.sort_index(), result.sort_index()) 23 | 24 | 25 | def test_clip_no_spatial_partitions(geodf_points): # noqa: F811 26 | dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2) 27 | mask = geodf_points.iloc[:1] 28 | mask["geometry"] = mask["geometry"].buffer(2) 29 | expected = geodf_points.iloc[:2] 30 | result = dask_geopandas.clip(dask_obj, mask).compute() 31 | assert_geodataframe_equal(expected, result) 32 | 33 | 34 | def test_clip_dask_mask(geodf_points): # noqa: F811 35 | dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2) 36 | mask = dask_geopandas.from_geopandas(geodf_points.iloc[:1], npartitions=1) 37 | with pytest.raises( 38 | NotImplementedError, match=r"Mask cannot be a Dask GeoDataFrame or GeoSeries." 39 | ): 40 | dask_geopandas.clip(dask_obj, mask) 41 | 42 | 43 | def test_clip_geoseries(geodf_points): # noqa: F811 44 | dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2) 45 | dask_obj.calculate_spatial_partitions() 46 | mask = geodf_points.iloc[:1] 47 | mask["geometry"] = mask["geometry"].buffer(2) 48 | expected = geopandas.clip(geodf_points.geometry, mask) 49 | result = dask_geopandas.clip(dask_obj.geometry, mask).compute() 50 | assert_geoseries_equal(expected, result) 51 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | from packaging.version import Version 2 | 3 | import geopandas 4 | 5 | import dask_geopandas 6 | 7 | import pytest 8 | from geopandas.testing import assert_geodataframe_equal 9 | 10 | distributed = pytest.importorskip("distributed") 11 | 12 | 13 | from distributed import Client, LocalCluster 14 | 15 | 16 | @pytest.mark.skipif( 17 | Version(distributed.__version__) < Version("2024.6.0"), 18 | reason="distributed < 2024.6 has a wrong assertion", 19 | # https://github.com/dask/distributed/pull/8667 20 | ) 21 | @pytest.mark.skipif( 22 | Version(distributed.__version__) < Version("0.13"), 23 | reason="geopandas < 0.13 does not implement sorting geometries", 24 | ) 25 | def test_spatial_shuffle(naturalearth_cities): 26 | df_points = geopandas.read_file(naturalearth_cities) 27 | 28 | with LocalCluster(n_workers=1) as cluster: 29 | with Client(cluster): 30 | ddf_points = dask_geopandas.from_geopandas(df_points, npartitions=4) 31 | 32 | ddf_result = ddf_points.spatial_shuffle( 33 | by="hilbert", calculate_partitions=False 34 | ) 35 | result = ddf_result.compute() 36 | 37 | expected = df_points.sort_values("geometry").reset_index(drop=True) 38 | assert_geodataframe_equal(result.reset_index(drop=True), expected) 39 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_geohash.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import geopandas 5 | from shapely.geometry import LineString, Point, Polygon 6 | from shapely.wkt import loads 7 | 8 | from dask_geopandas import from_geopandas 9 | from dask_geopandas.geohash import _calculate_mid_points 10 | 11 | import pytest 12 | from numpy.testing import assert_array_equal 13 | from pandas.testing import assert_index_equal 14 | 15 | 16 | @pytest.fixture 17 | def geoseries_points(): 18 | p1 = 
Point(1, 2) 19 | p2 = Point(2, 3) 20 | p3 = Point(3, 4) 21 | p4 = Point(4, 1) 22 | return geopandas.GeoSeries([p1, p2, p3, p4]) 23 | 24 | 25 | @pytest.fixture 26 | def geoseries_lines(): 27 | l1 = LineString([(0, 0), (0, 1), (1, 1)]) 28 | l2 = LineString([(0, 0), (1, 0), (1, 1), (0, 1)]) 29 | return geopandas.GeoSeries([l1, l2] * 2) 30 | 31 | 32 | @pytest.fixture() 33 | def geoseries_polygons(): 34 | t1 = Polygon([(0, 3.5), (7, 2.4), (1, 0.1)]) 35 | t2 = Polygon([(0, 0), (1, 1), (0, 1)]) 36 | sq1 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) 37 | sq2 = Polygon([(0, 0), (1, 0), (1, 2), (0, 2)]) 38 | return geopandas.GeoSeries([t1, t2, sq1, sq2]) 39 | 40 | 41 | def geohash_dask(geoseries): 42 | pygeohash = pytest.importorskip("pygeohash") 43 | 44 | p = 12 45 | as_string = True 46 | bounds = geoseries.bounds.to_numpy() 47 | x_mids, y_mids = _calculate_mid_points(bounds) 48 | 49 | geohash_vec = np.vectorize(pygeohash.encode) 50 | # Encode mid points of geometries using geohash 51 | expected = geohash_vec(y_mids, x_mids, p) 52 | 53 | ddf = from_geopandas(geoseries, npartitions=1) 54 | result = ddf.geohash(precision=p, as_string=as_string).compute() 55 | 56 | assert_array_equal(np.array(result), expected) 57 | assert isinstance(result, pd.Series) 58 | assert_index_equal(ddf.index.compute(), result.index) 59 | 60 | 61 | def test_geohash_points(geoseries_points): 62 | geohash_dask(geoseries_points) 63 | 64 | 65 | def test_geohash_lines(geoseries_lines): 66 | geohash_dask(geoseries_lines) 67 | 68 | 69 | def test_geohash_polygons(geoseries_polygons): 70 | geohash_dask(geoseries_polygons) 71 | 72 | 73 | def test_geohash_range(geoseries_points): 74 | 75 | ddf = from_geopandas(geoseries_points, npartitions=1) 76 | 77 | with pytest.raises(ValueError): 78 | ddf.geohash(precision=0, as_string=False) 79 | ddf.geohash(precision=12, as_string=False) 80 | 81 | 82 | def test_world(naturalearth_lowres): 83 | # world without Fiji 84 | geohash_dask(geopandas.read_file(naturalearth_lowres).iloc[1:]) 85 | 86 | 87 | @pytest.mark.parametrize( 88 | "empty", 89 | [ 90 | None, 91 | loads("POLYGON EMPTY"), 92 | ], 93 | ) 94 | def test_empty(geoseries_polygons, empty): 95 | s = geoseries_polygons 96 | s.iloc[-1] = empty 97 | dask_obj = from_geopandas(s, npartitions=2) 98 | with pytest.raises( 99 | ValueError, match="cannot be computed on a GeoSeries with empty" 100 | ): 101 | dask_obj.geohash().compute() 102 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_hilbert_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import geopandas 5 | from shapely.geometry import LineString, Point, Polygon 6 | from shapely.wkt import loads 7 | 8 | from dask_geopandas import from_geopandas 9 | from dask_geopandas.hilbert_distance import ( 10 | _continuous_to_discrete_coords, 11 | _hilbert_distance, 12 | ) 13 | 14 | import pytest 15 | from pandas.testing import assert_index_equal, assert_series_equal 16 | 17 | 18 | def test_hilbert_distance(): 19 | # test the actual Hilbert Code algorithm against some hardcoded values 20 | geoms = geopandas.GeoSeries.from_wkt( 21 | [ 22 | "POINT (0 0)", 23 | "POINT (1 1)", 24 | "POINT (1 0)", 25 | "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", 26 | ] 27 | ) 28 | result = _hilbert_distance(geoms, total_bounds=(0, 0, 1, 1), level=2) 29 | assert result.tolist() == [0, 10, 15, 2] 30 | 31 | result = _hilbert_distance(geoms, total_bounds=(0, 0, 1, 1), level=3) 32 | 
assert result.tolist() == [0, 42, 63, 10] 33 | 34 | result = _hilbert_distance(geoms, total_bounds=(0, 0, 1, 1), level=16) 35 | assert result.tolist() == [0, 2863311530, 4294967295, 715827882] 36 | 37 | 38 | @pytest.fixture 39 | def geoseries_points(): 40 | p1 = Point(1, 2) 41 | p2 = Point(2, 3) 42 | p3 = Point(3, 4) 43 | p4 = Point(4, 1) 44 | return geopandas.GeoSeries([p1, p2, p3, p4]) 45 | 46 | 47 | @pytest.fixture 48 | def geoseries_lines(): 49 | l1 = LineString([(0, 0), (0, 1), (1, 1)]) 50 | l2 = LineString([(0, 0), (1, 0), (1, 1), (0, 1)]) 51 | return geopandas.GeoSeries([l1, l2] * 2) 52 | 53 | 54 | @pytest.fixture() 55 | def geoseries_polygons(): 56 | t1 = Polygon([(0, 3.5), (7, 2.4), (1, 0.1)]) 57 | t2 = Polygon([(0, 0), (1, 1), (0, 1)]) 58 | sq1 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) 59 | sq2 = Polygon([(0, 0), (1, 0), (1, 2), (0, 2)]) 60 | return geopandas.GeoSeries([t1, t2, sq1, sq2]) 61 | 62 | 63 | def hilbert_distance_dask(geoseries, level=16): 64 | pytest.importorskip("hilbertcurve") 65 | from hilbertcurve.hilbertcurve import HilbertCurve 66 | 67 | bounds = geoseries.bounds.to_numpy() 68 | total_bounds = geoseries.total_bounds 69 | x, y = _continuous_to_discrete_coords( 70 | bounds, level=level, total_bounds=total_bounds 71 | ) 72 | coords = np.stack((x, y), axis=1) 73 | 74 | hilbert_curve = HilbertCurve(p=level, n=2) 75 | expected = hilbert_curve.distances_from_points(coords) 76 | 77 | ddf = from_geopandas(geoseries, npartitions=1) 78 | result = ddf.hilbert_distance(level=level).compute() 79 | 80 | assert list(result) == expected 81 | assert isinstance(result, pd.Series) 82 | assert_index_equal(ddf.index.compute(), result.index) 83 | 84 | 85 | @pytest.mark.parametrize("level", [2, 10, 15, 16]) 86 | def test_hilbert_distance_points(geoseries_points, level): 87 | hilbert_distance_dask(geoseries_points, level) 88 | 89 | 90 | @pytest.mark.parametrize("level", [2, 10, 15, 16]) 91 | def test_hilbert_distance_lines(geoseries_lines, level): 92 | hilbert_distance_dask(geoseries_lines, level) 93 | 94 | 95 | @pytest.mark.parametrize("level", [2, 10, 15, 16]) 96 | def test_hilbert_distance_polygons(geoseries_polygons, level): 97 | hilbert_distance_dask(geoseries_polygons, level) 98 | 99 | 100 | def test_hilbert_distance_level(geoseries_points): 101 | ddf = from_geopandas(geoseries_points, npartitions=1) 102 | with pytest.raises(ValueError): 103 | ddf.hilbert_distance(level=20).compute() 104 | 105 | 106 | def test_specified_total_bounds(geoseries_polygons): 107 | ddf = from_geopandas(geoseries_polygons, npartitions=2) 108 | 109 | result = ddf.hilbert_distance(total_bounds=geoseries_polygons.total_bounds) 110 | expected = ddf.hilbert_distance() 111 | assert_series_equal(result.compute(), expected.compute()) 112 | 113 | 114 | def test_total_bounds_from_partitions(geoseries_polygons): 115 | ddf = from_geopandas(geoseries_polygons, npartitions=2) 116 | expected = ddf.hilbert_distance().compute() 117 | 118 | ddf.calculate_spatial_partitions() 119 | result = ddf.hilbert_distance().compute() 120 | assert_series_equal(result, expected) 121 | 122 | 123 | def test_world(naturalearth_lowres): 124 | # world without Fiji 125 | hilbert_distance_dask(geopandas.read_file(naturalearth_lowres).iloc[1:]) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "empty", 130 | [ 131 | None, 132 | loads("POLYGON EMPTY"), 133 | ], 134 | ) 135 | def test_empty(geoseries_polygons, empty): 136 | s = geoseries_polygons 137 | s.iloc[-1] = empty 138 | dask_obj = from_geopandas(s, npartitions=2) 139 | with 
pytest.raises( 140 | ValueError, match="cannot be computed on a GeoSeries with empty" 141 | ): 142 | dask_obj.hilbert_distance().compute() 143 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_morton_distance.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import geopandas 4 | from shapely.geometry import LineString, Point, Polygon 5 | from shapely.wkt import loads 6 | 7 | from dask_geopandas import from_geopandas 8 | from dask_geopandas.hilbert_distance import _continuous_to_discrete_coords 9 | 10 | import pytest 11 | from pandas.testing import assert_index_equal, assert_series_equal 12 | 13 | 14 | @pytest.fixture 15 | def geoseries_points(): 16 | p1 = Point(1, 2) 17 | p2 = Point(2, 3) 18 | p3 = Point(3, 4) 19 | p4 = Point(4, 1) 20 | return geopandas.GeoSeries([p1, p2, p3, p4]) 21 | 22 | 23 | @pytest.fixture 24 | def geoseries_lines(): 25 | l1 = LineString([(0, 0), (0, 1), (1, 1)]) 26 | l2 = LineString([(0, 0), (1, 0), (1, 1), (0, 1)]) 27 | return geopandas.GeoSeries([l1, l2] * 2) 28 | 29 | 30 | @pytest.fixture() 31 | def geoseries_polygons(): 32 | t1 = Polygon([(0, 3.5), (7, 2.4), (1, 0.1)]) 33 | t2 = Polygon([(0, 0), (1, 1), (0, 1)]) 34 | sq1 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) 35 | sq2 = Polygon([(0, 0), (1, 0), (1, 2), (0, 2)]) 36 | return geopandas.GeoSeries([t1, t2, sq1, sq2]) 37 | 38 | 39 | def morton_distance_dask(geoseries): 40 | # https://github.com/trevorprater/pymorton 41 | pymorton = pytest.importorskip("pymorton") 42 | 43 | bounds = geoseries.bounds.to_numpy() 44 | total_bounds = geoseries.total_bounds 45 | x_coords, y_coords = _continuous_to_discrete_coords( 46 | bounds, level=16, total_bounds=total_bounds 47 | ) 48 | 49 | ddf = from_geopandas(geoseries, npartitions=1) 50 | result = ddf.morton_distance().compute() 51 | 52 | expected = [] 53 | 54 | for i in range(len(x_coords)): 55 | x = int(x_coords[i]) 56 | y = int(y_coords[i]) 57 | expected.append(pymorton.interleave(x, y)) 58 | 59 | assert list(result) == expected 60 | assert isinstance(result, pd.Series) 61 | assert_index_equal(ddf.index.compute(), result.index) 62 | 63 | 64 | def test_morton_distance_points(geoseries_points): 65 | morton_distance_dask(geoseries_points) 66 | 67 | 68 | def test_morton_distance_lines(geoseries_lines): 69 | morton_distance_dask(geoseries_lines) 70 | 71 | 72 | def test_morton_distance_polygons(geoseries_polygons): 73 | morton_distance_dask(geoseries_polygons) 74 | 75 | 76 | def test_specified_total_bounds(geoseries_polygons): 77 | ddf = from_geopandas(geoseries_polygons, npartitions=2) 78 | 79 | result = ddf.morton_distance(total_bounds=geoseries_polygons.total_bounds) 80 | expected = ddf.morton_distance() 81 | assert_series_equal(result.compute(), expected.compute()) 82 | 83 | 84 | def test_total_bounds_from_partitions(geoseries_polygons): 85 | ddf = from_geopandas(geoseries_polygons, npartitions=2) 86 | expected = ddf.morton_distance().compute() 87 | 88 | ddf.calculate_spatial_partitions() 89 | result = ddf.morton_distance().compute() 90 | assert_series_equal(result, expected) 91 | 92 | 93 | def test_world(naturalearth_lowres): 94 | # world without Fiji 95 | morton_distance_dask(geopandas.read_file(naturalearth_lowres).iloc[1:]) 96 | 97 | 98 | @pytest.mark.parametrize( 99 | "empty", 100 | [ 101 | None, 102 | loads("POLYGON EMPTY"), 103 | ], 104 | ) 105 | def test_empty(geoseries_polygons, empty): 106 | s = geoseries_polygons 107 | s.iloc[-1] = empty 108 
| dask_obj = from_geopandas(s, npartitions=2) 109 | with pytest.raises( 110 | ValueError, match="cannot be computed on a GeoSeries with empty" 111 | ): 112 | dask_obj.morton_distance().compute() 113 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_sjoin.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | import shapely 3 | 4 | import dask_geopandas 5 | 6 | import pytest 7 | from geopandas.testing import assert_geodataframe_equal 8 | 9 | 10 | def test_sjoin_dask_geopandas(naturalearth_lowres, naturalearth_cities): 11 | df_points = geopandas.read_file(naturalearth_cities) 12 | ddf_points = dask_geopandas.from_geopandas(df_points, npartitions=4) 13 | 14 | df_polygons = geopandas.read_file(naturalearth_lowres) 15 | ddf_polygons = dask_geopandas.from_geopandas(df_polygons, npartitions=4) 16 | 17 | expected = geopandas.sjoin(df_points, df_polygons, predicate="within", how="inner") 18 | expected = expected.sort_index() 19 | 20 | # dask / geopandas 21 | result = dask_geopandas.sjoin( 22 | ddf_points, df_polygons, predicate="within", how="inner" 23 | ) 24 | assert_geodataframe_equal(expected, result.compute().sort_index()) 25 | 26 | # geopandas / dask 27 | result = dask_geopandas.sjoin( 28 | df_points, ddf_polygons, predicate="within", how="inner" 29 | ) 30 | assert_geodataframe_equal(expected, result.compute().sort_index()) 31 | 32 | # dask / dask 33 | result = dask_geopandas.sjoin( 34 | ddf_points, ddf_polygons, predicate="within", how="inner" 35 | ) 36 | assert_geodataframe_equal(expected, result.compute().sort_index()) 37 | 38 | # with spatial_partitions 39 | ddf_points.calculate_spatial_partitions() 40 | ddf_polygons.calculate_spatial_partitions() 41 | result = dask_geopandas.sjoin( 42 | ddf_points, ddf_polygons, predicate="within", how="inner" 43 | ) 44 | assert isinstance(result.spatial_partitions, geopandas.GeoSeries) 45 | assert_geodataframe_equal(expected, result.compute().sort_index()) 46 | 47 | # check warning 48 | with pytest.warns(FutureWarning, match="The `op` parameter is deprecated"): 49 | dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner") 50 | 51 | 52 | def test_no_value_error(): 53 | # https://github.com/geopandas/dask-geopandas/issues/303 54 | shape = shapely.geometry.box(-74.5, -74.0, 4.5, 5.0) 55 | df = dask_geopandas.from_geopandas( 56 | geopandas.GeoDataFrame(geometry=[shape]), npartitions=1 57 | ).spatial_shuffle() 58 | # no TypeError 59 | df.sjoin(df).compute() 60 | -------------------------------------------------------------------------------- /dask_geopandas/tests/test_spatial_partitioning.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | 3 | import dask_geopandas 4 | 5 | import pytest 6 | from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal 7 | 8 | 9 | def test_propagate_on_geometry_access(naturalearth_lowres): 10 | # ensure the spatial_partitioning information is preserved in GeoSeries 11 | df = geopandas.read_file(naturalearth_lowres) 12 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 13 | ddf.calculate_spatial_partitions() 14 | spatial_partitions = ddf.spatial_partitions.copy() 15 | 16 | # geometry attribute 17 | gs = ddf.geometry 18 | assert gs.spatial_partitions is not None 19 | assert_geoseries_equal(gs.spatial_partitions, spatial_partitions) 20 | 21 | # column access 22 | gs = ddf["geometry"] 23 | assert gs.spatial_partitions is 
not None 24 | assert_geoseries_equal(gs.spatial_partitions, spatial_partitions) 25 | 26 | # subset geodataframe 27 | subset = ddf[["continent", "geometry"]] 28 | assert subset.spatial_partitions is not None 29 | assert_geoseries_equal(subset.spatial_partitions, spatial_partitions) 30 | 31 | 32 | @pytest.mark.parametrize( 33 | "attr", ["boundary", "centroid", "convex_hull", "envelope", "exterior"] 34 | ) 35 | def test_propagate_geoseries_properties(naturalearth_lowres, attr): 36 | df = geopandas.read_file(naturalearth_lowres) 37 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 38 | ddf.calculate_spatial_partitions() 39 | spatial_partitions = ddf.spatial_partitions.copy() 40 | 41 | result = getattr(ddf, attr) 42 | assert result.spatial_partitions is not None 43 | assert_geoseries_equal(result.spatial_partitions, spatial_partitions) 44 | assert_geoseries_equal(result.compute(), getattr(df, attr)) 45 | 46 | 47 | def test_cx(naturalearth_lowres): 48 | # test cx using spatial partitions 49 | df = geopandas.read_file(naturalearth_lowres) 50 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 51 | ddf.calculate_spatial_partitions() 52 | 53 | subset = ddf.cx[-180:-70, 0:-80] 54 | assert len(subset) == 8 55 | expected = df.cx[-180:-70, 0:-80] 56 | assert_geodataframe_equal(subset.compute(), expected) 57 | 58 | # empty slice 59 | subset = ddf.cx[-200:-190, 300:400] 60 | assert len(subset) == 0 61 | expected = df.cx[-200:-190, 300:400] 62 | assert_geodataframe_equal(subset.compute(), expected) 63 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | geopandas>=0.10 2 | numpydoc==1.1.0 3 | sphinx-book-theme 4 | myst-nb 5 | myst-parser 6 | sphinx_copybutton 7 | sphinx 8 | matplotlib 9 | -------------------------------------------------------------------------------- /doc/source/_static/binary_geo-difference.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_geo-intersection.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_geo-symm_diff.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_geo-union.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_op-01.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_op-02.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/binary_op-03.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* colors */ 2 | 3 | :root { 4 | --pst-color-primary: 19, 156, 90; 5 | --pst-color-active-navigation: 19, 156, 90; 6 | --pst-color-h2: var(--color-text-base); 7 | --pst-color-link: 19, 156, 90; 8 | } -------------------------------------------------------------------------------- /doc/source/api.rst: -------------------------------------------------------------------------------- 1 | .. _reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | The API Reference provides an overview of all public objects, functions and methods implemented in Dask-GeoPandas. 7 | 8 | .. warning:: 9 | Some docstrings are taken directly from GeoPandas or Dask. Some inconsistencies with the Dask-GeoPandas version may exist. 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | GeoSeries 15 | GeoDataFrame 16 | Input/output 17 | Tools 18 | 19 | -------------------------------------------------------------------------------- /doc/source/changelog.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../../CHANGELOG.md 2 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import dask_geopandas # noqa 17 | 18 | autodoc_mock_imports = [ 19 | "shapely", 20 | "dask", 21 | ] 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = "dask-geopandas" 26 | copyright = "2020-, GeoPandas development team" 27 | author = "GeoPandas development team" 28 | 29 | # The full version, including alpha/beta/rc tags 30 | release = version = dask_geopandas.__version__ 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = [ 39 | "sphinx.ext.autodoc", 40 | "numpydoc", 41 | "sphinx.ext.autosummary", 42 | "myst_nb", 43 | "sphinx_copybutton", 44 | ] 45 | 46 | numpydoc_show_class_members = False 47 | autosummary_generate = True 48 | jupyter_execute_notebooks = "auto" 49 | execution_excludepatterns = [ 50 | "basic-intro.ipynb", 51 | "dissolve.ipynb", 52 | "spatial-partitioning.ipynb", 53 | ] 54 | 55 | 56 | def setup(app): 57 | app.add_css_file("custom.css") # may also be an URL 58 | 59 | 60 | # Add any paths that contain templates here, relative to this directory. 61 | templates_path = ["_templates"] 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns = [] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | 71 | # The theme to use for HTML and HTML Help pages. See the documentation for 72 | # a list of builtin themes. 73 | # 74 | html_theme = "sphinx_book_theme" 75 | 76 | html_theme_options = { 77 | "repository_url": "https://github.com/geopandas/dask-geopandas", 78 | "use_repository_button": True, 79 | "use_fullscreen_button": False, 80 | } 81 | html_title = "dask-geopandas" 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | -------------------------------------------------------------------------------- /doc/source/docs/reference/geodataframe.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | GeoDataFrame 3 | ============ 4 | .. 
currentmodule:: dask_geopandas 5 | 6 | A ``GeoDataFrame`` is a tabular data structure that contains a column 7 | which stores geometries (a ``GeoSeries``). 8 | 9 | Constructor 10 | ----------- 11 | .. autosummary:: 12 | :toctree: api/ 13 | 14 | GeoDataFrame 15 | 16 | Serialization / IO / conversion 17 | ------------------------------- 18 | 19 | .. autosummary:: 20 | :toctree: api/ 21 | 22 | GeoDataFrame.to_dask_dataframe 23 | GeoDataFrame.to_feather 24 | GeoDataFrame.to_parquet 25 | GeoDataFrame.to_wkb 26 | GeoDataFrame.to_wkt 27 | 28 | Projection handling 29 | ------------------- 30 | 31 | .. autosummary:: 32 | :toctree: api/ 33 | 34 | GeoDataFrame.crs 35 | GeoDataFrame.set_crs 36 | GeoDataFrame.to_crs 37 | 38 | Active geometry handling 39 | ------------------------ 40 | 41 | .. autosummary:: 42 | :toctree: api/ 43 | 44 | GeoDataFrame.set_geometry 45 | GeoDataFrame.rename_geometry 46 | 47 | Aggregating and exploding 48 | ------------------------- 49 | 50 | .. autosummary:: 51 | :toctree: api/ 52 | 53 | GeoDataFrame.explode 54 | GeoDataFrame.dissolve 55 | 56 | Spatial joins 57 | ------------- 58 | 59 | .. autosummary:: 60 | :toctree: api/ 61 | 62 | GeoDataFrame.sjoin 63 | 64 | Overlay operations 65 | ------------------ 66 | 67 | .. autosummary:: 68 | :toctree: api/ 69 | 70 | GeoDataFrame.clip 71 | 72 | Indexing 73 | -------- 74 | 75 | .. autosummary:: 76 | :toctree: api/ 77 | 78 | GeoDataFrame.cx 79 | 80 | Spatial partitioning 81 | -------------------- 82 | 83 | .. autosummary:: 84 | :toctree: api/ 85 | 86 | GeoDataFrame.spatial_shuffle 87 | 88 | 89 | All dask ``DataFrame`` methods are also available, although they may 90 | not operate in a meaningful way on the ``geometry`` column. All methods 91 | listed in `GeoSeries `__ work directly on an active geometry column of GeoDataFrame. 92 | 93 | -------------------------------------------------------------------------------- /doc/source/docs/reference/geoseries.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | GeoSeries 3 | ========= 4 | .. currentmodule:: dask_geopandas 5 | 6 | Constructor 7 | ----------- 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | GeoSeries 12 | 13 | General methods and attributes 14 | ------------------------------ 15 | 16 | .. autosummary:: 17 | :toctree: api/ 18 | 19 | GeoSeries.area 20 | GeoSeries.boundary 21 | GeoSeries.bounds 22 | GeoSeries.total_bounds 23 | GeoSeries.length 24 | GeoSeries.geom_type 25 | GeoSeries.distance 26 | GeoSeries.representative_point 27 | GeoSeries.exterior 28 | GeoSeries.interiors 29 | GeoSeries.x 30 | GeoSeries.y 31 | GeoSeries.z 32 | 33 | Unary predicates 34 | ---------------- 35 | 36 | .. autosummary:: 37 | :toctree: api/ 38 | 39 | GeoSeries.is_empty 40 | GeoSeries.is_ring 41 | GeoSeries.is_simple 42 | GeoSeries.is_valid 43 | GeoSeries.has_z 44 | 45 | 46 | Binary Predicates 47 | ----------------- 48 | 49 | .. autosummary:: 50 | :toctree: api/ 51 | 52 | GeoSeries.contains 53 | GeoSeries.crosses 54 | GeoSeries.disjoint 55 | GeoSeries.geom_equals 56 | GeoSeries.geom_equals_exact 57 | GeoSeries.intersects 58 | GeoSeries.overlaps 59 | GeoSeries.touches 60 | GeoSeries.within 61 | GeoSeries.covers 62 | GeoSeries.covered_by 63 | 64 | 65 | Set-theoretic Methods 66 | --------------------- 67 | 68 | .. 
autosummary:: 69 | :toctree: api/ 70 | 71 | GeoSeries.difference 72 | GeoSeries.intersection 73 | GeoSeries.symmetric_difference 74 | GeoSeries.union 75 | 76 | Constructive Methods and Attributes 77 | ----------------------------------- 78 | 79 | .. autosummary:: 80 | :toctree: api/ 81 | 82 | GeoSeries.buffer 83 | GeoSeries.boundary 84 | GeoSeries.centroid 85 | GeoSeries.convex_hull 86 | GeoSeries.envelope 87 | GeoSeries.simplify 88 | 89 | Affine transformations 90 | ---------------------- 91 | 92 | .. autosummary:: 93 | :toctree: api/ 94 | 95 | GeoSeries.affine_transform 96 | GeoSeries.rotate 97 | GeoSeries.scale 98 | GeoSeries.skew 99 | GeoSeries.translate 100 | 101 | Aggregating and exploding 102 | ------------------------- 103 | 104 | .. autosummary:: 105 | :toctree: api/ 106 | 107 | GeoSeries.unary_union 108 | GeoSeries.explode 109 | 110 | Serialization / IO / conversion 111 | ------------------------------- 112 | 113 | .. autosummary:: 114 | :toctree: api/ 115 | 116 | GeoSeries.to_wkb 117 | GeoSeries.to_wkt 118 | 119 | Projection handling 120 | ------------------- 121 | 122 | .. autosummary:: 123 | :toctree: api/ 124 | 125 | GeoSeries.crs 126 | GeoSeries.set_crs 127 | GeoSeries.to_crs 128 | 129 | Missing values 130 | -------------- 131 | 132 | .. autosummary:: 133 | :toctree: api/ 134 | 135 | GeoSeries.fillna 136 | GeoSeries.isna 137 | 138 | Overlay operations 139 | ------------------ 140 | 141 | .. autosummary:: 142 | :toctree: api/ 143 | 144 | GeoSeries.clip 145 | 146 | Indexing 147 | -------- 148 | 149 | .. autosummary:: 150 | :toctree: api/ 151 | 152 | GeoSeries.cx 153 | 154 | Spatial partitioning 155 | -------------------- 156 | 157 | .. autosummary:: 158 | :toctree: api/ 159 | 160 | GeoSeries.calculate_spatial_partitions 161 | GeoSeries.hilbert_distance 162 | GeoSeries.morton_distance 163 | GeoSeries.geohash 164 | 165 | 166 | 167 | Methods of dask ``Series`` objects are also available, although not 168 | all are applicable to geometric objects and some may return a 169 | ``Series`` rather than a ``GeoSeries`` result when appropriate. The methods 170 | ``isna()`` and ``fillna()`` have been 171 | implemented specifically for ``GeoSeries`` and are expected to work 172 | correctly. 173 | -------------------------------------------------------------------------------- /doc/source/docs/reference/io.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Input/output 3 | ============ 4 | .. currentmodule:: dask_geopandas 5 | 6 | GeoPandas 7 | ---------------- 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | from_dask_dataframe 12 | from_geopandas 13 | 14 | GIS files 15 | --------- 16 | 17 | .. autosummary:: 18 | :toctree: api/ 19 | 20 | read_file 21 | 22 | Parquet 23 | ------- 24 | .. autosummary:: 25 | :toctree: api/ 26 | 27 | read_parquet 28 | GeoDataFrame.to_parquet 29 | 30 | Feather 31 | ------- 32 | .. autosummary:: 33 | :toctree: api/ 34 | 35 | read_feather 36 | GeoDataFrame.to_feather 37 | -------------------------------------------------------------------------------- /doc/source/docs/reference/tools.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Tools 3 | ===== 4 | .. currentmodule:: dask_geopandas 5 | 6 | .. 
autosummary:: 7 | :toctree: api/ 8 | 9 | sjoin 10 | clip 11 | points_from_xy 12 | from_wkt 13 | from_wkb -------------------------------------------------------------------------------- /doc/source/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | The relationship between Dask-GeoPandas and GeoPandas is the same as the relationship 4 | between `dask.dataframe` and `pandas`. We recommend checking the 5 | [Dask documentation](https://docs.dask.org/en/latest/dataframe.html) to better understand how 6 | DataFrames are scaled before diving into Dask-GeoPandas. 7 | 8 | ## Dask-GeoPandas basics 9 | 10 | Given a GeoPandas dataframe 11 | 12 | ```py 13 | import geopandas 14 | df = geopandas.read_file('...') 15 | ``` 16 | 17 | We can repartition it into a Dask-GeoPandas dataframe: 18 | 19 | ```py 20 | import dask_geopandas 21 | ddf = dask_geopandas.from_geopandas(df, npartitions=4) 22 | ``` 23 | 24 | By default, this repartitions the data naively by rows. However, you can 25 | also provide spatial partitioning to take advantage of the spatial structure of 26 | the GeoDataFrame. 27 | 28 | ```py 29 | ddf = ddf.spatial_shuffle() 30 | ``` 31 | 32 | The familiar spatial attributes and methods of GeoPandas are also available 33 | and will be computed in parallel: 34 | 35 | ```py 36 | ddf.geometry.area.compute() 37 | ddf.within(polygon) 38 | ``` 39 | 40 | Additionally, if you have a distributed dask.dataframe you can pass columns of 41 | x-y points to the ``set_geometry`` method. 42 | 43 | ```py 44 | import dask.dataframe as dd 45 | import dask_geopandas 46 | 47 | ddf = dd.read_csv('...') 48 | 49 | ddf = ddf.set_geometry( 50 | dask_geopandas.points_from_xy(ddf, 'longitude', 'latitude') 51 | ) 52 | ``` 53 | 54 | Writing files (and reading back) is currently supported for the Parquet and Feather file 55 | formats. 56 | 57 | ```py 58 | ddf.to_parquet("path/to/dir/") 59 | ddf = dask_geopandas.read_parquet("path/to/dir/") 60 | ``` 61 | 62 | Traditional GIS file formats can be read into partitioned GeoDataFrame 63 | (requires `pyogrio`) but not written. 64 | 65 | ```py 66 | ddf = dask_geopandas.read_file("file.gpkg", npartitions=4) 67 | ``` 68 | -------------------------------------------------------------------------------- /doc/source/guide.md: -------------------------------------------------------------------------------- 1 | # User Guide 2 | 3 | The User Guide covers different parts of basic usage of Dask-GeoPandas. Each page focuses on a single topic and outlines how it is implemented in Dask-GeoPandas, with reproducible examples. You can also check the documentation of [GeoPandas](https://geopandas.org) 4 | and [Dask](https://dask.org). 5 | 6 | ```{toctree} 7 | --- 8 | maxdepth: 2 9 | --- 10 | Basic introduction 11 | Spatial partitioning 12 | Aggregation with dissolve 13 | ``` 14 | -------------------------------------------------------------------------------- /doc/source/guide/basic-intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The basic introduction to Dask-GeoPandas\n", 8 | "\n", 9 | "This notebook illustrates the basic API of Dask-GeoPandas and provides a basic timing comparison between operations on `geopandas.GeoDataFrame` and parallel `dask_geopandas.GeoDataFrame`." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import geopandas\n", 20 | "\n", 21 | "import dask_geopandas" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Creating a parallelized `dask_geopandas.GeoDataFrame`\n", 29 | "\n", 30 | "There are many ways to create a parallelized `dask_geopandas.GeoDataFrame`. If your initial data fits in memory, you can create it from a `geopandas.GeoDataFrame` using the `from_geopandas` function:" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df = geopandas.read_file(geopandas.datasets.get_path(\"naturalearth_lowres\"))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
pop_est continent name iso_a3 gdp_md_est geometry
0 920938 Oceania Fiji FJI 8374.0 MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1 53950935 Africa Tanzania TZA 150600.0 POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...
2 603253 Africa W. Sahara ESH 906.5 POLYGON ((-8.66559 27.65643, -8.66512 27.58948...
3 35623680 North America Canada CAN 1674000.0 MULTIPOLYGON (((-122.84000 49.00000, -122.9742...
4 326625791 North America United States of America USA 18560000.0 MULTIPOLYGON (((-122.84000 49.00000, -120.0000...
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " pop_est continent name iso_a3 gdp_md_est \\\n", 128 | "0 920938 Oceania Fiji FJI 8374.0 \n", 129 | "1 53950935 Africa Tanzania TZA 150600.0 \n", 130 | "2 603253 Africa W. Sahara ESH 906.5 \n", 131 | "3 35623680 North America Canada CAN 1674000.0 \n", 132 | "4 326625791 North America United States of America USA 18560000.0 \n", 133 | "\n", 134 | " geometry \n", 135 | "0 MULTIPOLYGON (((180.00000 -16.06713, 180.00000... \n", 136 | "1 POLYGON ((33.90371 -0.95000, 34.07262 -1.05982... \n", 137 | "2 POLYGON ((-8.66559 27.65643, -8.66512 27.58948... \n", 138 | "3 MULTIPOLYGON (((-122.84000 49.00000, -122.9742... \n", 139 | "4 MULTIPOLYGON (((-122.84000 49.00000, -120.0000... " 140 | ] 141 | }, 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "df.head()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "When creating a `dask_geopandas.GeoDataFrame` we have to specify how to partition it, e.g. using the `npartitions` argument to split it into N equal chunks." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "ddf = dask_geopandas.from_geopandas(df, npartitions=4)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/html": [ 175 | "
Dask DataFrame Structure:
\n", 176 | "
\n", 177 | "\n", 190 | "\n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
pop_est continent name iso_a3 gdp_md_est geometry
npartitions=4
0 int64 object object object float64 geometry
45 ... ... ... ... ... ...
90 ... ... ... ... ... ...
135 ... ... ... ... ... ...
176 ... ... ... ... ... ...
\n", 259 | "
\n", 260 | "
Dask Name: from_pandas, 4 tasks
" 261 | ], 262 | "text/plain": [ 263 | "" 264 | ] 265 | }, 266 | "execution_count": 5, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "ddf" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Computation on a non-geometry column:" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 6, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "Africa 51\n", 291 | "Asia 47\n", 292 | "Europe 39\n", 293 | "North America 18\n", 294 | "South America 13\n", 295 | "Oceania 7\n", 296 | "Seven seas (open ocean) 1\n", 297 | "Antarctica 1\n", 298 | "Name: continent, dtype: int64" 299 | ] 300 | }, 301 | "execution_count": 6, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "ddf.continent.value_counts().compute()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "And calling one of the geopandas-specific methods or attributes:" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 7, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "Dask Series Structure:\n", 326 | "npartitions=4\n", 327 | "0 float64\n", 328 | "45 ...\n", 329 | "90 ...\n", 330 | "135 ...\n", 331 | "176 ...\n", 332 | "dtype: float64\n", 333 | "Dask Name: getitem, 12 tasks" 334 | ] 335 | }, 336 | "execution_count": 7, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "ddf.geometry.area" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "As you can see, without calling `compute()`, the resulting Series does not yet contain any values." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 8, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "0 1.639511\n", 361 | "1 76.301964\n", 362 | "2 8.603984\n", 363 | "3 1712.995228\n", 364 | "4 1122.281921\n", 365 | " ... 
\n", 366 | "172 8.604719\n", 367 | "173 1.479321\n", 368 | "174 1.231641\n", 369 | "175 0.639000\n", 370 | "176 51.196106\n", 371 | "Length: 177, dtype: float64" 372 | ] 373 | }, 374 | "execution_count": 8, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "ddf.geometry.area.compute()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## Timing comparison: Point-in-polygon with million points" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "The GeoDataFrame used above is a bit small to see any benefit from parallelization using dask (as the overhead of the task scheduler is larger than the actual operation on such a tiny dataframe), so let's create a bigger point GeoSeries:" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "N = 10_000_000" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 10, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "points = geopandas.GeoDataFrame(geometry=geopandas.points_from_xy(np.random.randn(N),np.random.randn(N)))" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "And creating the dask-geopandas version of this series:" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 11, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "dpoints = dask_geopandas.from_geopandas(points, npartitions=16)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "A single polygon for which we will check if the points are located within this polygon:" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "import shapely.geometry\n", 445 | "box = shapely.geometry.box(0, 0, 1, 1)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "The `within` operation will result in a boolean Series:" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 13, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "Dask Series Structure:\n", 464 | "npartitions=16\n", 465 | "0 bool\n", 466 | "625000 ...\n", 467 | " ... 
\n", 468 | "9375000 ...\n", 469 | "9999999 ...\n", 470 | "dtype: bool\n", 471 | "Dask Name: within, 32 tasks" 472 | ] 473 | }, 474 | "execution_count": 13, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "dpoints.within(box)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "The relative number of the points within the polygon:" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 14, 493 | "metadata": {}, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/plain": [ 498 | "0.1162862" 499 | ] 500 | }, 501 | "execution_count": 14, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "(dpoints.within(box).sum() / len(dpoints)).compute()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "Let's compare the time it takes to compute this:" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 15, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "460 ms ± 30.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 527 | ] 528 | } 529 | ], 530 | "source": [ 531 | "%timeit points.within(box)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 17, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "169 ms ± 39.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "%timeit dpoints.within(box).compute()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "This is run on a laptop with 4 physical cores, and giving roughly a 3x speed-up using multithreading." 556 | ] 557 | } 558 | ], 559 | "metadata": { 560 | "kernelspec": { 561 | "display_name": "Python 3", 562 | "language": "python", 563 | "name": "python3" 564 | }, 565 | "language_info": { 566 | "codemirror_mode": { 567 | "name": "ipython", 568 | "version": 3 569 | }, 570 | "file_extension": ".py", 571 | "mimetype": "text/x-python", 572 | "name": "python", 573 | "nbconvert_exporter": "python", 574 | "pygments_lexer": "ipython3", 575 | "version": "3.8.5" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 4 580 | } 581 | -------------------------------------------------------------------------------- /doc/source/index.md: -------------------------------------------------------------------------------- 1 | # dask-geopandas documentation 2 | 3 | Parallel GeoPandas with Dask 4 | 5 | Dask-GeoPandas is a project merging the geospatial capabilities of [GeoPandas](https://geopandas.org) 6 | and scalability of [Dask](https://dask.org). GeoPandas is an open source project designed to make working with geospatial data in Python easier. GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. 7 | Dask provides advanced parallelism and distributed out-of-core computation with a dask.dataframe module designed to scale 8 | pandas. Since GeoPandas is an extension to the pandas DataFrame, the same way Dask scales pandas can also be applied to GeoPandas. 9 | 10 | This project is a bridge between Dask and GeoPandas and offers geospatial capabilities of GeoPandas backed by Dask. 11 | 12 | ## Install 13 | 14 | Dask-GeoPandas depends on Dask and GeoPandas. 
In addition, it also requires 15 | Shapely >= 2.0. We recommend installing via `conda` or `mamba` 16 | from the `conda-forge` channel but you can also install it from PyPI. 17 | 18 | ```sh 19 | conda install dask-geopandas -c conda-forge 20 | ``` 21 | 22 | ```sh 23 | pip install dask-geopandas 24 | ``` 25 | 26 | For more details, see the [installation instructions](installation). 27 | 28 | ## Example 29 | 30 | As with `dask.dataframe` and `pandas`, the API of `dask_geopandas` mirrors the one of `geopandas`. 31 | 32 | ```py 33 | import geopandas 34 | import dask_geopandas 35 | 36 | df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres")) 37 | dask_df = dask_geopandas.from_geopandas(df, npartitions=4) 38 | 39 | dask_df.geometry.area.compute() 40 | ``` 41 | 42 | ## When should I use Dask-GeoPandas? 43 | 44 | Dask-GeoPandas is useful when dealing with large GeoDataFrames that either do not comfortably fit in memory or require expensive computation that can be easily parallelised. Note that using Dask-GeoPandas is not always faster than using GeoPandas as there is an unavoidable overhead in task scheduling and transfer of data between threads and processes, but in other cases, your performance gains can be almost linear with more threads. 45 | 46 | ## Useful links 47 | 48 | [Source Repository (GitHub)](https://github.com/geopandas/dask-geopandas) | [Issues & Ideas](https://github.com/geopandas/dask-geopandas/issues) | [Gitter (chat)](https://gitter.im/geopandas/dask-geopandas) 49 | 50 | ```{toctree} 51 | --- 52 | maxdepth: 2 53 | caption: Documentation 54 | hidden: true 55 | --- 56 | installation 57 | getting_started 58 | guide 59 | parquet 60 | api 61 | changelog 62 | GitHub 63 | ``` 64 | -------------------------------------------------------------------------------- /doc/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | This package depends on GeoPandas and Dask. In addition, it also requires 4 | Shapely >= 2.0. 5 | 6 | GeoPandas is written in pure Python, but has several dependencies written in C (GEOS, GDAL, PROJ). Those base C libraries can sometimes be a challenge to install. Therefore, we advise you to closely follow the [recommendations](https://geopandas.org/en/stable/getting_started/install.html) to avoid installation problems. 7 | 8 | ## Easy way 9 | 10 | The best way to install Dask-GeoPandas is using `conda` or `mamba` and `conda-forge` channel: 11 | 12 | ```sh 13 | conda install -c conda-forge dask-geopandas 14 | ``` 15 | 16 | ## pip 17 | 18 | You can install Dask-GeoPandas with `pip` from PyPI but make sure that your environment contains 19 | properly installed GeoPandas (note that Dask-GeoPandas does not use `fiona` which therefore doesn't 20 | have to be installed). See the [GeoPandas installation instructions](https://geopandas.org/en/stable/getting_started/install.html#installing-with-pip) for details. 
21 | 22 | ```sh 23 | pip install dask-geopandas 24 | ``` 25 | 26 | ## Fresh environment 27 | 28 | One way to install all required dependencies is to use the `conda` package manager to 29 | create a new environment: 30 | 31 | ```shell 32 | conda create -n geo_env 33 | conda activate geo_env 34 | conda config --env --add channels conda-forge 35 | conda config --env --set channel_priority strict 36 | conda install dask-geopandas 37 | ``` 38 | -------------------------------------------------------------------------------- /doc/source/parquet.md: -------------------------------------------------------------------------------- 1 | # Reading and Writing Apache Parquet 2 | 3 | Similar to dask-dataframe, dask-geopandas supports reading and writing Apache Parquet files. 4 | 5 | See the [Dask DataFrame](https://docs.dask.org/en/stable/dataframe-parquet.html#dataframe-parquet) 6 | and [Geopandas](https://geopandas.org/en/stable/docs/user_guide/io.html#apache-parquet-and-feather-file-formats) documentation 7 | for more on Apache Parquet. 8 | 9 | ## Partitioning 10 | 11 | As outlined in [Spatial partitioning in Dask-GeoPandas](guide/spatial-partitioning.ipynb), dask-geopandas can spatially partition datasets. These partitions are 12 | persisted in the parquet files. 13 | 14 | By default, reading these spatial partitions requires opening every file and checking its spatial extent. This can be a 15 | bit slow if the parquet dataset is made up of many individual partitions. To disable loading the spatial partitions, 16 | specify ``gather_spatial_partitions=False`` when reading the file: 17 | 18 | 19 | ```py 20 | ddf = dask_geopandas.read_parquet("...", gather_spatial_partitions=False) 21 | ddf.spatial_partitions # None 22 | ``` 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "dask-geopandas" 10 | dynamic = ["version"] 11 | authors = [ 12 | { name = "Julia Signell", email = "jsignell@gmail.com" } 13 | ] 14 | maintainers = [ 15 | { name = "GeoPandas contributors" } 16 | ] 17 | license = { text = "BSD 3-Clause" } 18 | description = "Parallel GeoPandas with Dask" 19 | readme = "README.rst" 20 | keywords = ["dask", "geopandas", "spatial", "distributed", "cluster"] 21 | classifiers = [ 22 | "Development Status :: 5 - Production/Stable", 23 | "Intended Audience :: Developers", 24 | "Intended Audience :: Science/Research", 25 | "License :: OSI Approved :: BSD License", 26 | "Operating System :: OS Independent", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3 :: Only", 29 | "Topic :: Scientific/Engineering", 30 | "Topic :: Scientific/Engineering :: GIS", 31 | "Topic :: System :: Distributed Computing", 32 | ] 33 | requires-python = ">=3.10" 34 | dependencies = [ 35 | "geopandas>=0.14.3", 36 | "shapely>=2.0", 37 | "dask[dataframe]>=2025.1.0", 38 | "packaging", 39 | ] 40 | 41 | [project.optional-dependencies] 42 | test = [ 43 | "pytest", 44 | ] 45 | 46 | [project.urls] 47 | Home = "https://geopandas.org" 48 | Documentation = "https://dask-geopandas.readthedocs.io/" 49 | Repository = "https://github.com/geopandas/dask-geopandas" 50 | "Issue Tracker" = "https://github.com/geopandas/dask-geopandas/issues" 51 | 52 | 53 | [tool.black] 54 | line-length = 88 55 | 56 | [tool.ruff] 57 | line-length = 88 58 | 
extend-exclude = ["doc/*", "versioneer.py", "dask_geopandas/_version.py"] 59 | 60 | [tool.ruff.lint] 61 | select = [ 62 | # pyflakes 63 | "F", 64 | # pycodestyle 65 | "E", 66 | "W", 67 | # pyupgrade 68 | # "UP", 69 | # flake8-bugbear 70 | "B", 71 | # flake8-debugger 72 | "T10", 73 | # flake8-simplify 74 | # "SIM", 75 | # pylint 76 | "PLC", 77 | "PLE", 78 | "PLR", 79 | "PLW", 80 | # misc lints 81 | "PIE", 82 | # implicit string concatenation 83 | "ISC", 84 | # type-checking imports 85 | "TCH", 86 | # comprehensions 87 | "C4", 88 | # Ruff-specific rules 89 | "RUF", 90 | # isort 91 | "I", 92 | ] 93 | 94 | ignore = [ 95 | ### Intentionally disabled 96 | # module level import not at top of file 97 | "E402", 98 | # do not assign a lambda expression, use a def 99 | "E731", 100 | # mutable-argument-default 101 | "B006", 102 | # unused-loop-control-variable 103 | "B007", 104 | # get-attr-with-constant 105 | "B009", 106 | # Only works with python >=3.10 107 | "B905", 108 | # dict literals 109 | "C408", 110 | # Too many arguments to function call 111 | "PLR0913", 112 | # Too many returns 113 | "PLR0911", 114 | # Too many branches 115 | "PLR0912", 116 | # Too many statements 117 | "PLR0915", 118 | # Magic number 119 | "PLR2004", 120 | # Redefined loop name 121 | "PLW2901", 122 | # Global statements are discouraged 123 | "PLW0603", 124 | # compare-to-empty-string 125 | "PLC1901", 126 | 127 | ### Additional checks that don't pass yet 128 | # Useless statement 129 | "B018", 130 | # Within an except clause, raise exceptions with ... 131 | "B904", 132 | # Consider `elif` instead of `else` then `if` to remove indentation level 133 | "PLR5501", 134 | # collection-literal-concatenation 135 | "RUF005", 136 | # Mutable class attributes should be annotated with `typing.ClassVar`, 137 | "RUF012" 138 | ] 139 | 140 | [tool.ruff.lint.per-file-ignores] 141 | "dask_geopandas/__init__.py" = ["F401", "I"] 142 | 143 | [tool.ruff.lint.isort] 144 | extra-standard-library = ["packaging"] 145 | 146 | section-order = [ 147 | "future", 148 | "standard-library", 149 | "third-party", 150 | "dask", 151 | "geo", 152 | "first-party", 153 | "local-folder", 154 | "testing" 155 | ] 156 | 157 | [tool.ruff.lint.isort.sections] 158 | "dask" = ["dask"] 159 | "geo" = ["geopandas", "shapely", "pyproj"] 160 | "testing" = ["pytest", "pandas.testing", "numpy.testing", "geopandas.tests", "geopandas.testing"] 161 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: "3.11" 6 | formats: [] 7 | 8 | sphinx: 9 | configuration: doc/source/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: doc/requirements.txt 14 | - method: pip 15 | path: . 16 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | hilbertcurve 4 | pygeohash 5 | pymorton 6 | pytest 7 | bokeh 8 | distributed -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # See the docstring in versioneer.py for instructions. Note that you must 2 | # re-run 'versioneer.py setup' after changing this section, and commit the 3 | # resulting files. 
4 | 5 | [versioneer] 6 | VCS = git 7 | style = pep440 8 | versionfile_source = dask_geopandas/_version.py 9 | versionfile_build = dask_geopandas/_version.py 10 | tag_prefix = v 11 | parentdir_prefix = dask_geopandas- 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from setuptools import setup 5 | 6 | # ensure the current directory is on sys.path so versioneer can be imported 7 | # when pip uses PEP 517/518 build rules. 8 | # https://github.com/python-versioneer/python-versioneer/issues/193 9 | sys.path.append(os.path.dirname(__file__)) 10 | 11 | import versioneer 12 | 13 | 14 | # see pyproject.toml for static project metadata 15 | setup( 16 | name="dask-geopandas", # need by GitHub dependency graph 17 | version=versioneer.get_version(), 18 | cmdclass=versioneer.get_cmdclass(), 19 | ) 20 | --------------------------------------------------------------------------------