├── .github └── workflows │ ├── pre-commit.yml │ ├── publish_package.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── dev-requirements.txt ├── imgs └── repo_logo.png ├── pypi_README.md ├── pyproject.toml ├── requirements.txt ├── src └── geoparquet_pydantic │ ├── __init__.py │ ├── convert.py │ ├── schemas.py │ └── validate.py └── tests ├── conftest.py ├── test_conversions.py ├── test_data ├── valid.geojson └── valid_geojson.parquet ├── test_schemas.py └── test_validation.py /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | pre-commit: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - uses: actions/setup-python@v3 12 | - uses: pre-commit/action@v3.0.1 13 | -------------------------------------------------------------------------------- /.github/workflows/publish_package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | name: Publish Release to PyPI 5 | 6 | on: 7 | release: 8 | types: [published] 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | deploy: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: '3.12' 25 | 26 | - name: Install publishing dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install --upgrade build 30 | 31 | - name: Build package 32 | run: python -m build --sdist --wheel . --outdir dist 33 | 34 | - name: Publish package to PyPi 35 | uses: pypa/gh-action-pypi-publish@release/v1 36 | with: 37 | password: ${{ secrets.PYPI_PASSWORD }} 38 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.11", "3.12"] 17 | os: [ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Install Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python --version 30 | pip install -U pip 31 | pip install -r requirements.txt 32 | pip install -r dev-requirements.txt 33 | pip list 34 | 35 | - name: Install the library 36 | run: | 37 | pip install -e . 
38 | pip list 39 | 40 | - name: Run tests 41 | run: pytest --cov=geoparquet_pydantic --cov-report=xml tests/ 42 | 43 | - name: Upload coverage report to CodeCov 44 | uses: codecov/codecov-action@v3 45 | env: 46 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # - repo: https://github.com/RobertCraigie/pyright-python 3 | # rev: v1.1.351 4 | # hooks: 5 | # - id: pyright 6 | # exclude: ^tests/.* 7 | # additional_dependencies: [pyarrow, shapely, geojson-pydantic] 8 | - repo: https://github.com/PyCQA/docformatter 9 | rev: v1.7.5 10 | hooks: 11 | - id: docformatter 12 | additional_dependencies: [tomli] 13 | args: [--black, --in-place] 14 | 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v4.4.0 17 | hooks: 18 | - id: trailing-whitespace 19 | - id: check-ast 20 | - id: check-case-conflict 21 | - id: debug-statements 22 | - id: end-of-file-fixer 23 | - id: check-docstring-first 24 | - id: check-added-large-files 25 | 26 | - repo: https://github.com/psf/black-pre-commit-mirror 27 | rev: 24.2.0 28 | hooks: 29 | - id: black 30 | language_version: python3.12 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Xavier Nogueira 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GeoParquet-Pydantic 2 | 3 |
4 | [Logo: imgs/repo_logo.png]
5 |
6 |
7 | A lightweight, pydantic centric library for validating GeoParquet files (or PyArrow Tables) and converting between GeoJSON and GeoParquet...without GDAL!
8 |
9 |
10 | [Badges: Pre-Commit | Tests | Coverage | Package version | License]
25 |
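**Quick example** (a minimal sketch of the round trip documented below; `example.geojson` and `example.parquet` are placeholder paths, and only `pyarrow` plus this library are needed):

```python
import json
import pyarrow.parquet as pq
from geojson_pydantic.features import FeatureCollection
from geoparquet_pydantic import (
    geojson_to_geoparquet,
    geoparquet_to_geojson,
    validate_geoparquet_table,
)

# parse a GeoJSON file into a geojson-pydantic FeatureCollection
fc = FeatureCollection(**json.load(open("example.geojson")))

# convert to a pyarrow.Table carrying GeoParquet ("geo") metadata
table = geojson_to_geoparquet(fc)
assert validate_geoparquet_table(table)

# save with plain pyarrow, then round-trip back to GeoJSON
pq.write_table(table, "example.parquet")
fc_again = geoparquet_to_geojson("example.parquet")
```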
26 |
27 | ---
28 | **Motivation:** This project started at the 2024 San Francisco GeoParquet Community hackathon, and arose out of a simple observation:
29 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions?
30 |
31 | **Is this library the right choice for you?**
32 | * Do you need to use a wide variety of geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyway,
33 | making this library's conversion functions *probably* redundant.
34 | * Is your workflow command-line centric? If so, you may want to consider Planet Labs' similar CLI tool [`gpq`](https://github.com/planetlabs/gpq),
35 | which is written in Go and substantially faster.
36 | * Otherwise, if you are using Python and want to avoid unnecessarily bulky dependencies, this library will be a great choice!
37 |
38 | **Note:** All user-exposed functions and schema classes are available at the top level (i.e., `geoparquet_pydantic.validate_geoparquet_table(...)`) of this library.
39 |
40 | # Features
41 |
42 | ## `pydantic` Schemas
43 |
44 | * [`GeometryColumnMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L40): A `pydantic` model that validates a
45 | geometry column's (aka `primary_column`) metadata. This is nested within the following schema.
46 | * [`GeoParquetMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L93): A `pydantic` model for the metadata assigned to the "geo" key in a `pyarrow.Table`
47 | that allows it to be read by GeoParquet readers once saved.
48 |
49 | For an explanation of these schemas, please reference the [geoparquet repository](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
50 |
51 | ## Validation functions
52 |
53 | Convenience functions that simply use `GeoParquetMetadata` to return a `bool` depending on whether the GeoParquet metadata obeys the [schema](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
54 |
55 | ### Validate a `pyarrow.Table`'s GeoParquet metadata:
56 | ```python
57 | def validate_geoparquet_table(
58 |     table: pyarrow.Table,
59 |     primary_column: Optional[str] = None,
60 | ) -> bool:
61 |     """Validates the GeoParquet metadata of a pyarrow.Table.
62 |
63 |     Args:
64 |         table (pyarrow.Table): The table to validate.
65 |         primary_column (Optional[str], optional): The name of the primary geometry column.
66 |             Defaults to None.
67 |
68 |     Returns:
69 |         bool: True if the metadata is valid, False otherwise.
70 |     """
71 |     ...
72 | ```
73 |
74 | ### Validate a Parquet file's GeoParquet metadata:
75 |
76 | ```python
77 | def validate_geoparquet_file(
78 |     geoparquet_file: str | Path | pyarrow.parquet.ParquetFile,
79 |     primary_column: Optional[str] = None,
80 |     read_file_kwargs: Optional[dict] = None,
81 | ) -> bool:
82 |     """Validates that a parquet file has correct GeoParquet metadata without opening it.
83 |
84 |     Args:
85 |         geoparquet_file (str | Path | ParquetFile): The file to validate.
86 |         primary_column (str, optional): The primary column name. Defaults to 'geometry'.
87 |         read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile().
88 | See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile 89 | 90 | Returns: 91 | bool: True if the metadata is valid, False otherwise. 92 | """ 93 | ... 94 | ``` 95 | 96 | ## Conversion functions 97 | 98 | ### Convert from `geojson_pydantic.FeatureCollection` to a GeoParquet `pyarrow.Table` 99 | 100 | ```python 101 | def geojson_to_geoparquet( 102 | geojson: FeatureCollection | Path, 103 | primary_column: Optional[str] = None, 104 | column_schema: Optional[pyarrow.Schema] = None, 105 | add_none_values: Optional[bool] = False, 106 | geo_metadata: GeoParquetMetadata | dict | None = None, 107 | **kwargs, 108 | ) -> pyarrow.Table: 109 | """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet 110 | metadata. 111 | 112 | To save to a file, simply use pyarrow.parquet.write_table() on the returned table. 113 | 114 | Args: 115 | geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection. 116 | primary_column (str, optional): The name of the primary column. Defaults to None. 117 | column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None. 118 | add_none_values (bool, default=False): Whether to fill missing column values 119 | specified in param:column_schema with 'None' (converts to pyarrow.null()). 120 | geo_metadata (GeoParquet | dict | None, optional): The GeoParquet metadata. 121 | **kwargs: Additional keyword arguments for the Arrow table writer. 122 | 123 | Returns: 124 | The Arrow table with GeoParquet metadata. 125 | """ 126 | ... 127 | ``` 128 | 129 | ### Convert from a GeoParquet `pyarrow.Table` or file to a `geojson_pydantic.FeatureCollection` 130 | 131 | ```python 132 | def geoparquet_to_geojson( 133 | geoparquet: pyarrow.Table | str | Path, 134 | primary_column: Optional[str] = None, 135 | max_chunksize: Optional[int] = None, 136 | max_workers: Optional[int] = None, 137 | ) -> FeatureCollection: 138 | """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic 139 | FeatureCollection. 140 | 141 | Args: 142 | geoparquet (pyarrow.Table): Either an Arrow.Table or parquet with GeoParquet metadata. 143 | primary_column (str, optional): The name of the primary column. Defaults to 'geometry'. 144 | max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000. 145 | max_workers (int, optional): The maximum number of workers to use for parallel processing. 146 | Defaults to 0 (runs sequentially). Use -1 for all available cores. 147 | 148 | Returns: 149 | FeatureCollection: The GeoJSON Pydantic FeatureCollection. 150 | """ 151 | ... 152 | ``` 153 | 154 | # Getting Started 155 | 156 | Install from [PyPi](https://pypi.org/project/geoparquet-pydantic): 157 | ```bash 158 | pip install geoparquet-pydantic 159 | ``` 160 | 161 | Or from source: 162 | ```bash 163 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git 164 | $ cd geoparquet-pydantic 165 | $ pip install . 166 | ``` 167 | 168 | Then import with an underscore: 169 | ```python 170 | import geoparquet_pydantic 171 | ``` 172 | 173 | Or just import the functions/classes you need from the top-level: 174 | ```python 175 | from geoparquet_pydantic import ( 176 | GeometryColumnMetadata, 177 | GeoParquetMetadata, 178 | validate_geoparquet_table, 179 | validate_geoparquet_file, 180 | geojson_to_geoparquet, 181 | geoparquet_to_geojson, 182 | ) 183 | ``` 184 | 185 | # Roadmap 186 | 187 | - [ ] Make CLI file<>file functions w/ `click`. 
188 | - [ ] Add parallelized Parquet read for `geoparquet_pydantic.geoparquet_to_geojson()`.
189 |
190 | # Contribute
191 |
192 | We encourage contributions, feature requests, and bug reports!
193 |
194 | Here is our recommended workflow:
195 |
196 | * Use `dev-requirements.txt` to install our development dependencies.
197 | * Make your edits using `pyright` as a linter.
198 | * Use `pre-commit run --all-files` before committing your work.
199 | * If you add a new feature, we request that you add test coverage for it.
200 |
201 | Happy coding!
202 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | wheel
3 | pytest
4 | pytest-cov
5 | black
6 | pyright
7 | pre-commit
8 | geopandas
9 |
--------------------------------------------------------------------------------
/imgs/repo_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/imgs/repo_logo.png
--------------------------------------------------------------------------------
/pypi_README.md:
--------------------------------------------------------------------------------
1 | # GeoParquet-Pydantic
2 |
3 | **Motivation:** This project started at the 2024 San Francisco GeoParquet Community hackathon, and arose out of a simple observation:
4 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions?
5 |
6 | **Is this library the right choice for you?**
7 | * Do you need to use a wide variety of geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyway,
8 | making this library's conversion functions *probably* redundant.
9 | * Is your workflow command-line centric? If so, you may want to consider Planet Labs' similar CLI tool [`gpq`](https://github.com/planetlabs/gpq),
10 | which is written in Go and substantially faster.
11 | * Otherwise, if you are using Python and want to avoid unnecessarily bulky dependencies, this library will be a great choice!
12 |
13 | **Note:** All user-exposed functions and schema classes are available at the top level (i.e., `geoparquet_pydantic.validate_geoparquet_table(...)`) of this library.
14 |
15 | # Documentation is on GitHub [here](https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/README.md)
16 |
17 | # Getting Started
18 |
19 | Install from [PyPI](https://pypi.org/project/geoparquet-pydantic):
20 | ```bash
21 | pip install geoparquet-pydantic
22 | ```
23 |
24 | Or from source:
25 | ```bash
26 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git
27 | $ cd geoparquet-pydantic
28 | $ pip install .
29 | ```
30 |
31 | Then import with an underscore:
32 | ```python
33 | import geoparquet_pydantic
34 | ```
35 |
36 | Or just import the functions/classes you need from the top-level:
37 | ```python
38 | from geoparquet_pydantic import GeometryColumnMetadata
39 | from geoparquet_pydantic import GeoParquetMetadata
40 | from geoparquet_pydantic import validate_geoparquet_table
41 | from geoparquet_pydantic import validate_geoparquet_file
42 | from geoparquet_pydantic import geojson_to_geoparquet
43 | from geoparquet_pydantic import geoparquet_to_geojson
44 | ```
45 |
46 | # Roadmap
47 |
48 | - [ ] Make CLI file<>file functions w/ `click`.
49 | - [ ] Add parrallelized Parquet read for `geoparquet_pydantic.geoparquet_to_geojson()`. 50 | 51 | # Contribute 52 | 53 | We encourage contributions, feature requests, and bug reports! 54 | 55 | Here is our recomended workflow: 56 | 57 | * Use `dev-requirements.txt` to install our development dependencies. 58 | * Make your edits using `pyright` as a linter. 59 | * Use `pre-commit run --all-file` before commiting your work. 60 | * If you add a new feature, we request that you add test coverage for it. 61 | 62 | Happy coding! 63 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "setuptools-scm", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "geoparquet_pydantic" 10 | description = "Read/write geoparquet with the geojson-pydanic models." 11 | requires-python = ">=3.11" 12 | keywords = [ 13 | "geoparquet", 14 | "pydantic", 15 | "geospatial", 16 | ] 17 | license = {text = "MIT"} 18 | classifiers = [ 19 | "Programming Language :: Python :: 3", 20 | ] 21 | dependencies = [ 22 | "geojson-pydantic", 23 | "pyarrow", 24 | "shapely", 25 | "pyproj", 26 | "click", 27 | ] 28 | dynamic = [ 29 | "version", 30 | "readme", 31 | ] 32 | 33 | [tool.setuptools.packages.find] 34 | where = ["src"] 35 | include = ["geoparquet_pydantic*"] 36 | 37 | [tool.setuptools.dynamic] 38 | version = {attr = "geoparquet_pydantic.__version__"} 39 | readme = {file = "pypi_README.md"} 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | geojson-pydantic 2 | pyarrow 3 | shapely 4 | pyproj 5 | click 6 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | from .schemas import ( 3 | GeometryColumnMetadata, 4 | GeoParquetMetadata, 5 | ) 6 | from .convert import ( 7 | geojson_to_geoparquet, 8 | geoparquet_to_geojson, 9 | ) 10 | from .validate import ( 11 | validate_geoparquet_table, 12 | validate_geoparquet_file, 13 | ) 14 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/convert.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import functools 3 | import warnings 4 | import geojson_pydantic 5 | from geojson_pydantic.types import BBox 6 | import shapely.wkb 7 | import shapely.wkt 8 | import pyarrow 9 | import json 10 | from geojson_pydantic.geometries import ( 11 | _GeometryBase, 12 | ) 13 | from geojson_pydantic.features import ( 14 | Feature, 15 | FeatureCollection, 16 | ) 17 | from geoparquet_pydantic.schemas import ( 18 | GeometryColumnMetadata, 19 | GeoParquetMetadata, 20 | GeometryTypes, 21 | ) 22 | from pathlib import Path 23 | from typing import Any, Optional, Iterable 24 | 25 | 26 | def _to_wkb(geometry: _GeometryBase) -> bytes: 27 | """Converts the GeoJSON object to WKB format.""" 28 | return shapely.wkb.dumps(shapely.wkt.loads(geometry.wkt)) 29 | 30 | 31 | def _get_geom_types(features: list[Feature]) -> list[str]: 32 | return list(set([feature.geometry.type for feature in features])) 33 | 34 | 35 | def _get_default_geo_metadata( 36 | feature_collection: FeatureCollection, 37 | ) -> 
GeoParquetMetadata: 38 | return GeoParquetMetadata( 39 | primary_column="geometry", 40 | columns={ 41 | "geometry": GeometryColumnMetadata( 42 | **{ 43 | "encoding": "WKB", 44 | "geometry_types": _get_geom_types(feature_collection.features), 45 | } 46 | ), 47 | }, 48 | ) 49 | 50 | 51 | def _update_metadata(table: pyarrow.Table, metadata: dict) -> pyarrow.Table: 52 | new_metadata = table.schema.metadata 53 | if not new_metadata: 54 | new_metadata = {} 55 | for k, v in metadata.items(): 56 | new_metadata[k] = json.dumps(v).encode("utf-8") 57 | return table.replace_schema_metadata(new_metadata) 58 | 59 | 60 | def _validate_column_schema( 61 | column_schema: pyarrow.Schema, 62 | primary_column: str, 63 | geojson: FeatureCollection, 64 | add_none_values: bool, 65 | ) -> None: 66 | names = [i for i in column_schema.names if i != primary_column] 67 | for feature in geojson.features: 68 | if not add_none_values: 69 | all_present = all([name in feature.properties.keys() for name in names]) 70 | if not all_present: 71 | raise ValueError( 72 | f"Feature {feature} does not contain all the columns in the schema: {column_schema.names}", 73 | ) 74 | 75 | else: 76 | for name in names: 77 | if not feature.properties.get(name): 78 | feature.properties[name] = None 79 | 80 | 81 | def geojson_to_geoparquet( 82 | geojson: FeatureCollection | Path, 83 | primary_column: Optional[str] = None, 84 | column_schema: Optional[pyarrow.Schema] = None, 85 | add_none_values: Optional[bool] = False, 86 | geo_metadata: GeoParquetMetadata | dict | None = None, 87 | **kwargs, 88 | ) -> pyarrow.Table: 89 | """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet 90 | metadata. 91 | 92 | To save to a file, simply use pyarrow.parquet.write_table() on the returned table. 93 | 94 | Args: 95 | geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection. 96 | primary_column (str, optional): The name of the primary column. Defaults to None. 97 | column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None. 98 | add_none_values (bool, default=False): Whether to fill missing column values 99 | specified in param:column_schema with 'None' (converts to pyarrow.null()). 100 | geo_metadata (GeoParquet | dict | None, optional): The GeoParquet metadata. 101 | **kwargs: Additional keyword arguments for the Arrow table writer. 102 | 103 | Returns: 104 | The Arrow table with GeoParquet metadata. 
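    Example (an illustrative sketch only; assumes `fc` is a geojson_pydantic
    FeatureCollection parsed elsewhere, and writes to a placeholder path)::

        import pyarrow.parquet as pq

        table = geojson_to_geoparquet(fc)
        pq.write_table(table, "output.parquet")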
105 | """ 106 | if not isinstance(geojson, FeatureCollection): 107 | geojson = FeatureCollection(**json.load(geojson.open("r"))) 108 | if not primary_column: 109 | primary_column = "geometry" 110 | 111 | # get primary column as iterables 112 | columns: list[Iterable] = [map(lambda f: _to_wkb(f.geometry), geojson.features)] 113 | 114 | # get geo metadata 115 | if not geo_metadata: 116 | geo_metadata = _get_default_geo_metadata(geojson) 117 | if isinstance(geo_metadata, dict): 118 | geo_metadata = GeoParquetMetadata(**geo_metadata) 119 | if not isinstance(geo_metadata, GeoParquetMetadata): 120 | raise ValueError("geo_metadata must be a valid GeoParquet class, dict, or None") 121 | 122 | # get other columns as iterables and update schema 123 | if not column_schema: 124 | column_schema = pyarrow.schema( 125 | [ 126 | (primary_column, pyarrow.binary()), 127 | ("properties", pyarrow.string()), 128 | ] 129 | ) 130 | elif isinstance(column_schema, pyarrow.Schema): 131 | if primary_column in column_schema.names: 132 | column_schema.remove(column_schema.get_field_index(primary_column)) 133 | column_schema.insert(0, pyarrow.field(primary_column, pyarrow.binary())) 134 | else: 135 | raise ValueError("column_schema must be a valid pyarrow.Schema or None") 136 | 137 | if "properties" in column_schema.names: 138 | if len(column_schema.names) > 2: 139 | raise ValueError( 140 | "Cannot have 'properties' as a column with other columns (which are pulled from GeoJSON propreties)." 141 | ) 142 | columns.append(map(lambda f: json.dumps(f.properties), geojson.features)) 143 | 144 | else: 145 | _validate_column_schema(column_schema, primary_column, geojson, add_none_values) 146 | 147 | for col in column_schema.names: 148 | columns.append(map(lambda f: f.properties.get(col), geojson.features)) 149 | 150 | # write table 151 | table = pyarrow.Table.from_pydict( 152 | {**dict(zip(column_schema.names, columns))}, 153 | schema=column_schema, 154 | **kwargs, 155 | ) 156 | return _update_metadata(table, {"geo": geo_metadata.model_dump()}) 157 | 158 | 159 | def _find_bbox(geoparquet: pyarrow.Table) -> BBox | None: 160 | if not geoparquet.schema.metadata: 161 | warnings.warn("No GeoParquet metadata found in the Arrow table.") 162 | return None 163 | decoded_metadata: dict[str, Any] = ast.literal_eval( 164 | geoparquet.schema.metadata[b"geo"].decode("utf-8"), 165 | ) 166 | bbox = decoded_metadata["columns"]["geometry"].get("bbox", None) 167 | if isinstance(bbox, list): 168 | bbox = tuple(bbox) 169 | return bbox 170 | 171 | 172 | def _get_prop_records(name_value_tuple: tuple[str, list[Any]]) -> list[tuple[str, Any]]: 173 | name, values = name_value_tuple 174 | return list(zip([name] * len(values), values)) 175 | 176 | 177 | def _shapely_to_feature( 178 | geometry: shapely.geometry.base.BaseGeometry, 179 | properties: list[tuple[str, Any]], 180 | ) -> Feature: 181 | geom_class: type[GeometryTypes] = getattr(geojson_pydantic, type(geometry).__name__) 182 | return Feature( 183 | type="Feature", 184 | geometry=geom_class(**json.loads(shapely.to_geojson(geometry))), 185 | bbox=list(geometry.bounds), 186 | properties=dict([*properties]), 187 | ) 188 | 189 | 190 | def geoparquet_to_geojson( 191 | geoparquet: pyarrow.Table | str | Path, 192 | primary_column: Optional[str] = None, 193 | max_chunksize: Optional[int] = None, 194 | max_workers: Optional[int] = None, 195 | ) -> FeatureCollection: 196 | """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic 197 | FeatureCollection. 
198 | 199 | Args: 200 | geoparquet (pyarrow.Table): Either an Arrow.Table or parquet with GeoParquet metadata. 201 | primary_column (str, optional): The name of the primary column. Defaults to 'geometry'. 202 | max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000. 203 | max_workers (int, optional): The maximum number of workers to use for parallel processing. 204 | Defaults to 0 (runs sequentially). Use -1 for all available cores. 205 | Returns: 206 | FeatureCollection: The GeoJSON Pydantic FeatureCollection. 207 | """ 208 | if not primary_column: 209 | primary_column = "geometry" 210 | if not max_chunksize: 211 | max_chunksize = 1000 212 | if isinstance(geoparquet, (str, Path)): 213 | geoparquet = pyarrow.parquet.read_table(geoparquet) 214 | if not isinstance(geoparquet, pyarrow.Table): 215 | raise ValueError( 216 | "param:geoparquet must be a valid pyarrow.Table or parquet file" 217 | ) 218 | 219 | if primary_column not in geoparquet.column_names: 220 | raise ValueError(f"Primary column {primary_column} not found in the table.") 221 | 222 | # attempt to get the bbox from metadata 223 | bbox: BBox | None = _find_bbox(geoparquet) 224 | 225 | # TODO: parallelize this (optionally) 226 | if max_workers: 227 | raise NotImplementedError("Parallel processing not yet implemented.") 228 | 229 | feature_lists: list[list[Feature]] = [] 230 | for chunk in geoparquet.to_batches(max_chunksize): 231 | chunk_dict = chunk.to_pydict() 232 | geoms: list[bytes] = chunk_dict.pop(primary_column) 233 | properties: Iterable[list[tuple[str, Any]]] = map( 234 | _get_prop_records, 235 | chunk_dict.items(), 236 | ) 237 | feature_props: Iterable[list[tuple[str, Any]]] = map( 238 | lambda i: [p[i] for p in properties], 239 | range(len(geoms)), 240 | ) 241 | try: 242 | chunk_features: Iterable[Feature] = list( 243 | map( 244 | lambda gp: _shapely_to_feature(shapely.from_wkb(gp[0]), gp[1]), 245 | zip(geoms, feature_props), 246 | ) 247 | ) 248 | except shapely.errors.GEOSException as e: 249 | raise ValueError( 250 | f"Error converting WKB to shapely geometry. Make sure the WKB is valid! 
Exception: {e}" 251 | ) 252 | 253 | feature_lists.append(chunk_features) 254 | features: list[Feature] = list(functools.reduce(lambda a, b: a + b, feature_lists)) 255 | 256 | return FeatureCollection( 257 | type="FeatureCollection", 258 | features=features, 259 | bbox=bbox, 260 | ) 261 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/schemas.py: -------------------------------------------------------------------------------- 1 | """Pydantic models for GeoParquet metadata.""" 2 | 3 | import ast 4 | from pydantic import BeforeValidator, Field, BaseModel, field_validator, model_validator 5 | from typing import Annotated, Optional, Literal, Union 6 | from pyproj import CRS 7 | 8 | EdgeType = Literal["planar", "spherical"] 9 | 10 | FlatGeometryTypes = Annotated[ 11 | # TODO: support 3d geometries with Z suffix 12 | Literal[ 13 | "Point", 14 | "MultiPoint", 15 | "LineString", 16 | "MultiLineString", 17 | "Polygon", 18 | "MultiPolygon", 19 | "GeometryCollection", 20 | ], 21 | Field(description="The geometry types supported by the column"), 22 | ] 23 | 24 | ZGeometryTypes = Annotated[ 25 | Literal[ 26 | "PointZ", 27 | "MultiPointZ", 28 | "LineStringZ", 29 | "MultiLineStringZ", 30 | "PolygonZ", 31 | "MultiPolygonZ", 32 | "GeometryCollectionZ", 33 | ], 34 | Field(description="3D geometry types supported by the column"), 35 | ] 36 | 37 | GeometryTypes = Union[FlatGeometryTypes, ZGeometryTypes] 38 | 39 | 40 | class GeometryColumnMetadata(BaseModel): 41 | encoding: Literal["WKB"] 42 | geometry_types: list[GeometryTypes] 43 | 44 | crs: Annotated[ 45 | str, 46 | Field( 47 | description="The CRS of the geometry column in a string format readable by pyproj. Is the converted to PROJJSON format" 48 | ), 49 | ] = "OGC:CRS84" 50 | 51 | edges: Annotated[ 52 | EdgeType, Field(description="The type of edges of the geometries") 53 | ] = "planar" 54 | 55 | bbox: Optional[ 56 | Annotated[list[float], Field(description="The bounding box of the geometries")] 57 | ] = None 58 | 59 | epoch: Optional[ 60 | Annotated[ 61 | Union[int, float], 62 | Field(description="Coordinate epoch in case of a dynamic CRS"), 63 | ] 64 | ] = None 65 | 66 | orientation: Literal["counterclockwise"] = "counterclockwise" 67 | 68 | @field_validator("crs") 69 | @classmethod 70 | def convert_crs_to_projjson(cls, v) -> str: 71 | """Parse a CRS string and return a PROJJSON string.""" 72 | try: 73 | crs = CRS.from_string(v) 74 | return crs.to_json() 75 | except Exception as e: 76 | raise ValueError(f"Invalid CRS string: {e}") 77 | 78 | @field_validator("geometry_types") 79 | @classmethod 80 | def only_unique_types(cls, v): 81 | if len(v) != len(set(v)): 82 | raise ValueError("geometry_types items must be unique!") 83 | return v 84 | 85 | @field_validator("bbox") 86 | @classmethod 87 | def must_be_length_4(cls, v): 88 | if v is not None and len(v) != 4: 89 | raise ValueError("bbox must be a list of 4 floats!") 90 | return v 91 | 92 | 93 | class GeoParquetMetadata(BaseModel): 94 | version: Annotated[ 95 | str, Field(description="The version of the GeoParquet format") 96 | ] = "1.1.0-dev" 97 | primary_column: Annotated[ 98 | str, Field(description="The name of the geometry primary column") 99 | ] = "geometry" 100 | columns: Annotated[ 101 | dict[str, GeometryColumnMetadata | dict | str], 102 | Field(description="Metadata for each column (keys)"), 103 | ] 104 | 105 | @model_validator(mode="after") 106 | def contains_primary_col(self) -> "GeoParquetMetadata": 107 | if not self.primary_column in 
self.columns.keys(): 108 | raise ValueError( 109 | f"primary column={self.primary_column} not in arg:columns={self.columns}" 110 | ) 111 | return self 112 | 113 | @model_validator(mode="after") 114 | def convert_geo_to_class(self) -> "GeoParquetMetadata": 115 | if not isinstance(self.columns[self.primary_column], GeometryColumnMetadata): 116 | if isinstance(self.columns[self.primary_column], str): 117 | self.columns[self.primary_column] = ast.literal_eval( 118 | self.columns[self.primary_column] 119 | ) 120 | if isinstance(self.columns[self.primary_column], dict): 121 | self.columns[self.primary_column] = GeometryColumnMetadata( 122 | **self.columns[self.primary_column] 123 | ) 124 | else: 125 | raise ValueError( 126 | f"Invalid primary column metadata: {self.columns[self.primary_column]}" 127 | ) 128 | return self 129 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/validate.py: -------------------------------------------------------------------------------- 1 | """For validating an existing GeoParquet file or Arrow table. 2 | 3 | Note that validating GeoParquet metadata can be handles with the 4 | `.schemas` module pydantic classes. 5 | """ 6 | 7 | import ast 8 | import pyarrow 9 | from geoparquet_pydantic.schemas import ( 10 | GeoParquetMetadata, 11 | ) 12 | from typing import Optional 13 | from pathlib import Path 14 | 15 | 16 | def _validate_geo_metadata(metadata: dict[bytes, bytes]) -> bool: 17 | try: 18 | geo_metadata = ast.literal_eval(metadata[b"geo"].decode("utf-8")) 19 | GeoParquetMetadata(**geo_metadata) 20 | print("Valid GeoParquet metadata!") 21 | return True 22 | except KeyError as e: 23 | print(f"Invalid GeoParquet metadata, could not find b'geo' key: {e}") 24 | except ValueError as e: 25 | print(f"Invalid GeoParquet metadata: {e}") 26 | return False 27 | 28 | 29 | def validate_geoparquet_table( 30 | table: pyarrow.Table, 31 | primary_column: Optional[str] = None, 32 | ) -> bool: 33 | """Validates a the GeoParquet metadata of a pyarrow.Table. 34 | 35 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md 36 | 37 | Args: 38 | table (pyarrow.Table): The table to validate. 39 | primary_column (Optional[str], optional): The name of the primary geometry column. 40 | Defaults to None. 41 | 42 | Returns: 43 | bool: True if the metadata is valid, False otherwise. 44 | """ 45 | if not primary_column: 46 | primary_column = "geometry" 47 | return _validate_geo_metadata(table.schema.metadata) 48 | 49 | 50 | def validate_geoparquet_file( 51 | geoparquet_file: str | Path | pyarrow.parquet.ParquetFile, 52 | primary_column: Optional[str] = None, 53 | read_file_kwargs: Optional[dict] = None, 54 | ) -> bool: 55 | """Validates that a parquet file has correct GeoParquet metadata without opening it. 56 | 57 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md 58 | 59 | Args: 60 | geoparquet_file (str | Path | ParquetFile): The file to validate. 61 | primary_column (str, optional): The primary column name. Defaults to 'geometry'. 62 | read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile(). 63 | See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile 64 | 65 | Returns: 66 | bool: True if the metadata is valid, False otherwise. 
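    Example (illustrative sketch; 'my_data.parquet' is a placeholder path)::

        if validate_geoparquet_file("my_data.parquet"):
            print("File carries valid GeoParquet metadata")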
67 | """ 68 | if not primary_column: 69 | primary_column = "geometry" 70 | default_read_file_kwargs = { 71 | "memory_map": True, 72 | } 73 | if read_file_kwargs is None: 74 | read_file_kwargs = default_read_file_kwargs 75 | elif isinstance(read_file_kwargs, dict): 76 | for k, v in default_read_file_kwargs.items(): 77 | if k not in read_file_kwargs: 78 | read_file_kwargs[k] = v 79 | else: 80 | raise TypeError(f"Optional param:read_file_kwargs must be a dict or None!") 81 | 82 | if isinstance(geoparquet_file, (str, Path)): 83 | geoparquet_file = pyarrow.parquet.ParquetFile( 84 | geoparquet_file, 85 | **read_file_kwargs, 86 | ) 87 | if not isinstance(geoparquet_file, pyarrow.parquet.ParquetFile): 88 | raise TypeError( 89 | "Input must be a file path (str | Path) or a ParquetFile object!" 90 | ) 91 | return _validate_geo_metadata(geoparquet_file.schema_arrow.metadata) 92 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import pyarrow 4 | from pathlib import Path 5 | from geojson_pydantic.features import FeatureCollection 6 | 7 | # get the path to the data directory 8 | TEST_DATA_DIR = Path(__file__).parent / "test_data" 9 | 10 | 11 | @pytest.fixture 12 | def valid_geojson_file() -> Path: 13 | valid_geojson = TEST_DATA_DIR / "valid.geojson" 14 | assert valid_geojson.exists() 15 | return valid_geojson 16 | 17 | 18 | @pytest.fixture 19 | def valid_geojson_obj(valid_geojson_file) -> FeatureCollection: 20 | return FeatureCollection(**json.load(open(valid_geojson_file, "r"))) 21 | 22 | 23 | @pytest.fixture 24 | def valid_geoparquet_file() -> Path: 25 | valid_geoparquet = TEST_DATA_DIR / "valid_geojson.parquet" 26 | assert valid_geoparquet.exists() 27 | return valid_geoparquet 28 | 29 | 30 | @pytest.fixture 31 | def valid_geoparquet_table(valid_geoparquet_file) -> pyarrow.Table: 32 | return pyarrow.parquet.read_table(valid_geoparquet_file) 33 | -------------------------------------------------------------------------------- /tests/test_conversions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import pyarrow 4 | from pathlib import Path 5 | import geojson_pydantic 6 | import geopandas as gpd 7 | import pyarrow.parquet 8 | from geojson_pydantic.features import FeatureCollection 9 | 10 | from geoparquet_pydantic.schemas import ( 11 | GeoParquetMetadata, 12 | ) 13 | from geoparquet_pydantic.convert import ( 14 | _to_wkb, 15 | _get_geom_types, 16 | _get_default_geo_metadata, 17 | _update_metadata, 18 | _validate_column_schema, 19 | geojson_to_geoparquet, 20 | geoparquet_to_geojson, 21 | ) 22 | import shapely 23 | 24 | from geoparquet_pydantic.validate import validate_geoparquet_file 25 | 26 | 27 | @pytest.fixture 28 | def geometry_type_examples( 29 | valid_geojson_obj: FeatureCollection, 30 | ) -> dict[str, geojson_pydantic.geometries._GeometryBase]: 31 | geometry_types = {} 32 | for feature in valid_geojson_obj.features: 33 | if feature.geometry.type not in geometry_types: 34 | geometry_types[feature.geometry.type] = feature.geometry 35 | assert len(geometry_types) == 7 36 | for k, v in geometry_types.items(): 37 | assert isinstance(v, getattr(geojson_pydantic.geometries, k)) 38 | 39 | return geometry_types 40 | 41 | 42 | @pytest.fixture 43 | def mock_table() -> pyarrow.Table: 44 | table_dict = { 45 | "col1": [1, 2, 3], 46 | "col2": [4, 5, 6], 47 | } 48 | 
metadata = {b"key": b"value"} 49 | table = pyarrow.Table.from_pydict(table_dict, metadata=metadata) 50 | assert isinstance(table, pyarrow.Table) 51 | assert table.schema.metadata == metadata 52 | return table 53 | 54 | 55 | def test_to_wkb( 56 | geometry_type_examples: dict[str, geojson_pydantic.geometries._GeometryBase] 57 | ): 58 | """Test the conversion of a GeoJSON object to WKB format.""" 59 | for k, v in geometry_type_examples.items(): 60 | wkb = _to_wkb(v) 61 | assert isinstance(wkb, bytes) 62 | assert len(wkb) > 0 63 | back_in = shapely.wkb.loads(wkb) 64 | assert isinstance(back_in, getattr(shapely.geometry, k)) 65 | 66 | 67 | def test_get_geom_types( 68 | valid_geojson_obj: FeatureCollection, 69 | ): 70 | """Test the extraction of unique geometry types from a GeoJSON object.""" 71 | geom_types = _get_geom_types(valid_geojson_obj.features) 72 | assert isinstance(geom_types, list) 73 | assert len(geom_types) == 7 74 | assert set(geom_types) == { 75 | "Point", 76 | "MultiPoint", 77 | "LineString", 78 | "MultiLineString", 79 | "Polygon", 80 | "MultiPolygon", 81 | "GeometryCollection", 82 | } 83 | 84 | 85 | def test_get_default_geo_metadata( 86 | valid_geojson_obj: FeatureCollection, 87 | ): 88 | default_metadata = _get_default_geo_metadata(valid_geojson_obj) 89 | assert isinstance(default_metadata, GeoParquetMetadata) 90 | assert default_metadata.columns["geometry"].geometry_types == _get_geom_types( 91 | valid_geojson_obj.features 92 | ) 93 | 94 | 95 | def test_update_metadata( 96 | mock_table: pyarrow.Table, 97 | ): 98 | new_metadata = {"new_key": "new_value"} 99 | new_table = _update_metadata(mock_table, new_metadata) 100 | assert isinstance(new_table, pyarrow.Table) 101 | assert b"new_key" in new_table.schema.metadata 102 | assert b"key" in new_table.schema.metadata 103 | 104 | 105 | def test_validate_column_schema( 106 | valid_geojson_obj: FeatureCollection, 107 | ): 108 | # make updated FeatureCollection properties 109 | for i, feature in enumerate(valid_geojson_obj.features): 110 | feature.properties["number"] = i 111 | mock_schema = pyarrow.schema( 112 | [ 113 | ("geometry", pyarrow.binary()), 114 | ("name", pyarrow.string()), 115 | ("number", pyarrow.int64()), 116 | ] 117 | ) 118 | # test with valid schema 119 | _validate_column_schema( 120 | mock_schema, 121 | primary_column="geometry", 122 | geojson=valid_geojson_obj, 123 | add_none_values=False, 124 | ) 125 | _validate_column_schema( 126 | mock_schema, 127 | primary_column="geometry", 128 | geojson=valid_geojson_obj, 129 | add_none_values=True, 130 | ) 131 | 132 | # test with invalid schema 133 | for i, feature in enumerate(valid_geojson_obj.features): 134 | if i % 2 == 0: 135 | feature.properties = {} 136 | assert not feature.properties 137 | with pytest.raises(ValueError): 138 | _validate_column_schema( 139 | mock_schema, 140 | "geometry", 141 | valid_geojson_obj, 142 | False, 143 | ) 144 | 145 | # now test that it can add Nones 146 | _validate_column_schema( 147 | mock_schema, 148 | "geometry", 149 | valid_geojson_obj, 150 | True, 151 | ) 152 | 153 | 154 | def test_geojson_to_geoparquet( 155 | valid_geojson_obj: FeatureCollection, 156 | ): 157 | """Test the conversion of a valid GeoJSON file and pydantic object to a valid 158 | GeoParquet table.""" 159 | 160 | # convert the GeoJSON object to a GeoParquet table with minimal optional 161 | table = geojson_to_geoparquet(valid_geojson_obj) 162 | assert isinstance(table, pyarrow.Table) 163 | table.validate(full=True) 164 | table_dict = table.to_pydict() 165 | assert 
"geometry" in table_dict 166 | assert len(table_dict["geometry"]) == len(valid_geojson_obj.features) 167 | assert "properties" in table_dict 168 | assert ( 169 | json.loads(table_dict["properties"][0]) 170 | == valid_geojson_obj.features[0].properties 171 | ) 172 | 173 | parquet_path = Path("test.parquet") 174 | pyarrow.parquet.write_table(table, parquet_path) 175 | assert parquet_path.exists() 176 | gdf = gpd.read_parquet(parquet_path) 177 | assert isinstance(gdf, gpd.GeoDataFrame) 178 | assert gdf.crs.to_string() == "OGC:CRS84" 179 | assert len(gdf) == len(valid_geojson_obj.features) 180 | parquet_path.unlink() 181 | 182 | # try again with more arguments 183 | metadata = _get_default_geo_metadata(valid_geojson_obj) 184 | table = geojson_to_geoparquet( 185 | valid_geojson_obj, 186 | geo_metadata=metadata, 187 | column_schema=pyarrow.schema( 188 | [ 189 | ("geometry", pyarrow.binary()), 190 | ("name", pyarrow.string()), 191 | ] 192 | ), 193 | ) 194 | assert isinstance(table, pyarrow.Table) 195 | table.validate(full=True) 196 | table_dict = table.to_pydict() 197 | assert "geometry" in table_dict 198 | assert len(table_dict["geometry"]) == len(valid_geojson_obj.features) 199 | assert "name" in table_dict 200 | 201 | parquet_path = Path("test.parquet") 202 | pyarrow.parquet.write_table(table, parquet_path) 203 | assert parquet_path.exists() 204 | gdf = gpd.read_parquet(parquet_path) 205 | assert isinstance(gdf, gpd.GeoDataFrame) 206 | assert gdf.crs.to_string() == "OGC:CRS84" 207 | assert len(gdf) == len(valid_geojson_obj.features) 208 | parquet_path.unlink() 209 | 210 | 211 | def test_bad_geojson_to_geoparquet( 212 | valid_geojson_obj: FeatureCollection, 213 | ): 214 | """Test the error handling of bad inputs.""" 215 | # test bad geo_metadata 216 | with pytest.raises(ValueError): 217 | geojson_to_geoparquet( 218 | valid_geojson_obj, 219 | geo_metadata={"NOT": "VALID"}, 220 | ) 221 | 222 | # test bad column_schema 223 | with pytest.raises(ValueError): 224 | geojson_to_geoparquet( 225 | valid_geojson_obj, 226 | column_schema={"NOT": "VALID"}, 227 | ) 228 | 229 | # cant have properties as a column with other columns 230 | with pytest.raises(ValueError): 231 | geojson_to_geoparquet( 232 | valid_geojson_obj, 233 | column_schema=pyarrow.schema( 234 | [ 235 | ("geometry", pyarrow.binary()), 236 | ("properties", pyarrow.string()), 237 | ("name", pyarrow.string()), 238 | ] 239 | ), 240 | ) 241 | 242 | 243 | def test_valid_geoparquet_to_geojson( 244 | valid_geoparquet_file: Path, 245 | ): 246 | """Test the conversion of a valid GeoParquet file to a valid GeoJSON object.""" 247 | # test defaults 248 | geojson = geoparquet_to_geojson(valid_geoparquet_file) 249 | assert isinstance(geojson, FeatureCollection) 250 | assert len(geojson.features) == 7 251 | for feature in geojson.features: 252 | assert isinstance(feature, geojson_pydantic.features.Feature) 253 | assert isinstance( 254 | feature.geometry, geojson_pydantic.geometries._GeometryBase 255 | ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase) 256 | assert isinstance(feature.properties, dict) 257 | assert isinstance(feature.bbox, tuple) 258 | assert len(feature.bbox) == 4 259 | 260 | # test with max_chunk_size = 1 261 | geojson = geoparquet_to_geojson(valid_geoparquet_file, max_chunksize=1) 262 | assert isinstance(geojson, FeatureCollection) 263 | assert len(geojson.features) == 7 264 | for feature in geojson.features: 265 | assert isinstance(feature, geojson_pydantic.features.Feature) 266 | assert isinstance( 267 | 
feature.geometry, geojson_pydantic.geometries._GeometryBase 268 | ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase) 269 | assert isinstance(feature.properties, dict) 270 | assert isinstance(feature.bbox, tuple) 271 | assert len(feature.bbox) == 4 272 | 273 | 274 | def test_bad_geoparquet_to_geojson(): 275 | # first we start with a table missing geo 276 | table = pyarrow.Table.from_pydict( 277 | { 278 | "geometry": [b"NOT_VALID_GEOMETRY"], 279 | "properties": ["{}"], 280 | } 281 | ) 282 | parquet_path = Path("test.parquet") 283 | pyarrow.parquet.write_table(table, parquet_path) 284 | assert parquet_path.exists() 285 | with pytest.raises(ValueError): 286 | geoparquet_to_geojson(parquet_path) 287 | parquet_path.unlink() 288 | 289 | # now we test bad inputs 290 | with pytest.raises(ValueError): 291 | geoparquet_to_geojson(-999) 292 | with pytest.raises(ValueError): 293 | geoparquet_to_geojson(table, primary_column="NOT_VALID_COLUMN") 294 | -------------------------------------------------------------------------------- /tests/test_data/valid.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "geometry": { 7 | "type": "Point", 8 | "coordinates": [0, 0] 9 | }, 10 | "properties": { 11 | "name": "Point Feature" 12 | } 13 | }, 14 | { 15 | "type": "Feature", 16 | "geometry": { 17 | "type": "MultiPoint", 18 | "coordinates": [[1, 1], [2, 2], [3, 3]] 19 | }, 20 | "properties": { 21 | "name": "MultiPoint Feature" 22 | } 23 | }, 24 | { 25 | "type": "Feature", 26 | "geometry": { 27 | "type": "LineString", 28 | "coordinates": [[4, 4], [5, 5], [6, 6]] 29 | }, 30 | "properties": { 31 | "name": "LineString Feature" 32 | } 33 | }, 34 | { 35 | "type": "Feature", 36 | "geometry": { 37 | "type": "MultiLineString", 38 | "coordinates": [[[7, 7], [8, 8]], [[9, 9], [10, 10]]] 39 | }, 40 | "properties": { 41 | "name": "MultiLineString Feature" 42 | } 43 | }, 44 | { 45 | "type": "Feature", 46 | "geometry": { 47 | "type": "Polygon", 48 | "coordinates": [[[11, 11], [12, 12], [13, 13], [11, 11]]] 49 | }, 50 | "properties": { 51 | "name": "Polygon Feature" 52 | } 53 | }, 54 | { 55 | "type": "Feature", 56 | "geometry": { 57 | "type": "MultiPolygon", 58 | "coordinates": [[[[14, 14], [15, 15], [16, 16], [14, 14]]], [[[17, 17], [18, 18], [19, 19], [17, 17]]]] 59 | }, 60 | "properties": { 61 | "name": "MultiPolygon Feature" 62 | } 63 | }, 64 | { 65 | "type": "Feature", 66 | "geometry": { 67 | "type": "GeometryCollection", 68 | "geometries": [ 69 | { 70 | "type": "Point", 71 | "coordinates": [20, 20] 72 | }, 73 | { 74 | "type": "LineString", 75 | "coordinates": [[21, 21], [22, 22], [23, 23]] 76 | }, 77 | { 78 | "type": "Polygon", 79 | "coordinates": [[[24, 24], [25, 25], [26, 26], [24, 24]]] 80 | } 81 | ] 82 | }, 83 | "properties": { 84 | "name": "GeometryCollection Feature" 85 | } 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /tests/test_data/valid_geojson.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/tests/test_data/valid_geojson.parquet -------------------------------------------------------------------------------- /tests/test_schemas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyproj import CRS 3 | 
from geoparquet_pydantic.schemas import ( 4 | GeometryColumnMetadata, 5 | GeoParquetMetadata, 6 | ) 7 | 8 | 9 | @pytest.fixture 10 | def good_geo_column_metadata(): 11 | return { 12 | "encoding": "WKB", 13 | "geometry_types": ["Point"], 14 | "crs": "OGC:CRS84", 15 | "edges": "planar", 16 | "bbox": [0, 0, 25, 25], 17 | "epoch": None, 18 | "orientation": "counterclockwise", 19 | } 20 | 21 | 22 | def test_good_geo_column_metadata(good_geo_column_metadata): 23 | metadata = GeometryColumnMetadata(**good_geo_column_metadata) 24 | assert metadata.encoding == good_geo_column_metadata["encoding"] 25 | assert metadata.geometry_types == good_geo_column_metadata["geometry_types"] 26 | assert metadata.crs != good_geo_column_metadata["crs"] 27 | assert CRS.from_json(metadata.crs).to_string() == good_geo_column_metadata["crs"] 28 | assert metadata.edges == good_geo_column_metadata["edges"] 29 | assert metadata.bbox == good_geo_column_metadata["bbox"] 30 | assert metadata.epoch == None 31 | assert metadata.orientation == good_geo_column_metadata["orientation"] 32 | 33 | 34 | def test_bad_geo_column_metadata(good_geo_column_metadata): 35 | """Test that the GeoColumnMetadata raises an error when given bad data.""" 36 | 37 | # Test bad encoding 38 | bad_encoding = good_geo_column_metadata.copy() 39 | bad_encoding["encoding"] = "WKT" 40 | with pytest.raises(ValueError): 41 | GeometryColumnMetadata(**bad_encoding) 42 | 43 | # Test bad geometry types 44 | bad_geometry_types = good_geo_column_metadata.copy() 45 | bad_geometry_types["geometry_types"] = ["NOT_A_REAL_TIME"] 46 | with pytest.raises(ValueError): 47 | GeometryColumnMetadata(**bad_geometry_types) 48 | 49 | # Test bad CRS 50 | bad_crs = good_geo_column_metadata.copy() 51 | bad_crs["crs"] = "NOT_A_REAL_CRS" 52 | with pytest.raises(ValueError): 53 | GeometryColumnMetadata(**bad_crs) 54 | 55 | # Test bad edges 56 | bad_edges = good_geo_column_metadata.copy() 57 | bad_edges["edges"] = "NOT_A_REAL_EDGE" 58 | with pytest.raises(ValueError): 59 | GeometryColumnMetadata(**bad_edges) 60 | 61 | # Test bad bbox 62 | bad_bbox = good_geo_column_metadata.copy() 63 | bad_bbox["bbox"] = [0, 0, 25] 64 | with pytest.raises(ValueError): 65 | GeometryColumnMetadata(**bad_bbox) 66 | 67 | # Test bad epoch 68 | bad_epoch = good_geo_column_metadata.copy() 69 | bad_epoch["epoch"] = "NOT_A_REAL_EPOCH" 70 | with pytest.raises(ValueError): 71 | GeometryColumnMetadata(**bad_epoch) 72 | 73 | # Test bad orientation 74 | bad_orientation = good_geo_column_metadata.copy() 75 | bad_orientation["orientation"] = "NOT_A_REAL_ORIENTATION" 76 | with pytest.raises(ValueError): 77 | GeometryColumnMetadata(**bad_orientation) 78 | 79 | 80 | def test_good_geoparquet(good_geo_column_metadata): 81 | 82 | # minimum inputs 83 | geo_parquet = GeoParquetMetadata( 84 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 85 | ) 86 | assert geo_parquet.version == "1.1.0-dev" 87 | assert geo_parquet.primary_column == "geometry" 88 | assert isinstance(geo_parquet.columns, dict) 89 | assert len(geo_parquet.columns) == 1 90 | assert "geometry" in geo_parquet.columns 91 | assert isinstance(geo_parquet.columns["geometry"], GeometryColumnMetadata) 92 | 93 | # maximum inputs 94 | geo_parquet = GeoParquetMetadata( 95 | version="1.0.0", 96 | primary_column="geom", 97 | columns={"geom": GeometryColumnMetadata(**good_geo_column_metadata)}, 98 | ) 99 | assert geo_parquet.version == "1.0.0" 100 | assert geo_parquet.primary_column == "geom" 101 | assert isinstance(geo_parquet.columns, dict) 102 | 
assert len(geo_parquet.columns) == 1 103 | assert "geom" in geo_parquet.columns 104 | assert isinstance(geo_parquet.columns["geom"], GeometryColumnMetadata) 105 | 106 | 107 | def test_bad_geoparquet(good_geo_column_metadata): 108 | 109 | # Test bad version 110 | with pytest.raises(ValueError): 111 | GeoParquetMetadata( 112 | version=1.431243, 113 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 114 | ) 115 | 116 | # Test bad primary_column 117 | with pytest.raises(ValueError): 118 | GeoParquetMetadata( 119 | primary_column=1.431243, 120 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 121 | ) 122 | 123 | # Test bad columns 124 | with pytest.raises(ValueError): 125 | GeoParquetMetadata( 126 | columns={"geometry": "NOT_A_REAL_METADATA"}, 127 | ) 128 | with pytest.raises(ValueError): 129 | GeoParquetMetadata( 130 | columns="NOT_EVEN_A_DICT", 131 | ) 132 | with pytest.raises(ValueError): 133 | GeoParquetMetadata( 134 | columns={"geometry": {"A_DICT": "BUT_NOT_VALID"}}, 135 | ) 136 | 137 | # Test missing primary_column 138 | with pytest.raises(ValueError): 139 | GeoParquetMetadata( 140 | primary_column="NOT_A_REAL_COLUMN", 141 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 142 | ) 143 | -------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | import pytest 3 | from pathlib import Path 4 | from geoparquet_pydantic.validate import ( 5 | validate_geoparquet_table, 6 | validate_geoparquet_file, 7 | ) 8 | 9 | 10 | @pytest.fixture 11 | def no_geo_metadata_table() -> pyarrow.Table: 12 | return pyarrow.Table.from_pydict( 13 | { 14 | "geometry": [None], 15 | "id": [1], 16 | }, 17 | metadata={"NOTGEO": "metadata"}, 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def bad_geo_metadata_table() -> pyarrow.Table: 23 | return pyarrow.Table.from_pydict( 24 | { 25 | "geometry": [None], 26 | "id": [1], 27 | }, 28 | metadata={ 29 | b"geo": b"{'version': '1.1.0-dev', 'primary_column': 'geometry', 'columns': {'geometry': 'not-a-geometry'}}" 30 | }, 31 | ) 32 | 33 | 34 | def test_valididate_geoparquet_table(valid_geoparquet_table): 35 | """Test the validation of a valid GeoParquet table.""" 36 | assert validate_geoparquet_table(valid_geoparquet_table) 37 | 38 | 39 | def test_invalid_geoparquet_table(no_geo_metadata_table, bad_geo_metadata_table): 40 | """Test the validation of an invalid GeoParquet table.""" 41 | assert validate_geoparquet_table(no_geo_metadata_table) == False 42 | assert validate_geoparquet_table(bad_geo_metadata_table) == False 43 | 44 | 45 | def test_valid_geoparquet_file(valid_geoparquet_file: Path): 46 | """Test the validation of a valid GeoParquet file.""" 47 | assert validate_geoparquet_file(valid_geoparquet_file) 48 | assert validate_geoparquet_file(str(valid_geoparquet_file)) 49 | assert validate_geoparquet_file( 50 | pyarrow.parquet.ParquetFile(valid_geoparquet_file), 51 | ) 52 | 53 | 54 | def test_invalid_geoparquet_file(no_geo_metadata_table, bad_geo_metadata_table): 55 | """Test the validation of an invalid GeoParquet file.""" 56 | pyarrow.parquet.write_table(no_geo_metadata_table, "test1.parquet") 57 | assert validate_geoparquet_file("test1.parquet") == False 58 | Path("test1.parquet").unlink() 59 | 60 | pyarrow.parquet.write_table(no_geo_metadata_table, "test2.parquet") 61 | assert validate_geoparquet_file("test2.parquet") == False 62 | 
Path("test2.parquet").unlink() 63 | --------------------------------------------------------------------------------