├── requirements.txt ├── imgs └── repo_logo.png ├── dev-requirements.txt ├── tests ├── test_data │ ├── valid_geojson.parquet │ └── valid.geojson ├── conftest.py ├── test_validation.py ├── test_schemas.py └── test_conversions.py ├── .github └── workflows │ ├── pre-commit.yml │ ├── publish_package.yml │ └── tests.yml ├── src └── geoparquet_pydantic │ ├── __init__.py │ ├── validate.py │ ├── schemas.py │ └── convert.py ├── pyproject.toml ├── .pre-commit-config.yaml ├── LICENSE ├── pypi_README.md ├── .gitignore └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | geojson-pydantic 2 | pyarrow 3 | shapely 4 | pyproj 5 | click 6 | -------------------------------------------------------------------------------- /imgs/repo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/HEAD/imgs/repo_logo.png -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | wheel 3 | pytest 4 | pytest-cov 5 | black 6 | pyright 7 | pre-commit 8 | geopandas 9 | -------------------------------------------------------------------------------- /tests/test_data/valid_geojson.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/HEAD/tests/test_data/valid_geojson.parquet -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | pre-commit: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - uses: actions/setup-python@v3 12 | - uses: 
pre-commit/action@v3.0.1 13 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | from .schemas import ( 3 | GeometryColumnMetadata, 4 | GeoParquetMetadata, 5 | ) 6 | from .convert import ( 7 | geojson_to_geoparquet, 8 | geoparquet_to_geojson, 9 | ) 10 | from .validate import ( 11 | validate_geoparquet_table, 12 | validate_geoparquet_file, 13 | ) 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "setuptools-scm", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "geoparquet_pydantic" 10 | description = "Read/write geoparquet with the geojson-pydanic models." 11 | requires-python = ">=3.11" 12 | keywords = [ 13 | "geoparquet", 14 | "pydantic", 15 | "geospatial", 16 | ] 17 | license = {text = "MIT"} 18 | classifiers = [ 19 | "Programming Language :: Python :: 3", 20 | ] 21 | dependencies = [ 22 | "geojson-pydantic", 23 | "pyarrow", 24 | "shapely", 25 | "pyproj", 26 | "click", 27 | ] 28 | dynamic = [ 29 | "version", 30 | "readme", 31 | ] 32 | 33 | [tool.setuptools.packages.find] 34 | where = ["src"] 35 | include = ["geoparquet_pydantic*"] 36 | 37 | [tool.setuptools.dynamic] 38 | version = {attr = "geoparquet_pydantic.__version__"} 39 | readme = {file = "pypi_README.md"} 40 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # - repo: https://github.com/RobertCraigie/pyright-python 3 | # rev: v1.1.351 4 | # hooks: 5 | # - id: pyright 6 | # exclude: ^tests/.* 7 | # additional_dependencies: [pyarrow, shapely, geojson-pydantic] 8 
| - repo: https://github.com/PyCQA/docformatter 9 | rev: v1.7.5 10 | hooks: 11 | - id: docformatter 12 | additional_dependencies: [tomli] 13 | args: [--black, --in-place] 14 | 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v4.4.0 17 | hooks: 18 | - id: trailing-whitespace 19 | - id: check-ast 20 | - id: check-case-conflict 21 | - id: debug-statements 22 | - id: end-of-file-fixer 23 | - id: check-docstring-first 24 | - id: check-added-large-files 25 | 26 | - repo: https://github.com/psf/black-pre-commit-mirror 27 | rev: 24.2.0 28 | hooks: 29 | - id: black 30 | language_version: python3.12 31 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import pyarrow 4 | from pathlib import Path 5 | from geojson_pydantic.features import FeatureCollection 6 | 7 | # get the path to the data directory 8 | TEST_DATA_DIR = Path(__file__).parent / "test_data" 9 | 10 | 11 | @pytest.fixture 12 | def valid_geojson_file() -> Path: 13 | valid_geojson = TEST_DATA_DIR / "valid.geojson" 14 | assert valid_geojson.exists() 15 | return valid_geojson 16 | 17 | 18 | @pytest.fixture 19 | def valid_geojson_obj(valid_geojson_file) -> FeatureCollection: 20 | return FeatureCollection(**json.load(open(valid_geojson_file, "r"))) 21 | 22 | 23 | @pytest.fixture 24 | def valid_geoparquet_file() -> Path: 25 | valid_geoparquet = TEST_DATA_DIR / "valid_geojson.parquet" 26 | assert valid_geoparquet.exists() 27 | return valid_geoparquet 28 | 29 | 30 | @pytest.fixture 31 | def valid_geoparquet_table(valid_geoparquet_file) -> pyarrow.Table: 32 | return pyarrow.parquet.read_table(valid_geoparquet_file) 33 | -------------------------------------------------------------------------------- /.github/workflows/publish_package.yml: -------------------------------------------------------------------------------- 1 | # This 
workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | name: Publish Release to PyPI 5 | 6 | on: 7 | release: 8 | types: [published] 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | deploy: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: '3.12' 25 | 26 | - name: Install publishing dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install --upgrade build 30 | 31 | - name: Build package 32 | run: python -m build --sdist --wheel . --outdir dist 33 | 34 | - name: Publish package to PyPi 35 | uses: pypa/gh-action-pypi-publish@release/v1 36 | with: 37 | password: ${{ secrets.PYPI_PASSWORD }} 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Xavier Nogueira 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.11", "3.12"] 17 | os: [ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Install Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python --version 30 | pip install -U pip 31 | pip install -r requirements.txt 32 | pip install -r dev-requirements.txt 33 | pip list 34 | 35 | - name: Install the library 36 | run: | 37 | pip install -e . 
38 | pip list 39 | 40 | - name: Run tests 41 | run: pytest --cov=geoparquet_pydantic --cov-report=xml tests/ 42 | 43 | - name: Upload coverage report to CodeCov 44 | uses: codecov/codecov-action@v3 45 | env: 46 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 47 | -------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | import pytest 3 | from pathlib import Path 4 | from geoparquet_pydantic.validate import ( 5 | validate_geoparquet_table, 6 | validate_geoparquet_file, 7 | ) 8 | 9 | 10 | @pytest.fixture 11 | def no_geo_metadata_table() -> pyarrow.Table: 12 | return pyarrow.Table.from_pydict( 13 | { 14 | "geometry": [None], 15 | "id": [1], 16 | }, 17 | metadata={"NOTGEO": "metadata"}, 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def bad_geo_metadata_table() -> pyarrow.Table: 23 | return pyarrow.Table.from_pydict( 24 | { 25 | "geometry": [None], 26 | "id": [1], 27 | }, 28 | metadata={ 29 | b"geo": b"{'version': '1.1.0-dev', 'primary_column': 'geometry', 'columns': {'geometry': 'not-a-geometry'}}" 30 | }, 31 | ) 32 | 33 | 34 | def test_valididate_geoparquet_table(valid_geoparquet_table): 35 | """Test the validation of a valid GeoParquet table.""" 36 | assert validate_geoparquet_table(valid_geoparquet_table) 37 | 38 | 39 | def test_invalid_geoparquet_table(no_geo_metadata_table, bad_geo_metadata_table): 40 | """Test the validation of an invalid GeoParquet table.""" 41 | assert validate_geoparquet_table(no_geo_metadata_table) == False 42 | assert validate_geoparquet_table(bad_geo_metadata_table) == False 43 | 44 | 45 | def test_valid_geoparquet_file(valid_geoparquet_file: Path): 46 | """Test the validation of a valid GeoParquet file.""" 47 | assert validate_geoparquet_file(valid_geoparquet_file) 48 | assert validate_geoparquet_file(str(valid_geoparquet_file)) 49 | assert validate_geoparquet_file( 50 | 
pyarrow.parquet.ParquetFile(valid_geoparquet_file), 51 | ) 52 | 53 | 54 | def test_invalid_geoparquet_file(no_geo_metadata_table, bad_geo_metadata_table): 55 | """Test the validation of an invalid GeoParquet file.""" 56 | pyarrow.parquet.write_table(no_geo_metadata_table, "test1.parquet") 57 | assert validate_geoparquet_file("test1.parquet") == False 58 | Path("test1.parquet").unlink() 59 | 60 | pyarrow.parquet.write_table(no_geo_metadata_table, "test2.parquet") 61 | assert validate_geoparquet_file("test2.parquet") == False 62 | Path("test2.parquet").unlink() 63 | -------------------------------------------------------------------------------- /tests/test_data/valid.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "geometry": { 7 | "type": "Point", 8 | "coordinates": [0, 0] 9 | }, 10 | "properties": { 11 | "name": "Point Feature" 12 | } 13 | }, 14 | { 15 | "type": "Feature", 16 | "geometry": { 17 | "type": "MultiPoint", 18 | "coordinates": [[1, 1], [2, 2], [3, 3]] 19 | }, 20 | "properties": { 21 | "name": "MultiPoint Feature" 22 | } 23 | }, 24 | { 25 | "type": "Feature", 26 | "geometry": { 27 | "type": "LineString", 28 | "coordinates": [[4, 4], [5, 5], [6, 6]] 29 | }, 30 | "properties": { 31 | "name": "LineString Feature" 32 | } 33 | }, 34 | { 35 | "type": "Feature", 36 | "geometry": { 37 | "type": "MultiLineString", 38 | "coordinates": [[[7, 7], [8, 8]], [[9, 9], [10, 10]]] 39 | }, 40 | "properties": { 41 | "name": "MultiLineString Feature" 42 | } 43 | }, 44 | { 45 | "type": "Feature", 46 | "geometry": { 47 | "type": "Polygon", 48 | "coordinates": [[[11, 11], [12, 12], [13, 13], [11, 11]]] 49 | }, 50 | "properties": { 51 | "name": "Polygon Feature" 52 | } 53 | }, 54 | { 55 | "type": "Feature", 56 | "geometry": { 57 | "type": "MultiPolygon", 58 | "coordinates": [[[[14, 14], [15, 15], [16, 16], [14, 14]]], [[[17, 17], [18, 18], [19, 19], 
[17, 17]]]] 59 | }, 60 | "properties": { 61 | "name": "MultiPolygon Feature" 62 | } 63 | }, 64 | { 65 | "type": "Feature", 66 | "geometry": { 67 | "type": "GeometryCollection", 68 | "geometries": [ 69 | { 70 | "type": "Point", 71 | "coordinates": [20, 20] 72 | }, 73 | { 74 | "type": "LineString", 75 | "coordinates": [[21, 21], [22, 22], [23, 23]] 76 | }, 77 | { 78 | "type": "Polygon", 79 | "coordinates": [[[24, 24], [25, 25], [26, 26], [24, 24]]] 80 | } 81 | ] 82 | }, 83 | "properties": { 84 | "name": "GeometryCollection Feature" 85 | } 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /pypi_README.md: -------------------------------------------------------------------------------- 1 | # GeoParquet-Pydantic 2 | 3 | **Motivation:** This project started at the 2024 San Francisco GeoParquet Community hackathon, and arose out of a simple observation: 4 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions? 5 | 6 | **Is this library the right choice for you?:** 7 | * Do you need to use a wide variety of Geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyways, 8 | making this library's conversion functions *probably* redundant. 9 | * Is your workflow command line centric? If so you may want to consider Planet Labs' similar CLI tool [`gpq`](https://github.com/planetlabs/gpq), 10 | which is written in Go and substantially faster. 11 | * Otherwise, if you are using Python and want to avoid unnecessary bulky dependencies, this library will be a great choice! 12 | 13 | **Note:** All user-exposed functions and schema classes are available at the top level (i.e., `geoparquet_pydantic.validate_geoparquet_table(...)`) of this library. 
14 | 15 | # Documentation is on GitHub [here](https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/README.md) 16 | 17 | # Getting Started 18 | 19 | Install from [PyPi](https://pypi.org/project/geoparquet-pydantic): 20 | ```bash 21 | pip install geoparquet-pydantic 22 | ``` 23 | 24 | Or from source: 25 | ```bash 26 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git 27 | $ cd geoparquet-pydantic 28 | $ pip install . 29 | ``` 30 | 31 | Then import with an underscore: 32 | ```python 33 | import geoparquet_pydantic 34 | ``` 35 | 36 | Or just import the functions/classes you need from the top-level: 37 | ```python 38 | from geoparquet_pydantic import GeometryColumnMetadata 39 | from geoparquet_pydantic import GeoParquetMetadata 40 | from geoparquet_pydantic import validate_geoparquet_table 41 | from geoparquet_pydantic import validate_geoparquet_file 42 | from geoparquet_pydantic import geojson_to_geoparquet 43 | from geoparquet_pydantic import geoparquet_to_geojson 44 | ``` 45 | 46 | # Roadmap 47 | 48 | - [ ] Make CLI file<>file functions w/ `click`. 49 | - [ ] Add parallelized Parquet read for `geoparquet_pydantic.geoparquet_to_geojson()`. 50 | 51 | # Contribute 52 | 53 | We encourage contributions, feature requests, and bug reports! 54 | 55 | Here is our recommended workflow: 56 | 57 | * Use `dev-requirements.txt` to install our development dependencies. 58 | * Make your edits using `pyright` as a linter. 59 | * Use `pre-commit run --all-files` before committing your work. 60 | * If you add a new feature, we request that you add test coverage for it. 61 | 62 | Happy coding! 63 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/validate.py: -------------------------------------------------------------------------------- 1 | """For validating an existing GeoParquet file or Arrow table. 
2 | 3 | Note that validating GeoParquet metadata can be handles with the 4 | `.schemas` module pydantic classes. 5 | """ 6 | 7 | import ast 8 | import pyarrow 9 | from geoparquet_pydantic.schemas import ( 10 | GeoParquetMetadata, 11 | ) 12 | from typing import Optional 13 | from pathlib import Path 14 | 15 | 16 | def _validate_geo_metadata(metadata: dict[bytes, bytes]) -> bool: 17 | try: 18 | geo_metadata = ast.literal_eval(metadata[b"geo"].decode("utf-8")) 19 | GeoParquetMetadata(**geo_metadata) 20 | print("Valid GeoParquet metadata!") 21 | return True 22 | except KeyError as e: 23 | print(f"Invalid GeoParquet metadata, could not find b'geo' key: {e}") 24 | except ValueError as e: 25 | print(f"Invalid GeoParquet metadata: {e}") 26 | return False 27 | 28 | 29 | def validate_geoparquet_table( 30 | table: pyarrow.Table, 31 | primary_column: Optional[str] = None, 32 | ) -> bool: 33 | """Validates a the GeoParquet metadata of a pyarrow.Table. 34 | 35 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md 36 | 37 | Args: 38 | table (pyarrow.Table): The table to validate. 39 | primary_column (Optional[str], optional): The name of the primary geometry column. 40 | Defaults to None. 41 | 42 | Returns: 43 | bool: True if the metadata is valid, False otherwise. 44 | """ 45 | if not primary_column: 46 | primary_column = "geometry" 47 | return _validate_geo_metadata(table.schema.metadata) 48 | 49 | 50 | def validate_geoparquet_file( 51 | geoparquet_file: str | Path | pyarrow.parquet.ParquetFile, 52 | primary_column: Optional[str] = None, 53 | read_file_kwargs: Optional[dict] = None, 54 | ) -> bool: 55 | """Validates that a parquet file has correct GeoParquet metadata without opening it. 56 | 57 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md 58 | 59 | Args: 60 | geoparquet_file (str | Path | ParquetFile): The file to validate. 61 | primary_column (str, optional): The primary column name. 
Defaults to 'geometry'. 62 | read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile(). 63 | See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile 64 | 65 | Returns: 66 | bool: True if the metadata is valid, False otherwise. 67 | """ 68 | if not primary_column: 69 | primary_column = "geometry" 70 | default_read_file_kwargs = { 71 | "memory_map": True, 72 | } 73 | if read_file_kwargs is None: 74 | read_file_kwargs = default_read_file_kwargs 75 | elif isinstance(read_file_kwargs, dict): 76 | for k, v in default_read_file_kwargs.items(): 77 | if k not in read_file_kwargs: 78 | read_file_kwargs[k] = v 79 | else: 80 | raise TypeError(f"Optional param:read_file_kwargs must be a dict or None!") 81 | 82 | if isinstance(geoparquet_file, (str, Path)): 83 | geoparquet_file = pyarrow.parquet.ParquetFile( 84 | geoparquet_file, 85 | **read_file_kwargs, 86 | ) 87 | if not isinstance(geoparquet_file, pyarrow.parquet.ParquetFile): 88 | raise TypeError( 89 | "Input must be a file path (str | Path) or a ParquetFile object!" 90 | ) 91 | return _validate_geo_metadata(geoparquet_file.schema_arrow.metadata) 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /src/geoparquet_pydantic/schemas.py: -------------------------------------------------------------------------------- 1 | """Pydantic models for GeoParquet metadata.""" 2 | 3 | import ast 4 | from pydantic import BeforeValidator, Field, BaseModel, field_validator, model_validator 5 | from typing import Annotated, Optional, Literal, Union 6 | from pyproj import CRS 7 | 8 | EdgeType = Literal["planar", "spherical"] 9 | 10 | FlatGeometryTypes = Annotated[ 11 | # TODO: support 3d geometries with Z suffix 12 | Literal[ 13 | "Point", 14 | "MultiPoint", 15 | "LineString", 16 | "MultiLineString", 17 | "Polygon", 18 | "MultiPolygon", 19 | "GeometryCollection", 20 | ], 21 | Field(description="The geometry types supported by the column"), 22 | ] 23 | 24 | ZGeometryTypes = Annotated[ 25 | Literal[ 26 | "PointZ", 27 | "MultiPointZ", 28 | "LineStringZ", 29 | "MultiLineStringZ", 30 | "PolygonZ", 31 | "MultiPolygonZ", 32 | "GeometryCollectionZ", 33 | ], 34 | Field(description="3D geometry types supported by the column"), 35 | ] 36 | 37 | GeometryTypes = Union[FlatGeometryTypes, ZGeometryTypes] 38 | 39 | 40 | class GeometryColumnMetadata(BaseModel): 41 | encoding: Literal["WKB"] 42 | geometry_types: list[GeometryTypes] 43 | 44 | crs: Annotated[ 45 | str, 46 | Field( 47 | description="The CRS of the geometry column in a string format readable by pyproj. 
Is the converted to PROJJSON format" 48 | ), 49 | ] = "OGC:CRS84" 50 | 51 | edges: Annotated[ 52 | EdgeType, Field(description="The type of edges of the geometries") 53 | ] = "planar" 54 | 55 | bbox: Optional[ 56 | Annotated[list[float], Field(description="The bounding box of the geometries")] 57 | ] = None 58 | 59 | epoch: Optional[ 60 | Annotated[ 61 | Union[int, float], 62 | Field(description="Coordinate epoch in case of a dynamic CRS"), 63 | ] 64 | ] = None 65 | 66 | orientation: Literal["counterclockwise"] = "counterclockwise" 67 | 68 | @field_validator("crs") 69 | @classmethod 70 | def convert_crs_to_projjson(cls, v) -> str: 71 | """Parse a CRS string and return a PROJJSON string.""" 72 | try: 73 | crs = CRS.from_string(v) 74 | return crs.to_json() 75 | except Exception as e: 76 | raise ValueError(f"Invalid CRS string: {e}") 77 | 78 | @field_validator("geometry_types") 79 | @classmethod 80 | def only_unique_types(cls, v): 81 | if len(v) != len(set(v)): 82 | raise ValueError("geometry_types items must be unique!") 83 | return v 84 | 85 | @field_validator("bbox") 86 | @classmethod 87 | def must_be_length_4(cls, v): 88 | if v is not None and len(v) != 4: 89 | raise ValueError("bbox must be a list of 4 floats!") 90 | return v 91 | 92 | 93 | class GeoParquetMetadata(BaseModel): 94 | version: Annotated[ 95 | str, Field(description="The version of the GeoParquet format") 96 | ] = "1.1.0-dev" 97 | primary_column: Annotated[ 98 | str, Field(description="The name of the geometry primary column") 99 | ] = "geometry" 100 | columns: Annotated[ 101 | dict[str, GeometryColumnMetadata | dict | str], 102 | Field(description="Metadata for each column (keys)"), 103 | ] 104 | 105 | @model_validator(mode="after") 106 | def contains_primary_col(self) -> "GeoParquetMetadata": 107 | if not self.primary_column in self.columns.keys(): 108 | raise ValueError( 109 | f"primary column={self.primary_column} not in arg:columns={self.columns}" 110 | ) 111 | return self 112 | 113 | 
@model_validator(mode="after") 114 | def convert_geo_to_class(self) -> "GeoParquetMetadata": 115 | if not isinstance(self.columns[self.primary_column], GeometryColumnMetadata): 116 | if isinstance(self.columns[self.primary_column], str): 117 | self.columns[self.primary_column] = ast.literal_eval( 118 | self.columns[self.primary_column] 119 | ) 120 | if isinstance(self.columns[self.primary_column], dict): 121 | self.columns[self.primary_column] = GeometryColumnMetadata( 122 | **self.columns[self.primary_column] 123 | ) 124 | else: 125 | raise ValueError( 126 | f"Invalid primary column metadata: {self.columns[self.primary_column]}" 127 | ) 128 | return self 129 | -------------------------------------------------------------------------------- /tests/test_schemas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyproj import CRS 3 | from geoparquet_pydantic.schemas import ( 4 | GeometryColumnMetadata, 5 | GeoParquetMetadata, 6 | ) 7 | 8 | 9 | @pytest.fixture 10 | def good_geo_column_metadata(): 11 | return { 12 | "encoding": "WKB", 13 | "geometry_types": ["Point"], 14 | "crs": "OGC:CRS84", 15 | "edges": "planar", 16 | "bbox": [0, 0, 25, 25], 17 | "epoch": None, 18 | "orientation": "counterclockwise", 19 | } 20 | 21 | 22 | def test_good_geo_column_metadata(good_geo_column_metadata): 23 | metadata = GeometryColumnMetadata(**good_geo_column_metadata) 24 | assert metadata.encoding == good_geo_column_metadata["encoding"] 25 | assert metadata.geometry_types == good_geo_column_metadata["geometry_types"] 26 | assert metadata.crs != good_geo_column_metadata["crs"] 27 | assert CRS.from_json(metadata.crs).to_string() == good_geo_column_metadata["crs"] 28 | assert metadata.edges == good_geo_column_metadata["edges"] 29 | assert metadata.bbox == good_geo_column_metadata["bbox"] 30 | assert metadata.epoch == None 31 | assert metadata.orientation == good_geo_column_metadata["orientation"] 32 | 33 | 34 | def 
test_bad_geo_column_metadata(good_geo_column_metadata): 35 | """Test that the GeoColumnMetadata raises an error when given bad data.""" 36 | 37 | # Test bad encoding 38 | bad_encoding = good_geo_column_metadata.copy() 39 | bad_encoding["encoding"] = "WKT" 40 | with pytest.raises(ValueError): 41 | GeometryColumnMetadata(**bad_encoding) 42 | 43 | # Test bad geometry types 44 | bad_geometry_types = good_geo_column_metadata.copy() 45 | bad_geometry_types["geometry_types"] = ["NOT_A_REAL_TIME"] 46 | with pytest.raises(ValueError): 47 | GeometryColumnMetadata(**bad_geometry_types) 48 | 49 | # Test bad CRS 50 | bad_crs = good_geo_column_metadata.copy() 51 | bad_crs["crs"] = "NOT_A_REAL_CRS" 52 | with pytest.raises(ValueError): 53 | GeometryColumnMetadata(**bad_crs) 54 | 55 | # Test bad edges 56 | bad_edges = good_geo_column_metadata.copy() 57 | bad_edges["edges"] = "NOT_A_REAL_EDGE" 58 | with pytest.raises(ValueError): 59 | GeometryColumnMetadata(**bad_edges) 60 | 61 | # Test bad bbox 62 | bad_bbox = good_geo_column_metadata.copy() 63 | bad_bbox["bbox"] = [0, 0, 25] 64 | with pytest.raises(ValueError): 65 | GeometryColumnMetadata(**bad_bbox) 66 | 67 | # Test bad epoch 68 | bad_epoch = good_geo_column_metadata.copy() 69 | bad_epoch["epoch"] = "NOT_A_REAL_EPOCH" 70 | with pytest.raises(ValueError): 71 | GeometryColumnMetadata(**bad_epoch) 72 | 73 | # Test bad orientation 74 | bad_orientation = good_geo_column_metadata.copy() 75 | bad_orientation["orientation"] = "NOT_A_REAL_ORIENTATION" 76 | with pytest.raises(ValueError): 77 | GeometryColumnMetadata(**bad_orientation) 78 | 79 | 80 | def test_good_geoparquet(good_geo_column_metadata): 81 | 82 | # minimum inputs 83 | geo_parquet = GeoParquetMetadata( 84 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 85 | ) 86 | assert geo_parquet.version == "1.1.0-dev" 87 | assert geo_parquet.primary_column == "geometry" 88 | assert isinstance(geo_parquet.columns, dict) 89 | assert len(geo_parquet.columns) == 1 
90 | assert "geometry" in geo_parquet.columns 91 | assert isinstance(geo_parquet.columns["geometry"], GeometryColumnMetadata) 92 | 93 | # maximum inputs 94 | geo_parquet = GeoParquetMetadata( 95 | version="1.0.0", 96 | primary_column="geom", 97 | columns={"geom": GeometryColumnMetadata(**good_geo_column_metadata)}, 98 | ) 99 | assert geo_parquet.version == "1.0.0" 100 | assert geo_parquet.primary_column == "geom" 101 | assert isinstance(geo_parquet.columns, dict) 102 | assert len(geo_parquet.columns) == 1 103 | assert "geom" in geo_parquet.columns 104 | assert isinstance(geo_parquet.columns["geom"], GeometryColumnMetadata) 105 | 106 | 107 | def test_bad_geoparquet(good_geo_column_metadata): 108 | 109 | # Test bad version 110 | with pytest.raises(ValueError): 111 | GeoParquetMetadata( 112 | version=1.431243, 113 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 114 | ) 115 | 116 | # Test bad primary_column 117 | with pytest.raises(ValueError): 118 | GeoParquetMetadata( 119 | primary_column=1.431243, 120 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 121 | ) 122 | 123 | # Test bad columns 124 | with pytest.raises(ValueError): 125 | GeoParquetMetadata( 126 | columns={"geometry": "NOT_A_REAL_METADATA"}, 127 | ) 128 | with pytest.raises(ValueError): 129 | GeoParquetMetadata( 130 | columns="NOT_EVEN_A_DICT", 131 | ) 132 | with pytest.raises(ValueError): 133 | GeoParquetMetadata( 134 | columns={"geometry": {"A_DICT": "BUT_NOT_VALID"}}, 135 | ) 136 | 137 | # Test missing primary_column 138 | with pytest.raises(ValueError): 139 | GeoParquetMetadata( 140 | primary_column="NOT_A_REAL_COLUMN", 141 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)}, 142 | ) 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GeoParquet-Pydantic 2 | 3 |
4 |
5 |
7 | A lightweight, pydantic centric library for validating GeoParquet files (or PyArrow Tables) and converting between GeoJSON and GeoParquet...without GDAL! 8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |