├── .github
│   └── workflows
│       ├── pre-commit.yml
│       ├── publish_package.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── dev-requirements.txt
├── imgs
│   └── repo_logo.png
├── pypi_README.md
├── pyproject.toml
├── requirements.txt
├── src
│   └── geoparquet_pydantic
│       ├── __init__.py
│       ├── convert.py
│       ├── schemas.py
│       └── validate.py
└── tests
    ├── conftest.py
    ├── test_conversions.py
    ├── test_data
    │   ├── valid.geojson
    │   └── valid_geojson.parquet
    ├── test_schemas.py
    └── test_validation.py
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: pre-commit
2 |
3 | on:
4 | pull_request:
5 |
6 | jobs:
7 | pre-commit:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v3
11 | - uses: actions/setup-python@v3
12 | - uses: pre-commit/action@v3.0.1
13 |
--------------------------------------------------------------------------------
/.github/workflows/publish_package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | name: Publish Release to PyPI
5 |
6 | on:
7 | release:
8 | types: [published]
9 |
10 | permissions:
11 | contents: read
12 |
13 | jobs:
14 | deploy:
15 |
16 | runs-on: ubuntu-latest
17 |
18 | steps:
19 | - uses: actions/checkout@v3
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v3
23 | with:
24 | python-version: '3.12'
25 |
26 | - name: Install publishing dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install --upgrade build
30 |
31 | - name: Build package
32 | run: python -m build --sdist --wheel . --outdir dist
33 |
34 |       - name: Publish package to PyPI
35 | uses: pypa/gh-action-pypi-publish@release/v1
36 | with:
37 | password: ${{ secrets.PYPI_PASSWORD }}
38 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | run:
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | matrix:
16 | python-version: ["3.11", "3.12"]
17 | os: [ubuntu-latest]
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 |
22 | - name: Install Python
23 | uses: actions/setup-python@v5
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 |
27 | - name: Install dependencies
28 | run: |
29 | python --version
30 | pip install -U pip
31 | pip install -r requirements.txt
32 | pip install -r dev-requirements.txt
33 | pip list
34 |
35 | - name: Install the library
36 | run: |
37 | pip install -e .
38 | pip list
39 |
40 | - name: Run tests
41 | run: pytest --cov=geoparquet_pydantic --cov-report=xml tests/
42 |
43 | - name: Upload coverage report to CodeCov
44 | uses: codecov/codecov-action@v3
45 | env:
46 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | # - repo: https://github.com/RobertCraigie/pyright-python
3 | # rev: v1.1.351
4 | # hooks:
5 | # - id: pyright
6 | # exclude: ^tests/.*
7 | # additional_dependencies: [pyarrow, shapely, geojson-pydantic]
8 | - repo: https://github.com/PyCQA/docformatter
9 | rev: v1.7.5
10 | hooks:
11 | - id: docformatter
12 | additional_dependencies: [tomli]
13 | args: [--black, --in-place]
14 |
15 | - repo: https://github.com/pre-commit/pre-commit-hooks
16 | rev: v4.4.0
17 | hooks:
18 | - id: trailing-whitespace
19 | - id: check-ast
20 | - id: check-case-conflict
21 | - id: debug-statements
22 | - id: end-of-file-fixer
23 | - id: check-docstring-first
24 | - id: check-added-large-files
25 |
26 | - repo: https://github.com/psf/black-pre-commit-mirror
27 | rev: 24.2.0
28 | hooks:
29 | - id: black
30 | language_version: python3.12
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Xavier Nogueira
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GeoParquet-Pydantic
2 |
7 | A lightweight, `pydantic`-centric library for validating GeoParquet files (or PyArrow tables) and converting between GeoJSON and GeoParquet... without GDAL!
8 |
27 | ---
28 | **Motivation:** This project started at the 2024 San Francisco GeoParquet Community hackathon, and it arose from a simple observation:
29 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions?
30 |
31 | **Is this library the right choice for you?**
32 | * Do you need a wide variety of geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyway,
33 | making this library's conversion functions *probably* redundant.
34 | * Is your workflow command-line centric? If so, you may want to consider Planet Labs' similar CLI tool [`gpq`](https://github.com/planetlabs/gpq),
35 | which is written in Go and substantially faster.
36 | * Otherwise, if you are using Python and want to avoid unnecessarily bulky dependencies, this library will be a great choice!
37 |
38 | **Note:** All user-exposed functions and schema classes are available at the top level of this library (e.g., `geoparquet_pydantic.validate_geoparquet_table(...)`).
39 |
40 | # Features
41 |
42 | ## `pydantic` Schemas
43 |
44 | * [`GeometryColumnMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L40): A `pydantic` model that validates a
45 | geometry column's (aka `primary_column`) metadata. This is nested within the following schema.
46 | * [`GeoParquetMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L93): A `pydantic` model for the metadata assigned to the "geo" key in a `pyarrow.Table`
47 | that allows it to be read by GeoParquet readers once saved.
48 |
49 | For an explanation of these schemas, please reference the [geoparquet repository](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
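
For example, building the metadata by hand might look like this (a minimal sketch mirroring the library's own defaults):

```python
from geoparquet_pydantic import GeometryColumnMetadata, GeoParquetMetadata

# WKB is currently the only supported encoding; geometry_types must be unique
geo_metadata = GeoParquetMetadata(
    primary_column="geometry",
    columns={
        "geometry": GeometryColumnMetadata(
            encoding="WKB",
            geometry_types=["Point", "LineString"],
        ),
    },
)
```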
50 |
51 | ## Validation functions
52 |
53 | Convenience functions that use `GeoParquetMetadata` to return a `bool` indicating whether the GeoParquet metadata obeys the [schema](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
54 |
55 | ### Validate a `pyarrow.Table`'s GeoParquet metadata:
56 | ```python
57 | def validate_geoparquet_table(
58 | table: pyarrow.Table,
59 | primary_column: Optional[str] = None,
60 | ) -> bool:
61 |     """Validates the GeoParquet metadata of a pyarrow.Table.
62 |
63 | Args:
64 | table (pyarrow.Table): The table to validate.
65 | primary_column (Optional[str], optional): The name of the primary geometry column.
66 | Defaults to None.
67 |
68 | Returns:
69 | bool: True if the metadata is valid, False otherwise.
70 | """
71 | ...
72 | ```
73 |
74 | ### Validate a Parquet file's GeoParquet metadata:
75 |
76 | ```python
77 | def validate_geoparquet_file(
78 | geoparquet_file: str | Path | pyarrow.parquet.ParquetFile,
79 | primary_column: Optional[str] = None,
80 | read_file_kwargs: Optional[dict] = None,
81 | ) -> bool:
82 |     """Validates that a Parquet file has correct GeoParquet metadata without reading the full file into memory.
83 |
84 | Args:
85 | geoparquet_file (str | Path | ParquetFile): The file to validate.
86 | primary_column (str, optional): The primary column name. Defaults to 'geometry'.
87 | read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile().
88 | See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile
89 |
90 | Returns:
91 | bool: True if the metadata is valid, False otherwise.
92 | """
93 | ...
94 | ```
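
For example, checking both a file and an in-memory table might look like this (a minimal sketch; `my_data.parquet` is a placeholder path):

```python
import pyarrow.parquet

from geoparquet_pydantic import (
    validate_geoparquet_file,
    validate_geoparquet_table,
)

# validate straight from disk (only the file's metadata is inspected)
assert validate_geoparquet_file("my_data.parquet")

# or validate a table that is already in memory
table = pyarrow.parquet.read_table("my_data.parquet")
assert validate_geoparquet_table(table)
```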
95 |
96 | ## Conversion functions
97 |
98 | ### Convert from `geojson_pydantic.FeatureCollection` to a GeoParquet `pyarrow.Table`
99 |
100 | ```python
101 | def geojson_to_geoparquet(
102 | geojson: FeatureCollection | Path,
103 | primary_column: Optional[str] = None,
104 | column_schema: Optional[pyarrow.Schema] = None,
105 | add_none_values: Optional[bool] = False,
106 | geo_metadata: GeoParquetMetadata | dict | None = None,
107 | **kwargs,
108 | ) -> pyarrow.Table:
109 | """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet
110 | metadata.
111 |
112 | To save to a file, simply use pyarrow.parquet.write_table() on the returned table.
113 |
114 | Args:
115 | geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection.
116 | primary_column (str, optional): The name of the primary column. Defaults to None.
117 | column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None.
118 | add_none_values (bool, default=False): Whether to fill missing column values
119 | specified in param:column_schema with 'None' (converts to pyarrow.null()).
120 |         geo_metadata (GeoParquetMetadata | dict | None, optional): The GeoParquet metadata.
121 | **kwargs: Additional keyword arguments for the Arrow table writer.
122 |
123 | Returns:
124 | The Arrow table with GeoParquet metadata.
125 | """
126 | ...
127 | ```
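
For example, converting a GeoJSON file and saving the result to disk might look like this (a minimal sketch; `my_data.geojson` is a placeholder path):

```python
import pyarrow.parquet
from pathlib import Path

from geoparquet_pydantic import geojson_to_geoparquet

# build a pyarrow.Table with "geo" metadata from a GeoJSON file
table = geojson_to_geoparquet(Path("my_data.geojson"))

# persist it as a GeoParquet file
pyarrow.parquet.write_table(table, "my_data.parquet")
```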
128 |
129 | ### Convert from a GeoParquet `pyarrow.Table` or file to a `geojson_pydantic.FeatureCollection`
130 |
131 | ```python
132 | def geoparquet_to_geojson(
133 | geoparquet: pyarrow.Table | str | Path,
134 | primary_column: Optional[str] = None,
135 | max_chunksize: Optional[int] = None,
136 | max_workers: Optional[int] = None,
137 | ) -> FeatureCollection:
138 | """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic
139 | FeatureCollection.
140 |
141 | Args:
142 |         geoparquet (pyarrow.Table | str | Path): Either a pyarrow.Table or a Parquet file path with GeoParquet metadata.
143 | primary_column (str, optional): The name of the primary column. Defaults to 'geometry'.
144 | max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000.
145 | max_workers (int, optional): The maximum number of workers to use for parallel processing.
146 | Defaults to 0 (runs sequentially). Use -1 for all available cores.
147 |
148 | Returns:
149 | FeatureCollection: The GeoJSON Pydantic FeatureCollection.
150 | """
151 | ...
152 | ```
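
And going the other direction (again a minimal sketch with a placeholder path):

```python
from geoparquet_pydantic import geoparquet_to_geojson

# accepts a pyarrow.Table or a path to a GeoParquet file
feature_collection = geoparquet_to_geojson("my_data.parquet")
print(len(feature_collection.features))
```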
153 |
154 | # Getting Started
155 |
156 | Install from [PyPI](https://pypi.org/project/geoparquet-pydantic):
157 | ```bash
158 | pip install geoparquet-pydantic
159 | ```
160 |
161 | Or from source:
162 | ```bash
163 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git
164 | $ cd geoparquet-pydantic
165 | $ pip install .
166 | ```
167 |
168 | Then import the package (note the underscore in the module name):
169 | ```python
170 | import geoparquet_pydantic
171 | ```
172 |
173 | Or just import the functions/classes you need from the top-level:
174 | ```python
175 | from geoparquet_pydantic import (
176 | GeometryColumnMetadata,
177 | GeoParquetMetadata,
178 | validate_geoparquet_table,
179 | validate_geoparquet_file,
180 | geojson_to_geoparquet,
181 | geoparquet_to_geojson,
182 | )
183 | ```
184 |
185 | # Roadmap
186 |
187 | - [ ] Make CLI file<>file functions w/ `click`.
188 | - [ ] Add parallelized Parquet reads for `geoparquet_pydantic.geoparquet_to_geojson()`.
189 |
190 | # Contribute
191 |
192 | We encourage contributions, feature requests, and bug reports!
193 |
194 | Here is our recommended workflow:
195 |
196 | * Use `dev-requirements.txt` to install our development dependencies.
197 | * Make your edits using `pyright` as a static type checker.
198 | * Run `pre-commit run --all-files` before committing your work.
199 | * If you add a new feature, we request that you add test coverage for it.
200 |
201 | Happy coding!
202 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | wheel
3 | pytest
4 | pytest-cov
5 | black
6 | pyright
7 | pre-commit
8 | geopandas
9 |
--------------------------------------------------------------------------------
/imgs/repo_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/imgs/repo_logo.png
--------------------------------------------------------------------------------
/pypi_README.md:
--------------------------------------------------------------------------------
1 | # GeoParquet-Pydantic
2 |
3 | **Motivation:** This project started at the 2024 San Francisco GeoParquet Community hackathon, and it arose from a simple observation:
4 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions?
5 |
6 | **Is this library the right choice for you?**
7 | * Do you need a wide variety of geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyway,
8 | making this library's conversion functions *probably* redundant.
9 | * Is your workflow command-line centric? If so, you may want to consider Planet Labs' similar CLI tool [`gpq`](https://github.com/planetlabs/gpq),
10 | which is written in Go and substantially faster.
11 | * Otherwise, if you are using Python and want to avoid unnecessarily bulky dependencies, this library will be a great choice!
12 |
13 | **Note:** All user-exposed functions and schema classes are available at the top level of this library (e.g., `geoparquet_pydantic.validate_geoparquet_table(...)`).
14 |
15 | # Documentation is on GitHub [here](https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/README.md)
16 |
17 | # Getting Started
18 |
19 | Install from [PyPI](https://pypi.org/project/geoparquet-pydantic):
20 | ```bash
21 | pip install geoparquet-pydantic
22 | ```
23 |
24 | Or from source:
25 | ```bash
26 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git
27 | $ cd geoparquet-pydantic
28 | $ pip install .
29 | ```
30 |
31 | Then import the package (note the underscore in the module name):
32 | ```python
33 | import geoparquet_pydantic
34 | ```
35 |
36 | Or just import the functions/classes you need from the top-level:
37 | ```python
38 | from geoparquet_pydantic import GeometryColumnMetadata
39 | from geoparquet_pydantic import GeoParquetMetadata
40 | from geoparquet_pydantic import validate_geoparquet_table
41 | from geoparquet_pydantic import validate_geoparquet_file
42 | from geoparquet_pydantic import geojson_to_geoparquet
43 | from geoparquet_pydantic import geoparquet_to_geojson
44 | ```
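
For example, a minimal GeoJSON<>GeoParquet round trip (a sketch; the file paths are placeholders):

```python
import pyarrow.parquet
from pathlib import Path

from geoparquet_pydantic import (
    geojson_to_geoparquet,
    geoparquet_to_geojson,
    validate_geoparquet_table,
)

# GeoJSON file -> pyarrow.Table with "geo" metadata
table = geojson_to_geoparquet(Path("my_data.geojson"))
assert validate_geoparquet_table(table)
pyarrow.parquet.write_table(table, "my_data.parquet")

# and back again
feature_collection = geoparquet_to_geojson("my_data.parquet")
```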
45 |
46 | # Roadmap
47 |
48 | - [ ] Make CLI file<>file functions w/ `click`.
49 | - [ ] Add parallelized Parquet reads for `geoparquet_pydantic.geoparquet_to_geojson()`.
50 |
51 | # Contribute
52 |
53 | We encourage contributions, feature requests, and bug reports!
54 |
55 | Here is our recommended workflow:
56 |
57 | * Use `dev-requirements.txt` to install our development dependencies.
58 | * Make your edits using `pyright` as a static type checker.
59 | * Run `pre-commit run --all-files` before committing your work.
60 | * If you add a new feature, we request that you add test coverage for it.
61 |
62 | Happy coding!
63 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools",
4 | "setuptools-scm",
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [project]
9 | name = "geoparquet_pydantic"
10 | description = "Read/write GeoParquet with the geojson-pydantic models."
11 | requires-python = ">=3.11"
12 | keywords = [
13 | "geoparquet",
14 | "pydantic",
15 | "geospatial",
16 | ]
17 | license = {text = "MIT"}
18 | classifiers = [
19 | "Programming Language :: Python :: 3",
20 | ]
21 | dependencies = [
22 | "geojson-pydantic",
23 | "pyarrow",
24 | "shapely",
25 | "pyproj",
26 | "click",
27 | ]
28 | dynamic = [
29 | "version",
30 | "readme",
31 | ]
32 |
33 | [tool.setuptools.packages.find]
34 | where = ["src"]
35 | include = ["geoparquet_pydantic*"]
36 |
37 | [tool.setuptools.dynamic]
38 | version = {attr = "geoparquet_pydantic.__version__"}
39 | readme = {file = "pypi_README.md"}
40 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | geojson-pydantic
2 | pyarrow
3 | shapely
4 | pyproj
5 | click
6 |
--------------------------------------------------------------------------------
/src/geoparquet_pydantic/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
2 | from .schemas import (
3 | GeometryColumnMetadata,
4 | GeoParquetMetadata,
5 | )
6 | from .convert import (
7 | geojson_to_geoparquet,
8 | geoparquet_to_geojson,
9 | )
10 | from .validate import (
11 | validate_geoparquet_table,
12 | validate_geoparquet_file,
13 | )
14 |
--------------------------------------------------------------------------------
/src/geoparquet_pydantic/convert.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import warnings
3 | import json
4 | import geojson_pydantic
5 | from geojson_pydantic.types import BBox
6 | import shapely.wkb
7 | import shapely.wkt
8 | import pyarrow
9 | import pyarrow.parquet
10 | from geojson_pydantic.geometries import (
11 | _GeometryBase,
12 | )
13 | from geojson_pydantic.features import (
14 | Feature,
15 | FeatureCollection,
16 | )
17 | from geoparquet_pydantic.schemas import (
18 | GeometryColumnMetadata,
19 | GeoParquetMetadata,
20 | GeometryTypes,
21 | )
22 | from pathlib import Path
23 | from typing import Any, Optional, Iterable
24 |
25 |
26 | def _to_wkb(geometry: _GeometryBase) -> bytes:
27 | """Converts the GeoJSON object to WKB format."""
28 | return shapely.wkb.dumps(shapely.wkt.loads(geometry.wkt))
29 |
30 |
31 | def _get_geom_types(features: list[Feature]) -> list[str]:
32 | return list(set([feature.geometry.type for feature in features]))
33 |
34 |
35 | def _get_default_geo_metadata(
36 | feature_collection: FeatureCollection,
37 | ) -> GeoParquetMetadata:
38 | return GeoParquetMetadata(
39 | primary_column="geometry",
40 | columns={
41 | "geometry": GeometryColumnMetadata(
42 | **{
43 | "encoding": "WKB",
44 | "geometry_types": _get_geom_types(feature_collection.features),
45 | }
46 | ),
47 | },
48 | )
49 |
50 |
51 | def _update_metadata(table: pyarrow.Table, metadata: dict) -> pyarrow.Table:
52 | new_metadata = table.schema.metadata
53 | if not new_metadata:
54 | new_metadata = {}
55 | for k, v in metadata.items():
56 | new_metadata[k] = json.dumps(v).encode("utf-8")
57 | return table.replace_schema_metadata(new_metadata)
58 |
59 |
60 | def _validate_column_schema(
61 | column_schema: pyarrow.Schema,
62 | primary_column: str,
63 | geojson: FeatureCollection,
64 | add_none_values: bool,
65 | ) -> None:
66 | names = [i for i in column_schema.names if i != primary_column]
67 | for feature in geojson.features:
68 | if not add_none_values:
69 | all_present = all([name in feature.properties.keys() for name in names])
70 | if not all_present:
71 | raise ValueError(
72 | f"Feature {feature} does not contain all the columns in the schema: {column_schema.names}",
73 | )
74 |
75 | else:
76 | for name in names:
77 | if not feature.properties.get(name):
78 | feature.properties[name] = None
79 |
80 |
81 | def geojson_to_geoparquet(
82 | geojson: FeatureCollection | Path,
83 | primary_column: Optional[str] = None,
84 | column_schema: Optional[pyarrow.Schema] = None,
85 | add_none_values: Optional[bool] = False,
86 | geo_metadata: GeoParquetMetadata | dict | None = None,
87 | **kwargs,
88 | ) -> pyarrow.Table:
89 | """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet
90 | metadata.
91 |
92 | To save to a file, simply use pyarrow.parquet.write_table() on the returned table.
93 |
94 | Args:
95 | geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection.
96 | primary_column (str, optional): The name of the primary column. Defaults to None.
97 | column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None.
98 | add_none_values (bool, default=False): Whether to fill missing column values
99 | specified in param:column_schema with 'None' (converts to pyarrow.null()).
100 |         geo_metadata (GeoParquetMetadata | dict | None, optional): The GeoParquet metadata.
101 | **kwargs: Additional keyword arguments for the Arrow table writer.
102 |
103 | Returns:
104 | The Arrow table with GeoParquet metadata.
105 | """
106 | if not isinstance(geojson, FeatureCollection):
107 | geojson = FeatureCollection(**json.load(geojson.open("r")))
108 | if not primary_column:
109 | primary_column = "geometry"
110 |
111 | # get primary column as iterables
112 | columns: list[Iterable] = [map(lambda f: _to_wkb(f.geometry), geojson.features)]
113 |
114 | # get geo metadata
115 | if not geo_metadata:
116 | geo_metadata = _get_default_geo_metadata(geojson)
117 | if isinstance(geo_metadata, dict):
118 | geo_metadata = GeoParquetMetadata(**geo_metadata)
119 | if not isinstance(geo_metadata, GeoParquetMetadata):
120 | raise ValueError("geo_metadata must be a valid GeoParquet class, dict, or None")
121 |
122 | # get other columns as iterables and update schema
123 | if not column_schema:
124 | column_schema = pyarrow.schema(
125 | [
126 | (primary_column, pyarrow.binary()),
127 | ("properties", pyarrow.string()),
128 | ]
129 | )
130 | elif isinstance(column_schema, pyarrow.Schema):
131 | if primary_column in column_schema.names:
132 | column_schema.remove(column_schema.get_field_index(primary_column))
133 | column_schema.insert(0, pyarrow.field(primary_column, pyarrow.binary()))
134 | else:
135 | raise ValueError("column_schema must be a valid pyarrow.Schema or None")
136 |
137 | if "properties" in column_schema.names:
138 | if len(column_schema.names) > 2:
139 | raise ValueError(
140 |                 "Cannot have 'properties' as a column alongside other columns (which are pulled from GeoJSON properties)."
141 | )
142 | columns.append(map(lambda f: json.dumps(f.properties), geojson.features))
143 |
144 | else:
145 | _validate_column_schema(column_schema, primary_column, geojson, add_none_values)
146 |
147 | for col in column_schema.names:
148 | columns.append(map(lambda f: f.properties.get(col), geojson.features))
149 |
150 | # write table
151 | table = pyarrow.Table.from_pydict(
152 | {**dict(zip(column_schema.names, columns))},
153 | schema=column_schema,
154 | **kwargs,
155 | )
156 | return _update_metadata(table, {"geo": geo_metadata.model_dump()})
157 |
158 |
159 | def _find_bbox(geoparquet: pyarrow.Table) -> BBox | None:
160 | if not geoparquet.schema.metadata:
161 | warnings.warn("No GeoParquet metadata found in the Arrow table.")
162 | return None
163 |     # the "geo" metadata was written with json.dumps, so decode it with json.loads
164 |     decoded_metadata: dict[str, Any] = json.loads(
165 |         geoparquet.schema.metadata[b"geo"].decode("utf-8"),
166 |     )
166 | bbox = decoded_metadata["columns"]["geometry"].get("bbox", None)
167 | if isinstance(bbox, list):
168 | bbox = tuple(bbox)
169 | return bbox
170 |
171 |
172 | def _get_prop_records(name_value_tuple: tuple[str, list[Any]]) -> list[tuple[str, Any]]:
173 | name, values = name_value_tuple
174 | return list(zip([name] * len(values), values))
175 |
176 |
177 | def _shapely_to_feature(
178 | geometry: shapely.geometry.base.BaseGeometry,
179 | properties: list[tuple[str, Any]],
180 | ) -> Feature:
181 | geom_class: type[GeometryTypes] = getattr(geojson_pydantic, type(geometry).__name__)
182 | return Feature(
183 | type="Feature",
184 | geometry=geom_class(**json.loads(shapely.to_geojson(geometry))),
185 | bbox=list(geometry.bounds),
186 | properties=dict([*properties]),
187 | )
188 |
189 |
190 | def geoparquet_to_geojson(
191 | geoparquet: pyarrow.Table | str | Path,
192 | primary_column: Optional[str] = None,
193 | max_chunksize: Optional[int] = None,
194 | max_workers: Optional[int] = None,
195 | ) -> FeatureCollection:
196 | """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic
197 | FeatureCollection.
198 |
199 | Args:
200 |         geoparquet (pyarrow.Table | str | Path): Either a pyarrow.Table or a Parquet file path with GeoParquet metadata.
201 | primary_column (str, optional): The name of the primary column. Defaults to 'geometry'.
202 | max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000.
203 | max_workers (int, optional): The maximum number of workers to use for parallel processing.
204 | Defaults to 0 (runs sequentially). Use -1 for all available cores.
205 | Returns:
206 | FeatureCollection: The GeoJSON Pydantic FeatureCollection.
207 | """
208 | if not primary_column:
209 | primary_column = "geometry"
210 | if not max_chunksize:
211 | max_chunksize = 1000
212 | if isinstance(geoparquet, (str, Path)):
213 | geoparquet = pyarrow.parquet.read_table(geoparquet)
214 | if not isinstance(geoparquet, pyarrow.Table):
215 | raise ValueError(
216 | "param:geoparquet must be a valid pyarrow.Table or parquet file"
217 | )
218 |
219 | if primary_column not in geoparquet.column_names:
220 | raise ValueError(f"Primary column {primary_column} not found in the table.")
221 |
222 | # attempt to get the bbox from metadata
223 | bbox: BBox | None = _find_bbox(geoparquet)
224 |
225 | # TODO: parallelize this (optionally)
226 | if max_workers:
227 | raise NotImplementedError("Parallel processing not yet implemented.")
228 |
229 | feature_lists: list[list[Feature]] = []
230 | for chunk in geoparquet.to_batches(max_chunksize):
231 | chunk_dict = chunk.to_pydict()
232 | geoms: list[bytes] = chunk_dict.pop(primary_column)
233 |         # materialize as a list so it can be re-iterated for each feature
234 |         # (a bare map() iterator would be exhausted after the first feature)
235 |         properties: list[list[tuple[str, Any]]] = [
236 |             _get_prop_records(item) for item in chunk_dict.items()
237 |         ]
238 |         feature_props: Iterable[list[tuple[str, Any]]] = map(
239 |             lambda i: [p[i] for p in properties],
240 |             range(len(geoms)),
241 |         )
241 | try:
242 | chunk_features: Iterable[Feature] = list(
243 | map(
244 | lambda gp: _shapely_to_feature(shapely.from_wkb(gp[0]), gp[1]),
245 | zip(geoms, feature_props),
246 | )
247 | )
248 | except shapely.errors.GEOSException as e:
249 | raise ValueError(
250 | f"Error converting WKB to shapely geometry. Make sure the WKB is valid! Exception: {e}"
251 | )
252 |
253 | feature_lists.append(chunk_features)
254 |     features: list[Feature] = list(functools.reduce(lambda a, b: a + b, feature_lists, []))
255 |
256 | return FeatureCollection(
257 | type="FeatureCollection",
258 | features=features,
259 | bbox=bbox,
260 | )
261 |
--------------------------------------------------------------------------------
/src/geoparquet_pydantic/schemas.py:
--------------------------------------------------------------------------------
1 | """Pydantic models for GeoParquet metadata."""
2 |
3 | import ast
4 | from pydantic import BeforeValidator, Field, BaseModel, field_validator, model_validator
5 | from typing import Annotated, Optional, Literal, Union
6 | from pyproj import CRS
7 |
8 | EdgeType = Literal["planar", "spherical"]
9 |
10 | FlatGeometryTypes = Annotated[
11 | # TODO: support 3d geometries with Z suffix
12 | Literal[
13 | "Point",
14 | "MultiPoint",
15 | "LineString",
16 | "MultiLineString",
17 | "Polygon",
18 | "MultiPolygon",
19 | "GeometryCollection",
20 | ],
21 | Field(description="The geometry types supported by the column"),
22 | ]
23 |
24 | ZGeometryTypes = Annotated[
25 | Literal[
26 | "PointZ",
27 | "MultiPointZ",
28 | "LineStringZ",
29 | "MultiLineStringZ",
30 | "PolygonZ",
31 | "MultiPolygonZ",
32 | "GeometryCollectionZ",
33 | ],
34 | Field(description="3D geometry types supported by the column"),
35 | ]
36 |
37 | GeometryTypes = Union[FlatGeometryTypes, ZGeometryTypes]
38 |
39 |
40 | class GeometryColumnMetadata(BaseModel):
41 | encoding: Literal["WKB"]
42 | geometry_types: list[GeometryTypes]
43 |
44 | crs: Annotated[
45 | str,
46 | Field(
47 |             description="The CRS of the geometry column in a string format readable by pyproj. It is then converted to PROJJSON format."
48 | ),
49 | ] = "OGC:CRS84"
50 |
51 | edges: Annotated[
52 | EdgeType, Field(description="The type of edges of the geometries")
53 | ] = "planar"
54 |
55 | bbox: Optional[
56 | Annotated[list[float], Field(description="The bounding box of the geometries")]
57 | ] = None
58 |
59 | epoch: Optional[
60 | Annotated[
61 | Union[int, float],
62 | Field(description="Coordinate epoch in case of a dynamic CRS"),
63 | ]
64 | ] = None
65 |
66 | orientation: Literal["counterclockwise"] = "counterclockwise"
67 |
68 | @field_validator("crs")
69 | @classmethod
70 | def convert_crs_to_projjson(cls, v) -> str:
71 | """Parse a CRS string and return a PROJJSON string."""
72 | try:
73 | crs = CRS.from_string(v)
74 | return crs.to_json()
75 | except Exception as e:
76 | raise ValueError(f"Invalid CRS string: {e}")
77 |
78 | @field_validator("geometry_types")
79 | @classmethod
80 | def only_unique_types(cls, v):
81 | if len(v) != len(set(v)):
82 | raise ValueError("geometry_types items must be unique!")
83 | return v
84 |
85 | @field_validator("bbox")
86 | @classmethod
87 | def must_be_length_4(cls, v):
88 | if v is not None and len(v) != 4:
89 | raise ValueError("bbox must be a list of 4 floats!")
90 | return v
91 |
92 |
93 | class GeoParquetMetadata(BaseModel):
94 | version: Annotated[
95 | str, Field(description="The version of the GeoParquet format")
96 | ] = "1.1.0-dev"
97 | primary_column: Annotated[
98 | str, Field(description="The name of the geometry primary column")
99 | ] = "geometry"
100 | columns: Annotated[
101 | dict[str, GeometryColumnMetadata | dict | str],
102 | Field(description="Metadata for each column (keys)"),
103 | ]
104 |
105 | @model_validator(mode="after")
106 | def contains_primary_col(self) -> "GeoParquetMetadata":
107 |         if self.primary_column not in self.columns:
108 | raise ValueError(
109 | f"primary column={self.primary_column} not in arg:columns={self.columns}"
110 | )
111 | return self
112 |
113 | @model_validator(mode="after")
114 | def convert_geo_to_class(self) -> "GeoParquetMetadata":
115 | if not isinstance(self.columns[self.primary_column], GeometryColumnMetadata):
116 | if isinstance(self.columns[self.primary_column], str):
117 | self.columns[self.primary_column] = ast.literal_eval(
118 | self.columns[self.primary_column]
119 | )
120 | if isinstance(self.columns[self.primary_column], dict):
121 | self.columns[self.primary_column] = GeometryColumnMetadata(
122 | **self.columns[self.primary_column]
123 | )
124 | else:
125 | raise ValueError(
126 | f"Invalid primary column metadata: {self.columns[self.primary_column]}"
127 | )
128 | return self
129 |
--------------------------------------------------------------------------------
/src/geoparquet_pydantic/validate.py:
--------------------------------------------------------------------------------
1 | """For validating an existing GeoParquet file or Arrow table.
2 |
3 | Note that validating GeoParquet metadata can be handled with the
4 | `.schemas` module's pydantic classes.
5 | """
6 |
7 | import json
8 | import pyarrow
9 | import pyarrow.parquet
9 | from geoparquet_pydantic.schemas import (
10 | GeoParquetMetadata,
11 | )
12 | from typing import Optional
13 | from pathlib import Path
14 |
15 |
16 | def _validate_geo_metadata(metadata: dict[bytes, bytes]) -> bool:
17 | try:
18 |         # the "geo" value is JSON-encoded bytes, so decode it with json.loads
19 |         geo_metadata = json.loads(metadata[b"geo"].decode("utf-8"))
19 | GeoParquetMetadata(**geo_metadata)
20 | print("Valid GeoParquet metadata!")
21 | return True
22 | except KeyError as e:
23 | print(f"Invalid GeoParquet metadata, could not find b'geo' key: {e}")
24 | except ValueError as e:
25 | print(f"Invalid GeoParquet metadata: {e}")
26 | return False
27 |
28 |
29 | def validate_geoparquet_table(
30 | table: pyarrow.Table,
31 | primary_column: Optional[str] = None,
32 | ) -> bool:
33 |     """Validates the GeoParquet metadata of a pyarrow.Table.
34 |
35 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md
36 |
37 | Args:
38 | table (pyarrow.Table): The table to validate.
39 | primary_column (Optional[str], optional): The name of the primary geometry column.
40 | Defaults to None.
41 |
42 | Returns:
43 | bool: True if the metadata is valid, False otherwise.
44 | """
45 | if not primary_column:
46 | primary_column = "geometry"
47 | return _validate_geo_metadata(table.schema.metadata)
48 |
49 |
50 | def validate_geoparquet_file(
51 | geoparquet_file: str | Path | pyarrow.parquet.ParquetFile,
52 | primary_column: Optional[str] = None,
53 | read_file_kwargs: Optional[dict] = None,
54 | ) -> bool:
55 |     """Validates that a Parquet file has correct GeoParquet metadata without reading the full file into memory.
56 |
57 | See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md
58 |
59 | Args:
60 | geoparquet_file (str | Path | ParquetFile): The file to validate.
61 | primary_column (str, optional): The primary column name. Defaults to 'geometry'.
62 | read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile().
63 | See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile
64 |
65 | Returns:
66 | bool: True if the metadata is valid, False otherwise.
67 | """
68 | if not primary_column:
69 | primary_column = "geometry"
70 | default_read_file_kwargs = {
71 | "memory_map": True,
72 | }
73 | if read_file_kwargs is None:
74 | read_file_kwargs = default_read_file_kwargs
75 | elif isinstance(read_file_kwargs, dict):
76 | for k, v in default_read_file_kwargs.items():
77 | if k not in read_file_kwargs:
78 | read_file_kwargs[k] = v
79 | else:
80 |         raise TypeError("Optional param:read_file_kwargs must be a dict or None!")
81 |
82 | if isinstance(geoparquet_file, (str, Path)):
83 | geoparquet_file = pyarrow.parquet.ParquetFile(
84 | geoparquet_file,
85 | **read_file_kwargs,
86 | )
87 | if not isinstance(geoparquet_file, pyarrow.parquet.ParquetFile):
88 | raise TypeError(
89 | "Input must be a file path (str | Path) or a ParquetFile object!"
90 | )
91 | return _validate_geo_metadata(geoparquet_file.schema_arrow.metadata)
92 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import json
3 | import pyarrow
4 | import pyarrow.parquet
4 | from pathlib import Path
5 | from geojson_pydantic.features import FeatureCollection
6 |
7 | # get the path to the data directory
8 | TEST_DATA_DIR = Path(__file__).parent / "test_data"
9 |
10 |
11 | @pytest.fixture
12 | def valid_geojson_file() -> Path:
13 | valid_geojson = TEST_DATA_DIR / "valid.geojson"
14 | assert valid_geojson.exists()
15 | return valid_geojson
16 |
17 |
18 | @pytest.fixture
19 | def valid_geojson_obj(valid_geojson_file) -> FeatureCollection:
20 | return FeatureCollection(**json.load(open(valid_geojson_file, "r")))
21 |
22 |
23 | @pytest.fixture
24 | def valid_geoparquet_file() -> Path:
25 | valid_geoparquet = TEST_DATA_DIR / "valid_geojson.parquet"
26 | assert valid_geoparquet.exists()
27 | return valid_geoparquet
28 |
29 |
30 | @pytest.fixture
31 | def valid_geoparquet_table(valid_geoparquet_file) -> pyarrow.Table:
32 | return pyarrow.parquet.read_table(valid_geoparquet_file)
33 |
--------------------------------------------------------------------------------
/tests/test_conversions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import json
3 | import pyarrow
4 | from pathlib import Path
5 | import geojson_pydantic
6 | import geopandas as gpd
7 | import pyarrow.parquet
8 | from geojson_pydantic.features import FeatureCollection
9 |
10 | from geoparquet_pydantic.schemas import (
11 | GeoParquetMetadata,
12 | )
13 | from geoparquet_pydantic.convert import (
14 | _to_wkb,
15 | _get_geom_types,
16 | _get_default_geo_metadata,
17 | _update_metadata,
18 | _validate_column_schema,
19 | geojson_to_geoparquet,
20 | geoparquet_to_geojson,
21 | )
22 | import shapely
23 |
24 | from geoparquet_pydantic.validate import validate_geoparquet_file
25 |
26 |
27 | @pytest.fixture
28 | def geometry_type_examples(
29 | valid_geojson_obj: FeatureCollection,
30 | ) -> dict[str, geojson_pydantic.geometries._GeometryBase]:
31 | geometry_types = {}
32 | for feature in valid_geojson_obj.features:
33 | if feature.geometry.type not in geometry_types:
34 | geometry_types[feature.geometry.type] = feature.geometry
35 | assert len(geometry_types) == 7
36 | for k, v in geometry_types.items():
37 | assert isinstance(v, getattr(geojson_pydantic.geometries, k))
38 |
39 | return geometry_types
40 |
41 |
42 | @pytest.fixture
43 | def mock_table() -> pyarrow.Table:
44 | table_dict = {
45 | "col1": [1, 2, 3],
46 | "col2": [4, 5, 6],
47 | }
48 | metadata = {b"key": b"value"}
49 | table = pyarrow.Table.from_pydict(table_dict, metadata=metadata)
50 | assert isinstance(table, pyarrow.Table)
51 | assert table.schema.metadata == metadata
52 | return table
53 |
54 |
55 | def test_to_wkb(
56 | geometry_type_examples: dict[str, geojson_pydantic.geometries._GeometryBase]
57 | ):
58 | """Test the conversion of a GeoJSON object to WKB format."""
59 | for k, v in geometry_type_examples.items():
60 | wkb = _to_wkb(v)
61 | assert isinstance(wkb, bytes)
62 | assert len(wkb) > 0
63 | back_in = shapely.wkb.loads(wkb)
64 | assert isinstance(back_in, getattr(shapely.geometry, k))
65 |
66 |
67 | def test_get_geom_types(
68 | valid_geojson_obj: FeatureCollection,
69 | ):
70 | """Test the extraction of unique geometry types from a GeoJSON object."""
71 | geom_types = _get_geom_types(valid_geojson_obj.features)
72 | assert isinstance(geom_types, list)
73 | assert len(geom_types) == 7
74 | assert set(geom_types) == {
75 | "Point",
76 | "MultiPoint",
77 | "LineString",
78 | "MultiLineString",
79 | "Polygon",
80 | "MultiPolygon",
81 | "GeometryCollection",
82 | }
83 |
84 |
85 | def test_get_default_geo_metadata(
86 | valid_geojson_obj: FeatureCollection,
87 | ):
88 | default_metadata = _get_default_geo_metadata(valid_geojson_obj)
89 | assert isinstance(default_metadata, GeoParquetMetadata)
90 | assert default_metadata.columns["geometry"].geometry_types == _get_geom_types(
91 | valid_geojson_obj.features
92 | )
93 |
94 |
95 | def test_update_metadata(
96 | mock_table: pyarrow.Table,
97 | ):
98 | new_metadata = {"new_key": "new_value"}
99 | new_table = _update_metadata(mock_table, new_metadata)
100 | assert isinstance(new_table, pyarrow.Table)
101 | assert b"new_key" in new_table.schema.metadata
102 | assert b"key" in new_table.schema.metadata
103 |
104 |
105 | def test_validate_column_schema(
106 | valid_geojson_obj: FeatureCollection,
107 | ):
108 | # make updated FeatureCollection properties
109 | for i, feature in enumerate(valid_geojson_obj.features):
110 | feature.properties["number"] = i
111 | mock_schema = pyarrow.schema(
112 | [
113 | ("geometry", pyarrow.binary()),
114 | ("name", pyarrow.string()),
115 | ("number", pyarrow.int64()),
116 | ]
117 | )
118 | # test with valid schema
119 | _validate_column_schema(
120 | mock_schema,
121 | primary_column="geometry",
122 | geojson=valid_geojson_obj,
123 | add_none_values=False,
124 | )
125 | _validate_column_schema(
126 | mock_schema,
127 | primary_column="geometry",
128 | geojson=valid_geojson_obj,
129 | add_none_values=True,
130 | )
131 |
132 | # test with invalid schema
133 | for i, feature in enumerate(valid_geojson_obj.features):
134 | if i % 2 == 0:
135 | feature.properties = {}
136 | assert not feature.properties
137 | with pytest.raises(ValueError):
138 | _validate_column_schema(
139 | mock_schema,
140 | "geometry",
141 | valid_geojson_obj,
142 | False,
143 | )
144 |
145 | # now test that it can add Nones
146 | _validate_column_schema(
147 | mock_schema,
148 | "geometry",
149 | valid_geojson_obj,
150 | True,
151 | )
152 |
153 |
154 | def test_geojson_to_geoparquet(
155 | valid_geojson_obj: FeatureCollection,
156 | ):
157 | """Test the conversion of a valid GeoJSON file and pydantic object to a valid
158 | GeoParquet table."""
159 |
160 | # convert the GeoJSON object to a GeoParquet table with minimal optional
161 | table = geojson_to_geoparquet(valid_geojson_obj)
162 | assert isinstance(table, pyarrow.Table)
163 | table.validate(full=True)
164 | table_dict = table.to_pydict()
165 | assert "geometry" in table_dict
166 | assert len(table_dict["geometry"]) == len(valid_geojson_obj.features)
167 | assert "properties" in table_dict
168 | assert (
169 | json.loads(table_dict["properties"][0])
170 | == valid_geojson_obj.features[0].properties
171 | )
172 |
173 | parquet_path = Path("test.parquet")
174 | pyarrow.parquet.write_table(table, parquet_path)
175 | assert parquet_path.exists()
176 | gdf = gpd.read_parquet(parquet_path)
177 | assert isinstance(gdf, gpd.GeoDataFrame)
178 | assert gdf.crs.to_string() == "OGC:CRS84"
179 | assert len(gdf) == len(valid_geojson_obj.features)
180 | parquet_path.unlink()
181 |
182 | # try again with more arguments
183 | metadata = _get_default_geo_metadata(valid_geojson_obj)
184 | table = geojson_to_geoparquet(
185 | valid_geojson_obj,
186 | geo_metadata=metadata,
187 | column_schema=pyarrow.schema(
188 | [
189 | ("geometry", pyarrow.binary()),
190 | ("name", pyarrow.string()),
191 | ]
192 | ),
193 | )
194 | assert isinstance(table, pyarrow.Table)
195 | table.validate(full=True)
196 | table_dict = table.to_pydict()
197 | assert "geometry" in table_dict
198 | assert len(table_dict["geometry"]) == len(valid_geojson_obj.features)
199 | assert "name" in table_dict
200 |
201 | parquet_path = Path("test.parquet")
202 | pyarrow.parquet.write_table(table, parquet_path)
203 | assert parquet_path.exists()
204 | gdf = gpd.read_parquet(parquet_path)
205 | assert isinstance(gdf, gpd.GeoDataFrame)
206 | assert gdf.crs.to_string() == "OGC:CRS84"
207 | assert len(gdf) == len(valid_geojson_obj.features)
208 | parquet_path.unlink()
209 |
210 |
211 | def test_bad_geojson_to_geoparquet(
212 | valid_geojson_obj: FeatureCollection,
213 | ):
214 | """Test the error handling of bad inputs."""
215 | # test bad geo_metadata
216 | with pytest.raises(ValueError):
217 | geojson_to_geoparquet(
218 | valid_geojson_obj,
219 | geo_metadata={"NOT": "VALID"},
220 | )
221 |
222 | # test bad column_schema
223 | with pytest.raises(ValueError):
224 | geojson_to_geoparquet(
225 | valid_geojson_obj,
226 | column_schema={"NOT": "VALID"},
227 | )
228 |
229 | # cant have properties as a column with other columns
230 | with pytest.raises(ValueError):
231 | geojson_to_geoparquet(
232 | valid_geojson_obj,
233 | column_schema=pyarrow.schema(
234 | [
235 | ("geometry", pyarrow.binary()),
236 | ("properties", pyarrow.string()),
237 | ("name", pyarrow.string()),
238 | ]
239 | ),
240 | )
241 |
242 |
243 | def test_valid_geoparquet_to_geojson(
244 | valid_geoparquet_file: Path,
245 | ):
246 | """Test the conversion of a valid GeoParquet file to a valid GeoJSON object."""
247 | # test defaults
248 | geojson = geoparquet_to_geojson(valid_geoparquet_file)
249 | assert isinstance(geojson, FeatureCollection)
250 | assert len(geojson.features) == 7
251 | for feature in geojson.features:
252 | assert isinstance(feature, geojson_pydantic.features.Feature)
253 | assert isinstance(
254 | feature.geometry, geojson_pydantic.geometries._GeometryBase
255 | ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase)
256 | assert isinstance(feature.properties, dict)
257 | assert isinstance(feature.bbox, tuple)
258 | assert len(feature.bbox) == 4
259 |
260 | # test with max_chunk_size = 1
261 | geojson = geoparquet_to_geojson(valid_geoparquet_file, max_chunksize=1)
262 | assert isinstance(geojson, FeatureCollection)
263 | assert len(geojson.features) == 7
264 | for feature in geojson.features:
265 | assert isinstance(feature, geojson_pydantic.features.Feature)
266 | assert isinstance(
267 | feature.geometry, geojson_pydantic.geometries._GeometryBase
268 | ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase)
269 | assert isinstance(feature.properties, dict)
270 | assert isinstance(feature.bbox, tuple)
271 | assert len(feature.bbox) == 4
272 |
273 |
274 | def test_bad_geoparquet_to_geojson():
275 | # first we start with a table missing geo
276 | table = pyarrow.Table.from_pydict(
277 | {
278 | "geometry": [b"NOT_VALID_GEOMETRY"],
279 | "properties": ["{}"],
280 | }
281 | )
282 | parquet_path = Path("test.parquet")
283 | pyarrow.parquet.write_table(table, parquet_path)
284 | assert parquet_path.exists()
285 | with pytest.raises(ValueError):
286 | geoparquet_to_geojson(parquet_path)
287 | parquet_path.unlink()
288 |
289 | # now we test bad inputs
290 | with pytest.raises(ValueError):
291 | geoparquet_to_geojson(-999)
292 | with pytest.raises(ValueError):
293 | geoparquet_to_geojson(table, primary_column="NOT_VALID_COLUMN")
294 |
--------------------------------------------------------------------------------
/tests/test_data/valid.geojson:
--------------------------------------------------------------------------------
1 | {
2 | "type": "FeatureCollection",
3 | "features": [
4 | {
5 | "type": "Feature",
6 | "geometry": {
7 | "type": "Point",
8 | "coordinates": [0, 0]
9 | },
10 | "properties": {
11 | "name": "Point Feature"
12 | }
13 | },
14 | {
15 | "type": "Feature",
16 | "geometry": {
17 | "type": "MultiPoint",
18 | "coordinates": [[1, 1], [2, 2], [3, 3]]
19 | },
20 | "properties": {
21 | "name": "MultiPoint Feature"
22 | }
23 | },
24 | {
25 | "type": "Feature",
26 | "geometry": {
27 | "type": "LineString",
28 | "coordinates": [[4, 4], [5, 5], [6, 6]]
29 | },
30 | "properties": {
31 | "name": "LineString Feature"
32 | }
33 | },
34 | {
35 | "type": "Feature",
36 | "geometry": {
37 | "type": "MultiLineString",
38 | "coordinates": [[[7, 7], [8, 8]], [[9, 9], [10, 10]]]
39 | },
40 | "properties": {
41 | "name": "MultiLineString Feature"
42 | }
43 | },
44 | {
45 | "type": "Feature",
46 | "geometry": {
47 | "type": "Polygon",
48 | "coordinates": [[[11, 11], [12, 12], [13, 13], [11, 11]]]
49 | },
50 | "properties": {
51 | "name": "Polygon Feature"
52 | }
53 | },
54 | {
55 | "type": "Feature",
56 | "geometry": {
57 | "type": "MultiPolygon",
58 | "coordinates": [[[[14, 14], [15, 15], [16, 16], [14, 14]]], [[[17, 17], [18, 18], [19, 19], [17, 17]]]]
59 | },
60 | "properties": {
61 | "name": "MultiPolygon Feature"
62 | }
63 | },
64 | {
65 | "type": "Feature",
66 | "geometry": {
67 | "type": "GeometryCollection",
68 | "geometries": [
69 | {
70 | "type": "Point",
71 | "coordinates": [20, 20]
72 | },
73 | {
74 | "type": "LineString",
75 | "coordinates": [[21, 21], [22, 22], [23, 23]]
76 | },
77 | {
78 | "type": "Polygon",
79 | "coordinates": [[[24, 24], [25, 25], [26, 26], [24, 24]]]
80 | }
81 | ]
82 | },
83 | "properties": {
84 | "name": "GeometryCollection Feature"
85 | }
86 | }
87 | ]
88 | }
89 |
--------------------------------------------------------------------------------
/tests/test_data/valid_geojson.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/tests/test_data/valid_geojson.parquet
--------------------------------------------------------------------------------
/tests/test_schemas.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pyproj import CRS
3 | from geoparquet_pydantic.schemas import (
4 | GeometryColumnMetadata,
5 | GeoParquetMetadata,
6 | )
7 |
8 |
9 | @pytest.fixture
10 | def good_geo_column_metadata():
11 | return {
12 | "encoding": "WKB",
13 | "geometry_types": ["Point"],
14 | "crs": "OGC:CRS84",
15 | "edges": "planar",
16 | "bbox": [0, 0, 25, 25],
17 | "epoch": None,
18 | "orientation": "counterclockwise",
19 | }
20 |
21 |
22 | def test_good_geo_column_metadata(good_geo_column_metadata):
23 | metadata = GeometryColumnMetadata(**good_geo_column_metadata)
24 | assert metadata.encoding == good_geo_column_metadata["encoding"]
25 | assert metadata.geometry_types == good_geo_column_metadata["geometry_types"]
26 | assert metadata.crs != good_geo_column_metadata["crs"]
27 | assert CRS.from_json(metadata.crs).to_string() == good_geo_column_metadata["crs"]
28 | assert metadata.edges == good_geo_column_metadata["edges"]
29 | assert metadata.bbox == good_geo_column_metadata["bbox"]
30 |     assert metadata.epoch is None
31 | assert metadata.orientation == good_geo_column_metadata["orientation"]
32 |
33 |
34 | def test_bad_geo_column_metadata(good_geo_column_metadata):
35 | """Test that the GeoColumnMetadata raises an error when given bad data."""
36 |
37 | # Test bad encoding
38 | bad_encoding = good_geo_column_metadata.copy()
39 | bad_encoding["encoding"] = "WKT"
40 | with pytest.raises(ValueError):
41 | GeometryColumnMetadata(**bad_encoding)
42 |
43 | # Test bad geometry types
44 | bad_geometry_types = good_geo_column_metadata.copy()
45 |     bad_geometry_types["geometry_types"] = ["NOT_A_REAL_TYPE"]
46 | with pytest.raises(ValueError):
47 | GeometryColumnMetadata(**bad_geometry_types)
48 |
49 | # Test bad CRS
50 | bad_crs = good_geo_column_metadata.copy()
51 | bad_crs["crs"] = "NOT_A_REAL_CRS"
52 | with pytest.raises(ValueError):
53 | GeometryColumnMetadata(**bad_crs)
54 |
55 | # Test bad edges
56 | bad_edges = good_geo_column_metadata.copy()
57 | bad_edges["edges"] = "NOT_A_REAL_EDGE"
58 | with pytest.raises(ValueError):
59 | GeometryColumnMetadata(**bad_edges)
60 |
61 | # Test bad bbox
62 | bad_bbox = good_geo_column_metadata.copy()
63 | bad_bbox["bbox"] = [0, 0, 25]
64 | with pytest.raises(ValueError):
65 | GeometryColumnMetadata(**bad_bbox)
66 |
67 | # Test bad epoch
68 | bad_epoch = good_geo_column_metadata.copy()
69 | bad_epoch["epoch"] = "NOT_A_REAL_EPOCH"
70 | with pytest.raises(ValueError):
71 | GeometryColumnMetadata(**bad_epoch)
72 |
73 | # Test bad orientation
74 | bad_orientation = good_geo_column_metadata.copy()
75 | bad_orientation["orientation"] = "NOT_A_REAL_ORIENTATION"
76 | with pytest.raises(ValueError):
77 | GeometryColumnMetadata(**bad_orientation)
78 |
79 |
80 | def test_good_geoparquet(good_geo_column_metadata):
81 |
82 | # minimum inputs
83 | geo_parquet = GeoParquetMetadata(
84 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
85 | )
86 | assert geo_parquet.version == "1.1.0-dev"
87 | assert geo_parquet.primary_column == "geometry"
88 | assert isinstance(geo_parquet.columns, dict)
89 | assert len(geo_parquet.columns) == 1
90 | assert "geometry" in geo_parquet.columns
91 | assert isinstance(geo_parquet.columns["geometry"], GeometryColumnMetadata)
92 |
93 | # maximum inputs
94 | geo_parquet = GeoParquetMetadata(
95 | version="1.0.0",
96 | primary_column="geom",
97 | columns={"geom": GeometryColumnMetadata(**good_geo_column_metadata)},
98 | )
99 | assert geo_parquet.version == "1.0.0"
100 | assert geo_parquet.primary_column == "geom"
101 | assert isinstance(geo_parquet.columns, dict)
102 | assert len(geo_parquet.columns) == 1
103 | assert "geom" in geo_parquet.columns
104 | assert isinstance(geo_parquet.columns["geom"], GeometryColumnMetadata)
105 |
106 |
107 | def test_bad_geoparquet(good_geo_column_metadata):
108 |
109 | # Test bad version
110 | with pytest.raises(ValueError):
111 | GeoParquetMetadata(
112 | version=1.431243,
113 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
114 | )
115 |
116 | # Test bad primary_column
117 | with pytest.raises(ValueError):
118 | GeoParquetMetadata(
119 | primary_column=1.431243,
120 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
121 | )
122 |
123 | # Test bad columns
124 | with pytest.raises(ValueError):
125 | GeoParquetMetadata(
126 | columns={"geometry": "NOT_A_REAL_METADATA"},
127 | )
128 | with pytest.raises(ValueError):
129 | GeoParquetMetadata(
130 | columns="NOT_EVEN_A_DICT",
131 | )
132 | with pytest.raises(ValueError):
133 | GeoParquetMetadata(
134 | columns={"geometry": {"A_DICT": "BUT_NOT_VALID"}},
135 | )
136 |
137 | # Test missing primary_column
138 | with pytest.raises(ValueError):
139 | GeoParquetMetadata(
140 | primary_column="NOT_A_REAL_COLUMN",
141 | columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
142 | )
143 |
--------------------------------------------------------------------------------
/tests/test_validation.py:
--------------------------------------------------------------------------------
1 | import pyarrow
2 | import pyarrow.parquet
2 | import pytest
3 | from pathlib import Path
4 | from geoparquet_pydantic.validate import (
5 | validate_geoparquet_table,
6 | validate_geoparquet_file,
7 | )
8 |
9 |
10 | @pytest.fixture
11 | def no_geo_metadata_table() -> pyarrow.Table:
12 | return pyarrow.Table.from_pydict(
13 | {
14 | "geometry": [None],
15 | "id": [1],
16 | },
17 | metadata={"NOTGEO": "metadata"},
18 | )
19 |
20 |
21 | @pytest.fixture
22 | def bad_geo_metadata_table() -> pyarrow.Table:
23 | return pyarrow.Table.from_pydict(
24 | {
25 | "geometry": [None],
26 | "id": [1],
27 | },
28 | metadata={
29 | b"geo": b"{'version': '1.1.0-dev', 'primary_column': 'geometry', 'columns': {'geometry': 'not-a-geometry'}}"
30 | },
31 | )
32 |
33 |
34 | def test_validate_geoparquet_table(valid_geoparquet_table):
35 | """Test the validation of a valid GeoParquet table."""
36 | assert validate_geoparquet_table(valid_geoparquet_table)
37 |
38 |
39 | def test_invalid_geoparquet_table(no_geo_metadata_table, bad_geo_metadata_table):
40 | """Test the validation of an invalid GeoParquet table."""
41 |     assert not validate_geoparquet_table(no_geo_metadata_table)
42 |     assert not validate_geoparquet_table(bad_geo_metadata_table)
43 |
44 |
45 | def test_valid_geoparquet_file(valid_geoparquet_file: Path):
46 | """Test the validation of a valid GeoParquet file."""
47 | assert validate_geoparquet_file(valid_geoparquet_file)
48 | assert validate_geoparquet_file(str(valid_geoparquet_file))
49 | assert validate_geoparquet_file(
50 | pyarrow.parquet.ParquetFile(valid_geoparquet_file),
51 | )
52 |
53 |
54 | def test_invalid_geoparquet_file(no_geo_metadata_table, bad_geo_metadata_table):
55 | """Test the validation of an invalid GeoParquet file."""
56 |     pyarrow.parquet.write_table(no_geo_metadata_table, "test1.parquet")
57 |     assert not validate_geoparquet_file("test1.parquet")
58 |     Path("test1.parquet").unlink()
59 |
60 |     # the second file gets the malformed (rather than missing) "geo" metadata
61 |     pyarrow.parquet.write_table(bad_geo_metadata_table, "test2.parquet")
62 |     assert not validate_geoparquet_file("test2.parquet")
63 |     Path("test2.parquet").unlink()
63 |
--------------------------------------------------------------------------------