├── .github
    └── workflows
    │   ├── pre-commit.yml
    │   ├── publish_package.yml
    │   └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── dev-requirements.txt
├── imgs
    └── repo_logo.png
├── pypi_README.md
├── pyproject.toml
├── requirements.txt
├── src
    └── geoparquet_pydantic
    │   ├── __init__.py
    │   ├── convert.py
    │   ├── schemas.py
    │   └── validate.py
└── tests
    ├── conftest.py
    ├── test_conversions.py
    ├── test_data
        ├── valid.geojson
        └── valid_geojson.parquet
    ├── test_schemas.py
    └── test_validation.py


/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | name: pre-commit
 2 | 
 3 | on:
 4 |   pull_request:
 5 | 
 6 | jobs:
 7 |   pre-commit:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |     - uses: actions/checkout@v3
11 |     - uses: actions/setup-python@v3
12 |     - uses: pre-commit/action@v3.0.1
13 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_package.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | name: Publish Release to PyPI
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [published]
 9 | 
10 | permissions:
11 |   contents: read
12 | 
13 | jobs:
14 |   deploy:
15 | 
16 |     runs-on: ubuntu-latest
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v3
20 | 
21 |     - name: Set up Python
22 |       uses: actions/setup-python@v3
23 |       with:
24 |         python-version: '3.12'
25 | 
26 |     - name: Install publishing dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         pip install --upgrade build
30 | 
31 |     - name: Build package
32 |       run: python -m build --sdist --wheel . --outdir dist
33 | 
34 |     - name: Publish package to PyPi
35 |       uses: pypa/gh-action-pypi-publish@release/v1
36 |       with:
37 |         password: ${{ secrets.PYPI_PASSWORD }}
38 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 |     branches:
 9 |       - main
10 | 
11 | jobs:
12 |   run:
13 |     runs-on: ${{ matrix.os }}
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.11", "3.12"]
17 |         os: [ubuntu-latest]
18 | 
19 |     steps:
20 |       - uses: actions/checkout@v4
21 | 
22 |       - name: Install Python
23 |         uses: actions/setup-python@v5
24 |         with:
25 |           python-version: ${{ matrix.python-version }}
26 | 
27 |       - name: Install dependencies
28 |         run: |
29 |           python --version
30 |           pip install -U pip
31 |           pip install -r requirements.txt
32 |           pip install -r dev-requirements.txt
33 |           pip list
34 | 
35 |       - name: Install the library
36 |         run: |
37 |           pip install -e .
38 |           pip list
39 | 
40 |       - name: Run tests
41 |         run: pytest --cov=geoparquet_pydantic --cov-report=xml tests/
42 | 
43 |       - name: Upload coverage report to CodeCov
44 |         uses: codecov/codecov-action@v3
45 |         env:
46 |           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | # - repo: https://github.com/RobertCraigie/pyright-python
 3 | #   rev: v1.1.351
 4 | #   hooks:
 5 | #   - id: pyright
 6 | #     exclude: ^tests/.*
 7 | #     additional_dependencies: [pyarrow, shapely, geojson-pydantic]
 8 |   - repo: https://github.com/PyCQA/docformatter
 9 |     rev: v1.7.5
10 |     hooks:
11 |       - id: docformatter
12 |         additional_dependencies: [tomli]
13 |         args: [--black, --in-place]
14 | 
15 |   - repo: https://github.com/pre-commit/pre-commit-hooks
16 |     rev: v4.4.0
17 |     hooks:
18 |       - id: trailing-whitespace
19 |       - id: check-ast
20 |       - id: check-case-conflict
21 |       - id: debug-statements
22 |       - id: end-of-file-fixer
23 |       - id: check-docstring-first
24 |       - id: check-added-large-files
25 | 
26 |   - repo: https://github.com/psf/black-pre-commit-mirror
27 |     rev: 24.2.0
28 |     hooks:
29 |       - id: black
30 |         language_version: python3.12
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Xavier Nogueira
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GeoParquet-Pydantic
  2 | 
  3 | <p align="center">
  4 |   <img src="https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/imgs/repo_logo.png" alt="Logo">
  5 | </p>
  6 | <p align="center">
  7 |   <em> A lightweight, <a href="https://docs.pydantic.dev/latest/" target="_blank">pydantic</a> centric library for validating GeoParquet files (or PyArrow Tables) and converting between GeoJSON and GeoParquet...without GDAL!</em>
  8 | </p>
  9 | <p align="center">
 10 |   <a href="https://github.com/xaviernogueira/geoparquet-pydantic/actions/workflows/pre-commit.yml" target="_blank">
 11 |       <img src="https://github.com/xaviernogueira/geoparquet-pydantic/workflows/pre-commit/badge.svg" alt="Pre-Commit">
 12 |   </a>
 13 |   <a href="https://github.com/xaviernogueira/geoparquet-pydantic/actions/workflows/tests.yml" target="_blank">
 14 |       <img src="https://github.com/xaviernogueira/geoparquet-pydantic/workflows/tests/badge.svg" alt="Tests">
 15 |   </a>
 16 |   <a href="https://codecov.io/gh/xaviernogueira/geoparquet-pydantic" target="_blank">
 17 |       <img src="https://codecov.io/gh/xaviernogueira/geoparquet-pydantic/branch/main/graph/badge.svg" alt="Coverage">
 18 |   </a>
 19 |   <a href="https://pypi.org/project/geoparquet-pydantic" target="_blank">
 20 |       <img src="https://img.shields.io/pypi/v/geoparquet-pydantic.svg" alt="Package version">
 21 |   </a>
 22 |   <a href="https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/LICENSE" target="_blank">
 23 |       <img src="https://img.shields.io/github/license/xaviernogueira/geoparquet-pydantic.svg" alt="License">
 24 |   </a>
 25 | </p>
 26 | 
 27 | ---
 28 | **Motivation:** This project started at the 2024 San Fransisco GeoParquet Community hackathon, and arose out of a simple observation:
 29 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions.
 30 | 
 31 | **Is this library the right choice for you?:**
 32 | * Do you need to use a wide variety of Geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyways,
 33 | making this ibrary's conversion functions *probably* redundant.
 34 | * Is your workflow command line centric? If so you may want to consider Planet Lab's simular CLI tool [`gpq`](https://github.com/planetlabs/gpq),
 35 | which is written in Go and substantially faster.
 36 | * Otherwise, if you are using Python and want to avoid unnecessary bulky dependencies, this library will be a great choice!
 37 | 
 38 | **Note:** All user-exposed functions and schema classes are available at the top level (i.e., `geoparquet_pydantic.validate_geoparquet_table(...)`) of this library.
 39 | 
 40 | # Features
 41 | 
 42 | ## `pydantic` Schemas
 43 | 
 44 | * [`GeometryColumnMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L40): A `pydantic` model that validates a
 45 | geometry column's (aka `primary_column`) metadata. This is nested within the following schema.
 46 | * [`GeoParquetMetadata`](https://github.com/xaviernogueira/geoparquet-pydantic/blob/cec560451db01cd5c4a4b1fea6486c86975f7499/geoparquet_pydantic/schemas.py#L93): A `pydantic` model for the metadata assigned to the "geo" key in a `pyarrow.Table`
 47 | that allows it to be read by GeoParquet readers once saved.
 48 | 
 49 | For an explanation of these schemas, please refence the [geoparquet repository](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
 50 | 
 51 | ## Validation functions
 52 | 
 53 | Convenience functions that simply uses `GeoParquetMetadata` to return a `bool` depending on whether the GeoParquet metadata obeys the [schema](https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md).
 54 | 
 55 | ### Validate a `pyarrow.Table`'s GeoParquet metadata:
 56 | ```python
 57 | def validate_geoparquet_table(
 58 |     table: pyarrow.Table,
 59 |     primary_column: Optional[str] = None,
 60 | ) -> bool:
 61 |   """Validates a the GeoParquet metadata of a pyarrow.Table.
 62 | 
 63 |     Args:
 64 |         table (pyarrow.Table): The table to validate.
 65 |         primary_column (Optional[str], optional): The name of the primary geometry column.
 66 |             Defaults to None.
 67 | 
 68 |     Returns:
 69 |         bool: True if the metadata is valid, False otherwise.
 70 |     """
 71 |     ...
 72 | ```
 73 | 
 74 | ### Validate a Parquet file's GeoParquet metadata:
 75 | 
 76 | ```python
 77 | def validate_geoparquet_file(
 78 |     geoparquet_file: str | Path | pyarrow.parquet.ParquetFile,
 79 |     primary_column: Optional[str] = None,
 80 |     read_file_kwargs: Optional[dict] = None,
 81 | ) -> bool:
 82 |     """Validates that a parquet file has correct GeoParquet metadata without opening it.
 83 | 
 84 |     Args:
 85 |         geoparquet_file (str | Path | ParquetFile): The file to validate.
 86 |         primary_column (str, optional): The primary column name. Defaults to 'geometry'.
 87 |         read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile().
 88 |             See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile
 89 | 
 90 |     Returns:
 91 |         bool: True if the metadata is valid, False otherwise.
 92 |     """
 93 |     ...
 94 | ```
 95 | 
 96 | ## Conversion functions
 97 | 
 98 | ### Convert from `geojson_pydantic.FeatureCollection` to a GeoParquet `pyarrow.Table`
 99 | 
100 | ```python
101 | def geojson_to_geoparquet(
102 |     geojson: FeatureCollection | Path,
103 |     primary_column: Optional[str] = None,
104 |     column_schema: Optional[pyarrow.Schema] = None,
105 |     add_none_values: Optional[bool] = False,
106 |     geo_metadata: GeoParquetMetadata | dict | None = None,
107 |     **kwargs,
108 | ) -> pyarrow.Table:
109 |     """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet
110 |     metadata.
111 | 
112 |     To save to a file, simply use pyarrow.parquet.write_table() on the returned table.
113 | 
114 |     Args:
115 |         geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection.
116 |         primary_column (str, optional): The name of the primary column. Defaults to None.
117 |         column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None.
118 |         add_none_values (bool, default=False): Whether to fill missing column values
119 |             specified in param:column_schema with 'None' (converts to pyarrow.null()).
120 |         geo_metadata (GeoParquet | dict | None, optional): The GeoParquet metadata.
121 |         **kwargs: Additional keyword arguments for the Arrow table writer.
122 | 
123 |     Returns:
124 |         The Arrow table with GeoParquet metadata.
125 |     """
126 |     ...
127 | ```
128 | 
129 | ### Convert from a GeoParquet `pyarrow.Table` or file to a `geojson_pydantic.FeatureCollection`
130 | 
131 | ```python
132 | def geoparquet_to_geojson(
133 |     geoparquet: pyarrow.Table | str | Path,
134 |     primary_column: Optional[str] = None,
135 |     max_chunksize: Optional[int] = None,
136 |     max_workers: Optional[int] = None,
137 | ) -> FeatureCollection:
138 |     """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic
139 |     FeatureCollection.
140 | 
141 |     Args:
142 |         geoparquet (pyarrow.Table): Either an Arrow.Table or parquet with GeoParquet metadata.
143 |         primary_column (str, optional): The name of the primary column. Defaults to 'geometry'.
144 |         max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000.
145 |         max_workers (int, optional): The maximum number of workers to use for parallel processing.
146 |             Defaults to 0 (runs sequentially). Use -1 for all available cores.
147 | 
148 |     Returns:
149 |         FeatureCollection: The GeoJSON Pydantic FeatureCollection.
150 |     """
151 |     ...
152 | ```
153 | 
154 | # Getting Started
155 | 
156 | Install from [PyPi](https://pypi.org/project/geoparquet-pydantic):
157 | ```bash
158 | pip install geoparquet-pydantic
159 | ```
160 | 
161 | Or from source:
162 | ```bash
163 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git
164 | $ cd geoparquet-pydantic
165 | $ pip install .
166 | ```
167 | 
168 | Then import with an underscore:
169 | ```python
170 | import geoparquet_pydantic
171 | ```
172 | 
173 | Or just import the functions/classes you need from the top-level:
174 | ```python
175 | from geoparquet_pydantic import (
176 |   GeometryColumnMetadata,
177 |   GeoParquetMetadata,
178 |   validate_geoparquet_table,
179 |   validate_geoparquet_file,
180 |   geojson_to_geoparquet,
181 |   geoparquet_to_geojson,
182 | )
183 | ```
184 | 
185 | # Roadmap
186 | 
187 | - [ ] Make CLI file<>file functions w/ `click`.
188 | - [ ] Add parrallelized Parquet read for `geoparquet_pydantic.geoparquet_to_geojson()`.
189 | 
190 | # Contribute
191 | 
192 | We encourage contributions, feature requests, and bug reports!
193 | 
194 | Here is our recomended workflow:
195 | 
196 | * Use `dev-requirements.txt` to install our development dependencies.
197 | * Make your edits using `pyright` as a linter.
198 | * Use `pre-commit run --all-file` before commiting your work.
199 | * If you add a new feature, we request that you add test coverage for it.
200 | 
201 | Happy coding!
202 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | wheel
3 | pytest
4 | pytest-cov
5 | black
6 | pyright
7 | pre-commit
8 | geopandas
9 | 


--------------------------------------------------------------------------------
/imgs/repo_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/imgs/repo_logo.png


--------------------------------------------------------------------------------
/pypi_README.md:
--------------------------------------------------------------------------------
 1 | # GeoParquet-Pydantic
 2 | 
 3 | **Motivation:** This project started at the 2024 San Fransisco GeoParquet Community hackathon, and arose out of a simple observation:
 4 | why must Python users install the *massive* GDAL dependency (typically via GeoPandas) to do simple GeoJSON<>GeoParquet conversions.
 5 | 
 6 | **Is this library the right choice for you?:**
 7 | * Do you need to use a wide variety of Geospatial functions? If so, you will likely have to add GDAL/GeoPandas as a dependency anyways,
 8 | making this ibrary's conversion functions *probably* redundant.
 9 | * Is your workflow command line centric? If so you may want to consider Planet Lab's simular CLI tool [`gpq`](https://github.com/planetlabs/gpq),
10 | which is written in Go and substantially faster.
11 | * Otherwise, if you are using Python and want to avoid unnecessary bulky dependencies, this library will be a great choice!
12 | 
13 | **Note:** All user-exposed functions and schema classes are available at the top level (i.e., `geoparquet_pydantic.validate_geoparquet_table(...)`) of this library.
14 | 
15 | # Documentation is on GitHub [here](https://github.com/xaviernogueira/geoparquet-pydantic/blob/main/README.md)
16 | 
17 | # Getting Started
18 | 
19 | Install from [PyPi](https://pypi.org/project/geoparquet-pydantic):
20 | ```bash
21 | pip install geoparquet-pydantic
22 | ```
23 | 
24 | Or from source:
25 | ```bash
26 | $ git clone https://github.com/xaviernogueira/geoparquet-pydantic.git
27 | $ cd geoparquet-pydantic
28 | $ pip install .
29 | ```
30 | 
31 | Then import with an underscore:
32 | ```python
33 | import geoparquet_pydantic
34 | ```
35 | 
36 | Or just import the functions/classes you need from the top-level:
37 | ```python
38 | from geoparquet_pydantic import GeometryColumnMetadata
39 | from geoparquet_pydantic import GeoParquetMetadata
40 | from geoparquet_pydantic import validate_geoparquet_table
41 | from geoparquet_pydantic import validate_geoparquet_file
42 | from geoparquet_pydantic import geojson_to_geoparquet
43 | from geoparquet_pydantic import geoparquet_to_geojson
44 | ```
45 | 
46 | # Roadmap
47 | 
48 | - [ ] Make CLI file<>file functions w/ `click`.
49 | - [ ] Add parrallelized Parquet read for `geoparquet_pydantic.geoparquet_to_geojson()`.
50 | 
51 | # Contribute
52 | 
53 | We encourage contributions, feature requests, and bug reports!
54 | 
55 | Here is our recomended workflow:
56 | 
57 | * Use `dev-requirements.txt` to install our development dependencies.
58 | * Make your edits using `pyright` as a linter.
59 | * Use `pre-commit run --all-file` before commiting your work.
60 | * If you add a new feature, we request that you add test coverage for it.
61 | 
62 | Happy coding!
63 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = [
 3 |     "setuptools",
 4 |     "setuptools-scm",
 5 | ]
 6 | build-backend = "setuptools.build_meta"
 7 | 
 8 | [project]
 9 | name = "geoparquet_pydantic"
10 | description = "Read/write geoparquet with the geojson-pydanic models."
11 | requires-python = ">=3.11"
12 | keywords = [
13 |     "geoparquet",
14 |     "pydantic",
15 |     "geospatial",
16 | ]
17 | license = {text = "MIT"}
18 | classifiers = [
19 |     "Programming Language :: Python :: 3",
20 | ]
21 | dependencies = [
22 |     "geojson-pydantic",
23 |     "pyarrow",
24 |     "shapely",
25 |     "pyproj",
26 |     "click",
27 | ]
28 | dynamic = [
29 |     "version",
30 |     "readme",
31 | ]
32 | 
33 | [tool.setuptools.packages.find]
34 | where = ["src"]
35 | include = ["geoparquet_pydantic*"]
36 | 
37 | [tool.setuptools.dynamic]
38 | version = {attr = "geoparquet_pydantic.__version__"}
39 | readme = {file = "pypi_README.md"}
40 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | geojson-pydantic
2 | pyarrow
3 | shapely
4 | pyproj
5 | click
6 | 


--------------------------------------------------------------------------------
/src/geoparquet_pydantic/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "0.0.1"
 2 | from .schemas import (
 3 |     GeometryColumnMetadata,
 4 |     GeoParquetMetadata,
 5 | )
 6 | from .convert import (
 7 |     geojson_to_geoparquet,
 8 |     geoparquet_to_geojson,
 9 | )
10 | from .validate import (
11 |     validate_geoparquet_table,
12 |     validate_geoparquet_file,
13 | )
14 | 


--------------------------------------------------------------------------------
/src/geoparquet_pydantic/convert.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import functools
  3 | import warnings
  4 | import geojson_pydantic
  5 | from geojson_pydantic.types import BBox
  6 | import shapely.wkb
  7 | import shapely.wkt
  8 | import pyarrow
  9 | import json
 10 | from geojson_pydantic.geometries import (
 11 |     _GeometryBase,
 12 | )
 13 | from geojson_pydantic.features import (
 14 |     Feature,
 15 |     FeatureCollection,
 16 | )
 17 | from geoparquet_pydantic.schemas import (
 18 |     GeometryColumnMetadata,
 19 |     GeoParquetMetadata,
 20 |     GeometryTypes,
 21 | )
 22 | from pathlib import Path
 23 | from typing import Any, Optional, Iterable
 24 | 
 25 | 
 26 | def _to_wkb(geometry: _GeometryBase) -> bytes:
 27 |     """Converts the GeoJSON object to WKB format."""
 28 |     return shapely.wkb.dumps(shapely.wkt.loads(geometry.wkt))
 29 | 
 30 | 
 31 | def _get_geom_types(features: list[Feature]) -> list[str]:
 32 |     return list(set([feature.geometry.type for feature in features]))
 33 | 
 34 | 
 35 | def _get_default_geo_metadata(
 36 |     feature_collection: FeatureCollection,
 37 | ) -> GeoParquetMetadata:
 38 |     return GeoParquetMetadata(
 39 |         primary_column="geometry",
 40 |         columns={
 41 |             "geometry": GeometryColumnMetadata(
 42 |                 **{
 43 |                     "encoding": "WKB",
 44 |                     "geometry_types": _get_geom_types(feature_collection.features),
 45 |                 }
 46 |             ),
 47 |         },
 48 |     )
 49 | 
 50 | 
 51 | def _update_metadata(table: pyarrow.Table, metadata: dict) -> pyarrow.Table:
 52 |     new_metadata = table.schema.metadata
 53 |     if not new_metadata:
 54 |         new_metadata = {}
 55 |     for k, v in metadata.items():
 56 |         new_metadata[k] = json.dumps(v).encode("utf-8")
 57 |     return table.replace_schema_metadata(new_metadata)
 58 | 
 59 | 
 60 | def _validate_column_schema(
 61 |     column_schema: pyarrow.Schema,
 62 |     primary_column: str,
 63 |     geojson: FeatureCollection,
 64 |     add_none_values: bool,
 65 | ) -> None:
 66 |     names = [i for i in column_schema.names if i != primary_column]
 67 |     for feature in geojson.features:
 68 |         if not add_none_values:
 69 |             all_present = all([name in feature.properties.keys() for name in names])
 70 |             if not all_present:
 71 |                 raise ValueError(
 72 |                     f"Feature {feature} does not contain all the columns in the schema: {column_schema.names}",
 73 |                 )
 74 | 
 75 |         else:
 76 |             for name in names:
 77 |                 if not feature.properties.get(name):
 78 |                     feature.properties[name] = None
 79 | 
 80 | 
 81 | def geojson_to_geoparquet(
 82 |     geojson: FeatureCollection | Path,
 83 |     primary_column: Optional[str] = None,
 84 |     column_schema: Optional[pyarrow.Schema] = None,
 85 |     add_none_values: Optional[bool] = False,
 86 |     geo_metadata: GeoParquetMetadata | dict | None = None,
 87 |     **kwargs,
 88 | ) -> pyarrow.Table:
 89 |     """Converts a GeoJSON Pydantic FeatureCollection to an Arrow table with geoparquet
 90 |     metadata.
 91 | 
 92 |     To save to a file, simply use pyarrow.parquet.write_table() on the returned table.
 93 | 
 94 |     Args:
 95 |         geojson (FeatureCollection): The GeoJSON Pydantic FeatureCollection.
 96 |         primary_column (str, optional): The name of the primary column. Defaults to None.
 97 |         column_schema (pyarrow.Schema, optional): The Arrow schema for the table. Defaults to None.
 98 |         add_none_values (bool, default=False): Whether to fill missing column values
 99 |             specified in param:column_schema with 'None' (converts to pyarrow.null()).
100 |         geo_metadata (GeoParquet | dict | None, optional): The GeoParquet metadata.
101 |         **kwargs: Additional keyword arguments for the Arrow table writer.
102 | 
103 |     Returns:
104 |         The Arrow table with GeoParquet metadata.
105 |     """
106 |     if not isinstance(geojson, FeatureCollection):
107 |         geojson = FeatureCollection(**json.load(geojson.open("r")))
108 |     if not primary_column:
109 |         primary_column = "geometry"
110 | 
111 |     # get primary column as iterables
112 |     columns: list[Iterable] = [map(lambda f: _to_wkb(f.geometry), geojson.features)]
113 | 
114 |     # get geo metadata
115 |     if not geo_metadata:
116 |         geo_metadata = _get_default_geo_metadata(geojson)
117 |     if isinstance(geo_metadata, dict):
118 |         geo_metadata = GeoParquetMetadata(**geo_metadata)
119 |     if not isinstance(geo_metadata, GeoParquetMetadata):
120 |         raise ValueError("geo_metadata must be a valid GeoParquet class, dict, or None")
121 | 
122 |     # get other columns as iterables and update schema
123 |     if not column_schema:
124 |         column_schema = pyarrow.schema(
125 |             [
126 |                 (primary_column, pyarrow.binary()),
127 |                 ("properties", pyarrow.string()),
128 |             ]
129 |         )
130 |     elif isinstance(column_schema, pyarrow.Schema):
131 |         if primary_column in column_schema.names:
132 |             column_schema.remove(column_schema.get_field_index(primary_column))
133 |         column_schema.insert(0, pyarrow.field(primary_column, pyarrow.binary()))
134 |     else:
135 |         raise ValueError("column_schema must be a valid pyarrow.Schema or None")
136 | 
137 |     if "properties" in column_schema.names:
138 |         if len(column_schema.names) > 2:
139 |             raise ValueError(
140 |                 "Cannot have 'properties' as a column with other columns (which are pulled from GeoJSON propreties)."
141 |             )
142 |         columns.append(map(lambda f: json.dumps(f.properties), geojson.features))
143 | 
144 |     else:
145 |         _validate_column_schema(column_schema, primary_column, geojson, add_none_values)
146 | 
147 |         for col in column_schema.names:
148 |             columns.append(map(lambda f: f.properties.get(col), geojson.features))
149 | 
150 |     # write table
151 |     table = pyarrow.Table.from_pydict(
152 |         {**dict(zip(column_schema.names, columns))},
153 |         schema=column_schema,
154 |         **kwargs,
155 |     )
156 |     return _update_metadata(table, {"geo": geo_metadata.model_dump()})
157 | 
158 | 
159 | def _find_bbox(geoparquet: pyarrow.Table) -> BBox | None:
160 |     if not geoparquet.schema.metadata:
161 |         warnings.warn("No GeoParquet metadata found in the Arrow table.")
162 |         return None
163 |     decoded_metadata: dict[str, Any] = ast.literal_eval(
164 |         geoparquet.schema.metadata[b"geo"].decode("utf-8"),
165 |     )
166 |     bbox = decoded_metadata["columns"]["geometry"].get("bbox", None)
167 |     if isinstance(bbox, list):
168 |         bbox = tuple(bbox)
169 |     return bbox
170 | 
171 | 
172 | def _get_prop_records(name_value_tuple: tuple[str, list[Any]]) -> list[tuple[str, Any]]:
173 |     name, values = name_value_tuple
174 |     return list(zip([name] * len(values), values))
175 | 
176 | 
177 | def _shapely_to_feature(
178 |     geometry: shapely.geometry.base.BaseGeometry,
179 |     properties: list[tuple[str, Any]],
180 | ) -> Feature:
181 |     geom_class: type[GeometryTypes] = getattr(geojson_pydantic, type(geometry).__name__)
182 |     return Feature(
183 |         type="Feature",
184 |         geometry=geom_class(**json.loads(shapely.to_geojson(geometry))),
185 |         bbox=list(geometry.bounds),
186 |         properties=dict([*properties]),
187 |     )
188 | 
189 | 
190 | def geoparquet_to_geojson(
191 |     geoparquet: pyarrow.Table | str | Path,
192 |     primary_column: Optional[str] = None,
193 |     max_chunksize: Optional[int] = None,
194 |     max_workers: Optional[int] = None,
195 | ) -> FeatureCollection:
196 |     """Converts an Arrow table with GeoParquet metadata to a GeoJSON Pydantic
197 |     FeatureCollection.
198 | 
199 |     Args:
200 |         geoparquet (pyarrow.Table): Either an Arrow.Table or parquet with GeoParquet metadata.
201 |         primary_column (str, optional): The name of the primary column. Defaults to 'geometry'.
202 |         max_chunksize (int, optional): The maximum chunksize to read from the parquet file. Defaults to 1000.
203 |         max_workers (int, optional): The maximum number of workers to use for parallel processing.
204 |             Defaults to 0 (runs sequentially). Use -1 for all available cores.
205 |     Returns:
206 |         FeatureCollection: The GeoJSON Pydantic FeatureCollection.
207 |     """
208 |     if not primary_column:
209 |         primary_column = "geometry"
210 |     if not max_chunksize:
211 |         max_chunksize = 1000
212 |     if isinstance(geoparquet, (str, Path)):
213 |         geoparquet = pyarrow.parquet.read_table(geoparquet)
214 |     if not isinstance(geoparquet, pyarrow.Table):
215 |         raise ValueError(
216 |             "param:geoparquet must be a valid pyarrow.Table or parquet file"
217 |         )
218 | 
219 |     if primary_column not in geoparquet.column_names:
220 |         raise ValueError(f"Primary column {primary_column} not found in the table.")
221 | 
222 |     # attempt to get the bbox from metadata
223 |     bbox: BBox | None = _find_bbox(geoparquet)
224 | 
225 |     # TODO: parallelize this (optionally)
226 |     if max_workers:
227 |         raise NotImplementedError("Parallel processing not yet implemented.")
228 | 
229 |     feature_lists: list[list[Feature]] = []
230 |     for chunk in geoparquet.to_batches(max_chunksize):
231 |         chunk_dict = chunk.to_pydict()
232 |         geoms: list[bytes] = chunk_dict.pop(primary_column)
233 |         properties: Iterable[list[tuple[str, Any]]] = map(
234 |             _get_prop_records,
235 |             chunk_dict.items(),
236 |         )
237 |         feature_props: Iterable[list[tuple[str, Any]]] = map(
238 |             lambda i: [p[i] for p in properties],
239 |             range(len(geoms)),
240 |         )
241 |         try:
242 |             chunk_features: Iterable[Feature] = list(
243 |                 map(
244 |                     lambda gp: _shapely_to_feature(shapely.from_wkb(gp[0]), gp[1]),
245 |                     zip(geoms, feature_props),
246 |                 )
247 |             )
248 |         except shapely.errors.GEOSException as e:
249 |             raise ValueError(
250 |                 f"Error converting WKB to shapely geometry. Make sure the WKB is valid! Exception: {e}"
251 |             )
252 | 
253 |         feature_lists.append(chunk_features)
254 |     features: list[Feature] = list(functools.reduce(lambda a, b: a + b, feature_lists))
255 | 
256 |     return FeatureCollection(
257 |         type="FeatureCollection",
258 |         features=features,
259 |         bbox=bbox,
260 |     )
261 | 


--------------------------------------------------------------------------------
/src/geoparquet_pydantic/schemas.py:
--------------------------------------------------------------------------------
  1 | """Pydantic models for GeoParquet metadata."""
  2 | 
  3 | import ast
  4 | from pydantic import BeforeValidator, Field, BaseModel, field_validator, model_validator
  5 | from typing import Annotated, Optional, Literal, Union
  6 | from pyproj import CRS
  7 | 
  8 | EdgeType = Literal["planar", "spherical"]
  9 | 
 10 | FlatGeometryTypes = Annotated[
 11 |     # TODO: support 3d geometries with Z suffix
 12 |     Literal[
 13 |         "Point",
 14 |         "MultiPoint",
 15 |         "LineString",
 16 |         "MultiLineString",
 17 |         "Polygon",
 18 |         "MultiPolygon",
 19 |         "GeometryCollection",
 20 |     ],
 21 |     Field(description="The geometry types supported by the column"),
 22 | ]
 23 | 
 24 | ZGeometryTypes = Annotated[
 25 |     Literal[
 26 |         "PointZ",
 27 |         "MultiPointZ",
 28 |         "LineStringZ",
 29 |         "MultiLineStringZ",
 30 |         "PolygonZ",
 31 |         "MultiPolygonZ",
 32 |         "GeometryCollectionZ",
 33 |     ],
 34 |     Field(description="3D geometry types supported by the column"),
 35 | ]
 36 | 
 37 | GeometryTypes = Union[FlatGeometryTypes, ZGeometryTypes]
 38 | 
 39 | 
 40 | class GeometryColumnMetadata(BaseModel):
 41 |     encoding: Literal["WKB"]
 42 |     geometry_types: list[GeometryTypes]
 43 | 
 44 |     crs: Annotated[
 45 |         str,
 46 |         Field(
 47 |             description="The CRS of the geometry column in a string format readable by pyproj. Is the converted to PROJJSON format"
 48 |         ),
 49 |     ] = "OGC:CRS84"
 50 | 
 51 |     edges: Annotated[
 52 |         EdgeType, Field(description="The type of edges of the geometries")
 53 |     ] = "planar"
 54 | 
 55 |     bbox: Optional[
 56 |         Annotated[list[float], Field(description="The bounding box of the geometries")]
 57 |     ] = None
 58 | 
 59 |     epoch: Optional[
 60 |         Annotated[
 61 |             Union[int, float],
 62 |             Field(description="Coordinate epoch in case of a dynamic CRS"),
 63 |         ]
 64 |     ] = None
 65 | 
 66 |     orientation: Literal["counterclockwise"] = "counterclockwise"
 67 | 
 68 |     @field_validator("crs")
 69 |     @classmethod
 70 |     def convert_crs_to_projjson(cls, v) -> str:
 71 |         """Parse a CRS string and return a PROJJSON string."""
 72 |         try:
 73 |             crs = CRS.from_string(v)
 74 |             return crs.to_json()
 75 |         except Exception as e:
 76 |             raise ValueError(f"Invalid CRS string: {e}")
 77 | 
 78 |     @field_validator("geometry_types")
 79 |     @classmethod
 80 |     def only_unique_types(cls, v):
 81 |         if len(v) != len(set(v)):
 82 |             raise ValueError("geometry_types items must be unique!")
 83 |         return v
 84 | 
 85 |     @field_validator("bbox")
 86 |     @classmethod
 87 |     def must_be_length_4(cls, v):
 88 |         if v is not None and len(v) != 4:
 89 |             raise ValueError("bbox must be a list of 4 floats!")
 90 |         return v
 91 | 
 92 | 
 93 | class GeoParquetMetadata(BaseModel):
 94 |     version: Annotated[
 95 |         str, Field(description="The version of the GeoParquet format")
 96 |     ] = "1.1.0-dev"
 97 |     primary_column: Annotated[
 98 |         str, Field(description="The name of the geometry primary column")
 99 |     ] = "geometry"
100 |     columns: Annotated[
101 |         dict[str, GeometryColumnMetadata | dict | str],
102 |         Field(description="Metadata for each column (keys)"),
103 |     ]
104 | 
105 |     @model_validator(mode="after")
106 |     def contains_primary_col(self) -> "GeoParquetMetadata":
107 |         if not self.primary_column in self.columns.keys():
108 |             raise ValueError(
109 |                 f"primary column={self.primary_column} not in arg:columns={self.columns}"
110 |             )
111 |         return self
112 | 
113 |     @model_validator(mode="after")
114 |     def convert_geo_to_class(self) -> "GeoParquetMetadata":
115 |         if not isinstance(self.columns[self.primary_column], GeometryColumnMetadata):
116 |             if isinstance(self.columns[self.primary_column], str):
117 |                 self.columns[self.primary_column] = ast.literal_eval(
118 |                     self.columns[self.primary_column]
119 |                 )
120 |             if isinstance(self.columns[self.primary_column], dict):
121 |                 self.columns[self.primary_column] = GeometryColumnMetadata(
122 |                     **self.columns[self.primary_column]
123 |                 )
124 |             else:
125 |                 raise ValueError(
126 |                     f"Invalid primary column metadata: {self.columns[self.primary_column]}"
127 |                 )
128 |         return self
129 | 


--------------------------------------------------------------------------------
/src/geoparquet_pydantic/validate.py:
--------------------------------------------------------------------------------
 1 | """For validating an existing GeoParquet file or Arrow table.
 2 | 
 3 | Note that validating GeoParquet metadata can be handles with the
 4 | `.schemas` module pydantic classes.
 5 | """
 6 | 
 7 | import ast
 8 | import pyarrow
 9 | from geoparquet_pydantic.schemas import (
10 |     GeoParquetMetadata,
11 | )
12 | from typing import Optional
13 | from pathlib import Path
14 | 
15 | 
16 | def _validate_geo_metadata(metadata: dict[bytes, bytes]) -> bool:
17 |     try:
18 |         geo_metadata = ast.literal_eval(metadata[b"geo"].decode("utf-8"))
19 |         GeoParquetMetadata(**geo_metadata)
20 |         print("Valid GeoParquet metadata!")
21 |         return True
22 |     except KeyError as e:
23 |         print(f"Invalid GeoParquet metadata, could not find b'geo' key: {e}")
24 |     except ValueError as e:
25 |         print(f"Invalid GeoParquet metadata: {e}")
26 |     return False
27 | 
28 | 
29 | def validate_geoparquet_table(
30 |     table: pyarrow.Table,
31 |     primary_column: Optional[str] = None,
32 | ) -> bool:
33 |     """Validates a the GeoParquet metadata of a pyarrow.Table.
34 | 
35 |     See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md
36 | 
37 |     Args:
38 |         table (pyarrow.Table): The table to validate.
39 |         primary_column (Optional[str], optional): The name of the primary geometry column.
40 |             Defaults to None.
41 | 
42 |     Returns:
43 |         bool: True if the metadata is valid, False otherwise.
44 |     """
45 |     if not primary_column:
46 |         primary_column = "geometry"
47 |     return _validate_geo_metadata(table.schema.metadata)
48 | 
49 | 
50 | def validate_geoparquet_file(
51 |     geoparquet_file: str | Path | pyarrow.parquet.ParquetFile,
52 |     primary_column: Optional[str] = None,
53 |     read_file_kwargs: Optional[dict] = None,
54 | ) -> bool:
55 |     """Validates that a parquet file has correct GeoParquet metadata without opening it.
56 | 
57 |     See: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md
58 | 
59 |     Args:
60 |         geoparquet_file (str | Path | ParquetFile): The file to validate.
61 |         primary_column (str, optional): The primary column name. Defaults to 'geometry'.
62 |         read_file_kwargs (dict, optional): Kwargs to be passed into pyarrow.parquet.ParquetFile().
63 |             See: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow-parquet-parquetfile
64 | 
65 |     Returns:
66 |         bool: True if the metadata is valid, False otherwise.
67 |     """
68 |     if not primary_column:
69 |         primary_column = "geometry"
70 |     default_read_file_kwargs = {
71 |         "memory_map": True,
72 |     }
73 |     if read_file_kwargs is None:
74 |         read_file_kwargs = default_read_file_kwargs
75 |     elif isinstance(read_file_kwargs, dict):
76 |         for k, v in default_read_file_kwargs.items():
77 |             if k not in read_file_kwargs:
78 |                 read_file_kwargs[k] = v
79 |     else:
80 |         raise TypeError(f"Optional param:read_file_kwargs must be a dict or None!")
81 | 
82 |     if isinstance(geoparquet_file, (str, Path)):
83 |         geoparquet_file = pyarrow.parquet.ParquetFile(
84 |             geoparquet_file,
85 |             **read_file_kwargs,
86 |         )
87 |     if not isinstance(geoparquet_file, pyarrow.parquet.ParquetFile):
88 |         raise TypeError(
89 |             "Input must be a file path (str | Path) or a ParquetFile object!"
90 |         )
91 |     return _validate_geo_metadata(geoparquet_file.schema_arrow.metadata)
92 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import json
 3 | import pyarrow
 4 | from pathlib import Path
 5 | from geojson_pydantic.features import FeatureCollection
 6 | 
 7 | # get the path to the data directory
 8 | TEST_DATA_DIR = Path(__file__).parent / "test_data"
 9 | 
10 | 
11 | @pytest.fixture
12 | def valid_geojson_file() -> Path:
13 |     valid_geojson = TEST_DATA_DIR / "valid.geojson"
14 |     assert valid_geojson.exists()
15 |     return valid_geojson
16 | 
17 | 
18 | @pytest.fixture
19 | def valid_geojson_obj(valid_geojson_file) -> FeatureCollection:
20 |     return FeatureCollection(**json.load(open(valid_geojson_file, "r")))
21 | 
22 | 
23 | @pytest.fixture
24 | def valid_geoparquet_file() -> Path:
25 |     valid_geoparquet = TEST_DATA_DIR / "valid_geojson.parquet"
26 |     assert valid_geoparquet.exists()
27 |     return valid_geoparquet
28 | 
29 | 
30 | @pytest.fixture
31 | def valid_geoparquet_table(valid_geoparquet_file) -> pyarrow.Table:
32 |     return pyarrow.parquet.read_table(valid_geoparquet_file)
33 | 


--------------------------------------------------------------------------------
/tests/test_conversions.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import json
  3 | import pyarrow
  4 | from pathlib import Path
  5 | import geojson_pydantic
  6 | import geopandas as gpd
  7 | import pyarrow.parquet
  8 | from geojson_pydantic.features import FeatureCollection
  9 | 
 10 | from geoparquet_pydantic.schemas import (
 11 |     GeoParquetMetadata,
 12 | )
 13 | from geoparquet_pydantic.convert import (
 14 |     _to_wkb,
 15 |     _get_geom_types,
 16 |     _get_default_geo_metadata,
 17 |     _update_metadata,
 18 |     _validate_column_schema,
 19 |     geojson_to_geoparquet,
 20 |     geoparquet_to_geojson,
 21 | )
 22 | import shapely
 23 | 
 24 | from geoparquet_pydantic.validate import validate_geoparquet_file
 25 | 
 26 | 
 27 | @pytest.fixture
 28 | def geometry_type_examples(
 29 |     valid_geojson_obj: FeatureCollection,
 30 | ) -> dict[str, geojson_pydantic.geometries._GeometryBase]:
 31 |     geometry_types = {}
 32 |     for feature in valid_geojson_obj.features:
 33 |         if feature.geometry.type not in geometry_types:
 34 |             geometry_types[feature.geometry.type] = feature.geometry
 35 |     assert len(geometry_types) == 7
 36 |     for k, v in geometry_types.items():
 37 |         assert isinstance(v, getattr(geojson_pydantic.geometries, k))
 38 | 
 39 |     return geometry_types
 40 | 
 41 | 
 42 | @pytest.fixture
 43 | def mock_table() -> pyarrow.Table:
 44 |     table_dict = {
 45 |         "col1": [1, 2, 3],
 46 |         "col2": [4, 5, 6],
 47 |     }
 48 |     metadata = {b"key": b"value"}
 49 |     table = pyarrow.Table.from_pydict(table_dict, metadata=metadata)
 50 |     assert isinstance(table, pyarrow.Table)
 51 |     assert table.schema.metadata == metadata
 52 |     return table
 53 | 
 54 | 
 55 | def test_to_wkb(
 56 |     geometry_type_examples: dict[str, geojson_pydantic.geometries._GeometryBase]
 57 | ):
 58 |     """Test the conversion of a GeoJSON object to WKB format."""
 59 |     for k, v in geometry_type_examples.items():
 60 |         wkb = _to_wkb(v)
 61 |         assert isinstance(wkb, bytes)
 62 |         assert len(wkb) > 0
 63 |         back_in = shapely.wkb.loads(wkb)
 64 |         assert isinstance(back_in, getattr(shapely.geometry, k))
 65 | 
 66 | 
 67 | def test_get_geom_types(
 68 |     valid_geojson_obj: FeatureCollection,
 69 | ):
 70 |     """Test the extraction of unique geometry types from a GeoJSON object."""
 71 |     geom_types = _get_geom_types(valid_geojson_obj.features)
 72 |     assert isinstance(geom_types, list)
 73 |     assert len(geom_types) == 7
 74 |     assert set(geom_types) == {
 75 |         "Point",
 76 |         "MultiPoint",
 77 |         "LineString",
 78 |         "MultiLineString",
 79 |         "Polygon",
 80 |         "MultiPolygon",
 81 |         "GeometryCollection",
 82 |     }
 83 | 
 84 | 
 85 | def test_get_default_geo_metadata(
 86 |     valid_geojson_obj: FeatureCollection,
 87 | ):
 88 |     default_metadata = _get_default_geo_metadata(valid_geojson_obj)
 89 |     assert isinstance(default_metadata, GeoParquetMetadata)
 90 |     assert default_metadata.columns["geometry"].geometry_types == _get_geom_types(
 91 |         valid_geojson_obj.features
 92 |     )
 93 | 
 94 | 
 95 | def test_update_metadata(
 96 |     mock_table: pyarrow.Table,
 97 | ):
 98 |     new_metadata = {"new_key": "new_value"}
 99 |     new_table = _update_metadata(mock_table, new_metadata)
100 |     assert isinstance(new_table, pyarrow.Table)
101 |     assert b"new_key" in new_table.schema.metadata
102 |     assert b"key" in new_table.schema.metadata
103 | 
104 | 
105 | def test_validate_column_schema(
106 |     valid_geojson_obj: FeatureCollection,
107 | ):
108 |     # make updated FeatureCollection properties
109 |     for i, feature in enumerate(valid_geojson_obj.features):
110 |         feature.properties["number"] = i
111 |     mock_schema = pyarrow.schema(
112 |         [
113 |             ("geometry", pyarrow.binary()),
114 |             ("name", pyarrow.string()),
115 |             ("number", pyarrow.int64()),
116 |         ]
117 |     )
118 |     # test with valid schema
119 |     _validate_column_schema(
120 |         mock_schema,
121 |         primary_column="geometry",
122 |         geojson=valid_geojson_obj,
123 |         add_none_values=False,
124 |     )
125 |     _validate_column_schema(
126 |         mock_schema,
127 |         primary_column="geometry",
128 |         geojson=valid_geojson_obj,
129 |         add_none_values=True,
130 |     )
131 | 
132 |     # test with invalid schema
133 |     for i, feature in enumerate(valid_geojson_obj.features):
134 |         if i % 2 == 0:
135 |             feature.properties = {}
136 |             assert not feature.properties
137 |     with pytest.raises(ValueError):
138 |         _validate_column_schema(
139 |             mock_schema,
140 |             "geometry",
141 |             valid_geojson_obj,
142 |             False,
143 |         )
144 | 
145 |     # now test that it can add Nones
146 |     _validate_column_schema(
147 |         mock_schema,
148 |         "geometry",
149 |         valid_geojson_obj,
150 |         True,
151 |     )
152 | 
153 | 
154 | def test_geojson_to_geoparquet(
155 |     valid_geojson_obj: FeatureCollection,
156 | ):
157 |     """Test the conversion of a valid GeoJSON file and pydantic object to a valid
158 |     GeoParquet table."""
159 | 
160 |     # convert the GeoJSON object to a GeoParquet table with minimal optional
161 |     table = geojson_to_geoparquet(valid_geojson_obj)
162 |     assert isinstance(table, pyarrow.Table)
163 |     table.validate(full=True)
164 |     table_dict = table.to_pydict()
165 |     assert "geometry" in table_dict
166 |     assert len(table_dict["geometry"]) == len(valid_geojson_obj.features)
167 |     assert "properties" in table_dict
168 |     assert (
169 |         json.loads(table_dict["properties"][0])
170 |         == valid_geojson_obj.features[0].properties
171 |     )
172 | 
173 |     parquet_path = Path("test.parquet")
174 |     pyarrow.parquet.write_table(table, parquet_path)
175 |     assert parquet_path.exists()
176 |     gdf = gpd.read_parquet(parquet_path)
177 |     assert isinstance(gdf, gpd.GeoDataFrame)
178 |     assert gdf.crs.to_string() == "OGC:CRS84"
179 |     assert len(gdf) == len(valid_geojson_obj.features)
180 |     parquet_path.unlink()
181 | 
182 |     # try again with more arguments
183 |     metadata = _get_default_geo_metadata(valid_geojson_obj)
184 |     table = geojson_to_geoparquet(
185 |         valid_geojson_obj,
186 |         geo_metadata=metadata,
187 |         column_schema=pyarrow.schema(
188 |             [
189 |                 ("geometry", pyarrow.binary()),
190 |                 ("name", pyarrow.string()),
191 |             ]
192 |         ),
193 |     )
194 |     assert isinstance(table, pyarrow.Table)
195 |     table.validate(full=True)
196 |     table_dict = table.to_pydict()
197 |     assert "geometry" in table_dict
198 |     assert len(table_dict["geometry"]) == len(valid_geojson_obj.features)
199 |     assert "name" in table_dict
200 | 
201 |     parquet_path = Path("test.parquet")
202 |     pyarrow.parquet.write_table(table, parquet_path)
203 |     assert parquet_path.exists()
204 |     gdf = gpd.read_parquet(parquet_path)
205 |     assert isinstance(gdf, gpd.GeoDataFrame)
206 |     assert gdf.crs.to_string() == "OGC:CRS84"
207 |     assert len(gdf) == len(valid_geojson_obj.features)
208 |     parquet_path.unlink()
209 | 
210 | 
211 | def test_bad_geojson_to_geoparquet(
212 |     valid_geojson_obj: FeatureCollection,
213 | ):
214 |     """Test the error handling of bad inputs."""
215 |     # test bad geo_metadata
216 |     with pytest.raises(ValueError):
217 |         geojson_to_geoparquet(
218 |             valid_geojson_obj,
219 |             geo_metadata={"NOT": "VALID"},
220 |         )
221 | 
222 |     # test bad column_schema
223 |     with pytest.raises(ValueError):
224 |         geojson_to_geoparquet(
225 |             valid_geojson_obj,
226 |             column_schema={"NOT": "VALID"},
227 |         )
228 | 
229 |     # cant have properties as a column with other columns
230 |     with pytest.raises(ValueError):
231 |         geojson_to_geoparquet(
232 |             valid_geojson_obj,
233 |             column_schema=pyarrow.schema(
234 |                 [
235 |                     ("geometry", pyarrow.binary()),
236 |                     ("properties", pyarrow.string()),
237 |                     ("name", pyarrow.string()),
238 |                 ]
239 |             ),
240 |         )
241 | 
242 | 
243 | def test_valid_geoparquet_to_geojson(
244 |     valid_geoparquet_file: Path,
245 | ):
246 |     """Test the conversion of a valid GeoParquet file to a valid GeoJSON object."""
247 |     # test defaults
248 |     geojson = geoparquet_to_geojson(valid_geoparquet_file)
249 |     assert isinstance(geojson, FeatureCollection)
250 |     assert len(geojson.features) == 7
251 |     for feature in geojson.features:
252 |         assert isinstance(feature, geojson_pydantic.features.Feature)
253 |         assert isinstance(
254 |             feature.geometry, geojson_pydantic.geometries._GeometryBase
255 |         ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase)
256 |         assert isinstance(feature.properties, dict)
257 |         assert isinstance(feature.bbox, tuple)
258 |         assert len(feature.bbox) == 4
259 | 
260 |     # test with max_chunk_size = 1
261 |     geojson = geoparquet_to_geojson(valid_geoparquet_file, max_chunksize=1)
262 |     assert isinstance(geojson, FeatureCollection)
263 |     assert len(geojson.features) == 7
264 |     for feature in geojson.features:
265 |         assert isinstance(feature, geojson_pydantic.features.Feature)
266 |         assert isinstance(
267 |             feature.geometry, geojson_pydantic.geometries._GeometryBase
268 |         ) or isinstance(feature.geometry, geojson_pydantic.base._GeoJsonBase)
269 |         assert isinstance(feature.properties, dict)
270 |         assert isinstance(feature.bbox, tuple)
271 |         assert len(feature.bbox) == 4
272 | 
273 | 
274 | def test_bad_geoparquet_to_geojson():
275 |     # first we start with a table missing geo
276 |     table = pyarrow.Table.from_pydict(
277 |         {
278 |             "geometry": [b"NOT_VALID_GEOMETRY"],
279 |             "properties": ["{}"],
280 |         }
281 |     )
282 |     parquet_path = Path("test.parquet")
283 |     pyarrow.parquet.write_table(table, parquet_path)
284 |     assert parquet_path.exists()
285 |     with pytest.raises(ValueError):
286 |         geoparquet_to_geojson(parquet_path)
287 |     parquet_path.unlink()
288 | 
289 |     # now we test bad inputs
290 |     with pytest.raises(ValueError):
291 |         geoparquet_to_geojson(-999)
292 |     with pytest.raises(ValueError):
293 |         geoparquet_to_geojson(table, primary_column="NOT_VALID_COLUMN")
294 | 


--------------------------------------------------------------------------------
/tests/test_data/valid.geojson:
--------------------------------------------------------------------------------
 1 | {
 2 |   "type": "FeatureCollection",
 3 |   "features": [
 4 |     {
 5 |       "type": "Feature",
 6 |       "geometry": {
 7 |         "type": "Point",
 8 |         "coordinates": [0, 0]
 9 |       },
10 |       "properties": {
11 |         "name": "Point Feature"
12 |       }
13 |     },
14 |     {
15 |       "type": "Feature",
16 |       "geometry": {
17 |         "type": "MultiPoint",
18 |         "coordinates": [[1, 1], [2, 2], [3, 3]]
19 |       },
20 |       "properties": {
21 |         "name": "MultiPoint Feature"
22 |       }
23 |     },
24 |     {
25 |       "type": "Feature",
26 |       "geometry": {
27 |         "type": "LineString",
28 |         "coordinates": [[4, 4], [5, 5], [6, 6]]
29 |       },
30 |       "properties": {
31 |         "name": "LineString Feature"
32 |       }
33 |     },
34 |     {
35 |       "type": "Feature",
36 |       "geometry": {
37 |         "type": "MultiLineString",
38 |         "coordinates": [[[7, 7], [8, 8]], [[9, 9], [10, 10]]]
39 |       },
40 |       "properties": {
41 |         "name": "MultiLineString Feature"
42 |       }
43 |     },
44 |     {
45 |       "type": "Feature",
46 |       "geometry": {
47 |         "type": "Polygon",
48 |         "coordinates": [[[11, 11], [12, 12], [13, 13], [11, 11]]]
49 |       },
50 |       "properties": {
51 |         "name": "Polygon Feature"
52 |       }
53 |     },
54 |     {
55 |       "type": "Feature",
56 |       "geometry": {
57 |         "type": "MultiPolygon",
58 |         "coordinates": [[[[14, 14], [15, 15], [16, 16], [14, 14]]], [[[17, 17], [18, 18], [19, 19], [17, 17]]]]
59 |       },
60 |       "properties": {
61 |         "name": "MultiPolygon Feature"
62 |       }
63 |     },
64 |     {
65 |       "type": "Feature",
66 |       "geometry": {
67 |         "type": "GeometryCollection",
68 |         "geometries": [
69 |           {
70 |             "type": "Point",
71 |             "coordinates": [20, 20]
72 |           },
73 |           {
74 |             "type": "LineString",
75 |             "coordinates": [[21, 21], [22, 22], [23, 23]]
76 |           },
77 |           {
78 |             "type": "Polygon",
79 |             "coordinates": [[[24, 24], [25, 25], [26, 26], [24, 24]]]
80 |           }
81 |         ]
82 |       },
83 |       "properties": {
84 |         "name": "GeometryCollection Feature"
85 |       }
86 |     }
87 |   ]
88 | }
89 | 


--------------------------------------------------------------------------------
/tests/test_data/valid_geojson.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xaviernogueira/geoparquet-pydantic/094df67663df6d9c8eb02579e903ecc77dd41cc7/tests/test_data/valid_geojson.parquet


--------------------------------------------------------------------------------
/tests/test_schemas.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from pyproj import CRS
  3 | from geoparquet_pydantic.schemas import (
  4 |     GeometryColumnMetadata,
  5 |     GeoParquetMetadata,
  6 | )
  7 | 
  8 | 
  9 | @pytest.fixture
 10 | def good_geo_column_metadata():
 11 |     return {
 12 |         "encoding": "WKB",
 13 |         "geometry_types": ["Point"],
 14 |         "crs": "OGC:CRS84",
 15 |         "edges": "planar",
 16 |         "bbox": [0, 0, 25, 25],
 17 |         "epoch": None,
 18 |         "orientation": "counterclockwise",
 19 |     }
 20 | 
 21 | 
 22 | def test_good_geo_column_metadata(good_geo_column_metadata):
 23 |     metadata = GeometryColumnMetadata(**good_geo_column_metadata)
 24 |     assert metadata.encoding == good_geo_column_metadata["encoding"]
 25 |     assert metadata.geometry_types == good_geo_column_metadata["geometry_types"]
 26 |     assert metadata.crs != good_geo_column_metadata["crs"]
 27 |     assert CRS.from_json(metadata.crs).to_string() == good_geo_column_metadata["crs"]
 28 |     assert metadata.edges == good_geo_column_metadata["edges"]
 29 |     assert metadata.bbox == good_geo_column_metadata["bbox"]
 30 |     assert metadata.epoch == None
 31 |     assert metadata.orientation == good_geo_column_metadata["orientation"]
 32 | 
 33 | 
 34 | def test_bad_geo_column_metadata(good_geo_column_metadata):
 35 |     """Test that the GeoColumnMetadata raises an error when given bad data."""
 36 | 
 37 |     # Test bad encoding
 38 |     bad_encoding = good_geo_column_metadata.copy()
 39 |     bad_encoding["encoding"] = "WKT"
 40 |     with pytest.raises(ValueError):
 41 |         GeometryColumnMetadata(**bad_encoding)
 42 | 
 43 |     # Test bad geometry types
 44 |     bad_geometry_types = good_geo_column_metadata.copy()
 45 |     bad_geometry_types["geometry_types"] = ["NOT_A_REAL_TIME"]
 46 |     with pytest.raises(ValueError):
 47 |         GeometryColumnMetadata(**bad_geometry_types)
 48 | 
 49 |     # Test bad CRS
 50 |     bad_crs = good_geo_column_metadata.copy()
 51 |     bad_crs["crs"] = "NOT_A_REAL_CRS"
 52 |     with pytest.raises(ValueError):
 53 |         GeometryColumnMetadata(**bad_crs)
 54 | 
 55 |     # Test bad edges
 56 |     bad_edges = good_geo_column_metadata.copy()
 57 |     bad_edges["edges"] = "NOT_A_REAL_EDGE"
 58 |     with pytest.raises(ValueError):
 59 |         GeometryColumnMetadata(**bad_edges)
 60 | 
 61 |     # Test bad bbox
 62 |     bad_bbox = good_geo_column_metadata.copy()
 63 |     bad_bbox["bbox"] = [0, 0, 25]
 64 |     with pytest.raises(ValueError):
 65 |         GeometryColumnMetadata(**bad_bbox)
 66 | 
 67 |     # Test bad epoch
 68 |     bad_epoch = good_geo_column_metadata.copy()
 69 |     bad_epoch["epoch"] = "NOT_A_REAL_EPOCH"
 70 |     with pytest.raises(ValueError):
 71 |         GeometryColumnMetadata(**bad_epoch)
 72 | 
 73 |     # Test bad orientation
 74 |     bad_orientation = good_geo_column_metadata.copy()
 75 |     bad_orientation["orientation"] = "NOT_A_REAL_ORIENTATION"
 76 |     with pytest.raises(ValueError):
 77 |         GeometryColumnMetadata(**bad_orientation)
 78 | 
 79 | 
 80 | def test_good_geoparquet(good_geo_column_metadata):
 81 | 
 82 |     # minimum inputs
 83 |     geo_parquet = GeoParquetMetadata(
 84 |         columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
 85 |     )
 86 |     assert geo_parquet.version == "1.1.0-dev"
 87 |     assert geo_parquet.primary_column == "geometry"
 88 |     assert isinstance(geo_parquet.columns, dict)
 89 |     assert len(geo_parquet.columns) == 1
 90 |     assert "geometry" in geo_parquet.columns
 91 |     assert isinstance(geo_parquet.columns["geometry"], GeometryColumnMetadata)
 92 | 
 93 |     # maximum inputs
 94 |     geo_parquet = GeoParquetMetadata(
 95 |         version="1.0.0",
 96 |         primary_column="geom",
 97 |         columns={"geom": GeometryColumnMetadata(**good_geo_column_metadata)},
 98 |     )
 99 |     assert geo_parquet.version == "1.0.0"
100 |     assert geo_parquet.primary_column == "geom"
101 |     assert isinstance(geo_parquet.columns, dict)
102 |     assert len(geo_parquet.columns) == 1
103 |     assert "geom" in geo_parquet.columns
104 |     assert isinstance(geo_parquet.columns["geom"], GeometryColumnMetadata)
105 | 
106 | 
107 | def test_bad_geoparquet(good_geo_column_metadata):
108 | 
109 |     # Test bad version
110 |     with pytest.raises(ValueError):
111 |         GeoParquetMetadata(
112 |             version=1.431243,
113 |             columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
114 |         )
115 | 
116 |     # Test bad primary_column
117 |     with pytest.raises(ValueError):
118 |         GeoParquetMetadata(
119 |             primary_column=1.431243,
120 |             columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
121 |         )
122 | 
123 |     # Test bad columns
124 |     with pytest.raises(ValueError):
125 |         GeoParquetMetadata(
126 |             columns={"geometry": "NOT_A_REAL_METADATA"},
127 |         )
128 |     with pytest.raises(ValueError):
129 |         GeoParquetMetadata(
130 |             columns="NOT_EVEN_A_DICT",
131 |         )
132 |     with pytest.raises(ValueError):
133 |         GeoParquetMetadata(
134 |             columns={"geometry": {"A_DICT": "BUT_NOT_VALID"}},
135 |         )
136 | 
137 |     # Test missing primary_column
138 |     with pytest.raises(ValueError):
139 |         GeoParquetMetadata(
140 |             primary_column="NOT_A_REAL_COLUMN",
141 |             columns={"geometry": GeometryColumnMetadata(**good_geo_column_metadata)},
142 |         )
143 | 


--------------------------------------------------------------------------------
/tests/test_validation.py:
--------------------------------------------------------------------------------
 1 | import pyarrow
 2 | import pytest
 3 | from pathlib import Path
 4 | from geoparquet_pydantic.validate import (
 5 |     validate_geoparquet_table,
 6 |     validate_geoparquet_file,
 7 | )
 8 | 
 9 | 
10 | @pytest.fixture
11 | def no_geo_metadata_table() -> pyarrow.Table:
12 |     return pyarrow.Table.from_pydict(
13 |         {
14 |             "geometry": [None],
15 |             "id": [1],
16 |         },
17 |         metadata={"NOTGEO": "metadata"},
18 |     )
19 | 
20 | 
21 | @pytest.fixture
22 | def bad_geo_metadata_table() -> pyarrow.Table:
23 |     return pyarrow.Table.from_pydict(
24 |         {
25 |             "geometry": [None],
26 |             "id": [1],
27 |         },
28 |         metadata={
29 |             b"geo": b"{'version': '1.1.0-dev', 'primary_column': 'geometry', 'columns': {'geometry': 'not-a-geometry'}}"
30 |         },
31 |     )
32 | 
33 | 
34 | def test_valididate_geoparquet_table(valid_geoparquet_table):
35 |     """Test the validation of a valid GeoParquet table."""
36 |     assert validate_geoparquet_table(valid_geoparquet_table)
37 | 
38 | 
39 | def test_invalid_geoparquet_table(no_geo_metadata_table, bad_geo_metadata_table):
40 |     """Test the validation of an invalid GeoParquet table."""
41 |     assert validate_geoparquet_table(no_geo_metadata_table) == False
42 |     assert validate_geoparquet_table(bad_geo_metadata_table) == False
43 | 
44 | 
45 | def test_valid_geoparquet_file(valid_geoparquet_file: Path):
46 |     """Test the validation of a valid GeoParquet file."""
47 |     assert validate_geoparquet_file(valid_geoparquet_file)
48 |     assert validate_geoparquet_file(str(valid_geoparquet_file))
49 |     assert validate_geoparquet_file(
50 |         pyarrow.parquet.ParquetFile(valid_geoparquet_file),
51 |     )
52 | 
53 | 
54 | def test_invalid_geoparquet_file(no_geo_metadata_table, bad_geo_metadata_table):
55 |     """Test the validation of an invalid GeoParquet file."""
56 |     pyarrow.parquet.write_table(no_geo_metadata_table, "test1.parquet")
57 |     assert validate_geoparquet_file("test1.parquet") == False
58 |     Path("test1.parquet").unlink()
59 | 
60 |     pyarrow.parquet.write_table(no_geo_metadata_table, "test2.parquet")
61 |     assert validate_geoparquet_file("test2.parquet") == False
62 |     Path("test2.parquet").unlink()
63 | 


--------------------------------------------------------------------------------