├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── docs ├── buffer.geojson ├── buffer.png ├── spatial_index.geojson ├── spatial_index.png ├── spatial_join.geojson └── spatial_join.png ├── meta.yaml ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── src ├── __init__.py └── h3_pyspark │ ├── __init__.py │ ├── indexing.py │ ├── traversal.py │ └── utils.py └── tests ├── __init__.py ├── test_core.py ├── test_coverage.py ├── test_indexing.py └── test_traversal.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: "3.10" 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install black flake8 pytest 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Lint with black 35 | run: | 36 | black -l 120 --check --diff src tests 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | .pytest_cache 4 | *.egg-info 5 | dist/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "isfunction", 4 | "uncompact", 5 | "conda", 6 | "geospatial", 7 | "codecov", 8 | "pytest" 9 | ] 10 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Kevin Schaich 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | H3 Logo 2 | 3 | # **h3-pyspark**: Uber's H3 Hexagonal Hierarchical Geospatial Indexing System in PySpark 4 | 5 | [![PyPI version](https://img.shields.io/pypi/v/h3-pyspark.svg)](https://pypi.org/project/h3-pyspark/) 6 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/h3-pyspark.svg)](https://anaconda.org/conda-forge/h3-pyspark) 7 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/kevinschaich/h3-pyspark/blob/master/LICENSE) 8 | [![Tests](https://github.com/kevinschaich/h3-pyspark/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/kevinschaich/h3-pyspark/actions/workflows/tests.yml) 9 | 10 | PySpark bindings for the [H3 core library](https://h3geo.org/). 11 | 12 | For available functions, please see the vanilla Python binding documentation at: 13 | 14 | - [uber.github.io/h3-py](https://uber.github.io/h3-py) 15 | 16 | ## Installation 17 | 18 | Via `PyPI`: 19 | 20 | ```bash 21 | pip install h3-pyspark 22 | ``` 23 | 24 | Via `conda-forge`: 25 | 26 | ```bash 27 | conda install -c conda-forge h3-pyspark 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```python 33 | >>> from pyspark.sql import SparkSession, functions as F 34 | >>> import h3_pyspark 35 | >>> 36 | >>> spark = SparkSession.builder.getOrCreate() 37 | >>> df = spark.createDataFrame([{'lat': 37.769377, 'lng': -122.388903, 'resolution': 9}]) 38 | >>> 39 | >>> df = df.withColumn('h3_9', h3_pyspark.geo_to_h3('lat', 'lng', 'resolution')) 40 | >>> df.show() 41 | 42 | +---------+-----------+----------+---------------+ 43 | | lat| lng|resolution| h3_9| 44 | +---------+-----------+----------+---------------+ 45 | |37.769377|-122.388903| 9|89283082e73ffff| 46 | +---------+-----------+----------+---------------+ 47 | ``` 48 | 49 | ## Extension Functions 50 | 51 | This package also provides extension functions for common geospatial operations which are not covered by the vanilla H3 library. 52 | 53 | ### Assumptions 54 | 55 | * You use GeoJSON to represent geometries in your PySpark pipeline (as opposed to WKT) 56 | * Geometries are stored in a GeoJSON `string` within a column (such as `geometry`) in your PySpark dataset 57 | * Individual H3 cells are stored as a `string` column (such as `h3_9`) 58 | * Sets of H3 cells are stored in an `array(string)` column (such as `h3_9`) 59 | 60 | ### Indexing 61 | 62 | #### `index_shape(geometry: Column, resolution: Column)` 63 | 64 | Generate an H3 spatial index for an input GeoJSON geometry column. 65 | 66 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon` 67 | input features, and returns the set of H3 cells at the specified resolution which completely cover them 68 | (possibly more than one cell for a sufficiently large geometry or sufficiently granular resolution).
69 | 70 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 71 | 72 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation. 73 | 74 | ```python 75 | >>> from pyspark.sql import SparkSession, functions as F 76 | >>> from h3_pyspark.indexing import index_shape 77 | >>> spark = SparkSession.builder.getOrCreate() 78 | >>> 79 | >>> df = spark.createDataFrame([{ 80 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }', 81 | 82 | 'resolution': 9 83 | }]) 84 | >>> 85 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution')) 86 | >>> df.show() 87 | +----------------------+----------+------------------------------------+ 88 | | geometry|resolution| h3_9| 89 | +----------------------+----------+------------------------------------+ 90 | | { "type": "MultiP... | 9| [8944d551077ffff, 8944d551073ffff] | 91 | +----------------------+----------+------------------------------------+ 92 | ``` 93 | 94 | Optionally, add another column `h3_9_geometry` for the GeoJSON representation of each cell in the `h3_9` column [to easily map the result alongside your original input geometry](docs/spatial_index.geojson): 95 | 96 | ```python 97 | >>> df = df.withColumn('h3_9_geometry', h3_pyspark.h3_set_to_multi_polygon(F.col('h3_9'), F.lit(True))) 98 | ``` 99 | 100 | [View Live Map on GitHub](docs/spatial_index.geojson) 101 | 102 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/spatial_index.png)](docs/spatial_index.geojson) 103 | 104 | ### Buffers 105 | 106 | #### `k_ring_distinct(cells: Column, distance: Column)` 107 | 108 | Takes an array of input cells, performs a k-ring operation on each cell, and returns the distinct set of output cells. 109 | 110 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
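To pick a `distance` value for a target buffer radius, you can work backwards from the cell edge length at your chosen resolution, since each k-ring step grows coverage by roughly one cell diameter. A minimal sketch, assuming h3-py's `edge_length` helper and a hypothetical 500-meter buffer:

```python
import math
import h3

resolution = 9
buffer_meters = 500  # hypothetical target buffer radius

# Each k-ring step extends coverage by roughly one cell diameter (2 * edge length),
# so divide the desired radius by the diameter and round up
diameter_meters = 2 * h3.edge_length(resolution, unit="m")
k = math.ceil(buffer_meters / diameter_meters)
```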
111 | 112 | Since [we know the edge length & diameter (`2 * edge length`) of each H3 cell resolution](https://h3geo.org/docs/core-library/restable), we can use this to efficiently generate a "buffered" index of our input geometry (useful for operations such as distance joins): 113 | 114 | ```python 115 | >>> from pyspark.sql import SparkSession, functions as F 116 | >>> from h3_pyspark.indexing import index_shape 117 | >>> from h3_pyspark.traversal import k_ring_distinct 118 | >>> spark = SparkSession.builder.getOrCreate() 119 | >>> 120 | >>> df = spark.createDataFrame([{ 121 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }', 122 | 123 | 'resolution': 9 124 | }]) 125 | >>> 126 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution')) 127 | >>> df = df.withColumn('h3_9_buffer', k_ring_distinct('h3_9', 1)) 128 | >>> df.show() 129 | +--------------------+----------+--------------------+--------------------+ 130 | | geometry|resolution| h3_9| h3_9_buffer| 131 | +--------------------+----------+--------------------+--------------------+ 132 | |{ "type": "MultiP...| 9|[8944d551077ffff,...|[8944d551073ffff,...| 133 | +--------------------+----------+--------------------+--------------------+ 134 | ``` 135 | 136 | [View Live Map on GitHub](docs/buffer.geojson) 137 | 138 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/buffer.png)](docs/buffer.geojson) 139 | 140 | ### Spatial Joins 141 | 142 | Once we have an indexed version of our geometries, we can easily join on the string column in H3 to get a set of pair candidates: 143 | 144 | ```python 145 | >>> from pyspark.sql import SparkSession, functions as F 146 | >>> from h3_pyspark.indexing import index_shape 147 | >>> spark = SparkSession.builder.getOrCreate() 148 | >>> 149 | >>> left = spark.createDataFrame([{ 150 | 'left_id': 'left_point', 151 | 'left_geometry': '{ "type": "Point", "coordinates": [ -80.79527020454407, 32.132884966083935 ] }', 152 | }]) 153 | >>> right = spark.createDataFrame([{ 154 | 'right_id': 'right_polygon', 155 | 'right_geometry': '{ "type": "Polygon", "coordinates": [ [ [ -80.80022692680359, 32.12864200501338 ], [ -80.79224467277527, 32.12864200501338 ], [ -80.79224467277527, 32.13378441213715 ], [ -80.80022692680359, 32.13378441213715 ], [ -80.80022692680359, 32.12864200501338 ] ] ] }', 156 | }]) 157 | >>> 158 | >>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9))) 159 | >>> right = right.withColumn('h3_9', index_shape('right_geometry', F.lit(9))) 160 | >>> 161 | >>> left = left.withColumn('h3_9', F.explode('h3_9')) 162 | >>> right = right.withColumn('h3_9', F.explode('h3_9')) 163 | >>> 164 | >>> joined = left.join(right, on='h3_9', how='inner') 165 | >>> joined.show() 166 | +---------------+--------------------+----------+--------------------+-------------+ 167 | | h3_9| left_geometry| left_id| right_geometry| right_id| 168 | +---------------+--------------------+----------+--------------------+-------------+ 169 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon| 170 | 
+---------------+--------------------+----------+--------------------+-------------+ 171 | ``` 172 | 173 | You can combine this technique with a [Buffer](#buffers) to do a **Distance Join**. 174 | 175 |
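A minimal sketch of that combination, starting again from the raw `left` and `right` DataFrames above (the 1-ring buffer is a hypothetical choice – derive the ring count from your join radius as described in [Buffers](#buffers)):

```python
>>> from h3_pyspark.traversal import k_ring_distinct
>>>
>>> # Index both sides, but buffer the left side by one ring before exploding
>>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9)))
>>> left = left.withColumn('h3_9', F.explode(k_ring_distinct('h3_9', F.lit(1))))
>>> right = right.withColumn('h3_9', F.explode(index_shape('right_geometry', F.lit(9))))
>>>
>>> # A pair can share several cells, so de-duplicate the candidates
>>> candidates = left.join(right, on='h3_9', how='inner').dropDuplicates(['left_id', 'right_id'])
```

An exact check – for example the Shapely-based `distance` UDF defined below, filtered against your join radius instead of `0` – then trims the candidates to the true distance threshold.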
176 | 177 | > **⚠️ Warning ⚠️:** The output of an H3 join is *approximate* – two geometries can share an H3 cell without actually intersecting, so all resulting geometry pairs should be treated as *intersection candidates* rather than *confirmed intersections*. Following the join with an exact `distance` calculation (where `distance = 0` means the geometries intersect) or an `intersects` check makes the result exact. [Shapely](https://shapely.readthedocs.io) is a popular library with a well-documented [`distance`](https://shapely.readthedocs.io/en/stable/manual.html#object.distance) function which can easily be wrapped in a UDF: 178 | 179 |
180 | 181 | ```python 182 | from pyspark.sql import functions as F, types as T 183 | from shapely import geometry 184 | import json 185 | 186 | @F.udf(T.DoubleType()) 187 | def distance(geometry1, geometry2): 188 | geometry1 = json.loads(geometry1) 189 | geometry1 = geometry.shape(geometry1) 190 | geometry2 = json.loads(geometry2) 191 | geometry2 = geometry.shape(geometry2) 192 | return geometry1.distance(geometry2) 193 | ``` 194 | 195 | After a spatial join (detailed above), you can filter to only directly intersecting geometries: 196 | 197 | ```python 198 | >>> joined = joined.withColumn('distance', distance(F.col('left_geometry'), F.col('right_geometry'))) 199 | >>> joined = joined.filter(F.col('distance') == 0) 200 | >>> joined.show() 201 | +---------------+--------------------+----------+--------------------+-------------+--------+ 202 | | h3_9| left_geometry| left_id| right_geometry| right_id|distance| 203 | +---------------+--------------------+----------+--------------------+-------------+--------+ 204 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon| 0.0| 205 | +---------------+--------------------+----------+--------------------+-------------+--------+ 206 | ``` 207 | 208 | [View Live Map on GitHub](docs/spatial_join.geojson) 209 | 210 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/spatial_join.png)](docs/spatial_join.geojson) 211 | 212 | ## Publishing New Versions 213 | 214 | 1. Bump version in [`setup.cfg`](./setup.cfg) 215 | 2. Publish to `PyPi` 216 | 217 | git clean -fdx 218 | python3 -m build 219 | python3 -m twine upload --repository pypi dist/* 220 | 221 | 3. Create a new tag & release w/ version `x.x.x` and name `h3-pyspark-x.x.x` in GitHub 222 | 4. Publish to `conda-forge`: 223 | * Bump version & new tag's `sha256` hash in [`meta.yml`](https://github.com/conda-forge/h3-pyspark-feedstock/blob/master/recipe/meta.yaml) in [`@conda-forge/h3-pyspark-feedstock`](https://github.com/conda-forge/h3-pyspark-feedstock) 224 | openssl sha256 /path/to/h3-pyspark-x.x.x.tar.gz 225 | -------------------------------------------------------------------------------- /docs/buffer.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "MultiPolygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | [ 13 | -80.79442262649536, 14 | 32.13522895845023 15 | ], 16 | [ 17 | -80.79298496246338, 18 | 32.13522895845023 19 | ], 20 | [ 21 | -80.79298496246338, 22 | 32.13602844594619 23 | ], 24 | [ 25 | -80.79442262649536, 26 | 32.13602844594619 27 | ], 28 | [ 29 | -80.79442262649536, 30 | 32.13522895845023 31 | ] 32 | ] 33 | ], 34 | [ 35 | [ 36 | [ 37 | -80.7923412322998, 38 | 32.1330848437511 39 | ], 40 | [ 41 | -80.79073190689087, 42 | 32.1330848437511 43 | ], 44 | [ 45 | -80.79073190689087, 46 | 32.13375715632646 47 | ], 48 | [ 49 | -80.7923412322998, 50 | 32.13375715632646 51 | ], 52 | [ 53 | -80.7923412322998, 54 | 32.1330848437511 55 | ] 56 | ] 57 | ] 58 | ] 59 | } 60 | }, 61 | { 62 | "type": "Feature", 63 | "properties": {}, 64 | "geometry": { 65 | "type": "MultiPolygon", 66 | "coordinates": [ 67 | [ 68 | [ 69 | [ 70 | -80.78724600624852, 71 | 32.130476831471 72 | ], 73 | [ 74 | -80.78773022640085, 75 | 32.132228008570145 76 | ], 77 | [ 78 | -80.78635566417105, 79 | 32.1334385288524 80 | ], 81 | [ 82 | -80.78683987723358, 83 | 32.13518969274837 84 | ], 85 | [ 86 | 
-80.78869872440329, 87 | 32.13573036419037 88 | ], 89 | [ 90 | -80.78918300225553, 91 | 32.13748154270417 92 | ], 93 | [ 94 | -80.79104196132435, 95 | 32.13802221401248 96 | ], 97 | [ 98 | -80.79152630397307, 99 | 32.139773407134115 100 | ], 101 | [ 102 | -80.79338537494641, 103 | 32.14031407829878 104 | ], 105 | [ 106 | -80.79476007848629, 107 | 32.13910351377108 108 | ], 109 | [ 110 | -80.79661923657869, 111 | 32.139644142214856 112 | ], 113 | [ 114 | -80.79799395534437, 115 | 32.13843350714515 116 | ], 117 | [ 118 | -80.79750946892102, 119 | 32.1366822583843 120 | ], 121 | [ 122 | -80.79888415581274, 123 | 32.13547156752768 124 | ], 125 | [ 126 | -80.7983996623107, 127 | 32.13372030555331 128 | ], 129 | [ 130 | -80.79654055380145, 131 | 32.133179762250634 132 | ], 133 | [ 134 | -80.79605612510308, 135 | 32.13142851488513 136 | ], 137 | [ 138 | -80.79419712848552, 139 | 32.13088797143808 140 | ], 141 | [ 142 | -80.79371276458406, 143 | 32.12913673869145 144 | ], 145 | [ 146 | -80.79185387985282, 147 | 32.128596195109935 148 | ], 149 | [ 150 | -80.79047933424329, 151 | 32.12980684169767 152 | ], 153 | [ 154 | -80.78862053661935, 155 | 32.12926625541086 156 | ], 157 | [ 158 | -80.78724600624852, 159 | 32.130476831471 160 | ] 161 | ] 162 | ] 163 | ] 164 | } 165 | } 166 | ] 167 | } -------------------------------------------------------------------------------- /docs/buffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/buffer.png -------------------------------------------------------------------------------- /docs/spatial_index.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "MultiPolygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | [ 13 | -80.79442262649536, 14 | 32.13522895845023 15 | ], 16 | [ 17 | -80.79298496246338, 18 | 32.13522895845023 19 | ], 20 | [ 21 | -80.79298496246338, 22 | 32.13602844594619 23 | ], 24 | [ 25 | -80.79442262649536, 26 | 32.13602844594619 27 | ], 28 | [ 29 | -80.79442262649536, 30 | 32.13522895845023 31 | ] 32 | ] 33 | ], 34 | [ 35 | [ 36 | [ 37 | -80.7923412322998, 38 | 32.1330848437511 39 | ], 40 | [ 41 | -80.79073190689087, 42 | 32.1330848437511 43 | ], 44 | [ 45 | -80.79073190689087, 46 | 32.13375715632646 47 | ], 48 | [ 49 | -80.7923412322998, 50 | 32.13375715632646 51 | ], 52 | [ 53 | -80.7923412322998, 54 | 32.1330848437511 55 | ] 56 | ] 57 | ] 58 | ] 59 | } 60 | }, 61 | { 62 | "type": "Feature", 63 | "properties": {}, 64 | "geometry": { 65 | "type": "MultiPolygon", 66 | "coordinates": [ 67 | [ 68 | [ 69 | [ 70 | -80.791932268028, 71 | 32.135060457894376 72 | ], 73 | [ 74 | -80.79241661776229, 75 | 32.13681166423319 76 | ], 77 | [ 78 | -80.79427566395086, 79 | 32.13735229282743 80 | ], 81 | [ 82 | -80.79565033561992, 83 | 32.13614167251003 84 | ], 85 | [ 86 | -80.79516591400238, 87 | 32.13439043835102 88 | ], 89 | [ 90 | -80.79330689259906, 91 | 32.13384985232931 92 | ], 93 | [ 94 | -80.79282253578053, 95 | 32.132098632782174 96 | ], 97 | [ 98 | -80.79096362626957, 99 | 32.131558046622956 100 | ], 101 | [ 102 | -80.78958904879794, 103 | 32.132768637435596 104 | ], 105 | [ 106 | -80.79007333373787, 107 | 32.13451982915956 108 | ], 109 | [ 110 | -80.791932268028, 111 | 32.135060457894376 112 | ] 113 | ] 114 | ] 115 | ] 116 | } 117 | } 118 | ] 
119 | } -------------------------------------------------------------------------------- /docs/spatial_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_index.png -------------------------------------------------------------------------------- /docs/spatial_join.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "Polygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | -80.79419712848552, 13 | 32.13088797143808 14 | ], 15 | [ 16 | -80.79282253578053, 17 | 32.132098632782174 18 | ], 19 | [ 20 | -80.79330689259906, 21 | 32.13384985232931 22 | ], 23 | [ 24 | -80.79516591400238, 25 | 32.13439043835102 26 | ], 27 | [ 28 | -80.79654055380145, 29 | 32.133179762250634 30 | ], 31 | [ 32 | -80.79605612510308, 33 | 32.13142851488513 34 | ], 35 | [ 36 | -80.79419712848552, 37 | 32.13088797143808 38 | ] 39 | ] 40 | ] 41 | } 42 | }, 43 | { 44 | "type": "Feature", 45 | "properties": {}, 46 | "geometry": { 47 | "type": "Polygon", 48 | "coordinates": [ 49 | [ 50 | [ 51 | -80.80022692680359, 52 | 32.12864200501338 53 | ], 54 | [ 55 | -80.79224467277527, 56 | 32.12864200501338 57 | ], 58 | [ 59 | -80.79224467277527, 60 | 32.13378441213715 61 | ], 62 | [ 63 | -80.80022692680359, 64 | 32.13378441213715 65 | ], 66 | [ 67 | -80.80022692680359, 68 | 32.12864200501338 69 | ] 70 | ] 71 | ] 72 | } 73 | }, 74 | { 75 | "type": "Feature", 76 | "properties": {}, 77 | "geometry": { 78 | "type": "Point", 79 | "coordinates": [ 80 | -80.79527020454407, 81 | 32.132884966083935 82 | ] 83 | } 84 | } 85 | ] 86 | } -------------------------------------------------------------------------------- /docs/spatial_join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_join.png -------------------------------------------------------------------------------- /meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "1.2.2" %} 2 | 3 | package: 4 | name: h3-pyspark 5 | version: {{ version }} 6 | 7 | source: 8 | url: https://github.com/kevinschaich/h3-pyspark/archive/refs/tags/{{ version }}.tar.gz 9 | sha256: 64c39a66664676ce799dbfb5cbd49d9a9d76926e5495dcc8ea580fb03b4b46fb 10 | 11 | build: 12 | noarch: python 13 | number: 0 14 | script: {{ PYTHON }} -m pip install . 
-vv 14 | 15 | 16 | requirements: 17 | build: 18 | - pytest 19 | - black 20 | host: 21 | - pip 22 | - python 23 | run: 24 | - python 25 | - pyspark 26 | - h3-py 27 | - shapely 28 | 29 | test: 30 | imports: 31 | - h3_pyspark 32 | 33 | about: 34 | home: https://github.com/kevinschaich/h3-pyspark 35 | summary: PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system 36 | license: Apache-2.0 37 | license_family: Apache 38 | license_file: LICENSE 39 | dev_url: https://github.com/kevinschaich/h3-pyspark 40 | doc_url: https://github.com/kevinschaich/h3-pyspark 41 | 42 | extra: 43 | recipe-maintainers: 44 | - kevinschaich 45 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark 2 | h3 3 | Shapely 4 | pytest 5 | black -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = h3-pyspark 3 | version = 1.2.6 4 | author = Kevin Schaich 5 | author_email = schaich.kevin@gmail.com 6 | description = PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/kevinschaich/h3-pyspark 10 | project_urls = 11 | Bug Tracker = https://github.com/kevinschaich/h3-pyspark/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: Apache Software License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/src/__init__.py -------------------------------------------------------------------------------- /src/h3_pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | import h3 2 | from pyspark.sql import functions as F, types as T 3 | import json 4 | from inspect import getmembers, isfunction 5 | from .utils import sanitize_types, handle_nulls 6 | import sys 7 | from shapely import geometry 8 | 9 | 10 | ############################################################################### 11 | # Indexing 12 | ############################################################################### 13 | 14 | 15 | @F.udf(returnType=T.StringType()) 16 | @handle_nulls 17 | def geo_to_h3(lat, lng, resolution): 18 | return sanitize_types(h3.geo_to_h3(lat, lng, resolution)) 19 | 20 | 21 | @F.udf(returnType=T.ArrayType(T.DoubleType())) 22 | @handle_nulls 23 | def h3_to_geo(h): 24 | return sanitize_types(h3.h3_to_geo(h)) 25 | 26 | 27 | @F.udf(returnType=T.StringType()) 28 | @handle_nulls 29 | def h3_to_geo_boundary(h, geo_json): 30 | # NOTE: this behavior differs from default 31 | # h3-pyspark return type will be a valid GeoJSON
string if geo_json is set to True 32 | coordinates = h3.h3_to_geo_boundary(h, geo_json) 33 | if geo_json: 34 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates})) 35 | return sanitize_types(coordinates) 36 | 37 | 38 | ############################################################################### 39 | # Inspection 40 | ############################################################################### 41 | 42 | 43 | @F.udf(returnType=T.IntegerType()) 44 | @handle_nulls 45 | def h3_get_resolution(h): 46 | return sanitize_types(h3.h3_get_resolution(h)) 47 | 48 | 49 | @F.udf(returnType=T.IntegerType()) 50 | @handle_nulls 51 | def h3_get_base_cell(h): 52 | return sanitize_types(h3.h3_get_base_cell(h)) 53 | 54 | 55 | @F.udf(returnType=T.LongType()) 56 | @handle_nulls 57 | def string_to_h3(h): 58 | return sanitize_types(h3.string_to_h3(h)) 59 | 60 | 61 | @F.udf(returnType=T.StringType()) 62 | @handle_nulls 63 | def h3_to_string(h): 64 | return sanitize_types(h3.h3_to_string(h)) 65 | 66 | 67 | @F.udf(returnType=T.BooleanType()) 68 | @handle_nulls 69 | def h3_is_valid(h): 70 | return sanitize_types(h3.h3_is_valid(h)) 71 | 72 | 73 | @F.udf(returnType=T.BooleanType()) 74 | @handle_nulls 75 | def h3_is_res_class_III(h): 76 | return sanitize_types(h3.h3_is_res_class_III(h)) 77 | 78 | 79 | @F.udf(returnType=T.BooleanType()) 80 | @handle_nulls 81 | def h3_is_pentagon(h): 82 | return sanitize_types(h3.h3_is_pentagon(h)) 83 | 84 | 85 | @F.udf(returnType=T.ArrayType(T.IntegerType())) 86 | @handle_nulls 87 | def h3_get_faces(h): 88 | return sanitize_types(h3.h3_get_faces(h)) 89 | 90 | 91 | ############################################################################### 92 | # Traversal 93 | ############################################################################### 94 | 95 | 96 | @F.udf(returnType=T.ArrayType(T.StringType())) 97 | @handle_nulls 98 | def k_ring(origin, k): 99 | return sanitize_types(h3.k_ring(origin, k)) 100 | 101 | 102 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType()))) 103 | @handle_nulls 104 | def k_ring_distances(origin, k): 105 | return sanitize_types(h3.k_ring_distances(origin, k)) 106 | 107 | 108 | @F.udf(returnType=T.ArrayType(T.StringType())) 109 | @handle_nulls 110 | def hex_range(h, k): 111 | return sanitize_types(h3.hex_range(h, k)) 112 | 113 | 114 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType()))) 115 | @handle_nulls 116 | def hex_range_distances(h, k): 117 | return sanitize_types(h3.hex_range_distances(h, k)) 118 | 119 | 120 | @F.udf(returnType=T.MapType(T.StringType(), T.ArrayType(T.ArrayType(T.StringType())))) 121 | @handle_nulls 122 | def hex_ranges(h, k): 123 | return sanitize_types(h3.hex_ranges(h, k)) 124 | 125 | 126 | @F.udf(returnType=T.ArrayType(T.StringType())) 127 | @handle_nulls 128 | def hex_ring(h, k): 129 | return sanitize_types(h3.hex_ring(h, k)) 130 | 131 | 132 | @F.udf(returnType=T.ArrayType(T.StringType())) 133 | @handle_nulls 134 | def h3_line(start, end): 135 | return sanitize_types(h3.h3_line(start, end)) 136 | 137 | 138 | @F.udf(returnType=T.IntegerType()) 139 | @handle_nulls 140 | def h3_distance(h1, h2): 141 | return sanitize_types(h3.h3_distance(h1, h2)) 142 | 143 | 144 | @F.udf(returnType=T.ArrayType(T.IntegerType())) 145 | @handle_nulls 146 | def experimental_h3_to_local_ij(origin, h): 147 | return sanitize_types(h3.experimental_h3_to_local_ij(origin, h)) 148 | 149 | 150 | @F.udf(returnType=T.StringType()) 151 | @handle_nulls 152 | def experimental_local_ij_to_h3(origin, i, j): 153 | return 
sanitize_types(h3.experimental_local_ij_to_h3(origin, i, j)) 154 | 155 | 156 | ############################################################################### 157 | # Hierarchy 158 | ############################################################################### 159 | 160 | 161 | @F.udf(returnType=T.StringType()) 162 | @handle_nulls 163 | def h3_to_parent(h, parent_res): 164 | return sanitize_types(h3.h3_to_parent(h, parent_res)) 165 | 166 | 167 | @F.udf(returnType=T.ArrayType(T.StringType())) 168 | @handle_nulls 169 | def h3_to_children(h, child_res): 170 | return sanitize_types(h3.h3_to_children(h, child_res)) 171 | 172 | 173 | @F.udf(returnType=T.StringType()) 174 | @handle_nulls 175 | def h3_to_center_child(h, child_res): 176 | return sanitize_types(h3.h3_to_center_child(h, child_res)) 177 | 178 | 179 | @F.udf(returnType=T.ArrayType(T.StringType())) 180 | @handle_nulls 181 | def compact(hexes): 182 | return sanitize_types(h3.compact(hexes)) 183 | 184 | 185 | @F.udf(returnType=T.ArrayType(T.StringType())) 186 | @handle_nulls 187 | def uncompact(hexes, res): 188 | return sanitize_types(h3.uncompact(hexes, res)) 189 | 190 | 191 | ############################################################################### 192 | # Regions 193 | ############################################################################### 194 | 195 | 196 | @F.udf(returnType=T.ArrayType(T.StringType())) 197 | @handle_nulls 198 | def polyfill(polygons, res, geo_json_conformant): 199 | # NOTE: this behavior differs from default 200 | # h3-pyspark expect `polygons` argument to be a valid GeoJSON string 201 | polygons = json.loads(polygons) 202 | return sanitize_types(h3.polyfill(polygons, res, geo_json_conformant)) 203 | 204 | 205 | @F.udf(returnType=T.StringType()) 206 | @handle_nulls 207 | def h3_set_to_multi_polygon(hexes, geo_json): 208 | # NOTE: this behavior differs from default 209 | # h3-pyspark return type will be a valid GeoJSON string if geo_json is set to True 210 | coordinates = h3.h3_set_to_multi_polygon(hexes, geo_json) 211 | if geo_json: 212 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates})) 213 | return sanitize_types(coordinates) 214 | 215 | 216 | ############################################################################### 217 | # Unidirectional Edges 218 | ############################################################################### 219 | 220 | 221 | @F.udf(returnType=T.BooleanType()) 222 | @handle_nulls 223 | def h3_indexes_are_neighbors(origin, destination): 224 | return sanitize_types(h3.h3_indexes_are_neighbors(origin, destination)) 225 | 226 | 227 | @F.udf(returnType=T.StringType()) 228 | @handle_nulls 229 | def get_h3_unidirectional_edge(origin, destination): 230 | return sanitize_types(h3.get_h3_unidirectional_edge(origin, destination)) 231 | 232 | 233 | @F.udf(returnType=T.BooleanType()) 234 | @handle_nulls 235 | def h3_unidirectional_edge_is_valid(edge): 236 | return sanitize_types(h3.h3_unidirectional_edge_is_valid(edge)) 237 | 238 | 239 | @F.udf(returnType=T.StringType()) 240 | @handle_nulls 241 | def get_origin_h3_index_from_unidirectional_edge(edge): 242 | return sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(edge)) 243 | 244 | 245 | @F.udf(returnType=T.StringType()) 246 | @handle_nulls 247 | def get_destination_h3_index_from_unidirectional_edge(edge): 248 | return sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(edge)) 249 | 250 | 251 | @F.udf(returnType=T.ArrayType(T.StringType())) 252 | @handle_nulls 253 | def 
get_h3_indexes_from_unidirectional_edge(edge): 254 | return sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(edge)) 255 | 256 | 257 | @F.udf(returnType=T.ArrayType(T.StringType())) 258 | @handle_nulls 259 | def get_h3_unidirectional_edges_from_hexagon(h): 260 | return sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(h)) 261 | 262 | 263 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.DoubleType()))) 264 | @handle_nulls 265 | def get_h3_unidirectional_edge_boundary(h, geo_json): 266 | return sanitize_types(h3.get_h3_unidirectional_edge_boundary(h, geo_json)) 267 | 268 | 269 | ############################################################################### 270 | # Miscellaneous 271 | ############################################################################### 272 | 273 | 274 | @F.udf(returnType=T.DoubleType()) 275 | @handle_nulls 276 | def hex_area(res, unit): 277 | return sanitize_types(h3.hex_area(res, unit)) 278 | 279 | 280 | @F.udf(returnType=T.DoubleType()) 281 | @handle_nulls 282 | def cell_area(h, unit): 283 | return sanitize_types(h3.cell_area(h, unit)) 284 | 285 | 286 | @F.udf(returnType=T.DoubleType()) 287 | @handle_nulls 288 | def edge_length(res, unit): 289 | return sanitize_types(h3.edge_length(res, unit)) 290 | 291 | 292 | @F.udf(returnType=T.DoubleType()) 293 | @handle_nulls 294 | def exact_edge_length(res, unit): 295 | return sanitize_types(h3.exact_edge_length(res, unit)) 296 | 297 | 298 | @F.udf(returnType=T.IntegerType()) 299 | @handle_nulls 300 | def num_hexagons(res): 301 | return sanitize_types(h3.num_hexagons(res)) 302 | 303 | 304 | @F.udf(returnType=T.ArrayType(T.StringType())) 305 | @handle_nulls 306 | def get_res0_indexes(): 307 | return sanitize_types(h3.get_res0_indexes()) 308 | 309 | 310 | @F.udf(returnType=T.ArrayType(T.StringType())) 311 | @handle_nulls 312 | def get_pentagon_indexes(res): 313 | return sanitize_types(h3.get_pentagon_indexes(res)) 314 | 315 | 316 | @F.udf(returnType=T.DoubleType()) 317 | @handle_nulls 318 | def point_dist(point1, point2, unit): 319 | return sanitize_types(h3.point_dist(point1, point2, unit)) 320 | 321 | 322 | # Steal docstrings from h3-py native bindings if they exist 323 | for f in [f[1] for f in getmembers(sys.modules[__name__], isfunction)]: 324 | try: 325 | h3_f = getattr(h3, f.__name__) 326 | f.__doc__ = h3_f.__doc__ 327 | except Exception: 328 | f.__doc__ = f.__doc__ 329 | -------------------------------------------------------------------------------- /src/h3_pyspark/indexing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import h3 4 | from pyspark.sql.column import Column 5 | from shapely import geometry 6 | from shapely.geometry import ( 7 | Point, 8 | MultiPoint, 9 | LineString, 10 | MultiLineString, 11 | Polygon, 12 | MultiPolygon, 13 | ) 14 | from pyspark.sql import functions as F, types as T 15 | from .utils import flatten, densify, handle_nulls 16 | 17 | 18 | def _index_point_object(point: Point, resolution: int): 19 | """ 20 | Generate H3 spatial index for input point geometry. 21 | 22 | Returns the set of H3 cells at the specified resolution which completely cover the input point. 23 | """ 24 | result_set = set() 25 | 26 | # Hexes for point 27 | result_set.update(h3.geo_to_h3(t[1], t[0], resolution) for t in list(point.coords)) 28 | return result_set 29 | 30 | 31 | def _index_line_object(line: LineString, resolution: int): 32 | """ 33 | Generate H3 spatial index for input line geometry. 
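The line is first densified so that consecutive points are spaced no farther apart than the shortest edge of its endpoint cells (so no cell along the line is skipped), and any 1-ring neighbors of the resulting cells which still intersect the line are also included.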
34 | 35 | Returns the set of H3 cells at the specified resolution which completely cover the input line. 36 | """ 37 | result_set = set() 38 | 39 | # Hexes for vertices 40 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(line.coords)] 41 | result_set.update(vertex_hexes) 42 | 43 | # Figure out the max-length line segment (step) we can process without interpolating 44 | # https://github.com/kevinschaich/h3-pyspark/issues/8 45 | endpoint_hex_edges = flatten( 46 | [h3.get_h3_unidirectional_edges_from_hexagon(h) for h in [vertex_hexes[0], vertex_hexes[1]]] 47 | ) 48 | step = math.degrees(min([h3.exact_edge_length(e, unit="rads") for e in endpoint_hex_edges])) 49 | 50 | densified_line = densify(line, step) 51 | line_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(densified_line.coords)] 52 | result_set.update(line_hexes) 53 | 54 | neighboring_hexes = set(flatten([h3.k_ring(h, 1) for h in result_set])) - result_set 55 | intersecting_neighboring_hexes = filter( 56 | lambda h: Polygon(h3.h3_set_to_multi_polygon([h], True)[0][0]).distance(line) == 0, neighboring_hexes 57 | ) 58 | result_set.update(intersecting_neighboring_hexes) 59 | 60 | return result_set 61 | 62 | 63 | def _index_polygon_object(polygon: Polygon, resolution: int): 64 | """ 65 | Generate H3 spatial index for input polygon geometry. 66 | 67 | Returns the set of H3 cells at the specified resolution which completely cover the input polygon. 68 | """ 69 | result_set = set() 70 | # Hexes for vertices 71 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(polygon.exterior.coords)] 72 | result_set.update(vertex_hexes) 73 | 74 | # Hexes for edges 75 | edge_hexes = _index_shape_object(polygon.boundary, resolution) 76 | result_set.update(edge_hexes) 77 | 78 | # Hexes for internal area 79 | result_set.update(list(h3.polyfill(geometry.mapping(polygon), resolution, geo_json_conformant=True))) 80 | return result_set 81 | 82 | 83 | def _index_shape_object(shape: geometry, resolution: int): 84 | """ 85 | Generate H3 spatial index for input geometry. 86 | 87 | Returns the set of H3 cells at the specified resolution which completely cover the input shape. 88 | """ 89 | result_set = set() 90 | 91 | try: 92 | if isinstance(shape, Point): 93 | result_set.update(_index_point_object(shape, resolution)) 94 | 95 | elif isinstance(shape, LineString): 96 | result_set.update(_index_line_object(shape, resolution)) 97 | 98 | elif isinstance(shape, Polygon): 99 | result_set.update(_index_polygon_object(shape, resolution)) 100 | 101 | elif isinstance(shape, MultiPoint) or isinstance(shape, MultiLineString) or isinstance(shape, MultiPolygon): 102 | result_set.update(*[_index_shape_object(s, resolution) for s in shape.geoms]) 103 | else: 104 | raise ValueError(f"Unsupported geometry_type {shape.geom_type}") 105 | 106 | except Exception as e: 107 | raise ValueError( 108 | f"Error finding indices for geometry {json.dumps(geometry.mapping(shape))}", 109 | repr(e), 110 | ) 111 | 112 | return list(result_set) 113 | 114 | 115 | def _index_shape(shape: str, resolution: int): 116 | """ 117 | Generate H3 spatial index for input shape. 118 | 119 | Returns the set of H3 cells at the specified resolution which completely cover the input shape. 
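An illustrative example, reusing the point from the README usage section (GeoJSON coordinates are [lng, lat]):

    >>> _index_shape('{"type": "Point", "coordinates": [-122.388903, 37.769377]}', 9)
    ['89283082e73ffff']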
120 | """ 121 | shape = geometry.shape(json.loads(shape)) 122 | return _index_shape_object(shape, resolution) 123 | 124 | 125 | @F.udf(T.ArrayType(T.StringType())) 126 | @handle_nulls 127 | def index_shape(geometry: Column, resolution: Column): 128 | """ 129 | Generate an H3 spatial index for an input GeoJSON geometry column. 130 | 131 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon` 132 | input features, and returns the set of H3 cells at the specified resolution which completely cover them 133 | (could be more than one cell for a substantially large geometry and substantially granular resolution). 134 | 135 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 136 | 137 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation. 138 | """ 139 | return _index_shape(geometry, resolution) 140 | -------------------------------------------------------------------------------- /src/h3_pyspark/traversal.py: -------------------------------------------------------------------------------- 1 | import h3 2 | from pyspark.sql import functions as F, types as T 3 | from pyspark.sql.column import Column 4 | from typing import List 5 | from .utils import handle_nulls 6 | 7 | 8 | def _k_ring_distinct(cells: List[str], distance: int = 1): 9 | """ 10 | Perform a k-ring operation on every input cell and return the distinct set of output cells. 11 | """ 12 | result_set = set(cells) 13 | result_set = result_set.union(*[h3.k_ring(c, distance) for c in result_set]) 14 | 15 | return list(result_set) 16 | 17 | 18 | @F.udf(T.ArrayType(T.StringType())) 19 | @handle_nulls 20 | def k_ring_distinct(cells: Column, distance: Column): 21 | """ 22 | Perform a k-ring operation on every input cell and return the distinct set of output cells. 23 | 24 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 25 | """ 26 | return _k_ring_distinct(cells, distance) 27 | -------------------------------------------------------------------------------- /src/h3_pyspark/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from shapely.geometry import LineString 3 | 4 | 5 | def handle_nulls(function): 6 | """ 7 | Decorator to return null if any of the input arguments are null. 8 | """ 9 | 10 | def inner(*args, **kwargs): 11 | if any(arg is None for arg in args): 12 | return None 13 | return function(*args, **kwargs) 14 | 15 | return inner 16 | 17 | 18 | def flatten(t): 19 | return [item for sublist in t for item in sublist] 20 | 21 | 22 | def densify(line, step): 23 | """ 24 | Given a line segment, return another line segment with the same start & endpoints, 25 | and equally spaced sub-points based on `step` size. 26 | 27 | All the points on the new line are guaranteed to intersect with the original line, 28 | and the first and last points will be the same. 
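A minimal illustrative example, using a unit-length horizontal line and a step of 0.25:

    >>> list(densify(LineString([(0, 0), (1, 0)]), 0.25).coords)
    [(0.0, 0.0), (0.25, 0.0), (0.5, 0.0), (0.75, 0.0), (1.0, 0.0)]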
29 | """ 30 | 31 | if line.length < step: 32 | return line 33 | 34 | length = line.length 35 | current_distance = step 36 | new_points = [] 37 | 38 | # take actual first point 39 | new_points.append(line.interpolate(0.0, normalized=True)) 40 | 41 | # add points between endpoints by step size 42 | while current_distance < length: 43 | new_points.append(line.interpolate(current_distance)) 44 | current_distance += step 45 | 46 | # take actual last point 47 | new_points.append(line.interpolate(1.0, normalized=True)) 48 | 49 | return LineString(new_points) 50 | 51 | 52 | def sanitize_types(value): 53 | """ 54 | Casts values returned by H3 to native PySpark types. 55 | 56 | This is necessary because PySpark does not natively support 57 | all the types returned by H3, i.e. Python sets/tuples. 58 | """ 59 | 60 | if isinstance(value, str) or isinstance(value, bool) or isinstance(value, int) or isinstance(value, float): 61 | return value 62 | if isinstance(value, set) or isinstance(value, tuple): 63 | return [sanitize_types(v) for v in value] 64 | if isinstance(value, list): 65 | return [sanitize_types(v) for v in value] 66 | if isinstance(value, dict): 67 | return {k: sanitize_types(v) for k, v in value.items()} 68 | 69 | return json.dumps(value) 70 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | from inspect import getfullargspec 2 | from pyspark.sql import SparkSession, functions as F, types as T 3 | import h3 4 | import json 5 | import unittest 6 | 7 | from src import h3_pyspark 8 | from src.h3_pyspark.utils import sanitize_types 9 | 10 | 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | 14 | # Generate some arbitrary test values 15 | latitude = 29.8988 16 | longitude = -89.998354 17 | integer = 1 18 | double = 0.5 19 | point = '{"type": "Point", "coordinates": [-89.998354, 29.8988]}' 20 | line = '{"type": "LineString", "coordinates": [[-89.99927146300001, 29.90139583899997], [-89.99921418299999, 29.90139420899999], [-89.99903129900002, 29.90138951699998], [-89.99900807, 29.90142210300002], [-89.99898608000001, 29.90138835699997], [-89.99875118300002, 29.90138410499998], [-89.99872961, 29.90141686999999], [-89.99871085699999, 29.90138346399999], [-89.99837947499998, 29.90137720600001], [-89.99835869700001, 29.90140975100002], [-89.99834035200001, 29.901376191], [-89.998234115, 29.90137350700002], [-89.998218017, 29.90137313499997], [-89.99819830400003, 29.90137344499999], [-89.99787396300002, 29.90139402699998], [-89.99785696700002, 29.90142557899998], [-89.99783514199999, 29.90139429700002]]}' 21 | polygon = '{"type": "Polygon", "coordinates": [[[-89.998354, 29.8988], [-89.99807, 29.8988], [-89.99807, 29.898628], [-89.998354, 29.898628], [-89.998354, 29.8988]]]}' 22 | h3_cell = "81447ffffffffff" 23 | h3_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff", "81483ffffffffff"] 24 | h3_edge = "131447ffffffffff" 25 | unit = "km^2" 26 | 27 | 28 | # Generate a dataframe from arbitrary test values (mapping function parameters to appropriate type) 29 | test_arg_map = { 30 | "i": integer, 31 | "j": integer, 32 | "k": integer, 33 | "x": integer, 34 | 
"resolution": integer, 35 | "res": integer, 36 | "lat": latitude, 37 | "lng": longitude, 38 | "point1": (latitude, longitude), 39 | "point2": (latitude, longitude), 40 | "h": h3_cells[0], 41 | "hexes": h3_cells, 42 | "h1": h3_cells[1], 43 | "h2": h3_cells[2], 44 | "origin": h3_cells[2], 45 | "destination": h3_cells[3], 46 | "start": h3_cells[1], 47 | "end": h3_cells[2], 48 | "e": h3_edge, 49 | "edge": h3_edge, 50 | "geo_json": True, 51 | "geo_json_conformant": True, 52 | "geojson": polygon, 53 | } 54 | df = spark.createDataFrame([test_arg_map]) 55 | 56 | 57 | def get_test_args(function): 58 | argspec = getfullargspec(function) 59 | args = argspec.args 60 | h3_test_args = [test_arg_map.get(a.lower()) for a in args] 61 | h3_pyspark_test_args = [F.col(a) for a in args] 62 | 63 | return h3_test_args, h3_pyspark_test_args 64 | 65 | 66 | def sort(value): 67 | if isinstance(value, str) or isinstance(value, bool) or isinstance(value, int) or isinstance(value, float): 68 | return value 69 | if isinstance(value, list): 70 | value = [sort(v) for v in value] 71 | value.sort() 72 | return value 73 | if isinstance(value, set) or isinstance(value, tuple): 74 | return [sort(v) for v in value] 75 | if isinstance(value, dict): 76 | return {k: sort(v) for k, v in value.items()} 77 | 78 | return json.dumps(value) 79 | 80 | 81 | class TestCore(unittest.TestCase): 82 | 83 | ############################################################################### 84 | # Indexing 85 | ############################################################################### 86 | 87 | def test_geo_to_h3(self): 88 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.geo_to_h3) 89 | 90 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(*h3_pyspark_test_args)) 91 | actual = actual.collect()[0]["actual"] 92 | expected = sanitize_types(h3.geo_to_h3(*h3_test_args)) 93 | assert sort(actual) == sort(expected) 94 | 95 | def test_geo_to_h3_single_null_input(self): 96 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(100), F.lit(None), F.lit(9))) 97 | actual = actual.collect()[0]["actual"] 98 | expected = None 99 | assert actual == expected 100 | 101 | def test_geo_to_h3_all_null_inputs(self): 102 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(None), F.lit(None), F.lit(None))) 103 | actual = actual.collect()[0]["actual"] 104 | expected = None 105 | assert actual == expected 106 | 107 | def test_h3_to_geo(self): 108 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo) 109 | 110 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo(*h3_pyspark_test_args)) 111 | actual = actual.collect()[0]["actual"] 112 | expected = sanitize_types(h3.h3_to_geo(*h3_test_args)) 113 | assert sort(actual) == sort(expected) 114 | 115 | def test_h3_to_geo_boundary(self): 116 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo_boundary) 117 | 118 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo_boundary(*h3_pyspark_test_args)) 119 | actual = actual.collect()[0]["actual"] 120 | expected = json.dumps({"type": "MultiPolygon", "coordinates": h3.h3_to_geo_boundary(*h3_test_args)}) 121 | assert sort(actual) == sort(expected) 122 | 123 | ############################################################################### 124 | # Inspection 125 | ############################################################################### 126 | 127 | def test_h3_get_resolution(self): 128 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_resolution) 129 | 130 | actual = df.withColumn("actual", 
81 | class TestCore(unittest.TestCase):
82 | 
83 |     ###############################################################################
84 |     # Indexing
85 |     ###############################################################################
86 | 
87 |     def test_geo_to_h3(self):
88 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.geo_to_h3)
89 | 
90 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(*h3_pyspark_test_args))
91 |         actual = actual.collect()[0]["actual"]
92 |         expected = sanitize_types(h3.geo_to_h3(*h3_test_args))
93 |         assert sort(actual) == sort(expected)
94 | 
95 |     def test_geo_to_h3_single_null_input(self):
96 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(100), F.lit(None), F.lit(9)))
97 |         actual = actual.collect()[0]["actual"]
98 |         expected = None
99 |         assert actual == expected
100 | 
101 |     def test_geo_to_h3_all_null_inputs(self):
102 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(None), F.lit(None), F.lit(None)))
103 |         actual = actual.collect()[0]["actual"]
104 |         expected = None
105 |         assert actual == expected
106 | 
107 |     def test_h3_to_geo(self):
108 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo)
109 | 
110 |         actual = df.withColumn("actual", h3_pyspark.h3_to_geo(*h3_pyspark_test_args))
111 |         actual = actual.collect()[0]["actual"]
112 |         expected = sanitize_types(h3.h3_to_geo(*h3_test_args))
113 |         assert sort(actual) == sort(expected)
114 | 
115 |     def test_h3_to_geo_boundary(self):
116 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo_boundary)
117 | 
118 |         actual = df.withColumn("actual", h3_pyspark.h3_to_geo_boundary(*h3_pyspark_test_args))
119 |         actual = actual.collect()[0]["actual"]
120 |         expected = json.dumps({"type": "MultiPolygon", "coordinates": h3.h3_to_geo_boundary(*h3_test_args)})
121 |         assert sort(actual) == sort(expected)
122 | 
123 |     ###############################################################################
124 |     # Inspection
125 |     ###############################################################################
126 | 
127 |     def test_h3_get_resolution(self):
128 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_resolution)
129 | 
130 |         actual = df.withColumn("actual", h3_pyspark.h3_get_resolution(*h3_pyspark_test_args))
131 |         actual = actual.collect()[0]["actual"]
132 |         expected = sanitize_types(h3.h3_get_resolution(*h3_test_args))
133 |         assert sort(actual) == sort(expected)
134 | 
135 |     def test_h3_get_base_cell(self):
136 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_base_cell)
137 | 
138 |         actual = df.withColumn("actual", h3_pyspark.h3_get_base_cell(*h3_pyspark_test_args))
139 |         actual = actual.collect()[0]["actual"]
140 |         expected = sanitize_types(h3.h3_get_base_cell(*h3_test_args))
141 |         assert sort(actual) == sort(expected)
142 | 
143 |     def test_string_to_h3(self):
144 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.string_to_h3)
145 | 
146 |         actual = df.withColumn("actual", h3_pyspark.string_to_h3(*h3_pyspark_test_args))
147 |         actual = actual.collect()[0]["actual"]
148 |         expected = sanitize_types(h3.string_to_h3(*h3_test_args))
149 |         assert sort(actual) == sort(expected)
150 | 
151 |     def test_h3_to_string(self):
152 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_string)
153 | 
154 |         actual = df.withColumn("actual", h3_pyspark.h3_to_string(*h3_pyspark_test_args))
155 |         actual = actual.collect()[0]["actual"]
156 |         expected = sanitize_types(h3.h3_to_string(*h3_test_args))
157 |         assert sort(actual) == sort(expected)
158 | 
159 |     def test_h3_is_valid(self):
160 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_valid)
161 | 
162 |         actual = df.withColumn("actual", h3_pyspark.h3_is_valid(*h3_pyspark_test_args))
163 |         actual = actual.collect()[0]["actual"]
164 |         expected = sanitize_types(h3.h3_is_valid(*h3_test_args))
165 |         assert sort(actual) == sort(expected)
166 | 
167 |     def test_h3_is_res_class_III(self):
168 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_res_class_III)
169 | 
170 |         actual = df.withColumn("actual", h3_pyspark.h3_is_res_class_III(*h3_pyspark_test_args))
171 |         actual = actual.collect()[0]["actual"]
172 |         expected = sanitize_types(h3.h3_is_res_class_III(*h3_test_args))
173 |         assert sort(actual) == sort(expected)
174 | 
175 |     def test_h3_is_pentagon(self):
176 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_pentagon)
177 | 
178 |         actual = df.withColumn("actual", h3_pyspark.h3_is_pentagon(*h3_pyspark_test_args))
179 |         actual = actual.collect()[0]["actual"]
180 |         expected = sanitize_types(h3.h3_is_pentagon(*h3_test_args))
181 |         assert sort(actual) == sort(expected)
182 | 
183 |     def test_h3_get_faces(self):
184 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_faces)
185 | 
186 |         actual = df.withColumn("actual", h3_pyspark.h3_get_faces(*h3_pyspark_test_args))
187 |         actual = actual.collect()[0]["actual"]
188 |         expected = sanitize_types(h3.h3_get_faces(*h3_test_args))
189 |         assert sort(actual) == sort(expected)
190 | 
191 |     ###############################################################################
192 |     # Traversal
193 |     ###############################################################################
194 | 
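    # Background: h3.k_ring(h, k) returns h itself plus every cell within grid
    # distance k of it (7 cells for k=1 around a hexagon, fewer at a pentagon).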
195 |     def test_k_ring(self):
196 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring)
197 | 
198 |         actual = df.withColumn("actual", h3_pyspark.k_ring(*h3_pyspark_test_args))
199 |         actual = actual.collect()[0]["actual"]
200 |         expected = sanitize_types(h3.k_ring(*h3_test_args))
201 |         assert sort(actual) == sort(expected)
202 | 
203 |     def test_k_ring_distances(self):
204 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring_distances)
205 | 
206 |         actual = df.withColumn("actual", h3_pyspark.k_ring_distances(*h3_pyspark_test_args))
207 |         actual = actual.collect()[0]["actual"]
208 |         expected = sanitize_types(h3.k_ring_distances(*h3_test_args))
209 |         assert sort(actual) == sort(expected)
210 | 
211 |     def test_hex_range(self):
212 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range)
213 | 
214 |         actual = df.withColumn("actual", h3_pyspark.hex_range(*h3_pyspark_test_args))
215 |         actual = actual.collect()[0]["actual"]
216 |         expected = sanitize_types(h3.hex_range(*h3_test_args))
217 |         assert sort(actual) == sort(expected)
218 | 
219 |     def test_hex_range_distances(self):
220 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range_distances)
221 | 
222 |         actual = df.withColumn("actual", h3_pyspark.hex_range_distances(*h3_pyspark_test_args))
223 |         actual = actual.collect()[0]["actual"]
224 |         expected = sanitize_types(h3.hex_range_distances(*h3_test_args))
225 |         assert sort(actual) == sort(expected)
226 | 
227 |     def test_hex_ranges(self):
228 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ranges)
229 | 
230 |         actual = df.withColumn("actual", h3_pyspark.hex_ranges(*h3_pyspark_test_args))
231 |         actual = actual.collect()[0]["actual"]
232 |         expected = sanitize_types(h3.hex_ranges(*h3_test_args))
233 |         assert sort(actual) == sort(expected)
234 | 
235 |     def test_hex_ring(self):
236 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ring)
237 | 
238 |         actual = df.withColumn("actual", h3_pyspark.hex_ring(*h3_pyspark_test_args))
239 |         actual = actual.collect()[0]["actual"]
240 |         expected = sanitize_types(h3.hex_ring(*h3_test_args))
241 |         assert sort(actual) == sort(expected)
242 | 
243 |     def test_h3_line(self):
244 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_line)
245 | 
246 |         actual = df.withColumn("actual", h3_pyspark.h3_line(*h3_pyspark_test_args))
247 |         actual = actual.collect()[0]["actual"]
248 |         expected = sanitize_types(h3.h3_line(*h3_test_args))
249 |         assert sort(actual) == sort(expected)
250 | 
251 |     def test_h3_distance(self):
252 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_distance)
253 | 
254 |         actual = df.withColumn("actual", h3_pyspark.h3_distance(*h3_pyspark_test_args))
255 |         actual = actual.collect()[0]["actual"]
256 |         expected = sanitize_types(h3.h3_distance(*h3_test_args))
257 |         assert sort(actual) == sort(expected)
258 | 
259 |     def test_experimental_h3_to_local_ij(self):
260 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_h3_to_local_ij)
261 | 
262 |         actual = df.withColumn("actual", h3_pyspark.experimental_h3_to_local_ij(*h3_pyspark_test_args))
263 |         actual = actual.collect()[0]["actual"]
264 |         expected = sanitize_types(h3.experimental_h3_to_local_ij(*h3_test_args))
265 |         assert sort(actual) == sort(expected)
266 | 
267 |     def test_experimental_local_ij_to_h3(self):
268 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_local_ij_to_h3)
269 | 
270 |         actual = df.withColumn("actual", h3_pyspark.experimental_local_ij_to_h3(*h3_pyspark_test_args))
271 |         actual = actual.collect()[0]["actual"]
272 |         expected = sanitize_types(h3.experimental_local_ij_to_h3(*h3_test_args))
273 |         assert sort(actual) == sort(expected)
274 | 
275 |     ###############################################################################
276 |     # Hierarchy
277 |     ###############################################################################
278 | 
279 |     def test_h3_to_parent(self):
280 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_parent)
281 | 
282 |         actual = df.withColumn("actual", h3_pyspark.h3_to_parent(*h3_pyspark_test_args))
283 |         actual = actual.collect()[0]["actual"]
284 |         expected = sanitize_types(h3.h3_to_parent(*h3_test_args))
285 |         assert sort(actual) == sort(expected)
286 | 
287 |     def test_h3_to_children(self):
288 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_children)
289 | 
290 |         actual = df.withColumn("actual", h3_pyspark.h3_to_children(*h3_pyspark_test_args))
291 |         actual = actual.collect()[0]["actual"]
292 |         expected = sanitize_types(h3.h3_to_children(*h3_test_args))
293 |         assert sort(actual) == sort(expected)
294 | 
295 |     def test_h3_to_center_child(self):
296 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_center_child)
297 | 
298 |         actual = df.withColumn("actual", h3_pyspark.h3_to_center_child(*h3_pyspark_test_args))
299 |         actual = actual.collect()[0]["actual"]
300 |         expected = sanitize_types(h3.h3_to_center_child(*h3_test_args))
301 |         assert sort(actual) == sort(expected)
302 | 
303 |     def test_compact(self):
304 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.compact)
305 | 
306 |         actual = df.withColumn("actual", h3_pyspark.compact(*h3_pyspark_test_args))
307 |         actual = actual.collect()[0]["actual"]
308 |         expected = sanitize_types(h3.compact(*h3_test_args))
309 |         assert sort(actual) == sort(expected)
310 | 
311 |     def test_uncompact(self):
312 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.uncompact)
313 | 
314 |         actual = df.withColumn("actual", h3_pyspark.uncompact(*h3_pyspark_test_args))
315 |         actual = actual.collect()[0]["actual"]
316 |         expected = sanitize_types(h3.uncompact(*h3_test_args))
317 |         assert sort(actual) == sort(expected)
318 | 
319 |     ###############################################################################
320 |     # Regions
321 |     ###############################################################################
322 | 
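    # polyfill deviates from the generic pattern: the expected value is built by
    # hand because h3.polyfill takes a parsed GeoJSON dict plus the
    # geo_json_conformant flag, while the UDF receives the raw JSON string column.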
323 |     def test_polyfill(self):
324 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.polyfill)
325 | 
326 |         actual = df.withColumn("actual", h3_pyspark.polyfill(*h3_pyspark_test_args))
327 |         actual = actual.collect()[0]["actual"]
328 |         expected = sanitize_types(h3.polyfill(json.loads(polygon), integer, True))
329 |         assert sort(actual) == sort(expected)
330 | 
331 |     def test_h3_set_to_multi_polygon(self):
332 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_set_to_multi_polygon)
333 | 
334 |         actual = df.withColumn("actual", h3_pyspark.h3_set_to_multi_polygon(*h3_pyspark_test_args))
335 |         actual = actual.collect()[0]["actual"]
336 |         expected = json.dumps(
337 |             {"type": "MultiPolygon", "coordinates": h3.h3_set_to_multi_polygon(*h3_test_args)}
338 |         )
339 |         assert sort(actual) == sort(expected)
340 | 
341 |     ###############################################################################
342 |     # Unidirectional Edges
343 |     ###############################################################################
344 | 
345 |     def test_h3_indexes_are_neighbors(self):
346 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_indexes_are_neighbors)
347 | 
348 |         actual = df.withColumn("actual", h3_pyspark.h3_indexes_are_neighbors(*h3_pyspark_test_args))
349 |         actual = actual.collect()[0]["actual"]
350 |         expected = sanitize_types(h3.h3_indexes_are_neighbors(*h3_test_args))
351 |         assert sort(actual) == sort(expected)
352 | 
353 |     def test_get_h3_unidirectional_edge(self):
354 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge)
355 | 
356 |         actual = df.withColumn("actual", h3_pyspark.get_h3_unidirectional_edge(*h3_pyspark_test_args))
357 |         actual = actual.collect()[0]["actual"]
358 |         expected = sanitize_types(h3.get_h3_unidirectional_edge(*h3_test_args))
359 |         assert sort(actual) == sort(expected)
360 | 
361 |     def test_h3_unidirectional_edge_is_valid(self):
362 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_unidirectional_edge_is_valid)
363 | 
364 |         actual = df.withColumn("actual", h3_pyspark.h3_unidirectional_edge_is_valid(*h3_pyspark_test_args))
365 |         actual = actual.collect()[0]["actual"]
366 |         expected = sanitize_types(h3.h3_unidirectional_edge_is_valid(*h3_test_args))
367 |         assert sort(actual) == sort(expected)
368 | 
369 |     def test_get_origin_h3_index_from_unidirectional_edge(self):
370 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_origin_h3_index_from_unidirectional_edge)
371 | 
372 |         actual = df.withColumn(
373 |             "actual",
374 |             h3_pyspark.get_origin_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
375 |         )
376 |         actual = actual.collect()[0]["actual"]
377 |         expected = sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(*h3_test_args))
378 |         assert sort(actual) == sort(expected)
379 | 
380 |     def test_get_destination_h3_index_from_unidirectional_edge(self):
381 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_destination_h3_index_from_unidirectional_edge)
382 | 
383 |         actual = df.withColumn(
384 |             "actual",
385 |             h3_pyspark.get_destination_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
386 |         )
387 |         actual = actual.collect()[0]["actual"]
388 |         expected = sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(*h3_test_args))
389 |         assert sort(actual) == sort(expected)
390 | 
391 |     def test_get_h3_indexes_from_unidirectional_edge(self):
392 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_indexes_from_unidirectional_edge)
393 | 
394 |         actual = df.withColumn(
395 |             "actual",
396 |             h3_pyspark.get_h3_indexes_from_unidirectional_edge(*h3_pyspark_test_args),
397 |         )
398 |         actual = actual.collect()[0]["actual"]
399 |         expected = sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(*h3_test_args))
400 |         assert sort(actual) == sort(expected)
401 | 
402 |     def test_get_h3_unidirectional_edges_from_hexagon(self):
403 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edges_from_hexagon)
404 | 
405 |         actual = df.withColumn(
406 |             "actual",
407 |             h3_pyspark.get_h3_unidirectional_edges_from_hexagon(*h3_pyspark_test_args),
408 |         )
409 |         actual = actual.collect()[0]["actual"]
410 |         expected = sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(*h3_test_args))
411 |         assert sort(actual) == sort(expected)
412 | 
413 |     def test_get_h3_unidirectional_edge_boundary(self):
414 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge_boundary)
415 | 
416 |         actual = df.withColumn(
417 |             "actual",
418 |             h3_pyspark.get_h3_unidirectional_edge_boundary(*h3_pyspark_test_args),
419 |         )
420 |         actual = actual.collect()[0]["actual"]
421 |         expected = sanitize_types(h3.get_h3_unidirectional_edge_boundary(*h3_test_args))
422 |         assert sort(actual) == sort(expected)
423 | 
424 |     ###############################################################################
425 |     # Miscellaneous
426 |     ###############################################################################
427 | 
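    # The unit-based tests below override the final argument: "unit" is not a key
    # in test_arg_map (so the base dataframe has no such column), which is why each
    # test adds a literal `unit` column and patches h3_test_args[-1] to match.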
428 |     def test_hex_area(self):
429 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_area)
430 | 
431 |         h3_test_args[-1] = "m^2"
432 |         actual = df.withColumn("unit", F.lit("m^2"))
433 | 
434 |         actual = actual.withColumn("actual", h3_pyspark.hex_area(*h3_pyspark_test_args))
435 |         actual = actual.collect()[0]["actual"]
436 |         expected = sanitize_types(h3.hex_area(*h3_test_args))
437 |         assert sort(actual) == sort(expected)
438 | 
439 |     def test_cell_area(self):
440 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.cell_area)
441 | 
442 |         h3_test_args[-1] = "m^2"
443 |         actual = df.withColumn("unit", F.lit("m^2"))
444 | 
445 |         actual = actual.withColumn("actual", h3_pyspark.cell_area(*h3_pyspark_test_args))
446 |         actual = actual.collect()[0]["actual"]
447 |         expected = sanitize_types(h3.cell_area(*h3_test_args))
448 |         assert sort(actual) == sort(expected)
449 | 
450 |     def test_edge_length(self):
451 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.edge_length)
452 | 
453 |         h3_test_args[-1] = "m"
454 |         actual = df.withColumn("unit", F.lit("m"))
455 | 
456 |         actual = actual.withColumn("actual", h3_pyspark.edge_length(*h3_pyspark_test_args))
457 |         actual = actual.collect()[0]["actual"]
458 |         expected = sanitize_types(h3.edge_length(*h3_test_args))
459 |         assert sort(actual) == sort(expected)
460 | 
461 |     def test_exact_edge_length(self):
462 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.exact_edge_length)
463 | 
464 |         h3_test_args[-1] = "m"
465 |         actual = df.withColumn("unit", F.lit("m"))
466 | 
467 |         actual = actual.withColumn("actual", h3_pyspark.exact_edge_length(*h3_pyspark_test_args))
468 |         actual = actual.collect()[0]["actual"]
469 |         expected = sanitize_types(h3.exact_edge_length(*h3_test_args))
470 |         assert sort(actual) == sort(expected)
471 | 
472 |     def test_num_hexagons(self):
473 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.num_hexagons)
474 | 
475 |         actual = df.withColumn("actual", h3_pyspark.num_hexagons(*h3_pyspark_test_args))
476 |         actual = actual.collect()[0]["actual"]
477 |         expected = sanitize_types(h3.num_hexagons(*h3_test_args))
478 |         assert sort(actual) == sort(expected)
479 | 
480 |     def test_get_res0_indexes(self):
481 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_res0_indexes)
482 | 
483 |         actual = df.withColumn("actual", h3_pyspark.get_res0_indexes(*h3_pyspark_test_args))
484 |         actual = actual.collect()[0]["actual"]
485 |         expected = sanitize_types(h3.get_res0_indexes(*h3_test_args))
486 |         assert sort(actual) == sort(expected)
487 | 
488 |     def test_get_pentagon_indexes(self):
489 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_pentagon_indexes)
490 | 
491 |         actual = df.withColumn("actual", h3_pyspark.get_pentagon_indexes(*h3_pyspark_test_args))
492 |         actual = actual.collect()[0]["actual"]
493 |         expected = sanitize_types(h3.get_pentagon_indexes(*h3_test_args))
494 |         assert sort(actual) == sort(expected)
495 | 
496 |     def test_point_dist(self):
497 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.point_dist)
498 | 
499 |         h3_test_args[-1] = "m"
500 |         actual = df.withColumn("unit", F.lit("m"))
501 | 
502 |         actual = actual.withColumn("actual", h3_pyspark.point_dist(*h3_pyspark_test_args))
503 |         actual = actual.collect()[0]["actual"]
504 |         expected = sanitize_types(h3.point_dist(*h3_test_args))
505 |         assert sort(actual) == sort(expected)
506 | 
507 | 
508 | if __name__ == "__main__":
509 |     unittest.main()
510 | 
--------------------------------------------------------------------------------
/tests/test_coverage.py:
--------------------------------------------------------------------------------
1 | from inspect import getmembers, isfunction
2 | import h3
3 | import unittest
4 | 
5 | from src import h3_pyspark
6 | 
7 | 
8 | blacklist = set(["h3_is_res_class_iii", "polyfill_geojson", "versions", "polyfill_polygon"])
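# Assumed rationale for the blacklist: these h3-py members are deliberately left
# unwrapped (an alias spelling of h3_is_res_class_III, lower-level polyfill
# variants, and the versions metadata helper), so the coverage check skips them.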
9 | 
10 | 
11 | class TestCoverage(unittest.TestCase):
12 |     def test_function_coverage(self):
13 |         h3_functions = getmembers(h3, isfunction)
14 |         h3_functions = set([x[0] for x in h3_functions if "__" not in x[0]])
15 | 
16 |         h3_pyspark_functions = getmembers(h3_pyspark, isfunction)
17 |         h3_pyspark_functions = set([x[0] for x in h3_pyspark_functions if "__" not in x[0]])
18 | 
19 |         self.assertEqual(h3_functions - blacklist - h3_pyspark_functions, set())
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     unittest.main()
24 | 
--------------------------------------------------------------------------------
/tests/test_indexing.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from src.h3_pyspark import indexing
3 | 
4 | 
5 | point = '{ "type": "Point", "coordinates": [ -80.79617142677307, 32.131567579594716 ] }'
6 | line = '{ "type": "LineString", "coordinates": [ [ -80.79708337783813, 32.13510176661157 ], [ -80.79504489898682, 32.13510176661157 ], [ -80.79440116882324, 32.13550151179293 ], [ -80.79315662384033, 32.13535615011151 ], [ -80.79259872436523, 32.13470201967832 ], [ -80.79141855239868, 32.13292130751054 ] ] }'
7 | line2 = '{ "type": "LineString", "coordinates": [ [ -80.79768419265747, 32.13413873693519 ], [ -80.79171895980835, 32.132230817929354 ] ] }'
8 | polygon = '{ "type": "Polygon", "coordinates": [ [ [ -80.79427242279051, 32.132866795365196 ], [ -80.79128980636597, 32.132866795365196 ], [ -80.79128980636597, 32.13479287140789 ], [ -80.79427242279051, 32.13479287140789 ], [ -80.79427242279051, 32.132866795365196 ] ] ] }'
9 | polygon2 = '{ "type": "Polygon", "coordinates": [ [ [ -80.7916921377182, 32.13222627521743 ], [ -80.79402565956116, 32.135074511194496 ], [ -80.79768419265747, 32.13414327955186 ], [ -80.7916921377182, 32.13222627521743 ] ] ] }'
10 | multipoint = '{ "type": "MultiPoint", "coordinates":[ [ -80.7935643196106, 32.135755894178004 ], [ -80.79058170318604, 32.1330848437511 ]] }'
11 | multilinestring = '{ "type": "MultiLineString", "coordinates": [ [[ -80.7945728302002, 32.13577406432124 ], [ -80.79319953918457, 32.135010915189675 ]], [ [ -80.79257726669312, 32.13395703208247 ], [ -80.7915472984314, 32.13315752643055 ] ] ] }'
12 | multipolygon = '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }'
13 | 
14 | 
15 | class TestIndexing(unittest.TestCase):
16 |     def test_h3_index_point(self):
17 |         actual = indexing._index_shape(point, 9)
18 |         expected = ["8944d551007ffff"]
19 |         assert set(actual) == set(expected)
20 | 
21 |     def test_h3_index_line(self):
22 |         actual = indexing._index_shape(line, 9)
23 |         expected = ["8944d551073ffff", "8944d551077ffff", "8944d55103bffff"]
24 |         assert set(actual) == set(expected)
25 | 
26 |     def test_h3_index_line_2(self):
27 |         actual = indexing._index_shape(line2, 9)
28 |         expected = ["8944d551073ffff", "8944d55103bffff", "8944d55100fffff"]
29 |         assert set(actual) == set(expected)
30 | 
31 |     def test_h3_index_polygon(self):
32 |         actual = indexing._index_shape(polygon, 9)
33 |         expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff"]
34 |         assert set(actual) == set(expected)
35 | 
36 |     def test_h3_index_polygon2(self):
37 |         actual = indexing._index_shape(polygon2, 9)
38 |         expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff", "8944d55103bffff"]
39 |         assert set(actual) == set(expected)
40 | 
41 |     def test_h3_index_multipoint(self):
42 |         actual = indexing._index_shape(multipoint, 9)
43 |         expected = ["8944d551077ffff", "8944d551073ffff"]
44 |         assert set(actual) == set(expected)
45 | 
46 |     def test_h3_index_multiline(self):
47 |         actual = indexing._index_shape(multilinestring, 9)
48 |         expected = ["8944d551077ffff", "8944d551073ffff"]
49 |         assert set(actual) == set(expected)
50 | 
51 |     def test_h3_index_multipolygon(self):
52 |         actual = indexing._index_shape(multipolygon, 9)
53 |         expected = ["8944d551077ffff", "8944d551073ffff"]
54 |         assert set(actual) == set(expected)
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     unittest.main()
59 | 
--------------------------------------------------------------------------------
/tests/test_traversal.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from src.h3_pyspark import traversal
4 | 
5 | 
6 | input_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff"]
7 | results = {
8 |     # first cell results
9 |     "81443ffffffffff",
10 |     "8148bffffffffff",
11 |     "8144fffffffffff",
12 |     "81447ffffffffff",
13 |     "8126fffffffffff",
14 |     "81457ffffffffff",
15 |     "81267ffffffffff",
16 |     # second cell results
17 |     "81263ffffffffff",
18 |     "81277ffffffffff",
19 |     "812abffffffffff",
20 |     "8144fffffffffff",
21 |     "81447ffffffffff",
22 |     "8126fffffffffff",
23 |     "81267ffffffffff",
24 |     # third cell results
25 |     "8149bffffffffff",
26 |     "8148bffffffffff",
27 |     "8148fffffffffff",
28 |     "81483ffffffffff",
29 |     "81447ffffffffff",
30 |     "8126fffffffffff",
31 |     "81457ffffffffff",
32 | }
33 | 
34 | 
35 | class TestTraversal(unittest.TestCase):
36 |     def test_k_ring(self):
37 |         actual = traversal._k_ring_distinct(input_cells)
38 |         assert set(actual) == set(results)
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     unittest.main()
43 | 
--------------------------------------------------------------------------------
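A minimal end-to-end sketch of the indexing UDF exercised by these tests (illustrative only; assumes a running SparkSession, the package importable as `h3_pyspark`, and column names invented for the example):

    from pyspark.sql import SparkSession, functions as F
    from h3_pyspark.indexing import index_shape

    spark = SparkSession.builder.getOrCreate()

    # One GeoJSON geometry per row, as a raw JSON string column.
    df = spark.createDataFrame(
        [('{"type": "Point", "coordinates": [-89.998354, 29.8988]}',)],
        ["geometry"],
    )

    # index_shape returns the array of H3 cells covering each geometry at
    # resolution 9; explode() yields one row per cell, ready for joins,
    # bucketing, or aggregation.
    indexed = df.withColumn("h3", F.explode(index_shape("geometry", F.lit(9))))
    indexed.show(truncate=False)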