├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── docs ├── buffer.geojson ├── buffer.png ├── spatial_index.geojson ├── spatial_index.png ├── spatial_join.geojson └── spatial_join.png ├── meta.yaml ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── src ├── __init__.py └── h3_pyspark │ ├── __init__.py │ ├── indexing.py │ ├── traversal.py │ └── utils.py └── tests ├── __init__.py ├── test_core.py ├── test_coverage.py ├── test_indexing.py └── test_traversal.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: "3.10" 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install black flake8 pytest 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Lint with black 35 | run: | 36 | black -l 120 --check --diff src tests 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | .pytest_cache 4 | *.egg-info 5 | dist/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "isfunction", 4 | "uncompact", 5 | "conda", 6 | "geospatial", 7 | "codecov", 8 | "pytest" 9 | ] 10 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Kevin Schaich 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | H3 Logo 2 | 3 | # **h3-pyspark**: Uber's H3 Hexagonal Hierarchical Geospatial Indexing System in PySpark 4 | 5 | [![PyPI version](https://img.shields.io/pypi/v/h3-pyspark.svg)](https://pypi.org/project/h3-pyspark/) 6 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/h3-pyspark.svg)](https://anaconda.org/conda-forge/h3-pyspark) 7 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/kevinschaich/h3-pyspark/blob/master/LICENSE) 8 | [![Tests](https://github.com/kevinschaich/h3-pyspark/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/kevinschaich/h3-pyspark/actions/workflows/tests.yml) 9 | 10 | PySpark bindings for the [H3 core library](https://h3geo.org/). 11 | 12 | For available functions, please see the vanilla Python binding documentation at: 13 | 14 | - [uber.github.io/h3-py](https://uber.github.io/h3-py) 15 | 16 | ## Installation 17 | 18 | Via `PyPI`: 19 | 20 | ```bash 21 | pip install h3-pyspark 22 | ``` 23 | 24 | Via `conda-forge`: 25 | 26 | ```bash 27 | conda install -c conda-forge h3-pyspark 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```python 33 | >>> from pyspark.sql import SparkSession, functions as F 34 | >>> import h3_pyspark 35 | >>> 36 | >>> spark = SparkSession.builder.getOrCreate() 37 | >>> df = spark.createDataFrame([{'lat': 37.769377, 'lng': -122.388903, 'resolution': 9}]) 38 | >>> 39 | >>> df = df.withColumn('h3_9', h3_pyspark.geo_to_h3('lat', 'lng', 'resolution')) 40 | >>> df.show() 41 | 42 | +---------+-----------+----------+---------------+ 43 | | lat| lng|resolution| h3_9| 44 | +---------+-----------+----------+---------------+ 45 | |37.769377|-122.388903| 9|89283082e73ffff| 46 | +---------+-----------+----------+---------------+ 47 | ``` 48 | 49 | ## Extension Functions 50 | 51 | This package also provides extension functions for common geospatial operations which are not covered by the vanilla H3 library. 52 | 53 | ### Assumptions 54 | 55 | * You use GeoJSON to represent geometries in your PySpark pipeline (as opposed to WKT) 56 | * Geometries are stored in a GeoJSON `string` within a column (such as `geometry`) in your PySpark dataset 57 | * Individual H3 cells are stored as a `string` column (such as `h3_9`) 58 | * Sets of H3 cells are stored in an `array(string)` column (such as `h3_9`) 59 | 60 | ### Indexing 61 | 62 | #### `index_shape(geometry: Column, resolution: Column)` 63 | 64 | Generate an H3 spatial index for an input GeoJSON geometry column. 65 | 66 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon` 67 | input features, and returns the set of H3 cells at the specified resolution which completely cover them 68 | (possibly more than one cell for a sufficiently large geometry or sufficiently granular resolution).
69 | 70 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 71 | 72 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation. 73 | 74 | ```python 75 | >>> from pyspark.sql import SparkSession, functions as F 76 | >>> from h3_pyspark.indexing import index_shape 77 | >>> spark = SparkSession.builder.getOrCreate() 78 | >>> 79 | >>> df = spark.createDataFrame([{ 80 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }', 81 | 82 | 'resolution': 9 83 | }]) 84 | >>> 85 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution')) 86 | >>> df.show() 87 | +----------------------+----------+------------------------------------+ 88 | | geometry|resolution| h3_9| 89 | +----------------------+----------+------------------------------------+ 90 | | { "type": "MultiP... | 9| [8944d551077ffff, 8944d551073ffff] | 91 | +----------------------+----------+------------------------------------+ 92 | ``` 93 | 94 | Optionally, add another column `h3_9_geometry` for the GeoJSON representation of each cell in the `h3_9` column [to easily map the result alongside your original input geometry](docs/spatial_index.geojson): 95 | 96 | ```python 97 | >>> df = df.withColumn('h3_9_geometry', h3_pyspark.h3_set_to_multi_polygon(F.col('h3_9'), F.lit(True))) 98 | ``` 99 | 100 | [View Live Map on GitHub](docs/spatial_index.geojson) 101 | 102 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/spatial_index.png)](docs/spatial_index.geojson) 103 | 104 | ### Buffers 105 | 106 | #### `k_ring_distinct(cells: Column, distance: Column)` 107 | 108 | Takes an array of input cells, performs a k-ring operation on each cell, and returns the distinct set of output cells. 109 | 110 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
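To pick a `distance` value for a target buffer radius, you can work backwards from the cell edge length at your chosen resolution, since each k-ring step grows coverage by roughly one cell diameter. A minimal sketch, assuming h3-py's `edge_length` helper and a hypothetical 500-meter buffer:

```python
import math
import h3

resolution = 9
buffer_meters = 500  # hypothetical target buffer radius

# Each k-ring step extends coverage by roughly one cell diameter (2 * edge length),
# so divide the desired radius by the diameter and round up
diameter_meters = 2 * h3.edge_length(resolution, unit="m")
k = math.ceil(buffer_meters / diameter_meters)
```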
111 | 112 | Since [we know the edge length & diameter (`2 * edge length`) of each H3 cell resolution](https://h3geo.org/docs/core-library/restable), we can use this to efficiently generate a "buffered" index of our input geometry (useful for operations such as distance joins): 113 | 114 | ```python 115 | >>> from pyspark.sql import SparkSession, functions as F 116 | >>> from h3_pyspark.indexing import index_shape 117 | >>> from h3_pyspark.traversal import k_ring_distinct 118 | >>> spark = SparkSession.builder.getOrCreate() 119 | >>> 120 | >>> df = spark.createDataFrame([{ 121 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }', 122 | 123 | 'resolution': 9 124 | }]) 125 | >>> 126 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution')) 127 | >>> df = df.withColumn('h3_9_buffer', k_ring_distinct('h3_9', 1)) 128 | >>> df.show() 129 | +--------------------+----------+--------------------+--------------------+ 130 | | geometry|resolution| h3_9| h3_9_buffer| 131 | +--------------------+----------+--------------------+--------------------+ 132 | |{ "type": "MultiP...| 9|[8944d551077ffff,...|[8944d551073ffff,...| 133 | +--------------------+----------+--------------------+--------------------+ 134 | ``` 135 | 136 | [View Live Map on GitHub](docs/buffer.geojson) 137 | 138 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/buffer.png)](docs/buffer.geojson) 139 | 140 | ### Spatial Joins 141 | 142 | Once we have an indexed version of our geometries, we can easily join on the string column in H3 to get a set of pair candidates: 143 | 144 | ```python 145 | >>> from pyspark.sql import SparkSession, functions as F 146 | >>> from h3_pyspark.indexing import index_shape 147 | >>> spark = SparkSession.builder.getOrCreate() 148 | >>> 149 | >>> left = spark.createDataFrame([{ 150 | 'left_id': 'left_point', 151 | 'left_geometry': '{ "type": "Point", "coordinates": [ -80.79527020454407, 32.132884966083935 ] }', 152 | }]) 153 | >>> right = spark.createDataFrame([{ 154 | 'right_id': 'right_polygon', 155 | 'right_geometry': '{ "type": "Polygon", "coordinates": [ [ [ -80.80022692680359, 32.12864200501338 ], [ -80.79224467277527, 32.12864200501338 ], [ -80.79224467277527, 32.13378441213715 ], [ -80.80022692680359, 32.13378441213715 ], [ -80.80022692680359, 32.12864200501338 ] ] ] }', 156 | }]) 157 | >>> 158 | >>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9))) 159 | >>> right = right.withColumn('h3_9', index_shape('right_geometry', F.lit(9))) 160 | >>> 161 | >>> left = left.withColumn('h3_9', F.explode('h3_9')) 162 | >>> right = right.withColumn('h3_9', F.explode('h3_9')) 163 | >>> 164 | >>> joined = left.join(right, on='h3_9', how='inner') 165 | >>> joined.show() 166 | +---------------+--------------------+----------+--------------------+-------------+ 167 | | h3_9| left_geometry| left_id| right_geometry| right_id| 168 | +---------------+--------------------+----------+--------------------+-------------+ 169 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon| 170 | 
+---------------+--------------------+----------+--------------------+-------------+ 171 | ``` 172 | 173 | You can combine this technique with a [Buffer](#buffers) to do a **Distance Join**. 174 | 175 |
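A minimal sketch of that combination, starting again from the raw `left` and `right` DataFrames above (the 1-ring buffer is a hypothetical choice – derive the ring count from your join radius as described in [Buffers](#buffers)):

```python
>>> from h3_pyspark.traversal import k_ring_distinct
>>>
>>> # Index both sides, but buffer the left side by one ring before exploding
>>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9)))
>>> left = left.withColumn('h3_9', F.explode(k_ring_distinct('h3_9', F.lit(1))))
>>> right = right.withColumn('h3_9', F.explode(index_shape('right_geometry', F.lit(9))))
>>>
>>> # A pair can share several cells, so de-duplicate the candidates
>>> candidates = left.join(right, on='h3_9', how='inner').dropDuplicates(['left_id', 'right_id'])
```

An exact check – for example the Shapely-based `distance` UDF defined below, filtered against your join radius instead of `0` – then trims the candidates to the true distance threshold.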
176 | 177 | > **⚠️ Warning ⚠️:** The output of an H3 join is *approximate* – two geometries can share an H3 cell without actually intersecting, so all resulting geometry pairs should be treated as *intersection candidates* rather than *confirmed intersections*. Following the join with an exact `distance` calculation (where `distance = 0` means the geometries intersect) or an `intersects` check makes the result exact. [Shapely](https://shapely.readthedocs.io) is a popular library with a well-documented [`distance`](https://shapely.readthedocs.io/en/stable/manual.html#object.distance) function which can easily be wrapped in a UDF: 178 | 179 |
180 | 181 | ```python 182 | from pyspark.sql import functions as F, types as T 183 | from shapely import geometry 184 | import json 185 | 186 | @F.udf(T.DoubleType()) 187 | def distance(geometry1, geometry2): 188 | geometry1 = json.loads(geometry1) 189 | geometry1 = geometry.shape(geometry1) 190 | geometry2 = json.loads(geometry2) 191 | geometry2 = geometry.shape(geometry2) 192 | return geometry1.distance(geometry2) 193 | ``` 194 | 195 | After a spatial join (detailed above), you can filter to only directly intersecting geometries: 196 | 197 | ```python 198 | >>> joined = joined.withColumn('distance', distance(F.col('left_geometry'), F.col('right_geometry'))) 199 | >>> joined = joined.filter(F.col('distance') == 0) 200 | >>> joined.show() 201 | +---------------+--------------------+----------+--------------------+-------------+--------+ 202 | | h3_9| left_geometry| left_id| right_geometry| right_id|distance| 203 | +---------------+--------------------+----------+--------------------+-------------+--------+ 204 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon| 0.0| 205 | +---------------+--------------------+----------+--------------------+-------------+--------+ 206 | ``` 207 | 208 | [View Live Map on GitHub](docs/spatial_join.geojson) 209 | 210 | [![Result](https://github.com/kevinschaich/h3-pyspark/raw/master/docs/spatial_join.png)](docs/spatial_join.geojson) 211 | 212 | ## Publishing New Versions 213 | 214 | 1. Bump version in [`setup.cfg`](./setup.cfg) 215 | 2. Publish to `PyPi` 216 | 217 | git clean -fdx 218 | python3 -m build 219 | python3 -m twine upload --repository pypi dist/* 220 | 221 | 3. Create a new tag & release w/ version `x.x.x` and name `h3-pyspark-x.x.x` in GitHub 222 | 4. Publish to `conda-forge`: 223 | * Bump version & new tag's `sha256` hash in [`meta.yml`](https://github.com/conda-forge/h3-pyspark-feedstock/blob/master/recipe/meta.yaml) in [`@conda-forge/h3-pyspark-feedstock`](https://github.com/conda-forge/h3-pyspark-feedstock) 224 | openssl sha256 /path/to/h3-pyspark-x.x.x.tar.gz 225 | -------------------------------------------------------------------------------- /docs/buffer.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "MultiPolygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | [ 13 | -80.79442262649536, 14 | 32.13522895845023 15 | ], 16 | [ 17 | -80.79298496246338, 18 | 32.13522895845023 19 | ], 20 | [ 21 | -80.79298496246338, 22 | 32.13602844594619 23 | ], 24 | [ 25 | -80.79442262649536, 26 | 32.13602844594619 27 | ], 28 | [ 29 | -80.79442262649536, 30 | 32.13522895845023 31 | ] 32 | ] 33 | ], 34 | [ 35 | [ 36 | [ 37 | -80.7923412322998, 38 | 32.1330848437511 39 | ], 40 | [ 41 | -80.79073190689087, 42 | 32.1330848437511 43 | ], 44 | [ 45 | -80.79073190689087, 46 | 32.13375715632646 47 | ], 48 | [ 49 | -80.7923412322998, 50 | 32.13375715632646 51 | ], 52 | [ 53 | -80.7923412322998, 54 | 32.1330848437511 55 | ] 56 | ] 57 | ] 58 | ] 59 | } 60 | }, 61 | { 62 | "type": "Feature", 63 | "properties": {}, 64 | "geometry": { 65 | "type": "MultiPolygon", 66 | "coordinates": [ 67 | [ 68 | [ 69 | [ 70 | -80.78724600624852, 71 | 32.130476831471 72 | ], 73 | [ 74 | -80.78773022640085, 75 | 32.132228008570145 76 | ], 77 | [ 78 | -80.78635566417105, 79 | 32.1334385288524 80 | ], 81 | [ 82 | -80.78683987723358, 83 | 32.13518969274837 84 | ], 85 | [ 86 | 
-80.78869872440329, 87 | 32.13573036419037 88 | ], 89 | [ 90 | -80.78918300225553, 91 | 32.13748154270417 92 | ], 93 | [ 94 | -80.79104196132435, 95 | 32.13802221401248 96 | ], 97 | [ 98 | -80.79152630397307, 99 | 32.139773407134115 100 | ], 101 | [ 102 | -80.79338537494641, 103 | 32.14031407829878 104 | ], 105 | [ 106 | -80.79476007848629, 107 | 32.13910351377108 108 | ], 109 | [ 110 | -80.79661923657869, 111 | 32.139644142214856 112 | ], 113 | [ 114 | -80.79799395534437, 115 | 32.13843350714515 116 | ], 117 | [ 118 | -80.79750946892102, 119 | 32.1366822583843 120 | ], 121 | [ 122 | -80.79888415581274, 123 | 32.13547156752768 124 | ], 125 | [ 126 | -80.7983996623107, 127 | 32.13372030555331 128 | ], 129 | [ 130 | -80.79654055380145, 131 | 32.133179762250634 132 | ], 133 | [ 134 | -80.79605612510308, 135 | 32.13142851488513 136 | ], 137 | [ 138 | -80.79419712848552, 139 | 32.13088797143808 140 | ], 141 | [ 142 | -80.79371276458406, 143 | 32.12913673869145 144 | ], 145 | [ 146 | -80.79185387985282, 147 | 32.128596195109935 148 | ], 149 | [ 150 | -80.79047933424329, 151 | 32.12980684169767 152 | ], 153 | [ 154 | -80.78862053661935, 155 | 32.12926625541086 156 | ], 157 | [ 158 | -80.78724600624852, 159 | 32.130476831471 160 | ] 161 | ] 162 | ] 163 | ] 164 | } 165 | } 166 | ] 167 | } -------------------------------------------------------------------------------- /docs/buffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/buffer.png -------------------------------------------------------------------------------- /docs/spatial_index.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "MultiPolygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | [ 13 | -80.79442262649536, 14 | 32.13522895845023 15 | ], 16 | [ 17 | -80.79298496246338, 18 | 32.13522895845023 19 | ], 20 | [ 21 | -80.79298496246338, 22 | 32.13602844594619 23 | ], 24 | [ 25 | -80.79442262649536, 26 | 32.13602844594619 27 | ], 28 | [ 29 | -80.79442262649536, 30 | 32.13522895845023 31 | ] 32 | ] 33 | ], 34 | [ 35 | [ 36 | [ 37 | -80.7923412322998, 38 | 32.1330848437511 39 | ], 40 | [ 41 | -80.79073190689087, 42 | 32.1330848437511 43 | ], 44 | [ 45 | -80.79073190689087, 46 | 32.13375715632646 47 | ], 48 | [ 49 | -80.7923412322998, 50 | 32.13375715632646 51 | ], 52 | [ 53 | -80.7923412322998, 54 | 32.1330848437511 55 | ] 56 | ] 57 | ] 58 | ] 59 | } 60 | }, 61 | { 62 | "type": "Feature", 63 | "properties": {}, 64 | "geometry": { 65 | "type": "MultiPolygon", 66 | "coordinates": [ 67 | [ 68 | [ 69 | [ 70 | -80.791932268028, 71 | 32.135060457894376 72 | ], 73 | [ 74 | -80.79241661776229, 75 | 32.13681166423319 76 | ], 77 | [ 78 | -80.79427566395086, 79 | 32.13735229282743 80 | ], 81 | [ 82 | -80.79565033561992, 83 | 32.13614167251003 84 | ], 85 | [ 86 | -80.79516591400238, 87 | 32.13439043835102 88 | ], 89 | [ 90 | -80.79330689259906, 91 | 32.13384985232931 92 | ], 93 | [ 94 | -80.79282253578053, 95 | 32.132098632782174 96 | ], 97 | [ 98 | -80.79096362626957, 99 | 32.131558046622956 100 | ], 101 | [ 102 | -80.78958904879794, 103 | 32.132768637435596 104 | ], 105 | [ 106 | -80.79007333373787, 107 | 32.13451982915956 108 | ], 109 | [ 110 | -80.791932268028, 111 | 32.135060457894376 112 | ] 113 | ] 114 | ] 115 | ] 116 | } 117 | } 118 | ] 
119 | } -------------------------------------------------------------------------------- /docs/spatial_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_index.png -------------------------------------------------------------------------------- /docs/spatial_join.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "Polygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | -80.79419712848552, 13 | 32.13088797143808 14 | ], 15 | [ 16 | -80.79282253578053, 17 | 32.132098632782174 18 | ], 19 | [ 20 | -80.79330689259906, 21 | 32.13384985232931 22 | ], 23 | [ 24 | -80.79516591400238, 25 | 32.13439043835102 26 | ], 27 | [ 28 | -80.79654055380145, 29 | 32.133179762250634 30 | ], 31 | [ 32 | -80.79605612510308, 33 | 32.13142851488513 34 | ], 35 | [ 36 | -80.79419712848552, 37 | 32.13088797143808 38 | ] 39 | ] 40 | ] 41 | } 42 | }, 43 | { 44 | "type": "Feature", 45 | "properties": {}, 46 | "geometry": { 47 | "type": "Polygon", 48 | "coordinates": [ 49 | [ 50 | [ 51 | -80.80022692680359, 52 | 32.12864200501338 53 | ], 54 | [ 55 | -80.79224467277527, 56 | 32.12864200501338 57 | ], 58 | [ 59 | -80.79224467277527, 60 | 32.13378441213715 61 | ], 62 | [ 63 | -80.80022692680359, 64 | 32.13378441213715 65 | ], 66 | [ 67 | -80.80022692680359, 68 | 32.12864200501338 69 | ] 70 | ] 71 | ] 72 | } 73 | }, 74 | { 75 | "type": "Feature", 76 | "properties": {}, 77 | "geometry": { 78 | "type": "Point", 79 | "coordinates": [ 80 | -80.79527020454407, 81 | 32.132884966083935 82 | ] 83 | } 84 | } 85 | ] 86 | } -------------------------------------------------------------------------------- /docs/spatial_join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_join.png -------------------------------------------------------------------------------- /meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "1.2.2" %} 2 | 3 | package: 4 | name: h3-pyspark 5 | version: {{ version }} 6 | 7 | source: 8 | url: https://github.com/kevinschaich/h3-pyspark/archive/refs/tags/{{ version }}.tar.gz 9 | sha256: 64c39a66664676ce799dbfb5cbd49d9a9d76926e5495dcc8ea580fb03b4b46fb 10 | 11 | build: 12 | noarch: python 13 | number: 0 14 | script: {{ PYTHON }} -m pip install . 
-vv 14 | 15 | 16 | requirements: 17 | build: 18 | - pytest 19 | - black 20 | host: 21 | - pip 22 | - python 23 | run: 24 | - python 25 | - pyspark 26 | - h3-py 27 | - shapely 28 | 29 | test: 30 | imports: 31 | - h3_pyspark 32 | 33 | about: 34 | home: https://github.com/kevinschaich/h3-pyspark 35 | summary: PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system 36 | license: Apache-2.0 37 | license_family: Apache 38 | license_file: LICENSE 39 | dev_url: https://github.com/kevinschaich/h3-pyspark 40 | doc_url: https://github.com/kevinschaich/h3-pyspark 41 | 42 | extra: 43 | recipe-maintainers: 44 | - kevinschaich 45 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark 2 | h3 3 | Shapely 4 | pytest 5 | black -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = h3-pyspark 3 | version = 1.2.6 4 | author = Kevin Schaich 5 | author_email = schaich.kevin@gmail.com 6 | description = PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/kevinschaich/h3-pyspark 10 | project_urls = 11 | Bug Tracker = https://github.com/kevinschaich/h3-pyspark/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: Apache Software License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/src/__init__.py -------------------------------------------------------------------------------- /src/h3_pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | import h3 2 | from pyspark.sql import functions as F, types as T 3 | import json 4 | from inspect import getmembers, isfunction 5 | from .utils import sanitize_types, handle_nulls 6 | import sys 7 | from shapely import geometry 8 | 9 | 10 | ############################################################################### 11 | # Indexing 12 | ############################################################################### 13 | 14 | 15 | @F.udf(returnType=T.StringType()) 16 | @handle_nulls 17 | def geo_to_h3(lat, lng, resolution): 18 | return sanitize_types(h3.geo_to_h3(lat, lng, resolution)) 19 | 20 | 21 | @F.udf(returnType=T.ArrayType(T.DoubleType())) 22 | @handle_nulls 23 | def h3_to_geo(h): 24 | return sanitize_types(h3.h3_to_geo(h)) 25 | 26 | 27 | @F.udf(returnType=T.StringType()) 28 | @handle_nulls 29 | def h3_to_geo_boundary(h, geo_json): 30 | # NOTE: this behavior differs from default 31 | # h3-pyspark return type will be a valid GeoJSON
string if geo_json is set to True 32 | coordinates = h3.h3_to_geo_boundary(h, geo_json) 33 | if geo_json: 34 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates})) 35 | return sanitize_types(coordinates) 36 | 37 | 38 | ############################################################################### 39 | # Inspection 40 | ############################################################################### 41 | 42 | 43 | @F.udf(returnType=T.IntegerType()) 44 | @handle_nulls 45 | def h3_get_resolution(h): 46 | return sanitize_types(h3.h3_get_resolution(h)) 47 | 48 | 49 | @F.udf(returnType=T.IntegerType()) 50 | @handle_nulls 51 | def h3_get_base_cell(h): 52 | return sanitize_types(h3.h3_get_base_cell(h)) 53 | 54 | 55 | @F.udf(returnType=T.LongType()) 56 | @handle_nulls 57 | def string_to_h3(h): 58 | return sanitize_types(h3.string_to_h3(h)) 59 | 60 | 61 | @F.udf(returnType=T.StringType()) 62 | @handle_nulls 63 | def h3_to_string(h): 64 | return sanitize_types(h3.h3_to_string(h)) 65 | 66 | 67 | @F.udf(returnType=T.BooleanType()) 68 | @handle_nulls 69 | def h3_is_valid(h): 70 | return sanitize_types(h3.h3_is_valid(h)) 71 | 72 | 73 | @F.udf(returnType=T.BooleanType()) 74 | @handle_nulls 75 | def h3_is_res_class_III(h): 76 | return sanitize_types(h3.h3_is_res_class_III(h)) 77 | 78 | 79 | @F.udf(returnType=T.BooleanType()) 80 | @handle_nulls 81 | def h3_is_pentagon(h): 82 | return sanitize_types(h3.h3_is_pentagon(h)) 83 | 84 | 85 | @F.udf(returnType=T.ArrayType(T.IntegerType())) 86 | @handle_nulls 87 | def h3_get_faces(h): 88 | return sanitize_types(h3.h3_get_faces(h)) 89 | 90 | 91 | ############################################################################### 92 | # Traversal 93 | ############################################################################### 94 | 95 | 96 | @F.udf(returnType=T.ArrayType(T.StringType())) 97 | @handle_nulls 98 | def k_ring(origin, k): 99 | return sanitize_types(h3.k_ring(origin, k)) 100 | 101 | 102 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType()))) 103 | @handle_nulls 104 | def k_ring_distances(origin, k): 105 | return sanitize_types(h3.k_ring_distances(origin, k)) 106 | 107 | 108 | @F.udf(returnType=T.ArrayType(T.StringType())) 109 | @handle_nulls 110 | def hex_range(h, k): 111 | return sanitize_types(h3.hex_range(h, k)) 112 | 113 | 114 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType()))) 115 | @handle_nulls 116 | def hex_range_distances(h, k): 117 | return sanitize_types(h3.hex_range_distances(h, k)) 118 | 119 | 120 | @F.udf(returnType=T.MapType(T.StringType(), T.ArrayType(T.ArrayType(T.StringType())))) 121 | @handle_nulls 122 | def hex_ranges(h, k): 123 | return sanitize_types(h3.hex_ranges(h, k)) 124 | 125 | 126 | @F.udf(returnType=T.ArrayType(T.StringType())) 127 | @handle_nulls 128 | def hex_ring(h, k): 129 | return sanitize_types(h3.hex_ring(h, k)) 130 | 131 | 132 | @F.udf(returnType=T.ArrayType(T.StringType())) 133 | @handle_nulls 134 | def h3_line(start, end): 135 | return sanitize_types(h3.h3_line(start, end)) 136 | 137 | 138 | @F.udf(returnType=T.IntegerType()) 139 | @handle_nulls 140 | def h3_distance(h1, h2): 141 | return sanitize_types(h3.h3_distance(h1, h2)) 142 | 143 | 144 | @F.udf(returnType=T.ArrayType(T.IntegerType())) 145 | @handle_nulls 146 | def experimental_h3_to_local_ij(origin, h): 147 | return sanitize_types(h3.experimental_h3_to_local_ij(origin, h)) 148 | 149 | 150 | @F.udf(returnType=T.StringType()) 151 | @handle_nulls 152 | def experimental_local_ij_to_h3(origin, i, j): 153 | return 
sanitize_types(h3.experimental_local_ij_to_h3(origin, i, j)) 154 | 155 | 156 | ############################################################################### 157 | # Hierarchy 158 | ############################################################################### 159 | 160 | 161 | @F.udf(returnType=T.StringType()) 162 | @handle_nulls 163 | def h3_to_parent(h, parent_res): 164 | return sanitize_types(h3.h3_to_parent(h, parent_res)) 165 | 166 | 167 | @F.udf(returnType=T.ArrayType(T.StringType())) 168 | @handle_nulls 169 | def h3_to_children(h, child_res): 170 | return sanitize_types(h3.h3_to_children(h, child_res)) 171 | 172 | 173 | @F.udf(returnType=T.StringType()) 174 | @handle_nulls 175 | def h3_to_center_child(h, child_res): 176 | return sanitize_types(h3.h3_to_center_child(h, child_res)) 177 | 178 | 179 | @F.udf(returnType=T.ArrayType(T.StringType())) 180 | @handle_nulls 181 | def compact(hexes): 182 | return sanitize_types(h3.compact(hexes)) 183 | 184 | 185 | @F.udf(returnType=T.ArrayType(T.StringType())) 186 | @handle_nulls 187 | def uncompact(hexes, res): 188 | return sanitize_types(h3.uncompact(hexes, res)) 189 | 190 | 191 | ############################################################################### 192 | # Regions 193 | ############################################################################### 194 | 195 | 196 | @F.udf(returnType=T.ArrayType(T.StringType())) 197 | @handle_nulls 198 | def polyfill(polygons, res, geo_json_conformant): 199 | # NOTE: this behavior differs from default 200 | # h3-pyspark expect `polygons` argument to be a valid GeoJSON string 201 | polygons = json.loads(polygons) 202 | return sanitize_types(h3.polyfill(polygons, res, geo_json_conformant)) 203 | 204 | 205 | @F.udf(returnType=T.StringType()) 206 | @handle_nulls 207 | def h3_set_to_multi_polygon(hexes, geo_json): 208 | # NOTE: this behavior differs from default 209 | # h3-pyspark return type will be a valid GeoJSON string if geo_json is set to True 210 | coordinates = h3.h3_set_to_multi_polygon(hexes, geo_json) 211 | if geo_json: 212 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates})) 213 | return sanitize_types(coordinates) 214 | 215 | 216 | ############################################################################### 217 | # Unidirectional Edges 218 | ############################################################################### 219 | 220 | 221 | @F.udf(returnType=T.BooleanType()) 222 | @handle_nulls 223 | def h3_indexes_are_neighbors(origin, destination): 224 | return sanitize_types(h3.h3_indexes_are_neighbors(origin, destination)) 225 | 226 | 227 | @F.udf(returnType=T.StringType()) 228 | @handle_nulls 229 | def get_h3_unidirectional_edge(origin, destination): 230 | return sanitize_types(h3.get_h3_unidirectional_edge(origin, destination)) 231 | 232 | 233 | @F.udf(returnType=T.BooleanType()) 234 | @handle_nulls 235 | def h3_unidirectional_edge_is_valid(edge): 236 | return sanitize_types(h3.h3_unidirectional_edge_is_valid(edge)) 237 | 238 | 239 | @F.udf(returnType=T.StringType()) 240 | @handle_nulls 241 | def get_origin_h3_index_from_unidirectional_edge(edge): 242 | return sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(edge)) 243 | 244 | 245 | @F.udf(returnType=T.StringType()) 246 | @handle_nulls 247 | def get_destination_h3_index_from_unidirectional_edge(edge): 248 | return sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(edge)) 249 | 250 | 251 | @F.udf(returnType=T.ArrayType(T.StringType())) 252 | @handle_nulls 253 | def 
get_h3_indexes_from_unidirectional_edge(edge): 254 | return sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(edge)) 255 | 256 | 257 | @F.udf(returnType=T.ArrayType(T.StringType())) 258 | @handle_nulls 259 | def get_h3_unidirectional_edges_from_hexagon(h): 260 | return sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(h)) 261 | 262 | 263 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.DoubleType()))) 264 | @handle_nulls 265 | def get_h3_unidirectional_edge_boundary(h, geo_json): 266 | return sanitize_types(h3.get_h3_unidirectional_edge_boundary(h, geo_json)) 267 | 268 | 269 | ############################################################################### 270 | # Miscellaneous 271 | ############################################################################### 272 | 273 | 274 | @F.udf(returnType=T.DoubleType()) 275 | @handle_nulls 276 | def hex_area(res, unit): 277 | return sanitize_types(h3.hex_area(res, unit)) 278 | 279 | 280 | @F.udf(returnType=T.DoubleType()) 281 | @handle_nulls 282 | def cell_area(h, unit): 283 | return sanitize_types(h3.cell_area(h, unit)) 284 | 285 | 286 | @F.udf(returnType=T.DoubleType()) 287 | @handle_nulls 288 | def edge_length(res, unit): 289 | return sanitize_types(h3.edge_length(res, unit)) 290 | 291 | 292 | @F.udf(returnType=T.DoubleType()) 293 | @handle_nulls 294 | def exact_edge_length(res, unit): 295 | return sanitize_types(h3.exact_edge_length(res, unit)) 296 | 297 | 298 | @F.udf(returnType=T.IntegerType()) 299 | @handle_nulls 300 | def num_hexagons(res): 301 | return sanitize_types(h3.num_hexagons(res)) 302 | 303 | 304 | @F.udf(returnType=T.ArrayType(T.StringType())) 305 | @handle_nulls 306 | def get_res0_indexes(): 307 | return sanitize_types(h3.get_res0_indexes()) 308 | 309 | 310 | @F.udf(returnType=T.ArrayType(T.StringType())) 311 | @handle_nulls 312 | def get_pentagon_indexes(res): 313 | return sanitize_types(h3.get_pentagon_indexes(res)) 314 | 315 | 316 | @F.udf(returnType=T.DoubleType()) 317 | @handle_nulls 318 | def point_dist(point1, point2, unit): 319 | return sanitize_types(h3.point_dist(point1, point2, unit)) 320 | 321 | 322 | # Steal docstrings from h3-py native bindings if they exist 323 | for f in [f[1] for f in getmembers(sys.modules[__name__], isfunction)]: 324 | try: 325 | h3_f = getattr(h3, f.__name__) 326 | f.__doc__ = h3_f.__doc__ 327 | except Exception: 328 | f.__doc__ = f.__doc__ 329 | -------------------------------------------------------------------------------- /src/h3_pyspark/indexing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import h3 4 | from pyspark.sql.column import Column 5 | from shapely import geometry 6 | from shapely.geometry import ( 7 | Point, 8 | MultiPoint, 9 | LineString, 10 | MultiLineString, 11 | Polygon, 12 | MultiPolygon, 13 | ) 14 | from pyspark.sql import functions as F, types as T 15 | from .utils import flatten, densify, handle_nulls 16 | 17 | 18 | def _index_point_object(point: Point, resolution: int): 19 | """ 20 | Generate H3 spatial index for input point geometry. 21 | 22 | Returns the set of H3 cells at the specified resolution which completely cover the input point. 23 | """ 24 | result_set = set() 25 | 26 | # Hexes for point 27 | result_set.update(h3.geo_to_h3(t[1], t[0], resolution) for t in list(point.coords)) 28 | return result_set 29 | 30 | 31 | def _index_line_object(line: LineString, resolution: int): 32 | """ 33 | Generate H3 spatial index for input line geometry. 
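The line is first densified so that consecutive points are spaced no farther apart than the shortest edge of its endpoint cells (so no cell along the line is skipped), and any 1-ring neighbors of the resulting cells which still intersect the line are also included.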
34 | 35 | Returns the set of H3 cells at the specified resolution which completely cover the input line. 36 | """ 37 | result_set = set() 38 | 39 | # Hexes for vertices 40 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(line.coords)] 41 | result_set.update(vertex_hexes) 42 | 43 | # Figure out the max-length line segment (step) we can process without interpolating 44 | # https://github.com/kevinschaich/h3-pyspark/issues/8 45 | endpoint_hex_edges = flatten( 46 | [h3.get_h3_unidirectional_edges_from_hexagon(h) for h in [vertex_hexes[0], vertex_hexes[1]]] 47 | ) 48 | step = math.degrees(min([h3.exact_edge_length(e, unit="rads") for e in endpoint_hex_edges])) 49 | 50 | densified_line = densify(line, step) 51 | line_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(densified_line.coords)] 52 | result_set.update(line_hexes) 53 | 54 | neighboring_hexes = set(flatten([h3.k_ring(h, 1) for h in result_set])) - result_set 55 | intersecting_neighboring_hexes = filter( 56 | lambda h: Polygon(h3.h3_set_to_multi_polygon([h], True)[0][0]).distance(line) == 0, neighboring_hexes 57 | ) 58 | result_set.update(intersecting_neighboring_hexes) 59 | 60 | return result_set 61 | 62 | 63 | def _index_polygon_object(polygon: Polygon, resolution: int): 64 | """ 65 | Generate H3 spatial index for input polygon geometry. 66 | 67 | Returns the set of H3 cells at the specified resolution which completely cover the input polygon. 68 | """ 69 | result_set = set() 70 | # Hexes for vertices 71 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(polygon.exterior.coords)] 72 | result_set.update(vertex_hexes) 73 | 74 | # Hexes for edges 75 | edge_hexes = _index_shape_object(polygon.boundary, resolution) 76 | result_set.update(edge_hexes) 77 | 78 | # Hexes for internal area 79 | result_set.update(list(h3.polyfill(geometry.mapping(polygon), resolution, geo_json_conformant=True))) 80 | return result_set 81 | 82 | 83 | def _index_shape_object(shape: geometry, resolution: int): 84 | """ 85 | Generate H3 spatial index for input geometry. 86 | 87 | Returns the set of H3 cells at the specified resolution which completely cover the input shape. 88 | """ 89 | result_set = set() 90 | 91 | try: 92 | if isinstance(shape, Point): 93 | result_set.update(_index_point_object(shape, resolution)) 94 | 95 | elif isinstance(shape, LineString): 96 | result_set.update(_index_line_object(shape, resolution)) 97 | 98 | elif isinstance(shape, Polygon): 99 | result_set.update(_index_polygon_object(shape, resolution)) 100 | 101 | elif isinstance(shape, MultiPoint) or isinstance(shape, MultiLineString) or isinstance(shape, MultiPolygon): 102 | result_set.update(*[_index_shape_object(s, resolution) for s in shape.geoms]) 103 | else: 104 | raise ValueError(f"Unsupported geometry_type {shape.geom_type}") 105 | 106 | except Exception as e: 107 | raise ValueError( 108 | f"Error finding indices for geometry {json.dumps(geometry.mapping(shape))}", 109 | repr(e), 110 | ) 111 | 112 | return list(result_set) 113 | 114 | 115 | def _index_shape(shape: str, resolution: int): 116 | """ 117 | Generate H3 spatial index for input shape. 118 | 119 | Returns the set of H3 cells at the specified resolution which completely cover the input shape. 
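An illustrative example, reusing the point from the README usage section (GeoJSON coordinates are [lng, lat]):

    >>> _index_shape('{"type": "Point", "coordinates": [-122.388903, 37.769377]}', 9)
    ['89283082e73ffff']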
120 | """ 121 | shape = geometry.shape(json.loads(shape)) 122 | return _index_shape_object(shape, resolution) 123 | 124 | 125 | @F.udf(T.ArrayType(T.StringType())) 126 | @handle_nulls 127 | def index_shape(geometry: Column, resolution: Column): 128 | """ 129 | Generate an H3 spatial index for an input GeoJSON geometry column. 130 | 131 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon` 132 | input features, and returns the set of H3 cells at the specified resolution which completely cover them 133 | (could be more than one cell for a substantially large geometry and substantially granular resolution). 134 | 135 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 136 | 137 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation. 138 | """ 139 | return _index_shape(geometry, resolution) 140 | -------------------------------------------------------------------------------- /src/h3_pyspark/traversal.py: -------------------------------------------------------------------------------- 1 | import h3 2 | from pyspark.sql import functions as F, types as T 3 | from pyspark.sql.column import Column 4 | from typing import List 5 | from .utils import handle_nulls 6 | 7 | 8 | def _k_ring_distinct(cells: List[str], distance: int = 1): 9 | """ 10 | Perform a k-ring operation on every input cell and return the distinct set of output cells. 11 | """ 12 | result_set = set(cells) 13 | result_set = result_set.union(*[h3.k_ring(c, distance) for c in result_set]) 14 | 15 | return list(result_set) 16 | 17 | 18 | @F.udf(T.ArrayType(T.StringType())) 19 | @handle_nulls 20 | def k_ring_distinct(cells: Column, distance: Column): 21 | """ 22 | Perform a k-ring operation on every input cell and return the distinct set of output cells. 23 | 24 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell. 25 | """ 26 | return _k_ring_distinct(cells, distance) 27 | -------------------------------------------------------------------------------- /src/h3_pyspark/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from shapely.geometry import LineString 3 | 4 | 5 | def handle_nulls(function): 6 | """ 7 | Decorator to return null if any of the input arguments are null. 8 | """ 9 | 10 | def inner(*args, **kwargs): 11 | if any(arg is None for arg in args): 12 | return None 13 | return function(*args, **kwargs) 14 | 15 | return inner 16 | 17 | 18 | def flatten(t): 19 | return [item for sublist in t for item in sublist] 20 | 21 | 22 | def densify(line, step): 23 | """ 24 | Given a line segment, return another line segment with the same start & endpoints, 25 | and equally spaced sub-points based on `step` size. 26 | 27 | All the points on the new line are guaranteed to intersect with the original line, 28 | and the first and last points will be the same. 
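A minimal illustrative example, using a unit-length horizontal line and a step of 0.25:

    >>> list(densify(LineString([(0, 0), (1, 0)]), 0.25).coords)
    [(0.0, 0.0), (0.25, 0.0), (0.5, 0.0), (0.75, 0.0), (1.0, 0.0)]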
29 | """ 30 | 31 | if line.length < step: 32 | return line 33 | 34 | length = line.length 35 | current_distance = step 36 | new_points = [] 37 | 38 | # take actual first point 39 | new_points.append(line.interpolate(0.0, normalized=True)) 40 | 41 | # add points between endpoints by step size 42 | while current_distance < length: 43 | new_points.append(line.interpolate(current_distance)) 44 | current_distance += step 45 | 46 | # take actual last point 47 | new_points.append(line.interpolate(1.0, normalized=True)) 48 | 49 | return LineString(new_points) 50 | 51 | 52 | def sanitize_types(value): 53 | """ 54 | Casts values returned by H3 to native PySpark types. 55 | 56 | This is necessary because PySpark does not natively support 57 | all the types returned by H3, i.e. Python sets/tuples. 58 | """ 59 | 60 | if isinstance(value, str) or isinstance(value, bool) or isinstance(value, int) or isinstance(value, float): 61 | return value 62 | if isinstance(value, set) or isinstance(value, tuple): 63 | return [sanitize_types(v) for v in value] 64 | if isinstance(value, list): 65 | return [sanitize_types(v) for v in value] 66 | if isinstance(value, dict): 67 | return {k: sanitize_types(v) for k, v in value.items()} 68 | 69 | return json.dumps(value) 70 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | from inspect import getfullargspec 2 | from pyspark.sql import SparkSession, functions as F, types as T 3 | import h3 4 | import json 5 | import unittest 6 | 7 | from src import h3_pyspark 8 | from src.h3_pyspark.utils import sanitize_types 9 | 10 | 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | 14 | # Generate some arbitrary test values 15 | latitude = 29.8988 16 | longitude = -89.998354 17 | integer = 1 18 | double = 0.5 19 | point = '{"type": "Point", "coordinates": [-89.998354, 29.8988]}' 20 | line = '{"type": "LineString", "coordinates": [[-89.99927146300001, 29.90139583899997], [-89.99921418299999, 29.90139420899999], [-89.99903129900002, 29.90138951699998], [-89.99900807, 29.90142210300002], [-89.99898608000001, 29.90138835699997], [-89.99875118300002, 29.90138410499998], [-89.99872961, 29.90141686999999], [-89.99871085699999, 29.90138346399999], [-89.99837947499998, 29.90137720600001], [-89.99835869700001, 29.90140975100002], [-89.99834035200001, 29.901376191], [-89.998234115, 29.90137350700002], [-89.998218017, 29.90137313499997], [-89.99819830400003, 29.90137344499999], [-89.99787396300002, 29.90139402699998], [-89.99785696700002, 29.90142557899998], [-89.99783514199999, 29.90139429700002]]}' 21 | polygon = '{"type": "Polygon", "coordinates": [[[-89.998354, 29.8988], [-89.99807, 29.8988], [-89.99807, 29.898628], [-89.998354, 29.898628], [-89.998354, 29.8988]]]}' 22 | h3_cell = "81447ffffffffff" 23 | h3_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff", "81483ffffffffff"] 24 | h3_edge = "131447ffffffffff" 25 | unit = "km^2" 26 | 27 | 28 | # Generate a dataframe from arbitrary test values (mapping function parameters to appropriate type) 29 | test_arg_map = { 30 | "i": integer, 31 | "j": integer, 32 | "k": integer, 33 | "x": integer, 34 | 
"resolution": integer, 35 | "res": integer, 36 | "lat": latitude, 37 | "lng": longitude, 38 | "point1": (latitude, longitude), 39 | "point2": (latitude, longitude), 40 | "h": h3_cells[0], 41 | "hexes": h3_cells, 42 | "h1": h3_cells[1], 43 | "h2": h3_cells[2], 44 | "origin": h3_cells[2], 45 | "destination": h3_cells[3], 46 | "start": h3_cells[1], 47 | "end": h3_cells[2], 48 | "e": h3_edge, 49 | "edge": h3_edge, 50 | "geo_json": True, 51 | "geo_json_conformant": True, 52 | "geojson": polygon, 53 | } 54 | df = spark.createDataFrame([test_arg_map]) 55 | 56 | 57 | def get_test_args(function): 58 | argspec = getfullargspec(function) 59 | args = argspec.args 60 | h3_test_args = [test_arg_map.get(a.lower()) for a in args] 61 | h3_pyspark_test_args = [F.col(a) for a in args] 62 | 63 | return h3_test_args, h3_pyspark_test_args 64 | 65 | 66 | def sort(value): 67 | if isinstance(value, str) or isinstance(value, bool) or isinstance(value, int) or isinstance(value, float): 68 | return value 69 | if isinstance(value, list): 70 | value = [sort(v) for v in value] 71 | value.sort() 72 | return value 73 | if isinstance(value, set) or isinstance(value, tuple): 74 | return [sort(v) for v in value] 75 | if isinstance(value, dict): 76 | return {k: sort(v) for k, v in value.items()} 77 | 78 | return json.dumps(value) 79 | 80 | 81 | class TestCore(unittest.TestCase): 82 | 83 | ############################################################################### 84 | # Indexing 85 | ############################################################################### 86 | 87 | def test_geo_to_h3(self): 88 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.geo_to_h3) 89 | 90 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(*h3_pyspark_test_args)) 91 | actual = actual.collect()[0]["actual"] 92 | expected = sanitize_types(h3.geo_to_h3(*h3_test_args)) 93 | assert sort(actual) == sort(expected) 94 | 95 | def test_geo_to_h3_single_null_input(self): 96 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(100), F.lit(None), F.lit(9))) 97 | actual = actual.collect()[0]["actual"] 98 | expected = None 99 | assert actual == expected 100 | 101 | def test_geo_to_h3_all_null_inputs(self): 102 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(None), F.lit(None), F.lit(None))) 103 | actual = actual.collect()[0]["actual"] 104 | expected = None 105 | assert actual == expected 106 | 107 | def test_h3_to_geo(self): 108 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo) 109 | 110 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo(*h3_pyspark_test_args)) 111 | actual = actual.collect()[0]["actual"] 112 | expected = sanitize_types(h3.h3_to_geo(*h3_test_args)) 113 | assert sort(actual) == sort(expected) 114 | 115 | def test_h3_to_geo_boundary(self): 116 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo_boundary) 117 | 118 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo_boundary(*h3_pyspark_test_args)) 119 | actual = actual.collect()[0]["actual"] 120 | expected = json.dumps({"type": "MultiPolygon", "coordinates": h3.h3_to_geo_boundary(*h3_test_args)}) 121 | assert sort(actual) == sort(expected) 122 | 123 | ############################################################################### 124 | # Inspection 125 | ############################################################################### 126 | 127 | def test_h3_get_resolution(self): 128 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_resolution) 129 | 130 | actual = df.withColumn("actual", 
81 | class TestCore(unittest.TestCase):
82 | 
83 |     ###############################################################################
84 |     # Indexing
85 |     ###############################################################################
86 | 
87 |     def test_geo_to_h3(self):
88 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.geo_to_h3)
89 | 
90 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(*h3_pyspark_test_args))
91 |         actual = actual.collect()[0]["actual"]
92 |         expected = sanitize_types(h3.geo_to_h3(*h3_test_args))
93 |         assert sort(actual) == sort(expected)
94 | 
95 |     def test_geo_to_h3_single_null_input(self):
96 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(100), F.lit(None), F.lit(9)))
97 |         actual = actual.collect()[0]["actual"]
98 |         expected = None
99 |         assert actual == expected
100 | 
101 |     def test_geo_to_h3_all_null_inputs(self):
102 |         actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(None), F.lit(None), F.lit(None)))
103 |         actual = actual.collect()[0]["actual"]
104 |         expected = None
105 |         assert actual == expected
106 | 
107 |     def test_h3_to_geo(self):
108 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo)
109 | 
110 |         actual = df.withColumn("actual", h3_pyspark.h3_to_geo(*h3_pyspark_test_args))
111 |         actual = actual.collect()[0]["actual"]
112 |         expected = sanitize_types(h3.h3_to_geo(*h3_test_args))
113 |         assert sort(actual) == sort(expected)
114 | 
115 |     def test_h3_to_geo_boundary(self):
116 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo_boundary)
117 | 
118 |         actual = df.withColumn("actual", h3_pyspark.h3_to_geo_boundary(*h3_pyspark_test_args))
119 |         actual = actual.collect()[0]["actual"]
120 |         expected = json.dumps({"type": "MultiPolygon", "coordinates": h3.h3_to_geo_boundary(*h3_test_args)})
121 |         assert sort(actual) == sort(expected)
122 | 
123 |     ###############################################################################
124 |     # Inspection
125 |     ###############################################################################
126 | 
127 |     def test_h3_get_resolution(self):
128 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_resolution)
129 | 
130 |         actual = df.withColumn("actual", h3_pyspark.h3_get_resolution(*h3_pyspark_test_args))
131 |         actual = actual.collect()[0]["actual"]
132 |         expected = sanitize_types(h3.h3_get_resolution(*h3_test_args))
133 |         assert sort(actual) == sort(expected)
134 | 
135 |     def test_h3_get_base_cell(self):
136 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_base_cell)
137 | 
138 |         actual = df.withColumn("actual", h3_pyspark.h3_get_base_cell(*h3_pyspark_test_args))
139 |         actual = actual.collect()[0]["actual"]
140 |         expected = sanitize_types(h3.h3_get_base_cell(*h3_test_args))
141 |         assert sort(actual) == sort(expected)
142 | 
143 |     def test_string_to_h3(self):
144 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.string_to_h3)
145 | 
146 |         actual = df.withColumn("actual", h3_pyspark.string_to_h3(*h3_pyspark_test_args))
147 |         actual = actual.collect()[0]["actual"]
148 |         expected = sanitize_types(h3.string_to_h3(*h3_test_args))
149 |         assert sort(actual) == sort(expected)
150 | 
151 |     def test_h3_to_string(self):
152 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_string)
153 | 
154 |         actual = df.withColumn("actual", h3_pyspark.h3_to_string(*h3_pyspark_test_args))
155 |         actual = actual.collect()[0]["actual"]
156 |         expected = sanitize_types(h3.h3_to_string(*h3_test_args))
157 |         assert sort(actual) == sort(expected)
158 | 
159 |     def test_h3_is_valid(self):
160 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_valid)
161 | 
162 |         actual = df.withColumn("actual", h3_pyspark.h3_is_valid(*h3_pyspark_test_args))
163 |         actual = actual.collect()[0]["actual"]
164 |         expected = sanitize_types(h3.h3_is_valid(*h3_test_args))
165 |         assert sort(actual) == sort(expected)
166 | 
167 |     def test_h3_is_res_class_III(self):
168 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_res_class_III)
169 | 
170 |         actual = df.withColumn("actual", h3_pyspark.h3_is_res_class_III(*h3_pyspark_test_args))
171 |         actual = actual.collect()[0]["actual"]
172 |         expected = sanitize_types(h3.h3_is_res_class_III(*h3_test_args))
173 |         assert sort(actual) == sort(expected)
174 | 
175 |     def test_h3_is_pentagon(self):
176 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_pentagon)
177 | 
178 |         actual = df.withColumn("actual", h3_pyspark.h3_is_pentagon(*h3_pyspark_test_args))
179 |         actual = actual.collect()[0]["actual"]
180 |         expected = sanitize_types(h3.h3_is_pentagon(*h3_test_args))
181 |         assert sort(actual) == sort(expected)
182 | 
183 |     def test_h3_get_faces(self):
184 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_faces)
185 | 
186 |         actual = df.withColumn("actual", h3_pyspark.h3_get_faces(*h3_pyspark_test_args))
187 |         actual = actual.collect()[0]["actual"]
188 |         expected = sanitize_types(h3.h3_get_faces(*h3_test_args))
189 |         assert sort(actual) == sort(expected)
190 | 
191 |     ###############################################################################
192 |     # Traversal
193 |     ###############################################################################
194 | 
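    # Background: h3.k_ring(h, k) returns h itself plus every cell within grid
    # distance k of it (7 cells for k=1 around a hexagon, fewer at a pentagon).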
195 |     def test_k_ring(self):
196 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring)
197 | 
198 |         actual = df.withColumn("actual", h3_pyspark.k_ring(*h3_pyspark_test_args))
199 |         actual = actual.collect()[0]["actual"]
200 |         expected = sanitize_types(h3.k_ring(*h3_test_args))
201 |         assert sort(actual) == sort(expected)
202 | 
203 |     def test_k_ring_distances(self):
204 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring_distances)
205 | 
206 |         actual = df.withColumn("actual", h3_pyspark.k_ring_distances(*h3_pyspark_test_args))
207 |         actual = actual.collect()[0]["actual"]
208 |         expected = sanitize_types(h3.k_ring_distances(*h3_test_args))
209 |         assert sort(actual) == sort(expected)
210 | 
211 |     def test_hex_range(self):
212 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range)
213 | 
214 |         actual = df.withColumn("actual", h3_pyspark.hex_range(*h3_pyspark_test_args))
215 |         actual = actual.collect()[0]["actual"]
216 |         expected = sanitize_types(h3.hex_range(*h3_test_args))
217 |         assert sort(actual) == sort(expected)
218 | 
219 |     def test_hex_range_distances(self):
220 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range_distances)
221 | 
222 |         actual = df.withColumn("actual", h3_pyspark.hex_range_distances(*h3_pyspark_test_args))
223 |         actual = actual.collect()[0]["actual"]
224 |         expected = sanitize_types(h3.hex_range_distances(*h3_test_args))
225 |         assert sort(actual) == sort(expected)
226 | 
227 |     def test_hex_ranges(self):
228 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ranges)
229 | 
230 |         actual = df.withColumn("actual", h3_pyspark.hex_ranges(*h3_pyspark_test_args))
231 |         actual = actual.collect()[0]["actual"]
232 |         expected = sanitize_types(h3.hex_ranges(*h3_test_args))
233 |         assert sort(actual) == sort(expected)
234 | 
235 |     def test_hex_ring(self):
236 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ring)
237 | 
238 |         actual = df.withColumn("actual", h3_pyspark.hex_ring(*h3_pyspark_test_args))
239 |         actual = actual.collect()[0]["actual"]
240 |         expected = sanitize_types(h3.hex_ring(*h3_test_args))
241 |         assert sort(actual) == sort(expected)
242 | 
243 |     def test_h3_line(self):
244 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_line)
245 | 
246 |         actual = df.withColumn("actual", h3_pyspark.h3_line(*h3_pyspark_test_args))
247 |         actual = actual.collect()[0]["actual"]
248 |         expected = sanitize_types(h3.h3_line(*h3_test_args))
249 |         assert sort(actual) == sort(expected)
250 | 
251 |     def test_h3_distance(self):
252 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_distance)
253 | 
254 |         actual = df.withColumn("actual", h3_pyspark.h3_distance(*h3_pyspark_test_args))
255 |         actual = actual.collect()[0]["actual"]
256 |         expected = sanitize_types(h3.h3_distance(*h3_test_args))
257 |         assert sort(actual) == sort(expected)
258 | 
259 |     def test_experimental_h3_to_local_ij(self):
260 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_h3_to_local_ij)
261 | 
262 |         actual = df.withColumn("actual", h3_pyspark.experimental_h3_to_local_ij(*h3_pyspark_test_args))
263 |         actual = actual.collect()[0]["actual"]
264 |         expected = sanitize_types(h3.experimental_h3_to_local_ij(*h3_test_args))
265 |         assert sort(actual) == sort(expected)
266 | 
267 |     def test_experimental_local_ij_to_h3(self):
268 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_local_ij_to_h3)
269 | 
270 |         actual = df.withColumn("actual", h3_pyspark.experimental_local_ij_to_h3(*h3_pyspark_test_args))
271 |         actual = actual.collect()[0]["actual"]
272 |         expected = sanitize_types(h3.experimental_local_ij_to_h3(*h3_test_args))
273 |         assert sort(actual) == sort(expected)
274 | 
275 |     ###############################################################################
276 |     # Hierarchy
277 |     ###############################################################################
278 | 
279 |     def test_h3_to_parent(self):
280 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_parent)
281 | 
282 |         actual = df.withColumn("actual", h3_pyspark.h3_to_parent(*h3_pyspark_test_args))
283 |         actual = actual.collect()[0]["actual"]
284 |         expected = sanitize_types(h3.h3_to_parent(*h3_test_args))
285 |         assert sort(actual) == sort(expected)
286 | 
287 |     def test_h3_to_children(self):
288 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_children)
289 | 
290 |         actual = df.withColumn("actual", h3_pyspark.h3_to_children(*h3_pyspark_test_args))
291 |         actual = actual.collect()[0]["actual"]
292 |         expected = sanitize_types(h3.h3_to_children(*h3_test_args))
293 |         assert sort(actual) == sort(expected)
294 | 
295 |     def test_h3_to_center_child(self):
296 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_center_child)
297 | 
298 |         actual = df.withColumn("actual", h3_pyspark.h3_to_center_child(*h3_pyspark_test_args))
299 |         actual = actual.collect()[0]["actual"]
300 |         expected = sanitize_types(h3.h3_to_center_child(*h3_test_args))
301 |         assert sort(actual) == sort(expected)
302 | 
303 |     def test_compact(self):
304 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.compact)
305 | 
306 |         actual = df.withColumn("actual", h3_pyspark.compact(*h3_pyspark_test_args))
307 |         actual = actual.collect()[0]["actual"]
308 |         expected = sanitize_types(h3.compact(*h3_test_args))
309 |         assert sort(actual) == sort(expected)
310 | 
311 |     def test_uncompact(self):
312 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.uncompact)
313 | 
314 |         actual = df.withColumn("actual", h3_pyspark.uncompact(*h3_pyspark_test_args))
315 |         actual = actual.collect()[0]["actual"]
316 |         expected = sanitize_types(h3.uncompact(*h3_test_args))
317 |         assert sort(actual) == sort(expected)
318 | 
319 |     ###############################################################################
320 |     # Regions
321 |     ###############################################################################
322 | 
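    # polyfill deviates from the generic pattern: the expected value is built by
    # hand because h3.polyfill takes a parsed GeoJSON dict plus the
    # geo_json_conformant flag, while the UDF receives the raw JSON string column.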
323 |     def test_polyfill(self):
324 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.polyfill)
325 | 
326 |         actual = df.withColumn("actual", h3_pyspark.polyfill(*h3_pyspark_test_args))
327 |         actual = actual.collect()[0]["actual"]
328 |         expected = sanitize_types(h3.polyfill(json.loads(polygon), integer, True))
329 |         assert sort(actual) == sort(expected)
330 | 
331 |     def test_h3_set_to_multi_polygon(self):
332 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_set_to_multi_polygon)
333 | 
334 |         actual = df.withColumn("actual", h3_pyspark.h3_set_to_multi_polygon(*h3_pyspark_test_args))
335 |         actual = actual.collect()[0]["actual"]
336 |         expected = json.dumps(
337 |             {"type": "MultiPolygon", "coordinates": h3.h3_set_to_multi_polygon(*h3_test_args)}
338 |         )
339 |         assert sort(actual) == sort(expected)
340 | 
341 |     ###############################################################################
342 |     # Unidirectional Edges
343 |     ###############################################################################
344 | 
345 |     def test_h3_indexes_are_neighbors(self):
346 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_indexes_are_neighbors)
347 | 
348 |         actual = df.withColumn("actual", h3_pyspark.h3_indexes_are_neighbors(*h3_pyspark_test_args))
349 |         actual = actual.collect()[0]["actual"]
350 |         expected = sanitize_types(h3.h3_indexes_are_neighbors(*h3_test_args))
351 |         assert sort(actual) == sort(expected)
352 | 
353 |     def test_get_h3_unidirectional_edge(self):
354 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge)
355 | 
356 |         actual = df.withColumn("actual", h3_pyspark.get_h3_unidirectional_edge(*h3_pyspark_test_args))
357 |         actual = actual.collect()[0]["actual"]
358 |         expected = sanitize_types(h3.get_h3_unidirectional_edge(*h3_test_args))
359 |         assert sort(actual) == sort(expected)
360 | 
361 |     def test_h3_unidirectional_edge_is_valid(self):
362 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_unidirectional_edge_is_valid)
363 | 
364 |         actual = df.withColumn("actual", h3_pyspark.h3_unidirectional_edge_is_valid(*h3_pyspark_test_args))
365 |         actual = actual.collect()[0]["actual"]
366 |         expected = sanitize_types(h3.h3_unidirectional_edge_is_valid(*h3_test_args))
367 |         assert sort(actual) == sort(expected)
368 | 
369 |     def test_get_origin_h3_index_from_unidirectional_edge(self):
370 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_origin_h3_index_from_unidirectional_edge)
371 | 
372 |         actual = df.withColumn(
373 |             "actual",
374 |             h3_pyspark.get_origin_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
375 |         )
376 |         actual = actual.collect()[0]["actual"]
377 |         expected = sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(*h3_test_args))
378 |         assert sort(actual) == sort(expected)
379 | 
380 |     def test_get_destination_h3_index_from_unidirectional_edge(self):
381 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_destination_h3_index_from_unidirectional_edge)
382 | 
383 |         actual = df.withColumn(
384 |             "actual",
385 |             h3_pyspark.get_destination_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
386 |         )
387 |         actual = actual.collect()[0]["actual"]
388 |         expected = sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(*h3_test_args))
389 |         assert sort(actual) == sort(expected)
390 | 
391 |     def test_get_h3_indexes_from_unidirectional_edge(self):
392 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_indexes_from_unidirectional_edge)
393 | 
394 |         actual = df.withColumn(
395 |             "actual",
396 |             h3_pyspark.get_h3_indexes_from_unidirectional_edge(*h3_pyspark_test_args),
397 |         )
398 |         actual = actual.collect()[0]["actual"]
399 |         expected = sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(*h3_test_args))
400 |         assert sort(actual) == sort(expected)
401 | 
402 |     def test_get_h3_unidirectional_edges_from_hexagon(self):
403 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edges_from_hexagon)
404 | 
405 |         actual = df.withColumn(
406 |             "actual",
407 |             h3_pyspark.get_h3_unidirectional_edges_from_hexagon(*h3_pyspark_test_args),
408 |         )
409 |         actual = actual.collect()[0]["actual"]
410 |         expected = sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(*h3_test_args))
411 |         assert sort(actual) == sort(expected)
412 | 
413 |     def test_get_h3_unidirectional_edge_boundary(self):
414 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge_boundary)
415 | 
416 |         actual = df.withColumn(
417 |             "actual",
418 |             h3_pyspark.get_h3_unidirectional_edge_boundary(*h3_pyspark_test_args),
419 |         )
420 |         actual = actual.collect()[0]["actual"]
421 |         expected = sanitize_types(h3.get_h3_unidirectional_edge_boundary(*h3_test_args))
422 |         assert sort(actual) == sort(expected)
423 | 
424 |     ###############################################################################
425 |     # Miscellaneous
426 |     ###############################################################################
427 | 
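    # The unit-based tests below override the final argument: "unit" is not a key
    # in test_arg_map (so the base dataframe has no such column), which is why each
    # test adds a literal `unit` column and patches h3_test_args[-1] to match.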
428 |     def test_hex_area(self):
429 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_area)
430 | 
431 |         h3_test_args[-1] = "m^2"
432 |         actual = df.withColumn("unit", F.lit("m^2"))
433 | 
434 |         actual = actual.withColumn("actual", h3_pyspark.hex_area(*h3_pyspark_test_args))
435 |         actual = actual.collect()[0]["actual"]
436 |         expected = sanitize_types(h3.hex_area(*h3_test_args))
437 |         assert sort(actual) == sort(expected)
438 | 
439 |     def test_cell_area(self):
440 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.cell_area)
441 | 
442 |         h3_test_args[-1] = "m^2"
443 |         actual = df.withColumn("unit", F.lit("m^2"))
444 | 
445 |         actual = actual.withColumn("actual", h3_pyspark.cell_area(*h3_pyspark_test_args))
446 |         actual = actual.collect()[0]["actual"]
447 |         expected = sanitize_types(h3.cell_area(*h3_test_args))
448 |         assert sort(actual) == sort(expected)
449 | 
450 |     def test_edge_length(self):
451 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.edge_length)
452 | 
453 |         h3_test_args[-1] = "m"
454 |         actual = df.withColumn("unit", F.lit("m"))
455 | 
456 |         actual = actual.withColumn("actual", h3_pyspark.edge_length(*h3_pyspark_test_args))
457 |         actual = actual.collect()[0]["actual"]
458 |         expected = sanitize_types(h3.edge_length(*h3_test_args))
459 |         assert sort(actual) == sort(expected)
460 | 
461 |     def test_exact_edge_length(self):
462 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.exact_edge_length)
463 | 
464 |         h3_test_args[-1] = "m"
465 |         actual = df.withColumn("unit", F.lit("m"))
466 | 
467 |         actual = actual.withColumn("actual", h3_pyspark.exact_edge_length(*h3_pyspark_test_args))
468 |         actual = actual.collect()[0]["actual"]
469 |         expected = sanitize_types(h3.exact_edge_length(*h3_test_args))
470 |         assert sort(actual) == sort(expected)
471 | 
472 |     def test_num_hexagons(self):
473 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.num_hexagons)
474 | 
475 |         actual = df.withColumn("actual", h3_pyspark.num_hexagons(*h3_pyspark_test_args))
476 |         actual = actual.collect()[0]["actual"]
477 |         expected = sanitize_types(h3.num_hexagons(*h3_test_args))
478 |         assert sort(actual) == sort(expected)
479 | 
480 |     def test_get_res0_indexes(self):
481 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_res0_indexes)
482 | 
483 |         actual = df.withColumn("actual", h3_pyspark.get_res0_indexes(*h3_pyspark_test_args))
484 |         actual = actual.collect()[0]["actual"]
485 |         expected = sanitize_types(h3.get_res0_indexes(*h3_test_args))
486 |         assert sort(actual) == sort(expected)
487 | 
488 |     def test_get_pentagon_indexes(self):
489 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_pentagon_indexes)
490 | 
491 |         actual = df.withColumn("actual", h3_pyspark.get_pentagon_indexes(*h3_pyspark_test_args))
492 |         actual = actual.collect()[0]["actual"]
493 |         expected = sanitize_types(h3.get_pentagon_indexes(*h3_test_args))
494 |         assert sort(actual) == sort(expected)
495 | 
496 |     def test_point_dist(self):
497 |         h3_test_args, h3_pyspark_test_args = get_test_args(h3.point_dist)
498 | 
499 |         h3_test_args[-1] = "m"
500 |         actual = df.withColumn("unit", F.lit("m"))
501 | 
502 |         actual = actual.withColumn("actual", h3_pyspark.point_dist(*h3_pyspark_test_args))
503 |         actual = actual.collect()[0]["actual"]
504 |         expected = sanitize_types(h3.point_dist(*h3_test_args))
505 |         assert sort(actual) == sort(expected)
506 | 
507 | 
508 | if __name__ == "__main__":
509 |     unittest.main()
510 | 
--------------------------------------------------------------------------------
/tests/test_coverage.py:
--------------------------------------------------------------------------------
1 | from inspect import getmembers, isfunction
2 | import h3
3 | import unittest
4 | 
5 | from src import h3_pyspark
6 | 
7 | 
8 | blacklist = set(["h3_is_res_class_iii", "polyfill_geojson", "versions", "polyfill_polygon"])
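# Assumed rationale for the blacklist: these h3-py members are deliberately left
# unwrapped (an alias spelling of h3_is_res_class_III, lower-level polyfill
# variants, and the versions metadata helper), so the coverage check skips them.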
9 | 
10 | 
11 | class TestCoverage(unittest.TestCase):
12 |     def test_function_coverage(self):
13 |         h3_functions = getmembers(h3, isfunction)
14 |         h3_functions = set([x[0] for x in h3_functions if "__" not in x[0]])
15 | 
16 |         h3_pyspark_functions = getmembers(h3_pyspark, isfunction)
17 |         h3_pyspark_functions = set([x[0] for x in h3_pyspark_functions if "__" not in x[0]])
18 | 
19 |         self.assertEqual(h3_functions - blacklist - h3_pyspark_functions, set())
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     unittest.main()
24 | 
--------------------------------------------------------------------------------
/tests/test_indexing.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from src.h3_pyspark import indexing
3 | 
4 | 
5 | point = '{ "type": "Point", "coordinates": [ -80.79617142677307, 32.131567579594716 ] }'
6 | line = '{ "type": "LineString", "coordinates": [ [ -80.79708337783813, 32.13510176661157 ], [ -80.79504489898682, 32.13510176661157 ], [ -80.79440116882324, 32.13550151179293 ], [ -80.79315662384033, 32.13535615011151 ], [ -80.79259872436523, 32.13470201967832 ], [ -80.79141855239868, 32.13292130751054 ] ] }'
7 | line2 = '{ "type": "LineString", "coordinates": [ [ -80.79768419265747, 32.13413873693519 ], [ -80.79171895980835, 32.132230817929354 ] ] }'
8 | polygon = '{ "type": "Polygon", "coordinates": [ [ [ -80.79427242279051, 32.132866795365196 ], [ -80.79128980636597, 32.132866795365196 ], [ -80.79128980636597, 32.13479287140789 ], [ -80.79427242279051, 32.13479287140789 ], [ -80.79427242279051, 32.132866795365196 ] ] ] }'
9 | polygon2 = '{ "type": "Polygon", "coordinates": [ [ [ -80.7916921377182, 32.13222627521743 ], [ -80.79402565956116, 32.135074511194496 ], [ -80.79768419265747, 32.13414327955186 ], [ -80.7916921377182, 32.13222627521743 ] ] ] }'
10 | multipoint = '{ "type": "MultiPoint", "coordinates":[ [ -80.7935643196106, 32.135755894178004 ], [ -80.79058170318604, 32.1330848437511 ]] }'
11 | multilinestring = '{ "type": "MultiLineString", "coordinates": [ [[ -80.7945728302002, 32.13577406432124 ], [ -80.79319953918457, 32.135010915189675 ]], [ [ -80.79257726669312, 32.13395703208247 ], [ -80.7915472984314, 32.13315752643055 ] ] ] }'
12 | multipolygon = '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }'
13 | 
14 | 
15 | class TestIndexing(unittest.TestCase):
16 |     def test_h3_index_point(self):
17 |         actual = indexing._index_shape(point, 9)
18 |         expected = ["8944d551007ffff"]
19 |         assert set(actual) == set(expected)
20 | 
21 |     def test_h3_index_line(self):
22 |         actual = indexing._index_shape(line, 9)
23 |         expected = ["8944d551073ffff", "8944d551077ffff", "8944d55103bffff"]
24 |         assert set(actual) == set(expected)
25 | 
26 |     def test_h3_index_line_2(self):
27 |         actual = indexing._index_shape(line2, 9)
28 |         expected = ["8944d551073ffff", "8944d55103bffff", "8944d55100fffff"]
29 |         assert set(actual) == set(expected)
30 | 
31 |     def test_h3_index_polygon(self):
32 |         actual = indexing._index_shape(polygon, 9)
33 |         expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff"]
34 |         assert set(actual) == set(expected)
35 | 
36 |     def test_h3_index_polygon2(self):
37 |         actual = indexing._index_shape(polygon2, 9)
38 |         expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff", "8944d55103bffff"]
39 |         assert set(actual) == set(expected)
40 | 
41 |     def test_h3_index_multipoint(self):
42 |         actual = indexing._index_shape(multipoint, 9)
43 |         expected = ["8944d551077ffff", "8944d551073ffff"]
44 |         assert set(actual) == set(expected)
45 | 
46 |     def test_h3_index_multiline(self):
47 |         actual = indexing._index_shape(multilinestring, 9)
48 |         expected = ["8944d551077ffff", "8944d551073ffff"]
49 |         assert set(actual) == set(expected)
50 | 
51 |     def test_h3_index_multipolygon(self):
52 |         actual = indexing._index_shape(multipolygon, 9)
53 |         expected = ["8944d551077ffff", "8944d551073ffff"]
54 |         assert set(actual) == set(expected)
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     unittest.main()
59 | 
--------------------------------------------------------------------------------
/tests/test_traversal.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from src.h3_pyspark import traversal
4 | 
5 | 
6 | input_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff"]
7 | results = {
8 |     # first cell results
9 |     "81443ffffffffff",
10 |     "8148bffffffffff",
11 |     "8144fffffffffff",
12 |     "81447ffffffffff",
13 |     "8126fffffffffff",
14 |     "81457ffffffffff",
15 |     "81267ffffffffff",
16 |     # second cell results
17 |     "81263ffffffffff",
18 |     "81277ffffffffff",
19 |     "812abffffffffff",
20 |     "8144fffffffffff",
21 |     "81447ffffffffff",
22 |     "8126fffffffffff",
23 |     "81267ffffffffff",
24 |     # third cell results
25 |     "8149bffffffffff",
26 |     "8148bffffffffff",
27 |     "8148fffffffffff",
28 |     "81483ffffffffff",
29 |     "81447ffffffffff",
30 |     "8126fffffffffff",
31 |     "81457ffffffffff",
32 | }
33 | 
34 | 
35 | class TestTraversal(unittest.TestCase):
36 |     def test_k_ring(self):
37 |         actual = traversal._k_ring_distinct(input_cells)
38 |         assert set(actual) == set(results)
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     unittest.main()
43 | 
--------------------------------------------------------------------------------
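A minimal end-to-end sketch of the indexing UDF exercised by these tests (illustrative only; assumes a running SparkSession, the package importable as `h3_pyspark`, and column names invented for the example):

    from pyspark.sql import SparkSession, functions as F
    from h3_pyspark.indexing import index_shape

    spark = SparkSession.builder.getOrCreate()

    # One GeoJSON geometry per row, as a raw JSON string column.
    df = spark.createDataFrame(
        [('{"type": "Point", "coordinates": [-89.998354, 29.8988]}',)],
        ["geometry"],
    )

    # index_shape returns the array of H3 cells covering each geometry at
    # resolution 9; explode() yields one row per cell, ready for joins,
    # bucketing, or aggregation.
    indexed = df.withColumn("h3", F.explode(index_shape("geometry", F.lit(9))))
    indexed.show(truncate=False)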