├── .github
│   └── workflows
│       └── tests.yml
├── .gitignore
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── docs
│   ├── buffer.geojson
│   ├── buffer.png
│   ├── spatial_index.geojson
│   ├── spatial_index.png
│   ├── spatial_join.geojson
│   └── spatial_join.png
├── meta.yaml
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── src
│   ├── __init__.py
│   └── h3_pyspark
│       ├── __init__.py
│       ├── indexing.py
│       ├── traversal.py
│       └── utils.py
└── tests
    ├── __init__.py
    ├── test_core.py
    ├── test_coverage.py
    ├── test_indexing.py
    └── test_traversal.py
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.10
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: "3.10"
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install black flake8 pytest
27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 | - name: Lint with flake8
29 | run: |
30 | # stop the build if there are Python syntax errors or undefined names
31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
34 | - name: Lint with black
35 | run: |
36 | black -l 120 --check --diff src tests
37 | - name: Test with pytest
38 | run: |
39 | pytest
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | .pytest_cache
4 | *.egg-info
5 | dist/
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cSpell.words": [
3 | "isfunction",
4 | "uncompact",
5 | "conda",
6 | "geospatial",
7 | "codecov",
8 | "pytest"
9 | ]
10 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2021 Kevin Schaich
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # **h3-pyspark**: Uber's H3 Hexagonal Hierarchical Geospatial Indexing System in PySpark
4 |
5 | [](https://pypi.org/project/h3-pyspark/)
6 | [](https://anaconda.org/conda-forge/h3-pyspark)
7 | [](https://github.com/kevinschaich/h3-pyspark/blob/master/LICENSE)
8 | [](https://github.com/kevinschaich/h3-pyspark/actions/workflows/tests.yml)
9 |
10 | PySpark bindings for the [H3 core library](https://h3geo.org/).
11 |
12 | For available functions, please see the vanilla Python binding documentation at:
13 |
14 | - [uber.github.io/h3-py](https://uber.github.io/h3-py)
15 |
16 | ## Installation
17 |
18 | Via `PyPI`:
19 |
20 | ```bash
21 | pip install h3-pyspark
22 | ```
23 |
24 | Via `conda-forge`:
25 |
26 | ```bash
27 | conda install -c conda-forge h3-pyspark
28 | ```
29 |
30 | ## Usage
31 |
32 | ```python
33 | >>> from pyspark.sql import SparkSession, functions as F
34 | >>> import h3_pyspark
35 | >>>
36 | >>> spark = SparkSession.builder.getOrCreate()
37 | >>> df = spark.createDataFrame([{"lat": 37.769377, "lng": -122.388903, 'resolution': 9}])
38 | >>>
39 | >>> df = df.withColumn('h3_9', h3_pyspark.geo_to_h3('lat', 'lng', 'resolution'))
40 | >>> df.show()
41 |
42 | +---------+-----------+----------+---------------+
43 | | lat| lng|resolution| h3_9|
44 | +---------+-----------+----------+---------------+
45 | |37.769377|-122.388903| 9|89283082e73ffff|
46 | +---------+-----------+----------+---------------+
47 | ```
48 |
49 | ## Extension Functions
50 |
51 | There are also several extension functions for common geospatial operations which are not available in the vanilla H3 library.
52 |
53 | ### Assumptions
54 |
55 | * You use GeoJSON to represent geometries in your PySpark pipeline (as opposed to WKT)
56 | * Geometries are stored in a GeoJSON `string` within a column (such as `geometry`) in your PySpark dataset
57 | * Individual H3 cells are stored as a `string` column (such as `h3_9`)
58 | * Sets of H3 cells are stored in an `array(string)` column (such as `h3_9`)
59 |
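For instance, a conforming row might look like the following (a hypothetical sketch – the column names are illustrative):

```python
>>> df = spark.createDataFrame([{
        'geometry': '{"type": "Point", "coordinates": [-122.388903, 37.769377]}',  # GeoJSON string
        'h3_9': '89283082e73ffff',                                                 # single H3 cell
        'h3_9_set': ['89283082e73ffff', '89283082e77ffff'],                        # set of H3 cells
    }])
```
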
60 | ### Indexing
61 |
62 | #### `index_shape(geometry: Column, resolution: Column)`
63 |
64 | Generate an H3 spatial index for an input GeoJSON geometry column.
65 |
66 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon`
67 | input features, and returns the set of H3 cells at the specified resolution which completely cover them
68 | (could be more than one cell for a substantially large geometry and substantially granular resolution).
69 |
70 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
71 |
72 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation.
73 |
74 | ```python
75 | >>> from pyspark.sql import SparkSession, functions as F
76 | >>> from h3_pyspark.indexing import index_shape
77 | >>> spark = SparkSession.builder.getOrCreate()
78 | >>>
79 | >>> df = spark.createDataFrame([{
80 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }',
81 |
82 | 'resolution': 9
83 | }])
84 | >>>
85 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution'))
86 | >>> df.show()
87 | +----------------------+----------+------------------------------------+
88 | | geometry|resolution| h3_9|
89 | +----------------------+----------+------------------------------------+
90 | | { "type": "MultiP... | 9| [8944d551077ffff, 8944d551073ffff] |
91 | +----------------------+----------+------------------------------------+
92 | ```
93 |
94 | Optionally, add another column `h3_9_geometry` for the GeoJSON representation of each cell in the `h3_9` column [to easily map the result alongside your original input geometry](docs/spatial_index.geojson):
95 |
96 | ```python
97 | >>> df = df.withColumn('h3_9_geometry', h3_pyspark.h3_set_to_multi_polygon(F.col('h3_9'), F.lit(True)))
98 | ```
99 |
100 | [View Live Map on GitHub](docs/spatial_index.geojson)
101 |
102 | [](docs/spatial_index.geojson)
103 |
104 | ### Buffers
105 |
106 | #### `k_ring_distinct(cells: Column, distance: Column)`
107 |
108 | Takes an array of input cells, performs a k-ring operation on each cell, and returns the distinct set of output cells.
109 |
110 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
111 |
112 | Since [we know the edge length & diameter (`2 * edge length`) of each H3 cell resolution](https://h3geo.org/docs/core-library/restable), we can use this to efficiently generate a "buffered" index of our input geometry (useful for operations such as distance joins):
113 |
114 | ```python
115 | >>> from pyspark.sql import SparkSession, functions as F
116 | >>> from h3_pyspark.indexing import index_shape
117 | >>> from h3_pyspark.traversal import k_ring_distinct
118 | >>> spark = SparkSession.builder.getOrCreate()
119 | >>>
120 | >>> df = spark.createDataFrame([{
121 | 'geometry': '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }',
122 |
123 | 'resolution': 9
124 | }])
125 | >>>
126 | >>> df = df.withColumn('h3_9', index_shape('geometry', 'resolution'))
127 | >>> df = df.withColumn('h3_9_buffer', k_ring_distinct('h3_9', 1))
128 | >>> df.show()
129 | +--------------------+----------+--------------------+--------------------+
130 | | geometry|resolution| h3_9| h3_9_buffer|
131 | +--------------------+----------+--------------------+--------------------+
132 | |{ "type": "MultiP...| 9|[8944d551077ffff,...|[8944d551073ffff,...|
133 | +--------------------+----------+--------------------+--------------------+
134 | ```
135 |
136 | [View Live Map on GitHub](docs/buffer.geojson)
137 |
138 | [](docs/buffer.geojson)
139 |
140 | ### Spatial Joins
141 |
142 | Once we have an indexed version of our geometries, we can easily join on the H3 string column to get a set of candidate pairs:
143 |
144 | ```python
145 | >>> from pyspark.sql import SparkSession, functions as F
146 | >>> from h3_pyspark.indexing import index_shape
147 | >>> spark = SparkSession.builder.getOrCreate()
148 | >>>
149 | >>> left = spark.createDataFrame([{
150 | 'left_id': 'left_point',
151 | 'left_geometry': '{ "type": "Point", "coordinates": [ -80.79527020454407, 32.132884966083935 ] }',
152 | }])
153 | >>> right = spark.createDataFrame([{
154 | 'right_id': 'right_polygon',
155 | 'right_geometry': '{ "type": "Polygon", "coordinates": [ [ [ -80.80022692680359, 32.12864200501338 ], [ -80.79224467277527, 32.12864200501338 ], [ -80.79224467277527, 32.13378441213715 ], [ -80.80022692680359, 32.13378441213715 ], [ -80.80022692680359, 32.12864200501338 ] ] ] }',
156 | }])
157 | >>>
158 | >>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9)))
159 | >>> right = right.withColumn('h3_9', index_shape('right_geometry', F.lit(9)))
160 | >>>
161 | >>> left = left.withColumn('h3_9', F.explode('h3_9'))
162 | >>> right = right.withColumn('h3_9', F.explode('h3_9'))
163 | >>>
164 | >>> joined = left.join(right, on='h3_9', how='inner')
165 | >>> joined.show()
166 | +---------------+--------------------+----------+--------------------+-------------+
167 | | h3_9| left_geometry| left_id| right_geometry| right_id|
168 | +---------------+--------------------+----------+--------------------+-------------+
169 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon|
170 | +---------------+--------------------+----------+--------------------+-------------+
171 | ```
172 |
173 | You can combine this technique with a [Buffer](#buffers) to do a **Distance Join**.
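
For example, here is a minimal sketch of a distance join, assuming a hypothetical search radius of 500 meters and reusing the `left`/`right` DataFrames from the example above (the `k` heuristic is a rough approximation based on the resolution's average edge length):

```python
>>> import h3
>>> from pyspark.sql import functions as F
>>> from h3_pyspark.indexing import index_shape
>>> from h3_pyspark.traversal import k_ring_distinct
>>>
>>> # Choose k so that k rings of resolution-9 cells roughly cover the search radius
>>> buffer_meters = 500
>>> k = int(buffer_meters // h3.edge_length(9, unit='m')) + 1
>>>
>>> left = left.withColumn('h3_9', index_shape('left_geometry', F.lit(9)))
>>> right = right.withColumn('h3_9', index_shape('right_geometry', F.lit(9)))
>>>
>>> # Buffer only one side of the join, then explode both sides to one row per cell
>>> left = left.withColumn('h3_9', k_ring_distinct('h3_9', F.lit(k)))
>>> left = left.withColumn('h3_9', F.explode('h3_9'))
>>> right = right.withColumn('h3_9', F.explode('h3_9'))
>>>
>>> candidates = left.join(right, on='h3_9', how='inner')
```

As with a plain spatial join, filter the candidates afterwards with an exact `distance` check (see the warning below).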
174 |
175 |
176 |
177 | > **⚠️ Warning ⚠️:** The outputs of an H3 join are *approximate* – all resulting geometry pairs should be considered *intersection candidates* rather than *definitely intersecting*. Pairing the join with a subsequent `distance` calculation (where `distance == 0` means the geometries intersect) or an `intersects` check makes the result exact. [Shapely](https://shapely.readthedocs.io) is a popular library with a well-documented [`distance`](https://shapely.readthedocs.io/en/stable/manual.html#object.distance) function which can easily be wrapped in a UDF:
178 |
179 |
180 |
181 | ```python
182 | from pyspark.sql import functions as F, types as T
183 | from shapely import geometry
184 | import json
185 |
186 | @F.udf(T.DoubleType())
187 | def distance(geometry1, geometry2):
188 | geometry1 = json.loads(geometry1)
189 | geometry1 = geometry.shape(geometry1)
190 | geometry2 = json.loads(geometry2)
191 | geometry2 = geometry.shape(geometry2)
192 | return geometry1.distance(geometry2)
193 | ```
194 |
195 | After a spatial join (detailed above), you can filter to only directly intersecting geometries:
196 |
197 | ```python
198 | >>> joined = joined.withColumn('distance', distance(F.col('left_geometry'), F.col('right_geometry')))
199 | >>> joined = joined.filter(F.col('distance') == 0)
200 | >>> joined.show()
201 | +---------------+--------------------+----------+--------------------+-------------+--------+
202 | | h3_9| left_geometry| left_id| right_geometry| right_id|distance|
203 | +---------------+--------------------+----------+--------------------+-------------+--------+
204 | |8944d55100fffff|{ "type": "Point"...|left_point|{ "type": "Polygo...|right_polygon| 0.0|
205 | +---------------+--------------------+----------+--------------------+-------------+--------+
206 | ```
207 |
208 | [View Live Map on GitHub](docs/spatial_join.geojson)
209 |
210 | [](docs/spatial_join.geojson)
211 |
212 | ## Publishing New Versions
213 |
214 | 1. Bump version in [`setup.cfg`](./setup.cfg)
215 | 2. Publish to `PyPI`:
216 |
217 | git clean -fdx
218 | python3 -m build
219 | python3 -m twine upload --repository pypi dist/*
220 |
221 | 3. Create a new tag & release with version `x.x.x` and name `h3-pyspark-x.x.x` on GitHub
222 | 4. Publish to `conda-forge`:
223 |    * Bump version & new tag's `sha256` hash in [`meta.yaml`](https://github.com/conda-forge/h3-pyspark-feedstock/blob/master/recipe/meta.yaml) in [`@conda-forge/h3-pyspark-feedstock`](https://github.com/conda-forge/h3-pyspark-feedstock)
224 | openssl sha256 /path/to/h3-pyspark-x.x.x.tar.gz
225 |
--------------------------------------------------------------------------------
/docs/buffer.geojson:
--------------------------------------------------------------------------------
1 | {
2 | "type": "FeatureCollection",
3 | "features": [
4 | {
5 | "type": "Feature",
6 | "properties": {},
7 | "geometry": {
8 | "type": "MultiPolygon",
9 | "coordinates": [
10 | [
11 | [
12 | [
13 | -80.79442262649536,
14 | 32.13522895845023
15 | ],
16 | [
17 | -80.79298496246338,
18 | 32.13522895845023
19 | ],
20 | [
21 | -80.79298496246338,
22 | 32.13602844594619
23 | ],
24 | [
25 | -80.79442262649536,
26 | 32.13602844594619
27 | ],
28 | [
29 | -80.79442262649536,
30 | 32.13522895845023
31 | ]
32 | ]
33 | ],
34 | [
35 | [
36 | [
37 | -80.7923412322998,
38 | 32.1330848437511
39 | ],
40 | [
41 | -80.79073190689087,
42 | 32.1330848437511
43 | ],
44 | [
45 | -80.79073190689087,
46 | 32.13375715632646
47 | ],
48 | [
49 | -80.7923412322998,
50 | 32.13375715632646
51 | ],
52 | [
53 | -80.7923412322998,
54 | 32.1330848437511
55 | ]
56 | ]
57 | ]
58 | ]
59 | }
60 | },
61 | {
62 | "type": "Feature",
63 | "properties": {},
64 | "geometry": {
65 | "type": "MultiPolygon",
66 | "coordinates": [
67 | [
68 | [
69 | [
70 | -80.78724600624852,
71 | 32.130476831471
72 | ],
73 | [
74 | -80.78773022640085,
75 | 32.132228008570145
76 | ],
77 | [
78 | -80.78635566417105,
79 | 32.1334385288524
80 | ],
81 | [
82 | -80.78683987723358,
83 | 32.13518969274837
84 | ],
85 | [
86 | -80.78869872440329,
87 | 32.13573036419037
88 | ],
89 | [
90 | -80.78918300225553,
91 | 32.13748154270417
92 | ],
93 | [
94 | -80.79104196132435,
95 | 32.13802221401248
96 | ],
97 | [
98 | -80.79152630397307,
99 | 32.139773407134115
100 | ],
101 | [
102 | -80.79338537494641,
103 | 32.14031407829878
104 | ],
105 | [
106 | -80.79476007848629,
107 | 32.13910351377108
108 | ],
109 | [
110 | -80.79661923657869,
111 | 32.139644142214856
112 | ],
113 | [
114 | -80.79799395534437,
115 | 32.13843350714515
116 | ],
117 | [
118 | -80.79750946892102,
119 | 32.1366822583843
120 | ],
121 | [
122 | -80.79888415581274,
123 | 32.13547156752768
124 | ],
125 | [
126 | -80.7983996623107,
127 | 32.13372030555331
128 | ],
129 | [
130 | -80.79654055380145,
131 | 32.133179762250634
132 | ],
133 | [
134 | -80.79605612510308,
135 | 32.13142851488513
136 | ],
137 | [
138 | -80.79419712848552,
139 | 32.13088797143808
140 | ],
141 | [
142 | -80.79371276458406,
143 | 32.12913673869145
144 | ],
145 | [
146 | -80.79185387985282,
147 | 32.128596195109935
148 | ],
149 | [
150 | -80.79047933424329,
151 | 32.12980684169767
152 | ],
153 | [
154 | -80.78862053661935,
155 | 32.12926625541086
156 | ],
157 | [
158 | -80.78724600624852,
159 | 32.130476831471
160 | ]
161 | ]
162 | ]
163 | ]
164 | }
165 | }
166 | ]
167 | }
--------------------------------------------------------------------------------
/docs/buffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/buffer.png
--------------------------------------------------------------------------------
/docs/spatial_index.geojson:
--------------------------------------------------------------------------------
1 | {
2 | "type": "FeatureCollection",
3 | "features": [
4 | {
5 | "type": "Feature",
6 | "properties": {},
7 | "geometry": {
8 | "type": "MultiPolygon",
9 | "coordinates": [
10 | [
11 | [
12 | [
13 | -80.79442262649536,
14 | 32.13522895845023
15 | ],
16 | [
17 | -80.79298496246338,
18 | 32.13522895845023
19 | ],
20 | [
21 | -80.79298496246338,
22 | 32.13602844594619
23 | ],
24 | [
25 | -80.79442262649536,
26 | 32.13602844594619
27 | ],
28 | [
29 | -80.79442262649536,
30 | 32.13522895845023
31 | ]
32 | ]
33 | ],
34 | [
35 | [
36 | [
37 | -80.7923412322998,
38 | 32.1330848437511
39 | ],
40 | [
41 | -80.79073190689087,
42 | 32.1330848437511
43 | ],
44 | [
45 | -80.79073190689087,
46 | 32.13375715632646
47 | ],
48 | [
49 | -80.7923412322998,
50 | 32.13375715632646
51 | ],
52 | [
53 | -80.7923412322998,
54 | 32.1330848437511
55 | ]
56 | ]
57 | ]
58 | ]
59 | }
60 | },
61 | {
62 | "type": "Feature",
63 | "properties": {},
64 | "geometry": {
65 | "type": "MultiPolygon",
66 | "coordinates": [
67 | [
68 | [
69 | [
70 | -80.791932268028,
71 | 32.135060457894376
72 | ],
73 | [
74 | -80.79241661776229,
75 | 32.13681166423319
76 | ],
77 | [
78 | -80.79427566395086,
79 | 32.13735229282743
80 | ],
81 | [
82 | -80.79565033561992,
83 | 32.13614167251003
84 | ],
85 | [
86 | -80.79516591400238,
87 | 32.13439043835102
88 | ],
89 | [
90 | -80.79330689259906,
91 | 32.13384985232931
92 | ],
93 | [
94 | -80.79282253578053,
95 | 32.132098632782174
96 | ],
97 | [
98 | -80.79096362626957,
99 | 32.131558046622956
100 | ],
101 | [
102 | -80.78958904879794,
103 | 32.132768637435596
104 | ],
105 | [
106 | -80.79007333373787,
107 | 32.13451982915956
108 | ],
109 | [
110 | -80.791932268028,
111 | 32.135060457894376
112 | ]
113 | ]
114 | ]
115 | ]
116 | }
117 | }
118 | ]
119 | }
--------------------------------------------------------------------------------
/docs/spatial_index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_index.png
--------------------------------------------------------------------------------
/docs/spatial_join.geojson:
--------------------------------------------------------------------------------
1 | {
2 | "type": "FeatureCollection",
3 | "features": [
4 | {
5 | "type": "Feature",
6 | "properties": {},
7 | "geometry": {
8 | "type": "Polygon",
9 | "coordinates": [
10 | [
11 | [
12 | -80.79419712848552,
13 | 32.13088797143808
14 | ],
15 | [
16 | -80.79282253578053,
17 | 32.132098632782174
18 | ],
19 | [
20 | -80.79330689259906,
21 | 32.13384985232931
22 | ],
23 | [
24 | -80.79516591400238,
25 | 32.13439043835102
26 | ],
27 | [
28 | -80.79654055380145,
29 | 32.133179762250634
30 | ],
31 | [
32 | -80.79605612510308,
33 | 32.13142851488513
34 | ],
35 | [
36 | -80.79419712848552,
37 | 32.13088797143808
38 | ]
39 | ]
40 | ]
41 | }
42 | },
43 | {
44 | "type": "Feature",
45 | "properties": {},
46 | "geometry": {
47 | "type": "Polygon",
48 | "coordinates": [
49 | [
50 | [
51 | -80.80022692680359,
52 | 32.12864200501338
53 | ],
54 | [
55 | -80.79224467277527,
56 | 32.12864200501338
57 | ],
58 | [
59 | -80.79224467277527,
60 | 32.13378441213715
61 | ],
62 | [
63 | -80.80022692680359,
64 | 32.13378441213715
65 | ],
66 | [
67 | -80.80022692680359,
68 | 32.12864200501338
69 | ]
70 | ]
71 | ]
72 | }
73 | },
74 | {
75 | "type": "Feature",
76 | "properties": {},
77 | "geometry": {
78 | "type": "Point",
79 | "coordinates": [
80 | -80.79527020454407,
81 | 32.132884966083935
82 | ]
83 | }
84 | }
85 | ]
86 | }
--------------------------------------------------------------------------------
/docs/spatial_join.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/docs/spatial_join.png
--------------------------------------------------------------------------------
/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set version = "1.2.2" %}
2 |
3 | package:
4 | name: h3-pyspark
5 | version: {{ version }}
6 |
7 | source:
8 | url: https://github.com/kevinschaich/h3-pyspark/archive/refs/tags/{{ version }}.tar.gz
9 | sha256: 64c39a66664676ce799dbfb5cbd49d9a9d76926e5495dcc8ea580fb03b4b46fb
10 |
11 | build:
12 | noarch: python
13 | number: 0
14 | script: {{ PYTHON }} -m pip install . -vv
15 |
16 | requirements:
17 | build:
18 | - pytest
19 | - black
20 | host:
21 | - pip
22 | - python
23 | run:
24 | - python
25 | - pyspark
26 | - h3-py
27 | - shapely
28 |
29 | test:
30 | imports:
31 | - h3_pyspark
32 |
33 | about:
34 | home: https://github.com/kevinschaich/h3-pyspark
35 | summary: PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system
36 | license: Apache-2.0
37 | license_family: Apache
38 | license_file: LICENSE
39 | dev_url: https://github.com/kevinschaich/h3-pyspark
40 | doc_url: https://github.com/kevinschaich/h3-pyspark
41 |
42 | extra:
43 | recipe-maintainers:
44 | - kevinschaich
45 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyspark
2 | h3
3 | Shapely
4 | pytest
5 | black
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = h3-pyspark
3 | version = 1.2.6
4 | author = Kevin Schaich
5 | author_email = schaich.kevin@gmail.com
6 | description = PySpark bindings for H3, a hierarchical hexagonal geospatial indexing system
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/kevinschaich/h3-pyspark
10 | project_urls =
11 | Bug Tracker = https://github.com/kevinschaich/h3-pyspark/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 |     License :: OSI Approved :: Apache Software License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | = src
20 | packages = find:
21 | python_requires = >=3.6
22 |
23 | [options.packages.find]
24 | where = src
25 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/src/__init__.py
--------------------------------------------------------------------------------
/src/h3_pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | import h3
2 | from pyspark.sql import functions as F, types as T
3 | import json
4 | from inspect import getmembers, isfunction
5 | from .utils import sanitize_types, handle_nulls
6 | import sys
7 | from shapely import geometry
8 |
9 |
10 | ###############################################################################
11 | # Indexing
12 | ###############################################################################
13 |
14 |
15 | @F.udf(returnType=T.StringType())
16 | @handle_nulls
17 | def geo_to_h3(lat, lng, resolution):
18 | return sanitize_types(h3.geo_to_h3(lat, lng, resolution))
19 |
20 |
21 | @F.udf(returnType=T.ArrayType(T.DoubleType()))
22 | @handle_nulls
23 | def h3_to_geo(h):
24 | return sanitize_types(h3.h3_to_geo(h))
25 |
26 |
27 | @F.udf(returnType=T.StringType())
28 | @handle_nulls
29 | def h3_to_geo_boundary(h, geo_json):
30 | # NOTE: this behavior differs from default
31 | # h3-pyspark return type will be a valid GeoJSON string if geo_json is set to True
32 | coordinates = h3.h3_to_geo_boundary(h, geo_json)
33 | if geo_json:
34 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates}))
35 | return sanitize_types(coordinates)
36 |
37 |
38 | ###############################################################################
39 | # Inspection
40 | ###############################################################################
41 |
42 |
43 | @F.udf(returnType=T.IntegerType())
44 | @handle_nulls
45 | def h3_get_resolution(h):
46 | return sanitize_types(h3.h3_get_resolution(h))
47 |
48 |
49 | @F.udf(returnType=T.IntegerType())
50 | @handle_nulls
51 | def h3_get_base_cell(h):
52 | return sanitize_types(h3.h3_get_base_cell(h))
53 |
54 |
55 | @F.udf(returnType=T.LongType())
56 | @handle_nulls
57 | def string_to_h3(h):
58 | return sanitize_types(h3.string_to_h3(h))
59 |
60 |
61 | @F.udf(returnType=T.StringType())
62 | @handle_nulls
63 | def h3_to_string(h):
64 | return sanitize_types(h3.h3_to_string(h))
65 |
66 |
67 | @F.udf(returnType=T.BooleanType())
68 | @handle_nulls
69 | def h3_is_valid(h):
70 | return sanitize_types(h3.h3_is_valid(h))
71 |
72 |
73 | @F.udf(returnType=T.BooleanType())
74 | @handle_nulls
75 | def h3_is_res_class_III(h):
76 | return sanitize_types(h3.h3_is_res_class_III(h))
77 |
78 |
79 | @F.udf(returnType=T.BooleanType())
80 | @handle_nulls
81 | def h3_is_pentagon(h):
82 | return sanitize_types(h3.h3_is_pentagon(h))
83 |
84 |
85 | @F.udf(returnType=T.ArrayType(T.IntegerType()))
86 | @handle_nulls
87 | def h3_get_faces(h):
88 | return sanitize_types(h3.h3_get_faces(h))
89 |
90 |
91 | ###############################################################################
92 | # Traversal
93 | ###############################################################################
94 |
95 |
96 | @F.udf(returnType=T.ArrayType(T.StringType()))
97 | @handle_nulls
98 | def k_ring(origin, k):
99 | return sanitize_types(h3.k_ring(origin, k))
100 |
101 |
102 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType())))
103 | @handle_nulls
104 | def k_ring_distances(origin, k):
105 | return sanitize_types(h3.k_ring_distances(origin, k))
106 |
107 |
108 | @F.udf(returnType=T.ArrayType(T.StringType()))
109 | @handle_nulls
110 | def hex_range(h, k):
111 | return sanitize_types(h3.hex_range(h, k))
112 |
113 |
114 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType())))
115 | @handle_nulls
116 | def hex_range_distances(h, k):
117 | return sanitize_types(h3.hex_range_distances(h, k))
118 |
119 |
120 | @F.udf(returnType=T.MapType(T.StringType(), T.ArrayType(T.ArrayType(T.StringType()))))
121 | @handle_nulls
122 | def hex_ranges(h, k):
123 | return sanitize_types(h3.hex_ranges(h, k))
124 |
125 |
126 | @F.udf(returnType=T.ArrayType(T.StringType()))
127 | @handle_nulls
128 | def hex_ring(h, k):
129 | return sanitize_types(h3.hex_ring(h, k))
130 |
131 |
132 | @F.udf(returnType=T.ArrayType(T.StringType()))
133 | @handle_nulls
134 | def h3_line(start, end):
135 | return sanitize_types(h3.h3_line(start, end))
136 |
137 |
138 | @F.udf(returnType=T.IntegerType())
139 | @handle_nulls
140 | def h3_distance(h1, h2):
141 | return sanitize_types(h3.h3_distance(h1, h2))
142 |
143 |
144 | @F.udf(returnType=T.ArrayType(T.IntegerType()))
145 | @handle_nulls
146 | def experimental_h3_to_local_ij(origin, h):
147 | return sanitize_types(h3.experimental_h3_to_local_ij(origin, h))
148 |
149 |
150 | @F.udf(returnType=T.StringType())
151 | @handle_nulls
152 | def experimental_local_ij_to_h3(origin, i, j):
153 | return sanitize_types(h3.experimental_local_ij_to_h3(origin, i, j))
154 |
155 |
156 | ###############################################################################
157 | # Hierarchy
158 | ###############################################################################
159 |
160 |
161 | @F.udf(returnType=T.StringType())
162 | @handle_nulls
163 | def h3_to_parent(h, parent_res):
164 | return sanitize_types(h3.h3_to_parent(h, parent_res))
165 |
166 |
167 | @F.udf(returnType=T.ArrayType(T.StringType()))
168 | @handle_nulls
169 | def h3_to_children(h, child_res):
170 | return sanitize_types(h3.h3_to_children(h, child_res))
171 |
172 |
173 | @F.udf(returnType=T.StringType())
174 | @handle_nulls
175 | def h3_to_center_child(h, child_res):
176 | return sanitize_types(h3.h3_to_center_child(h, child_res))
177 |
178 |
179 | @F.udf(returnType=T.ArrayType(T.StringType()))
180 | @handle_nulls
181 | def compact(hexes):
182 | return sanitize_types(h3.compact(hexes))
183 |
184 |
185 | @F.udf(returnType=T.ArrayType(T.StringType()))
186 | @handle_nulls
187 | def uncompact(hexes, res):
188 | return sanitize_types(h3.uncompact(hexes, res))
189 |
190 |
191 | ###############################################################################
192 | # Regions
193 | ###############################################################################
194 |
195 |
196 | @F.udf(returnType=T.ArrayType(T.StringType()))
197 | @handle_nulls
198 | def polyfill(polygons, res, geo_json_conformant):
199 | # NOTE: this behavior differs from default
200 |     # h3-pyspark expects the `polygons` argument to be a valid GeoJSON string
201 | polygons = json.loads(polygons)
202 | return sanitize_types(h3.polyfill(polygons, res, geo_json_conformant))
203 |
204 |
205 | @F.udf(returnType=T.StringType())
206 | @handle_nulls
207 | def h3_set_to_multi_polygon(hexes, geo_json):
208 | # NOTE: this behavior differs from default
209 | # h3-pyspark return type will be a valid GeoJSON string if geo_json is set to True
210 | coordinates = h3.h3_set_to_multi_polygon(hexes, geo_json)
211 | if geo_json:
212 | return sanitize_types(json.dumps({"type": "MultiPolygon", "coordinates": coordinates}))
213 | return sanitize_types(coordinates)
214 |
215 |
216 | ###############################################################################
217 | # Unidirectional Edges
218 | ###############################################################################
219 |
220 |
221 | @F.udf(returnType=T.BooleanType())
222 | @handle_nulls
223 | def h3_indexes_are_neighbors(origin, destination):
224 | return sanitize_types(h3.h3_indexes_are_neighbors(origin, destination))
225 |
226 |
227 | @F.udf(returnType=T.StringType())
228 | @handle_nulls
229 | def get_h3_unidirectional_edge(origin, destination):
230 | return sanitize_types(h3.get_h3_unidirectional_edge(origin, destination))
231 |
232 |
233 | @F.udf(returnType=T.BooleanType())
234 | @handle_nulls
235 | def h3_unidirectional_edge_is_valid(edge):
236 | return sanitize_types(h3.h3_unidirectional_edge_is_valid(edge))
237 |
238 |
239 | @F.udf(returnType=T.StringType())
240 | @handle_nulls
241 | def get_origin_h3_index_from_unidirectional_edge(edge):
242 | return sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(edge))
243 |
244 |
245 | @F.udf(returnType=T.StringType())
246 | @handle_nulls
247 | def get_destination_h3_index_from_unidirectional_edge(edge):
248 | return sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(edge))
249 |
250 |
251 | @F.udf(returnType=T.ArrayType(T.StringType()))
252 | @handle_nulls
253 | def get_h3_indexes_from_unidirectional_edge(edge):
254 | return sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(edge))
255 |
256 |
257 | @F.udf(returnType=T.ArrayType(T.StringType()))
258 | @handle_nulls
259 | def get_h3_unidirectional_edges_from_hexagon(h):
260 | return sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(h))
261 |
262 |
263 | @F.udf(returnType=T.ArrayType(T.ArrayType(T.DoubleType())))
264 | @handle_nulls
265 | def get_h3_unidirectional_edge_boundary(h, geo_json):
266 | return sanitize_types(h3.get_h3_unidirectional_edge_boundary(h, geo_json))
267 |
268 |
269 | ###############################################################################
270 | # Miscellaneous
271 | ###############################################################################
272 |
273 |
274 | @F.udf(returnType=T.DoubleType())
275 | @handle_nulls
276 | def hex_area(res, unit):
277 | return sanitize_types(h3.hex_area(res, unit))
278 |
279 |
280 | @F.udf(returnType=T.DoubleType())
281 | @handle_nulls
282 | def cell_area(h, unit):
283 | return sanitize_types(h3.cell_area(h, unit))
284 |
285 |
286 | @F.udf(returnType=T.DoubleType())
287 | @handle_nulls
288 | def edge_length(res, unit):
289 | return sanitize_types(h3.edge_length(res, unit))
290 |
291 |
292 | @F.udf(returnType=T.DoubleType())
293 | @handle_nulls
294 | def exact_edge_length(res, unit):
295 | return sanitize_types(h3.exact_edge_length(res, unit))
296 |
297 |
298 | @F.udf(returnType=T.LongType())  # NOTE: cell counts overflow a 32-bit integer at finer resolutions
299 | @handle_nulls
300 | def num_hexagons(res):
301 | return sanitize_types(h3.num_hexagons(res))
302 |
303 |
304 | @F.udf(returnType=T.ArrayType(T.StringType()))
305 | @handle_nulls
306 | def get_res0_indexes():
307 | return sanitize_types(h3.get_res0_indexes())
308 |
309 |
310 | @F.udf(returnType=T.ArrayType(T.StringType()))
311 | @handle_nulls
312 | def get_pentagon_indexes(res):
313 | return sanitize_types(h3.get_pentagon_indexes(res))
314 |
315 |
316 | @F.udf(returnType=T.DoubleType())
317 | @handle_nulls
318 | def point_dist(point1, point2, unit):
319 | return sanitize_types(h3.point_dist(point1, point2, unit))
320 |
321 |
322 | # Steal docstrings from h3-py native bindings if they exist
323 | for f in [f[1] for f in getmembers(sys.modules[__name__], isfunction)]:
324 | try:
325 | h3_f = getattr(h3, f.__name__)
326 | f.__doc__ = h3_f.__doc__
327 | except Exception:
328 |         pass
329 |
--------------------------------------------------------------------------------
/src/h3_pyspark/indexing.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import h3
4 | from pyspark.sql.column import Column
5 | from shapely import geometry
6 | from shapely.geometry import (
7 | Point,
8 | MultiPoint,
9 | LineString,
10 | MultiLineString,
11 | Polygon,
12 | MultiPolygon,
13 | )
14 | from pyspark.sql import functions as F, types as T
15 | from .utils import flatten, densify, handle_nulls
16 |
17 |
18 | def _index_point_object(point: Point, resolution: int):
19 | """
20 | Generate H3 spatial index for input point geometry.
21 |
22 | Returns the set of H3 cells at the specified resolution which completely cover the input point.
23 | """
24 | result_set = set()
25 |
26 | # Hexes for point
27 | result_set.update(h3.geo_to_h3(t[1], t[0], resolution) for t in list(point.coords))
28 | return result_set
29 |
30 |
31 | def _index_line_object(line: LineString, resolution: int):
32 | """
33 | Generate H3 spatial index for input line geometry.
34 |
35 | Returns the set of H3 cells at the specified resolution which completely cover the input line.
36 | """
37 | result_set = set()
38 |
39 | # Hexes for vertices
40 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(line.coords)]
41 | result_set.update(vertex_hexes)
42 |
43 | # Figure out the max-length line segment (step) we can process without interpolating
44 | # https://github.com/kevinschaich/h3-pyspark/issues/8
45 | endpoint_hex_edges = flatten(
46 | [h3.get_h3_unidirectional_edges_from_hexagon(h) for h in [vertex_hexes[0], vertex_hexes[1]]]
47 | )
48 | step = math.degrees(min([h3.exact_edge_length(e, unit="rads") for e in endpoint_hex_edges]))
49 |
50 | densified_line = densify(line, step)
51 | line_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(densified_line.coords)]
52 | result_set.update(line_hexes)
53 |
54 | neighboring_hexes = set(flatten([h3.k_ring(h, 1) for h in result_set])) - result_set
55 | intersecting_neighboring_hexes = filter(
56 | lambda h: Polygon(h3.h3_set_to_multi_polygon([h], True)[0][0]).distance(line) == 0, neighboring_hexes
57 | )
58 | result_set.update(intersecting_neighboring_hexes)
59 |
60 | return result_set
61 |
62 |
63 | def _index_polygon_object(polygon: Polygon, resolution: int):
64 | """
65 | Generate H3 spatial index for input polygon geometry.
66 |
67 | Returns the set of H3 cells at the specified resolution which completely cover the input polygon.
68 | """
69 | result_set = set()
70 | # Hexes for vertices
71 | vertex_hexes = [h3.geo_to_h3(t[1], t[0], resolution) for t in list(polygon.exterior.coords)]
72 | result_set.update(vertex_hexes)
73 |
74 | # Hexes for edges
75 | edge_hexes = _index_shape_object(polygon.boundary, resolution)
76 | result_set.update(edge_hexes)
77 |
78 | # Hexes for internal area
79 | result_set.update(list(h3.polyfill(geometry.mapping(polygon), resolution, geo_json_conformant=True)))
80 | return result_set
81 |
82 |
83 | def _index_shape_object(shape: geometry.base.BaseGeometry, resolution: int):
84 | """
85 | Generate H3 spatial index for input geometry.
86 |
87 | Returns the set of H3 cells at the specified resolution which completely cover the input shape.
88 | """
89 | result_set = set()
90 |
91 | try:
92 | if isinstance(shape, Point):
93 | result_set.update(_index_point_object(shape, resolution))
94 |
95 | elif isinstance(shape, LineString):
96 | result_set.update(_index_line_object(shape, resolution))
97 |
98 | elif isinstance(shape, Polygon):
99 | result_set.update(_index_polygon_object(shape, resolution))
100 |
101 |         elif isinstance(shape, (MultiPoint, MultiLineString, MultiPolygon)):
102 | result_set.update(*[_index_shape_object(s, resolution) for s in shape.geoms])
103 | else:
104 | raise ValueError(f"Unsupported geometry_type {shape.geom_type}")
105 |
106 | except Exception as e:
107 | raise ValueError(
108 | f"Error finding indices for geometry {json.dumps(geometry.mapping(shape))}",
109 | repr(e),
110 | )
111 |
112 | return list(result_set)
113 |
114 |
115 | def _index_shape(shape: str, resolution: int):
116 | """
117 | Generate H3 spatial index for input shape.
118 |
119 | Returns the set of H3 cells at the specified resolution which completely cover the input shape.
120 | """
121 | shape = geometry.shape(json.loads(shape))
122 | return _index_shape_object(shape, resolution)
123 |
124 |
125 | @F.udf(T.ArrayType(T.StringType()))
126 | @handle_nulls
127 | def index_shape(geometry: Column, resolution: Column):
128 | """
129 | Generate an H3 spatial index for an input GeoJSON geometry column.
130 |
131 | This function accepts GeoJSON `Point`, `LineString`, `Polygon`, `MultiPoint`, `MultiLineString`, and `MultiPolygon`
132 | input features, and returns the set of H3 cells at the specified resolution which completely cover them
133 | (could be more than one cell for a substantially large geometry and substantially granular resolution).
134 |
135 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
136 |
137 | This spatial index can then be used for bucketing, clustering, and joins in Spark via an `explode()` operation.
138 | """
139 | return _index_shape(geometry, resolution)
140 |
--------------------------------------------------------------------------------
/src/h3_pyspark/traversal.py:
--------------------------------------------------------------------------------
1 | import h3
2 | from pyspark.sql import functions as F, types as T
3 | from pyspark.sql.column import Column
4 | from typing import List
5 | from .utils import handle_nulls
6 |
7 |
8 | def _k_ring_distinct(cells: List[str], distance: int = 1):
9 | """
10 | Perform a k-ring operation on every input cell and return the distinct set of output cells.
11 | """
12 | result_set = set(cells)
13 | result_set = result_set.union(*[h3.k_ring(c, distance) for c in result_set])
14 |
15 | return list(result_set)
16 |
17 |
18 | @F.udf(T.ArrayType(T.StringType()))
19 | @handle_nulls
20 | def k_ring_distinct(cells: Column, distance: Column):
21 | """
22 | Perform a k-ring operation on every input cell and return the distinct set of output cells.
23 |
24 | The schema of the output column will be `T.ArrayType(T.StringType())`, where each value in the array is an H3 cell.
25 | """
26 | return _k_ring_distinct(cells, distance)
27 |
--------------------------------------------------------------------------------
/src/h3_pyspark/utils.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import json
3 | from shapely.geometry import LineString
3 |
4 |
5 | def handle_nulls(function):
6 |     """
7 |     Decorator to return null if any of the input arguments are null.
8 |     """
9 | 
10 |     # functools.wraps preserves the wrapped function's name & docstring,
11 |     # which the docstring lookup at the bottom of __init__.py relies on
12 |     @functools.wraps(function)
13 |     def inner(*args, **kwargs):
14 |         if any(arg is None for arg in args):
15 |             return None
16 |         return function(*args, **kwargs)
17 | 
18 |     return inner
16 |
17 |
18 | def flatten(t):
19 | return [item for sublist in t for item in sublist]
20 |
21 |
22 | def densify(line, step):
23 | """
24 | Given a line segment, return another line segment with the same start & endpoints,
25 | and equally spaced sub-points based on `step` size.
26 |
27 | All the points on the new line are guaranteed to intersect with the original line,
28 | and the first and last points will be the same.
29 | """
30 |
31 | if line.length < step:
32 | return line
33 |
34 | length = line.length
35 | current_distance = step
36 | new_points = []
37 |
38 | # take actual first point
39 | new_points.append(line.interpolate(0.0, normalized=True))
40 |
41 | # add points between endpoints by step size
42 | while current_distance < length:
43 | new_points.append(line.interpolate(current_distance))
44 | current_distance += step
45 |
46 | # take actual last point
47 | new_points.append(line.interpolate(1.0, normalized=True))
48 |
49 | return LineString(new_points)
50 |
51 |
52 | def sanitize_types(value):
53 | """
54 | Casts values returned by H3 to native PySpark types.
55 |
56 | This is necessary because PySpark does not natively support
57 | all the types returned by H3, i.e. Python sets/tuples.
58 | """
59 |
60 |     if isinstance(value, (str, bool, int, float)):
61 |         return value
62 |     if isinstance(value, (set, tuple, list)):
63 |         return [sanitize_types(v) for v in value]
64 |     if isinstance(value, dict):
65 |         return {k: sanitize_types(v) for k, v in value.items()}
68 |
69 | return json.dumps(value)
70 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevinschaich/h3-pyspark/dffe8e1dea5d99b0b121cc9282f650a2a262ae72/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_core.py:
--------------------------------------------------------------------------------
1 | from inspect import getfullargspec
2 | from pyspark.sql import SparkSession, functions as F, types as T
3 | import h3
4 | import json
5 | import unittest
6 |
7 | from src import h3_pyspark
8 | from src.h3_pyspark.utils import sanitize_types
9 |
10 |
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 |
14 | # Generate some arbitrary test values
15 | latitude = 29.8988
16 | longitude = -89.998354
17 | integer = 1
18 | double = 0.5
19 | point = '{"type": "Point", "coordinates": [-89.998354, 29.8988]}'
20 | line = '{"type": "LineString", "coordinates": [[-89.99927146300001, 29.90139583899997], [-89.99921418299999, 29.90139420899999], [-89.99903129900002, 29.90138951699998], [-89.99900807, 29.90142210300002], [-89.99898608000001, 29.90138835699997], [-89.99875118300002, 29.90138410499998], [-89.99872961, 29.90141686999999], [-89.99871085699999, 29.90138346399999], [-89.99837947499998, 29.90137720600001], [-89.99835869700001, 29.90140975100002], [-89.99834035200001, 29.901376191], [-89.998234115, 29.90137350700002], [-89.998218017, 29.90137313499997], [-89.99819830400003, 29.90137344499999], [-89.99787396300002, 29.90139402699998], [-89.99785696700002, 29.90142557899998], [-89.99783514199999, 29.90139429700002]]}'
21 | polygon = '{"type": "Polygon", "coordinates": [[[-89.998354, 29.8988], [-89.99807, 29.8988], [-89.99807, 29.898628], [-89.998354, 29.898628], [-89.998354, 29.8988]]]}'
22 | h3_cell = "81447ffffffffff"
23 | h3_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff", "81483ffffffffff"]
24 | h3_edge = "131447ffffffffff"
25 | unit = "km^2"
26 |
27 |
28 | # Generate a dataframe from arbitrary test values (mapping function parameters to appropriate type)
29 | test_arg_map = {
30 | "i": integer,
31 | "j": integer,
32 | "k": integer,
33 | "x": integer,
34 | "resolution": integer,
35 | "res": integer,
36 | "lat": latitude,
37 | "lng": longitude,
38 | "point1": (latitude, longitude),
39 | "point2": (latitude, longitude),
40 | "h": h3_cells[0],
41 | "hexes": h3_cells,
42 | "h1": h3_cells[1],
43 | "h2": h3_cells[2],
44 | "origin": h3_cells[2],
45 | "destination": h3_cells[3],
46 | "start": h3_cells[1],
47 | "end": h3_cells[2],
48 | "e": h3_edge,
49 | "edge": h3_edge,
50 | "geo_json": True,
51 | "geo_json_conformant": True,
52 | "geojson": polygon,
53 | }
54 | df = spark.createDataFrame([test_arg_map])
55 |
56 |
57 | def get_test_args(function):
58 | argspec = getfullargspec(function)
59 | args = argspec.args
60 | h3_test_args = [test_arg_map.get(a.lower()) for a in args]
61 | h3_pyspark_test_args = [F.col(a) for a in args]
62 |
63 | return h3_test_args, h3_pyspark_test_args
64 |
65 |
66 | def sort(value):
67 |     if isinstance(value, (str, bool, int, float)):
68 |         return value
69 |     # Sets and tuples have no stable iteration order, so normalize them
70 |     # to sorted lists as well before comparing
71 |     if isinstance(value, (list, set, tuple)):
72 |         value = [sort(v) for v in value]
73 |         value.sort()
74 |         return value
75 |     if isinstance(value, dict):
76 |         return {k: sort(v) for k, v in value.items()}
77 |
78 |     return json.dumps(value)
79 |
80 |
81 | class TestCore(unittest.TestCase):
82 |
83 | ###############################################################################
84 | # Indexing
85 | ###############################################################################
86 |
87 | def test_geo_to_h3(self):
88 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.geo_to_h3)
89 |
90 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(*h3_pyspark_test_args))
91 | actual = actual.collect()[0]["actual"]
92 | expected = sanitize_types(h3.geo_to_h3(*h3_test_args))
93 | assert sort(actual) == sort(expected)
94 |
95 | def test_geo_to_h3_single_null_input(self):
96 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(100), F.lit(None), F.lit(9)))
97 | actual = actual.collect()[0]["actual"]
98 | expected = None
99 | assert actual == expected
100 |
101 | def test_geo_to_h3_all_null_inputs(self):
102 | actual = df.withColumn("actual", h3_pyspark.geo_to_h3(F.lit(None), F.lit(None), F.lit(None)))
103 | actual = actual.collect()[0]["actual"]
104 | expected = None
105 | assert actual == expected
106 |
107 | def test_h3_to_geo(self):
108 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo)
109 |
110 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo(*h3_pyspark_test_args))
111 | actual = actual.collect()[0]["actual"]
112 | expected = sanitize_types(h3.h3_to_geo(*h3_test_args))
113 | assert sort(actual) == sort(expected)
114 |
115 | def test_h3_to_geo_boundary(self):
116 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_geo_boundary)
117 |
118 | actual = df.withColumn("actual", h3_pyspark.h3_to_geo_boundary(*h3_pyspark_test_args))
119 | actual = actual.collect()[0]["actual"]
120 | expected = json.dumps({"type": "MultiPolygon", "coordinates": h3.h3_to_geo_boundary(*h3_test_args)})
121 | assert sort(actual) == sort(expected)
122 |
123 | ###############################################################################
124 | # Inspection
125 | ###############################################################################
126 |
127 | def test_h3_get_resolution(self):
128 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_resolution)
129 |
130 | actual = df.withColumn("actual", h3_pyspark.h3_get_resolution(*h3_pyspark_test_args))
131 | actual = actual.collect()[0]["actual"]
132 | expected = sanitize_types(h3.h3_get_resolution(*h3_test_args))
133 | assert sort(actual) == sort(expected)
134 |
135 | def test_h3_get_base_cell(self):
136 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_base_cell)
137 |
138 | actual = df.withColumn("actual", h3_pyspark.h3_get_base_cell(*h3_pyspark_test_args))
139 | actual = actual.collect()[0]["actual"]
140 | expected = sanitize_types(h3.h3_get_base_cell(*h3_test_args))
141 | assert sort(actual) == sort(expected)
142 |
143 | def test_string_to_h3(self):
144 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.string_to_h3)
145 |
146 | actual = df.withColumn("actual", h3_pyspark.string_to_h3(*h3_pyspark_test_args))
147 | actual = actual.collect()[0]["actual"]
148 | expected = sanitize_types(h3.string_to_h3(*h3_test_args))
149 | assert sort(actual) == sort(expected)
150 |
151 | def test_h3_to_string(self):
152 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_string)
153 |
154 | actual = df.withColumn("actual", h3_pyspark.h3_to_string(*h3_pyspark_test_args))
155 | actual = actual.collect()[0]["actual"]
156 | expected = sanitize_types(h3.h3_to_string(*h3_test_args))
157 | assert sort(actual) == sort(expected)
158 |
159 | def test_h3_is_valid(self):
160 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_valid)
161 |
162 | actual = df.withColumn("actual", h3_pyspark.h3_is_valid(*h3_pyspark_test_args))
163 | actual = actual.collect()[0]["actual"]
164 | expected = sanitize_types(h3.h3_is_valid(*h3_test_args))
165 | assert sort(actual) == sort(expected)
166 |
167 | def test_h3_is_res_class_III(self):
168 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_res_class_III)
169 |
170 | actual = df.withColumn("actual", h3_pyspark.h3_is_res_class_III(*h3_pyspark_test_args))
171 | actual = actual.collect()[0]["actual"]
172 | expected = sanitize_types(h3.h3_is_res_class_III(*h3_test_args))
173 | assert sort(actual) == sort(expected)
174 |
175 | def test_h3_is_pentagon(self):
176 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_is_pentagon)
177 |
178 | actual = df.withColumn("actual", h3_pyspark.h3_is_pentagon(*h3_pyspark_test_args))
179 | actual = actual.collect()[0]["actual"]
180 | expected = sanitize_types(h3.h3_is_pentagon(*h3_test_args))
181 | assert sort(actual) == sort(expected)
182 |
183 | def test_h3_get_faces(self):
184 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_get_faces)
185 |
186 | actual = df.withColumn("actual", h3_pyspark.h3_get_faces(*h3_pyspark_test_args))
187 | actual = actual.collect()[0]["actual"]
188 | expected = sanitize_types(h3.h3_get_faces(*h3_test_args))
189 | assert sort(actual) == sort(expected)
190 |
191 | ###############################################################################
192 | # Traversal
193 | ###############################################################################
194 |
195 | def test_k_ring(self):
196 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring)
197 |
198 | actual = df.withColumn("actual", h3_pyspark.k_ring(*h3_pyspark_test_args))
199 | actual = actual.collect()[0]["actual"]
200 | expected = sanitize_types(h3.k_ring(*h3_test_args))
201 | assert sort(actual) == sort(expected)
202 |
203 | def test_k_ring_distances(self):
204 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.k_ring_distances)
205 |
206 | actual = df.withColumn("actual", h3_pyspark.k_ring_distances(*h3_pyspark_test_args))
207 | actual = actual.collect()[0]["actual"]
208 | expected = sanitize_types(h3.k_ring_distances(*h3_test_args))
209 | assert sort(actual) == sort(expected)
210 |
211 | def test_hex_range(self):
212 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range)
213 |
214 | actual = df.withColumn("actual", h3_pyspark.hex_range(*h3_pyspark_test_args))
215 | actual = actual.collect()[0]["actual"]
216 | expected = sanitize_types(h3.hex_range(*h3_test_args))
217 | assert sort(actual) == sort(expected)
218 |
219 | def test_hex_range_distances(self):
220 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_range_distances)
221 |
222 | actual = df.withColumn("actual", h3_pyspark.hex_range_distances(*h3_pyspark_test_args))
223 | actual = actual.collect()[0]["actual"]
224 | expected = sanitize_types(h3.hex_range_distances(*h3_test_args))
225 | assert sort(actual) == sort(expected)
226 |
227 | def test_hex_ranges(self):
228 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ranges)
229 |
230 | actual = df.withColumn("actual", h3_pyspark.hex_ranges(*h3_pyspark_test_args))
231 | actual = actual.collect()[0]["actual"]
232 | expected = sanitize_types(h3.hex_ranges(*h3_test_args))
233 | assert sort(actual) == sort(expected)
234 |
235 | def test_hex_ring(self):
236 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_ring)
237 |
238 | actual = df.withColumn("actual", h3_pyspark.hex_ring(*h3_pyspark_test_args))
239 | actual = actual.collect()[0]["actual"]
240 | expected = sanitize_types(h3.hex_ring(*h3_test_args))
241 | assert sort(actual) == sort(expected)
242 |
243 | def test_h3_line(self):
244 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_line)
245 |
246 | actual = df.withColumn("actual", h3_pyspark.h3_line(*h3_pyspark_test_args))
247 | actual = actual.collect()[0]["actual"]
248 | expected = sanitize_types(h3.h3_line(*h3_test_args))
249 | assert sort(actual) == sort(expected)
250 |
251 | def test_h3_distance(self):
252 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_distance)
253 |
254 | actual = df.withColumn("actual", h3_pyspark.h3_distance(*h3_pyspark_test_args))
255 | actual = actual.collect()[0]["actual"]
256 | expected = sanitize_types(h3.h3_distance(*h3_test_args))
257 | assert sort(actual) == sort(expected)
258 |
259 | def test_experimental_h3_to_local_ij(self):
260 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_h3_to_local_ij)
261 |
262 | actual = df.withColumn("actual", h3_pyspark.experimental_h3_to_local_ij(*h3_pyspark_test_args))
263 | actual = actual.collect()[0]["actual"]
264 | expected = sanitize_types(h3.experimental_h3_to_local_ij(*h3_test_args))
265 | assert sort(actual) == sort(expected)
266 |
267 | def test_experimental_local_ij_to_h3(self):
268 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.experimental_local_ij_to_h3)
269 |
270 | actual = df.withColumn("actual", h3_pyspark.experimental_local_ij_to_h3(*h3_pyspark_test_args))
271 | actual = actual.collect()[0]["actual"]
272 | expected = sanitize_types(h3.experimental_local_ij_to_h3(*h3_test_args))
273 | assert sort(actual) == sort(expected)
274 |
275 | ###############################################################################
276 | # Hierarchy
277 | ###############################################################################
278 |
279 | def test_h3_to_parent(self):
280 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_parent)
281 |
282 | actual = df.withColumn("actual", h3_pyspark.h3_to_parent(*h3_pyspark_test_args))
283 | actual = actual.collect()[0]["actual"]
284 | expected = sanitize_types(h3.h3_to_parent(*h3_test_args))
285 | assert sort(actual) == sort(expected)
286 |
287 | def test_h3_to_children(self):
288 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_children)
289 |
290 | actual = df.withColumn("actual", h3_pyspark.h3_to_children(*h3_pyspark_test_args))
291 | actual = actual.collect()[0]["actual"]
292 | expected = sanitize_types(h3.h3_to_children(*h3_test_args))
293 | assert sort(actual) == sort(expected)
294 |
295 | def test_h3_to_center_child(self):
296 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_to_center_child)
297 |
298 | actual = df.withColumn("actual", h3_pyspark.h3_to_center_child(*h3_pyspark_test_args))
299 | actual = actual.collect()[0]["actual"]
300 | expected = sanitize_types(h3.h3_to_center_child(*h3_test_args))
301 | assert sort(actual) == sort(expected)
302 |
303 | def test_compact(self):
304 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.compact)
305 |
306 | actual = df.withColumn("actual", h3_pyspark.compact(*h3_pyspark_test_args))
307 | actual = actual.collect()[0]["actual"]
308 | expected = sanitize_types(h3.compact(*h3_test_args))
309 | assert sort(actual) == sort(expected)
310 |
311 | def test_uncompact(self):
312 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.uncompact)
313 |
314 | actual = df.withColumn("actual", h3_pyspark.uncompact(*h3_pyspark_test_args))
315 | actual = actual.collect()[0]["actual"]
316 | expected = sanitize_types(h3.uncompact(*h3_test_args))
317 | assert sort(actual) == sort(expected)
318 |
319 | ###############################################################################
320 | # Regions
321 | ###############################################################################
322 |
323 | def test_polyfill(self):
324 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.polyfill)
325 |
326 | actual = df.withColumn("actual", h3_pyspark.polyfill(*h3_pyspark_test_args))
327 | actual = actual.collect()[0]["actual"]
328 | expected = sanitize_types(h3.polyfill(json.loads(polygon), integer, True))
329 | assert sort(actual) == sort(expected)
330 |
331 | def test_h3_set_to_multi_polygon(self):
332 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_set_to_multi_polygon)
333 |
334 | actual = df.withColumn("actual", h3_pyspark.h3_set_to_multi_polygon(*h3_pyspark_test_args))
335 | actual = actual.collect()[0]["actual"]
336 |         expected = json.dumps(
337 | {"type": "MultiPolygon", "coordinates": h3.h3_set_to_multi_polygon(*h3_test_args)}
338 | )
339 | assert sort(actual) == sort(expected)
340 |
341 | ###############################################################################
342 | # Unidirectional Edges
343 | ###############################################################################
344 |
345 | def test_h3_indexes_are_neighbors(self):
346 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_indexes_are_neighbors)
347 |
348 | actual = df.withColumn("actual", h3_pyspark.h3_indexes_are_neighbors(*h3_pyspark_test_args))
349 | actual = actual.collect()[0]["actual"]
350 | expected = sanitize_types(h3.h3_indexes_are_neighbors(*h3_test_args))
351 | assert sort(actual) == sort(expected)
352 |
353 | def test_get_h3_unidirectional_edge(self):
354 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge)
355 |
356 | actual = df.withColumn("actual", h3_pyspark.get_h3_unidirectional_edge(*h3_pyspark_test_args))
357 | actual = actual.collect()[0]["actual"]
358 | expected = sanitize_types(h3.get_h3_unidirectional_edge(*h3_test_args))
359 | assert sort(actual) == sort(expected)
360 |
361 | def test_h3_unidirectional_edge_is_valid(self):
362 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.h3_unidirectional_edge_is_valid)
363 |
364 | actual = df.withColumn("actual", h3_pyspark.h3_unidirectional_edge_is_valid(*h3_pyspark_test_args))
365 | actual = actual.collect()[0]["actual"]
366 | expected = sanitize_types(h3.h3_unidirectional_edge_is_valid(*h3_test_args))
367 | assert sort(actual) == sort(expected)
368 |
369 | def test_get_origin_h3_index_from_unidirectional_edge(self):
370 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_origin_h3_index_from_unidirectional_edge)
371 |
372 | actual = df.withColumn(
373 | "actual",
374 | h3_pyspark.get_origin_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
375 | )
376 | actual = actual.collect()[0]["actual"]
377 | expected = sanitize_types(h3.get_origin_h3_index_from_unidirectional_edge(*h3_test_args))
378 | assert sort(actual) == sort(expected)
379 |
380 | def test_get_destination_h3_index_from_unidirectional_edge(self):
381 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_destination_h3_index_from_unidirectional_edge)
382 |
383 | actual = df.withColumn(
384 | "actual",
385 | h3_pyspark.get_destination_h3_index_from_unidirectional_edge(*h3_pyspark_test_args),
386 | )
387 | actual = actual.collect()[0]["actual"]
388 | expected = sanitize_types(h3.get_destination_h3_index_from_unidirectional_edge(*h3_test_args))
389 | assert sort(actual) == sort(expected)
390 |
391 | def test_get_h3_indexes_from_unidirectional_edge(self):
392 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_indexes_from_unidirectional_edge)
393 |
394 | actual = df.withColumn(
395 | "actual",
396 | h3_pyspark.get_h3_indexes_from_unidirectional_edge(*h3_pyspark_test_args),
397 | )
398 | actual = actual.collect()[0]["actual"]
399 | expected = sanitize_types(h3.get_h3_indexes_from_unidirectional_edge(*h3_test_args))
400 | assert sort(actual) == sort(expected)
401 |
402 | def test_get_h3_unidirectional_edges_from_hexagon(self):
403 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edges_from_hexagon)
404 |
405 | actual = df.withColumn(
406 | "actual",
407 | h3_pyspark.get_h3_unidirectional_edges_from_hexagon(*h3_pyspark_test_args),
408 | )
409 | actual = actual.collect()[0]["actual"]
410 | expected = sanitize_types(h3.get_h3_unidirectional_edges_from_hexagon(*h3_test_args))
411 | assert sort(actual) == sort(expected)
412 |
413 | def test_get_h3_unidirectional_edge_boundary(self):
414 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_h3_unidirectional_edge_boundary)
415 |
416 | actual = df.withColumn(
417 | "actual",
418 | h3_pyspark.get_h3_unidirectional_edge_boundary(*h3_pyspark_test_args),
419 | )
420 | actual = actual.collect()[0]["actual"]
421 | expected = sanitize_types(h3.get_h3_unidirectional_edge_boundary(*h3_test_args))
422 | assert sort(actual) == sort(expected)
423 |
424 | ###############################################################################
425 | # Miscellaneous
426 | ###############################################################################
427 |
428 | def test_hex_area(self):
429 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.hex_area)
430 |
431 | h3_test_args[-1] = "m^2"
432 | actual = df.withColumn("unit", F.lit("m^2"))
433 |
434 | actual = actual.withColumn("actual", h3_pyspark.hex_area(*h3_pyspark_test_args))
435 | actual = actual.collect()[0]["actual"]
436 | expected = sanitize_types(h3.hex_area(*h3_test_args))
437 | assert sort(actual) == sort(expected)
438 |
439 | def test_cell_area(self):
440 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.cell_area)
441 |
442 | h3_test_args[-1] = "m^2"
443 | actual = df.withColumn("unit", F.lit("m^2"))
444 |
445 | actual = actual.withColumn("actual", h3_pyspark.cell_area(*h3_pyspark_test_args))
446 | actual = actual.collect()[0]["actual"]
447 | expected = sanitize_types(h3.cell_area(*h3_test_args))
448 | assert sort(actual) == sort(expected)
449 |
450 | def test_edge_length(self):
451 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.edge_length)
452 |
453 | h3_test_args[-1] = "m"
454 | actual = df.withColumn("unit", F.lit("m"))
455 |
456 | actual = actual.withColumn("actual", h3_pyspark.edge_length(*h3_pyspark_test_args))
457 | actual = actual.collect()[0]["actual"]
458 | expected = sanitize_types(h3.edge_length(*h3_test_args))
459 | assert sort(actual) == sort(expected)
460 |
461 | def test_exact_edge_length(self):
462 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.exact_edge_length)
463 |
464 | h3_test_args[-1] = "m"
465 | actual = df.withColumn("unit", F.lit("m"))
466 |
467 | actual = actual.withColumn("actual", h3_pyspark.exact_edge_length(*h3_pyspark_test_args))
468 | actual = actual.collect()[0]["actual"]
469 | expected = sanitize_types(h3.exact_edge_length(*h3_test_args))
470 | assert sort(actual) == sort(expected)
471 |
472 | def test_num_hexagons(self):
473 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.num_hexagons)
474 |
475 | actual = df.withColumn("actual", h3_pyspark.num_hexagons(*h3_pyspark_test_args))
476 | actual = actual.collect()[0]["actual"]
477 | expected = sanitize_types(h3.num_hexagons(*h3_test_args))
478 | assert sort(actual) == sort(expected)
479 |
480 | def test_get_res0_indexes(self):
481 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_res0_indexes)
482 |
483 | actual = df.withColumn("actual", h3_pyspark.get_res0_indexes(*h3_pyspark_test_args))
484 | actual = actual.collect()[0]["actual"]
485 | expected = sanitize_types(h3.get_res0_indexes(*h3_test_args))
486 | assert sort(actual) == sort(expected)
487 |
488 | def test_get_pentagon_indexes(self):
489 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.get_pentagon_indexes)
490 |
491 | actual = df.withColumn("actual", h3_pyspark.get_pentagon_indexes(*h3_pyspark_test_args))
492 | actual = actual.collect()[0]["actual"]
493 | expected = sanitize_types(h3.get_pentagon_indexes(*h3_test_args))
494 | assert sort(actual) == sort(expected)
495 |
496 | def test_point_dist(self):
497 | h3_test_args, h3_pyspark_test_args = get_test_args(h3.point_dist)
498 |
499 | h3_test_args[-1] = "m"
500 | actual = df.withColumn("unit", F.lit("m"))
501 |
502 | actual = actual.withColumn("actual", h3_pyspark.point_dist(*h3_pyspark_test_args))
503 | actual = actual.collect()[0]["actual"]
504 | expected = sanitize_types(h3.point_dist(*h3_test_args))
505 | assert sort(actual) == sort(expected)
506 |
507 |
508 | if __name__ == "__main__":
509 | unittest.main()
510 |
--------------------------------------------------------------------------------
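The tests above all exercise the same column-in, column-out contract. A minimal end-to-end sketch of that usage (the DataFrame and column names here are illustrative, not part of the test suite):

    from pyspark.sql import SparkSession, functions as F
    from src import h3_pyspark

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(29.8988, -89.998354, 9)], ["lat", "lng", "resolution"])

    # Same call shape the tests verify: pass columns in, get an indexed column back
    df = df.withColumn("h3", h3_pyspark.geo_to_h3(F.col("lat"), F.col("lng"), F.col("resolution")))
    df.show(truncate=False)

--------------------------------------------------------------------------------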
/tests/test_coverage.py:
--------------------------------------------------------------------------------
1 | from inspect import getmembers, isfunction
2 | import h3
3 | import unittest
4 |
5 | from src import h3_pyspark
6 |
7 |
8 | blacklist = {"h3_is_res_class_iii", "polyfill_geojson", "versions", "polyfill_polygon"}
9 |
10 |
11 | class TestCoverage(unittest.TestCase):
12 | def test_geometry_coverage(self):
13 |         h3_functions = getmembers(h3, isfunction)
14 |         h3_functions = {x[0] for x in h3_functions if "__" not in x[0]}
15 |
16 |         h3_pyspark_functions = getmembers(h3_pyspark, isfunction)
17 |         h3_pyspark_functions = {x[0] for x in h3_pyspark_functions if "__" not in x[0]}
18 |
19 | self.assertEqual(h3_functions - blacklist - h3_pyspark_functions, set())
20 |
21 |
22 | if __name__ == "__main__":
23 | unittest.main()
24 |
--------------------------------------------------------------------------------
/tests/test_indexing.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from src.h3_pyspark import indexing
3 |
4 |
5 | point = '{ "type": "Point", "coordinates": [ -80.79617142677307, 32.131567579594716 ] }'
6 | line = '{ "type": "LineString", "coordinates": [ [ -80.79708337783813, 32.13510176661157 ], [ -80.79504489898682, 32.13510176661157 ], [ -80.79440116882324, 32.13550151179293 ], [ -80.79315662384033, 32.13535615011151 ], [ -80.79259872436523, 32.13470201967832 ], [ -80.79141855239868, 32.13292130751054 ] ] }'
7 | line2 = '{ "type": "LineString", "coordinates": [ [ -80.79768419265747, 32.13413873693519 ], [ -80.79171895980835, 32.132230817929354 ] ] }'
8 | polygon = '{ "type": "Polygon", "coordinates": [ [ [ -80.79427242279051, 32.132866795365196 ], [ -80.79128980636597, 32.132866795365196 ], [ -80.79128980636597, 32.13479287140789 ], [ -80.79427242279051, 32.13479287140789 ], [ -80.79427242279051, 32.132866795365196 ] ] ] }'
9 | polygon2 = '{ "type": "Polygon", "coordinates": [ [ [ -80.7916921377182, 32.13222627521743 ], [ -80.79402565956116, 32.135074511194496 ], [ -80.79768419265747, 32.13414327955186 ], [ -80.7916921377182, 32.13222627521743 ] ] ] }'
10 | multipoint = '{ "type": "MultiPoint", "coordinates":[ [ -80.7935643196106, 32.135755894178004 ], [ -80.79058170318604, 32.1330848437511 ]] }'
11 | multilinestring = '{ "type": "MultiLineString", "coordinates": [ [[ -80.7945728302002, 32.13577406432124 ], [ -80.79319953918457, 32.135010915189675 ]], [ [ -80.79257726669312, 32.13395703208247 ], [ -80.7915472984314, 32.13315752643055 ] ] ] }'
12 | multipolygon = '{ "type": "MultiPolygon", "coordinates": [ [ [ [ -80.79442262649536, 32.13522895845023 ], [ -80.79298496246338, 32.13522895845023 ], [ -80.79298496246338, 32.13602844594619 ], [ -80.79442262649536, 32.13602844594619 ], [ -80.79442262649536, 32.13522895845023 ] ] ], [ [ [ -80.7923412322998, 32.1330848437511 ], [ -80.79073190689087, 32.1330848437511 ], [ -80.79073190689087, 32.13375715632646 ], [ -80.7923412322998, 32.13375715632646 ], [ -80.7923412322998, 32.1330848437511 ] ] ] ] }'
13 |
14 |
15 | class TestIndexing(unittest.TestCase):
16 | def test_h3_index_point(self):
17 | actual = indexing._index_shape(point, 9)
18 | expected = ["8944d551007ffff"]
19 | assert set(actual) == set(expected)
20 |
21 | def test_h3_index_line(self):
22 | actual = indexing._index_shape(line, 9)
23 | expected = ["8944d551073ffff", "8944d551077ffff", "8944d55103bffff"]
24 | assert set(actual) == set(expected)
25 |
26 | def test_h3_index_line_2(self):
27 | actual = indexing._index_shape(line2, 9)
28 | expected = ["8944d551073ffff", "8944d55103bffff", "8944d55100fffff"]
29 | assert set(actual) == set(expected)
30 |
31 | def test_h3_index_polygon(self):
32 | actual = indexing._index_shape(polygon, 9)
33 | expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff"]
34 | assert set(actual) == set(expected)
35 |
36 | def test_h3_index_polygon2(self):
37 | actual = indexing._index_shape(polygon2, 9)
38 | expected = ["8944d551077ffff", "8944d55100fffff", "8944d551073ffff", "8944d55103bffff"]
39 | assert set(actual) == set(expected)
40 |
41 | def test_h3_index_multipoint(self):
42 | actual = indexing._index_shape(multipoint, 9)
43 | expected = ["8944d551077ffff", "8944d551073ffff"]
44 | assert set(actual) == set(expected)
45 |
46 | def test_h3_index_multiline(self):
47 | actual = indexing._index_shape(multilinestring, 9)
48 | expected = ["8944d551077ffff", "8944d551073ffff"]
49 | assert set(actual) == set(expected)
50 |
51 | def test_h3_index_multipolygon(self):
52 | actual = indexing._index_shape(multipolygon, 9)
53 | expected = ["8944d551077ffff", "8944d551073ffff"]
54 | assert set(actual) == set(expected)
55 |
56 |
57 | if __name__ == "__main__":
58 | unittest.main()
59 |
--------------------------------------------------------------------------------
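For reference, the private helper exercised above takes a raw GeoJSON string plus a resolution and returns the H3 cells covering the shape. A minimal sketch reusing the point fixture from these tests:

    from src.h3_pyspark import indexing

    point = '{ "type": "Point", "coordinates": [ -80.79617142677307, 32.131567579594716 ] }'
    cells = indexing._index_shape(point, 9)
    print(cells)  # ['8944d551007ffff'], matching test_h3_index_point above

--------------------------------------------------------------------------------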
/tests/test_traversal.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from src.h3_pyspark import traversal
4 |
5 |
6 | input_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff"]
7 | results = {
8 | # first cell results
9 | "81443ffffffffff",
10 | "8148bffffffffff",
11 | "8144fffffffffff",
12 | "81447ffffffffff",
13 | "8126fffffffffff",
14 | "81457ffffffffff",
15 | "81267ffffffffff",
16 | # second cell results
17 | "81263ffffffffff",
18 | "81277ffffffffff",
19 | "812abffffffffff",
20 | "8144fffffffffff",
21 | "81447ffffffffff",
22 | "8126fffffffffff",
23 | "81267ffffffffff",
24 | # third cell results
25 | "8149bffffffffff",
26 | "8148bffffffffff",
27 | "8148fffffffffff",
28 | "81483ffffffffff",
29 | "81447ffffffffff",
30 | "8126fffffffffff",
31 | "81457ffffffffff",
32 | }
33 |
34 |
35 | class TestTraversal(unittest.TestCase):
36 | def test_k_ring(self):
37 | actual = traversal._k_ring_distinct(input_cells)
38 | assert set(actual) == set(results)
39 |
40 |
41 | if __name__ == "__main__":
42 | unittest.main()
43 |
--------------------------------------------------------------------------------
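The expected set in this test is simply the union of each input cell's k-ring, with shared neighbors deduplicated. An equivalent pure-h3 sketch (assuming _k_ring_distinct uses a ring distance of 1, which is what the fixture implies):

    import h3

    input_cells = ["81447ffffffffff", "81267ffffffffff", "8148bffffffffff"]

    # Union of each cell's 1-ring; overlapping neighbors collapse in the set
    expected = set().union(*(h3.k_ring(cell, 1) for cell in input_cells))

--------------------------------------------------------------------------------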