element.
173 | # For black navbar, do "navbar navbar-inverse"
174 | #'navbar_class': "navbar navbar-inverse",
175 | # Fix navigation bar to top of page?
176 | # Values: "true" (default) or "false"
177 | "navbar_fixed_top": "true",
178 | # Location of link to source.
179 | # Options are "nav" (default), "footer" or anything else to exclude.
180 | "source_link_position": "footer",
181 | # Bootswatch (http://bootswatch.com/) theme.
182 | #
183 | # Options are nothing (default) or the name of a valid theme
184 | # such as "amelia", "cosmo", "yeti", or "flatly".
185 | "bootswatch_theme": "yeti",
186 | # Choose Bootstrap version.
187 | # Values: "3" (default) or "2" (in quotes)
188 | "bootstrap_version": "3",
189 | "navbar_links": [
190 | # ("Gallery", "auto_examples/index"),
191 | ("Installation", "installation"),
192 | ("Tutorial", "tutorial"),
193 | ("API", "api"),
194 | ("References", "references"),
195 | ],
196 | }
197 |
198 | # Add any paths that contain custom static files (such as style sheets) here,
199 | # relative to this directory. They are copied after the builtin static files,
200 | # so a file named "default.css" will overwrite the builtin "default.css".
201 | html_static_path = ["_static"]
202 |
203 | # Custom sidebar templates, maps document names to template names.
204 | # html_sidebars = {}
205 | # html_sidebars = {'sidebar': ['localtoc.html', 'sourcelink.html', 'searchbox.html']}
206 |
207 | # -- Options for HTMLHelp output ------------------------------------------
208 |
209 | # Output file base name for HTML help builder.
210 | htmlhelp_basename = "toblerdoc"
211 |
212 |
213 | # -- Options for LaTeX output ---------------------------------------------
214 |
215 | latex_elements = {
216 | # The paper size ('letterpaper' or 'a4paper').
217 | #
218 | # 'papersize': 'letterpaper',
219 | # The font size ('10pt', '11pt' or '12pt').
220 | #
221 | # 'pointsize': '10pt',
222 | # Additional stuff for the LaTeX preamble.
223 | #
224 | # 'preamble': '',
225 | # Latex figure (float) alignment
226 | #
227 | # 'figure_align': 'htbp',
228 | }
229 |
230 | # Grouping the document tree into LaTeX files. List of tuples
231 | # (source start file, target name, title,
232 | # author, documentclass [howto, manual, or own class]).
233 | latex_documents = [
234 | (
235 | master_doc,
236 | "toblerdoc.tex",
237 | u"tobler Documentation",
238 | u"tobler developers",
239 | "manual",
240 | )
241 | ]
242 |
243 |
244 | # -- Options for manual page output ---------------------------------------
245 |
246 | # One entry per manual page. List of tuples
247 | # (source start file, name, description, authors, manual section).
248 | man_pages = [(master_doc, "tobler", u"tobler Documentation", [author], 1)]
249 |
250 |
251 | # -- Options for Texinfo output -------------------------------------------
252 |
253 | # Grouping the document tree into Texinfo files. List of tuples
254 | # (source start file, target name, title, author,
255 | # dir menu entry, description, category)
256 | texinfo_documents = [
257 | (
258 | master_doc,
259 | "tobler",
260 | u"tobler Documentation",
261 | author,
262 | "tobler",
263 | "One line description of project.",
264 | "Miscellaneous",
265 | )
266 | ]
267 |
268 | # -----------------------------------------------------------------------------
269 | # Napoleon configuration
270 | # -----------------------------------------------------------------------------
271 | # numpydoc_show_class_members = True
272 | # numpydoc_class_members_toctree = False
273 | #
274 | # napoleon_use_ivar = True
275 |
276 | # -----------------------------------------------------------------------------
277 | # Autosummary
278 | # -----------------------------------------------------------------------------
279 |
280 | # Generate the API documentation when building
281 | autosummary_generate = True
282 | numpydoc_show_class_members = False
283 | numpydoc_class_members_toctree = True
284 | numpydoc_show_inherited_class_members = True
285 | numpydoc_use_plots = True
286 | numpydoc_xref_param_type = True
287 | # display the source code for Plot directive
288 | plot_include_source = True
289 |
290 |
291 | # Example configuration for intersphinx: refer to the Python standard library.
292 | intersphinx_mapping = {
293 | 'python': ('https://docs.python.org/3/', None),
294 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None),
295 | 'numpy': ('https://docs.scipy.org/doc/numpy', None),
296 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
297 | 'geopandas': ('https://geopandas.readthedocs.io/en/latest/', None),
298 | 'sklearn': ('https://scikit-learn.org/stable/', None),
299 | 'giddy': ('https://giddy.readthedocs.io/en/latest/', None),
300 | 'libpysal': ('https://pysal.org/libpysal/', None),
301 | 'esda': ('https://pysal.org/esda/', None),
302 | 'region': ('https://region.readthedocs.io/en/latest/', None),
303 | 'hdbscan': ('https://hdbscan.readthedocs.io/en/latest/', None)
304 |
305 | }
306 |
307 | numpydoc_xref_ignore = {'type', 'optional', 'default', 'shape', 'fitted', 'instance',
308 | 'cluster', 'of', 'or', 'if', 'using', 'otherwise', 'required',
309 | 'from'}
310 |
311 |
--------------------------------------------------------------------------------
/docs/figs/nsf_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/nsf_logo.jpg
--------------------------------------------------------------------------------
/docs/figs/raster_lattice_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/raster_lattice_example.png
--------------------------------------------------------------------------------
/docs/figs/tobler_long.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/tobler_long.png
--------------------------------------------------------------------------------
/docs/figs/tobler_long.svg:
--------------------------------------------------------------------------------
1 |
2 |
433 |
--------------------------------------------------------------------------------
/docs/figs/toy_census_tracts_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/toy_census_tracts_example.png
--------------------------------------------------------------------------------
/docs/figs/toy_census_tracts_example_old.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/toy_census_tracts_example_old.png
--------------------------------------------------------------------------------
/docs/figs/waldo_travel.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pysal/tobler/73df0d617183bd46da7a2c254d2a595404c7cde0/docs/figs/waldo_travel.jpg
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: figs/tobler_long.png
2 | :align: center
3 | :height: 200px
4 | :width: 425px
5 | :alt: tobler
6 |
7 |
8 | .. raw:: html
9 |
10 |
38 |
39 |
40 |
41 |
42 |
43 | Tobler
44 | ```````````````````````````````````````````````````````````
45 |
46 | :code:`tobler` is a library for areal interpolation and dasymetric mapping. The name is an homage to the legendary geographer `Waldo Tobler`_. It is a subpackage of `PySAL`_ (Python Spatial Analysis Library), and is under active `development`_.
47 |
48 |
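A minimal usage sketch is shown below (the ``source`` and ``target`` GeoDataFrames
and the ``population`` column are illustrative placeholders; both layers are assumed
to share the same coordinate reference system):

.. code-block:: python

    from tobler.area_weighted import area_interpolate

    # transfer a count ("extensive") variable from source onto target geometries
    estimates = area_interpolate(
        source_df=source,
        target_df=target,
        extensive_variables=["population"],
    )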
49 |
50 | .. toctree::
51 | :hidden:
52 | :maxdepth: 3
53 | :caption: Contents:
54 |
55 | Installation
56 | Tutorial
57 | API
58 | References
59 |
60 |
61 |
62 | .. _PySAL: https://pysal.org
63 | .. _development: https://github.com/pysal/tobler
64 | .. _Waldo Tobler: https://en.wikipedia.org/wiki/Waldo_R._Tobler
65 |
66 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. Installation
2 |
3 | .. highlight:: rst
4 |
5 | .. role:: python(code)
6 | :language: python
7 |
8 |
9 | Installation
10 | ===============
11 |
12 | tobler supports Python `3.6`_ and `3.7`_ only. Please make sure that you are
13 | operating in a Python 3 environment.
14 |
15 | Installing a released version
16 | ------------------------------
17 | ``tobler`` is available on both conda and pip, and can be installed with either
18 |
19 | .. code-block:: bash
20 |
21 | conda install -c conda-forge tobler
22 |
23 | or
24 |
25 | .. code-block:: bash
26 |
27 | pip install tobler
28 |
29 |
30 | Installing a development from source
31 | -------------------------------------
32 | For working with a development version, we recommend `anaconda`_. To get started, clone this repository or download it manually then ``cd`` into the directory and run the following commands:
33 |
34 | .. code-block:: bash
35 |
36 | conda env create -f environment.yml
37 | source activate tobler
38 | python setup.py develop
39 |
40 | You can also `fork`_ the `pysal/tobler`_ repo and create a local clone of
41 | your fork. By making changes
42 | to your local clone and submitting a pull request to `pysal/tobler`_, you can
43 | contribute to the tobler development.
44 |
45 | .. _3.6: https://docs.python.org/3.6/
46 | .. _3.7: https://docs.python.org/3.7/
47 | .. _Python Package Index: https://pypi.org/pysal/tobler/
48 | .. _pysal/tobler: https://github.com/pysal/tobler
49 | .. _fork: https://help.github.com/articles/fork-a-repo/
50 | .. _anaconda: https://www.anaconda.com/download/
51 |
--------------------------------------------------------------------------------
/docs/notebooks/01_interpolation_methods_overview.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/01_interpolation_methods_overview.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/notebooks/02_areal_interpolation_example.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/02_areal_interpolation_example.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/notebooks/03_areal_interpolation_details.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/03_areal_interpolation_details.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/notebooks/binary_dasymetric.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/binary_dasymetric.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/notebooks/census_to_hexgrid.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/census_to_hexgrid.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/notebooks/extract_urban_areas.nblink:
--------------------------------------------------------------------------------
1 | {
2 | "path": "../../notebooks/extract_urban_areas.ipynb"
3 | }
4 |
--------------------------------------------------------------------------------
/docs/references.rst:
--------------------------------------------------------------------------------
1 | .. reference for the docs
2 |
3 | References
4 | ==========
5 |
6 | .. bibliography:: _static/references.bib
7 | :all:
8 |
--------------------------------------------------------------------------------
/docs/tutorial.rst:
--------------------------------------------------------------------------------
1 | Tobler Tutorial
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 1
6 | :caption: Contents:
7 |
8 | notebooks/01_interpolation_methods_overview
9 | notebooks/02_areal_interpolation_example
10 | notebooks/03_areal_interpolation_details
11 | notebooks/census_to_hexgrid
12 | notebooks/extract_urban_areas
13 | notebooks/binary_dasymetric
14 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: tobler
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - dask-geopandas
6 | - dask
7 | - jupyterlab
8 | - numpy
9 | - geopandas >=0.13
10 | - pandas
12 | - rasterio
13 | - rasterstats
14 | - statsmodels
15 | - scikit-learn
16 | - scipy
17 | - libpysal
18 | - tqdm
19 | - pip
20 | - mapclassify
21 | - descartes
22 | - joblib
23 |
--------------------------------------------------------------------------------
/notebooks/04_area_interpolate_dask.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "e3f2586a-5b6a-4d46-b6e8-1991ae3bec6f",
6 | "metadata": {},
7 | "source": [
8 | "# (Distributed) areal interpolation"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "00f875bd-2714-4551-b10c-1ef3f514478d",
14 | "metadata": {},
15 | "source": [
16 | "In this notebook, we compare the single-core version in `tobler.area_weighted.area_interpolate` with the distributed version in `tobler.area_weighted.area_interpolate_dask`. "
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "id": "4084f715-3989-4424-943a-2a4066a8bcf2",
23 | "metadata": {
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import os\n",
29 | "os.environ['USE_PYGEOS'] = '1'\n",
30 | "\n",
31 | "import pandas\n",
32 | "import geopandas\n",
33 | "import dask_geopandas\n",
34 | "import tobler\n",
35 | "from libpysal.examples import load_example\n",
36 | "import numpy as np\n",
37 | "\n",
38 | "from dask.distributed import Client, LocalCluster"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "id": "d16a2e15-866b-407d-b65d-54a675aefbd7",
44 | "metadata": {},
45 | "source": [
46 | "## Setup"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "id": "080369e7-f3d4-41c6-a629-12ed458eb743",
52 | "metadata": {},
53 | "source": [
54 | "Load example data from `pysal`:"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "id": "cb395dc5-67f2-462e-a1cf-919c8e6d0ae8",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "c1 = load_example('Charleston1')\n",
65 | "c2 = load_example('Charleston2')\n",
66 | "\n",
67 | "crs = 6569 # https://epsg.io/6569\n",
68 | "\n",
69 | "tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n",
70 | "zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "id": "1d11c1d7-6435-40cb-a4d4-851f63eccf01",
76 | "metadata": {},
77 | "source": [
78 | "We make up a categorical variable with four classes distributed randomly across the dataset:"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 3,
84 | "id": "3543702f-5e8a-4336-a14d-19a4eeb77b1b",
85 | "metadata": {
86 | "tags": []
87 | },
88 | "outputs": [],
89 | "source": [
90 | "rng = np.random.default_rng(seed=42)\n",
91 | "\n",
92 | "tracts['rando'] = pandas.Series(\n",
93 | " rng.integers(0, 4, len(tracts)), dtype='category'\n",
94 | ")"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae",
100 | "metadata": {},
101 | "source": [
102 | "We will set up a local Dask cluster so you can follow the computations on the dashboard (`http://localhost:8787` by default):"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 2,
108 | "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
109 | "metadata": {
110 | "tags": []
111 | },
112 | "outputs": [],
113 | "source": [
114 | "client = Client(LocalCluster(n_workers=8))"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880",
120 | "metadata": {},
121 | "source": [
122 | "Finally, for Dask, we need to provide `dask_geopandas.GeoDataFrame` objects with spatial partitions and categorical variables properly set up:"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 7,
128 | "id": "a31a1a91-4071-40e2-a21f-7e035d734976",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "dtracts = (\n",
133 | " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n",
134 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
135 | ")\n",
136 | "\n",
137 | "dzips = (\n",
138 | " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n",
139 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
140 | ")"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "id": "54f986ec-ea46-479e-aed8-5edeeaf16fda",
146 | "metadata": {},
147 | "source": [
148 | "---\n",
149 | "\n",
150 | "**IMPORTANT** - At this point, only *categorical* variables are implemented, so those are what we will test.\n",
151 | "\n",
152 | "---"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6",
158 | "metadata": {
159 | "tags": []
160 | },
161 | "source": [
162 | "## Correctness"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "id": "92dafb11-ec94-43c2-baec-2a5e2a0b380d",
168 | "metadata": {},
169 | "source": [
170 | "- Single core"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 8,
176 | "id": "4d4cde6d-73c1-4197-86ed-131724e21296",
177 | "metadata": {
178 | "tags": []
179 | },
180 | "outputs": [],
181 | "source": [
182 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
183 | " tracts, zip_codes, categorical_variables=['rando']\n",
184 | ")"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "id": "2982d8dc-c1e9-4927-8643-9900b1b09890",
190 | "metadata": {},
191 | "source": [
192 | "- Dask"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 9,
198 | "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5",
199 | "metadata": {
200 | "tags": []
201 | },
202 | "outputs": [],
203 | "source": [
204 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
205 | " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
206 | ").compute()"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "id": "5e19b8dd-505f-4dc1-ba85-9fd825e59b43",
212 | "metadata": {},
213 | "source": [
214 | "And we can check that both results are the same:"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 8,
220 | "id": "81de5e35-f3b6-4567-86b1-36d98583dca0",
221 | "metadata": {
222 | "tags": []
223 | },
224 | "outputs": [
225 | {
226 | "data": {
227 | "text/plain": [
228 | "rando_0 4.188295e-08\n",
229 | "rando_1 5.328575e-08\n",
230 | "rando_2 5.396667e-08\n",
231 | "rando_3 2.935173e-08\n",
232 | "dtype: float64"
233 | ]
234 | },
235 | "execution_count": 8,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "a = (\n",
242 | " cat_dk\n",
243 | " .set_index('ZIP')\n",
244 | " .reindex(zip_codes['ZIP'].values)\n",
245 | " .drop(columns='geometry')\n",
246 | ")\n",
247 | "\n",
248 | "b = (\n",
249 | " cat_sc\n",
250 | " .drop(columns='geometry')\n",
251 | " [['rando_0', 'rando_1', 'rando_2', 'rando_3']]\n",
252 | ")\n",
253 | "b.index = a.index\n",
254 | "\n",
255 | "(a - b).max()"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "e2e04df1-3331-449c-b74c-e910239c3067",
261 | "metadata": {},
262 | "source": [
263 | "The differences in the estimates for the proportions of each area start at the 8th decimal, and are thus likely rounding errors arising from the different approaches used to compute the interpolation (the single-core version does it in one shot, while Dask computes parts and combines them later with a sum)."
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "id": "1debbdf4-892f-4fda-834a-0403595794ef",
269 | "metadata": {
270 | "tags": []
271 | },
272 | "source": [
273 | "## Performance\n",
274 | "\n",
275 | "---\n",
276 | "\n",
277 | "**NOTE** - Timings below do _not_ include the computation time required for spatial shuffling and partitioning (which can be substantial with large datasets), or for converting from `geopandas`. These are \"sunk costs\" that only make this approach preferable with large datasets, although they can be computed once and the result stored on disk efficiently (e.g., as Parquet files). That said, \"large\" does not mean very large in modern terms: from a few thousand observations onward, the gains will be substantial if several cores/workers are available.\n",
278 | "\n",
279 | "---"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "id": "e5242c13-c4cd-46e2-9131-ec1734bcc142",
285 | "metadata": {},
286 | "source": [
287 | "We can now time the example above:\n"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 12,
293 | "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8",
294 | "metadata": {
295 | "tags": []
296 | },
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "85 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "%%timeit\n",
308 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
309 | " tracts, zip_codes, categorical_variables=['rando']\n",
310 | ")"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 13,
316 | "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e",
317 | "metadata": {
318 | "tags": []
319 | },
320 | "outputs": [
321 | {
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "1.41 s ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
326 | ]
327 | }
328 | ],
329 | "source": [
330 | "%%timeit\n",
331 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
332 | " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
333 | ").compute()"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "a124ee86-c527-4386-be8d-2dc833270fd9",
339 | "metadata": {},
340 | "source": [
341 | "This is notably slower (more than an order of magnitude on the timings above). For such a small dataset, the overhead of distributing computations and collecting the results outweighs any gains from parallelism.\n",
342 | "\n",
343 | "Now we can artificially increase the size of the datasets by concatenating them several times and re-computing (this time we only time one execution):"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 17,
349 | "id": "5f56d579-0022-45c2-845c-f351bf96ed01",
350 | "metadata": {
351 | "tags": []
352 | },
353 | "outputs": [
354 | {
355 | "name": "stdout",
356 | "output_type": "stream",
357 | "text": [
358 | "40x increase | N. tracts: 4680 | N. ZIPs: 1680\n"
359 | ]
360 | },
361 | {
362 | "name": "stderr",
363 | "output_type": "stream",
364 | "text": [
365 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n",
366 | "This may cause some slowdown.\n",
367 | "Consider scattering data ahead of time and using futures.\n",
368 | " warnings.warn(\n",
369 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n",
370 | "This may cause some slowdown.\n",
371 | "Consider scattering data ahead of time and using futures.\n",
372 | " warnings.warn(\n"
373 | ]
374 | }
375 | ],
376 | "source": [
377 | "sizeup = 40\n",
378 | "tracts_lrg = pandas.concat([tracts] * sizeup)\n",
379 | "zips_lrg = pandas.concat([zip_codes] * sizeup)\n",
380 | "print(\n",
381 | " f'{sizeup}x increase | N. tracts: {len(tracts_lrg)} | N. ZIPs: {len(zips_lrg)}'\n",
382 | ")\n",
383 | "\n",
384 | "dtracts_lrg = (\n",
385 | " dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=500)\n",
386 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
387 | ")\n",
388 | "\n",
389 | "dzips_lrg = (\n",
390 | " dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=500)\n",
391 | " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
392 | ")"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "id": "e5187109-ba95-4b5f-b373-2ec4745d0289",
398 | "metadata": {},
399 | "source": [
400 | "And re-compute the timings:"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "id": "c0da372a-f791-47fb-ade0-317a1cf6ff9c",
406 | "metadata": {
407 | "jp-MarkdownHeadingCollapsed": true,
408 | "tags": []
409 | },
410 | "source": [
411 | "---\n",
412 | "\n",
413 | "### 10x"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 14,
419 | "id": "620cf9ab-7b9e-4458-809c-c7a73d13f26c",
420 | "metadata": {
421 | "tags": []
422 | },
423 | "outputs": [
424 | {
425 | "name": "stdout",
426 | "output_type": "stream",
427 | "text": [
428 | "Computing for a sizeup of 10x\n",
429 | "CPU times: user 7.21 s, sys: 11.3 ms, total: 7.23 s\n",
430 | "Wall time: 6.95 s\n"
431 | ]
432 | }
433 | ],
434 | "source": [
435 | "%%time\n",
436 | "print(f'Computing for a sizeup of {sizeup}x')\n",
437 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
438 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
439 | ")"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 16,
445 | "id": "c615b27a-e004-429b-a0c5-e4b237516f9f",
446 | "metadata": {
447 | "tags": []
448 | },
449 | "outputs": [
450 | {
451 | "name": "stdout",
452 | "output_type": "stream",
453 | "text": [
454 | "Computing for a sizeup of 10x\n",
455 | "CPU times: user 548 ms, sys: 18 ms, total: 566 ms\n",
456 | "Wall time: 3.56 s\n"
457 | ]
458 | }
459 | ],
460 | "source": [
461 | "%%time\n",
462 | "print(f'Computing for a sizeup of {sizeup}x')\n",
463 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
464 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
465 | ").compute()"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "id": "cc13af25-e97e-4b34-bb1f-bb946c15748e",
471 | "metadata": {
472 | "jp-MarkdownHeadingCollapsed": true,
473 | "tags": []
474 | },
475 | "source": [
476 | "---\n",
477 | "\n",
478 | "### 20x"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 18,
484 | "id": "8dbb40d4-4b3b-446d-9d1b-99462a122d6e",
485 | "metadata": {
486 | "tags": []
487 | },
488 | "outputs": [
489 | {
490 | "name": "stdout",
491 | "output_type": "stream",
492 | "text": [
493 | "Computing for a sizeup of 20x\n",
494 | "CPU times: user 28.6 s, sys: 26.1 ms, total: 28.7 s\n",
495 | "Wall time: 27.6 s\n"
496 | ]
497 | }
498 | ],
499 | "source": [
500 | "%%time\n",
501 | "print(f'Computing for a sizeup of {sizeup}x')\n",
502 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
503 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
504 | ")"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 24,
510 | "id": "f2ca1394-5f8d-428f-a61c-87beb8778322",
511 | "metadata": {
512 | "tags": []
513 | },
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "Computing for a sizeup of 20x\n"
520 | ]
521 | },
522 | {
523 | "name": "stderr",
524 | "output_type": "stream",
525 | "text": [
526 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 16.77 MiB.\n",
527 | "This may cause some slowdown.\n",
528 | "Consider scattering data ahead of time and using futures.\n",
529 | " warnings.warn(\n"
530 | ]
531 | },
532 | {
533 | "name": "stdout",
534 | "output_type": "stream",
535 | "text": [
536 | "CPU times: user 1.32 s, sys: 65.3 ms, total: 1.38 s\n",
537 | "Wall time: 9.86 s\n"
538 | ]
539 | }
540 | ],
541 | "source": [
542 | "%%time\n",
543 | "print(f'Computing for a sizeup of {sizeup}x')\n",
544 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
545 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
546 | ").compute()"
547 | ]
548 | },
549 | {
550 | "cell_type": "markdown",
551 | "id": "335b34b4-9fea-48a6-b38b-8b1a5d755ca1",
552 | "metadata": {
553 | "jp-MarkdownHeadingCollapsed": true,
554 | "tags": []
555 | },
556 | "source": [
557 | "---\n",
558 | "\n",
559 | "### 30x"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 26,
565 | "id": "1598ce3f-d21e-4a60-9619-ee5b1eb4932f",
566 | "metadata": {
567 | "tags": []
568 | },
569 | "outputs": [
570 | {
571 | "name": "stdout",
572 | "output_type": "stream",
573 | "text": [
574 | "Computing for a sizeup of 30x\n",
575 | "CPU times: user 1min 4s, sys: 176 ms, total: 1min 4s\n",
576 | "Wall time: 1min 1s\n"
577 | ]
578 | }
579 | ],
580 | "source": [
581 | "%%time\n",
582 | "print(f'Computing for a sizeup of {sizeup}x')\n",
583 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
584 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
585 | ")"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 7,
591 | "id": "224ffbca-7690-4b20-bad2-efbf042623a9",
592 | "metadata": {
593 | "tags": []
594 | },
595 | "outputs": [
596 | {
597 | "name": "stdout",
598 | "output_type": "stream",
599 | "text": [
600 | "Computing for a sizeup of 30x\n"
601 | ]
602 | },
603 | {
604 | "name": "stderr",
605 | "output_type": "stream",
606 | "text": [
607 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 25.14 MiB.\n",
608 | "This may cause some slowdown.\n",
609 | "Consider scattering data ahead of time and using futures.\n",
610 | " warnings.warn(\n"
611 | ]
612 | },
613 | {
614 | "name": "stdout",
615 | "output_type": "stream",
616 | "text": [
617 | "CPU times: user 1.91 s, sys: 58.8 ms, total: 1.97 s\n",
618 | "Wall time: 14.6 s\n"
619 | ]
620 | }
621 | ],
622 | "source": [
623 | "%%time\n",
624 | "print(f'Computing for a sizeup of {sizeup}x')\n",
625 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
626 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
627 | ").compute()"
628 | ]
629 | },
630 | {
631 | "cell_type": "markdown",
632 | "id": "b004834f-c5ce-4f92-be9a-364a07c7996b",
633 | "metadata": {
634 | "tags": []
635 | },
636 | "source": [
637 | "---\n",
638 | "\n",
639 | "### 40x"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 17,
645 | "id": "b6b9d06a-9034-4c39-b3a9-92fc6408d5c6",
646 | "metadata": {
647 | "tags": []
648 | },
649 | "outputs": [
650 | {
651 | "name": "stdout",
652 | "output_type": "stream",
653 | "text": [
654 | "Computing for a sizeup of 40x\n",
655 | "CPU times: user 2min 2s, sys: 1.71 s, total: 2min 3s\n",
656 | "Wall time: 1min 53s\n"
657 | ]
658 | }
659 | ],
660 | "source": [
661 | "%%time\n",
662 | "print(f'Computing for a sizeup of {sizeup}x')\n",
663 | "cat_sc = tobler.area_weighted.area_interpolate(\n",
664 | " tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
665 | ")"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 18,
671 | "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28",
672 | "metadata": {
673 | "tags": []
674 | },
675 | "outputs": [
676 | {
677 | "name": "stdout",
678 | "output_type": "stream",
679 | "text": [
680 | "Computing for a sizeup of 40x\n"
681 | ]
682 | },
683 | {
684 | "name": "stderr",
685 | "output_type": "stream",
686 | "text": [
687 | "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 33.52 MiB.\n",
688 | "This may cause some slowdown.\n",
689 | "Consider scattering data ahead of time and using futures.\n",
690 | " warnings.warn(\n"
691 | ]
692 | },
693 | {
694 | "name": "stdout",
695 | "output_type": "stream",
696 | "text": [
697 | "CPU times: user 6.99 s, sys: 512 ms, total: 7.5 s\n",
698 | "Wall time: 30.5 s\n"
699 | ]
700 | }
701 | ],
702 | "source": [
703 | "%%time\n",
704 | "print(f'Computing for a sizeup of {sizeup}x')\n",
705 | "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
706 | " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
707 | ").compute()"
708 | ]
709 | }
710 | ],
711 | "metadata": {
712 | "kernelspec": {
713 | "display_name": "tobler",
714 | "language": "python",
715 | "name": "tobler"
716 | },
717 | "language_info": {
718 | "codemirror_mode": {
719 | "name": "ipython",
720 | "version": 3
721 | },
722 | "file_extension": ".py",
723 | "mimetype": "text/x-python",
724 | "name": "python",
725 | "nbconvert_exporter": "python",
726 | "pygments_lexer": "ipython3",
727 | "version": "3.11.4"
728 | }
729 | },
730 | "nbformat": 4,
731 | "nbformat_minor": 5
732 | }
733 |
--------------------------------------------------------------------------------
/notebooks/area_interpolate_perf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# `numba` tests\n",
8 | "\n",
9 | "This notebook documents and serves as a scratchpad for exploring `numba`-based acceleration of areal interpolation.\n",
10 | "\n",
11 | "**NOTE** - To be removed/relocated once/if functionality is merged\n",
12 | "\n",
13 | "---\n",
14 | "\n",
15 | "**IMPORTANT**\n",
16 | "\n",
17 | "As of December 17, 2020, the multi-core implementation requires the development (`main`) versions of `pygeos` and `geopandas`. On a working environment with the latest released versions (such as `gds_env:5.0`), this can be achieved by:\n",
18 | "\n",
19 | "```shell\n",
20 | "pip install --no-deps git+https://github.com/pygeos/pygeos.git\n",
21 | "pip install --no-deps git+https://github.com/geopandas/geopandas.git\n",
22 | "```\n",
23 | "\n",
24 | "---"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "from tobler.area_weighted.area_interpolate import _area_tables_binning, _area_tables_binning_parallel\n",
34 | "import geopandas, pandas\n",
35 | "\n",
36 | "summary = lambda src, tgt: print(\n",
37 | " f\"Transfer {src.shape[0]} polygons into {tgt.shape[0]}\"\n",
38 | ")\n",
39 | "\n",
40 | "def down_load(p):\n",
41 | " fn = f\"/home/jovyan/{p.split('/')[0]}\"\n",
42 | " try:\n",
43 | " return geopandas.read_file(fn)\n",
44 | " except:\n",
45 | " ! wget $p -O $fn\n",
46 | " return geopandas.read_file(fn)"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Data setup"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "- Minimal problem"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 2,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "Transfer 628 polygons into 628\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "p = (\"https://geographicdata.science/book/_downloads/\"\\\n",
78 | " \"f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg\")\n",
79 | "src = down_load(p).rename(lambda i: 'i'+str(i))\n",
80 | "\n",
81 | "p = (\"https://geographicdata.science/book/_downloads/\"\\\n",
82 | " \"d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg\")\n",
83 | "tgt = down_load(p).rename(lambda i: 'i'+str(i)).to_crs(src.crs)\n",
84 | "\n",
85 | "w, s, e, n = tgt.total_bounds\n",
86 | "#src = src.cx[w:e, s:n]\n",
87 | "summary(src, tgt)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "- Slightly larger problem"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 3,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "Transfer 3140 polygons into 2512\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "# Tracts\n",
112 | "p = \"https://ndownloader.figshare.com/files/20460645\"\n",
113 | "src = down_load(p)\n",
114 | "src = pandas.concat([src]*5)\n",
115 | "\n",
116 | "# Precincts\n",
117 | "p = \"https://ndownloader.figshare.com/files/20460549\"\n",
118 | "tgt = down_load(p).to_crs(src.crs)\n",
119 | "tgt = pandas.concat([tgt]*4)\n",
120 | "summary(src, tgt)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "## Correctness"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 4,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "data": {
137 | "text/plain": [
138 | "0"
139 | ]
140 | },
141 | "execution_count": 4,
142 | "metadata": {},
143 | "output_type": "execute_result"
144 | }
145 | ],
146 | "source": [
147 | "cross2 = _area_tables_binning_parallel(src, tgt, n_jobs=1)\n",
148 | "cross = _area_tables_binning(src, tgt, 'auto')\n",
149 | "(cross != cross2).sum()"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## Performance"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "Results with all observations in the first dataset:"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 5,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "2.22 s ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "%timeit cross = _area_tables_binning(src, tgt, 'auto')"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 6,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "2.22 s ± 25.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "%timeit cross2 = _area_tables_binning_parallel(src, tgt, n_jobs=1)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 7,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "756 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "%timeit cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=-1)"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "---"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "Results with the second dataset:"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 5,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "CPU times: user 47.5 s, sys: 15.8 ms, total: 47.5 s\n",
241 | "Wall time: 47.6 s\n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "%time cross = _area_tables_binning(src, tgt, 'auto')"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 8,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | "CPU times: user 46.8 s, sys: 108 ms, total: 46.9 s\n",
259 | "Wall time: 46.9 s\n"
260 | ]
261 | }
262 | ],
263 | "source": [
264 | "%time cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=1)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 6,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "name": "stdout",
274 | "output_type": "stream",
275 | "text": [
276 | "CPU times: user 1.86 s, sys: 488 ms, total: 2.35 s\n",
277 | "Wall time: 9.61 s\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "%time cross3 = _area_tables_binning_parallel(src, tgt, n_jobs=-1)"
283 | ]
284 | }
285 | ],
286 | "metadata": {
287 | "kernelspec": {
288 | "display_name": "Python 3",
289 | "language": "python",
290 | "name": "python3"
291 | },
292 | "language_info": {
293 | "codemirror_mode": {
294 | "name": "ipython",
295 | "version": 3
296 | },
297 | "file_extension": ".py",
298 | "mimetype": "text/x-python",
299 | "name": "python",
300 | "nbconvert_exporter": "python",
301 | "pygments_lexer": "ipython3",
302 | "version": "3.7.8"
303 | }
304 | },
305 | "nbformat": 4,
306 | "nbformat_minor": 4
307 | }
308 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.setuptools_scm]
6 |
7 | [project]
8 | name = "tobler"
9 | dynamic = ["version"]
10 | authors = [
11 | { name = "Eli Knaap", email = "ek@knaaptime.com" },
12 | { name = "Serge Rey", email = "sjsrey@gmail.com" },
13 | ]
14 | maintainers = [{ name = "pysal contributors" }]
15 | license = { text = "BSD 3-Clause" }
16 | description = "Tobler is a Python library for areal interpolation."
17 | keywords = [
18 |     "dasymetric mapping", "spatial analysis", "interpolation", "change of support"
19 | ]
20 | readme = { text = """\
21 | Spatial interpolation, Dasymetric Mapping, & Change of Support
22 |
23 | """, content-type = "text/x-rst" }
24 | classifiers = [
25 | "Programming Language :: Python :: 3",
26 | "License :: OSI Approved :: BSD License",
27 | "Operating System :: OS Independent",
28 | "Intended Audience :: Science/Research",
29 | "Topic :: Scientific/Engineering :: GIS",
30 | ]
31 | requires-python = ">=3.9"
32 | dependencies = [
33 | "numpy",
34 | "pandas",
35 | "geopandas >=0.13",
36 | "rasterio",
37 | "scipy",
38 | "statsmodels",
39 | "rasterstats",
40 | "libpysal",
41 | "tqdm",
42 | "joblib"
43 | ]
44 |
45 | [project.urls]
46 | Home = "https://github.com/pysal/tobler/"
47 | Repository = "https://github.com/pysal/tobler"
48 |
49 | [project.optional-dependencies]
50 | dev = ["pre-commit"]
51 | docs = [
52 | "nbsphinx",
53 | "numpydoc",
54 | "pandoc",
55 | "sphinx",
56 | "sphinxcontrib-bibtex",
57 | "sphinx_bootstrap_theme",
58 | "mkdocs-jupyter",
59 | "myst-parser"
60 | ]
61 | tests = [
62 | "codecov",
63 | "coverage",
64 | "pytest",
65 | "pytest-mpl",
66 | "pytest-cov",
67 | "pytest-xdist",
68 | "watermark",
69 | "h3",
70 | "astropy"
71 | ]
72 |
73 | [tool.setuptools.packages.find]
74 | include = ["tobler", "tobler.*"]
75 |
76 | [tool.black]
77 | line-length = 88
78 |
79 | [tool.ruff]
80 | line-length = 88
81 | select = ["E", "F", "W", "I", "UP", "N", "B", "A", "C4", "SIM", "ARG"]
82 | target-version = "py39"
83 | ignore = [
84 | "B006",
85 | "B008",
86 | "B009",
87 | "B010",
88 | "C408",
89 | "E731",
90 | "F401",
91 | "F403",
92 | "N803",
93 | "N806",
94 | "N999",
95 | "UP007"
96 | ]
97 | exclude = ["tobler/tests/*", "docs/*"]
98 |
99 | [tool.coverage.run]
100 | source = ["./tobler"]
101 |
102 | [tool.coverage.report]
103 | exclude_lines = [
104 | "if self.debug:",
105 | "pragma: no cover",
106 | "raise NotImplementedError",
107 | "except ModuleNotFoundError:",
108 | "except ImportError",
109 | ]
110 | ignore_errors = true
111 | omit = ["tobler/tests/*", "docs/conf.py"]
112 |
--------------------------------------------------------------------------------
/tobler/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | :mod:`tobler` --- A library for spatial interpolation
3 | =======================================================
4 |
5 | """
6 | import contextlib
7 | from importlib.metadata import PackageNotFoundError, version
8 |
9 | from . import area_weighted, dasymetric, model, pycno, util
10 |
11 | with contextlib.suppress(PackageNotFoundError):
12 | __version__ = version("tobler")
13 |
--------------------------------------------------------------------------------
/tobler/area_weighted/__init__.py:
--------------------------------------------------------------------------------
1 | from .area_interpolate import area_interpolate
2 | from .area_interpolate import _area_tables_binning
3 | from .area_join import area_join
4 | from .area_interpolate_dask import area_interpolate_dask
5 |
6 | __all__ = ["area_interpolate", "area_join", "area_interpolate_dask"]
7 |
--------------------------------------------------------------------------------
/tobler/area_weighted/area_interpolate.py:
--------------------------------------------------------------------------------
1 | """
2 | Area Weighted Interpolation
3 |
4 | """
5 |
6 | import os
7 |
8 | import geopandas as gpd
9 | import numpy as np
10 | import pandas as pd
11 | from scipy.sparse import coo_matrix, diags
12 |
13 | from tobler.util.util import _check_crs, _inf_check, _nan_check
14 |
15 | __all__ = ["area_interpolate"]
16 |
17 |
18 | def _chunk_dfs(geoms_to_chunk, geoms_full, n_jobs):
19 | chunk_size = geoms_to_chunk.shape[0] // n_jobs + 1
20 | for i in range(n_jobs):
21 | start = i * chunk_size
22 | yield geoms_to_chunk.iloc[start : start + chunk_size], geoms_full
23 |
24 |
25 | def _index_n_query(geoms1, geoms2):
26 | # Pick largest for STRTree, query the smallest
27 | if geoms1.shape[0] > geoms2.shape[0]:
28 | large = geoms1
29 | small = geoms2
30 | else:
31 | large = geoms2
32 | small = geoms1
33 | # Build tree + query
34 | qry_polyIDs, tree_polyIDs = large.sindex.query(small, predicate="intersects")
35 | # Remap IDs to global
36 | large_global_ids = large.iloc[tree_polyIDs].index.values
37 | small_global_ids = small.iloc[qry_polyIDs].index.values
38 | # Return always global IDs for geoms1, geoms2
39 | if geoms1.shape[0] > geoms2.shape[0]:
40 | return np.array([large_global_ids, small_global_ids]).T
41 | else:
42 | return np.array([small_global_ids, large_global_ids]).T
43 |
44 |
45 | def _chunk_polys(id_pairs, geoms_left, geoms_right, n_jobs):
46 | chunk_size = id_pairs.shape[0] // n_jobs + 1
47 | for i in range(n_jobs):
48 | start = i * chunk_size
49 | chunk1 = geoms_left.array[id_pairs[start : start + chunk_size, 0]]
50 | chunk2 = geoms_right.array[id_pairs[start : start + chunk_size, 1]]
51 | yield chunk1, chunk2
52 |
53 |
54 | def _intersect_area_on_chunk(geoms1, geoms2):
55 | areas = geoms1.intersection(geoms2).area
56 | return areas
57 |
58 |
59 | def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1):
60 | """Construct area allocation and source-target correspondence tables using
61 | a parallel spatial indexing approach
62 | ...
63 |
64 | NOTE: currently, the largest df is chunked and the other one is shipped in
65 | full to each core; within each process, the spatial index is built for the
66 | largest set of geometries, and the other one used for `query`
67 |
68 | Parameters
69 | ----------
70 | source_df : geopandas.GeoDataFrame
71 | GeoDataFrame containing input data and polygons
72 | target_df : geopandas.GeoDataFrame
73 | GeoDataFrame defining the output geometries
74 | n_jobs : int
75 | [Optional. Default=-1] Number of processes to run in parallel. If -1,
76 | this is set to the number of CPUs available
77 |
78 | Returns
79 | -------
80 | tables : scipy.sparse.csr_matrix
81 |
82 | """
83 | from joblib import Parallel, delayed, parallel_backend
84 |
85 | if _check_crs(source_df, target_df):
86 | pass
87 | else:
88 | return None
89 | if n_jobs == -1:
90 | n_jobs = os.cpu_count()
91 |
92 | df1 = source_df.copy()
93 | df2 = target_df.copy()
94 |
95 | # Chunk the largest, ship the smallest in full
96 | if df1.shape[0] > df2.shape[0]:
97 | to_chunk = df1
98 | df_full = df2
99 | else:
100 | to_chunk = df2
101 | df_full = df1
102 |
103 | # Spatial index query
104 | ## Reindex on positional IDs
105 | to_workers = _chunk_dfs(
106 | gpd.GeoSeries(to_chunk.geometry.values, crs=to_chunk.crs),
107 | gpd.GeoSeries(df_full.geometry.values, crs=df_full.crs),
108 | n_jobs,
109 | )
110 |
111 | with parallel_backend("loky", inner_max_num_threads=1):
112 | worker_out = Parallel(n_jobs=n_jobs)(
113 | delayed(_index_n_query)(*chunk_pair) for chunk_pair in to_workers
114 | )
115 |
116 | ids_src, ids_tgt = np.concatenate(worker_out).T
117 |
118 | # Intersection + area calculation
119 | chunks_to_intersection = _chunk_polys(
120 | np.vstack([ids_src, ids_tgt]).T, df1.geometry, df2.geometry, n_jobs
121 | )
122 | with parallel_backend("loky", inner_max_num_threads=1):
123 | worker_out = Parallel(n_jobs=n_jobs)(
124 | delayed(_intersect_area_on_chunk)(*chunk_pair)
125 | for chunk_pair in chunks_to_intersection
126 | )
127 | areas = np.concatenate(worker_out)
128 |
129 | # Build CSR table
130 | table = coo_matrix(
131 | (
132 | areas,
133 | (ids_src, ids_tgt),
134 | ),
135 | shape=(df1.shape[0], df2.shape[0]),
136 | dtype=np.float32,
137 | )
138 | table = table.tocsr()
139 | return table
140 |
141 |
142 | def _area_tables_binning(source_df, target_df, spatial_index):
143 | """Construct area allocation and source-target correspondence tables using a spatial indexing approach
144 | ...
145 |
146 | NOTE: this currently relies on Geopandas' spatial index machinery
147 |
148 | Parameters
149 | ----------
150 | source_df : geopandas.GeoDataFrame
151 | GeoDataFrame containing input data and polygons
152 | target_df : geopandas.GeoDataFrame
153 | GeoDataFrame defining the output geometries
154 | spatial_index : str
155 | Spatial index to use to build the allocation of area from source to
156 | target tables. It currently supports the following values:
157 | - "source": build the spatial index on `source_df`
158 | - "target": build the spatial index on `target_df`
159 | - "auto": attempts to guess the most efficient alternative.
160 | Currently, this option uses the largest table to build the
161 | index, and performs a `bulk_query` on the shorter table.
162 |
163 | Returns
164 | -------
165 | tables : scipy.sparse.csr_matrix
166 |
167 | """
168 | if _check_crs(source_df, target_df):
169 | pass
170 | else:
171 | return None
172 |
173 | df1 = source_df.copy()
174 | df2 = target_df.copy()
175 |
176 | # it is generally more performant to use the longer df as spatial index
177 | if spatial_index == "auto":
178 | if df1.shape[0] > df2.shape[0]:
179 | spatial_index = "source"
180 | else:
181 | spatial_index = "target"
182 |
183 | if spatial_index == "source":
184 | ids_tgt, ids_src = df1.sindex.query(df2.geometry, predicate="intersects")
185 | elif spatial_index == "target":
186 | ids_src, ids_tgt = df2.sindex.query(df1.geometry, predicate="intersects")
187 | else:
188 | raise ValueError(
189 | f"'{spatial_index}' is not a valid option. Use 'auto', 'source' or 'target'."
190 | )
191 |
192 | areas = df1.geometry.values[ids_src].intersection(df2.geometry.values[ids_tgt]).area
193 |
194 | table = coo_matrix(
195 | (
196 | areas,
197 | (ids_src, ids_tgt),
198 | ),
199 | shape=(df1.shape[0], df2.shape[0]),
200 | dtype=np.float32,
201 | )
202 |
203 | table = table.tocsr()
204 |
205 | return table
206 |
207 |
208 | def area_interpolate(
209 | source_df,
210 | target_df,
211 | extensive_variables=None,
212 | intensive_variables=None,
213 | table=None,
214 | allocate_total=True,
215 | spatial_index="auto",
216 | n_jobs=1,
217 | categorical_variables=None,
218 | categorical_frequency=True,
219 | ):
220 | """
221 | Area interpolation for extensive, intensive and categorical variables.
222 |
223 | Parameters
224 | ----------
225 | source_df : geopandas.GeoDataFrame
226 |
227 | target_df : geopandas.GeoDataFrame
228 |
229 | extensive_variables : list
230 | [Optional. Default=None] Columns in dataframes for extensive variables
231 |
232 | intensive_variables : list
233 | [Optional. Default=None] Columns in dataframes for intensive variables
234 |
235 | table : scipy.sparse.csr_matrix
236 | [Optional. Default=None] Area allocation source-target correspondence
237 | table. If not provided, it will be built from `source_df` and
238 | `target_df` using `tobler.area_interpolate._area_tables_binning`
239 |
240 | allocate_total : boolean
241 | [Optional. Default=True] True if total value of source area should be
242 | allocated. False if denominator is area of i. Note that the two cases
243 | would be identical when the area of the source polygon is exhausted by
244 | intersections. See Notes for more details.
245 |
246 | spatial_index : str
247 | [Optional. Default="auto"] Spatial index to use to build the
248 | allocation of area from source to target tables. It currently supports
249 | the following values:
250 |
251 | - "source": build the spatial index on `source_df`
252 | - "target": build the spatial index on `target_df`
253 | - "auto": attempts to guess the most efficient alternative.
254 |
255 | Currently, this option uses the largest table to build the
256 | index, and performs a `bulk_query` on the shorter table.
257 | This argument is ignored if n_jobs>1 (or n_jobs=-1).
258 |
259 | n_jobs : int
260 | [Optional. Default=1] Number of processes to run in parallel to
261 | generate the area allocation. If -1, this is set to the number of CPUs
262 | available. If `table` is passed, this is ignored.
263 |
264 | categorical_variables : list
265 | [Optional. Default=None] Columns in dataframes for categorical variables
266 |
267 | categorical_frequency : Boolean
268 | [Optional. Default=True] If True, `estimates` returns the frequency of each
269 | value in a categorical variable in every polygon of `target_df` (proportion of
270 | area). If False, `estimates` contains the area in every polygon of `target_df`
271 | that is occupied by each value of the categorical variable.
272 |
273 | Returns
274 | -------
275 | estimates : geopandas.GeoDataFrame
276 | new geodataframe with interpolated variables as columns and target_df geometry
277 | as output geometry
278 |
279 | Notes
280 | -----
281 | The assumption is both dataframes have the same coordinate reference system.
282 | For an extensive variable, the estimate at target polygon j (default case) is:
283 |
284 | .. math::
285 |     v_j = \\sum_i v_i w_{i,j}
286 | 
287 |     w_{i,j} = a_{i,j} / \\sum_k a_{i,k}
288 | 
289 | If the area of the source polygon is not exhausted by intersections with
290 | target polygons and there is reason to not allocate the complete value of
291 | an extensive attribute, then setting allocate_total=False will use the
292 | following weights:
293 | 
294 | .. math::
295 |     v_j = \\sum_i v_i w_{i,j}
296 | 
297 |     w_{i,j} = a_{i,j} / a_i
298 | 
299 | where a_i is the total area of source polygon i. For an intensive variable, the estimate at target polygon j is:
300 | 
301 | .. math::
302 |     v_j = \\sum_i v_i w_{i,j}
303 | 
304 |     w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
305 | 
306 | For categorical variables, the estimate returns the ratio of presence of each unique category.
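
Examples
--------
A minimal sketch of a typical call, reusing the libpysal example data that the
project's notebooks rely on; the synthetic ``population`` column is illustrative
only, and both layers are assumed to share the same CRS.

>>> import geopandas
>>> import numpy as np
>>> from libpysal.examples import load_example
>>> from tobler.area_weighted import area_interpolate
>>> c1, c2 = load_example("Charleston1"), load_example("Charleston2")
>>> tracts = geopandas.read_file(c1.get_path("sc_final_census2.shp")).to_crs(6569)
>>> zips = geopandas.read_file(c2.get_path("CharlestonMSA2.shp")).to_crs(6569)
>>> # fabricate an extensive (count) variable to transfer onto the ZIP geometries
>>> tracts["population"] = np.random.default_rng(0).integers(100, 5000, len(tracts))
>>> estimates = area_interpolate(tracts, zips, extensive_variables=["population"])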
307 | """
308 | source_df = source_df.copy()
309 | target_df = target_df.copy()
310 |
311 | if _check_crs(source_df, target_df):
312 | pass
313 | else:
314 | return None
315 |
316 | if table is None:
317 | if n_jobs == 1:
318 | table = _area_tables_binning(source_df, target_df, spatial_index)
319 | else:
320 | table = _area_tables_binning_parallel(source_df, target_df, n_jobs=n_jobs)
321 |
322 | dfs = []
323 | extensive = []
324 | if extensive_variables:
325 | den = source_df.area.values
326 | if allocate_total:
327 | den = np.asarray(table.sum(axis=1))
328 | den = den + (den == 0)
329 | den = 1.0 / den
330 | n = den.shape[0]
331 | den = den.reshape((n,))
332 | den = diags([den], [0])
333 | weights = den.dot(table) # row standardize table
334 |
335 | for variable in extensive_variables:
336 | vals = _nan_check(source_df, variable)
337 | vals = _inf_check(source_df, variable)
338 | estimates = diags([vals], [0]).dot(weights)
339 | estimates = estimates.sum(axis=0)
340 | extensive.append(estimates.tolist()[0])
341 |
342 | extensive = np.asarray(extensive)
344 | extensive = pd.DataFrame(extensive.T, columns=extensive_variables)
345 |
346 | intensive = []
347 | if intensive_variables:
348 | area = np.asarray(table.sum(axis=0))
349 | den = 1.0 / (area + (area == 0))
350 | n, k = den.shape
351 | den = den.reshape((k,))
352 | den = diags([den], [0])
353 | weights = table.dot(den)
354 |
355 | for variable in intensive_variables:
356 | vals = _nan_check(source_df, variable)
357 | vals = _inf_check(source_df, variable)
358 | n = vals.shape[0]
359 | vals = vals.reshape((n,))
360 | estimates = diags([vals], [0])
361 | estimates = estimates.dot(weights).sum(axis=0)
362 | intensive.append(estimates.tolist()[0])
363 |
364 | intensive = np.asarray(intensive)
365 | intensive = pd.DataFrame(intensive.T, columns=intensive_variables)
366 |
367 | if categorical_variables:
368 | categorical = {}
369 | for variable in categorical_variables:
370 | unique = source_df[variable].unique()
371 | for value in unique:
372 | mask = source_df[variable] == value
373 | categorical[f"{variable}_{value}"] = np.asarray(
374 | table[mask.to_numpy()].sum(axis=0)
375 | )[0]
376 |
377 | categorical = pd.DataFrame(categorical)
378 | if categorical_frequency is True:
379 | categorical = categorical.div(target_df.area.values, axis="rows")
380 |
381 | if extensive_variables:
382 | dfs.append(extensive)
383 | if intensive_variables:
384 | dfs.append(intensive)
385 | if categorical_variables:
386 | dfs.append(categorical)
387 |
388 | df = pd.concat(dfs, axis=1)
389 | df["geometry"] = target_df[target_df.geometry.name].reset_index(drop=True)
390 | df = gpd.GeoDataFrame(df.replace(np.inf, np.nan))
391 |
392 | return df.set_index(target_df.index)
393 |
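Editor's note: a minimal usage sketch for `area_interpolate` (not part of the module). It follows the Sacramento example used in tobler's test suite; the column names `TOT_POP`, `POV_POP`, and `POV_TOT` come from those example shapefiles.

    import geopandas
    from libpysal.examples import load_example
    from tobler.area_weighted import area_interpolate

    # two Sacramento-area polygon layers shipped as libpysal example data
    sac1 = geopandas.read_file(load_example("Sacramento1").get_path("sacramentot2.shp"))
    sac2 = geopandas.read_file(load_example("Sacramento2").get_path("SacramentoMSA2.shp"))
    sac1 = sac1.to_crs(sac1.estimate_utm_crs())  # both layers must share a CRS
    sac2 = sac2.to_crs(sac1.crs)
    sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT

    # transfer an extensive count and an intensive rate onto the target geometries
    estimates = area_interpolate(
        source_df=sac1,
        target_df=sac2,
        extensive_variables=["TOT_POP"],
        intensive_variables=["pct_poverty"],
    )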
--------------------------------------------------------------------------------
/tobler/area_weighted/area_interpolate_dask.py:
--------------------------------------------------------------------------------
1 | """
2 | Area Weighted Interpolation, out-of-core and parallel through Dask
3 | """
4 |
5 | import geopandas
6 | import numpy as np
7 | import pandas
8 |
9 | from .area_interpolate import area_interpolate
10 |
11 | __all__ = ['area_interpolate_dask']
12 |
13 | def area_interpolate_dask(
14 | source_dgdf,
15 | target_dgdf,
16 | id_col,
17 | extensive_variables=None,
18 | intensive_variables=None,
19 | categorical_variables=None,
20 | categorical_frequency=True,
21 | ):
22 | """
23 | Out-of-core and parallel area interpolation for categorical variables.
24 |
25 | Parameters
26 | ----------
27 | source_dgdf : dask_geopandas.GeoDataFrame
28 | Dask-geopandas GeoDataFrame
29 | IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
30 | This is required so only overlapping partitions are checked for interpolation. See
31 | more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
32 | target_dgdf : dask_geopandas.GeoDataFrame
33 | Dask-geopandas GeoDataFrame
34 | IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
35 | This is required so only overlapping partitions are checked for interpolation. See
36 | more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
37 | id_col : str
38 | Name of the column in `target_dgdf` with unique IDs to be used in output table
39 | extensive_variables : list
40 | [Optional. Default=None] Columns in `source_dgdf` for extensive variables.
41 | IMPORTANT: currently NOT implemented.
42 | intensive_variables : list
43 | [Optional. Default=None] Columns in `source_dgdf` for intensive variables
44 | IMPORTANT: currently NOT implemented.
45 | categorical_variables : list
46 | [Optional. Default=None] Columns in `source_dgdf` for categorical variables
47 | IMPORTANT: categorical variables must be of type `'category[known]'`. This is so
48 | all categories are known ahead of time and Dask can run lazily.
49 | categorical_frequency : Boolean
50 | [Optional. Default=True] If True, `estimates` returns the frequency of each
51 | value in a categorical variable in every polygon of `target_df` (proportion of
52 | area). If False, `estimates` contains the area in every polygon of `target_df`
53 | that is occupied by each value of the categorical variable.
54 |
55 |
56 | Returns
57 | -------
58 | estimates : dask_geopandas.GeoDataFrame
59 | new dask-geopandas geodataframe with interpolated variables and `id_col` as
60 | columns and target_df geometry as output geometry
61 |
62 | """
63 | try:
64 | import dask_geopandas
65 | from dask.base import tokenize
66 | from dask.highlevelgraph import HighLevelGraph
67 | except ImportError:
68 | raise ImportError(
69 | "Area interpolation with Dask requires `dask` and "
70 | "`dask_geopandas` installed to run. Please install them "
71 | "before importing this functionality."
72 | )
73 |
74 | if intensive_variables is not None:
75 | raise NotImplementedError(
76 | (
77 | "Dask-based interpolation of intensive variables is "
78 | "not implemented yet. Please remove intensive variables to "
79 | "be able to run the rest."
80 | )
81 | )
82 | if extensive_variables is not None:
83 | raise NotImplementedError(
84 | (
85 | "Dask-based interpolation of extensive variables is "
86 | "not implemented yet. Please remove intensive variables to "
87 | "be able to run the rest."
88 | )
89 | )
90 | # Categoricals must be Dask's known categorical
91 | if categorical_variables is not None:
92 | category_vars = []
93 | for cat_var in categorical_variables:
94 | var_names = [f"{cat_var}_{c}" for c in source_dgdf[cat_var].cat.categories]
95 | category_vars.extend(var_names)
96 | else:
97 | category_vars = None
98 | # Build tasks by joining pairs of chunks from left/right
99 | dsk = {}
100 | new_spatial_partitions = []
101 | parts = geopandas.sjoin(
102 | source_dgdf.spatial_partitions.to_frame("geometry"),
103 | target_dgdf.spatial_partitions.to_frame("geometry"),
104 | how="inner",
105 | predicate="intersects",
106 | )
107 | parts_left = np.asarray(parts.index)
108 | parts_right = np.asarray(parts["index_right"].values)
109 | name = "area_interpolate-" + tokenize(target_dgdf, source_dgdf)
110 | for i, (l, r) in enumerate(zip(parts_left, parts_right)):
111 | dsk[(name, i)] = (
112 | id_area_interpolate,
113 | (source_dgdf._name, l),
114 | (target_dgdf._name, r),
115 | id_col,
116 | extensive_variables,
117 | intensive_variables,
118 | None,
119 | True,
120 | "auto",
121 | 1,
122 | categorical_variables,
123 | category_vars,
124 | )
125 | lr = source_dgdf.spatial_partitions.iloc[l]
126 | rr = target_dgdf.spatial_partitions.iloc[r]
127 | extent = lr.intersection(rr)
128 | new_spatial_partitions.append(extent)
129 | # Create geometries for new spatial partitions
130 | new_spatial_partitions = geopandas.GeoSeries(
131 | data=new_spatial_partitions, crs=source_dgdf.crs
132 | )
133 | # Build Dask graph
134 | graph = HighLevelGraph.from_collections(
135 | name, dsk, dependencies=[source_dgdf, target_dgdf]
136 | )
137 | # Get metadata for the outcome table
138 | meta = id_area_interpolate(
139 | source_dgdf._meta,
140 | target_dgdf._meta,
141 | id_col,
142 | extensive_variables=extensive_variables,
143 | intensive_variables=intensive_variables,
144 | table=None,
145 | allocate_total=True,
146 | spatial_index="auto",
147 | n_jobs=1,
148 | categorical_variables=categorical_variables,
149 | category_vars=category_vars,
150 | )
151 | # Build output table
152 | transferred = dask_geopandas.GeoDataFrame(
153 | graph, name, meta, [None] * (len(dsk) + 1), new_spatial_partitions
154 | )
155 | # Merge chunks
156 | out = target_dgdf[[id_col, "geometry"]]
157 | ## Extensive --> Not implemented (DAB: the below does not match single-core)
158 | """
159 | if extensive_variables is not None:
160 | out_extensive = (
161 | transferred
162 | .groupby(id_col)
163 | [extensive_variables]
164 | .agg({v: 'sum' for v in extensive_variables})
165 | )
166 | out = out.join(out_extensive, on=id_col)
167 | """
168 | ## Intensive --> Weight by area of the chunk (Not implemented)
169 | ## Categorical --> Add up proportions
170 | if categorical_variables is not None:
171 | out_categorical = (
172 | transferred[category_vars]
173 | .astype(float)
174 | .groupby(transferred[id_col])
175 | .agg({v: "sum" for v in category_vars})
176 | )
177 | out = out.join(out_categorical, on=id_col)
178 | if categorical_frequency is True:
179 | cols = out_categorical.columns.tolist()
180 | out[cols] = out[cols].div(out.area, axis="index")
181 | return out
182 |
183 |
184 | def id_area_interpolate(
185 | source_df,
186 | target_df,
187 | id_col,
188 | extensive_variables=None,
189 | intensive_variables=None,
190 | table=None,
191 | allocate_total=True,
192 | spatial_index="auto",
193 | n_jobs=1,
194 | categorical_variables=None,
195 | category_vars=None,
196 | ):
197 | """
198 | Light wrapper around single-core area interpolation to be run on distributed workers
199 |
200 | Parameters
201 | ----------
202 | source_df : geopandas.GeoDataFrame
203 | target_df : geopandas.GeoDataFrame
204 | id_col : str
205 | Name of the column in `target_dgdf` with unique IDs to be used in output table
206 | extensive_variables : list
207 | [Optional. Default=None] Columns in dataframes for extensive variables
208 | intensive_variables : list
209 | [Optional. Default=None] Columns in dataframes for intensive variables
210 | table : scipy.sparse.csr_matrix
211 | [Optional. Default=None] Area allocation source-target correspondence
212 | table. If not provided, it will be built from `source_df` and
213 | `target_df` using `tobler.area_interpolate._area_tables_binning`
214 | allocate_total : boolean
215 | [Optional. Default=True] True if total value of source area should be
216 | allocated. False if denominator is area of i. Note that the two cases
217 | would be identical when the area of the source polygon is exhausted by
218 | intersections. See Notes for more details.
219 | spatial_index : str
220 | [Optional. Default="auto"] Spatial index to use to build the
221 | allocation of area from source to target tables. It currently supports
222 | the following values:
223 | - "source": build the spatial index on `source_df`
224 | - "target": build the spatial index on `target_df`
225 | - "auto": attempts to guess the most efficient alternative.
226 | Currently, this option uses the largest table to build the
227 | index, and performs a `bulk_query` on the shorter table.
228 | This argument is ignored if n_jobs>1 (or n_jobs=-1).
229 | n_jobs : int
230 | [Optional. Default=1] Number of processes to run in parallel to
231 | generate the area allocation. If -1, this is set to the number of CPUs
232 | available. If `table` is passed, this is ignored.
233 | categorical_variables : list
234 | [Optional. Default=None] Columns in dataframes for categorical variables
235 | category_vars : list
236 | [Optional. Default=None] Full list of category names in the format
237 | `f'{var_name}_{cat_name}'`
238 |
239 | Returns
240 | -------
241 | estimates : geopandas.GeoDataFrame
242 | new geodataframe with interpolated variables as columns and target_df geometry
243 | as output geometry
244 |
245 | """
246 | estimates = area_interpolate(
247 | source_df,
248 | target_df,
249 | extensive_variables=extensive_variables,
250 | intensive_variables=intensive_variables,
251 | table=table,
252 | allocate_total=allocate_total,
253 | spatial_index=spatial_index,
254 | n_jobs=n_jobs,
255 | categorical_variables=categorical_variables,
256 | categorical_frequency=False,
257 | )
258 | estimates[id_col] = target_df[id_col].values
259 |
260 | if categorical_variables is not None:
261 | category_vars_to_add = []
262 | for category_var in category_vars:
263 | if category_var not in estimates.columns:
264 | category_vars_to_add.append(category_var)
265 | estimates = estimates.join(
266 | pandas.DataFrame(index=estimates.index, columns=category_vars_to_add)
267 | )
268 | return estimates
269 |
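Editor's note: a sketch of the Dask entry point, mirroring the (currently xfail-marked) test in tobler/tests/test_area_interpolators.py. It assumes `sac1`/`sac2` are the Sacramento layers loaded and reprojected as in the `area_interpolate` sketch above; the added `animal` column is purely illustrative.

    import dask_geopandas
    from tobler.area_weighted import area_interpolate_dask

    # categorical columns must use a *known* categorical dtype so Dask can run lazily
    sac1["animal"] = (["cat", "dog", "donkey"] * len(sac1))[: len(sac1)]
    sac1["animal"] = sac1["animal"].astype("category")

    # both tables need spatial partitions, hence the spatial shuffle
    dsac1 = dask_geopandas.from_geopandas(sac1, npartitions=2).spatial_shuffle(by="hilbert", shuffle="tasks")
    dsac2 = dask_geopandas.from_geopandas(sac2, npartitions=2).spatial_shuffle(by="hilbert", shuffle="tasks")

    estimates = area_interpolate_dask(
        source_dgdf=dsac1,
        target_dgdf=dsac2,
        id_col="ZIP",  # unique-ID column of the target table
        categorical_variables=["animal"],
    ).compute()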
--------------------------------------------------------------------------------
/tobler/area_weighted/area_join.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import warnings
4 |
5 | __author__ = "Martin Fleischmann "
6 |
7 | __all__ = ["area_join"]
8 |
9 |
10 | def area_join(source_df, target_df, variables):
11 | """
12 | Join variables from source_df based on the largest intersection. In case of a tie it picks the first one.
13 |
14 | Parameters
15 | ----------
16 | source_df : geopandas.GeoDataFrame
17 | GeoDataFrame containing source values
18 | target_df : geopandas.GeoDataFrame
19 | GeoDataFrame containing target geometries onto which the variables are joined
20 | variables : string or list-like
21 | column(s) in source_df dataframe for variable(s) to be joined
22 |
23 | Returns
24 | -------
25 | joined : geopandas.GeoDataFrame
26 | target_df GeoDataFrame with joined variables as additional columns
27 |
28 | """
29 | if not pd.api.types.is_list_like(variables):
30 | variables = [variables]
31 |
32 | for v in variables:
33 | if v in target_df.columns:
34 | raise ValueError(f"Column '{v}' already present in target_df.")
35 |
36 | target_df = target_df.copy()
37 | target_ix, source_ix = source_df.sindex.query(
38 | target_df.geometry, predicate="intersects"
39 | )
40 | areas = (
41 | target_df.geometry.values[target_ix]
42 | .intersection(source_df.geometry.values[source_ix])
43 | .area
44 | )
45 |
46 | main = []
47 | for i in range(len(target_df)): # vectorise this loop?
48 | mask = target_ix == i
49 | if np.any(mask):
50 | main.append(source_ix[mask][np.argmax(areas[mask])])
51 | else:
52 | main.append(np.nan)
53 |
54 | main = np.array(main, dtype=float)
55 | mask = ~np.isnan(main)
56 |
57 | for v in variables:
58 | arr = np.empty(len(main), dtype=object)
59 | arr[mask] = source_df[v].values[main[mask].astype(int)]
60 | try:
61 | arr = arr.astype(source_df[v].dtype)
62 | except TypeError:
63 | warnings.warn(
64 | f"Cannot preserve dtype of '{v}'. Falling back to `dtype=object`.",
65 | )
66 | target_df[v] = arr
67 |
68 | return target_df
69 |
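Editor's note: a short illustrative call to `area_join`, reusing `sac1`/`sac2` from the `area_interpolate` sketch earlier; the choice of "TOT_POP" is an assumption, and the joined column must not already exist in the target frame.

    from tobler.area_weighted import area_join

    # copy TOT_POP from the source polygon sharing the largest intersection
    # with each target polygon (ties go to the first match)
    joined = area_join(sac1, sac2, "TOT_POP")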
--------------------------------------------------------------------------------
/tobler/dasymetric/__init__.py:
--------------------------------------------------------------------------------
1 | from .masked_area_interpolate import masked_area_interpolate
2 | from .raster_tools import extract_raster_features, _fast_append_profile_in_gdf
3 |
4 | __all__ = ["masked_area_interpolate"]
--------------------------------------------------------------------------------
/tobler/dasymetric/masked_area_interpolate.py:
--------------------------------------------------------------------------------
1 | from warnings import warn
2 |
3 | import geopandas as gpd
4 |
5 | from ..area_weighted import area_interpolate
6 | from .raster_tools import extract_raster_features
7 |
8 | __all__ = ["masked_area_interpolate"]
9 |
10 |
11 | def masked_area_interpolate(
12 | source_df,
13 | target_df,
14 | raster,
15 | pixel_values,
16 | extensive_variables=None,
17 | intensive_variables=None,
18 | categorical_variables=None,
19 | allocate_total=True,
20 | nodata=255,
21 | n_jobs=-1,
22 | codes=None,
23 | ):
24 | """Interpolate data between two polygonal datasets using an auxiliary raster to mask out uninhabited land.
25 |
26 | Parameters
27 | ----------
28 | source_df : geopandas.GeoDataFrame
29 | source data to be converted to another geometric representation.
30 | target_df : geopandas.GeoDataFrame
31 | target geometries that will form the new representation of the input data
32 | raster : str
33 | path to raster file that contains ancillary data
34 | pixel_values : list of ints
35 | list of pixel values that should be considered part of the mask. For example, if
36 | using data from the NLCD Land Cover Database, a common
37 | input might be [21, 22, 23, 24], which matches the "developed" land types in that dataset
38 | extensive_variables : list
39 | Columns of the input dataframe containing extensive variables to interpolate
40 | intensive_variables : list
41 | Columns of the input dataframe containing intensive variables to interpolate
42 | categorical_variables : list
43 | [Optional. Default=None] Columns in dataframes for categorical variables
44 | allocate_total : bool
45 | whether to allocate the total from the source geometries (the default is True).
46 | nodata : int
47 | value in raster that indicates null or missing values. Default is 255
48 | n_jobs : int
49 | [Optional. Default=-1] Number of processes to run in parallel to
50 | generate the area allocation. If -1, this is set to the number of CPUs
51 | available.
52 |
53 |
54 | Returns
55 | -------
56 | geopandas.GeoDataFrame
57 | GeoDataFrame with geometries matching the target_df and extensive and intensive
58 | variables as the columns
59 |
60 | """
61 | if codes:
62 | warn(
63 | "The `codes` keyword is deprecated and will be removed shortly. Please use `pixel_values` instead"
64 | )
65 | pixel_values = codes
66 | source_df = source_df.copy()
67 | assert not any(
68 | source_df.index.duplicated()
69 | ), "The index of the source_df cannot contain duplicates."
70 |
71 | # create a vector mask from the raster data
72 | raster_mask = extract_raster_features(
73 | source_df, raster, pixel_values, nodata, n_jobs, collapse_values=True
74 | )
75 | # create a column in the source_df to dissolve on
76 | idx_name = source_df.index.name if source_df.index.name else "idx"
77 | source_df[idx_name] = source_df.index
78 |
79 | # clip source_df by its mask (overlay/dissolve is faster than gpd.clip here)
80 | source_df = gpd.overlay(
81 | source_df, raster_mask.to_crs(source_df.crs), how="intersection"
82 | ).dissolve(idx_name)
83 |
84 | # continue with standard areal interpolation using the clipped source
85 | interpolation = area_interpolate(
86 | source_df,
87 | target_df.copy(),
88 | extensive_variables=extensive_variables,
89 | intensive_variables=intensive_variables,
90 | n_jobs=n_jobs,
91 | categorical_variables=categorical_variables,
92 | allocate_total=allocate_total,
93 | )
94 | return interpolation
95 |
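Editor's note: an illustrative call mirroring tobler/tests/test_dasymetric.py; the NLCD raster URL and the "developed" pixel values are taken from that test. Downloading the raster is slow, so treat this as a sketch rather than a quick-start.

    from tobler.dasymetric import masked_area_interpolate

    masked = masked_area_interpolate(
        source_df=sac1,
        target_df=sac2,
        extensive_variables=["TOT_POP"],
        intensive_variables=["pct_poverty"],
        raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif",
        pixel_values=[21, 22, 23, 24],  # NLCD "developed" land-cover classes
    )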
--------------------------------------------------------------------------------
/tobler/dasymetric/raster_tools.py:
--------------------------------------------------------------------------------
1 | """tools for working with rasters."""
2 |
3 | import ast
4 | import multiprocessing
5 | import warnings
6 |
7 | import geopandas as gpd
8 | import numpy as np
9 | import pandas as pd
10 | import rasterio as rio
11 | import rasterstats as rs
12 | from joblib import Parallel, delayed
13 | from packaging.version import Version
14 | from rasterio import features
15 | from rasterio.mask import mask
16 | from shapely.geometry import shape
17 |
18 | from ..util.util import _check_presence_of_crs
19 |
20 | GPD_10 = Version(gpd.__version__) >= Version("1.0.0dev")
21 |
22 | __all__ = ["extract_raster_features"]
23 |
24 |
25 | def _chunk_dfs(geoms_to_chunk, n_jobs):
26 | chunk_size = geoms_to_chunk.shape[0] // n_jobs + 1
27 | for i in range(n_jobs):
28 | start = i * chunk_size
29 | yield geoms_to_chunk.iloc[start : start + chunk_size]
30 |
31 |
32 | def _parse_geom(geom_str):
33 | return shape(ast.literal_eval(geom_str))
34 |
35 |
36 | def _apply_parser(df):
37 | return df.apply(_parse_geom)
38 |
39 |
40 | def _fast_append_profile_in_gdf(geodataframe, raster_path, force_crs_match=True):
41 | """Append categorical zonal statistics (counts by pixel type) as columns to an input geodataframe.
42 |
43 | geodataframe : geopandas.GeoDataFrame
44 | geodataframe that overlaps with the raster. If some polygons do not overlap the raster,
45 | consider a preprocessing step using the function subset_gdf_polygons_from_raster.
46 | raster_path : str
47 | path to the raster image.
48 | force_crs_match : bool, Default is True.
49 | Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to
50 | the CRS of the raster file. It is recommended to leave this argument as True.
51 |
52 | Notes
53 | -----
54 | The generated geodataframe will input the value 0 for each Type that is not present in the raster
55 | for each polygon.
56 | """
57 |
58 | _check_presence_of_crs(geodataframe)
59 | if force_crs_match:
60 | with rio.open(raster_path) as raster:
61 | geodataframe = geodataframe.to_crs(crs=raster.crs.data)
62 | else:
63 | warnings.warn(
64 | "The GeoDataFrame is not being reprojected. The clipping might be being performing on unmatching polygon to the raster."
65 | )
66 |
67 | zonal_gjson = rs.zonal_stats(
68 | geodataframe, raster_path, prefix="Type_", geojson_out=True, categorical=True
69 | )
70 |
71 | zonal_ppt_gdf = gpd.GeoDataFrame.from_features(zonal_gjson)
72 |
73 | return zonal_ppt_gdf
74 |
75 |
76 | def extract_raster_features(
77 | gdf, raster_path, pixel_values=None, nodata=255, n_jobs=-1, collapse_values=False
78 | ):
79 | """Generate a geodataframe from raster data by polygonizing contiguous pixels with the same value using rasterio's features module.
80 |
81 | Parameters
82 | ----------
83 | gdf : geopandas.GeoDataFrame
84 | geodataframe defining the area of interest. The input raster will be
85 | clipped to the extent of the geodataframe
86 | raster_path : str
87 | path to raster file, such as downloaded from
88 | pixel_values : list-like, optional
89 | subset of pixel values to extract, by default None. If None, this function
90 | may generate a very large geodataframe
91 | nodata : int, optional
92 | pixel value denoting "no data" in input raster
93 | n_jobs : int
94 | [Optional. Default=-1] Number of processes to run in parallel. If -1,
95 | this is set to the number of CPUs available
96 | collapse_values : bool, optional
97 | If True, multiple values passed to the pixel_values argument are treated
98 | as a single type. I.e. polygons will be generated from any contiguous collection
99 | of values from pixel_values, instead of unique polygons generated for each value.
100 | This can dramatically reduce the complexity of the resulting geodataframe, as
101 | fewer polygons are required to represent the study area.
102 |
103 | Returns
104 | -------
105 | geopandas.GeoDataFrame
106 | geodataframe whose rows are the zones extracted by the rasterio.features module.
107 | The geometry of each zone is the boundary of a contiguous group of pixels with
108 | the same value; the `value` column contains the pixel value of each zone.
109 | """
110 | if n_jobs == -1:
111 | n_jobs = multiprocessing.cpu_count()
112 | with rio.open(raster_path) as src:
113 | raster_crs = src.crs.to_dict()
114 | gdf = gdf.to_crs(raster_crs)
115 | if GPD_10:
116 | geomask = [gdf.union_all().__geo_interface__]
117 | else:
118 | geomask = [gdf.unary_union.__geo_interface__]
119 |
120 | out_image, out_transform = mask(
121 | src, geomask, nodata=nodata, crop=True
122 | ) # clip to AoI using a vector layer
123 |
124 | if pixel_values:
125 | if collapse_values:
126 | out_image = np.where(
127 | np.isin(out_image, pixel_values), pixel_values[0], out_image
128 | ) # replace values to generate fewer polys
129 | pixel_values = np.isin(
130 | out_image, pixel_values
131 | ) # only include requested pixels
132 |
133 | shapes = list(
134 | features.shapes(out_image, mask=pixel_values, transform=out_transform)
135 | ) # convert regions to polygons
136 | res = list(zip(*shapes))
137 | geoms = pd.Series(res[0], name="geometry").astype(str)
138 | pieces = _chunk_dfs(geoms, n_jobs)
139 | geoms = pd.concat(
140 | Parallel(n_jobs=n_jobs)(delayed(_apply_parser)(i) for i in pieces)
141 | )
142 | geoms = gpd.GeoSeries(geoms).buffer(0) # we sometimes get self-intersecting rings
143 | vals = pd.Series(res[1], name="value")
144 | gdf = gpd.GeoDataFrame(vals, geometry=geoms, crs=raster_crs)
145 | if collapse_values:
146 | gdf = gdf.drop(columns=["value"]) # values col is misleading in this case
147 |
148 | return gdf
149 |
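Editor's note: a sketch of `extract_raster_features` on its own, using the same NLCD raster URL as the tests; `sac1` is assumed to be a GeoDataFrame covering the area of interest (as in the earlier sketches).

    from tobler.dasymetric import extract_raster_features

    # polygonize the "developed" NLCD classes inside the study area; with
    # collapse_values=True, contiguous pixels of any listed class form one zone
    developed = extract_raster_features(
        sac1,
        "https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif",
        pixel_values=[21, 22, 23, 24],
        collapse_values=True,
    )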
--------------------------------------------------------------------------------
/tobler/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .glm import glm
2 |
3 | __all__ = ['glm']
--------------------------------------------------------------------------------
/tobler/model/glm.py:
--------------------------------------------------------------------------------
1 | """Model-based methods for areal interpolation."""
2 |
3 | import numpy as np
4 | import statsmodels.formula.api as smf
5 | from statsmodels.genmod.families import Gaussian, NegativeBinomial, Poisson
6 |
7 | from ..dasymetric import _fast_append_profile_in_gdf
8 | from ..util.util import _check_presence_of_crs
9 |
10 | __all__ = ["glm"]
11 |
12 |
13 | def glm(
14 | source_df=None,
15 | target_df=None,
16 | raster="nlcd_2011",
17 | raster_codes=None,
18 | variable=None,
19 | formula=None,
20 | likelihood="poisson",
21 | force_crs_match=True,
22 | return_model=False,
23 | ):
24 | """Train a generalized linear model to predict polygon attributes based on the collection of pixel values they contain.
25 |
26 | Parameters
27 | ----------
28 | source_df : geopandas.GeoDataFrame, required
29 | geodataframe containing source original data to be represented by another geometry
30 | target_df : geopandas.GeoDataFrame, required
31 | geodataframe containing target boundaries that will be used to represent the source data
32 | raster : str, required (default="nlcd_2011")
33 | path to raster file that will be used to input data to the regression model.
34 | i.e. the coefficients refer to the relationship between pixel counts and population counts.
35 | Defaults to 2011 NLCD
36 | raster_codes : list, required (default =[21, 22, 23, 24, 41, 42, 52])
37 | list of integers that represent different types of raster cells. If no formula is given,
38 | the model will be fit from a linear combination of the logged count of each cell type
39 | listed here. Defaults to [21, 22, 23, 24, 41, 42, 52] which
40 | are informative land type cells from the NLCD
41 | variable : str, required
42 | name of the variable (column) to be modeled from the `source_df`
43 | formula : str, optional
44 | patsy-style model formula that specifies the model. Raster codes should be prefixed with
45 | "Type_", e.g. `"n_total_pop ~ -1 + np.log1p(Type_21) + np.log1p(Type_22)`
46 | likelihood : str, {'poisson', 'gaussian', 'neg_binomial'} (default = "poisson")
47 | the likelihood function used in the model
48 | force_crs_match : bool
49 | whether to coerce geodataframe and raster to the same CRS
50 | return_model : bool
51 | whether to return the fitted model in addition to the interpolated geodataframe.
52 | If true, this will return (geodataframe, model)
53 |
54 | Returns
55 | --------
56 | interpolated : geopandas.GeoDataFrame
57 | a new geopandas dataframe with boundaries from `target_df` and modeled attribute
58 | data from the `source_df`. If `return_model` is true, the function will also return
59 | the fitted regression model for further diagnostics
60 |
61 |
62 | """
63 | source_df = source_df.copy()
64 | target_df = target_df.copy()
65 | _check_presence_of_crs(source_df)
66 | liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial}
67 |
68 | if likelihood not in liks.keys():
69 | raise ValueError(f"likelihood must be one of {liks.keys()}")
70 |
71 | if not raster_codes:
72 | raster_codes = [21, 22, 23, 24, 41, 42, 52]
73 | raster_codes = ["Type_" + str(i) for i in raster_codes]
74 |
75 | if not formula:
76 | formula = (
77 | variable
78 | + "~ -1 +"
79 | + "+".join(["np.log1p(" + code + ")" for code in raster_codes])
80 | )
81 |
82 | profiled_df = _fast_append_profile_in_gdf(
83 | source_df[[source_df.geometry.name, variable]], raster, force_crs_match
84 | )
85 |
86 | results = smf.glm(formula, data=profiled_df, family=liks[likelihood]()).fit()
87 |
88 | out = target_df[[target_df.geometry.name]]
89 | temp = _fast_append_profile_in_gdf(
90 | out[[out.geometry.name]], raster, force_crs_match
91 | )
92 |
93 | out[variable] = results.predict(temp.drop(columns=[temp.geometry.name]).fillna(0))
94 |
95 | if return_model:
96 | return out, results
97 |
98 | return out
99 |
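Editor's note: a sketch based on tobler/tests/test_model.py; `POP2001` is a column of the Sacramento2 example layer and the raster URL matches the one used in the tests.

    from tobler.model import glm

    # model POP2001 as a function of NLCD pixel counts in the source polygons,
    # then predict it for the target geometries
    interpolated = glm(
        source_df=sac2,
        target_df=sac1,
        variable="POP2001",
        raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif",
    )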
--------------------------------------------------------------------------------
/tobler/pycno/__init__.py:
--------------------------------------------------------------------------------
1 | from .pycno import pycno_interpolate
2 |
3 | __all__ = ['pycno_interpolate']
4 |
--------------------------------------------------------------------------------
/tobler/pycno/pycno.py:
--------------------------------------------------------------------------------
1 | """Pycnophylactic Interpolation (contributed by @danlewis85)."""
2 | # https://github.com/danlewis85/pycno/
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import rasterio
7 | from numpy import (
8 | absolute,
9 | apply_along_axis,
10 | asarray,
11 | convolve,
12 | copy,
13 | nan,
14 | nanmax,
15 | nanmean,
16 | nansum,
17 | pad,
18 | power,
19 | round,
20 | unique,
21 | )
22 | from numpy.ma import masked_invalid, masked_where
23 | from pandas import DataFrame
24 | from rasterio.features import rasterize
25 |
26 | __all__ = ["pycno_interpolate"]
27 |
28 |
29 | def pycno(
30 | gdf, value_field, cellsize, r=0.2, handle_null=True, converge=3, verbose=True
31 | ):
32 | """Returns a smooth pycnophylactic interpolation raster for a given geodataframe
33 |
34 | Args:
35 | gdf (geopandas.geodataframe.GeoDataFrame): Input GeoDataFrame.
36 | value_field (str): Field name of values to be used to produce pycnophylactic surface
37 | cellsize (int): Pixel size of raster in planar units (e.g. metres, feet)
38 | r (float, optional): Relaxation parameter, default of 0.2 is generally fine.
39 | handle_null (boolean, optional): Changes how nodata values are smoothed. Default True.
40 | converge (int, optional): Index for stopping value, default 3 is generally fine.
41 | verbose (boolean, optional): Print out progress at each iteration.
42 |
43 | Returns:
44 | Numpy Array: Smooth pycnophylactic interpolation.
45 | Rasterio geotransform
46 | GeoPandas crs
47 | """
48 | # set nodata value
49 | nodata = -9999
50 |
51 | # work out raster rows and columns based on gdf extent and cellsize
52 | xmin, ymin, xmax, ymax = gdf.total_bounds
53 | xres = int((xmax - xmin) / cellsize)
54 | yres = int((ymax - ymin) / cellsize)
55 |
56 | # Work out transform so that we rasterize the area where the data are!
57 | trans = rasterio.Affine.from_gdal(xmin, cellsize, 0, ymax, 0, -cellsize)
58 |
59 | # First make a zone array
60 | # NB using index values as ids can often be too large/alphanumeric. Limit is int32 polygon features.
61 | # create a generator of geom, index pairs to use in rasterizing
62 | shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.index))
63 | # burn the features into a raster array
64 | feature_array = rasterize(
65 | shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans
66 | )
67 |
68 | # Get cell counts per index value (feature)
69 | unique, count = np.unique(feature_array, return_counts=True)
70 | cellcounts = asarray((unique, count)).T
71 | # Lose the nodata counts
72 | cellcounts = cellcounts[cellcounts[:, 0] != nodata, :]
73 | # Adjust value totals by cells
74 | # Make cell counts dataframe
75 | celldf = DataFrame(cellcounts[:, 1], index=cellcounts[:, 0], columns=["cellcount"])
76 | # Merge cell counts
77 | gdf = gdf.merge(celldf, how="left", left_index=True, right_index=True)
78 | # Calculate cell values
79 | gdf["cellvalues"] = gdf[value_field] / gdf["cellcount"]
80 |
81 | # create a generator of geom, cellvalue pairs to use in rasterizing
82 | shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.cellvalues))
83 | # Now burn the initial value raster
84 | value_array = rasterize(
85 | shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans
86 | )
87 |
88 | # Set nodata in value array to np.nan
89 | value_array[value_array == -9999] = nan
90 |
91 | # Set stopper value based on converge parameter
92 | stopper = nanmax(value_array) * power(10.0, -converge)
93 |
94 | # The basic numpy convolve function doesn't handle nulls.
95 | def smooth2D(data):
96 | # Create function that calls a 1 dimensional smoother.
97 | s1d = lambda s: convolve(s, [0.5, 0.0, 0.5], mode="same")
98 | # pad the data array with the mean value
99 | padarray = pad(data, 1, "constant", constant_values=nanmean(data))
100 | # make nodata mask
101 | mask = masked_invalid(padarray).mask
102 | # set nodata as zero to avoid eroding the raster
103 | padarray[mask] = 0.0
104 | # Apply the convolution along each axis of the data and average
105 | padarray = (
106 | apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray)
107 | ) / 2
108 | # Reinstate nodata
109 | padarray[mask] = nan
110 | return padarray[1:-1, 1:-1]
111 |
112 | # The convolution function from astropy handles nulls.
113 | def astroSmooth2d(data):
114 | try:
115 | from astropy.convolution import convolve as astro_convolve
116 | except (ImportError, ModuleNotFoundError) as err:
117 | raise ImportError(
118 | "Pycnophylactic interpolation with handle_null=True "
119 | "requires the astropy package"
120 | ) from err
121 | s1d = lambda s: astro_convolve(s, [0.5, 0, 0.5])
122 | # pad the data array with the mean value
123 | padarray = pad(data, 1, "constant", constant_values=nanmean(data))
124 | # Apply the convolution along each axis of the data and average
125 | padarray = (
126 | apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray)
127 | ) / 2
128 | return padarray[1:-1, 1:-1]
129 |
130 | def correct2Da(data):
131 | for idx, val in gdf[value_field].items():
132 | # Create zone mask from feature_array
133 | mask = masked_where(feature_array == idx, feature_array).mask
134 | # Work out the correction factor
135 | correct = (val - nansum(data[mask])) / mask.sum()
136 | # Apply correction
137 | data[mask] += correct
138 |
139 | return data
140 |
141 | def correct2Dm(data):
142 | for idx, val in gdf[value_field].items():
143 | # Create zone mask from feature_array
144 | mask = masked_where(feature_array == idx, feature_array).mask
145 | # Work out the correction factor
146 | correct = val / nansum(data[mask])
147 | if correct != 0.0:
148 | # Apply correction
149 | data[mask] *= correct
150 |
151 | return data
152 |
153 | while True:
154 | # Store the current iteration
155 | old = copy(value_array)
156 |
157 | # Smooth the value_array
158 | if handle_null:
159 | sm = astroSmooth2d(value_array)
160 | else:
161 | sm = smooth2D(value_array)
162 |
163 | # Relaxation to prevent overcompensation in the smoothing step
164 | value_array = value_array * r + (1.0 - r) * sm
165 |
166 | # Perform correction
167 | value_array = correct2Da(value_array)
168 |
169 | # Reset any negative values to zero.
170 | value_array[value_array < 0] = 0.0
171 |
172 | # Perform correction
173 | value_array = correct2Dm(value_array)
174 |
175 | if verbose:
176 | print(
177 | "Maximum Change: "
178 | + str(round(nanmax(absolute(old - value_array)), 4))
179 | + " - will stop at "
180 | + str(round(stopper, 4))
181 | )
182 |
183 | if nanmax(absolute(old - value_array)) < stopper:
184 | break
185 |
186 | return (value_array, trans, gdf.crs)
187 |
188 |
189 | def save_pycno(pycno_array, transform, crs, filestring, driver="GTiff"):
190 | """Saves a numpy array as a raster, largely a helper function for pycno
191 | Args:
192 | pycno_array (numpy array): 2D numpy array of pycnophylactic surface
193 | transform (rasterio geotransform): Relevant transform from pycno()
194 | crs (GeoPandas crs): Coordinate reference system of GeoDataFrame used in pycno()
195 | filestring (str): File path to save raster
196 | driver (str, optional): Format for output raster, default: geoTiff.
197 | Returns:
198 | None
199 | """
200 | import rasterio
201 |
202 | # Save raster
203 | new_dataset = rasterio.open(
204 | filestring,
205 | "w",
206 | driver=driver,
207 | height=pycno_array.shape[0],
208 | width=pycno_array.shape[1],
209 | count=1,
210 | dtype="float64",
211 | crs=crs,
212 | transform=transform,
213 | )
214 | new_dataset.write(pycno_array.astype("float64"), 1)
215 | new_dataset.close()
216 |
217 | return None
218 |
219 |
220 | def extract_values(pycno_array, gdf, transform, fieldname="Estimate"):
221 | """Extract raster value sums according to a provided polygon geodataframe
222 | Args:
223 | pycno_array (numpy array): 2D numpy array of pycnophylactic surface.
224 | gdf (geopandas.geodataframe.GeoDataFrame): Target GeoDataFrame.
225 | transform (rasterio geotransform): Relevant transform from pycno()
226 | fieldname (str, optional): New gdf field to save estimates in. Default name: 'Estimate'.
227 | Returns:
228 | pandas.Series: Estimated value sums for each target geometry, indexed to match gdf.
229 | """
230 | from numpy import nansum
231 | from rasterio.features import geometry_mask
232 |
233 | estimates = []
234 | # Iterate through geodataframe and extract values
235 | for idx, geom in gdf["geometry"].items():
236 | mask = geometry_mask(
237 | [geom], pycno_array.shape, transform=transform, invert=True
238 | )
239 | estimates.append(nansum(pycno_array[mask]))
240 | out = pd.Series(estimates, index=gdf.index)
241 | return out
242 |
243 |
244 | def pycno_interpolate(
245 | source_df,
246 | target_df,
247 | variables,
248 | cellsize,
249 | r=0.2,
250 | handle_null=True,
251 | converge=3,
252 | verbose=False,
253 | ):
254 | """Pycnophylactic Inerpolation.
255 |
256 | Parameters
257 | ----------
258 | source_df : geopandas.GeoDataFrame (required)
259 | geodataframe with polygon geometries and data to transfer
260 | target_df : geopandas.GeoDataFrame (required)
261 | geodataframe with polygon geometries to receive new data
262 | variables : list
263 | columns on the source_df containing data to transfer
264 | cellsize : int
265 | Pixel size of intermediate raster in planar units (e.g. metres, feet)
266 | r : float, optional
267 | Relaxation parameter, default of 0.2 is generally fine
268 | handle_null : bool, optional
269 | Changes how nodata values are smoothed. Default True.
270 | converge : int, optional
271 | Index for stopping value, default 3 is generally fine.
272 | verbose : bool, optional
273 | Print out progress at each iteration.
274 |
275 | Returns
276 | -------
277 | geopandas.GeoDataFrame
278 | new geodataframe with interpolated variables as columns and target_df geometry
279 | as output geometry
280 |
281 | Notes
282 | -----
283 | The formula is based on Tobler, W. R. (1979). Smooth pycnophylactic interpolation for geographical regions. Journal of the American Statistical Association, 74(367), 519–529. https://doi.org/10.1080/01621459.1979.10481647
284 |
285 | Original implementation written by @danlewis85 at
286 | and based in part on the R pycno package by Chris Brunsdon ()
287 |
288 | References: :cite:`tobler_smooth_1979`
289 | """
290 | assert source_df.crs.equals(
291 | target_df.crs
292 | ), "source_df CRS and target_df CRS are not the same. Reproject into consistent systems before proceeding"
293 | output_vars = target_df.copy()[[target_df.geometry.name]]
294 | for variable in variables:
295 | pyc, trans, _ = pycno(
296 | source_df,
297 | variable,
298 | cellsize=cellsize,
299 | r=r,
300 | handle_null=handle_null,
301 | converge=converge,
302 | verbose=verbose,
303 | )
304 | vals = extract_values(pyc, target_df, transform=trans)
305 | output_vars[variable] = vals
306 |
307 | return output_vars
308 |
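Editor's note: a sketch of `pycno_interpolate` following tobler/tests/test_pycno.py; both layers must be in a projected CRS because `cellsize` is expressed in planar units.

    from tobler.pycno import pycno_interpolate

    pyc = pycno_interpolate(
        source_df=sac1,         # projected source polygons carrying TOT_POP
        target_df=sac2,         # projected target polygons
        variables=["TOT_POP"],
        cellsize=500,           # 500 m pixels for the intermediate raster
    )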
--------------------------------------------------------------------------------
/tobler/tests/test_area_interpolators.py:
--------------------------------------------------------------------------------
1 | """test interpolation functions."""
2 |
3 | import geopandas
4 | import dask_geopandas
5 |
6 | from libpysal.examples import load_example
7 | from numpy.testing import assert_almost_equal
8 | from tobler.area_weighted import area_interpolate
9 | from tobler.area_weighted import area_interpolate_dask
10 | from tobler.area_weighted.area_interpolate import _area_tables_binning
11 | from geopandas.testing import assert_geodataframe_equal
12 | import pytest
13 |
14 |
15 | def datasets():
16 | sac1 = load_example("Sacramento1")
17 | sac2 = load_example("Sacramento2")
18 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
19 | sac1 = sac1.to_crs(sac1.estimate_utm_crs())
20 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp"))
21 | sac2 = sac2.to_crs(sac1.crs)
22 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT
23 | categories = ["cat", "dog", "donkey", "wombat", "capybara"]
24 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[: len(sac1)]
25 |
26 | return sac1, sac2
27 |
28 |
29 | def test_area_interpolate_singlecore():
30 | sac1, sac2 = datasets()
31 | area = area_interpolate(
32 | source_df=sac1,
33 | target_df=sac2,
34 | extensive_variables=["TOT_POP"],
35 | intensive_variables=["pct_poverty"],
36 | categorical_variables=["animal"],
37 | n_jobs=1,
38 | )
39 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
40 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
41 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
42 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
43 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)
44 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0)
45 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
46 |
47 |
48 | def test_area_interpolate_extensive():
49 | sac1, sac2 = datasets()
50 | area = area_interpolate(
51 | source_df=sac1.to_crs(4326), # trigger warning once
52 | target_df=sac2.to_crs(4326),
53 | extensive_variables=["TOT_POP"],
54 | n_jobs=1,
55 | )
56 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
57 |
58 |
59 | def test_area_interpolate_intensive():
60 | sac1, sac2 = datasets()
61 | area = area_interpolate(
62 | source_df=sac1,
63 | target_df=sac2,
64 | intensive_variables=["pct_poverty"],
65 | n_jobs=1,
66 | )
67 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
68 |
69 |
70 | def test_area_interpolate_categorical():
71 | sac1, sac2 = datasets()
72 | area = area_interpolate(
73 | source_df=sac1,
74 | target_df=sac2,
75 | extensive_variables=["TOT_POP"],
76 | intensive_variables=["pct_poverty"],
77 | categorical_variables=["animal"],
78 | n_jobs=1,
79 | )
80 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
81 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
82 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)
83 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0)
84 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
85 |
86 |
87 | @pytest.mark.xfail(reason="dask_geopandas is broken with dask-expr backend")
88 | def test_area_interpolate_categorical_dask():
89 | sac1, sac2 = datasets()
90 | sac1["animal"] = sac1["animal"].astype("category")
91 | dsac1 = dask_geopandas.from_geopandas(sac1, npartitions=2).spatial_shuffle(
92 | by="hilbert", shuffle="tasks"
93 | )
94 | dsac2 = dask_geopandas.from_geopandas(sac2, npartitions=2).spatial_shuffle(
95 | by="hilbert", shuffle="tasks"
96 | )
97 | area = area_interpolate_dask(
98 | source_dgdf=dsac1,
99 | target_dgdf=dsac2,
100 | id_col="ZIP",
101 | categorical_variables=["animal"],
102 | ).compute()
103 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
104 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
105 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)
106 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0)
107 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
108 |
109 |
110 | def test_area_interpolate_custom_index():
111 | sac1, sac2 = datasets()
112 | sac1.index = sac1.index * 2
113 | sac2.index = sac2.index * 13
114 | area = area_interpolate(
115 | source_df=sac1,
116 | target_df=sac2,
117 | extensive_variables=["TOT_POP"],
118 | intensive_variables=["pct_poverty"],
119 | categorical_variables=["animal"],
120 | )
121 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
122 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
123 | assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
124 | assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
125 | assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)
126 | assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0)
127 | assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
128 | assert not area.isna().any().any()
129 |
130 |
131 | def test_area_interpolate_sindex_options():
132 | sac1, sac2 = datasets()
133 | auto = area_interpolate(
134 | source_df=sac1,
135 | target_df=sac2,
136 | extensive_variables=["TOT_POP"],
137 | intensive_variables=["pct_poverty"],
138 | )
139 | source = area_interpolate(
140 | source_df=sac1,
141 | target_df=sac2,
142 | extensive_variables=["TOT_POP"],
143 | intensive_variables=["pct_poverty"],
144 | spatial_index="source",
145 | )
146 | target = area_interpolate(
147 | source_df=sac1,
148 | target_df=sac2,
149 | extensive_variables=["TOT_POP"],
150 | intensive_variables=["pct_poverty"],
151 | spatial_index="target",
152 | )
153 |
154 | assert_geodataframe_equal(auto, source)
155 | assert_geodataframe_equal(auto, target)
156 |
157 | with pytest.raises(ValueError):
158 | area_interpolate(
159 | source_df=sac1,
160 | target_df=sac2,
161 | extensive_variables=["TOT_POP"],
162 | intensive_variables=["pct_poverty"],
163 | spatial_index="non-existent",
164 | )
165 |
166 |
167 | def test_area_interpolate_parallel():
168 | sac1, sac2 = datasets()
169 | area = area_interpolate(
170 | source_df=sac1,
171 | target_df=sac2,
172 | extensive_variables=["TOT_POP"],
173 | intensive_variables=["pct_poverty"],
174 | n_jobs=-1,
175 | )
176 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
177 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
178 |
179 |
180 | def test_area_tables_binning():
181 | sac1, sac2 = datasets()
182 | sac1 = sac1.to_crs(4326)
183 | sac2 = sac2.to_crs(4326)
184 |
185 | auto = _area_tables_binning(
186 | source_df=sac1, target_df=sac2, spatial_index="auto"
187 | )
188 | source = _area_tables_binning(
189 | source_df=sac1, target_df=sac2, spatial_index="source"
190 | )
191 | target = _area_tables_binning(
192 | source_df=sac1, target_df=sac2, spatial_index="target"
193 | )
194 |
195 | assert (auto != source).sum() == 0
196 | assert (auto != target).sum() == 0
197 |
198 | assert auto.sum() == pytest.approx(1.3879647)
199 | assert auto.mean() == pytest.approx(2.7552649e-05)
200 |
201 | assert (auto[5][0].toarray() > 0).sum() == 7
202 |
203 |
204 | def test_passed_table():
205 | sac1, sac2 = datasets()
206 | csr = _area_tables_binning(source_df=sac1, target_df=sac2, spatial_index="auto")
207 |
208 | area = area_interpolate(
209 | source_df=sac1,
210 | target_df=sac2,
211 | extensive_variables=["TOT_POP"],
212 | intensive_variables=["pct_poverty"],
213 | table=csr,
214 | )
215 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
216 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
217 |
218 | dok = csr.todok()
219 |
220 | area = area_interpolate(
221 | source_df=sac1,
222 | target_df=sac2,
223 | extensive_variables=["TOT_POP"],
224 | intensive_variables=["pct_poverty"],
225 | table=dok,
226 | )
227 | assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
228 | assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
229 |
--------------------------------------------------------------------------------
/tobler/tests/test_area_join.py:
--------------------------------------------------------------------------------
1 | import geopandas as gpd
2 | import numpy as np
3 | from shapely.geometry import Point
4 |
5 | import pytest
6 |
7 | from tobler.area_weighted import area_join
8 |
9 |
10 | class TestAreaJoin:
11 | def setup_method(self):
12 | self.grid = gpd.points_from_xy(
13 | np.repeat(np.linspace(1, 10, 10), 10), np.tile(np.linspace(1, 10, 10), 10)
14 | ).buffer(0.5, cap_style=3)
15 | self.source = gpd.GeoDataFrame(
16 | {
17 | "floats": np.linspace(1, 10, 100),
18 | "ints": np.linspace(1, 100, 100, dtype="int"),
19 | "strings": np.array(["darribas", "is", "the", "king"] * 25),
20 | },
21 | geometry=self.grid,
22 | )
23 |
24 | self.target = gpd.GeoDataFrame(geometry=self.grid.translate(xoff=2.2, yoff=0.2))
25 |
26 | def test_area_join_float(self):
27 | result = area_join(self.source, self.target, "floats")
28 | assert (result.columns == ["geometry", "floats"]).all()
29 | np.testing.assert_almost_equal(result.floats.mean(), 6.409, 3)
30 | assert result.floats.dtype == float
31 | assert result.floats.isna().sum() == 20
32 |
33 | def test_area_join_ints(self):
34 | with pytest.warns(UserWarning, match="Cannot preserve dtype of"):
35 | result = area_join(self.source, self.target, "ints")
36 |
37 | assert (result.columns == ["geometry", "ints"]).all()
38 | np.testing.assert_almost_equal(result.ints.mean(), 60.5, 3)
39 | assert result.ints.dtype == object
40 | assert type(result.ints.iloc[0]) == int
41 | assert result.ints.isna().sum() == 20
42 |
43 | def test_area_join_strings(self):
44 | result = area_join(self.source, self.target, "strings")
45 | assert (result.columns == ["geometry", "strings"]).all()
46 | assert result.strings.dtype == object
47 | assert type(result.strings.iloc[0]) == str
48 | assert result.strings.isna().sum() == 20
49 |
50 | def test_area_join_array(self):
51 | with pytest.warns(UserWarning, match="Cannot preserve dtype of"):
52 | result = area_join(self.source, self.target, ["floats", "ints", "strings"])
53 |
54 | assert (result.columns == ["geometry", "floats", "ints", "strings"]).all()
55 | np.testing.assert_almost_equal(result.floats.mean(), 6.409, 3)
56 | assert result.floats.dtype == float
57 | assert result.floats.isna().sum() == 20
58 | np.testing.assert_almost_equal(result.ints.mean(), 60.5, 3)
59 | assert result.ints.dtype == object
60 | assert type(result.ints.iloc[0]) == int
61 | assert result.ints.isna().sum() == 20
62 | assert result.strings.dtype == object
63 | assert type(result.strings.iloc[0]) == str
64 | assert result.strings.isna().sum() == 20
65 |
66 | def test_area_join_error(self):
67 | target = self.target
68 | target["floats"] = 0
69 | with pytest.raises(ValueError, match="Column 'floats'"):
70 | area_join(self.source, target, "floats")
71 |
--------------------------------------------------------------------------------
/tobler/tests/test_dasymetric.py:
--------------------------------------------------------------------------------
1 | """test interpolation functions."""
2 | import geopandas
3 |
4 | from libpysal.examples import load_example
5 | from tobler.dasymetric import masked_area_interpolate
6 |
7 |
8 | def datasets():
9 | sac1 = load_example("Sacramento1")
10 | sac2 = load_example("Sacramento2")
11 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
12 | sac1 = sac1.to_crs(sac1.estimate_utm_crs())
13 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp"))
14 | sac2 = sac2.to_crs(sac2.estimate_utm_crs())
15 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT
16 | categories = ["cat", "dog", "donkey", "wombat", "capybara"]
17 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[
18 | : len(sac1)
19 | ]
20 | return sac1, sac2
21 |
22 |
23 | def test_masked_area_interpolate():
24 | sac1, sac2 = datasets()
25 | masked = masked_area_interpolate(
26 | source_df=sac1,
27 | target_df=sac2,
28 | extensive_variables=["TOT_POP"],
29 | intensive_variables=["pct_poverty"],
30 | raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif",
31 | pixel_values=[21, 22, 23, 24],
32 | )
33 | assert masked.TOT_POP.sum().round(0) == sac1.TOT_POP.sum()
34 | assert masked.pct_poverty.sum() > 2000
35 |
--------------------------------------------------------------------------------
/tobler/tests/test_model.py:
--------------------------------------------------------------------------------
1 | """test interpolation functions."""
2 | import geopandas
3 |
4 | from libpysal.examples import load_example
5 |
6 | from tobler.model import glm
7 |
8 |
9 | def datasets():
10 | sac1 = load_example("Sacramento1")
11 | sac2 = load_example("Sacramento2")
12 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
13 | sac1 = sac1.to_crs(sac1.estimate_utm_crs())
14 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp"))
15 | sac2 = sac2.to_crs(sac2.estimate_utm_crs())
16 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT
17 | categories = ["cat", "dog", "donkey", "wombat", "capybara"]
18 | sac1["animal"] = (categories * ((len(sac1) // len(categories)) + 1))[
19 | : len(sac1)
20 | ]
21 |
22 | return sac1, sac2
23 |
24 |
25 | def test_glm_poisson():
26 | sac1, sac2 = datasets()
27 | glm_poisson = glm(
28 | source_df=sac2, target_df=sac1, variable="POP2001", raster="https://spatial-ucr.s3.amazonaws.com/nlcd/landcover/nlcd_landcover_2011.tif",
29 | )
30 | assert glm_poisson.POP2001.sum() > 1469000
31 |
--------------------------------------------------------------------------------
/tobler/tests/test_pycno.py:
--------------------------------------------------------------------------------
1 | """test interpolation functions."""
2 | import geopandas
3 |
4 | from libpysal.examples import load_example
5 | from numpy.testing import assert_almost_equal
6 | from tobler.pycno import pycno_interpolate
7 |
8 |
9 | def datasets():
10 | sac1 = load_example("Sacramento1")
11 | sac2 = load_example("Sacramento2")
12 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
13 | sac2 = geopandas.read_file(sac2.get_path("SacramentoMSA2.shp"))
14 | sac1 = sac1.to_crs(sac1.estimate_utm_crs())
15 | sac2 = sac2.to_crs(sac1.crs)
16 | sac1["pct_poverty"] = sac1.POV_POP / sac1.POV_TOT
17 |
18 | return sac1, sac2
19 |
20 |
21 | def test_pycno_interpolate():
22 | sac1, sac2 = datasets()
23 | pyc = pycno_interpolate(
24 | source_df=sac1, target_df=sac2, variables=["TOT_POP"], cellsize=500
25 | )
26 | assert_almost_equal(pyc.TOT_POP.sum(), 1794618.503, decimal=1)
27 |
28 | def test_custom_index():
29 | sac1, sac2 = datasets()
30 | sac2 = sac2.set_index("ZIP")
31 | pyc = pycno_interpolate(
32 | source_df=sac1, target_df=sac2, variables=["TOT_POP"], cellsize=500
33 | )
34 | assert_almost_equal(pyc.TOT_POP.sum(), 1794618.503, decimal=1)
--------------------------------------------------------------------------------
/tobler/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | """test utility functions."""
2 |
3 | import platform
4 |
5 | import geopandas
6 | import pytest
7 | from libpysal.examples import load_example
8 | from numpy.testing import assert_almost_equal
9 |
10 | from tobler.util import h3fy
11 |
12 |
13 | def test_h3fy():
14 | sac1 = load_example("Sacramento1")
15 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
16 | sac_hex = h3fy(sac1, return_geoms=True)
17 | assert sac_hex.shape == (364, 1)
18 |
19 |
20 | def test_h3fy_nogeoms():
21 | sac1 = load_example("Sacramento1")
22 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
23 | sac_hex = h3fy(sac1, return_geoms=False)
24 | assert len(sac_hex) == 364
25 |
26 |
27 | def test_h3fy_nocrs():
28 | sac1 = load_example("Sacramento1")
29 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
30 | sac1.crs = None
31 | with pytest.raises(ValueError):
32 | h3fy(sac1, return_geoms=True)
35 |
36 |
37 | def test_h3fy_diff_crs():
38 | sac1 = load_example("Sacramento1")
39 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
40 | sac1 = sac1.to_crs(32710)
41 | sac_hex = h3fy(sac1)
42 | assert sac_hex.shape == (364, 1)
43 | assert sac_hex.crs.to_string() == "EPSG:32710"
44 |
45 |
46 | def test_h3fy_clip():
47 | sac1 = load_example("Sacramento1")
48 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
49 | sac_hex = h3fy(sac1, clip=True)
50 | sac_hex = sac_hex.to_crs(sac_hex.estimate_utm_crs())
51 | assert_almost_equal(
52 | sac_hex.area.sum(), 13131736346.537422, decimal=0
53 | )
54 |
55 | def test_h3fy_clip_buffer():
56 | sac1 = load_example("Sacramento1")
57 | sac1 = geopandas.read_file(sac1.get_path("sacramentot2.shp"))
58 | sac_hex = h3fy(sac1, clip=True, buffer=True)
59 | sac_hex = sac_hex.to_crs(sac_hex.estimate_utm_crs())
60 | sac1 = sac1.to_crs(sac_hex.estimate_utm_crs())
61 | assert_almost_equal(
62 | sac_hex.area.sum(), sac1.area.sum(), decimal=-8
63 | )
64 |
65 | @pytest.mark.skipif(platform.system() == "Windows", reason='Unknown precision error on Windows. See #174 for details')
66 | def test_h3_multipoly():
67 | va = geopandas.read_file(load_example("virginia").get_path("virginia.shp"))
68 | va = va.to_crs(va.estimate_utm_crs())
69 |
70 | va = h3fy(va)
71 | assert_almost_equal(va.area.sum(), 102888497504.47836, decimal=0)
72 |
--------------------------------------------------------------------------------
/tobler/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import *
2 |
3 | __all__ = ['h3fy', 'circumradius']
--------------------------------------------------------------------------------
/tobler/util/util.py:
--------------------------------------------------------------------------------
1 | """Useful functions to support tobler's interpolation methods."""
2 |
3 | from warnings import warn
4 |
5 | import geopandas
6 | import numpy as np
7 | import pandas
8 | import shapely
9 | from packaging.version import Version
10 | from shapely.geometry import Polygon
11 |
12 | GPD_10 = Version(geopandas.__version__) >= Version("1.0.0dev")
13 |
14 | __all__ = ["h3fy", "circumradius"]
15 |
16 |
17 | def circumradius(resolution):
18 | """Find the circumradius of an h3 hexagon at given resolution.
19 |
20 | Parameters
21 | ----------
22 | resolution : int
23 | h3 grid resolution
24 |
25 | Returns
26 | -------
27 | circumradius : float
28 | circumradius in meters
29 | """
30 | try:
31 | import h3
32 | except ImportError:
33 | raise ImportError(
34 | "This function requires the `h3` library. "
35 | "You can install it with `conda install h3-py` or "
36 | "`pip install h3`"
37 | )
38 | if Version(h3.__version__) < Version("4.0"):
39 | return h3.edge_length(resolution, "m")
40 | return h3.average_hexagon_edge_length(resolution, "m")
41 |
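# Editor's note (a usage sketch, not part of the original module): h3fy() below
# relies on circumradius() to size buffers, but the helper is also exported in
# __all__ and can be called directly. The exact value depends on the installed
# h3 version, which exposes this measure through different functions (see above):
#
#     from tobler.util import circumradius
#     circumradius(6)  # average hexagon edge length in meters at resolution 6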
42 |
43 | def _check_crs(source_df, target_df):
44 | """check if crs is identical"""
45 | if not (source_df.crs == target_df.crs):
46 | print("Source and target dataframes have different crs. Please correct.")
47 | return False
48 | return True
49 |
50 |
51 | def _nan_check(df, column):
52 | """Check if variable has nan values.
53 |
54 | Warn and replace nan with 0.0.
55 | """
56 | values = df[column].values
57 | if np.any(np.isnan(values)):
58 | wherenan = np.isnan(values)
59 | values[wherenan] = 0.0
60 | warn(f"nan values in variable: {column}, replacing with 0")
61 | return values
62 |
63 |
64 | def _inf_check(df, column):
65 | """Check if variable has nan values.
66 |
67 | Warn and replace inf with 0.0.
68 | """
69 | values = df[column].values
70 | if np.any(np.isinf(values)):
71 | whereinf = np.isinf(values)
72 | values[whereinf] = 0.0
73 | warn(f"inf values in variable: {column}, replacing with 0")
74 | return values
75 |
76 |
77 | def _check_presence_of_crs(geoinput):
78 | """check if there is crs in the polygon/geodataframe"""
79 | if geoinput.crs is None:
80 | raise KeyError("Geodataframe must have a CRS set before using this function.")
81 |
82 |
83 | def h3fy(source, resolution=6, clip=False, buffer=False, return_geoms=True):
84 | """Generate a hexgrid geodataframe that covers the face of a source geodataframe.
85 |
86 | Parameters
87 | ----------
88 | source : geopandas.GeoDataFrame
89 | GeoDataFrame to transform into a hexagonal grid
90 | resolution : int, optional (default is 6)
91 | resolution of output h3 hexgrid.
92 | See the h3 documentation for more information on resolution levels.
93 | clip : bool, optional (default is False)
94 | if True, hexagons are clipped by the boundary of the source gdf. Otherwise,
95 | hexagons along the boundary will be left intact.
96 | buffer : bool, optional (default is False)
97 | if True, force hexagons to completely fill the interior of the source area.
98 | if False, (h3 default) may result in empty areas within the source area.
99 | return_geoms : bool, optional (default is True)
100 | whether to generate hexagon geometries as a geodataframe or simply return
101 | hex ids as a pandas.Series
102 |
103 | Returns
104 | -------
105 | pandas.Series or geopandas.GeoDataFrame
106 | if `return_geoms` is True, a geopandas.GeoDataFrame whose rows comprise a hexagonal h3 grid (indexed on h3 hex id).
107 | if `return_geoms` is False, a pandas.Series of h3 hexagon ids
108 | """
109 | try:
110 | import h3
111 | except ImportError as err:
112 | raise ImportError(
113 | "This function requires the `h3` library. "
114 | "You can install it with `conda install h3-py` or "
115 | "`pip install h3`"
116 | ) from err
117 | # h3 hexes only work on polygons, not multipolygons
118 | if source.crs is None:
119 | raise ValueError(
120 | "source geodataframe must have a valid CRS set before using this function"
121 | )
122 |
123 | orig_crs = source.crs
124 | clipper = source
125 |
126 | if source.crs.is_geographic:
127 | if buffer: # if CRS is geographic but user wants a buffer, we need to estimate
128 | warn(
129 | "The source geodataframe is stored in a geographic CRS. Falling back to estimated UTM zone "
130 | "to generate desired buffer. If this produces unexpected results, reproject the input data "
131 | "prior to using this function"
132 | )
133 | source = (
134 | source.to_crs(source.estimate_utm_crs())
135 | .buffer(circumradius(resolution))
136 | .to_crs(4326)
137 | )
138 |
139 | else: # if CRS is projected, we need lat/long
140 | if buffer:  # we can only convert between units we know
141 | crs_units = source.crs.to_dict().get("units")
142 | if crs_units not in ["m", "us-ft"]:
143 | raise ValueError(
144 | f"The CRS of source geodataframe uses an unknown measurement unit: `{crs_units}`. "
145 | "The `buffer` argument requires either a geographic CRS or a projected one measured "
146 | "in meters or feet (U.S.)"
147 | )
148 | clipper = source.to_crs(4326)
149 | distance = circumradius(resolution)
150 | if crs_units == "ft-us":
151 | distance = distance * 3.281
152 | source = source.buffer(distance).to_crs(4326)
153 | else:
154 | source = source.to_crs(4326)
155 |
156 | if GPD_10:
157 | source_unary = shapely.force_2d(source.union_all())
158 | else:
159 | source_unary = shapely.force_2d(source.unary_union)
160 |
161 | if isinstance(source_unary, Polygon):
162 | hexagons = _to_hex(
163 | source_unary, resolution=resolution, return_geoms=return_geoms
164 | )
165 | else:
166 | output = []
167 | for geom in source_unary.geoms:
168 | hexes = _to_hex(geom, resolution=resolution, return_geoms=return_geoms)
169 | output.append(hexes)
170 | hexagons = pandas.concat(output)
171 |
172 | if return_geoms and clip:
173 | hexagons = geopandas.clip(hexagons, clipper)
174 |
175 | if return_geoms and not hexagons.crs.equals(orig_crs):
176 | hexagons = hexagons.to_crs(orig_crs)
177 |
178 | return hexagons
179 |
180 |
181 | def _to_hex(source, resolution=6, return_geoms=True):
182 | """Generate a hexgrid geodataframe that covers the face of a source geometry.
183 |
184 | Parameters
185 | ----------
186 | source : geometry
187 | geometry to transform into a hexagonal grid (needs to support __geo_interface__)
188 | resolution : int, optional (default is 6)
189 | resolution of output h3 hexgrid.
190 | See the h3 documentation for more information on resolution levels.
191 | return_geoms : bool, optional (default is True)
192 | whether to generate hexagon geometries as a geodataframe or simply return
193 | hex ids as a pandas.Series
194 |
195 | Returns
196 | -------
197 | pandas.Series or geopandas.GeoDataFrame
198 | if `return_geoms` is True, a geopandas.GeoDataFrame whose rows comprise a hexagonal h3 grid (indexed on h3 hex id).
199 | if `return_geoms` is False, a pandas.Series of h3 hexagon ids
200 | """
201 | try:
202 | import h3
203 | except ImportError as err:
204 | raise ImportError(
205 | "This function requires the `h3` library. "
206 | "You can install it with `conda install h3-py` or "
207 | "`pip install h3`"
208 | ) from err
209 |
210 | if Version(h3.__version__) >= Version("4.0"):
211 | polyfill = h3.geo_to_cells
212 | kwargs = {}
213 | else:
214 | polyfill = h3.polyfill
215 | kwargs = dict(geo_json_conformant=True)
216 |
217 | hexids = pandas.Series(
218 | list(polyfill(source.__geo_interface__, resolution, **kwargs)),
219 | name="hex_id",
220 | )
221 |
222 | if not return_geoms:
223 | return hexids
224 |
225 | if Version(h3.__version__) >= Version("4.0"):
226 | polys = hexids.apply(
227 | lambda hex_id: shapely.geometry.shape(h3.cells_to_geo([hex_id])),
228 | )
229 | else:
230 | polys = hexids.apply(
231 | lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)),
232 | )
233 |
234 | hexs = geopandas.GeoDataFrame(hexids, geometry=polys.values, crs=4326).set_index(
235 | "hex_id"
236 | )
237 |
238 | return hexs
239 |
--------------------------------------------------------------------------------
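A minimal usage sketch for the module above (an editorial addition, not part of the repository): it assumes the optional `h3` dependency is installed and reuses the Sacramento example data exercised in test_utils.py.

import geopandas
from libpysal.examples import load_example

from tobler.util import h3fy

# load the Sacramento tracts used throughout the test suite
sac = geopandas.read_file(load_example("Sacramento1").get_path("sacramentot2.shp"))

# build an h3 hexgrid covering the tracts and clip it to their boundary;
# the result is a GeoDataFrame indexed on hex_id, returned in the input CRS
hexes = h3fy(sac, resolution=6, clip=True)

# hex ids only, without geometries
hex_ids = h3fy(sac, return_geoms=False)

Passing buffer=True additionally expands the source geometries by one hexagon circumradius before polyfilling, so the grid fully covers the study area rather than leaving gaps along the edges.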