├── .github ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE.txt ├── README.md ├── code ├── 01-construct-models │ ├── 01-prep-ghsl.py │ ├── 02-download-cache.py │ └── 03-create-graphs.py ├── 02-attach-elevation │ ├── 01-aster-srtm │ │ ├── 01-download-aster_v3.py │ │ ├── 02-download-srtmgl1.py │ │ ├── 03-build-vrts.py │ │ └── 04-add-node-elevations.py │ └── 02-google │ │ ├── 01-cluster-nodes.py │ │ ├── 02-make-google-urls.py │ │ ├── 03-download-google-elevations.py │ │ └── 04-choose-best-elevation.py ├── 03-calculate-indicators │ ├── 01-calculate-node-bc.py │ ├── 02-calculate-indicators.py │ ├── 03-merge-indicators.py │ └── 04-create-metadata.py ├── 04-upload-repository │ ├── 01-save-files.py │ ├── 02-stage-files.py │ └── 03-upload-dataverse.py ├── config.json ├── environment.yml └── run.sh └── paper ├── README.md └── latex ├── main.tex └── references.bib /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: weekly 8 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | 4 | on: # yamllint disable-line rule:truthy 5 | push: 6 | branches: [main] 7 | pull_request: 8 | branches: [main] 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | name: ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest] 19 | 20 | defaults: 21 | run: 22 | shell: bash -elo pipefail {0} 23 | 24 | steps: 25 | - name: Checkout repo 26 | uses: actions/checkout@v4 27 | 28 | - name: Create environment with Micromamba 29 | uses: mamba-org/setup-micromamba@v2 30 | with: 31 | cache-environment: true 32 | environment-file: ./code/environment.yml 33 | post-cleanup: none 34 | 35 | - name: Cache pre-commit 36 | uses: actions/cache@v4 37 | with: 38 | path: ~/.cache/pre-commit/ 39 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 40 | 41 | - name: Run pre-commit checks 42 | run: pre-commit run --all-files 43 | env: 44 | SKIP: no-commit-to-branch 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb 2 | *.pdf 3 | .DS_Store 4 | keys.py 5 | paper/analysis 6 | paper/submission 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # UV 105 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | #uv.lock 109 | 110 | # poetry 111 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 112 | # This is especially recommended for binary packages to ensure reproducibility, and is more 113 | # commonly ignored for libraries. 114 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 115 | #poetry.lock 116 | 117 | # pdm 118 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 119 | #pdm.lock 120 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 121 | # in version control. 122 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 123 | .pdm.toml 124 | .pdm-python 125 | .pdm-build/ 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 175 | #.idea/ 176 | 177 | # PyPI configuration file 178 | .pypirc 179 | 180 | ## Core latex/pdflatex auxiliary files: 181 | *.aux 182 | *.lof 183 | *.log 184 | *.lot 185 | *.fls 186 | *.out 187 | *.toc 188 | *.fmt 189 | *.fot 190 | *.cb 191 | *.cb2 192 | .*.lb 193 | 194 | ## Intermediate documents: 195 | *.dvi 196 | *.xdv 197 | *-converted-to.* 198 | # these rules might exclude image files for figures etc. 199 | # *.ps 200 | # *.eps 201 | # *.pdf 202 | 203 | ## Generated if empty string is given at "Please type another file name for output:" 204 | .pdf 205 | 206 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 207 | *.bbl 208 | *.bcf 209 | *.blg 210 | *-blx.aux 211 | *-blx.bib 212 | *.run.xml 213 | 214 | ## Build tool auxiliary files: 215 | *.fdb_latexmk 216 | *.synctex 217 | *.synctex(busy) 218 | *.synctex.gz 219 | *.synctex.gz(busy) 220 | *.pdfsync 221 | 222 | ## Build tool directories for auxiliary files 223 | # latexrun 224 | latex.out/ 225 | 226 | ## Auxiliary and intermediate files from other packages: 227 | # algorithms 228 | *.alg 229 | *.loa 230 | 231 | # achemso 232 | acs-*.bib 233 | 234 | # amsthm 235 | *.thm 236 | 237 | # beamer 238 | *.nav 239 | *.pre 240 | *.snm 241 | *.vrb 242 | 243 | # changes 244 | *.soc 245 | 246 | # comment 247 | *.cut 248 | 249 | # cprotect 250 | *.cpt 251 | 252 | # elsarticle (documentclass of Elsevier journals) 253 | *.spl 254 | 255 | # endnotes 256 | *.ent 257 | 258 | # fixme 259 | *.lox 260 | 261 | # feynmf/feynmp 262 | *.mf 263 | *.mp 264 | *.t[1-9] 265 | *.t[1-9][0-9] 266 | *.tfm 267 | 268 | #(r)(e)ledmac/(r)(e)ledpar 269 | *.end 270 | *.?end 271 | *.[1-9] 272 | *.[1-9][0-9] 273 | *.[1-9][0-9][0-9] 274 | *.[1-9]R 275 | *.[1-9][0-9]R 276 | *.[1-9][0-9][0-9]R 277 | *.eledsec[1-9] 278 | *.eledsec[1-9]R 279 | *.eledsec[1-9][0-9] 280 | *.eledsec[1-9][0-9]R 281 | *.eledsec[1-9][0-9][0-9] 282 | *.eledsec[1-9][0-9][0-9]R 283 | 284 | # glossaries 285 | *.acn 286 | *.acr 287 | *.glg 288 | *.glo 289 | *.gls 290 | *.glsdefs 291 | *.lzo 292 | *.lzs 293 | *.slg 294 | *.slo 295 | *.sls 296 | 297 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
298 | # *.ist 299 | 300 | # gnuplot 301 | *.gnuplot 302 | *.table 303 | 304 | # gnuplottex 305 | *-gnuplottex-* 306 | 307 | # gregoriotex 308 | *.gaux 309 | *.glog 310 | *.gtex 311 | 312 | # htlatex 313 | *.4ct 314 | *.4tc 315 | *.idv 316 | *.lg 317 | *.trc 318 | *.xref 319 | 320 | # hyperref 321 | *.brf 322 | 323 | # knitr 324 | *-concordance.tex 325 | # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files 326 | # *.tikz 327 | *-tikzDictionary 328 | 329 | # listings 330 | *.lol 331 | 332 | # luatexja-ruby 333 | *.ltjruby 334 | 335 | # makeidx 336 | *.idx 337 | *.ilg 338 | *.ind 339 | 340 | # minitoc 341 | *.maf 342 | *.mlf 343 | *.mlt 344 | *.mtc[0-9]* 345 | *.slf[0-9]* 346 | *.slt[0-9]* 347 | *.stc[0-9]* 348 | 349 | # minted 350 | _minted* 351 | *.pyg 352 | 353 | # morewrites 354 | *.mw 355 | 356 | # newpax 357 | *.newpax 358 | 359 | # nomencl 360 | *.nlg 361 | *.nlo 362 | *.nls 363 | 364 | # pax 365 | *.pax 366 | 367 | # pdfpcnotes 368 | *.pdfpc 369 | 370 | # sagetex 371 | *.sagetex.sage 372 | *.sagetex.py 373 | *.sagetex.scmd 374 | 375 | # scrwfile 376 | *.wrt 377 | 378 | # svg 379 | svg-inkscape/ 380 | 381 | # sympy 382 | *.sout 383 | *.sympy 384 | sympy-plots-for-*.tex/ 385 | 386 | # pdfcomment 387 | *.upa 388 | *.upb 389 | 390 | # pythontex 391 | *.pytxcode 392 | pythontex-files-*/ 393 | 394 | # tcolorbox 395 | *.listing 396 | 397 | # thmtools 398 | *.loe 399 | 400 | # TikZ & PGF 401 | *.dpth 402 | *.md5 403 | *.auxlock 404 | 405 | # titletoc 406 | *.ptc 407 | 408 | # todonotes 409 | *.tdo 410 | 411 | # vhistory 412 | *.hst 413 | *.ver 414 | 415 | # easy-todo 416 | *.lod 417 | 418 | # xcolor 419 | *.xcp 420 | 421 | # xmpincl 422 | *.xmpi 423 | 424 | # xindy 425 | *.xdy 426 | 427 | # xypic precompiled matrices and outlines 428 | *.xyc 429 | *.xyd 430 | 431 | # endfloat 432 | *.ttt 433 | *.fff 434 | 435 | # Latexian 436 | TSWLatexianTemp* 437 | 438 | ## Editors: 439 | # WinEdt 440 | *.bak 441 | *.sav 442 | 443 | # Texpad 444 | .texpadtmp 445 | 446 | # LyX 447 | *.lyx~ 448 | 449 | # Kile 450 | *.backup 451 | 452 | # gummi 453 | .*.swp 454 | 455 | # KBibTeX 456 | *~[0-9]* 457 | 458 | # TeXnicCenter 459 | *.tps 460 | 461 | # auto folder when using emacs and auctex 462 | ./auto/* 463 | *.el 464 | 465 | # expex forward references with \gathertags 466 | *-tags.tex 467 | 468 | # standalone packages 469 | *.sta 470 | 471 | # Makeindex log files 472 | *.lpz 473 | 474 | # xwatermark package 475 | *.xwm 476 | 477 | # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib 478 | # option is specified. Footnotes are the stored in a file with suffix Notes.bib. 479 | # Uncomment the next line to have this generated file ignored. 
480 | #*Notes.bib 481 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: check-added-large-files 7 | args: [--maxkb=50] 8 | - id: check-ast 9 | - id: check-case-conflict 10 | - id: check-executables-have-shebangs 11 | - id: check-json 12 | - id: check-merge-conflict 13 | args: [--assume-in-merge] 14 | - id: check-shebang-scripts-are-executable 15 | - id: check-toml 16 | - id: check-xml 17 | - id: check-yaml 18 | - id: detect-private-key 19 | - id: end-of-file-fixer 20 | - id: fix-byte-order-marker 21 | - id: mixed-line-ending 22 | - id: no-commit-to-branch 23 | - id: pretty-format-json 24 | args: [--autofix] 25 | - id: trailing-whitespace 26 | 27 | - repo: https://github.com/adrienverge/yamllint 28 | rev: v1.37.0 29 | hooks: 30 | - id: yamllint 31 | args: 32 | - --strict 33 | - > 34 | -d={extends: default, rules: { 35 | quoted-strings: {quote-type: single, required: only-when-needed}}} 36 | 37 | - repo: https://github.com/astral-sh/ruff-pre-commit 38 | rev: v0.11.4 39 | hooks: 40 | - id: ruff 41 | args: 42 | - --fix 43 | - --line-length=100 44 | - --select=ALL 45 | - --ignore=ANN,BLE001,D,N803,N806,PD901,S,SLF001,T201,TRY002,TRY301 46 | - --no-cache 47 | - id: ruff-format 48 | args: 49 | - --line-length=100 50 | - --no-cache 51 | 52 | - repo: https://github.com/Lucas-C/pre-commit-hooks 53 | rev: v1.5.5 54 | hooks: 55 | - id: remove-tabs 56 | 57 | - repo: https://github.com/meliache/pre-commit-chktex 58 | rev: v0.2.2 59 | hooks: 60 | - id: chktex-conda 61 | args: [-H, -I] 62 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: "Urban Street Network Models and Measures" 3 | authors: 4 | - family-names: "Boeing" 5 | given-names: "Geoff" 6 | orcid: "https://orcid.org/0000-0003-1851-6411" 7 | website: "https://geoffboeing.com" 8 | url: "https://github.com/gboeing/street-network-models" 9 | repository-code: "https://github.com/gboeing/street-network-models" 10 | preferred-citation: 11 | type: report 12 | title: "Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World" 13 | authors: 14 | - family-names: "Boeing" 15 | given-names: "Geoff" 16 | orcid: "https://orcid.org/0000-0003-1851-6411" 17 | website: "https://geoffboeing.com" 18 | year: 2025 19 | url: "https://github.com/gboeing/street-network-models" 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2025 Geoff Boeing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Urban Street Network Models and Indicators 2 | 3 | This project uses [OSMnx](https://github.com/gboeing/osmnx) to model and analyze the street networks of every urban area in the world, then shares the results (models and indicators) in an open data [repository](https://dataverse.harvard.edu/dataverse/global-urban-street-networks) in the Harvard Dataverse. 4 | 5 | ## Citation 6 | 7 | Boeing, G. 2025. Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World. Working paper. https://github.com/gboeing/street-network-models 8 | 9 | ## Computing environment 10 | 11 | The following sections provide notes on reproducibility. Given the resource requirements, it's best to run the workflow on a high-performance computing cluster, but it's feasible to run it on a well-equipped personal computer. 12 | 13 | System requirements: 14 | 15 | - RAM/CPU: a minimum of 32 GB for single-threaded execution (note: you'll have to edit `config.json` to set the CPU counts to 1). 128 GB and 24 CPU cores are recommended for multithreaded execution as parameterized in the config file. 16 | - Disk space: 2 TB. 17 | - OS: agnostic, but this workflow was developed and tested on Linux. 18 | 19 | Runtime environment: create a new [conda](https://conda.io) environment from the `environment.yml` file to install all the packages needed to run the workflow. If you wish, you can install a Jupyter kernel in it, e.g., `python -m ipykernel install --user --name snm --display-name "Python (snm)"`. 20 | 21 | ## Input data 22 | 23 | Create a project data root folder with an `inputs` subfolder and place the unzipped [input data](https://drive.usercontent.google.com/download?id=1UrHub0mX0LwybpEOKmwHgEvUgrMj0C7y&export=download) in it. This project uses the Global Human Settlement Layer urban centers dataset to define the world's urban areas' boundary polygons, specifically its Urban Centre Database 2025: 24 | 25 | > Mari Rivero, Ines; Melchiorri, Michele; Florio, Pietro; Schiavina, Marcello; Goch, Katarzyna; Politis, Panagiotis; Uhl, Johannes; Pesaresi, Martino; Maffenini, Luca; Sulis, Patrizia; Crippa, Monica; Guizzardi, Diego; Pisoni, Enrico; Belis, Claudio; Jacome Felix Oom, Duarte; Branco, Alfredo; Mwaniki, Dennis; Kochulem, Edwin; Githira, Daniel; Carioli, Alessandra; Ehrlich, Daniele; Tommasi, Pierpaolo; Kemper, Thomas; Dijkstra, Lewis (2024): GHS-UCDB R2024A - GHS Urban Centre Database 2025. European Commission, Joint Research Centre (JRC) [Dataset] doi: 10.2905/1a338be6-7eaf-480c-9664-3a8ade88cbcd PID: http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd 26 | 27 | ## Workflow 28 | 29 | The workflow is organized into folders and scripts, as follows. 30 | 31 | ### 1. Construct models 32 | 33 | #### 1.1. Prep data
34 | 35 | Load the GHS urban centers dataset, retain the useful columns, and save it as a GeoPackage file. 36 | 37 | #### 1.2. Download cache 38 | 39 | Use OSMnx to download raw OSM data to a cache for subsequent parallel processing. 40 | 41 | #### 1.3. Create graphs 42 | 43 | Use the cached raw OSM data to construct a MultiDiGraph of each street network. This can be done in parallel with multiprocessing by changing the `cpus` config setting. Saves each graph to disk as a GraphML file. Parameterized to get only drivable streets, retain all components, simplify the topology, and truncate by edge. Does this for every urban center's polygon boundary if it meets the following conditions: 44 | 45 | - is marked with a "high" quality control score 46 | - has >1 km2 of built-up area 47 | - includes ≥3 nodes 48 | 49 | ### 2. Attach elevation 50 | 51 | This project uses three data sources for elevation: 52 | 53 | 1. [ASTERv3](https://www.earthdata.nasa.gov/data/instruments/aster) GDEM at 30 meter resolution 54 | 2. [SRTMGL1](https://www.earthdata.nasa.gov/news/nasa-shuttle-radar-topography-mission-srtm-version-30-global-1-arc-second-data-released-over) GDEM at 30 meter resolution with voids filled (version 3.0 global 1 arc second) 55 | 3. Google Maps Elevation API 56 | 57 | We use ASTER and SRTM to attach elevation data to each graph node in each model, then calculate edge grades. Both are public, free, open data. We use Google Maps elevation only as a validation dataset. 58 | 59 | A few notes: a previous iteration of this project used [CGIAR](https://srtm.csi.cgiar.org)'s post-processed SRTM v4.1, but they only provide SRTM data at 90 m resolution. The Google billing scheme is changing in March 2025, which may make Google elevation data collection at this scale infeasible in the future without substantial funding. Historically, each billing account got $200 of free usage credit each month and the price per HTTP request was $0.005, so you could make up to 200 / 0.005 = 40,000 free requests each month, within the usage limits of 512 locations per request and 6,000 requests per minute. URLs must be properly encoded to be valid and are limited to 16,384 characters for all web services. With three billing accounts, you could run this entire workflow for free once a month. 60 | 61 | #### 2.1. ASTER and SRTM 62 | 63 | ##### 2.1.1. Download ASTER 64 | 65 | Download each ASTER DEM tif file (requires NASA EarthData login credentials). 66 | 67 | ##### 2.1.2. Download SRTM 68 | 69 | Download each SRTM DEM hgt file (requires NASA EarthData login credentials). 70 | 71 | ##### 2.1.3. Build VRTs 72 | 73 | Build two VRT virtual raster files (one for all the ASTER files and one for all the SRTM files) for subsequent querying. 74 | 75 | ##### 2.1.4. Attach node elevations 76 | 77 | Load each GraphML file saved in step 1.3, add SRTM and ASTER elevation attributes to each node by querying the VRTs, then re-save the GraphML to disk. 78 | 79 | #### 2.2. Google Elevation 80 | 81 | ##### 2.2.1. Cluster nodes 82 | 83 | We want to send node coordinates to the elevation API in batches, but each batch needs to consist of (approximately) adjacent nodes because the Google API uses a smoothing function to estimate elevation. If the nodes are from different parts of the planet (or at different elevations), this smoothing will result in very coarse-grained approximations of individual nodes' elevations.
So, load all the node coordinates for each graph, spatially cluster them into equal-size clusters of up to 512 coordinates apiece, then save them as a CSV file. 84 | 85 | ##### 2.2.2. Make URLs 86 | 87 | Load the CSV file of node clusters and construct an API URL for each cluster, with a key (requires 3 Google API keys). 88 | 89 | ##### 2.2.3. Download Google elevations 90 | 91 | Request each URL and save the node ID and elevation to disk for all nodes. 92 | 93 | ##### 2.2.4. Choose best elevation 94 | 95 | Load each GraphML file and, for each node, select either the ASTER or SRTM value to use as the official node elevation, based on which is closer to the Google value (using Google as the tie-breaker). Then calculate all edge grades and add them as edge attributes. Re-save the graph to disk as GraphML. 96 | 97 | ### 3. Calculate stats 98 | 99 | #### 3.1. Calculate betweenness centrality 100 | 101 | Load each GraphML file and calculate length-weighted node betweenness centrality for all nodes, using igraph. 102 | 103 | #### 3.2. Calculate stats 104 | 105 | Load each saved graph's GraphML file. Calculate each stat as described in the metadata file. 106 | 107 | #### 3.3. Merge stats 108 | 109 | Merge the street network stats with the urban centers stats (from the GeoPackage file created in step 1.1). Save to disk with indicators named as described in the metadata file. 110 | 111 | #### 3.4. Create metadata 112 | 113 | Create metadata files for the graphs (node/edge attributes) and stats. 114 | 115 | ### 4. Upload repository 116 | 117 | #### 4.1. Generate files 118 | 119 | Save the graphs to disk as GeoPackages and node/edge list files. Then ensure we have what we expect: verify that we have the same number of countries for each file type, the same number of gpkg, graphml, and node/edge list files, and that the same set of country/city names exists across the gpkg, graphml, and node/edge lists. 120 | 121 | #### 4.2. Stage files 122 | 123 | Compress and zip all model files (GeoPackages, GraphML, node/edge lists) into a staging area for upload to Dataverse. 124 | 125 | #### 4.3. Upload to Dataverse 126 | 127 | Upload to Dataverse using its v1 [Native API](https://guides.dataverse.org/en/latest/api/native-api.html). First [log in](https://dataverse.harvard.edu) and create an API key if you don't have an active one (they expire annually). If this is a revision to existing datasets, create a draft dataset revision on the Dataverse (edit dataset > metadata > change something > save). Otherwise, if this is the first upload ever, create a new Dataverse and new empty datasets within it, structured like: 128 | 129 | - Global Urban Street Networks 130 | - Global Urban Street Networks GeoPackages 131 | - Global Urban Street Networks GraphML Files 132 | - Global Urban Street Networks Node/Edge Lists 133 | - Global Urban Street Networks Measures 134 | - Global Urban Street Networks Metadata 135 | 136 | Then run the script to upload all the repository files automatically to their respective datasets in the Dataverse (note: if this is a dataset *revision*, set `delete_existing = True` to first clear out all the carried-over files in the draft). Next, *manually* upload the indicators and metadata files to their respective datasets in the Dataverse. Finally, visit the Dataverse on the web to publish the draft.
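The upload script itself is not reproduced in this README, but the Native API call it relies on is simple. Below is a minimal sketch (not the repository's `03-upload-dataverse.py`): the server URL and endpoint come from the Dataverse Native API guide linked above, while the API token, dataset DOI, and staging path are placeholder assumptions you would replace with your own values.

```python
import json
from pathlib import Path

import requests

SERVER = "https://dataverse.harvard.edu"
API_TOKEN = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"  # placeholder: your Dataverse API key
DATASET_DOI = "doi:10.7910/DVN/EXAMPLE"  # placeholder: persistent ID of the target draft dataset


def upload_file(filepath: Path, dataset_doi: str = DATASET_DOI) -> dict:
    """Add one staged zip file to a draft dataset via the Native API."""
    url = f"{SERVER}/api/datasets/:persistentId/add"
    params = {"persistentId": dataset_doi}
    metadata = {"description": filepath.stem, "restrict": False}
    with filepath.open("rb") as f:
        # multipart form: the file itself plus a jsonData field of file metadata
        files = {"file": (filepath.name, f), "jsonData": (None, json.dumps(metadata))}
        response = requests.post(
            url,
            params=params,
            files=files,
            headers={"X-Dataverse-key": API_TOKEN},
            timeout=600,
        )
    response.raise_for_status()
    return response.json()


# placeholder staging folder: upload every zipped GraphML file to its dataset
for fp in sorted(Path("staging/graphml").glob("*.zip")):
    print(fp.name, upload_file(fp)["status"])
```

Because publishing the draft is still done manually on the web (the final step above), a failed or partial upload can be corrected before anything becomes public.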
137 | -------------------------------------------------------------------------------- /code/01-construct-models/01-prep-ghsl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import re 5 | import unicodedata 6 | from pathlib import Path 7 | 8 | import geopandas as gpd 9 | import osmnx as ox 10 | import pandas as pd 11 | 12 | # load configs 13 | with Path("./config.json").open() as f: 14 | config = json.load(f) 15 | 16 | fp = config["uc_input_path"] 17 | msg = f"Loading all layers from {fp!r}" 18 | print(ox.ts(), msg) 19 | 20 | # load all GHS urban centers dataset gpkg together into 1 gdf 21 | col_on = "ID_UC_G0" 22 | suffixes = ("", "_DROP") 23 | layers = list(gpd.list_layers(fp)["name"]) 24 | ucs = gpd.read_file(fp, layer=layers[0]) 25 | for layer in layers[1:]: 26 | ucs = ucs.merge( 27 | gpd.read_file(fp, layer=layer), 28 | left_on=col_on, 29 | right_on=col_on, 30 | how="inner", 31 | suffixes=suffixes, 32 | ) 33 | drop = [c for c in ucs.columns if "_DROP" in c] 34 | ucs = ucs.drop(columns=drop) 35 | 36 | # quality control checks 37 | assert ucs.index.is_unique 38 | assert ucs.columns.is_unique 39 | assert ucs.crs is not None 40 | 41 | # project to OSMnx's default CRS 42 | ucs = ucs.to_crs(ox.settings.default_crs) 43 | ucs["geometry"] = ucs.make_valid() 44 | print(ox.ts(), "Loaded urban centers data with shape", ucs.shape) 45 | 46 | # identify which columns to keep when saving to disk 47 | # comments from GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.pdf 48 | cols = [ 49 | "GC_PLS_SCR_2025", # plausibility score (quality control) 50 | "ID_UC_G0", # urban center ID 51 | "GC_UCN_MAI_2025", # name of main city inside urban center 52 | "GC_UCN_LIS_2025", # list of names of all cities inside urban center 53 | "GC_CNT_GAD_2025", # country name based on GADM dataset 54 | "country_iso", # country ISO 3166-1 alpha-3 code 55 | "GC_DEV_USR_2025", # UN SDG geographic region 56 | # population, area, density 57 | "GC_POP_TOT_2025", # total population (inhabitants) inside urban center 58 | "GC_UCA_KM2_2025", # urban center area in km^2 59 | "GH_BUS_TOT_2025", # total built-up area m^2 60 | "GH_BPC_TOT_2025", # total built-up area per-capita (m^2/person) 61 | "GH_BUH_AVG_2020", # average height of built surfaces (m) at 100m res 62 | # economic development 63 | "SC_SEC_GDP_2020", # total GDP PPP (real? 
USD) 64 | "GC_DEV_WIG_2025", # world bank income group 65 | "SC_SEC_HDI_2020", # human development index at subnational level 66 | # pollution emission and concentration 67 | "EM_CO2_TRA_2022", # total CO2 emissions in transport sector (ton/year) 68 | "EM_PM2_TRA_2022", # total PM2.5 emissions in transport sector (ton/year) 69 | "EM_PM2_CON_2020", # pop-weighted average PM2.5 concentrations (μg/m^3) 70 | # climate/land use 71 | "CL_KOP_CUR_2025", # Köppen-Geiger classification of majority of surface 72 | "GE_ELV_AVG_2025", # average elevation (m) 73 | "CL_B12_CUR_2010", # average annual precipitation in the decade (mm/year) 74 | "CL_B01_CUR_2010", # annual mean temperature in the decade (°C) 75 | "SD_POP_HGR_2025", # share of pop living in area of high greenness 76 | "SD_LUE_LPR_2000_2020", # land use efficiency = land consump rate / pop growth rate 77 | "geometry", # urban center geometry 78 | ] 79 | 80 | # only retain urban centers with >1 sq km of built-up area 81 | # drops 943 out of 11422 rows (8.3%) 82 | sq_km = 1e6 # meters 83 | ucs = ucs[ucs["GH_BUS_TOT_2025"] > sq_km] 84 | 85 | # only retain urban centers with a "high" quality control score 86 | # drops 127 out of 10479 rows (1.2%) 87 | ucs = ucs[ucs["GC_PLS_SCR_2025"] == "High"] 88 | 89 | # convert columns to int where needed 90 | cols_int = ["GC_POP_TOT_2025", "SC_SEC_GDP_2020"] 91 | ucs[cols_int] = ucs[cols_int].astype(int) 92 | 93 | # add country ISO column from lookup table 94 | iso = pd.read_csv(config["iso_codes_path"]).set_index("name")["alpha3"].to_dict() 95 | ucs["country_iso"] = ucs["GC_CNT_GAD_2025"].replace(iso) 96 | assert pd.notna(ucs["country_iso"]).all() 97 | 98 | 99 | regex = re.compile("[^0-9a-zA-Z]+") 100 | 101 | 102 | def clean_str(s, regex=regex): 103 | # clean up name/country for file naming: get ASCII representation and make 104 | # everything just lowercase letters and underscores. 
if normalized name is 105 | # null, empty string, or 1+ whitespaces then rename it to "unnamed" 106 | try: 107 | norm = unicodedata.normalize("NFKD", s).encode("ascii", errors="ignore").decode() 108 | assert norm != "" 109 | assert set(norm) != {" "} 110 | except (AssertionError, TypeError): 111 | norm = "Unnamed" 112 | return regex.sub("_", norm).lower().strip("_") 113 | 114 | 115 | cols_lower = ["GC_UCN_MAI_2025", "GC_CNT_GAD_2025"] 116 | ucs[cols_lower] = ucs[cols_lower].map(clean_str) 117 | 118 | # save final dataset to disk 119 | ucs = ucs[cols] 120 | ucs.to_file(config["uc_gpkg_path"], driver="GPKG", encoding="utf-8") 121 | msg = f"Saved urban centers gpkg with shape {ucs.shape} at {config['uc_gpkg_path']!r}" 122 | print(ox.ts(), msg) 123 | -------------------------------------------------------------------------------- /code/01-construct-models/02-download-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import logging as lg 5 | import multiprocessing as mp 6 | import time 7 | from pathlib import Path 8 | 9 | import geopandas as gpd 10 | import osmnx as ox 11 | 12 | print(ox.ts(), "OSMnx version", ox.__version__) 13 | 14 | # hardcode CPU count to parallelize it without hammering Overpass server 15 | cpus = 3 16 | 17 | # load configs 18 | with Path("./config.json").open() as f: 19 | config = json.load(f) 20 | 21 | # configure OSMnx 22 | ox.settings.log_file = True 23 | ox.settings.log_console = False 24 | ox.settings.logs_folder = config["osmnx_log_path"] 25 | ox.settings.cache_folder = config["osmnx_cache_path"] 26 | ox.settings.use_cache = True 27 | ox.settings.cache_only_mode = True 28 | 29 | # configure queries 30 | network_type = "drive" 31 | retain_all = True 32 | simplify = True 33 | truncate_by_edge = True 34 | 35 | # load the prepped urban centers dataset 36 | uc_gpkg_path = config["uc_gpkg_path"] 37 | ucs = gpd.read_file(uc_gpkg_path).sort_values("GH_BUS_TOT_2025", ascending=True) 38 | msg = f"Loaded urban centers data with shape {ucs.shape} from {uc_gpkg_path!r}" 39 | print(ox.ts(), msg) 40 | 41 | 42 | def download_data(name, geometry) -> None: 43 | try: 44 | ox.graph_from_polygon( 45 | polygon=geometry, 46 | network_type=network_type, 47 | retain_all=retain_all, 48 | simplify=simplify, 49 | truncate_by_edge=truncate_by_edge, 50 | ) 51 | except ox._errors.CacheOnlyInterruptError: 52 | # error on success, because cache_only_mode is True 53 | print(ox.ts(), "Finished", name, flush=True) 54 | 55 | except Exception as e: 56 | ox.log(f'"{name}" failed: {e}', level=lg.ERROR) 57 | print(name, e) 58 | 59 | 60 | names = ucs["country_iso"] + "-" + ucs["GC_UCN_MAI_2025"] + "-" + ucs["ID_UC_G0"].astype(str) 61 | args = zip(names, ucs["geometry"]) 62 | 63 | print(ox.ts(), f"Downloading {len(ucs):,} graphs' data using {cpus} CPUs") 64 | start_time = time.time() 65 | 66 | with mp.get_context().Pool(cpus) as pool: 67 | pool.starmap_async(download_data, args).get() 68 | 69 | elapsed = time.time() - start_time 70 | msg = f"Finished caching data for {len(ucs):,} graphs in {elapsed:,.0f} seconds" 71 | print(ox.ts(), msg) 72 | -------------------------------------------------------------------------------- /code/01-construct-models/03-create-graphs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import logging as lg 5 | import multiprocessing as mp 6 | import time 7 | from pathlib import Path 8 | 9 | import geopandas 
as gpd 10 | import osmnx as ox 11 | 12 | print(ox.ts(), "OSMnx version", ox.__version__) 13 | 14 | # load configs 15 | with Path("./config.json").open() as f: 16 | config = json.load(f) 17 | 18 | # configure OSMnx 19 | ox.settings.log_file = True 20 | ox.settings.log_console = False 21 | ox.settings.logs_folder = config["osmnx_log_path"] 22 | ox.settings.cache_folder = config["osmnx_cache_path"] 23 | ox.settings.use_cache = True 24 | 25 | # configure queries 26 | network_type = "drive" 27 | retain_all = True 28 | simplify = True 29 | truncate_by_edge = True 30 | 31 | # configure multiprocessing 32 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 33 | 34 | # load the prepped urban centers dataset 35 | uc_gpkg_path = config["uc_gpkg_path"] 36 | ucs = gpd.read_file(uc_gpkg_path).sort_values("GH_BUS_TOT_2025", ascending=False) 37 | msg = f"Loaded urban centers data with shape {ucs.shape} from {uc_gpkg_path!r}" 38 | print(ox.ts(), msg) 39 | 40 | 41 | def get_graph(uc, root) -> None: 42 | try: 43 | country_folder = f"{uc['GC_CNT_GAD_2025']}-{uc['country_iso']}" 44 | uc_filename = f"{uc['GC_UCN_MAI_2025']}-{uc['ID_UC_G0']}.graphml" 45 | filepath = root / country_folder / uc_filename 46 | if not filepath.is_file(): 47 | G = ox.graph_from_polygon( 48 | polygon=uc["geometry"], 49 | network_type=network_type, 50 | retain_all=retain_all, 51 | simplify=simplify, 52 | truncate_by_edge=truncate_by_edge, 53 | ) 54 | 55 | # don't save graphs if they have fewer than 3 nodes 56 | min_nodes = 3 57 | if len(G) >= min_nodes: 58 | ox.save_graphml(G, filepath=filepath) 59 | print(ox.ts(), f"Saved {filepath}", flush=True) 60 | 61 | except Exception as e: 62 | ox.log(f'"{filepath}" failed: {e}', level=lg.ERROR) 63 | print(e, filepath) 64 | 65 | 66 | ucs = ucs.sample(len(ucs)) # .tail(10) 67 | 68 | # create function arguments for multiprocessing 69 | root = Path(config["models_graphml_path"]) 70 | cols = ["GC_CNT_GAD_2025", "country_iso", "GC_UCN_MAI_2025", "ID_UC_G0", "geometry"] 71 | args = ((uc[cols].to_dict(), root) for _, uc in ucs.iterrows()) 72 | 73 | print(ox.ts(), f"Begin creating {len(ucs):,} graphs using {cpus} CPUs") 74 | start_time = time.time() 75 | with mp.get_context().Pool(cpus) as pool: 76 | pool.starmap_async(get_graph, args).get() 77 | 78 | elapsed = time.time() - start_time 79 | msg = f"Finished creating {len(ucs):,} graphs in {elapsed:,.0f} seconds" 80 | print(ox.ts(), msg) 81 | file_count = len(list(root.glob("*/*"))) 82 | msg = f"There are {file_count:,} GraphML files in {str(root)!r}" 83 | print(ox.ts(), msg) 84 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/01-download-aster_v3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import osmnx as ox 8 | import pandas as pd 9 | import requests 10 | 11 | # username/password for https://www.earthdata.nasa.gov/ 12 | from keys import pwd, usr 13 | 14 | # load configs 15 | with Path("./config.json").open() as f: 16 | config = json.load(f) 17 | 18 | # configurations 19 | cpus = 4 20 | urls_path = config["gdem_aster_urls_path"] 21 | dl_path = Path(config["gdem_aster_path"]) 22 | dl_path.mkdir(parents=True, exist_ok=True) 23 | 24 | 25 | def download(url, usr=usr, pwd=pwd, dl_path=dl_path) -> None: 26 | with requests.Session() as session: 27 | filename = Path(url).name 28 | session.trust_env = False 29 | request = 
session.request("get", url, auth=(usr, pwd)) 30 | response = session.get(request.url, auth=(usr, pwd)) 31 | 32 | if response.ok: 33 | filepath = dl_path / filename 34 | with filepath.open(mode="wb") as f: 35 | f.write(response.content) 36 | else: 37 | print(response.status_code, response.text) 38 | 39 | 40 | # get all the URLs pointing at dem tif files 41 | urls = pd.read_csv(urls_path, header=None).iloc[:, 0].sort_values() 42 | urls = urls[urls.str.endswith("_dem.tif")] 43 | print(ox.ts(), f"There are {len(urls):,} total ASTER URLs") 44 | 45 | # how many files have already been downloaded? 46 | existing = {path.name for path in dl_path.glob("*.tif")} 47 | print(ox.ts(), f"There are {len(existing):,} files already downloaded") 48 | 49 | # how many files are remaining to download? 50 | urls = [url for url in urls if Path(url).name not in existing] 51 | print(ox.ts(), f"Downloading {len(urls):,} URLs with {cpus} CPUs") 52 | 53 | # multiprocess the queue 54 | if len(urls) > 0: 55 | args = ((url,) for url in urls) 56 | with mp.get_context().Pool(cpus) as pool: 57 | pool.starmap_async(download, args).get() 58 | 59 | file_count = len(list(dl_path.glob("*"))) 60 | msg = f"Finished: {file_count:,} files in {str(dl_path)!r}" 61 | print(ox.ts(), msg) 62 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/02-download-srtmgl1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | from zipfile import ZipFile 7 | 8 | import osmnx as ox 9 | import pandas as pd 10 | import requests 11 | 12 | # username/password for https://www.earthdata.nasa.gov/ 13 | from keys import pwd, usr 14 | 15 | # load configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configurations 20 | cpus = 4 21 | urls_path = config["gdem_srtm_urls_path"] 22 | dl_path = Path(config["gdem_srtm_path"]) 23 | dl_path.mkdir(parents=True, exist_ok=True) 24 | 25 | 26 | def download(url, usr=usr, pwd=pwd, dl_path=dl_path) -> None: 27 | with requests.Session() as session: 28 | filename = Path(url).name 29 | session.trust_env = False 30 | request = session.request("get", url, auth=(usr, pwd)) 31 | response = session.get(request.url, auth=(usr, pwd)) 32 | 33 | if response.ok: 34 | filepath = dl_path / filename 35 | with filepath.open(mode="wb") as f: 36 | f.write(response.content) 37 | 38 | with ZipFile(filepath, "r") as z: 39 | z.extractall(dl_path) 40 | filepath.unlink() 41 | 42 | else: 43 | print(response.status_code, response.text) 44 | 45 | 46 | # get all the URLs 47 | urls = pd.read_csv(urls_path, header=None).iloc[:, 0].sort_values() 48 | print(ox.ts(), f"There are {len(urls):,} total SRTM URLs") 49 | 50 | # how many files have already been downloaded? 51 | existing = {fp.name.split(".")[0] for fp in dl_path.glob("*.hgt")} 52 | print(ox.ts(), f"There are {len(existing):,} files already downloaded") 53 | 54 | # how many files are remaining to download? 
55 | tiles = (Path(url).name.split(".")[0] for url in urls) 56 | remaining = [url for url, tile in zip(urls, tiles) if tile not in existing] 57 | print(ox.ts(), f"Downloading {len(remaining):,} URLs with {cpus} CPUs") 58 | 59 | # multiprocess the queue 60 | if len(remaining) > 0: 61 | args = ((url,) for url in remaining) 62 | with mp.get_context().Pool(cpus) as pool: 63 | pool.starmap_async(download, args).get() 64 | 65 | file_count = len(list(dl_path.glob("*"))) 66 | msg = f"Finished: {file_count:,} files in {str(dl_path)!r}" 67 | print(ox.ts(), msg) 68 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/03-build-vrts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import osmnx as ox 7 | 8 | with Path("./config.json").open() as f: 9 | config = json.load(f) 10 | ox.settings.cache_folder = config["osmnx_cache_path"] 11 | aster_path = Path(config["gdem_aster_path"]) 12 | srtm_path = Path(config["gdem_srtm_path"]) 13 | 14 | # get one sample graph, just to build the VRTs for the first time 15 | filepath = sorted(Path(config["models_graphml_path"]).glob("*/*"))[0] 16 | G = ox.io.load_graphml(filepath) 17 | 18 | # build VRT files for the SRTM and ASTER raster files 19 | args = [("srtm", srtm_path, "*.hgt"), ("aster", aster_path, "*.tif")] 20 | for data_source, rasters_path, glob_pattern in args: 21 | rasters = sorted(rasters_path.glob(glob_pattern)) 22 | msg = f"Building VRT for {len(rasters):,} files from {str(rasters_path)!r}" 23 | print(ox.ts(), msg) 24 | G = ox.elevation.add_node_elevations_raster(G, rasters) 25 | for _, data in G.nodes(data=True): 26 | data[f"elevation_{data_source}"] = data.pop("elevation") 27 | 28 | # show descriptive stats for the elevation values in this one city 29 | cols = ["elevation_aster", "elevation_srtm"] 30 | stats = ox.convert.graph_to_gdfs(G, edges=False)[cols].describe() 31 | print(ox.ts(), stats) 32 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/04-add-node-elevations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import networkx as nx 8 | import osmnx as ox 9 | 10 | with Path("./config.json").open() as f: 11 | config = json.load(f) 12 | ox.settings.cache_folder = config["osmnx_cache_path"] 13 | 14 | # configure multiprocessing 15 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 16 | 17 | # get the paths of all the ASTER/SRTM rasters 18 | srtm_files = sorted(Path(config["gdem_srtm_path"]).glob("*.hgt")) 19 | aster_files = sorted(Path(config["gdem_aster_path"]).glob("*.tif")) 20 | attr_rasters = [("elevation_aster", aster_files), ("elevation_srtm", srtm_files)] 21 | 22 | 23 | def process_graph(filepath, attr_rasters=attr_rasters) -> None: 24 | G = ox.io.load_graphml(filepath) 25 | for attr, rasters in attr_rasters: 26 | # if not all graph nodes have this attr, then add elevation from 27 | # raster files, rename elevation -> this attr name, then save graph 28 | if set(G.nodes) != set(nx.get_node_attributes(G, attr)): 29 | try: 30 | G = ox.elevation.add_node_elevations_raster(G, rasters, cpus=1) 31 | for _, data in G.nodes(data=True): 32 | data[attr] = data.pop("elevation") 33 | ox.io.save_graphml(G, filepath) 34 | except 
ValueError as e: 35 | print(e, filepath, attr) 36 | 37 | 38 | # set up the args 39 | filepaths = sorted(Path(config["models_graphml_path"]).glob("*/*")) 40 | args = ((fp,) for fp in filepaths) 41 | 42 | # multiprocess the queue 43 | print(ox.ts(), f"Adding elevation to {len(filepaths):,} graphs with {cpus} CPUs") 44 | with mp.get_context().Pool(cpus) as pool: 45 | pool.starmap_async(process_graph, args).get() 46 | print(ox.ts(), f"Finished adding elevation to {len(filepaths):,} graphs") 47 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/01-cluster-nodes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools 4 | import json 5 | import math 6 | import multiprocessing as mp 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import osmnx as ox 11 | from scipy.spatial import cKDTree 12 | 13 | # google usage limit: 512 locations per request 14 | coords_per_request = 512 15 | 16 | # load configs 17 | with Path("./config.json").open() as f: 18 | config = json.load(f) 19 | 20 | # configure multiprocessing 21 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 22 | 23 | graphml_folder = Path(config["models_graphml_path"]) 24 | save_folder = Path(config["elevation_nodeclusters_path"]) 25 | 26 | 27 | # return graph nodes' x-y coordinates 28 | def get_graph_nodes(fp): 29 | return ox.convert.graph_to_gdfs(ox.io.load_graphml(fp), edges=False, node_geometry=False)[ 30 | ["x", "y"] 31 | ] 32 | 33 | 34 | # get an iterator of points around the perimeter of nodes' coordinates 35 | def get_perimeter_points(nodes): 36 | tl = np.array((nodes["x"].min(), nodes["y"].max())) 37 | t = np.array((nodes["x"].mean(), nodes["y"].max())) 38 | tr = np.array((nodes["x"].max(), nodes["y"].max())) 39 | r = np.array((nodes["x"].max(), nodes["y"].mean())) 40 | br = np.array((nodes["x"].max(), nodes["y"].min())) 41 | b = np.array((nodes["x"].mean(), nodes["y"].min())) 42 | bl = np.array((nodes["x"].min(), nodes["y"].min())) 43 | l = np.array((nodes["x"].min(), nodes["y"].mean())) # noqa: E741 44 | points = [tl, t, tr, r, br, b, bl, l] 45 | multiplier = math.ceil(len(nodes) / coords_per_request / len(points)) 46 | return iter(points * multiplier) 47 | 48 | 49 | # group the nodes into nearest-neighbor clusters 50 | def get_clusters(nodes): 51 | nodes_remaining = nodes 52 | perimeter_points = get_perimeter_points(nodes) 53 | clusters = [] 54 | while len(nodes_remaining) > 0: 55 | if len(nodes_remaining) <= coords_per_request: 56 | labels = nodes_remaining.index 57 | else: 58 | # find node nearest to next perimeter point, then get a cluster of 59 | # its nearest `coords_per_request` neighbors around it 60 | tree = cKDTree(nodes_remaining[["x", "y"]]) 61 | _, start_pos = tree.query(next(perimeter_points), k=1) 62 | start_point = nodes_remaining.iloc[start_pos][["x", "y"]] 63 | _, pos = tree.query(start_point, k=coords_per_request) 64 | labels = nodes_remaining.iloc[pos].index 65 | clusters.append(labels) 66 | nodes_remaining = nodes_remaining.drop(labels) 67 | 68 | # ensure each node has a cluster and each cluster is smaller than max size 69 | assert set(itertools.chain.from_iterable(clusters)) == set(nodes.index) 70 | for cluster in clusters: 71 | assert len(cluster) <= coords_per_request 72 | 73 | return clusters 74 | 75 | 76 | # load graph, cluster nodes, and save to disk 77 | def cluster_nodes(fp) -> None: 78 | nodes = get_graph_nodes(fp) 79 | clusters = 
get_clusters(nodes) 80 | for count, cluster in enumerate(clusters): 81 | nodes.loc[cluster, "cluster"] = f"{fp.stem}_{count}" 82 | 83 | save_path = save_folder / (fp.stem + ".csv") 84 | save_path.parent.mkdir(parents=True, exist_ok=True) 85 | nodes.to_csv(save_path, index=True, encoding="utf-8") 86 | msg = f"Clustered {fp.stem!r} {len(nodes):,} nodes into {len(clusters):,} clusters" 87 | print(ox.ts(), msg, flush=True) 88 | 89 | 90 | filepaths = sorted(graphml_folder.glob("*/*.graphml")) 91 | args = [(fp,) for fp in filepaths if not (save_folder / (fp.stem + ".csv")).is_file()] 92 | print(ox.ts(), f"Clustering nodes from {len(args):,} remaining GraphML files") 93 | 94 | with mp.get_context().Pool(cpus) as pool: 95 | pool.starmap_async(cluster_nodes, args).get() 96 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/02-make-google-urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from itertools import batched 6 | from pathlib import Path 7 | 8 | import osmnx as ox 9 | import pandas as pd 10 | from keys import api_keys 11 | 12 | # google usage limit: 512 locations and 16384 characters per request 13 | precision = 5 14 | coords_per_request = 512 15 | requests_per_key = 39000 16 | chars_per_url = 16384 17 | url_template = ( 18 | "https://maps.googleapis.com/maps/api/elevation/json?locations={locations}&key={{key}}" 19 | ) 20 | 21 | # load configs 22 | with Path("./config.json").open() as f: 23 | config = json.load(f) 24 | 25 | # configure multiprocessing 26 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 27 | 28 | # set up the args 29 | filepaths = sorted(Path(config["elevation_nodeclusters_path"]).glob("*.csv")) 30 | args = ((fp,) for fp in filepaths) 31 | print(ox.ts(), f"Loading node clusters from {len(filepaths):,} files with {cpus} CPUs") 32 | 33 | # extract all nodes and coordinates from all graphs 34 | with mp.get_context().Pool(cpus) as pool: 35 | result = pool.starmap_async(pd.read_csv, args) 36 | df = pd.concat(result.get(), ignore_index=True).set_index("osmid").sort_index() 37 | 38 | df = df[~df.index.duplicated()] 39 | print(ox.ts(), f"There are {len(df):,} unique nodes") 40 | 41 | 42 | def url_add_locations(_, cluster): 43 | assert len(cluster) <= coords_per_request 44 | strings = (f"{y:.{precision}f},{x:.{precision}f}" for y, x in zip(cluster["y"], cluster["x"])) 45 | locations = "|".join(strings) 46 | return tuple(cluster.index), url_template.format(locations=locations) 47 | 48 | 49 | with mp.get_context().Pool(cpus) as pool: 50 | urls = pool.starmap_async(url_add_locations, df.groupby("cluster")).get() 51 | 52 | # then add API keys to URLs, `requests_per_key` at a time 53 | urls_with_keys = [] 54 | keys_nodes_urls = zip(api_keys, batched(urls, requests_per_key), strict=True) 55 | for api_key, nodes_urls in keys_nodes_urls: 56 | for nodes, url in nodes_urls: 57 | url_with_key = url.format(key=api_key) 58 | assert len(url_with_key) <= chars_per_url 59 | urls_with_keys.append((nodes, url_with_key)) 60 | 61 | # ensure no key is used more times than allowed 62 | df_save = pd.DataFrame(urls_with_keys, columns=["nodes", "url"]) 63 | for api_key in api_keys: 64 | count = df_save["url"].str.contains(api_key).sum() 65 | print(ox.ts(), f"Created {count:,} URLs using key {api_key!r}") 66 | assert count <= requests_per_key 67 | 68 | # save to disk 69 | save_path = 
Path(config["elevation_google_urls_path"]) 70 | save_path.parent.mkdir(parents=True, exist_ok=True) 71 | df_save.to_csv(save_path, index=False, encoding="utf-8") 72 | print(ox.ts(), f"Saved {len(df_save):,} URLs to {str(save_path)!r}") 73 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/03-download-google-elevations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import time 6 | from ast import literal_eval 7 | from pathlib import Path 8 | 9 | import osmnx as ox 10 | import pandas as pd 11 | import requests 12 | 13 | # load configs 14 | with Path("./config.json").open() as f: 15 | config = json.load(f) 16 | 17 | # configure multiprocessing 18 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 19 | 20 | ox.settings.use_cache = True 21 | ox.settings.log_console = False 22 | ox.settings.log_file = True 23 | ox.settings.logs_folder = config["osmnx_log_path"] 24 | ox.settings.cache_folder = config["osmnx_cache_path"] 25 | 26 | 27 | def get_elevations(nodes, url, pause=0): 28 | # check if this request is already in the cache 29 | cached_response_json = ox._http._retrieve_from_cache(url) 30 | if cached_response_json is not None: 31 | response_json = cached_response_json 32 | ox.log(f"Got URL from cache: {url}") 33 | 34 | # otherwise, request the elevations from the API 35 | else: 36 | try: 37 | ox.log(f"Requesting node elevations from API: {url}") 38 | time.sleep(pause) 39 | response = requests.get(url) 40 | assert response.ok 41 | response_json = response.json() 42 | ox._http._save_to_cache(url, response_json, response.ok) 43 | except Exception as e: 44 | msg = f"Response: {response.status_code}, {response.reason}, {response.text}, {url}" 45 | print(ox.ts(), msg, e) 46 | return None 47 | 48 | # extract the results and, if any, return as dataframe 49 | results = response_json["results"] 50 | if results is None: 51 | return None 52 | df = pd.DataFrame(results, index=literal_eval(nodes)) 53 | if "elevation" not in df.columns: 54 | cache_filepath = ox._http._resolve_cache_filepath(url) 55 | print(ox.ts(), f"No elevation results in {str(cache_filepath)!r}") 56 | return None 57 | return df[["elevation", "resolution"]].round(2) 58 | 59 | 60 | # load the URLs and count how many we already have responses cached for 61 | urls = pd.read_csv(config["elevation_google_urls_path"]) 62 | count_cached = 0 63 | count_uncached = 0 64 | for url in urls["url"]: 65 | if ox._http._check_cache(url) is None: 66 | count_uncached += 1 67 | else: 68 | count_cached += 1 69 | 70 | msg = f"Getting {count_cached:,} URLs from cache and {count_uncached:,} from API using {cpus} CPUs" 71 | print(ox.ts(), msg) 72 | 73 | # uncomment this if you want to actually hit the API (and pay for it) 74 | assert count_uncached == 0 75 | 76 | # download elevations from Google API in parallel 77 | with mp.get_context().Pool(cpus) as pool: 78 | args = ((nodes_url.nodes, nodes_url.url) for nodes_url in urls.itertuples()) 79 | result = pool.starmap_async(get_elevations, args) 80 | df = pd.concat(result.get(), ignore_index=False).sort_index() 81 | 82 | # save to disk 83 | save_path = Path(config["elevation_google_elevations_path"]) 84 | save_path.parent.mkdir(parents=True, exist_ok=True) 85 | df.index.name = "osmid" 86 | df.to_csv(save_path, index=True, encoding="utf-8") 87 | print(ox.ts(), f"Saved {len(df):,} node elevations to disk at 
{save_path}") 88 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/04-choose-best-elevation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import networkx as nx 8 | import numpy as np 9 | import osmnx as ox 10 | import pandas as pd 11 | 12 | # load configs 13 | with Path("./config.json").open() as f: 14 | config = json.load(f) 15 | 16 | # configure multiprocessing 17 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 18 | 19 | # dict to convert elev attrs to correct dtype 20 | elev_attrs = ("elevation_aster", "elevation_srtm") 21 | node_dtypes = dict.fromkeys(elev_attrs, float) 22 | 23 | # load google elevation data for lookup 24 | fp = config["elevation_google_elevations_path"] 25 | renamer = {"elevation": "elevation_google", "resolution": "elevation_google_resolution"} 26 | df_elev = pd.read_csv(fp).rename(columns=renamer).set_index("osmid").sort_index() 27 | print(f"Loaded {len(df_elev):,} Google node elevations") 28 | 29 | 30 | def set_elevations(fp, df_elev=df_elev, node_dtypes=node_dtypes): 31 | # load the graph and attach google elevation data 32 | G = ox.io.load_graphml(fp, node_dtypes=node_dtypes) 33 | nodes, edges = ox.graph_to_gdfs(G) 34 | nodes = nodes.join(df_elev) 35 | 36 | # calculate differences in ASTER, SRTM, and Google elevation values 37 | nodes["elev_diff_aster_google"] = (nodes["elevation_aster"] - nodes["elevation_google"]).fillna( 38 | np.inf, 39 | ) 40 | nodes["elev_diff_srtm_google"] = (nodes["elevation_srtm"] - nodes["elevation_google"]).fillna( 41 | np.inf, 42 | ) 43 | 44 | # in each row identify if SRTM or ASTER has smaller absolute difference from Google's value 45 | use_srtm = nodes["elev_diff_srtm_google"].abs() <= nodes["elev_diff_aster_google"].abs() 46 | pct = 100 * use_srtm.sum() / len(nodes) 47 | print(f"{pct:0.1f}% of nodes use SRTM, {100 - pct:0.1f}% use ASTER in {fp.stem!r}") 48 | 49 | # assign elevation as the SRTM or ASTER value closer to Google's, as a tie-breaker 50 | nodes["elevation"] = np.nan 51 | nodes.loc[use_srtm, "elevation"] = nodes.loc[use_srtm, "elevation_srtm"] 52 | nodes.loc[~use_srtm, "elevation"] = nodes.loc[~use_srtm, "elevation_aster"] 53 | 54 | # ensure all elevations are non-null 55 | assert pd.notna(nodes["elevation"]).all() 56 | nodes["elevation"] = nodes["elevation"].astype(int) 57 | 58 | # add elevation to graph nodes, calculate edge grades, then save to disk 59 | nx.set_node_attributes(G, nodes["elevation"], "elevation") 60 | G = ox.add_edge_grades(G, add_absolute=True) 61 | ox.io.save_graphml(G, fp) 62 | return nodes 63 | 64 | 65 | # multiprocess the queue 66 | args = [(fp,) for fp in Path(config["models_graphml_path"]).glob("*/*.graphml")] # [-100:] 67 | msg = f"Setting node elevations for {len(args):,} GraphML files using {cpus} CPUs" 68 | print(ox.ts(), msg) 69 | with mp.get_context().Pool(cpus) as pool: 70 | result = pool.starmap_async(set_elevations, args) 71 | results = (r for r in result.get() if r is not None) 72 | 73 | # save all nodes' elevation details to disk for later analysis 74 | df = pd.concat(results, ignore_index=False).sort_index() 75 | cols = [c for c in df.columns if "elev" in c] 76 | df = df[cols] 77 | df = df.replace([np.inf, -np.inf], np.nan) 78 | print(df.describe().round(2)) 79 | df.to_csv(config["elevation_final_path"], index=True, encoding="utf-8") 80 | 
-------------------------------------------------------------------------------- /code/03-calculate-indicators/01-calculate-node-bc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from os.path import getsize 6 | from pathlib import Path 7 | 8 | import igraph as ig 9 | import networkx as nx 10 | import osmnx as ox 11 | 12 | # we will calculate length-weighted betweenness centralities 13 | WEIGHT_ATTR = "length" 14 | 15 | # load configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configure multiprocessing 20 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 21 | 22 | # configure where to find saved graphs and where to save results 23 | graphml_folder = Path(config["models_graphml_path"]) 24 | save_folder = Path(config["node_bc_path"]) 25 | save_folder.mkdir(parents=True, exist_ok=True) 26 | 27 | 28 | def convert_igraph(G_nx, weight_attr): 29 | # relabel graph nodes as integers for igraph to ingest 30 | G_nx = nx.relabel.convert_node_labels_to_integers(G_nx) 31 | 32 | # create igraph graph and add nodes/edges 33 | G_ig = ig.Graph(directed=True) 34 | G_ig.add_vertices(G_nx.nodes) 35 | G_ig.add_edges(G_nx.edges(keys=False)) 36 | 37 | # add edge weights and ensure values >0 for igraph 38 | weights = nx.get_edge_attributes(G_nx, weight_attr).values() 39 | weights = (0.001 if w == 0 else w for w in weights) 40 | G_ig.es[weight_attr] = list(weights) 41 | return G_ig 42 | 43 | 44 | def calculate_bc(fp, save_path, weight_attr=WEIGHT_ATTR) -> None: 45 | print(ox.ts(), f"{str(fp)!r}") 46 | 47 | # load graphml, convert to igraph, calculate bc, and normalize values 48 | G_nx = ox.io.load_graphml(fp) 49 | bc_raw = convert_igraph(G_nx, weight_attr).betweenness(weights=weight_attr) 50 | bc_norm = (x / (len(G_nx) - 1) / (len(G_nx) - 2) for x in bc_raw) 51 | osmid_bc = dict(zip(G_nx.nodes, bc_norm, strict=True)) 52 | 53 | # set graph node attributes and re-save graphml file 54 | nx.set_node_attributes(G_nx, osmid_bc, name="bc") 55 | ox.io.save_graphml(G_nx, fp) 56 | 57 | # also save results to disk as JSON 58 | with save_path.open("w") as f: 59 | json.dump(osmid_bc, f) 60 | 61 | 62 | # get graph filepaths for which we have not yet calculated BC, sorted by size 63 | filepaths = sorted(graphml_folder.glob("*/*.graphml"), key=getsize) 64 | savepaths = (save_folder / f"{fp.parent.stem}-{fp.stem}.json" for fp in filepaths) 65 | args = [(fp, sp) for fp, sp in zip(filepaths, savepaths) if not sp.is_file()] 66 | print(ox.ts(), f"There are {len(filepaths):,} total GraphML files") 67 | print(ox.ts(), f"Calculating BC for {len(args):,} remaining graphs") 68 | 69 | # multiprocess the queue 70 | with mp.get_context().Pool(cpus) as pool: 71 | pool.starmap_async(calculate_bc, args).get() 72 | 73 | count_done = len(list(save_folder.glob("*.json"))) 74 | print(ox.ts(), f"Calculated BC for {count_done:,} graphs") 75 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/02-calculate-indicators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import random 6 | from os.path import getsize 7 | from pathlib import Path 8 | from statistics import mean, median 9 | 10 | import networkx as nx 11 | import numpy as np 12 | import osmnx as ox 13 | import pandas as pd 14 | 15 | # load 
configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configure multiprocessing 20 | cpus = mp.cpu_count() if config["cpus_stats"] == 0 else config["cpus_stats"] 21 | 22 | graphml_folder = Path(config["models_graphml_path"]) # where to load graphml files 23 | save_path = Path(config["indicators_street_path"]) # where to save indicator output 24 | 25 | 26 | def intersection_counts(Gup): 27 | TOL = 10 # meters for intersection cleaning tolerance 28 | icc = len(ox.consolidate_intersections(Gup, tolerance=TOL, rebuild_graph=False)) 29 | ict = len(ox.consolidate_intersections(Gup, tolerance=TOL, reconnect_edges=False)) 30 | return { 31 | "intersect_count": ox.stats.intersection_count(Gup), 32 | "intersect_count_clean": icc, 33 | "intersect_count_clean_topo": ict, 34 | } 35 | 36 | 37 | def calculate_clustering(G): 38 | results = {} 39 | 40 | # get directed graph without parallel edges 41 | G = ox.convert.to_digraph(G, weight="length") 42 | 43 | # avg clust coeff for directed graph ignoring parallel edges 44 | results["cc_avg_dir"] = nx.average_clustering(G) 45 | 46 | # avg clust coeff (weighted) for directed graph ignoring parallel edges 47 | results["cc_wt_avg_dir"] = nx.average_clustering(G, weight="length") 48 | 49 | # max pagerank (weighted) in directed graph ignoring parallel edges 50 | results["pagerank_max"] = max(nx.pagerank(G, weight="length").values()) 51 | 52 | # get undirected graph without parallel edges 53 | G = nx.Graph(G) 54 | 55 | # avg clust coeff for undirected graph ignoring parallel edges 56 | results["cc_avg_undir"] = nx.average_clustering(G) 57 | 58 | # avg clust coeff (weighted) for undirected graph ignoring parallel edges 59 | results["cc_wt_avg_undir"] = nx.average_clustering(G, weight="length") 60 | return results 61 | 62 | 63 | def calculate_elevation_grades(Gu): 64 | # calculate elevation & grade stats 65 | grades = pd.Series(nx.get_edge_attributes(Gu, "grade_abs").values()) 66 | elevs = pd.Series(nx.get_node_attributes(Gu, "elevation").values()) 67 | elev_iqr = elevs.quantile(0.75) - elevs.quantile(0.25) 68 | elev_range = elevs.max() - elevs.min() 69 | return { 70 | "elev_iqr": elev_iqr, 71 | "elev_mean": elevs.mean(), 72 | "elev_median": elevs.median(), 73 | "elev_range": elev_range, 74 | "elev_std": elevs.std(), 75 | "grade_mean": grades.mean(), 76 | "grade_median": grades.median(), 77 | } 78 | 79 | 80 | def gini(x): 81 | sorted_x = np.sort(x) 82 | n = len(x) 83 | cumx = np.cumsum(sorted_x, dtype=float) 84 | return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n 85 | 86 | 87 | def save_results(results, save_path) -> None: 88 | save_path.parent.mkdir(parents=True, exist_ok=True) 89 | df = pd.DataFrame(results) 90 | if save_path.is_file(): 91 | df = pd.concat([pd.read_csv(save_path), df]) 92 | df.to_csv(save_path, index=False, encoding="utf-8") 93 | print(ox.ts(), f"Saved {len(results):,} new results to disk at {str(save_path)!r}") 94 | 95 | 96 | def calculate_graph_stats(graphml_path): 97 | print(ox.ts(), f"Processing {str(graphml_path)!r}") 98 | G = ox.io.load_graphml(graphml_path, node_dtypes={"bc": float}) 99 | 100 | # get filepath and country/city identifiers 101 | country, country_iso = graphml_path.parent.stem.split("-") 102 | core_city, uc_id = graphml_path.stem.split("-") 103 | uc_id = int(uc_id) 104 | 105 | # clustering and pagerank: needs directed representation 106 | clustering_stats = calculate_clustering(G) 107 | 108 | # get an undirected representation of this network for everything else 109 | Gu = 
ox.convert.to_undirected(G) 110 | G.clear() 111 | G = None 112 | 113 | # street lengths 114 | lengths = nx.get_edge_attributes(Gu, "length").values() 115 | length_total = sum(lengths) 116 | length_mean = mean(lengths) 117 | length_median = median(lengths) 118 | 119 | # nodes, edges, node degree, self loops 120 | n = len(Gu.nodes) 121 | m = len(Gu.edges) 122 | k_avg = 2 * m / n 123 | self_loop_proportion = ox.stats.self_loop_proportion(Gu) 124 | 125 | # proportion of 4-way intersections, 3-ways, and dead-ends 126 | spn = ox.stats.streets_per_node_proportions(Gu) 127 | prop_4way = spn.get(4, 0) 128 | prop_3way = spn.get(3, 0) 129 | prop_deadend = spn.get(1, 0) 130 | 131 | # betweenness centrality stats 132 | bc = list(nx.get_node_attributes(Gu, "bc").values()) 133 | bc_gini = gini(bc) 134 | bc_max = max(bc) 135 | 136 | # average circuity and straightness 137 | circuity = ox.stats.circuity_avg(Gu) 138 | straightness = 1 / circuity 139 | 140 | # elevation and grade 141 | elevation_grades = calculate_elevation_grades(Gu) 142 | 143 | # orientation entropy 144 | orientation_entropy = ox.bearing.orientation_entropy(ox.bearing.add_edge_bearings(Gu)) 145 | 146 | # total and clean intersection counts 147 | intersection_stats = intersection_counts(ox.projection.project_graph(Gu)) 148 | 149 | # assemble the results 150 | results = { 151 | "country": country, 152 | "country_iso": country_iso, 153 | "core_city": core_city, 154 | "uc_id": uc_id, 155 | "circuity": circuity, 156 | "k_avg": k_avg, 157 | "length_mean": length_mean, 158 | "length_median": length_median, 159 | "length_total": length_total, 160 | "street_segment_count": m, 161 | "node_count": n, 162 | "orientation_entropy": orientation_entropy, 163 | "prop_4way": prop_4way, 164 | "prop_3way": prop_3way, 165 | "prop_deadend": prop_deadend, 166 | "self_loop_proportion": self_loop_proportion, 167 | "straightness": straightness, 168 | "bc_gini": bc_gini, 169 | "bc_max": bc_max, 170 | } 171 | results.update(clustering_stats) 172 | results.update(elevation_grades) 173 | results.update(intersection_stats) 174 | return results 175 | 176 | 177 | # get all the filepaths that don't already have results in the save file 178 | done = set(pd.read_csv(save_path)["uc_id"]) if save_path.is_file() else set() 179 | filepaths = sorted(graphml_folder.glob("*/*"), key=getsize) 180 | args = [(fp,) for fp in filepaths if int(fp.stem.split("-")[1]) not in done] 181 | 182 | # randomly order params so one thread doesn't have to do all the big graphs 183 | random.shuffle(args) 184 | msg = f"Calculating stats for {len(args):,} graphs using {cpus} CPUs" 185 | print(ox.ts(), msg) 186 | 187 | # multiprocess the queue 188 | with mp.get_context().Pool(cpus) as pool: 189 | results = pool.starmap_async(calculate_graph_stats, args).get() 190 | 191 | # final save to disk 192 | save_results(results, save_path) 193 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/03-merge-indicators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import geopandas as gpd 7 | import osmnx as ox 8 | import pandas as pd 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | uc_gpkg_path = config["uc_gpkg_path"] # prepped urban centers dataset 15 | ind_street_path = config["indicators_street_path"] # street network indicators to load 16 | ind_path = config["indicators_path"] 
# merged indicators to save for repo upload 17 | ind_all_path = config["indicators_all_path"] # all merged indicators to save for analysis 18 | 19 | # load the UCs dataset 20 | ucs = gpd.read_file(uc_gpkg_path).sort_index().drop(columns=["country_iso"]) 21 | print(ox.ts(), f"Loaded urban centers dataset with shape={ucs.shape}") 22 | 23 | # load the previously calculated street network indicators dataset 24 | ind = pd.read_csv(ind_street_path) 25 | print(ox.ts(), f"Loaded indicators dataset with shape={ind.shape}") 26 | 27 | # rename UC fields to something intelligible 28 | mapper = { 29 | "GC_UCN_LIS_2025": "uc_names", 30 | "GC_DEV_USR_2025": "world_region", 31 | "GC_POP_TOT_2025": "resident_pop", 32 | "GC_UCA_KM2_2025": "area_km2", 33 | "GH_BUS_TOT_2025": "built_up_area_m2", 34 | "GH_BPC_TOT_2025": "built_up_area_percap", 35 | "GH_BUH_AVG_2020": "built_height_m", 36 | "SC_SEC_GDP_2020": "gdp_ppp", 37 | "GC_DEV_WIG_2025": "world_bank_income_group", 38 | "SC_SEC_HDI_2020": "hdi", 39 | "EM_CO2_TRA_2022": "transport_co2_em", 40 | "EM_PM2_TRA_2022": "transport_pm25_em", 41 | "EM_PM2_CON_2020": "pm25_concentration", 42 | "CL_KOP_CUR_2025": "koppen_geiger", 43 | "GE_ELV_AVG_2025": "avg_elevation", 44 | "CL_B12_CUR_2010": "avg_precipitation", 45 | "CL_B01_CUR_2010": "avg_temperature", 46 | "SD_POP_HGR_2025": "pop_greenness", 47 | "SD_LUE_LPR_2000_2020": "land_use_efficiency", 48 | } 49 | 50 | # merge UC data with street network indicators, only keep columns from the 51 | # indicators data set or named in the mapper, then save to disk 52 | df = ind.merge(right=ucs, how="inner", left_on="uc_id", right_on="ID_UC_G0") 53 | df = df.rename(columns=mapper) 54 | df = df[[c for c in df.columns if c in ind.columns or c in mapper.values()]] 55 | df.to_csv(ind_all_path, index=False, encoding="utf-8") 56 | msg = f"Saved all indicators to disk at {str(ind_all_path)!r}, shape={df.shape}" 57 | print(ox.ts(), msg) 58 | 59 | # drop columns that should not go in our repo then save 60 | drop = [ 61 | "built_up_area_percap", 62 | "built_height_m", 63 | "gdp_ppp", 64 | "world_bank_income_group", 65 | "hdi", 66 | "transport_co2_em", 67 | "transport_pm25_em", 68 | "pm25_concentration", 69 | "koppen_geiger", 70 | "avg_elevation", 71 | "avg_precipitation", 72 | "avg_temperature", 73 | "pop_greenness", 74 | "land_use_efficiency", 75 | ] 76 | df = df.drop(columns=drop) 77 | df.to_csv(ind_path, index=False, encoding="utf-8") 78 | msg = f"Saved repo indicators to disk at {str(ind_path)!r}, shape={df.shape}" 79 | print(ox.ts(), msg) 80 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/04-create-metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import osmnx as ox 7 | import pandas as pd 8 | 9 | # load configs 10 | with Path("./config.json").open() as f: 11 | config = json.load(f) 12 | 13 | ind_path = config["indicators_path"] # indicators data (repo subset) 14 | ind_all_path = config["indicators_all_path"] # all indicators data 15 | ind_meta_path = config["indicators_metadata_path"] # indicators metadata (repo subset) 16 | ind_all_meta_path = config["indicators_all_metadata_path"] # indicators metadata (all) 17 | nodes_meta_path = config["models_metadata_nodes_path"] # graph nodes metadata 18 | edges_meta_path = config["models_metadata_edges_path"] # graph edges metadata 19 | 20 | # create graph nodes metadata 21 | desc = {} 22 | 
desc["osmid"] = {"description": "Unique OSM node ID", "type": "int"} 23 | desc["x"] = {"description": "Longitude coordinate (EPSG:4326)", "type": "float"} 24 | desc["y"] = {"description": "Latitude coordinate (EPSG:4326)", "type": "float"} 25 | desc["elevation"] = { 26 | "description": "Node elevation (meters above sea level) from ASTER or SRTM", 27 | "type": "int", 28 | } 29 | desc["elevation_aster"] = { 30 | "description": "Node elevation (meters above sea level) from ASTER", 31 | "type": "int", 32 | } 33 | desc["elevation_srtm"] = { 34 | "description": "Node elevation (meters above sea level) from SRTM", 35 | "type": "int", 36 | } 37 | desc["street_count"] = { 38 | "description": "Number of physical street segments connected to this node", 39 | "type": "int", 40 | } 41 | desc["bc"] = { 42 | "description": "Normalized distance-weighted node betweenness centrality", 43 | "type": "float", 44 | } 45 | desc["other attributes"] = {"description": "As defined in OSM documentation", "type": ""} 46 | 47 | # save nodes metadata to disk 48 | nodes_meta = pd.DataFrame(desc).T.reset_index().rename(columns={"index": "indicator"}) 49 | nodes_meta.to_csv(nodes_meta_path, index=False, encoding="utf-8") 50 | print(ox.ts(), f"Saved graph nodes metadata to {str(nodes_meta_path)!r}") 51 | 52 | # create graph edges metadata 53 | desc = {} 54 | desc["u"] = {"description": "Unique OSM ID of source node", "type": "int"} 55 | desc["v"] = {"description": "Unique OSM ID of destination node", "type": "int"} 56 | desc["key"] = {"description": "Unique ID if parallel edges exist between u and v", "type": "int"} 57 | desc["osmid"] = {"description": "Unique OSM way ID", "type": "int"} 58 | desc["geometry"] = {"description": "Edge centerline geometry (EPSG:4326)", "type": "linestring"} 59 | desc["length"] = {"description": "Length along the edge (meters)", "type": "float"} 60 | desc["grade"] = {"description": "Edge grade (rise over run)", "type": "float"} 61 | desc["grade_abs"] = {"description": "Absolute value of edge grade", "type": "float"} 62 | desc["oneway"] = {"description": "Whether edge part of a one-way street", "type": "boolean"} 63 | desc["reversed"] = { 64 | "description": "Whether edge runs opposite direction of OSM way", 65 | "type": "boolean", 66 | } 67 | desc["other attributes"] = {"description": "As defined in OSM documentation", "type": ""} 68 | 69 | # save edges metadata to disk 70 | edges_meta = pd.DataFrame(desc).T.reset_index().rename(columns={"index": "indicator"}) 71 | edges_meta.to_csv(edges_meta_path, index=False, encoding="utf-8") 72 | print(ox.ts(), f"Saved graph edges metadata to {str(edges_meta_path)!r}") 73 | 74 | # create indicators metadata 75 | desc = {} 76 | desc["area_km2"] = "Area within urban center boundary polygon, km2 (GHS)" 77 | desc["avg_elevation"] = "Average elevation, meters above sea level (GHS)" 78 | desc["avg_precipitation"] = "Annual average precipitation, millimeters (GHS)" 79 | desc["avg_temperature"] = "Average temperature, celsius (GHS)" 80 | desc["bc_gini"] = "Gini coefficient of normalized distance-weighted node betweenness centralities" 81 | desc["bc_max"] = "Max normalized distance-weighted node betweenness centralities" 82 | desc["built_height_m"] = "Average height of built surfaces, meters (GHS)" 83 | desc["built_up_area_m2"] = "Built-up surface area, square meters (GHS)" 84 | desc["built_up_area_percap"] = "Built-up surface area per-capita, square meters per person (GHS)" 85 | desc["cc_avg_dir"] = "Average clustering coefficient (unweighted/directed)" 86 | 
desc["cc_avg_undir"] = "Average clustering coefficient (unweighted/undirected)" 87 | desc["cc_wt_avg_dir"] = "Average clustering coefficient (weighted/directed)" 88 | desc["cc_wt_avg_undir"] = "Average clustering coefficient (weighted/undirected)" 89 | desc["circuity"] = "Ratio of street lengths to straightline distances" 90 | desc["core_city"] = "Urban center core city name" 91 | desc["country"] = "Primary country name" 92 | desc["country_iso"] = "Primary country ISO 3166-1 alpha-3 code" 93 | desc["elev_iqr"] = "Interquartile range of node elevations, meters" 94 | desc["elev_mean"] = "Mean node elevation, meters" 95 | desc["elev_median"] = "Median node elevation, meters" 96 | desc["elev_range"] = "Range of node elevations, meters" 97 | desc["elev_std"] = "Standard deviation of node elevations, meters" 98 | desc["gdp_ppp"] = "Total GDP PPP, USD (GHS)" 99 | desc["grade_mean"] = "Mean absolute street grade (incline)" 100 | desc["grade_median"] = "Median absolute street grade (incline)" 101 | desc["hdi"] = "Human development index at subnational level (GHS)" 102 | desc["intersect_count"] = "Count of (undirected) edge intersections" 103 | desc["intersect_count_clean"] = ( 104 | "Count of street intersections (merged within 10 meters geometrically)" 105 | ) 106 | desc["intersect_count_clean_topo"] = ( 107 | "Count of street intersections (merged within 10 meters topologically)" 108 | ) 109 | desc["k_avg"] = "Average node degree (undirected)" 110 | desc["koppen_geiger"] = "Köppen-Geiger classification of majority of surface (GHS)" 111 | desc["land_use_efficiency"] = "Land use efficiency 1990-2015 (GHS)" 112 | desc["length_mean"] = "Mean street segment length (undirected edges), meters" 113 | desc["length_median"] = "Median street segment length (undirected edges), meters" 114 | desc["length_total"] = "Total street length (undirected edges), meters" 115 | desc["node_count"] = "Count of nodes" 116 | desc["orientation_entropy"] = "Entropy of street network bearings" 117 | desc["pagerank_max"] = "The maximum PageRank value of any node" 118 | desc["pm25_concentration"] = ( 119 | "Population-weighted average PM2.5 concentrations, micrograms/meter^3 (GHS)" 120 | ) 121 | desc["pop_greenness"] = "Land consumption rate / population growth rate (GHS)" 122 | desc["prop_4way"] = "Proportion of nodes that represent 4-way street intersections" 123 | desc["prop_3way"] = "Proportion of nodes that represent 3-way street intersections" 124 | desc["prop_deadend"] = "Proportion of nodes that represent dead-ends" 125 | desc["resident_pop"] = "Total resident population (GHS)" 126 | desc["self_loop_proportion"] = "Proportion of edges that are self-loops" 127 | desc["straightness"] = "1 / circuity" 128 | desc["street_segment_count"] = "Count of streets (undirected edges)" 129 | desc["transport_co2_em"] = "Total CO2 emissions from transport sector, tons/year (GHS)" 130 | desc["transport_pm25_em"] = "Total PM2.5 emissions from transport sector, tons/year (GHS)" 131 | desc["uc_id"] = "Urban center unique ID (GHS)" 132 | desc["uc_names"] = "List of city names within this urban center (GHS)" 133 | desc["world_bank_income_group"] = "World Bank income group" 134 | desc["world_region"] = "UN SDG geographic region" 135 | 136 | # turn the metadata descriptions into a dataframe 137 | meta = pd.DataFrame(desc, index=["description"]).T 138 | 139 | # make sure we have metadata for all indicators 140 | ind_all = pd.read_csv(ind_all_path) 141 | assert len(ind_all.columns) == len(meta) 142 | 143 | # reindex df so cols are in same order 
as metadata 144 | ind_all = ind_all.reindex(columns=meta.index).dropna() 145 | 146 | # add data type of each field 147 | dtypes = ind_all.dtypes.astype(str).replace({"object": "string"}).str.replace("64", "") 148 | dtypes.name = "type" 149 | meta = meta.merge(right=dtypes, left_index=True, right_index=True).reindex( 150 | columns=["type", "description"], 151 | ) 152 | 153 | # make sure all the indicators are present in the metadata 154 | assert (meta.index == ind_all.columns).all() 155 | 156 | # save all metadata to disk 157 | meta_all = meta.reset_index().rename(columns={"index": "indicator"}) 158 | meta_all.to_csv(ind_all_meta_path, index=False, encoding="utf-8") 159 | print(ox.ts(), f"Saved all indicator metadata to {str(ind_all_meta_path)!r}") 160 | 161 | # drop fields that should not go in our repo then save 162 | repo_cols = set(pd.read_csv(ind_path).columns) 163 | keep = [k for k in desc if k in repo_cols] 164 | meta = meta.loc[keep].reset_index().rename(columns={"index": "indicator"}) 165 | meta.to_csv(ind_meta_path, index=False, encoding="utf-8") 166 | print(ox.ts(), f"Saved repo indicator metadata to {str(ind_meta_path)!r}") 167 | -------------------------------------------------------------------------------- /code/04-upload-repository/01-save-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import osmnx as ox 8 | import pandas as pd 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | # configure multiprocessing 15 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 16 | 17 | # set up save/load folder locations 18 | graphml_folder = Path(config["models_graphml_path"]) # where to load GraphML 19 | gpkg_folder = Path(config["models_gpkg_path"]) # where to save GeoPackages 20 | nelist_folder = Path(config["models_nelist_path"]) # where to save node/edge lists 21 | 22 | 23 | # function to convert node elevation string -> float -> int 24 | def to_int(value): 25 | try: 26 | return int(float(value)) 27 | except ValueError: 28 | return float(value) 29 | 30 | 31 | node_dtypes = {"bc": float, "elevation_aster": to_int, "elevation_srtm": to_int} 32 | 33 | 34 | def save_graph(graphml_path, gpkg_path, nodes_path, edges_path, node_dtypes=node_dtypes) -> None: 35 | print(ox.ts(), f"Saving {str(graphml_path)!r}", flush=True) 36 | 37 | # load GraphML file and save as GeoPackage to disk 38 | G = ox.io.load_graphml(graphml_path, node_dtypes=node_dtypes) 39 | ox.io.save_graph_geopackage(G, gpkg_path) 40 | 41 | # get graph node/edge GeoDataFrames for node/edge lists 42 | nodes, edges = ox.convert.graph_to_gdfs(G, node_geometry=False, fill_edge_geometry=False) 43 | 44 | # nodes: round floats and organize columns 45 | node_cols = [ 46 | "osmid", 47 | "x", 48 | "y", 49 | "elevation", 50 | "elevation_aster", 51 | "elevation_srtm", 52 | "bc", 53 | "ref", 54 | "highway", 55 | ] 56 | nodes = nodes.reset_index().reindex(columns=node_cols) 57 | 58 | # edges: round floats and organize columns 59 | round_cols = ["grade", "grade_abs", "length"] 60 | edges[round_cols] = edges[round_cols].round(3) 61 | edge_cols = [ 62 | "u", 63 | "v", 64 | "key", 65 | "oneway", 66 | "highway", 67 | "name", 68 | "length", 69 | "grade", 70 | "grade_abs", 71 | "reversed", 72 | "lanes", 73 | "width", 74 | "est_width", 75 | "maxspeed", 76 | "access", 77 | "service", 78 | "bridge", 79 | "tunnel", 80 | "area", 81 
| "junction", 82 | "osmid", 83 | "ref", 84 | ] 85 | edges = edges.drop(columns=["geometry"]).reset_index().reindex(columns=edge_cols) 86 | 87 | # save graph node/edge lists as CSV files to disk 88 | nodes_path.parent.mkdir(parents=True, exist_ok=True) 89 | nodes.to_csv(nodes_path, index=False, encoding="utf-8") 90 | edges.to_csv(edges_path, index=False, encoding="utf-8") 91 | 92 | 93 | def make_args(): 94 | filepaths = sorted(graphml_folder.glob("*/*")) 95 | print(ox.ts(), f"There are {len(filepaths):,} total GraphML files") 96 | 97 | args = [] 98 | for fp in filepaths: 99 | gpkg_path = gpkg_folder / fp.parent.stem / fp.name.replace("graphml", "gpkg") 100 | nelist_output_folder = nelist_folder / fp.parent.stem / fp.stem 101 | nodes_path = nelist_output_folder / "node_list.csv" 102 | edges_path = nelist_output_folder / "edge_list.csv" 103 | if not (gpkg_path.is_file() and nodes_path.is_file() and edges_path.is_file()): 104 | args.append((fp, gpkg_path, nodes_path, edges_path)) 105 | 106 | print(ox.ts(), f"Saving GeoPackage and node/edge lists for {len(args):,} remaining graphs") 107 | return args 108 | 109 | 110 | # multiprocess the queue 111 | with mp.get_context().Pool(cpus) as pool: 112 | pool.starmap_async(save_graph, make_args()).get() 113 | 114 | # final file count checks 115 | # verify same number of country folders across all file types 116 | graphml_countries = list(graphml_folder.glob("*")) 117 | gpkg_countries = list(gpkg_folder.glob("*")) 118 | nelist_countries = list(nelist_folder.glob("*")) 119 | assert len(graphml_countries) == len(gpkg_countries) == len(nelist_countries) 120 | 121 | # verify same number of model files across all file types 122 | graphml_paths = list(graphml_folder.glob("*/*.graphml")) 123 | gpkg_paths = list(gpkg_folder.glob("*/*.gpkg")) 124 | nlist_paths = list(nelist_folder.glob("*/*/node_list.csv")) 125 | elist_paths = list(nelist_folder.glob("*/*/edge_list.csv")) 126 | assert len(graphml_paths) == len(gpkg_paths) == len(nlist_paths) == len(elist_paths) 127 | 128 | # verify same countries/cities across all file types 129 | graphml_names = {fp.parent.stem + "/" + fp.stem for fp in graphml_paths} 130 | gpkg_names = {fp.parent.stem + "/" + fp.stem for fp in gpkg_paths} 131 | nelist_names = {fp.parent.stem + "/" + fp.stem for fp in nelist_folder.glob("*/*")} 132 | assert graphml_names == gpkg_names == nelist_names 133 | 134 | # verify an indicator row exists for every GraphML file 135 | df = pd.read_csv(config["indicators_path"]) 136 | ucids1 = set(df["uc_id"].astype(str).values) 137 | ucids2 = {fp.stem.split("-")[1] for fp in graphml_paths} 138 | assert ucids1 == ucids2 139 | 140 | print(ox.ts(), "Successfully passed all file checks") 141 | -------------------------------------------------------------------------------- /code/04-upload-repository/02-stage-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import osmnx as ox 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | compression_args = {"compression": zipfile.ZIP_BZIP2, "compresslevel": 9} 15 | 16 | # map input folders to output folders containing zipped country files 17 | manifest = [ 18 | {"input": Path(config["models_gpkg_path"]), "output": Path(config["staging_gpkg_path"])}, 19 | {"input": Path(config["models_graphml_path"]), "output": 
Path(config["staging_graphml_path"])}, 20 | {"input": Path(config["models_nelist_path"]), "output": Path(config["staging_nelist_path"])}, 21 | ] 22 | 23 | # configure CPUs 24 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 25 | 26 | 27 | # zip a folder and its contents 28 | def zip_folder(input_folder, output_fp, compression_args=compression_args) -> None: 29 | print(ox.ts(), f"Staging {str(output_fp)!r}", flush=True) 30 | pattern = "*/*" if "nelist" in str(input_folder) else "*" 31 | with zipfile.ZipFile(output_fp, mode="w", **compression_args) as zf: 32 | for input_fp in input_folder.glob(pattern): 33 | zf.write(input_fp, arcname=Path(input_fp.parent.stem) / input_fp.name) 34 | 35 | 36 | # assemble input folders to zip + their destination zip file paths 37 | args = [] 38 | for item in manifest: 39 | output_folder = item["output"] 40 | output_folder.mkdir(parents=True, exist_ok=True) 41 | for input_folder in item["input"].glob("*"): 42 | output_fp = output_folder / (input_folder.stem + ".zip") 43 | if not output_fp.is_file(): 44 | args.append((input_folder, output_fp)) 45 | 46 | # multiprocess the queue 47 | print(ox.ts(), f"Compressing and staging {len(args)} input files using {cpus} CPUs") 48 | with mp.get_context().Pool(cpus) as pool: 49 | pool.starmap_async(zip_folder, args).get() 50 | 51 | print(ox.ts(), f"Finished compressing and staging {len(args)} input files") 52 | -------------------------------------------------------------------------------- /code/04-upload-repository/03-upload-dataverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import time 5 | import zipfile 6 | from hashlib import md5 7 | from pathlib import Path 8 | from urllib.parse import urljoin 9 | 10 | import osmnx as ox 11 | import requests 12 | from keys import dataverse_api_key as api_key 13 | 14 | # only set true on first run to erase everything from the draft 15 | delete_existing = False 16 | 17 | # lets you skip uploading files if this is supposed to be a dry run 18 | debug_mode = False 19 | 20 | # load configs 21 | with Path("./config.json").open() as f: 22 | config = json.load(f) 23 | 24 | # configure the dataverse upload 25 | attempts_max = 3 # how many times to retry same file upload after error before giving up 26 | pause_error = 10 # seconds to pause after an error 27 | pause_normal = 0 # seconds to pause between uploads 28 | upload_timeout = 1200 # how long to set the timeout for upload via http post 29 | 30 | # base URL for working with datasets via dataverse native API 31 | base_url = "https://dataverse.harvard.edu/api/v1/datasets/:persistentId/" 32 | 33 | # base URL for working with files via dataverse native API 34 | file_url = "https://dataverse.harvard.edu/api/files/{file_id}" 35 | 36 | # define what to upload 37 | manifests = [ 38 | { 39 | "doi": config["doi_gpkg"], 40 | "folder": config["staging_gpkg_path"], 41 | "file_desc": "Zip contains GeoPackages of all urban street networks in {}.", 42 | "file_tags": ["GeoPackage", "Street Network", "Models"], 43 | }, 44 | { 45 | "doi": config["doi_graphml"], 46 | "folder": config["staging_graphml_path"], 47 | "file_desc": "Zip contains GraphML files of all urban street networks in {}.", 48 | "file_tags": ["GraphML", "Street Network", "Models"], 49 | }, 50 | { 51 | "doi": config["doi_nelist"], 52 | "folder": config["staging_nelist_path"], 53 | "file_desc": "Zip contains node/edge list CSV files of all urban street networks in {}.", 54 | "file_tags": 
["Node/Edge List", "Street Network", "Models"], 55 | }, 56 | ] 57 | 58 | 59 | # get all the files that currently exist in the draft or published dataset 60 | def get_server_files(doi, version): 61 | endpoint = f"versions/:{version}/files?key={api_key}&persistentId={doi}" 62 | rj = requests.get(urljoin(base_url, endpoint)).json() 63 | try: 64 | return {file["dataFile"]["filename"]: file["dataFile"]["id"] for file in rj["data"]} 65 | except KeyError: 66 | return {} 67 | 68 | 69 | # find pre-existing draft/published files already uploaded to dataset 70 | def get_preexisting_files(manifests): 71 | draft_files = {} # what files have already been uploaded to the draft? 72 | published_files = {} # what files exist in the published dataset? 73 | for manifest in manifests: 74 | doi = manifest["doi"] 75 | draft_files[doi] = get_server_files(doi, version="draft") 76 | published_files[doi] = get_server_files(doi, version="latest-published") 77 | msg = ( 78 | f"Files in {doi}: {len(published_files[doi])} published, {len(draft_files[doi])} draft." 79 | ) 80 | print(ox.ts(), msg) 81 | return draft_files, published_files 82 | 83 | 84 | # delete all the existing (carried-over) files in the draft datasets 85 | def delete_draft_files(already_uploaded) -> None: 86 | file_ids = [f for d in already_uploaded.values() for f in d.values()] 87 | print(ox.ts(), f"Deleting {len(file_ids)} draft files...") 88 | headers = {"X-Dataverse-key": api_key} 89 | for file_id in file_ids: 90 | url = file_url.format(file_id=file_id) 91 | response = requests.delete(url, headers=headers) 92 | if not response.ok: 93 | print(ox.ts(), f"Failed to delete {url!r}") 94 | 95 | 96 | # zip a staged zipped file, open it, and return the buffer. this will 97 | # double-zip the zip files because dataverse unzips zip files when they are 98 | # uploaded. 
the result is that dataverse will host the original zipped file 99 | def get_file_to_upload(fp, target_filename): 100 | checksum = md5(fp.open("rb").read()).hexdigest() 101 | upload_fp = Path(config["staging_folder"]) / "upload_temp.zip" 102 | with zipfile.ZipFile(file=upload_fp, mode="w") as zf: 103 | zf.write(fp, arcname=target_filename) 104 | file = {"file": upload_fp.open("rb")} 105 | return file, checksum 106 | 107 | 108 | # configure the file description and tags that appear on dataverse 109 | def get_payload_to_upload(fp, manifest): 110 | country_name = fp.stem[:-4].replace("_", " ").title() 111 | description = manifest["file_desc"].format(country_name) 112 | categories = manifest["file_tags"] + [country_name] 113 | params = {"description": description, "categories": categories} 114 | return {"jsonData": json.dumps(params)} 115 | 116 | 117 | # upload a new file to a dataverse dataset 118 | def upload_file(fp, target_filename, manifest, attempt_count=1) -> None: 119 | print(ox.ts(), f"Uploading {str(fp)!r} to {manifest['doi']!r}") 120 | if debug_mode: 121 | return 122 | 123 | file, checksum = get_file_to_upload(fp, target_filename) 124 | payload = get_payload_to_upload(fp, manifest) 125 | endpoint = f"add?persistentId={manifest['doi']}&key={api_key}" 126 | url = urljoin(base_url, endpoint) 127 | 128 | try: 129 | # upload the file to the server 130 | with requests.Session() as session: 131 | start_time = time.time() 132 | response = session.post(url, data=payload, files=file, timeout=upload_timeout) 133 | elapsed = time.time() - start_time 134 | if not response.ok: 135 | raise Exception(response.text) 136 | 137 | # verify the checksum calculated by the server matches our own 138 | remote_checksum = response.json()["data"]["files"][0]["dataFile"]["md5"] 139 | if checksum != remote_checksum: 140 | msg = f"Checksums do not match: {checksum} and {remote_checksum}" 141 | raise Exception(msg) 142 | 143 | msg = f"Response {response.status_code} in {elapsed:,.1f} seconds, checksums match" 144 | print(ox.ts(), msg) 145 | time.sleep(pause_normal) 146 | 147 | except Exception as e: 148 | print(ox.ts(), e) 149 | if attempt_count < attempts_max: 150 | # retry upload if we haven't exceeded max attempts 151 | attempt_count += 1 152 | print(ox.ts(), f"Re-trying (attempt {attempt_count} of {attempts_max})") 153 | time.sleep(pause_error) 154 | upload_file(fp, target_filename, manifest, attempt_count) 155 | else: 156 | print(ox.ts(), "No more attempts for this file, we give up") 157 | 158 | 159 | # get all draft/published files currently existing on server 160 | draft_files, published_files = get_preexisting_files(manifests) 161 | if delete_existing: 162 | delete_draft_files(draft_files) 163 | draft_files, published_files = get_preexisting_files(manifests) 164 | 165 | # create arguments to upload all remaining files in all staging folders 166 | args_list = [] 167 | for manifest in manifests: 168 | for fp in sorted(Path(manifest["folder"]).glob("*.zip")): 169 | target_filename = f"{fp.stem}_{fp.parent.stem}{fp.suffix}" 170 | if target_filename not in draft_files[manifest["doi"]]: 171 | args_list.append((fp, target_filename, manifest)) 172 | 173 | # process the queue 174 | print(ox.ts(), f"Uploading {len(args_list)} staged files...") 175 | for args in args_list: 176 | upload_file(*args) 177 | -------------------------------------------------------------------------------- /code/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cpus": 24, 3 | 
"cpus_stats": 10, 4 | "doi_gpkg": "doi:10.7910/DVN/E5TPDQ", 5 | "doi_graphml": "doi:10.7910/DVN/KA5HJ3", 6 | "doi_nelist": "doi:10.7910/DVN/DC7U0A", 7 | "elevation_final_path": "/data/snm/elevation/elevations-final.csv", 8 | "elevation_google_elevations_path": "/data/snm/elevation/google/elevations-google.csv", 9 | "elevation_google_urls_path": "/data/snm/elevation/google/urls.csv", 10 | "elevation_nodeclusters_path": "/data/snm/elevation/google/graph-clusters", 11 | "gdem_aster_path": "/data/snm/GDEM/aster_v3/", 12 | "gdem_aster_urls_path": "/data/snm/inputs/gdem-urls/urls-aster_v3.txt", 13 | "gdem_srtm_path": "/data/snm/GDEM/srtmgl1/", 14 | "gdem_srtm_urls_path": "/data/snm/inputs/gdem-urls/urls-srtmgl1.txt", 15 | "indicators_all_metadata_path": "/data/snm/indicators/metadata-indicators-all.csv", 16 | "indicators_all_path": "/data/snm/indicators/indicators-all.csv", 17 | "indicators_metadata_path": "/data/snm/indicators/metadata-indicators.csv", 18 | "indicators_path": "/data/snm/indicators/indicators.csv", 19 | "indicators_street_path": "/data/snm/indicators/indicators-street-network.csv", 20 | "iso_codes_path": "/data/snm/inputs/wikipedia-iso-country-codes.csv", 21 | "models_gpkg_path": "/data/snm/models/gpkg", 22 | "models_graphml_path": "/data/snm/models/graphml", 23 | "models_metadata_edges_path": "/data/snm/models/metadata-graph-edges.csv", 24 | "models_metadata_nodes_path": "/data/snm/models/metadata-graph-nodes.csv", 25 | "models_nelist_path": "/data/snm/models/nelist", 26 | "node_bc_path": "/data/snm/bc", 27 | "osmnx_cache_path": "/data/snm/cache", 28 | "osmnx_log_path": "/data/snm/logs", 29 | "staging_folder": "/data/snm/staging", 30 | "staging_gpkg_path": "/data/snm/staging/gpkg", 31 | "staging_graphml_path": "/data/snm/staging/graphml", 32 | "staging_indicators_path": "/data/snm/staging/indicators", 33 | "staging_metadata_path": "/data/snm/staging/metadata", 34 | "staging_nelist_path": "/data/snm/staging/nelist", 35 | "uc_gpkg_path": "/data/snm/ucs.gpkg", 36 | "uc_input_path": "/data/snm/inputs/GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.gpkg" 37 | } 38 | -------------------------------------------------------------------------------- /code/environment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: snm 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | - geopandas=1.0 7 | - jupyterlab 8 | - networkx=3.4 9 | - numpy=2.2 10 | - osmnx=2.0 11 | - pandas=2.2 12 | - pre-commit 13 | - python=3.13 14 | - python-igraph=0.11 15 | - requests=2.32 16 | - scipy=1.15 17 | -------------------------------------------------------------------------------- /code/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | python ./01-construct-models/01-prep-ghsl.py 4 | python ./01-construct-models/02-download-cache.py 5 | python ./01-construct-models/03-create-graphs.py 6 | python ./02-attach-elevation/01-aster-srtm/01-download-aster_v3.py 7 | python ./02-attach-elevation/01-aster-srtm/02-download-srtmgl1.py 8 | python ./02-attach-elevation/01-aster-srtm/03-build-vrts.py 9 | python ./02-attach-elevation/01-aster-srtm/04-add-node-elevations.py 10 | python ./02-attach-elevation/02-google/01-cluster-nodes.py 11 | python ./02-attach-elevation/02-google/02-make-google-urls.py 12 | python ./02-attach-elevation/02-google/03-download-google-elevations.py 13 | python ./02-attach-elevation/02-google/04-choose-best-elevation.py 14 | python 
./03-calculate-indicators/01-calculate-node-bc.py 15 | python ./03-calculate-indicators/02-calculate-indicators.py 16 | python ./03-calculate-indicators/03-merge-indicators.py 17 | python ./03-calculate-indicators/04-create-metadata.py 18 | python ./04-upload-repository/01-save-files.py 19 | python ./04-upload-repository/02-stage-files.py 20 | python ./04-upload-repository/03-upload-dataverse.py 21 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Citation 2 | 3 | Boeing, G. 2025. Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World. Working paper. https://github.com/gboeing/street-network-models 4 | -------------------------------------------------------------------------------- /paper/latex/main.tex: -------------------------------------------------------------------------------- 1 | % !TeX program = pdflatex 2 | % Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World 3 | % Author: Geoff Boeing 4 | % Web: https://geoffboeing.com/ 5 | % Repo: https://github.com/gboeing/street-network-models 6 | 7 | \RequirePackage[l2tabu,orthodox]{nag} % warn if using any obsolete commands 8 | \documentclass[12pt,letterpaper]{article} % document style 9 | 10 | % load encoding and font packages for pdflatex, in order 11 | \usepackage[T1]{fontenc} % output 8-bit encoded fonts 12 | \usepackage[utf8]{inputenc} % allow input of utf-8 encoded characters 13 | \usepackage{ebgaramond} % document's serif font 14 | \usepackage{tgheros} % document's sans serif font 15 | 16 | % load babel, csquotes, and microtype in order 17 | \usepackage[USenglish]{babel} % auto-regionalize hyphens, quote marks, etc 18 | \usepackage[strict,autostyle]{csquotes} % smart and nestable quote marks 19 | \usepackage[babel=true]{microtype} % enable micro-typographic adjustments 20 | 21 | % load everything else 22 | \usepackage{amsmath} % additional mathematical typesetting features 23 | \usepackage{authblk} % footnote-style author/affiliation info 24 | \usepackage{booktabs} % better looking tables 25 | \usepackage{caption} % custom figure/table caption styles 26 | \usepackage{datetime} % enable formatting of date output 27 | \usepackage[final]{draftwatermark} % watermark paper as a draft 28 | \usepackage{endnotes} % enable endnotes 29 | \usepackage{geometry} % configure page dimensions and margins 30 | \usepackage{graphicx} % better inclusion of graphics 31 | \usepackage{natbib} % textual/parenthetical author-year citations w/bibtex 32 | \usepackage{rotating} % rotate wide tables/figures to make them landscape 33 | \usepackage{setspace} % configure spacing between lines 34 | \usepackage{titlesec} % custom section and subsection heading 35 | \usepackage{url} % make nice line-breakable urls 36 | 37 | % load hyperref/orcidlink last for compatibility 38 | \usepackage{hyperref} % enable hyperlinks and pdf metadata 39 | \usepackage{orcidlink} % provide orcid logo and link 40 | 41 | % print only the month and year when using \today 42 | \newdateformat{monthyeardate}{\monthname[\THEMONTH] \THEYEAR} 43 | 44 | \newcommand{\myname}{Geoff Boeing} 45 | \newcommand{\myemail}{boeing@usc.edu} 46 | \newcommand{\myorcid}{0000-0003-1851-6411} % chktex 8 47 | \newcommand{\myaffiliation}{Department of Urban Planning and Spatial Analysis\\University of Southern California} 48 | \newcommand{\paperdate}{April 2025} 49 | 
\newcommand{\papertitle}{Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World} 50 | \newcommand{\papercitation}{Boeing, G. 2025. \papertitle. Under review at \textit{Journal Name}.} 51 | \newcommand{\paperkeywords}{Urban Planning, Transportation, Data Science} 52 | 53 | % location of figure files, via graphicx package 54 | \graphicspath{{./figures/}} 55 | 56 | % configure the page layout, via geometry package 57 | \geometry{ 58 | paper=letterpaper, % paper size 59 | top=3.8cm, % margin sizes 60 | bottom=3.8cm, 61 | left=4cm, 62 | right=4cm} 63 | \setstretch{1} % line spacing 64 | \clubpenalty=10000 % prevent orphans 65 | \widowpenalty=10000 % prevent widows 66 | 67 | % set section/subsection headings as the sans serif font, via titlesec package 68 | \titleformat{\section}{\normalfont\sffamily\large\bfseries\color{black}}{\thesection.}{0.3em}{} 69 | \titleformat{\subsection}{\normalfont\sffamily\small\bfseries\color{black}}{\thesubsection.}{0.3em}{} 70 | \titleformat{\subsubsection}{\normalfont\sffamily\small\color{black}}{\thesubsubsection.}{0.3em}{} 71 | 72 | % make figure/table captions sans-serif small font 73 | \captionsetup{font={footnotesize,sf},labelfont=bf,labelsep=period} 74 | 75 | % configure pdf metadata and link handling, via hyperref package 76 | \hypersetup{ 77 | pdfauthor={\myname}, 78 | pdftitle={\papertitle}, 79 | pdfsubject={\papertitle}, 80 | pdfkeywords={\paperkeywords}, 81 | pdffitwindow=true, % window fit to page when opened 82 | breaklinks=true, % break links that overflow horizontally 83 | colorlinks=false, % remove link color 84 | pdfborder={0 0 0} % remove link border 85 | } 86 | 87 | \begin{document} 88 | 89 | \title{\papertitle}%\footnote{{Citation info: \papercitation}}} 90 | \author[]{Redacted for review}%\myname~\orcidlink{\myorcid}} 91 | \affil[]{Redacted for review}%\myaffiliation} 92 | \date{}%\paperdate} 93 | 94 | \maketitle 95 | 96 | \begin{abstract} 97 | 98 | In this era of rapid urbanization and change, planners need up-to-date, global, and consistent street network models and indicators to measure resilience and performance, model accessibility, and target local quality-of-life interventions. This article presents up-to-date street network models and indicators for every urban area in the world. It uses 2025 urban area boundaries from the Global Human Settlement Layer, allowing users to join these data with hundreds of other urban attributes. Its workflow ingests 180 million OpenStreetMap nodes and 360 million OpenStreetMap edges across 10,351 urban areas in 189 countries. The code, models, and indicators are publicly available for reuse. These resources unlock worldwide urban street network science without samples as well as local analyses in under-resourced regions where models and indicators are otherwise less-accessible. 99 | 100 | \end{abstract} 101 | 102 | 103 | \section{Introduction} 104 | 105 | Street networks structure the urban fabric and the flow of people and goods through cities \citep{barrington-leigh_global_2020}. Scholars and practitioners commonly use spatial graphs to model street networks to understand or predict many phenomena, including traffic dynamics, accessibility to daily living needs, and the resilience and sustainability of urban forms \citep{barthelemy_spatial_2022}. These spatial graphs are defined by both their topology (connections and configuration) and geometry (positions, lengths, areas, and angles) \citep{fischer_spatial_2014}. 
Various topological and geometric indicators exist throughout the literature to measure important street network characteristics: node degrees reveal streets' connectedness, weighted betweenness centralities identify relatively important parts of the network, circuity suggests its efficiency or lack thereof, etc. These indicators then inform downstream urban analytics to target planning interventions or benchmark and monitor cities' progress toward stated sustainability goals. 106 | 107 | Up-to-date, global, consistent urban street network models and indicators are needed more today than ever before as planners face intertwined sustainability and public health crises in cities around the world \citep{giles-corti_creating_2022}. Meanwhile, urban science seeks to expand beyond the limits of traditional sampling to build universal theory and better understand understudied regions, such as the Global South. Yet traditional data sources and methods present headwinds to these efforts. Data on urban streets are often digitized inconsistently from place to place, thwarting apples-to-apples global comparisons and making analyses particularly difficult in under-resourced regions \citep{liu_generalized_2022}. Popular data sources such as OpenStreetMap offer reasonably high-quality data around the world, but do not package those data in graph-theoretic form nor provide statistics or indicators \citep{boeing_modeling_2025}. Tools like OSMnx aim to fill this gap, but still require coding knowledge to conduct the analysis and potentially extensive computational resources for anyone conducting global urban science. 108 | 109 | This article presents a resource to fill this gap by offering street network models and indicators worldwide for scholars and practitioners to easily reuse without reinventing the wheel. Using data from OpenStreetMap and boundaries from the 2025 Global Human Settlement Layer (GHSL), this study models and analyzes the street networks of every urban area in the world. This workflow ingests 180 million OpenStreetMap nodes and 360 million OpenStreetMap edges across 10,351 urban areas in 189 countries. This article describes this open data repository of street network models and indicators, as well as the open-source software repository containing the code to generate them. The next section describes these reproducible methods. Then we discuss the work's lineage, present contribution, and future. Finally we conclude with suggestions for getting started with these data and code. 110 | 111 | \section{Reproducible Methods} 112 | 113 | The following computational workflow, written in the Python programming language, generates these models and calculates these indicators. 114 | 115 | \subsection{Urban Boundaries} 116 | 117 | The workflow begins by extracting the boundary polygons of each urban area in the world from the 2025 GHSL Urban Centre Database (UCD), which contains 11,422 entities.\ \citet{mari_rivero_urban_2025} describe this input dataset in detail, but to summarize, the GHSL integrates a vast array of census data, remote sensing data, and volunteered geographic information to delineate the world's urbanized areas' boundaries and attach corresponding attribute data. We retain urban areas with >1 km\textsuperscript{2} built-up area and a \enquote{high} GHSL quality control score, resulting in 10,351 urban areas. This provides us with basic filtering to ensure we are modeling true urbanized areas rather than false positives or tiny villages.
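As an illustrative sketch of this filtering step, the selection can be expressed in a few lines of GeoPandas (the column names below are placeholders, not the actual UCD R2024A field names used by the repository's \texttt{01-prep-ghsl.py} script; the input and output paths follow \texttt{config.json}):

\begin{verbatim}
import geopandas as gpd

# load the GHSL UCD urban center polygons (input path from config.json)
ucs = gpd.read_file(
    "/data/snm/inputs/GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.gpkg"
)

# keep urban centers with >1 km2 of built-up area and a "high" QC score
# ("built_up_km2" and "qc_score" are placeholder column names)
keep = (ucs["built_up_km2"] > 1) & (ucs["qc_score"] == "high")
ucs.loc[keep].to_file("/data/snm/ucs.gpkg", driver="GPKG")
\end{verbatim}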
118 | 119 | \subsection{Network Modeling} 120 | 121 | We used OSMnx v2.0.2 to download OpenStreetMap raw data in February 2025 and construct a spatial graph model of the drivable street network within each urban area. These models are nonplanar directed multigraphs with possible self-loops. They have node/edge attribute data from OpenStreetMap plus geographic coordinates and geometries \citep{boeing_modeling_2025}. We parameterize OSMnx to use its \enquote{drive} network type, retain all graph components, and run its edge simplification algorithm \citep{boeing_topological_2025}. Each urban area's graph is saved as a GraphML file, a standard graph serialization format. 122 | 123 | \subsection{Elevation} 124 | 125 | We attach elevation, in meters above sea level, to each node in each urban area's graph using two global digital elevation models (GDEMs): the Advanced Spaceborne Thermal Emission and Reflection Radiometer (ASTER) v3 GDEM, and the Shuttle Radar Topography Mission (SRTM) version 3.0 GDEM with voids filled. Both are 1 arcsecond (approximately 30-meter) resolution. First we download all the GDEM rasters for ASTER (45,824 tiles) and SRTM (14,297 tiles) from NASA EarthData. Next we build a virtual raster for each source. Then we use OSMnx to load each GraphML file and attach the elevation from ASTER and SRTM to each graph node. 126 | 127 | As each node has both an ASTER and an SRTM elevation value, we choose one to use as the \enquote{official} node elevation by comparing both to a \enquote{tie-breaker} value from Google. To do so, we download each node's elevation from the Google Maps Elevation API, then choose between ASTER and SRTM based on whichever is nearer to Google's value. Then we calculate edge grades and re-save each GraphML file with these node/edge attributes. 128 | 129 | \begin{table}[bth!] 130 | \centering 131 | \scriptsize 132 | \caption{The indicators dataset contents.
Variables carried over from GHSL are noted.}\label{tab:indicators} 133 | \begin{tabular}{p{3.0cm} p{1.0cm} p{8.2cm}} 134 | \toprule 135 | Variable & Type & Description \\ 136 | \midrule 137 | area\_km2 & integer & Area within urban center boundary polygon, km\textsuperscript{2} (GHSL) \\ 138 | bc\_gini & decimal & Gini coefficient of normalized distance-weighted node betweenness centralities \\ 139 | bc\_max & decimal & Max normalized distance-weighted node betweenness centrality \\ 140 | built\_up\_area\_m2 & integer & Built-up surface area, square meters (GHSL) \\ 141 | cc\_avg\_dir & decimal & Average clustering coefficient (unweighted/directed) \\ 142 | cc\_avg\_undir & decimal & Average clustering coefficient (unweighted/undirected) \\ 143 | cc\_wt\_avg\_dir & decimal & Average clustering coefficient (weighted/directed) \\ 144 | cc\_wt\_avg\_undir & decimal & Average clustering coefficient (weighted/undirected) \\ 145 | circuity & decimal & Ratio of street lengths to straightline distances \\ 146 | core\_city & string & Urban center core city name \\ 147 | country & string & Primary country name \\ 148 | country\_iso & string & Primary country ISO 3166--1 alpha--3 code \\ 149 | elev\_iqr & decimal & Interquartile range of node elevations, meters \\ 150 | elev\_mean & decimal & Mean node elevation, meters \\ 151 | elev\_median & decimal & Median node elevation, meters \\ 152 | elev\_range & decimal & Range of node elevations, meters \\ 153 | elev\_std & decimal & Standard deviation of node elevations, meters \\ 154 | grade\_mean & decimal & Mean absolute street grade (incline) \\ 155 | grade\_median & decimal & Median absolute street grade (incline) \\ 156 | intersect\_count & integer & Count of (undirected) edge intersections \\ 157 | intersect\_count\_clean & integer & Count of street intersections (merged within 10 meters geometrically) \\ 158 | intersect\_count\_clean\_topo & integer & Count of street intersections (merged within 10 meters topologically) \\ 159 | k\_avg & decimal & Average node degree (undirected) \\ 160 | length\_mean & decimal & Mean street segment length (undirected edges), meters \\ 161 | length\_median & decimal & Median street segment length (undirected edges), meters \\ 162 | length\_total & decimal & Total street length (undirected edges), meters \\ 163 | node\_count & integer & Count of nodes \\ 164 | orientation\_entropy & decimal & Entropy of street network bearings \\ 165 | pagerank\_max & decimal & The maximum PageRank value of any node \\ 166 | prop\_4way & decimal & Proportion of nodes that represent 4-way street intersections \\ 167 | prop\_3way & decimal & Proportion of nodes that represent 3-way street intersections \\ 168 | prop\_deadend & decimal & Proportion of nodes that represent dead-ends \\ 169 | resident\_pop & integer & Total resident population (GHSL) \\ 170 | self\_loop\_proportion & decimal & Proportion of edges that are self-loops \\ 171 | straightness & decimal & Inverse of circuity \\ 172 | street\_segment\_count & integer & Count of streets (undirected edges) \\ 173 | uc\_id & integer & Urban center unique ID (GHSL) \\ 174 | uc\_names & string & List of city names within this urban center (GHSL) \\ 175 | world\_region & string & UN SDG geographic region \\ 176 | \bottomrule 177 | \end{tabular} 178 | \end{table} 179 | 180 | \subsection{Indicator Calculation} 181 | 182 | For each graph, we calculate the various street network indicators described in Table~\ref{tab:indicators}.
These include geometric and topological measures common in transport planning, urban design, and statistical physics. We report node counts, intersection counts (i.e., non-dead-end nodes), and both geometrically and topologically consolidated intersection counts, using the algorithm described in \citet{boeing_topological_2025}. However, the most important contribution here is the calculation of node betweenness centrality for every node in every graph. A node's betweenness centrality measures the share of all possible shortest paths in a graph that use that node. High centrality values indicate \enquote{important} nodes relied on by many shortest paths. The maximum betweenness centrality represents the highest relative value in a graph (and thus identifies the most important node), and their Gini coefficient measures the concentration of importance in a network, indicating the presence and severity of chokepoints. 183 | 184 | \subsection{Data Repository Preparation} 185 | 186 | We convert each GraphML file to a GeoPackage and node/edge list files. The former allows users to work with these spatial networks in any GIS software. The latter provides a minimal, lightweight, highly compressible version of the models. Then we perform a series of file verification checks and create metadata files for the graphs' node and edge attributes and all of the indicators. Finally we compress and upload all model files (GeoPackages, GraphML, and node/edge lists), indicators, and metadata to the Harvard Dataverse. 187 | 188 | \section{Code and Data Products} 189 | 190 | \subsection{Code Repository} 191 | 192 | The preceding methods are fully reproducible by running the modeling and analytics workflow, which is publicly available in the source code repository\endnote{Code repository: https://github.com/gboeing/street-network-models} on Github. A well-equipped personal computer can execute this workflow, but given the resource requirements it may be better (and faster) to run it in a high-performance computing cluster, where available. The code is written in Python and is operating system agnostic. The input data, dependencies, and resources required to run it are documented in the repository's readme file. 193 | 194 | \subsection{Data Repository} 195 | 196 | The data repository comprises five datasets nested within a top-level Dataverse\endnote{Top-level Dataverse: https://dataverse.harvard.edu/dataverse/global-urban-street-networks} data repository: 197 | 198 | \begin{itemize} 199 | \item Global Urban Street Networks GeoPackages\endnote{Global Urban Street Networks GeoPackages: https://doi.org/10.7910/DVN/E5TPDQ} 200 | \item Global Urban Street Networks GraphML files\endnote{Global Urban Street Networks GraphML files: https://doi.org/10.7910/DVN/KA5HJ3} 201 | \item Global Urban Street Networks Node/Edge lists\endnote{Global Urban Street Networks Node/Edge lists: https://doi.org/10.7910/DVN/DC7U0A} 202 | \item Global Urban Street Networks Indicators \endnote{Global Urban Street Networks Indicators: https://doi.org/10.7910/DVN/ZTFPTB} 203 | \item Global Urban Street Networks Metadata \endnote{Global Urban Street Networks Metadata: https://doi.org/10.7910/DVN/WMPPF9} 204 | \end{itemize} 205 | 206 | The model files are zipped at the country level, and each file (and indicators row) is identified by its urban area name and UCD ID.\ The latter allows users to join them to GHSL attribute data. 
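For example, the indicators table can be re-joined to the full set of GHSL UCD attributes on this ID. A minimal sketch follows (assuming the indicators CSV and the UCD GeoPackage have been downloaded locally; the file paths are placeholders, while the \texttt{uc\_id}/\texttt{ID\_UC\_G0} join keys mirror the repository's merge script):

\begin{verbatim}
import geopandas as gpd
import pandas as pd

ind = pd.read_csv("indicators.csv")  # from the indicators dataset
ucs = gpd.read_file("GHS_UCDB_GLOBE_R2024A.gpkg")  # GHSL UCD release

# join street network indicators to GHSL urban center attributes
df = ind.merge(ucs, how="inner", left_on="uc_id", right_on="ID_UC_G0")
\end{verbatim}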
207 | 
208 | \section{Discussion: Lineage and Contribution}
209 | 
210 | In an era of rapid urbanization, scholars and practitioners need models and indicators that keep up with the pace of transformational urban change. This project builds on prior work initially conducted in 2019--2020 that generated a preliminary version of the data repository \citep{boeing_street_2022}. That initial version was based on the 2015 version of the GHSL UCD and 2020 OpenStreetMap data. This new version takes advantage of the intervening years of data and methodological advances, using the 2025 GHSL UCD and 2025 OpenStreetMap data to make six primary contributions.
211 | 
212 | First, it includes over 1,400 more urban areas and 11 more countries than the earlier version, substantially expanding worldwide coverage in an era of rapid urban expansion.
213 | 
214 | Second, these new models incorporate 10 years of recent urbanization in their updated urban area boundaries and 5 years of new community additions to OpenStreetMap. Accordingly, this workflow modeled approximately 20 million more street network nodes and 40 million more edges than the earlier version. The new urban boundaries allow users to link these street network models and indicators to hundreds of new, up-to-date GHSL attributes on urban climate, land use, economic conditions, and more.
215 | 
216 | Third, it adds new attributes and indicators to the repository---most consequentially the betweenness centrality of every node in every urban area's street network, which is extremely time- and resource-intensive to calculate, yet unlocks powerful analyses of network structure and resilience for urban science.
217 | 
218 | Fourth, it uses finer-grained SRTM data (30\,m resolution instead of the previous 90\,m) for more precise elevation attribute values.
219 | 
220 | Fifth, from a \textit{code product} perspective, the workflow's code base has been wholly refactored and rewritten from the ground up to significantly reduce its cyclomatic complexity, memory use, and runtime. This makes the workflow more maintainable and sustainable, and easier to re-run in the future to periodically update the data repository whenever new GHSL data are released.
221 | 
222 | Sixth, and finally, these models and indicators themselves unlock other researchers' work. This project provides a global dataset for conducting both worldwide urban street network science beyond limited samples and local analyses, particularly in less-resourced regions where such models and indicators are most needed yet most scarce.
223 | 
224 | \section{Getting Started}
225 | 
226 | To get started, users may download the models or indicators directly from the Dataverse datasets listed above, or access the source code and documentation in the GitHub repository noted earlier.
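For example, after unzipping a country-level archive, an individual urban area's model can be loaded and explored with OSMnx in a few lines of Python; the filename below is a placeholder for one of the extracted GraphML files.

\begin{verbatim}
# Sketch: load one downloaded street network model and inspect it.
# The filename is a placeholder for a GraphML file extracted from a
# country-level zip archive downloaded from the Dataverse.
import osmnx as ox

G = ox.load_graphml("example-urban-center.graphml")
print(len(G.nodes), len(G.edges))

# convert to GeoDataFrames for mapping or GIS-style analysis
nodes, edges = ox.graph_to_gdfs(G)
print(edges.head())
\end{verbatim}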
227 | 228 | % print the footnotes as endnotes, if any exist 229 | \IfFileExists{\jobname.ent}{\theendnotes}{} 230 | 231 | % print the bibliography 232 | \setlength{\bibsep}{0.00cm plus 0.05cm} % no space between items 233 | \bibliographystyle{apalike} 234 | \bibliography{references} 235 | 236 | \end{document} 237 | -------------------------------------------------------------------------------- /paper/latex/references.bib: -------------------------------------------------------------------------------- 1 | 2 | @Article{ barrington-leigh_global_2020, 3 | author = {Barrington-Leigh, Christopher and Millard-Ball, Adam}, 4 | journal = {Proceedings of the National Academy of Sciences}, 5 | month = jan, 6 | number = {4}, 7 | pages = {1941--1950}, 8 | title = {Global trends toward urban street-network sprawl}, 9 | volume = {117}, 10 | year = {2020}, 11 | doi = {10.1073/pnas.1905232116}, 12 | issn = {0027-8424, 1091-6490}, 13 | language = {en} 14 | } 15 | 16 | @Book{ barthelemy_spatial_2022, 17 | address = {Cham}, 18 | author = {Barthelemy, Marc}, 19 | publisher = {Springer International Publishing}, 20 | title = {Spatial {Networks}: {A} {Complete} {Introduction}}, 21 | year = {2022}, 22 | isbn = {978-3-030-94105-5 978-3-030-94106-2}, 23 | language = {en} 24 | } 25 | 26 | @InCollection{ fischer_spatial_2014, 27 | address = {Berlin, Germany}, 28 | author = {O'Sullivan, David}, 29 | booktitle = {Handbook of {Regional} {Science}}, 30 | editor = {Fischer, Manfred M. and Nijkamp, Peter}, 31 | pages = {1253--1273}, 32 | publisher = {Springer-Verlag}, 33 | title = {Spatial {Network} {Analysis}}, 34 | year = {2014}, 35 | isbn = {978-3-642-23429-3}, 36 | language = {en} 37 | } 38 | 39 | @Article{ giles-corti_creating_2022, 40 | author = {Giles-Corti, Billie and Moudon, Anne Vernez and Lowe, 41 | Melanie and Adlakha, Deepti and Cerin, Ester and Boeing, 42 | Geoff and Higgs, Carl and Arundel, Jonathan and Liu, Shiqin 43 | and Hinckson, Erica and Salvo, Deborah and Adams, Marc A 44 | and Badland, Hannah and Florindo, Alex A and Gebel, Klaus 45 | and Hunter, Ruth F and Mitáš, Josef and Oyeyemi, Adewale 46 | L and Puig-Ribera, Anna and Queralt, Ana and Santos, Maria 47 | Paula and Schipperijn, Jasper and Stevenson, Mark and Dyck, 48 | Delfien Van and Vich, Guillem and Sallis, James F}, 49 | journal = {The Lancet Global Health}, 50 | month = jun, 51 | number = {6}, 52 | pages = {e782--e785}, 53 | title = {Creating healthy and sustainable cities: what gets 54 | measured, gets done}, 55 | volume = {10}, 56 | year = {2022}, 57 | doi = {10.1016/S2214-109X(22)00070-5}, 58 | issn = {2214109X}, 59 | language = {en} 60 | } 61 | 62 | @Article{ liu_generalized_2022, 63 | author = {Liu, Shiqin and Higgs, Carl and Arundel, Jonathan and 64 | Boeing, Geoff and Cerdera, Nicholas and Moctezuma, David 65 | and Cerin, Ester and Adlakha, Deepti and Lowe, Melanie and 66 | Giles‐Corti, Billie}, 67 | journal = {Geographical Analysis}, 68 | month = jul, 69 | number = {3}, 70 | pages = {559--582}, 71 | title = {A {Generalized} {Framework} for {Measuring} {Pedestrian} 72 | {Accessibility} around the {World} {Using} {Open} {Data}}, 73 | volume = {54}, 74 | year = {2022}, 75 | doi = {10.1111/gean.12290}, 76 | issn = {0016-7363, 1538-4632}, 77 | language = {en} 78 | } 79 | 80 | @Article{ boeing_modeling_2025, 81 | author = {Boeing, Geoff}, 82 | journal = {Geographical Analysis}, 83 | volume = {published online ahead of print}, 84 | title = {Modeling and {Analyzing} {Urban} {Networks} and 85 | {Amenities} with {OSMnx}}, 86 | year = {2025} 
87 | }
88 | 
89 | @Misc{ mari_rivero_urban_2025,
90 |   address = {http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd},
91 |   author = {Mari Rivero, I. and Melchiorri, M. and Florio, P. and
92 |             Schiavina, M. and others},
93 |   publisher = {European Commission, Joint Research Centre (JRC)},
94 |   title = {Urban {Centre} {Database} 2025},
95 |   year = {2025},
96 |   doi = {10.2905/1A338BE6-7EAF-480C-9664-3A8ADE88CBCD},
97 |   url = {http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd}
98 | }
99 | 
100 | 
101 | @Article{ boeing_topological_2025,
102 |   author = {Boeing, Geoff},
103 |   journal = {Transactions in GIS},
104 |   volume = {published online ahead of print},
105 |   title = {Topological {Graph} {Simplification} {Solutions} to the
106 |            {Street} {Intersection} {Miscount} {Problem}},
107 |   year = {2025}
108 | }
109 | 
110 | @Article{ boeing_street_2022,
111 |   author = {Boeing, Geoff},
112 |   journal = {Geographical Analysis},
113 |   month = jul,
114 |   number = {3},
115 |   pages = {519--535},
116 |   title = {Street {Network} {Models} and {Indicators} for {Every}
117 |            {Urban} {Area} in the {World}},
118 |   volume = {54},
119 |   year = {2022},
120 |   doi = {10.1111/gean.12281},
121 |   issn = {0016-7363, 1538-4632},
122 |   language = {en}
123 | }
124 | 
--------------------------------------------------------------------------------