├── .github ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE.txt ├── README.md ├── code ├── 01-construct-models │ ├── 01-prep-ghsl.py │ ├── 02-download-cache.py │ └── 03-create-graphs.py ├── 02-attach-elevation │ ├── 01-aster-srtm │ │ ├── 01-download-aster_v3.py │ │ ├── 02-download-srtmgl1.py │ │ ├── 03-build-vrts.py │ │ └── 04-add-node-elevations.py │ └── 02-google │ │ ├── 01-cluster-nodes.py │ │ ├── 02-make-google-urls.py │ │ ├── 03-download-google-elevations.py │ │ └── 04-choose-best-elevation.py ├── 03-calculate-indicators │ ├── 01-calculate-node-bc.py │ ├── 02-calculate-indicators.py │ ├── 03-merge-indicators.py │ └── 04-create-metadata.py ├── 04-upload-repository │ ├── 01-save-files.py │ ├── 02-stage-files.py │ └── 03-upload-dataverse.py ├── config.json ├── environment.yml └── run.sh └── paper ├── README.md └── latex ├── main.tex └── references.bib /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: weekly 8 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | 4 | on: # yamllint disable-line rule:truthy 5 | push: 6 | branches: [main] 7 | pull_request: 8 | branches: [main] 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | name: ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest] 19 | 20 | defaults: 21 | run: 22 | shell: bash -elo pipefail {0} 23 | 24 | steps: 25 | - name: Checkout repo 26 | uses: actions/checkout@v4 27 | 28 | - name: Create environment with Micromamba 29 | uses: mamba-org/setup-micromamba@v2 30 | with: 31 | cache-environment: true 32 | environment-file: ./code/environment.yml 33 | post-cleanup: none 34 | 35 | - name: Cache pre-commit 36 | uses: actions/cache@v4 37 | with: 38 | path: ~/.cache/pre-commit/ 39 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 40 | 41 | - name: Run pre-commit checks 42 | run: pre-commit run --all-files 43 | env: 44 | SKIP: no-commit-to-branch 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb 2 | *.pdf 3 | .DS_Store 4 | keys.py 5 | paper/analysis 6 | paper/submission 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # UV 105 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | #uv.lock 109 | 110 | # poetry 111 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 112 | # This is especially recommended for binary packages to ensure reproducibility, and is more 113 | # commonly ignored for libraries. 114 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 115 | #poetry.lock 116 | 117 | # pdm 118 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 119 | #pdm.lock 120 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 121 | # in version control. 122 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 123 | .pdm.toml 124 | .pdm-python 125 | .pdm-build/ 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 175 | #.idea/ 176 | 177 | # PyPI configuration file 178 | .pypirc 179 | 180 | ## Core latex/pdflatex auxiliary files: 181 | *.aux 182 | *.lof 183 | *.log 184 | *.lot 185 | *.fls 186 | *.out 187 | *.toc 188 | *.fmt 189 | *.fot 190 | *.cb 191 | *.cb2 192 | .*.lb 193 | 194 | ## Intermediate documents: 195 | *.dvi 196 | *.xdv 197 | *-converted-to.* 198 | # these rules might exclude image files for figures etc. 199 | # *.ps 200 | # *.eps 201 | # *.pdf 202 | 203 | ## Generated if empty string is given at "Please type another file name for output:" 204 | .pdf 205 | 206 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 207 | *.bbl 208 | *.bcf 209 | *.blg 210 | *-blx.aux 211 | *-blx.bib 212 | *.run.xml 213 | 214 | ## Build tool auxiliary files: 215 | *.fdb_latexmk 216 | *.synctex 217 | *.synctex(busy) 218 | *.synctex.gz 219 | *.synctex.gz(busy) 220 | *.pdfsync 221 | 222 | ## Build tool directories for auxiliary files 223 | # latexrun 224 | latex.out/ 225 | 226 | ## Auxiliary and intermediate files from other packages: 227 | # algorithms 228 | *.alg 229 | *.loa 230 | 231 | # achemso 232 | acs-*.bib 233 | 234 | # amsthm 235 | *.thm 236 | 237 | # beamer 238 | *.nav 239 | *.pre 240 | *.snm 241 | *.vrb 242 | 243 | # changes 244 | *.soc 245 | 246 | # comment 247 | *.cut 248 | 249 | # cprotect 250 | *.cpt 251 | 252 | # elsarticle (documentclass of Elsevier journals) 253 | *.spl 254 | 255 | # endnotes 256 | *.ent 257 | 258 | # fixme 259 | *.lox 260 | 261 | # feynmf/feynmp 262 | *.mf 263 | *.mp 264 | *.t[1-9] 265 | *.t[1-9][0-9] 266 | *.tfm 267 | 268 | #(r)(e)ledmac/(r)(e)ledpar 269 | *.end 270 | *.?end 271 | *.[1-9] 272 | *.[1-9][0-9] 273 | *.[1-9][0-9][0-9] 274 | *.[1-9]R 275 | *.[1-9][0-9]R 276 | *.[1-9][0-9][0-9]R 277 | *.eledsec[1-9] 278 | *.eledsec[1-9]R 279 | *.eledsec[1-9][0-9] 280 | *.eledsec[1-9][0-9]R 281 | *.eledsec[1-9][0-9][0-9] 282 | *.eledsec[1-9][0-9][0-9]R 283 | 284 | # glossaries 285 | *.acn 286 | *.acr 287 | *.glg 288 | *.glo 289 | *.gls 290 | *.glsdefs 291 | *.lzo 292 | *.lzs 293 | *.slg 294 | *.slo 295 | *.sls 296 | 297 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
298 | # *.ist 299 | 300 | # gnuplot 301 | *.gnuplot 302 | *.table 303 | 304 | # gnuplottex 305 | *-gnuplottex-* 306 | 307 | # gregoriotex 308 | *.gaux 309 | *.glog 310 | *.gtex 311 | 312 | # htlatex 313 | *.4ct 314 | *.4tc 315 | *.idv 316 | *.lg 317 | *.trc 318 | *.xref 319 | 320 | # hyperref 321 | *.brf 322 | 323 | # knitr 324 | *-concordance.tex 325 | # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files 326 | # *.tikz 327 | *-tikzDictionary 328 | 329 | # listings 330 | *.lol 331 | 332 | # luatexja-ruby 333 | *.ltjruby 334 | 335 | # makeidx 336 | *.idx 337 | *.ilg 338 | *.ind 339 | 340 | # minitoc 341 | *.maf 342 | *.mlf 343 | *.mlt 344 | *.mtc[0-9]* 345 | *.slf[0-9]* 346 | *.slt[0-9]* 347 | *.stc[0-9]* 348 | 349 | # minted 350 | _minted* 351 | *.pyg 352 | 353 | # morewrites 354 | *.mw 355 | 356 | # newpax 357 | *.newpax 358 | 359 | # nomencl 360 | *.nlg 361 | *.nlo 362 | *.nls 363 | 364 | # pax 365 | *.pax 366 | 367 | # pdfpcnotes 368 | *.pdfpc 369 | 370 | # sagetex 371 | *.sagetex.sage 372 | *.sagetex.py 373 | *.sagetex.scmd 374 | 375 | # scrwfile 376 | *.wrt 377 | 378 | # svg 379 | svg-inkscape/ 380 | 381 | # sympy 382 | *.sout 383 | *.sympy 384 | sympy-plots-for-*.tex/ 385 | 386 | # pdfcomment 387 | *.upa 388 | *.upb 389 | 390 | # pythontex 391 | *.pytxcode 392 | pythontex-files-*/ 393 | 394 | # tcolorbox 395 | *.listing 396 | 397 | # thmtools 398 | *.loe 399 | 400 | # TikZ & PGF 401 | *.dpth 402 | *.md5 403 | *.auxlock 404 | 405 | # titletoc 406 | *.ptc 407 | 408 | # todonotes 409 | *.tdo 410 | 411 | # vhistory 412 | *.hst 413 | *.ver 414 | 415 | # easy-todo 416 | *.lod 417 | 418 | # xcolor 419 | *.xcp 420 | 421 | # xmpincl 422 | *.xmpi 423 | 424 | # xindy 425 | *.xdy 426 | 427 | # xypic precompiled matrices and outlines 428 | *.xyc 429 | *.xyd 430 | 431 | # endfloat 432 | *.ttt 433 | *.fff 434 | 435 | # Latexian 436 | TSWLatexianTemp* 437 | 438 | ## Editors: 439 | # WinEdt 440 | *.bak 441 | *.sav 442 | 443 | # Texpad 444 | .texpadtmp 445 | 446 | # LyX 447 | *.lyx~ 448 | 449 | # Kile 450 | *.backup 451 | 452 | # gummi 453 | .*.swp 454 | 455 | # KBibTeX 456 | *~[0-9]* 457 | 458 | # TeXnicCenter 459 | *.tps 460 | 461 | # auto folder when using emacs and auctex 462 | ./auto/* 463 | *.el 464 | 465 | # expex forward references with \gathertags 466 | *-tags.tex 467 | 468 | # standalone packages 469 | *.sta 470 | 471 | # Makeindex log files 472 | *.lpz 473 | 474 | # xwatermark package 475 | *.xwm 476 | 477 | # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib 478 | # option is specified. Footnotes are the stored in a file with suffix Notes.bib. 479 | # Uncomment the next line to have this generated file ignored. 
480 | #*Notes.bib 481 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: check-added-large-files 7 | args: [--maxkb=50] 8 | - id: check-ast 9 | - id: check-case-conflict 10 | - id: check-executables-have-shebangs 11 | - id: check-json 12 | - id: check-merge-conflict 13 | args: [--assume-in-merge] 14 | - id: check-shebang-scripts-are-executable 15 | - id: check-toml 16 | - id: check-xml 17 | - id: check-yaml 18 | - id: detect-private-key 19 | - id: end-of-file-fixer 20 | - id: fix-byte-order-marker 21 | - id: mixed-line-ending 22 | - id: no-commit-to-branch 23 | - id: pretty-format-json 24 | args: [--autofix] 25 | - id: trailing-whitespace 26 | 27 | - repo: https://github.com/adrienverge/yamllint 28 | rev: v1.37.0 29 | hooks: 30 | - id: yamllint 31 | args: 32 | - --strict 33 | - > 34 | -d={extends: default, rules: { 35 | quoted-strings: {quote-type: single, required: only-when-needed}}} 36 | 37 | - repo: https://github.com/astral-sh/ruff-pre-commit 38 | rev: v0.11.4 39 | hooks: 40 | - id: ruff 41 | args: 42 | - --fix 43 | - --line-length=100 44 | - --select=ALL 45 | - --ignore=ANN,BLE001,D,N803,N806,PD901,S,SLF001,T201,TRY002,TRY301 46 | - --no-cache 47 | - id: ruff-format 48 | args: 49 | - --line-length=100 50 | - --no-cache 51 | 52 | - repo: https://github.com/Lucas-C/pre-commit-hooks 53 | rev: v1.5.5 54 | hooks: 55 | - id: remove-tabs 56 | 57 | - repo: https://github.com/meliache/pre-commit-chktex 58 | rev: v0.2.2 59 | hooks: 60 | - id: chktex-conda 61 | args: [-H, -I] 62 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: "Urban Street Network Models and Measures" 3 | authors: 4 | - family-names: "Boeing" 5 | given-names: "Geoff" 6 | orcid: "https://orcid.org/0000-0003-1851-6411" 7 | website: "https://geoffboeing.com" 8 | url: "https://github.com/gboeing/street-network-models" 9 | repository-code: "https://github.com/gboeing/street-network-models" 10 | preferred-citation: 11 | type: report 12 | title: "Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World" 13 | authors: 14 | - family-names: "Boeing" 15 | given-names: "Geoff" 16 | orcid: "https://orcid.org/0000-0003-1851-6411" 17 | website: "https://geoffboeing.com" 18 | year: 2025 19 | url: "https://github.com/gboeing/street-network-models" 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2025 Geoff Boeing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Urban Street Network Models and Indicators 2 | 3 | This project uses [OSMnx](https://github.com/gboeing/osmnx) to model and analyze the street networks of every urban area in the world, then shares the results (models and indicators) in an open data [repository](https://dataverse.harvard.edu/dataverse/global-urban-street-networks) in the Harvard Dataverse. 4 | 5 | ## Citation 6 | 7 | Boeing, G. 2025. Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World. Working paper. https://github.com/gboeing/street-network-models 8 | 9 | ## Computing environment 10 | 11 | The following sections provide notes on reproducibility. Given the resource requirements, it's best to run the workflow on a high-performance computing cluster, but it's feasible to run it on a well-equipped personal computer. 12 | 13 | System requirements: 14 | 15 | - RAM/CPU: a minimum of 32 GB for single-threaded execution (note: you'll have to edit `config.json` to set the CPU counts to 1). 128 GB and 24 CPU cores are recommended for multithreaded execution as parameterized in the config file. 16 | - Disk space: 2 TB. 17 | - OS: agnostic, but this workflow was developed and tested on Linux. 18 | 19 | Runtime environment: create a new [conda](https://conda.io) environment from the `environment.yml` file to install all the packages needed to run the workflow. If you wish, you can install a Jupyter kernel in it, e.g., `python -m ipykernel install --user --name snm --display-name "Python (snm)"`. 20 | 21 | ## Input data 22 | 23 | Create a project data root folder with an `inputs` subfolder and place the unzipped [input data](https://drive.usercontent.google.com/download?id=1UrHub0mX0LwybpEOKmwHgEvUgrMj0C7y&export=download) in it. This project uses the Global Human Settlement Layer urban centers dataset to define the world's urban areas' boundary polygons, specifically its Urban Centre Database 2025: 24 | 25 | > Mari Rivero, Ines; Melchiorri, Michele; Florio, Pietro; Schiavina, Marcello; Goch, Katarzyna; Politis, Panagiotis; Uhl, Johannes; Pesaresi, Martino; Maffenini, Luca; Sulis, Patrizia; Crippa, Monica; Guizzardi, Diego; Pisoni, Enrico; Belis, Claudio; Jacome Felix Oom, Duarte; Branco, Alfredo; Mwaniki, Dennis; Kochulem, Edwin; Githira, Daniel; Carioli, Alessandra; Ehrlich, Daniele; Tommasi, Pierpaolo; Kemper, Thomas; Dijkstra, Lewis (2024): GHS-UCDB R2024A - GHS Urban Centre Database 2025. European Commission, Joint Research Centre (JRC) [Dataset] doi: 10.2905/1a338be6-7eaf-480c-9664-3a8ade88cbcd PID: http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd 26 | 27 | ## Workflow 28 | 29 | The workflow is organized into folders and scripts, as follows. 30 | 31 | ### 1. Construct models 32 | 33 | #### 1.1. Prep data
34 | 35 | Load the GHS urban centers dataset, retain the useful columns, and save it as a GeoPackage file. 36 | 37 | #### 1.2. Download cache 38 | 39 | Use OSMnx to download raw OSM data to a cache for subsequent parallel processing. 40 | 41 | #### 1.3. Create graphs 42 | 43 | Use the cached raw OSM data to construct a MultiDiGraph of each street network. This can be done in parallel with multiprocessing by changing the `cpus` config setting. Saves each graph to disk as a GraphML file. Parameterized to get only drivable streets, retain all components, simplify the topology, and truncate by edge. Does this for every urban center's polygon boundary if it meets the following conditions: 44 | 45 | - is marked with a "high" quality control score 46 | - has >1 km2 of built-up area 47 | - includes ≥3 nodes 48 | 49 | ### 2. Attach elevation 50 | 51 | This project uses three data sources for elevation: 52 | 53 | 1. [ASTERv3](https://www.earthdata.nasa.gov/data/instruments/aster) GDEM at 30 meter resolution 54 | 2. [SRTMGL1](https://www.earthdata.nasa.gov/news/nasa-shuttle-radar-topography-mission-srtm-version-30-global-1-arc-second-data-released-over) GDEM at 30 meter resolution with voids filled (version 3.0 global 1 arc second) 55 | 3. Google Maps Elevation API 56 | 57 | We use ASTER and SRTM to attach elevation data to each graph node in each model, then calculate edge grades. Both are public, free, open data. We use Google Maps elevation only as a validation dataset. 58 | 59 | A few notes: a previous iteration of this project used [CGIAR](https://srtm.csi.cgiar.org)'s post-processed SRTM v4.1, but they only provide SRTM data at 90 m resolution. The Google billing scheme is changing in March 2025, which may make Google elevation data collection at this scale infeasible in the future without substantial funding. Historically, each billing account got $200 of free usage credit each month and the price per HTTP request was $0.005, so you could make up to 200 / 0.005 = 40,000 free requests each month, within the usage limits of 512 locations per request and 6,000 requests per minute. URLs must be properly encoded to be valid and are limited to 16,384 characters for all web services. With three billing accounts, you could run this entire workflow for free once a month. 60 | 61 | #### 2.1. ASTER and SRTM 62 | 63 | ##### 2.1.1. Download ASTER 64 | 65 | Download each ASTER DEM tif file (requires NASA EarthData login credentials). 66 | 67 | ##### 2.1.2. Download SRTM 68 | 69 | Download each SRTM DEM hgt file (requires NASA EarthData login credentials). 70 | 71 | ##### 2.1.3. Build VRTs 72 | 73 | Build two VRT virtual raster files (one for all the ASTER files and one for all the SRTM files) for subsequent querying. 74 | 75 | ##### 2.1.4. Attach node elevations 76 | 77 | Load each GraphML file saved in step 1.3, add SRTM and ASTER elevation attributes to each node by querying the VRTs, then re-save the GraphML to disk. 78 | 79 | #### 2.2. Google Elevation 80 | 81 | ##### 2.2.1. Cluster nodes 82 | 83 | We want to send node coordinates to the elevation API in batches, but each batch needs to consist of (approximately) adjacent nodes because the Google API uses a smoothing function to estimate elevation. If the nodes are from different parts of the planet (or at different elevations), this smoothing will result in very coarse-grained approximations of individual nodes' elevations.
So, load all the node coordinates for each graph, spatially cluster them into equal-size clusters of up to 512 coordinates apiece, then save them as a CSV file. 84 | 85 | ##### 2.2.2. Make URLs 86 | 87 | Load the CSV file of node clusters and construct an API URL for each cluster, with a key (requires 3 Google API keys). 88 | 89 | ##### 2.2.3. Download Google elevations 90 | 91 | Request each URL and save the node ID and elevation to disk for all nodes. 92 | 93 | ##### 2.2.4. Choose best elevation 94 | 95 | Load each GraphML file and, for each node, select either the ASTER or SRTM value to use as the official node elevation, based on which is closer to the Google value (using Google as the tie-breaker). Then calculate all edge grades and add them as edge attributes. Re-save the graph to disk as GraphML. 96 | 97 | ### 3. Calculate stats 98 | 99 | #### 3.1. Calculate betweenness centrality 100 | 101 | Load each GraphML file and calculate length-weighted node betweenness centrality for all nodes, using igraph. 102 | 103 | #### 3.2. Calculate stats 104 | 105 | Load each saved graph's GraphML file. Calculate each stat as described in the metadata file. 106 | 107 | #### 3.3. Merge stats 108 | 109 | Merge the street network stats with the urban centers stats (from the GeoPackage file created in step 1.1). Save to disk with indicators named as described in the metadata file. 110 | 111 | #### 3.4. Create metadata 112 | 113 | Create metadata files for the graphs (node/edge attributes) and stats. 114 | 115 | ### 4. Upload repository 116 | 117 | #### 4.1. Generate files 118 | 119 | Save the graphs to disk as GeoPackages and node/edge list files. Then ensure we have what we expect: verify that we have the same number of countries for each file type, the same number of gpkg, graphml, and node/edge list files, and that the same set of country/city names exists across the gpkg, graphml, and node/edge lists. 120 | 121 | #### 4.2. Stage files 122 | 123 | Compress and zip all model files (GeoPackages, GraphML, node/edge lists) into a staging area for upload to Dataverse. 124 | 125 | #### 4.3. Upload to Dataverse 126 | 127 | Upload to Dataverse using its v1 [Native API](https://guides.dataverse.org/en/latest/api/native-api.html). First [log in](https://dataverse.harvard.edu) and create an API key if you don't have an active one (they expire annually). If this is a revision to existing datasets, create a draft dataset revision on the Dataverse (edit dataset > metadata > change something > save). Otherwise, if this is the first upload ever, create a new Dataverse and new empty datasets within it, structured like: 128 | 129 | - Global Urban Street Networks 130 | - Global Urban Street Networks GeoPackages 131 | - Global Urban Street Networks GraphML Files 132 | - Global Urban Street Networks Node/Edge Lists 133 | - Global Urban Street Networks Measures 134 | - Global Urban Street Networks Metadata 135 | 136 | Then run the script to upload all the repository files automatically to their respective datasets in the Dataverse (note: if this is a dataset *revision*, set `delete_existing = True` to first clear out all the carried-over files in the draft). Next, *manually* upload the indicators and metadata files to their respective datasets in the Dataverse. Finally, visit the Dataverse on the web to publish the draft.
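The upload script itself is not reproduced in this README, but the Native API call it relies on is simple. Below is a minimal sketch (not the repository's `03-upload-dataverse.py`): the server URL and endpoint come from the Dataverse Native API guide linked above, while the API token, dataset DOI, and staging path are placeholder assumptions you would replace with your own values.

```python
import json
from pathlib import Path

import requests

SERVER = "https://dataverse.harvard.edu"
API_TOKEN = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"  # placeholder: your Dataverse API key
DATASET_DOI = "doi:10.7910/DVN/EXAMPLE"  # placeholder: persistent ID of the target draft dataset


def upload_file(filepath: Path, dataset_doi: str = DATASET_DOI) -> dict:
    """Add one staged zip file to a draft dataset via the Native API."""
    url = f"{SERVER}/api/datasets/:persistentId/add"
    params = {"persistentId": dataset_doi}
    metadata = {"description": filepath.stem, "restrict": False}
    with filepath.open("rb") as f:
        # multipart form: the file itself plus a jsonData field of file metadata
        files = {"file": (filepath.name, f), "jsonData": (None, json.dumps(metadata))}
        response = requests.post(
            url,
            params=params,
            files=files,
            headers={"X-Dataverse-key": API_TOKEN},
            timeout=600,
        )
    response.raise_for_status()
    return response.json()


# placeholder staging folder: upload every zipped GraphML file to its dataset
for fp in sorted(Path("staging/graphml").glob("*.zip")):
    print(fp.name, upload_file(fp)["status"])
```

Because publishing the draft is still done manually on the web (the final step above), a failed or partial upload can be corrected before anything becomes public.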
137 | -------------------------------------------------------------------------------- /code/01-construct-models/01-prep-ghsl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import re 5 | import unicodedata 6 | from pathlib import Path 7 | 8 | import geopandas as gpd 9 | import osmnx as ox 10 | import pandas as pd 11 | 12 | # load configs 13 | with Path("./config.json").open() as f: 14 | config = json.load(f) 15 | 16 | fp = config["uc_input_path"] 17 | msg = f"Loading all layers from {fp!r}" 18 | print(ox.ts(), msg) 19 | 20 | # load all GHS urban centers dataset gpkg together into 1 gdf 21 | col_on = "ID_UC_G0" 22 | suffixes = ("", "_DROP") 23 | layers = list(gpd.list_layers(fp)["name"]) 24 | ucs = gpd.read_file(fp, layer=layers[0]) 25 | for layer in layers[1:]: 26 | ucs = ucs.merge( 27 | gpd.read_file(fp, layer=layer), 28 | left_on=col_on, 29 | right_on=col_on, 30 | how="inner", 31 | suffixes=suffixes, 32 | ) 33 | drop = [c for c in ucs.columns if "_DROP" in c] 34 | ucs = ucs.drop(columns=drop) 35 | 36 | # quality control checks 37 | assert ucs.index.is_unique 38 | assert ucs.columns.is_unique 39 | assert ucs.crs is not None 40 | 41 | # project to OSMnx's default CRS 42 | ucs = ucs.to_crs(ox.settings.default_crs) 43 | ucs["geometry"] = ucs.make_valid() 44 | print(ox.ts(), "Loaded urban centers data with shape", ucs.shape) 45 | 46 | # identify which columns to keep when saving to disk 47 | # comments from GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.pdf 48 | cols = [ 49 | "GC_PLS_SCR_2025", # plausibility score (quality control) 50 | "ID_UC_G0", # urban center ID 51 | "GC_UCN_MAI_2025", # name of main city inside urban center 52 | "GC_UCN_LIS_2025", # list of names of all cities inside urban center 53 | "GC_CNT_GAD_2025", # country name based on GADM dataset 54 | "country_iso", # country ISO 3166-1 alpha-3 code 55 | "GC_DEV_USR_2025", # UN SDG geographic region 56 | # population, area, density 57 | "GC_POP_TOT_2025", # total population (inhabitants) inside urban center 58 | "GC_UCA_KM2_2025", # urban center area in km^2 59 | "GH_BUS_TOT_2025", # total built-up area m^2 60 | "GH_BPC_TOT_2025", # total built-up area per-capita (m^2/person) 61 | "GH_BUH_AVG_2020", # average height of built surfaces (m) at 100m res 62 | # economic development 63 | "SC_SEC_GDP_2020", # total GDP PPP (real? 
USD) 64 | "GC_DEV_WIG_2025", # world bank income group 65 | "SC_SEC_HDI_2020", # human development index at subnational level 66 | # pollution emission and concentration 67 | "EM_CO2_TRA_2022", # total CO2 emissions in transport sector (ton/year) 68 | "EM_PM2_TRA_2022", # total PM2.5 emissions in transport sector (ton/year) 69 | "EM_PM2_CON_2020", # pop-weighted average PM2.5 concentrations (μg/m^3) 70 | # climate/land use 71 | "CL_KOP_CUR_2025", # Köppen-Geiger classification of majority of surface 72 | "GE_ELV_AVG_2025", # average elevation (m) 73 | "CL_B12_CUR_2010", # average annual precipitation in the decade (mm/year) 74 | "CL_B01_CUR_2010", # annual mean temperature in the decade (°C) 75 | "SD_POP_HGR_2025", # share of pop living in area of high greenness 76 | "SD_LUE_LPR_2000_2020", # land use efficiency = land consump rate / pop growth rate 77 | "geometry", # urban center geometry 78 | ] 79 | 80 | # only retain urban centers with >1 sq km of built-up area 81 | # drops 943 out of 11422 rows (8.3%) 82 | sq_km = 1e6 # meters 83 | ucs = ucs[ucs["GH_BUS_TOT_2025"] > sq_km] 84 | 85 | # only retain urban centers with a "high" quality control score 86 | # drops 127 out of 10479 rows (1.2%) 87 | ucs = ucs[ucs["GC_PLS_SCR_2025"] == "High"] 88 | 89 | # convert columns to int where needed 90 | cols_int = ["GC_POP_TOT_2025", "SC_SEC_GDP_2020"] 91 | ucs[cols_int] = ucs[cols_int].astype(int) 92 | 93 | # add country ISO column from lookup table 94 | iso = pd.read_csv(config["iso_codes_path"]).set_index("name")["alpha3"].to_dict() 95 | ucs["country_iso"] = ucs["GC_CNT_GAD_2025"].replace(iso) 96 | assert pd.notna(ucs["country_iso"]).all() 97 | 98 | 99 | regex = re.compile("[^0-9a-zA-Z]+") 100 | 101 | 102 | def clean_str(s, regex=regex): 103 | # clean up name/country for file naming: get ASCII representation and make 104 | # everything just lowercase letters and underscores. 
if normalized name is 105 | # null, empty string, or 1+ whitespaces then rename it to "unnamed" 106 | try: 107 | norm = unicodedata.normalize("NFKD", s).encode("ascii", errors="ignore").decode() 108 | assert norm != "" 109 | assert set(norm) != {" "} 110 | except (AssertionError, TypeError): 111 | norm = "Unnamed" 112 | return regex.sub("_", norm).lower().strip("_") 113 | 114 | 115 | cols_lower = ["GC_UCN_MAI_2025", "GC_CNT_GAD_2025"] 116 | ucs[cols_lower] = ucs[cols_lower].map(clean_str) 117 | 118 | # save final dataset to disk 119 | ucs = ucs[cols] 120 | ucs.to_file(config["uc_gpkg_path"], driver="GPKG", encoding="utf-8") 121 | msg = f"Saved urban centers gpkg with shape {ucs.shape} at {config['uc_gpkg_path']!r}" 122 | print(ox.ts(), msg) 123 | -------------------------------------------------------------------------------- /code/01-construct-models/02-download-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import logging as lg 5 | import multiprocessing as mp 6 | import time 7 | from pathlib import Path 8 | 9 | import geopandas as gpd 10 | import osmnx as ox 11 | 12 | print(ox.ts(), "OSMnx version", ox.__version__) 13 | 14 | # hardcode CPU count to parallelize it without hammering Overpass server 15 | cpus = 3 16 | 17 | # load configs 18 | with Path("./config.json").open() as f: 19 | config = json.load(f) 20 | 21 | # configure OSMnx 22 | ox.settings.log_file = True 23 | ox.settings.log_console = False 24 | ox.settings.logs_folder = config["osmnx_log_path"] 25 | ox.settings.cache_folder = config["osmnx_cache_path"] 26 | ox.settings.use_cache = True 27 | ox.settings.cache_only_mode = True 28 | 29 | # configure queries 30 | network_type = "drive" 31 | retain_all = True 32 | simplify = True 33 | truncate_by_edge = True 34 | 35 | # load the prepped urban centers dataset 36 | uc_gpkg_path = config["uc_gpkg_path"] 37 | ucs = gpd.read_file(uc_gpkg_path).sort_values("GH_BUS_TOT_2025", ascending=True) 38 | msg = f"Loaded urban centers data with shape {ucs.shape} from {uc_gpkg_path!r}" 39 | print(ox.ts(), msg) 40 | 41 | 42 | def download_data(name, geometry) -> None: 43 | try: 44 | ox.graph_from_polygon( 45 | polygon=geometry, 46 | network_type=network_type, 47 | retain_all=retain_all, 48 | simplify=simplify, 49 | truncate_by_edge=truncate_by_edge, 50 | ) 51 | except ox._errors.CacheOnlyInterruptError: 52 | # error on success, because cache_only_mode is True 53 | print(ox.ts(), "Finished", name, flush=True) 54 | 55 | except Exception as e: 56 | ox.log(f'"{name}" failed: {e}', level=lg.ERROR) 57 | print(name, e) 58 | 59 | 60 | names = ucs["country_iso"] + "-" + ucs["GC_UCN_MAI_2025"] + "-" + ucs["ID_UC_G0"].astype(str) 61 | args = zip(names, ucs["geometry"]) 62 | 63 | print(ox.ts(), f"Downloading {len(ucs):,} graphs' data using {cpus} CPUs") 64 | start_time = time.time() 65 | 66 | with mp.get_context().Pool(cpus) as pool: 67 | pool.starmap_async(download_data, args).get() 68 | 69 | elapsed = time.time() - start_time 70 | msg = f"Finished caching data for {len(ucs):,} graphs in {elapsed:,.0f} seconds" 71 | print(ox.ts(), msg) 72 | -------------------------------------------------------------------------------- /code/01-construct-models/03-create-graphs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import logging as lg 5 | import multiprocessing as mp 6 | import time 7 | from pathlib import Path 8 | 9 | import geopandas 
as gpd 10 | import osmnx as ox 11 | 12 | print(ox.ts(), "OSMnx version", ox.__version__) 13 | 14 | # load configs 15 | with Path("./config.json").open() as f: 16 | config = json.load(f) 17 | 18 | # configure OSMnx 19 | ox.settings.log_file = True 20 | ox.settings.log_console = False 21 | ox.settings.logs_folder = config["osmnx_log_path"] 22 | ox.settings.cache_folder = config["osmnx_cache_path"] 23 | ox.settings.use_cache = True 24 | 25 | # configure queries 26 | network_type = "drive" 27 | retain_all = True 28 | simplify = True 29 | truncate_by_edge = True 30 | 31 | # configure multiprocessing 32 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 33 | 34 | # load the prepped urban centers dataset 35 | uc_gpkg_path = config["uc_gpkg_path"] 36 | ucs = gpd.read_file(uc_gpkg_path).sort_values("GH_BUS_TOT_2025", ascending=False) 37 | msg = f"Loaded urban centers data with shape {ucs.shape} from {uc_gpkg_path!r}" 38 | print(ox.ts(), msg) 39 | 40 | 41 | def get_graph(uc, root) -> None: 42 | try: 43 | country_folder = f"{uc['GC_CNT_GAD_2025']}-{uc['country_iso']}" 44 | uc_filename = f"{uc['GC_UCN_MAI_2025']}-{uc['ID_UC_G0']}.graphml" 45 | filepath = root / country_folder / uc_filename 46 | if not filepath.is_file(): 47 | G = ox.graph_from_polygon( 48 | polygon=uc["geometry"], 49 | network_type=network_type, 50 | retain_all=retain_all, 51 | simplify=simplify, 52 | truncate_by_edge=truncate_by_edge, 53 | ) 54 | 55 | # don't save graphs if they have fewer than 3 nodes 56 | min_nodes = 3 57 | if len(G) >= min_nodes: 58 | ox.save_graphml(G, filepath=filepath) 59 | print(ox.ts(), f"Saved {filepath}", flush=True) 60 | 61 | except Exception as e: 62 | ox.log(f'"{filepath}" failed: {e}', level=lg.ERROR) 63 | print(e, filepath) 64 | 65 | 66 | ucs = ucs.sample(len(ucs)) # .tail(10) 67 | 68 | # create function arguments for multiprocessing 69 | root = Path(config["models_graphml_path"]) 70 | cols = ["GC_CNT_GAD_2025", "country_iso", "GC_UCN_MAI_2025", "ID_UC_G0", "geometry"] 71 | args = ((uc[cols].to_dict(), root) for _, uc in ucs.iterrows()) 72 | 73 | print(ox.ts(), f"Begin creating {len(ucs):,} graphs using {cpus} CPUs") 74 | start_time = time.time() 75 | with mp.get_context().Pool(cpus) as pool: 76 | pool.starmap_async(get_graph, args).get() 77 | 78 | elapsed = time.time() - start_time 79 | msg = f"Finished creating {len(ucs):,} graphs in {elapsed:,.0f} seconds" 80 | print(ox.ts(), msg) 81 | file_count = len(list(root.glob("*/*"))) 82 | msg = f"There are {file_count:,} GraphML files in {str(root)!r}" 83 | print(ox.ts(), msg) 84 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/01-download-aster_v3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import osmnx as ox 8 | import pandas as pd 9 | import requests 10 | 11 | # username/password for https://www.earthdata.nasa.gov/ 12 | from keys import pwd, usr 13 | 14 | # load configs 15 | with Path("./config.json").open() as f: 16 | config = json.load(f) 17 | 18 | # configurations 19 | cpus = 4 20 | urls_path = config["gdem_aster_urls_path"] 21 | dl_path = Path(config["gdem_aster_path"]) 22 | dl_path.mkdir(parents=True, exist_ok=True) 23 | 24 | 25 | def download(url, usr=usr, pwd=pwd, dl_path=dl_path) -> None: 26 | with requests.Session() as session: 27 | filename = Path(url).name 28 | session.trust_env = False 29 | request = 
session.request("get", url, auth=(usr, pwd)) 30 | response = session.get(request.url, auth=(usr, pwd)) 31 | 32 | if response.ok: 33 | filepath = dl_path / filename 34 | with filepath.open(mode="wb") as f: 35 | f.write(response.content) 36 | else: 37 | print(response.status_code, response.text) 38 | 39 | 40 | # get all the URLs pointing at dem tif files 41 | urls = pd.read_csv(urls_path, header=None).iloc[:, 0].sort_values() 42 | urls = urls[urls.str.endswith("_dem.tif")] 43 | print(ox.ts(), f"There are {len(urls):,} total ASTER URLs") 44 | 45 | # how many files have already been downloaded? 46 | existing = {path.name for path in dl_path.glob("*.tif")} 47 | print(ox.ts(), f"There are {len(existing):,} files already downloaded") 48 | 49 | # how many files are remaining to download? 50 | urls = [url for url in urls if Path(url).name not in existing] 51 | print(ox.ts(), f"Downloading {len(urls):,} URLs with {cpus} CPUs") 52 | 53 | # multiprocess the queue 54 | if len(urls) > 0: 55 | args = ((url,) for url in urls) 56 | with mp.get_context().Pool(cpus) as pool: 57 | pool.starmap_async(download, args).get() 58 | 59 | file_count = len(list(dl_path.glob("*"))) 60 | msg = f"Finished: {file_count:,} files in {str(dl_path)!r}" 61 | print(ox.ts(), msg) 62 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/02-download-srtmgl1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | from zipfile import ZipFile 7 | 8 | import osmnx as ox 9 | import pandas as pd 10 | import requests 11 | 12 | # username/password for https://www.earthdata.nasa.gov/ 13 | from keys import pwd, usr 14 | 15 | # load configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configurations 20 | cpus = 4 21 | urls_path = config["gdem_srtm_urls_path"] 22 | dl_path = Path(config["gdem_srtm_path"]) 23 | dl_path.mkdir(parents=True, exist_ok=True) 24 | 25 | 26 | def download(url, usr=usr, pwd=pwd, dl_path=dl_path) -> None: 27 | with requests.Session() as session: 28 | filename = Path(url).name 29 | session.trust_env = False 30 | request = session.request("get", url, auth=(usr, pwd)) 31 | response = session.get(request.url, auth=(usr, pwd)) 32 | 33 | if response.ok: 34 | filepath = dl_path / filename 35 | with filepath.open(mode="wb") as f: 36 | f.write(response.content) 37 | 38 | with ZipFile(filepath, "r") as z: 39 | z.extractall(dl_path) 40 | filepath.unlink() 41 | 42 | else: 43 | print(response.status_code, response.text) 44 | 45 | 46 | # get all the URLs 47 | urls = pd.read_csv(urls_path, header=None).iloc[:, 0].sort_values() 48 | print(ox.ts(), f"There are {len(urls):,} total SRTM URLs") 49 | 50 | # how many files have already been downloaded? 51 | existing = {fp.name.split(".")[0] for fp in dl_path.glob("*.hgt")} 52 | print(ox.ts(), f"There are {len(existing):,} files already downloaded") 53 | 54 | # how many files are remaining to download? 
55 | tiles = (Path(url).name.split(".")[0] for url in urls) 56 | remaining = [url for url, tile in zip(urls, tiles) if tile not in existing] 57 | print(ox.ts(), f"Downloading {len(remaining):,} URLs with {cpus} CPUs") 58 | 59 | # multiprocess the queue 60 | if len(remaining) > 0: 61 | args = ((url,) for url in remaining) 62 | with mp.get_context().Pool(cpus) as pool: 63 | pool.starmap_async(download, args).get() 64 | 65 | file_count = len(list(dl_path.glob("*"))) 66 | msg = f"Finished: {file_count:,} files in {str(dl_path)!r}" 67 | print(ox.ts(), msg) 68 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/03-build-vrts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import osmnx as ox 7 | 8 | with Path("./config.json").open() as f: 9 | config = json.load(f) 10 | ox.settings.cache_folder = config["osmnx_cache_path"] 11 | aster_path = Path(config["gdem_aster_path"]) 12 | srtm_path = Path(config["gdem_srtm_path"]) 13 | 14 | # get one sample graph, just to build the VRTs for the first time 15 | filepath = sorted(Path(config["models_graphml_path"]).glob("*/*"))[0] 16 | G = ox.io.load_graphml(filepath) 17 | 18 | # build VRT files for the SRTM and ASTER raster files 19 | args = [("srtm", srtm_path, "*.hgt"), ("aster", aster_path, "*.tif")] 20 | for data_source, rasters_path, glob_pattern in args: 21 | rasters = sorted(rasters_path.glob(glob_pattern)) 22 | msg = f"Building VRT for {len(rasters):,} files from {str(rasters_path)!r}" 23 | print(ox.ts(), msg) 24 | G = ox.elevation.add_node_elevations_raster(G, rasters) 25 | for _, data in G.nodes(data=True): 26 | data[f"elevation_{data_source}"] = data.pop("elevation") 27 | 28 | # show descriptive stats for the elevation values in this one city 29 | cols = ["elevation_aster", "elevation_srtm"] 30 | stats = ox.convert.graph_to_gdfs(G, edges=False)[cols].describe() 31 | print(ox.ts(), stats) 32 | -------------------------------------------------------------------------------- /code/02-attach-elevation/01-aster-srtm/04-add-node-elevations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import networkx as nx 8 | import osmnx as ox 9 | 10 | with Path("./config.json").open() as f: 11 | config = json.load(f) 12 | ox.settings.cache_folder = config["osmnx_cache_path"] 13 | 14 | # configure multiprocessing 15 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 16 | 17 | # get the paths of all the ASTER/SRTM rasters 18 | srtm_files = sorted(Path(config["gdem_srtm_path"]).glob("*.hgt")) 19 | aster_files = sorted(Path(config["gdem_aster_path"]).glob("*.tif")) 20 | attr_rasters = [("elevation_aster", aster_files), ("elevation_srtm", srtm_files)] 21 | 22 | 23 | def process_graph(filepath, attr_rasters=attr_rasters) -> None: 24 | G = ox.io.load_graphml(filepath) 25 | for attr, rasters in attr_rasters: 26 | # if not all graph nodes have this attr, then add elevation from 27 | # raster files, rename elevation -> this attr name, then save graph 28 | if set(G.nodes) != set(nx.get_node_attributes(G, attr)): 29 | try: 30 | G = ox.elevation.add_node_elevations_raster(G, rasters, cpus=1) 31 | for _, data in G.nodes(data=True): 32 | data[attr] = data.pop("elevation") 33 | ox.io.save_graphml(G, filepath) 34 | except 
ValueError as e: 35 | print(e, filepath, attr) 36 | 37 | 38 | # set up the args 39 | filepaths = sorted(Path(config["models_graphml_path"]).glob("*/*")) 40 | args = ((fp,) for fp in filepaths) 41 | 42 | # multiprocess the queue 43 | print(ox.ts(), f"Adding elevation to {len(filepaths):,} graphs with {cpus} CPUs") 44 | with mp.get_context().Pool(cpus) as pool: 45 | pool.starmap_async(process_graph, args).get() 46 | print(ox.ts(), f"Finished adding elevation to {len(filepaths):,} graphs") 47 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/01-cluster-nodes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools 4 | import json 5 | import math 6 | import multiprocessing as mp 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import osmnx as ox 11 | from scipy.spatial import cKDTree 12 | 13 | # google usage limit: 512 locations per request 14 | coords_per_request = 512 15 | 16 | # load configs 17 | with Path("./config.json").open() as f: 18 | config = json.load(f) 19 | 20 | # configure multiprocessing 21 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 22 | 23 | graphml_folder = Path(config["models_graphml_path"]) 24 | save_folder = Path(config["elevation_nodeclusters_path"]) 25 | 26 | 27 | # return graph nodes' x-y coordinates 28 | def get_graph_nodes(fp): 29 | return ox.convert.graph_to_gdfs(ox.io.load_graphml(fp), edges=False, node_geometry=False)[ 30 | ["x", "y"] 31 | ] 32 | 33 | 34 | # get an iterator of points around the perimeter of nodes' coordinates 35 | def get_perimeter_points(nodes): 36 | tl = np.array((nodes["x"].min(), nodes["y"].max())) 37 | t = np.array((nodes["x"].mean(), nodes["y"].max())) 38 | tr = np.array((nodes["x"].max(), nodes["y"].max())) 39 | r = np.array((nodes["x"].max(), nodes["y"].mean())) 40 | br = np.array((nodes["x"].max(), nodes["y"].min())) 41 | b = np.array((nodes["x"].mean(), nodes["y"].min())) 42 | bl = np.array((nodes["x"].min(), nodes["y"].min())) 43 | l = np.array((nodes["x"].min(), nodes["y"].mean())) # noqa: E741 44 | points = [tl, t, tr, r, br, b, bl, l] 45 | multiplier = math.ceil(len(nodes) / coords_per_request / len(points)) 46 | return iter(points * multiplier) 47 | 48 | 49 | # group the nodes into nearest-neighbor clusters 50 | def get_clusters(nodes): 51 | nodes_remaining = nodes 52 | perimeter_points = get_perimeter_points(nodes) 53 | clusters = [] 54 | while len(nodes_remaining) > 0: 55 | if len(nodes_remaining) <= coords_per_request: 56 | labels = nodes_remaining.index 57 | else: 58 | # find node nearest to next perimeter point, then get a cluster of 59 | # its nearest `coords_per_request` neighbors around it 60 | tree = cKDTree(nodes_remaining[["x", "y"]]) 61 | _, start_pos = tree.query(next(perimeter_points), k=1) 62 | start_point = nodes_remaining.iloc[start_pos][["x", "y"]] 63 | _, pos = tree.query(start_point, k=coords_per_request) 64 | labels = nodes_remaining.iloc[pos].index 65 | clusters.append(labels) 66 | nodes_remaining = nodes_remaining.drop(labels) 67 | 68 | # ensure each node has a cluster and each cluster is smaller than max size 69 | assert set(itertools.chain.from_iterable(clusters)) == set(nodes.index) 70 | for cluster in clusters: 71 | assert len(cluster) <= coords_per_request 72 | 73 | return clusters 74 | 75 | 76 | # load graph, cluster nodes, and save to disk 77 | def cluster_nodes(fp) -> None: 78 | nodes = get_graph_nodes(fp) 79 | clusters = 
get_clusters(nodes) 80 | for count, cluster in enumerate(clusters): 81 | nodes.loc[cluster, "cluster"] = f"{fp.stem}_{count}" 82 | 83 | save_path = save_folder / (fp.stem + ".csv") 84 | save_path.parent.mkdir(parents=True, exist_ok=True) 85 | nodes.to_csv(save_path, index=True, encoding="utf-8") 86 | msg = f"Clustered {fp.stem!r} {len(nodes):,} nodes into {len(clusters):,} clusters" 87 | print(ox.ts(), msg, flush=True) 88 | 89 | 90 | filepaths = sorted(graphml_folder.glob("*/*.graphml")) 91 | args = [(fp,) for fp in filepaths if not (save_folder / (fp.stem + ".csv")).is_file()] 92 | print(ox.ts(), f"Clustering nodes from {len(args):,} remaining GraphML files") 93 | 94 | with mp.get_context().Pool(cpus) as pool: 95 | pool.starmap_async(cluster_nodes, args).get() 96 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/02-make-google-urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from itertools import batched 6 | from pathlib import Path 7 | 8 | import osmnx as ox 9 | import pandas as pd 10 | from keys import api_keys 11 | 12 | # google usage limit: 512 locations and 16384 characters per request 13 | precision = 5 14 | coords_per_request = 512 15 | requests_per_key = 39000 16 | chars_per_url = 16384 17 | url_template = ( 18 | "https://maps.googleapis.com/maps/api/elevation/json?locations={locations}&key={{key}}" 19 | ) 20 | 21 | # load configs 22 | with Path("./config.json").open() as f: 23 | config = json.load(f) 24 | 25 | # configure multiprocessing 26 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 27 | 28 | # set up the args 29 | filepaths = sorted(Path(config["elevation_nodeclusters_path"]).glob("*.csv")) 30 | args = ((fp,) for fp in filepaths) 31 | print(ox.ts(), f"Loading node clusters from {len(filepaths):,} files with {cpus} CPUs") 32 | 33 | # extract all nodes and coordinates from all graphs 34 | with mp.get_context().Pool(cpus) as pool: 35 | result = pool.starmap_async(pd.read_csv, args) 36 | df = pd.concat(result.get(), ignore_index=True).set_index("osmid").sort_index() 37 | 38 | df = df[~df.index.duplicated()] 39 | print(ox.ts(), f"There are {len(df):,} unique nodes") 40 | 41 | 42 | def url_add_locations(_, cluster): 43 | assert len(cluster) <= coords_per_request 44 | strings = (f"{y:.{precision}f},{x:.{precision}f}" for y, x in zip(cluster["y"], cluster["x"])) 45 | locations = "|".join(strings) 46 | return tuple(cluster.index), url_template.format(locations=locations) 47 | 48 | 49 | with mp.get_context().Pool(cpus) as pool: 50 | urls = pool.starmap_async(url_add_locations, df.groupby("cluster")).get() 51 | 52 | # then add API keys to URLs, `requests_per_key` at a time 53 | urls_with_keys = [] 54 | keys_nodes_urls = zip(api_keys, batched(urls, requests_per_key), strict=True) 55 | for api_key, nodes_urls in keys_nodes_urls: 56 | for nodes, url in nodes_urls: 57 | url_with_key = url.format(key=api_key) 58 | assert len(url_with_key) <= chars_per_url 59 | urls_with_keys.append((nodes, url_with_key)) 60 | 61 | # ensure no key is used more times than allowed 62 | df_save = pd.DataFrame(urls_with_keys, columns=["nodes", "url"]) 63 | for api_key in api_keys: 64 | count = df_save["url"].str.contains(api_key).sum() 65 | print(ox.ts(), f"Created {count:,} URLs using key {api_key!r}") 66 | assert count <= requests_per_key 67 | 68 | # save to disk 69 | save_path = 
Path(config["elevation_google_urls_path"]) 70 | save_path.parent.mkdir(parents=True, exist_ok=True) 71 | df_save.to_csv(save_path, index=False, encoding="utf-8") 72 | print(ox.ts(), f"Saved {len(df_save):,} URLs to {str(save_path)!r}") 73 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/03-download-google-elevations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import time 6 | from ast import literal_eval 7 | from pathlib import Path 8 | 9 | import osmnx as ox 10 | import pandas as pd 11 | import requests 12 | 13 | # load configs 14 | with Path("./config.json").open() as f: 15 | config = json.load(f) 16 | 17 | # configure multiprocessing 18 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 19 | 20 | ox.settings.use_cache = True 21 | ox.settings.log_console = False 22 | ox.settings.log_file = True 23 | ox.settings.logs_folder = config["osmnx_log_path"] 24 | ox.settings.cache_folder = config["osmnx_cache_path"] 25 | 26 | 27 | def get_elevations(nodes, url, pause=0): 28 | # check if this request is already in the cache 29 | cached_response_json = ox._http._retrieve_from_cache(url) 30 | if cached_response_json is not None: 31 | response_json = cached_response_json 32 | ox.log(f"Got URL from cache: {url}") 33 | 34 | # otherwise, request the elevations from the API 35 | else: 36 | try: 37 | ox.log(f"Requesting node elevations from API: {url}") 38 | time.sleep(pause) 39 | response = requests.get(url) 40 | assert response.ok 41 | response_json = response.json() 42 | ox._http._save_to_cache(url, response_json, response.ok) 43 | except Exception as e: 44 | msg = f"Response: {response.status_code}, {response.reason}, {response.text}, {url}" 45 | print(ox.ts(), msg, e) 46 | return None 47 | 48 | # extract the results and, if any, return as dataframe 49 | results = response_json["results"] 50 | if results is None: 51 | return None 52 | df = pd.DataFrame(results, index=literal_eval(nodes)) 53 | if "elevation" not in df.columns: 54 | cache_filepath = ox._http._resolve_cache_filepath(url) 55 | print(ox.ts(), f"No elevation results in {str(cache_filepath)!r}") 56 | return None 57 | return df[["elevation", "resolution"]].round(2) 58 | 59 | 60 | # load the URLs and count how many we already have responses cached for 61 | urls = pd.read_csv(config["elevation_google_urls_path"]) 62 | count_cached = 0 63 | count_uncached = 0 64 | for url in urls["url"]: 65 | if ox._http._check_cache(url) is None: 66 | count_uncached += 1 67 | else: 68 | count_cached += 1 69 | 70 | msg = f"Getting {count_cached:,} URLs from cache and {count_uncached:,} from API using {cpus} CPUs" 71 | print(ox.ts(), msg) 72 | 73 | # uncomment this if you want to actually hit the API (and pay for it) 74 | assert count_uncached == 0 75 | 76 | # download elevations from Google API in parallel 77 | with mp.get_context().Pool(cpus) as pool: 78 | args = ((nodes_url.nodes, nodes_url.url) for nodes_url in urls.itertuples()) 79 | result = pool.starmap_async(get_elevations, args) 80 | df = pd.concat(result.get(), ignore_index=False).sort_index() 81 | 82 | # save to disk 83 | save_path = Path(config["elevation_google_elevations_path"]) 84 | save_path.parent.mkdir(parents=True, exist_ok=True) 85 | df.index.name = "osmid" 86 | df.to_csv(save_path, index=True, encoding="utf-8") 87 | print(ox.ts(), f"Saved {len(df):,} node elevations to disk at 
{save_path}") 88 | -------------------------------------------------------------------------------- /code/02-attach-elevation/02-google/04-choose-best-elevation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import networkx as nx 8 | import numpy as np 9 | import osmnx as ox 10 | import pandas as pd 11 | 12 | # load configs 13 | with Path("./config.json").open() as f: 14 | config = json.load(f) 15 | 16 | # configure multiprocessing 17 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 18 | 19 | # dict to convert elev attrs to correct dtype 20 | elev_attrs = ("elevation_aster", "elevation_srtm") 21 | node_dtypes = dict.fromkeys(elev_attrs, float) 22 | 23 | # load google elevation data for lookup 24 | fp = config["elevation_google_elevations_path"] 25 | renamer = {"elevation": "elevation_google", "resolution": "elevation_google_resolution"} 26 | df_elev = pd.read_csv(fp).rename(columns=renamer).set_index("osmid").sort_index() 27 | print(f"Loaded {len(df_elev):,} Google node elevations") 28 | 29 | 30 | def set_elevations(fp, df_elev=df_elev, node_dtypes=node_dtypes): 31 | # load the graph and attach google elevation data 32 | G = ox.io.load_graphml(fp, node_dtypes=node_dtypes) 33 | nodes, edges = ox.graph_to_gdfs(G) 34 | nodes = nodes.join(df_elev) 35 | 36 | # calculate differences in ASTER, SRTM, and Google elevation values 37 | nodes["elev_diff_aster_google"] = (nodes["elevation_aster"] - nodes["elevation_google"]).fillna( 38 | np.inf, 39 | ) 40 | nodes["elev_diff_srtm_google"] = (nodes["elevation_srtm"] - nodes["elevation_google"]).fillna( 41 | np.inf, 42 | ) 43 | 44 | # in each row identify if SRTM or ASTER has smaller absolute difference from Google's value 45 | use_srtm = nodes["elev_diff_srtm_google"].abs() <= nodes["elev_diff_aster_google"].abs() 46 | pct = 100 * use_srtm.sum() / len(nodes) 47 | print(f"{pct:0.1f}% of nodes use SRTM, {100 - pct:0.1f}% use ASTER in {fp.stem!r}") 48 | 49 | # assign elevation as the SRTM or ASTER value closer to Google's, as a tie-breaker 50 | nodes["elevation"] = np.nan 51 | nodes.loc[use_srtm, "elevation"] = nodes.loc[use_srtm, "elevation_srtm"] 52 | nodes.loc[~use_srtm, "elevation"] = nodes.loc[~use_srtm, "elevation_aster"] 53 | 54 | # ensure all elevations are non-null 55 | assert pd.notna(nodes["elevation"]).all() 56 | nodes["elevation"] = nodes["elevation"].astype(int) 57 | 58 | # add elevation to graph nodes, calculate edge grades, then save to disk 59 | nx.set_node_attributes(G, nodes["elevation"], "elevation") 60 | G = ox.add_edge_grades(G, add_absolute=True) 61 | ox.io.save_graphml(G, fp) 62 | return nodes 63 | 64 | 65 | # multiprocess the queue 66 | args = [(fp,) for fp in Path(config["models_graphml_path"]).glob("*/*.graphml")] # [-100:] 67 | msg = f"Setting node elevations for {len(args):,} GraphML files using {cpus} CPUs" 68 | print(ox.ts(), msg) 69 | with mp.get_context().Pool(cpus) as pool: 70 | result = pool.starmap_async(set_elevations, args) 71 | results = (r for r in result.get() if r is not None) 72 | 73 | # save all nodes' elevation details to disk for later analysis 74 | df = pd.concat(results, ignore_index=False).sort_index() 75 | cols = [c for c in df.columns if "elev" in c] 76 | df = df[cols] 77 | df = df.replace([np.inf, -np.inf], np.nan) 78 | print(df.describe().round(2)) 79 | df.to_csv(config["elevation_final_path"], index=True, encoding="utf-8") 80 | 
-------------------------------------------------------------------------------- /code/03-calculate-indicators/01-calculate-node-bc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from os.path import getsize 6 | from pathlib import Path 7 | 8 | import igraph as ig 9 | import networkx as nx 10 | import osmnx as ox 11 | 12 | # we will calculate length-weighted betweenness centralities 13 | WEIGHT_ATTR = "length" 14 | 15 | # load configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configure multiprocessing 20 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 21 | 22 | # configure where to find saved graphs and where to save results 23 | graphml_folder = Path(config["models_graphml_path"]) 24 | save_folder = Path(config["node_bc_path"]) 25 | save_folder.mkdir(parents=True, exist_ok=True) 26 | 27 | 28 | def convert_igraph(G_nx, weight_attr): 29 | # relabel graph nodes as integers for igraph to ingest 30 | G_nx = nx.relabel.convert_node_labels_to_integers(G_nx) 31 | 32 | # create igraph graph and add nodes/edges 33 | G_ig = ig.Graph(directed=True) 34 | G_ig.add_vertices(G_nx.nodes) 35 | G_ig.add_edges(G_nx.edges(keys=False)) 36 | 37 | # add edge weights and ensure values >0 for igraph 38 | weights = nx.get_edge_attributes(G_nx, weight_attr).values() 39 | weights = (0.001 if w == 0 else w for w in weights) 40 | G_ig.es[weight_attr] = list(weights) 41 | return G_ig 42 | 43 | 44 | def calculate_bc(fp, save_path, weight_attr=WEIGHT_ATTR) -> None: 45 | print(ox.ts(), f"{str(fp)!r}") 46 | 47 | # load graphml, convert to igraph, calculate bc, and normalize values 48 | G_nx = ox.io.load_graphml(fp) 49 | bc_raw = convert_igraph(G_nx, weight_attr).betweenness(weights=weight_attr) 50 | bc_norm = (x / (len(G_nx) - 1) / (len(G_nx) - 2) for x in bc_raw) 51 | osmid_bc = dict(zip(G_nx.nodes, bc_norm, strict=True)) 52 | 53 | # set graph node attributes and re-save graphml file 54 | nx.set_node_attributes(G_nx, osmid_bc, name="bc") 55 | ox.io.save_graphml(G_nx, fp) 56 | 57 | # also save results to disk as JSON 58 | with save_path.open("w") as f: 59 | json.dump(osmid_bc, f) 60 | 61 | 62 | # get graph filepaths for which we have not yet calculated BC, sorted by size 63 | filepaths = sorted(graphml_folder.glob("*/*.graphml"), key=getsize) 64 | savepaths = (save_folder / f"{fp.parent.stem}-{fp.stem}.json" for fp in filepaths) 65 | args = [(fp, sp) for fp, sp in zip(filepaths, savepaths) if not sp.is_file()] 66 | print(ox.ts(), f"There are {len(filepaths):,} total GraphML files") 67 | print(ox.ts(), f"Calculating BC for {len(args):,} remaining graphs") 68 | 69 | # multiprocess the queue 70 | with mp.get_context().Pool(cpus) as pool: 71 | pool.starmap_async(calculate_bc, args).get() 72 | 73 | count_done = len(list(save_folder.glob("*.json"))) 74 | print(ox.ts(), f"Calculated BC for {count_done:,} graphs") 75 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/02-calculate-indicators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import random 6 | from os.path import getsize 7 | from pathlib import Path 8 | from statistics import mean, median 9 | 10 | import networkx as nx 11 | import numpy as np 12 | import osmnx as ox 13 | import pandas as pd 14 | 15 | # load 
configs 16 | with Path("./config.json").open() as f: 17 | config = json.load(f) 18 | 19 | # configure multiprocessing 20 | cpus = mp.cpu_count() if config["cpus_stats"] == 0 else config["cpus_stats"] 21 | 22 | graphml_folder = Path(config["models_graphml_path"]) # where to load graphml files 23 | save_path = Path(config["indicators_street_path"]) # where to save indicator output 24 | 25 | 26 | def intersection_counts(Gup): 27 | TOL = 10 # meters for intersection cleaning tolerance 28 | icc = len(ox.consolidate_intersections(Gup, tolerance=TOL, rebuild_graph=False)) 29 | ict = len(ox.consolidate_intersections(Gup, tolerance=TOL, reconnect_edges=False)) 30 | return { 31 | "intersect_count": ox.stats.intersection_count(Gup), 32 | "intersect_count_clean": icc, 33 | "intersect_count_clean_topo": ict, 34 | } 35 | 36 | 37 | def calculate_clustering(G): 38 | results = {} 39 | 40 | # get directed graph without parallel edges 41 | G = ox.convert.to_digraph(G, weight="length") 42 | 43 | # avg clust coeff for directed graph ignoring parallel edges 44 | results["cc_avg_dir"] = nx.average_clustering(G) 45 | 46 | # avg clust coeff (weighted) for directed graph ignoring parallel edges 47 | results["cc_wt_avg_dir"] = nx.average_clustering(G, weight="length") 48 | 49 | # max pagerank (weighted) in directed graph ignoring parallel edges 50 | results["pagerank_max"] = max(nx.pagerank(G, weight="length").values()) 51 | 52 | # get undirected graph without parallel edges 53 | G = nx.Graph(G) 54 | 55 | # avg clust coeff for undirected graph ignoring parallel edges 56 | results["cc_avg_undir"] = nx.average_clustering(G) 57 | 58 | # avg clust coeff (weighted) for undirected graph ignoring parallel edges 59 | results["cc_wt_avg_undir"] = nx.average_clustering(G, weight="length") 60 | return results 61 | 62 | 63 | def calculate_elevation_grades(Gu): 64 | # calculate elevation & grade stats 65 | grades = pd.Series(nx.get_edge_attributes(Gu, "grade_abs").values()) 66 | elevs = pd.Series(nx.get_node_attributes(Gu, "elevation").values()) 67 | elev_iqr = elevs.quantile(0.75) - elevs.quantile(0.25) 68 | elev_range = elevs.max() - elevs.min() 69 | return { 70 | "elev_iqr": elev_iqr, 71 | "elev_mean": elevs.mean(), 72 | "elev_median": elevs.median(), 73 | "elev_range": elev_range, 74 | "elev_std": elevs.std(), 75 | "grade_mean": grades.mean(), 76 | "grade_median": grades.median(), 77 | } 78 | 79 | 80 | def gini(x): 81 | sorted_x = np.sort(x) 82 | n = len(x) 83 | cumx = np.cumsum(sorted_x, dtype=float) 84 | return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n 85 | 86 | 87 | def save_results(results, save_path) -> None: 88 | save_path.parent.mkdir(parents=True, exist_ok=True) 89 | df = pd.DataFrame(results) 90 | if save_path.is_file(): 91 | df = pd.concat([pd.read_csv(save_path), df]) 92 | df.to_csv(save_path, index=False, encoding="utf-8") 93 | print(ox.ts(), f"Saved {len(results):,} new results to disk at {str(save_path)!r}") 94 | 95 | 96 | def calculate_graph_stats(graphml_path): 97 | print(ox.ts(), f"Processing {str(graphml_path)!r}") 98 | G = ox.io.load_graphml(graphml_path, node_dtypes={"bc": float}) 99 | 100 | # get filepath and country/city identifiers 101 | country, country_iso = graphml_path.parent.stem.split("-") 102 | core_city, uc_id = graphml_path.stem.split("-") 103 | uc_id = int(uc_id) 104 | 105 | # clustering and pagerank: needs directed representation 106 | clustering_stats = calculate_clustering(G) 107 | 108 | # get an undirected representation of this network for everything else 109 | Gu = 
ox.convert.to_undirected(G) 110 | G.clear() 111 | G = None 112 | 113 | # street lengths 114 | lengths = nx.get_edge_attributes(Gu, "length").values() 115 | length_total = sum(lengths) 116 | length_mean = mean(lengths) 117 | length_median = median(lengths) 118 | 119 | # nodes, edges, node degree, self loops 120 | n = len(Gu.nodes) 121 | m = len(Gu.edges) 122 | k_avg = 2 * m / n 123 | self_loop_proportion = ox.stats.self_loop_proportion(Gu) 124 | 125 | # proportion of 4-way intersections, 3-ways, and dead-ends 126 | spn = ox.stats.streets_per_node_proportions(Gu) 127 | prop_4way = spn.get(4, 0) 128 | prop_3way = spn.get(3, 0) 129 | prop_deadend = spn.get(1, 0) 130 | 131 | # betweenness centrality stats 132 | bc = list(nx.get_node_attributes(Gu, "bc").values()) 133 | bc_gini = gini(bc) 134 | bc_max = max(bc) 135 | 136 | # average circuity and straightness 137 | circuity = ox.stats.circuity_avg(Gu) 138 | straightness = 1 / circuity 139 | 140 | # elevation and grade 141 | elevation_grades = calculate_elevation_grades(Gu) 142 | 143 | # orientation entropy 144 | orientation_entropy = ox.bearing.orientation_entropy(ox.bearing.add_edge_bearings(Gu)) 145 | 146 | # total and clean intersection counts 147 | intersection_stats = intersection_counts(ox.projection.project_graph(Gu)) 148 | 149 | # assemble the results 150 | results = { 151 | "country": country, 152 | "country_iso": country_iso, 153 | "core_city": core_city, 154 | "uc_id": uc_id, 155 | "circuity": circuity, 156 | "k_avg": k_avg, 157 | "length_mean": length_mean, 158 | "length_median": length_median, 159 | "length_total": length_total, 160 | "street_segment_count": m, 161 | "node_count": n, 162 | "orientation_entropy": orientation_entropy, 163 | "prop_4way": prop_4way, 164 | "prop_3way": prop_3way, 165 | "prop_deadend": prop_deadend, 166 | "self_loop_proportion": self_loop_proportion, 167 | "straightness": straightness, 168 | "bc_gini": bc_gini, 169 | "bc_max": bc_max, 170 | } 171 | results.update(clustering_stats) 172 | results.update(elevation_grades) 173 | results.update(intersection_stats) 174 | return results 175 | 176 | 177 | # get all the filepaths that don't already have results in the save file 178 | done = set(pd.read_csv(save_path)["uc_id"]) if save_path.is_file() else set() 179 | filepaths = sorted(graphml_folder.glob("*/*"), key=getsize) 180 | args = [(fp,) for fp in filepaths if int(fp.stem.split("-")[1]) not in done] 181 | 182 | # randomly order params so one thread doesn't have to do all the big graphs 183 | random.shuffle(args) 184 | msg = f"Calculating stats for {len(args):,} graphs using {cpus} CPUs" 185 | print(ox.ts(), msg) 186 | 187 | # multiprocess the queue 188 | with mp.get_context().Pool(cpus) as pool: 189 | results = pool.starmap_async(calculate_graph_stats, args).get() 190 | 191 | # final save to disk 192 | save_results(results, save_path) 193 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/03-merge-indicators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import geopandas as gpd 7 | import osmnx as ox 8 | import pandas as pd 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | uc_gpkg_path = config["uc_gpkg_path"] # prepped urban centers dataset 15 | ind_street_path = config["indicators_street_path"] # street network indicators to load 16 | ind_path = config["indicators_path"] 
# merged indicators to save for repo upload 17 | ind_all_path = config["indicators_all_path"] # all merged indicators to save for analysis 18 | 19 | # load the UCs dataset 20 | ucs = gpd.read_file(uc_gpkg_path).sort_index().drop(columns=["country_iso"]) 21 | print(ox.ts(), f"Loaded urban centers dataset with shape={ucs.shape}") 22 | 23 | # load the previously calculated street network indicators dataset 24 | ind = pd.read_csv(ind_street_path) 25 | print(ox.ts(), f"Loaded indicators dataset with shape={ind.shape}") 26 | 27 | # rename UC fields to something intelligible 28 | mapper = { 29 | "GC_UCN_LIS_2025": "uc_names", 30 | "GC_DEV_USR_2025": "world_region", 31 | "GC_POP_TOT_2025": "resident_pop", 32 | "GC_UCA_KM2_2025": "area_km2", 33 | "GH_BUS_TOT_2025": "built_up_area_m2", 34 | "GH_BPC_TOT_2025": "built_up_area_percap", 35 | "GH_BUH_AVG_2020": "built_height_m", 36 | "SC_SEC_GDP_2020": "gdp_ppp", 37 | "GC_DEV_WIG_2025": "world_bank_income_group", 38 | "SC_SEC_HDI_2020": "hdi", 39 | "EM_CO2_TRA_2022": "transport_co2_em", 40 | "EM_PM2_TRA_2022": "transport_pm25_em", 41 | "EM_PM2_CON_2020": "pm25_concentration", 42 | "CL_KOP_CUR_2025": "koppen_geiger", 43 | "GE_ELV_AVG_2025": "avg_elevation", 44 | "CL_B12_CUR_2010": "avg_precipitation", 45 | "CL_B01_CUR_2010": "avg_temperature", 46 | "SD_POP_HGR_2025": "pop_greenness", 47 | "SD_LUE_LPR_2000_2020": "land_use_efficiency", 48 | } 49 | 50 | # merge UC data with street network indicators, only keep columns from the 51 | # indicators data set or named in the mapper, then save to disk 52 | df = ind.merge(right=ucs, how="inner", left_on="uc_id", right_on="ID_UC_G0") 53 | df = df.rename(columns=mapper) 54 | df = df[[c for c in df.columns if c in ind.columns or c in mapper.values()]] 55 | df.to_csv(ind_all_path, index=False, encoding="utf-8") 56 | msg = f"Saved all indicators to disk at {str(ind_all_path)!r}, shape={df.shape}" 57 | print(ox.ts(), msg) 58 | 59 | # drop columns that should not go in our repo then save 60 | drop = [ 61 | "built_up_area_percap", 62 | "built_height_m", 63 | "gdp_ppp", 64 | "world_bank_income_group", 65 | "hdi", 66 | "transport_co2_em", 67 | "transport_pm25_em", 68 | "pm25_concentration", 69 | "koppen_geiger", 70 | "avg_elevation", 71 | "avg_precipitation", 72 | "avg_temperature", 73 | "pop_greenness", 74 | "land_use_efficiency", 75 | ] 76 | df = df.drop(columns=drop) 77 | df.to_csv(ind_path, index=False, encoding="utf-8") 78 | msg = f"Saved repo indicators to disk at {str(ind_path)!r}, shape={df.shape}" 79 | print(ox.ts(), msg) 80 | -------------------------------------------------------------------------------- /code/03-calculate-indicators/04-create-metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import osmnx as ox 7 | import pandas as pd 8 | 9 | # load configs 10 | with Path("./config.json").open() as f: 11 | config = json.load(f) 12 | 13 | ind_path = config["indicators_path"] # indicators data (repo subset) 14 | ind_all_path = config["indicators_all_path"] # all indicators data 15 | ind_meta_path = config["indicators_metadata_path"] # indicators metadata (repo subset) 16 | ind_all_meta_path = config["indicators_all_metadata_path"] # indicators metadata (all) 17 | nodes_meta_path = config["models_metadata_nodes_path"] # graph nodes metadata 18 | edges_meta_path = config["models_metadata_edges_path"] # graph edges metadata 19 | 20 | # create graph nodes metadata 21 | desc = {} 22 | 
desc["osmid"] = {"description": "Unique OSM node ID", "type": "int"} 23 | desc["x"] = {"description": "Longitude coordinate (EPSG:4326)", "type": "float"} 24 | desc["y"] = {"description": "Latitude coordinate (EPSG:4326)", "type": "float"} 25 | desc["elevation"] = { 26 | "description": "Node elevation (meters above sea level) from ASTER or SRTM", 27 | "type": "int", 28 | } 29 | desc["elevation_aster"] = { 30 | "description": "Node elevation (meters above sea level) from ASTER", 31 | "type": "int", 32 | } 33 | desc["elevation_srtm"] = { 34 | "description": "Node elevation (meters above sea level) from SRTM", 35 | "type": "int", 36 | } 37 | desc["street_count"] = { 38 | "description": "Number of physical street segments connected to this node", 39 | "type": "int", 40 | } 41 | desc["bc"] = { 42 | "description": "Normalized distance-weighted node betweenness centrality", 43 | "type": "float", 44 | } 45 | desc["other attributes"] = {"description": "As defined in OSM documentation", "type": ""} 46 | 47 | # save nodes metadata to disk 48 | nodes_meta = pd.DataFrame(desc).T.reset_index().rename(columns={"index": "indicator"}) 49 | nodes_meta.to_csv(nodes_meta_path, index=False, encoding="utf-8") 50 | print(ox.ts(), f"Saved graph nodes metadata to {str(nodes_meta_path)!r}") 51 | 52 | # create graph edges metadata 53 | desc = {} 54 | desc["u"] = {"description": "Unique OSM ID of source node", "type": "int"} 55 | desc["v"] = {"description": "Unique OSM ID of destination node", "type": "int"} 56 | desc["key"] = {"description": "Unique ID if parallel edges exist between u and v", "type": "int"} 57 | desc["osmid"] = {"description": "Unique OSM way ID", "type": "int"} 58 | desc["geometry"] = {"description": "Edge centerline geometry (EPSG:4326)", "type": "linestring"} 59 | desc["length"] = {"description": "Length along the edge (meters)", "type": "float"} 60 | desc["grade"] = {"description": "Edge grade (rise over run)", "type": "float"} 61 | desc["grade_abs"] = {"description": "Absolute value of edge grade", "type": "float"} 62 | desc["oneway"] = {"description": "Whether edge part of a one-way street", "type": "boolean"} 63 | desc["reversed"] = { 64 | "description": "Whether edge runs opposite direction of OSM way", 65 | "type": "boolean", 66 | } 67 | desc["other attributes"] = {"description": "As defined in OSM documentation", "type": ""} 68 | 69 | # save edges metadata to disk 70 | edges_meta = pd.DataFrame(desc).T.reset_index().rename(columns={"index": "indicator"}) 71 | edges_meta.to_csv(edges_meta_path, index=False, encoding="utf-8") 72 | print(ox.ts(), f"Saved graph edges metadata to {str(edges_meta_path)!r}") 73 | 74 | # create indicators metadata 75 | desc = {} 76 | desc["area_km2"] = "Area within urban center boundary polygon, km2 (GHS)" 77 | desc["avg_elevation"] = "Average elevation, meters above sea level (GHS)" 78 | desc["avg_precipitation"] = "Annual average precipitation, millimeters (GHS)" 79 | desc["avg_temperature"] = "Average temperature, celsius (GHS)" 80 | desc["bc_gini"] = "Gini coefficient of normalized distance-weighted node betweenness centralities" 81 | desc["bc_max"] = "Max normalized distance-weighted node betweenness centralities" 82 | desc["built_height_m"] = "Average height of built surfaces, meters (GHS)" 83 | desc["built_up_area_m2"] = "Built-up surface area, square meters (GHS)" 84 | desc["built_up_area_percap"] = "Built-up surface area per-capita, square meters per person (GHS)" 85 | desc["cc_avg_dir"] = "Average clustering coefficient (unweighted/directed)" 86 | 
desc["cc_avg_undir"] = "Average clustering coefficient (unweighted/undirected)" 87 | desc["cc_wt_avg_dir"] = "Average clustering coefficient (weighted/directed)" 88 | desc["cc_wt_avg_undir"] = "Average clustering coefficient (weighted/undirected)" 89 | desc["circuity"] = "Ratio of street lengths to straightline distances" 90 | desc["core_city"] = "Urban center core city name" 91 | desc["country"] = "Primary country name" 92 | desc["country_iso"] = "Primary country ISO 3166-1 alpha-3 code" 93 | desc["elev_iqr"] = "Interquartile range of node elevations, meters" 94 | desc["elev_mean"] = "Mean node elevation, meters" 95 | desc["elev_median"] = "Median node elevation, meters" 96 | desc["elev_range"] = "Range of node elevations, meters" 97 | desc["elev_std"] = "Standard deviation of node elevations, meters" 98 | desc["gdp_ppp"] = "Total GDP PPP, USD (GHS)" 99 | desc["grade_mean"] = "Mean absolute street grade (incline)" 100 | desc["grade_median"] = "Median absolute street grade (incline)" 101 | desc["hdi"] = "Human development index at subnational level (GHS)" 102 | desc["intersect_count"] = "Count of (undirected) edge intersections" 103 | desc["intersect_count_clean"] = ( 104 | "Count of street intersections (merged within 10 meters geometrically)" 105 | ) 106 | desc["intersect_count_clean_topo"] = ( 107 | "Count of street intersections (merged within 10 meters topologically)" 108 | ) 109 | desc["k_avg"] = "Average node degree (undirected)" 110 | desc["koppen_geiger"] = "Köppen-Geiger classification of majority of surface (GHS)" 111 | desc["land_use_efficiency"] = "Land use efficiency 1990-2015 (GHS)" 112 | desc["length_mean"] = "Mean street segment length (undirected edges), meters" 113 | desc["length_median"] = "Median street segment length (undirected edges), meters" 114 | desc["length_total"] = "Total street length (undirected edges), meters" 115 | desc["node_count"] = "Count of nodes" 116 | desc["orientation_entropy"] = "Entropy of street network bearings" 117 | desc["pagerank_max"] = "The maximum PageRank value of any node" 118 | desc["pm25_concentration"] = ( 119 | "Population-weighted average PM2.5 concentrations, micrograms/meter^3 (GHS)" 120 | ) 121 | desc["pop_greenness"] = "Land consumption rate / population growth rate (GHS)" 122 | desc["prop_4way"] = "Proportion of nodes that represent 4-way street intersections" 123 | desc["prop_3way"] = "Proportion of nodes that represent 3-way street intersections" 124 | desc["prop_deadend"] = "Proportion of nodes that represent dead-ends" 125 | desc["resident_pop"] = "Total resident population (GHS)" 126 | desc["self_loop_proportion"] = "Proportion of edges that are self-loops" 127 | desc["straightness"] = "1 / circuity" 128 | desc["street_segment_count"] = "Count of streets (undirected edges)" 129 | desc["transport_co2_em"] = "Total CO2 emissions from transport sector, tons/year (GHS)" 130 | desc["transport_pm25_em"] = "Total PM2.5 emissions from transport sector, tons/year (GHS)" 131 | desc["uc_id"] = "Urban center unique ID (GHS)" 132 | desc["uc_names"] = "List of city names within this urban center (GHS)" 133 | desc["world_bank_income_group"] = "World Bank income group" 134 | desc["world_region"] = "UN SDG geographic region" 135 | 136 | # turn the metadata descriptions into a dataframe 137 | meta = pd.DataFrame(desc, index=["description"]).T 138 | 139 | # make sure we have metadata for all indicators 140 | ind_all = pd.read_csv(ind_all_path) 141 | assert len(ind_all.columns) == len(meta) 142 | 143 | # reindex df so cols are in same order 
as metadata 144 | ind_all = ind_all.reindex(columns=meta.index).dropna() 145 | 146 | # add data type of each field 147 | dtypes = ind_all.dtypes.astype(str).replace({"object": "string"}).str.replace("64", "") 148 | dtypes.name = "type" 149 | meta = meta.merge(right=dtypes, left_index=True, right_index=True).reindex( 150 | columns=["type", "description"], 151 | ) 152 | 153 | # make sure all the indicators are present in the metadata 154 | assert (meta.index == ind_all.columns).all() 155 | 156 | # save all metadata to disk 157 | meta_all = meta.reset_index().rename(columns={"index": "indicator"}) 158 | meta_all.to_csv(ind_all_meta_path, index=False, encoding="utf-8") 159 | print(ox.ts(), f"Saved all indicator metadata to {str(ind_all_meta_path)!r}") 160 | 161 | # drop fields that should not go in our repo then save 162 | repo_cols = set(pd.read_csv(ind_path).columns) 163 | keep = [k for k in desc if k in repo_cols] 164 | meta = meta.loc[keep].reset_index().rename(columns={"index": "indicator"}) 165 | meta.to_csv(ind_meta_path, index=False, encoding="utf-8") 166 | print(ox.ts(), f"Saved repo indicator metadata to {str(ind_meta_path)!r}") 167 | -------------------------------------------------------------------------------- /code/04-upload-repository/01-save-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | 7 | import osmnx as ox 8 | import pandas as pd 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | # configure multiprocessing 15 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 16 | 17 | # set up save/load folder locations 18 | graphml_folder = Path(config["models_graphml_path"]) # where to load GraphML 19 | gpkg_folder = Path(config["models_gpkg_path"]) # where to save GeoPackages 20 | nelist_folder = Path(config["models_nelist_path"]) # where to save node/edge lists 21 | 22 | 23 | # function to convert node elevation string -> float -> int 24 | def to_int(value): 25 | try: 26 | return int(float(value)) 27 | except ValueError: 28 | return float(value) 29 | 30 | 31 | node_dtypes = {"bc": float, "elevation_aster": to_int, "elevation_srtm": to_int} 32 | 33 | 34 | def save_graph(graphml_path, gpkg_path, nodes_path, edges_path, node_dtypes=node_dtypes) -> None: 35 | print(ox.ts(), f"Saving {str(graphml_path)!r}", flush=True) 36 | 37 | # load GraphML file and save as GeoPackage to disk 38 | G = ox.io.load_graphml(graphml_path, node_dtypes=node_dtypes) 39 | ox.io.save_graph_geopackage(G, gpkg_path) 40 | 41 | # get graph node/edge GeoDataFrames for node/edge lists 42 | nodes, edges = ox.convert.graph_to_gdfs(G, node_geometry=False, fill_edge_geometry=False) 43 | 44 | # nodes: round floats and organize columns 45 | node_cols = [ 46 | "osmid", 47 | "x", 48 | "y", 49 | "elevation", 50 | "elevation_aster", 51 | "elevation_srtm", 52 | "bc", 53 | "ref", 54 | "highway", 55 | ] 56 | nodes = nodes.reset_index().reindex(columns=node_cols) 57 | 58 | # edges: round floats and organize columns 59 | round_cols = ["grade", "grade_abs", "length"] 60 | edges[round_cols] = edges[round_cols].round(3) 61 | edge_cols = [ 62 | "u", 63 | "v", 64 | "key", 65 | "oneway", 66 | "highway", 67 | "name", 68 | "length", 69 | "grade", 70 | "grade_abs", 71 | "reversed", 72 | "lanes", 73 | "width", 74 | "est_width", 75 | "maxspeed", 76 | "access", 77 | "service", 78 | "bridge", 79 | "tunnel", 80 | "area", 81 
| "junction", 82 | "osmid", 83 | "ref", 84 | ] 85 | edges = edges.drop(columns=["geometry"]).reset_index().reindex(columns=edge_cols) 86 | 87 | # save graph node/edge lists as CSV files to disk 88 | nodes_path.parent.mkdir(parents=True, exist_ok=True) 89 | nodes.to_csv(nodes_path, index=False, encoding="utf-8") 90 | edges.to_csv(edges_path, index=False, encoding="utf-8") 91 | 92 | 93 | def make_args(): 94 | filepaths = sorted(graphml_folder.glob("*/*")) 95 | print(ox.ts(), f"There are {len(filepaths):,} total GraphML files") 96 | 97 | args = [] 98 | for fp in filepaths: 99 | gpkg_path = gpkg_folder / fp.parent.stem / fp.name.replace("graphml", "gpkg") 100 | nelist_output_folder = nelist_folder / fp.parent.stem / fp.stem 101 | nodes_path = nelist_output_folder / "node_list.csv" 102 | edges_path = nelist_output_folder / "edge_list.csv" 103 | if not (gpkg_path.is_file() and nodes_path.is_file() and edges_path.is_file()): 104 | args.append((fp, gpkg_path, nodes_path, edges_path)) 105 | 106 | print(ox.ts(), f"Saving GeoPackage and node/edge lists for {len(args):,} remaining graphs") 107 | return args 108 | 109 | 110 | # multiprocess the queue 111 | with mp.get_context().Pool(cpus) as pool: 112 | pool.starmap_async(save_graph, make_args()).get() 113 | 114 | # final file count checks 115 | # verify same number of country folders across all file types 116 | graphml_countries = list(graphml_folder.glob("*")) 117 | gpkg_countries = list(gpkg_folder.glob("*")) 118 | nelist_countries = list(nelist_folder.glob("*")) 119 | assert len(graphml_countries) == len(gpkg_countries) == len(nelist_countries) 120 | 121 | # verify same number of model files across all file types 122 | graphml_paths = list(graphml_folder.glob("*/*.graphml")) 123 | gpkg_paths = list(gpkg_folder.glob("*/*.gpkg")) 124 | nlist_paths = list(nelist_folder.glob("*/*/node_list.csv")) 125 | elist_paths = list(nelist_folder.glob("*/*/edge_list.csv")) 126 | assert len(graphml_paths) == len(gpkg_paths) == len(nlist_paths) == len(elist_paths) 127 | 128 | # verify same countries/cities across all file types 129 | graphml_names = {fp.parent.stem + "/" + fp.stem for fp in graphml_paths} 130 | gpkg_names = {fp.parent.stem + "/" + fp.stem for fp in gpkg_paths} 131 | nelist_names = {fp.parent.stem + "/" + fp.stem for fp in nelist_folder.glob("*/*")} 132 | assert graphml_names == gpkg_names == nelist_names 133 | 134 | # verify an indicator row exists for every GraphML file 135 | df = pd.read_csv(config["indicators_path"]) 136 | ucids1 = set(df["uc_id"].astype(str).values) 137 | ucids2 = {fp.stem.split("-")[1] for fp in graphml_paths} 138 | assert ucids1 == ucids2 139 | 140 | print(ox.ts(), "Successfully passed all file checks") 141 | -------------------------------------------------------------------------------- /code/04-upload-repository/02-stage-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import multiprocessing as mp 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import osmnx as ox 9 | 10 | # load configs 11 | with Path("./config.json").open() as f: 12 | config = json.load(f) 13 | 14 | compression_args = {"compression": zipfile.ZIP_BZIP2, "compresslevel": 9} 15 | 16 | # map input folders to output folders containing zipped country files 17 | manifest = [ 18 | {"input": Path(config["models_gpkg_path"]), "output": Path(config["staging_gpkg_path"])}, 19 | {"input": Path(config["models_graphml_path"]), "output": 
Path(config["staging_graphml_path"])}, 20 | {"input": Path(config["models_nelist_path"]), "output": Path(config["staging_nelist_path"])}, 21 | ] 22 | 23 | # configure CPUs 24 | cpus = mp.cpu_count() if config["cpus"] == 0 else config["cpus"] 25 | 26 | 27 | # zip a folder and its contents 28 | def zip_folder(input_folder, output_fp, compression_args=compression_args) -> None: 29 | print(ox.ts(), f"Staging {str(output_fp)!r}", flush=True) 30 | pattern = "*/*" if "nelist" in str(input_folder) else "*" 31 | with zipfile.ZipFile(output_fp, mode="w", **compression_args) as zf: 32 | for input_fp in input_folder.glob(pattern): 33 | zf.write(input_fp, arcname=Path(input_fp.parent.stem) / input_fp.name) 34 | 35 | 36 | # assemble input folders to zip + their destination zip file paths 37 | args = [] 38 | for item in manifest: 39 | output_folder = item["output"] 40 | output_folder.mkdir(parents=True, exist_ok=True) 41 | for input_folder in item["input"].glob("*"): 42 | output_fp = output_folder / (input_folder.stem + ".zip") 43 | if not output_fp.is_file(): 44 | args.append((input_folder, output_fp)) 45 | 46 | # multiprocess the queue 47 | print(ox.ts(), f"Compressing and staging {len(args)} input files using {cpus} CPUs") 48 | with mp.get_context().Pool(cpus) as pool: 49 | pool.starmap_async(zip_folder, args).get() 50 | 51 | print(ox.ts(), f"Finished compressing and staging {len(args)} input files") 52 | -------------------------------------------------------------------------------- /code/04-upload-repository/03-upload-dataverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import time 5 | import zipfile 6 | from hashlib import md5 7 | from pathlib import Path 8 | from urllib.parse import urljoin 9 | 10 | import osmnx as ox 11 | import requests 12 | from keys import dataverse_api_key as api_key 13 | 14 | # only set true on first run to erase everything from the draft 15 | delete_existing = False 16 | 17 | # lets you skip uploading files if this is supposed to be a dry run 18 | debug_mode = False 19 | 20 | # load configs 21 | with Path("./config.json").open() as f: 22 | config = json.load(f) 23 | 24 | # configure the dataverse upload 25 | attempts_max = 3 # how many times to retry same file upload after error before giving up 26 | pause_error = 10 # seconds to pause after an error 27 | pause_normal = 0 # seconds to pause between uploads 28 | upload_timeout = 1200 # how long to set the timeout for upload via http post 29 | 30 | # base URL for working with datasets via dataverse native API 31 | base_url = "https://dataverse.harvard.edu/api/v1/datasets/:persistentId/" 32 | 33 | # base URL for working with files via dataverse native API 34 | file_url = "https://dataverse.harvard.edu/api/files/{file_id}" 35 | 36 | # define what to upload 37 | manifests = [ 38 | { 39 | "doi": config["doi_gpkg"], 40 | "folder": config["staging_gpkg_path"], 41 | "file_desc": "Zip contains GeoPackages of all urban street networks in {}.", 42 | "file_tags": ["GeoPackage", "Street Network", "Models"], 43 | }, 44 | { 45 | "doi": config["doi_graphml"], 46 | "folder": config["staging_graphml_path"], 47 | "file_desc": "Zip contains GraphML files of all urban street networks in {}.", 48 | "file_tags": ["GraphML", "Street Network", "Models"], 49 | }, 50 | { 51 | "doi": config["doi_nelist"], 52 | "folder": config["staging_nelist_path"], 53 | "file_desc": "Zip contains node/edge list CSV files of all urban street networks in {}.", 54 | "file_tags": 
["Node/Edge List", "Street Network", "Models"], 55 | }, 56 | ] 57 | 58 | 59 | # get all the files that currently exist in the draft or published dataset 60 | def get_server_files(doi, version): 61 | endpoint = f"versions/:{version}/files?key={api_key}&persistentId={doi}" 62 | rj = requests.get(urljoin(base_url, endpoint)).json() 63 | try: 64 | return {file["dataFile"]["filename"]: file["dataFile"]["id"] for file in rj["data"]} 65 | except KeyError: 66 | return {} 67 | 68 | 69 | # find pre-existing draft/published files already uploaded to dataset 70 | def get_preexisting_files(manifests): 71 | draft_files = {} # what files have already been uploaded to the draft? 72 | published_files = {} # what files exist in the published dataset? 73 | for manifest in manifests: 74 | doi = manifest["doi"] 75 | draft_files[doi] = get_server_files(doi, version="draft") 76 | published_files[doi] = get_server_files(doi, version="latest-published") 77 | msg = ( 78 | f"Files in {doi}: {len(published_files[doi])} published, {len(draft_files[doi])} draft." 79 | ) 80 | print(ox.ts(), msg) 81 | return draft_files, published_files 82 | 83 | 84 | # delete all the existing (carried-over) files in the draft datasets 85 | def delete_draft_files(already_uploaded) -> None: 86 | file_ids = [f for d in already_uploaded.values() for f in d.values()] 87 | print(ox.ts(), f"Deleting {len(file_ids)} draft files...") 88 | headers = {"X-Dataverse-key": api_key} 89 | for file_id in file_ids: 90 | url = file_url.format(file_id=file_id) 91 | response = requests.delete(url, headers=headers) 92 | if not response.ok: 93 | print(ox.ts(), f"Failed to delete {url!r}") 94 | 95 | 96 | # zip a staged zipped file, open it, and return the buffer. this will 97 | # double-zip the zip files because dataverse unzips zip files when they are 98 | # uploaded. 
the result is that dataverse will host the original zipped file 99 | def get_file_to_upload(fp, target_filename): 100 | checksum = md5(fp.open("rb").read()).hexdigest() 101 | upload_fp = Path(config["staging_folder"]) / "upload_temp.zip" 102 | with zipfile.ZipFile(file=upload_fp, mode="w") as zf: 103 | zf.write(fp, arcname=target_filename) 104 | file = {"file": upload_fp.open("rb")} 105 | return file, checksum 106 | 107 | 108 | # configure the file description and tags that appear on dataverse 109 | def get_payload_to_upload(fp, manifest): 110 | country_name = fp.stem[:-4].replace("_", " ").title() 111 | description = manifest["file_desc"].format(country_name) 112 | categories = manifest["file_tags"] + [country_name] 113 | params = {"description": description, "categories": categories} 114 | return {"jsonData": json.dumps(params)} 115 | 116 | 117 | # upload a new file to a dataverse dataset 118 | def upload_file(fp, target_filename, manifest, attempt_count=1) -> None: 119 | print(ox.ts(), f"Uploading {str(fp)!r} to {manifest['doi']!r}") 120 | if debug_mode: 121 | return 122 | 123 | file, checksum = get_file_to_upload(fp, target_filename) 124 | payload = get_payload_to_upload(fp, manifest) 125 | endpoint = f"add?persistentId={manifest['doi']}&key={api_key}" 126 | url = urljoin(base_url, endpoint) 127 | 128 | try: 129 | # upload the file to the server 130 | with requests.Session() as session: 131 | start_time = time.time() 132 | response = session.post(url, data=payload, files=file, timeout=upload_timeout) 133 | elapsed = time.time() - start_time 134 | if not response.ok: 135 | raise Exception(response.text) 136 | 137 | # verify the checksum calculated by the server matches our own 138 | remote_checksum = response.json()["data"]["files"][0]["dataFile"]["md5"] 139 | if checksum != remote_checksum: 140 | msg = f"Checksums do not match: {checksum} and {remote_checksum}" 141 | raise Exception(msg) 142 | 143 | msg = f"Response {response.status_code} in {elapsed:,.1f} seconds, checksums match" 144 | print(ox.ts(), msg) 145 | time.sleep(pause_normal) 146 | 147 | except Exception as e: 148 | print(ox.ts(), e) 149 | if attempt_count < attempts_max: 150 | # retry upload if we haven't exceeded max attempts 151 | attempt_count += 1 152 | print(ox.ts(), f"Re-trying (attempt {attempt_count} of {attempts_max})") 153 | time.sleep(pause_error) 154 | upload_file(fp, target_filename, manifest, attempt_count) 155 | else: 156 | print(ox.ts(), "No more attempts for this file, we give up") 157 | 158 | 159 | # get all draft/published files currently existing on server 160 | draft_files, published_files = get_preexisting_files(manifests) 161 | if delete_existing: 162 | delete_draft_files(draft_files) 163 | draft_files, published_files = get_preexisting_files(manifests) 164 | 165 | # create arguments to upload all remaining files in all staging folders 166 | args_list = [] 167 | for manifest in manifests: 168 | for fp in sorted(Path(manifest["folder"]).glob("*.zip")): 169 | target_filename = f"{fp.stem}_{fp.parent.stem}{fp.suffix}" 170 | if target_filename not in draft_files[manifest["doi"]]: 171 | args_list.append((fp, target_filename, manifest)) 172 | 173 | # process the queue 174 | print(ox.ts(), f"Uploading {len(args_list)} staged files...") 175 | for args in args_list: 176 | upload_file(*args) 177 | -------------------------------------------------------------------------------- /code/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cpus": 24, 3 | 
"cpus_stats": 10, 4 | "doi_gpkg": "doi:10.7910/DVN/E5TPDQ", 5 | "doi_graphml": "doi:10.7910/DVN/KA5HJ3", 6 | "doi_nelist": "doi:10.7910/DVN/DC7U0A", 7 | "elevation_final_path": "/data/snm/elevation/elevations-final.csv", 8 | "elevation_google_elevations_path": "/data/snm/elevation/google/elevations-google.csv", 9 | "elevation_google_urls_path": "/data/snm/elevation/google/urls.csv", 10 | "elevation_nodeclusters_path": "/data/snm/elevation/google/graph-clusters", 11 | "gdem_aster_path": "/data/snm/GDEM/aster_v3/", 12 | "gdem_aster_urls_path": "/data/snm/inputs/gdem-urls/urls-aster_v3.txt", 13 | "gdem_srtm_path": "/data/snm/GDEM/srtmgl1/", 14 | "gdem_srtm_urls_path": "/data/snm/inputs/gdem-urls/urls-srtmgl1.txt", 15 | "indicators_all_metadata_path": "/data/snm/indicators/metadata-indicators-all.csv", 16 | "indicators_all_path": "/data/snm/indicators/indicators-all.csv", 17 | "indicators_metadata_path": "/data/snm/indicators/metadata-indicators.csv", 18 | "indicators_path": "/data/snm/indicators/indicators.csv", 19 | "indicators_street_path": "/data/snm/indicators/indicators-street-network.csv", 20 | "iso_codes_path": "/data/snm/inputs/wikipedia-iso-country-codes.csv", 21 | "models_gpkg_path": "/data/snm/models/gpkg", 22 | "models_graphml_path": "/data/snm/models/graphml", 23 | "models_metadata_edges_path": "/data/snm/models/metadata-graph-edges.csv", 24 | "models_metadata_nodes_path": "/data/snm/models/metadata-graph-nodes.csv", 25 | "models_nelist_path": "/data/snm/models/nelist", 26 | "node_bc_path": "/data/snm/bc", 27 | "osmnx_cache_path": "/data/snm/cache", 28 | "osmnx_log_path": "/data/snm/logs", 29 | "staging_folder": "/data/snm/staging", 30 | "staging_gpkg_path": "/data/snm/staging/gpkg", 31 | "staging_graphml_path": "/data/snm/staging/graphml", 32 | "staging_indicators_path": "/data/snm/staging/indicators", 33 | "staging_metadata_path": "/data/snm/staging/metadata", 34 | "staging_nelist_path": "/data/snm/staging/nelist", 35 | "uc_gpkg_path": "/data/snm/ucs.gpkg", 36 | "uc_input_path": "/data/snm/inputs/GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.gpkg" 37 | } 38 | -------------------------------------------------------------------------------- /code/environment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: snm 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | - geopandas=1.0 7 | - jupyterlab 8 | - networkx=3.4 9 | - numpy=2.2 10 | - osmnx=2.0 11 | - pandas=2.2 12 | - pre-commit 13 | - python=3.13 14 | - python-igraph=0.11 15 | - requests=2.32 16 | - scipy=1.15 17 | -------------------------------------------------------------------------------- /code/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | python ./01-construct-models/01-prep-ghsl.py 4 | python ./01-construct-models/02-download-cache.py 5 | python ./01-construct-models/03-create-graphs.py 6 | python ./02-attach-elevation/01-aster-srtm/01-download-aster_v3.py 7 | python ./02-attach-elevation/01-aster-srtm/02-download-srtmgl1.py 8 | python ./02-attach-elevation/01-aster-srtm/03-build-vrts.py 9 | python ./02-attach-elevation/01-aster-srtm/04-add-node-elevations.py 10 | python ./02-attach-elevation/02-google/01-cluster-nodes.py 11 | python ./02-attach-elevation/02-google/02-make-google-urls.py 12 | python ./02-attach-elevation/02-google/03-download-google-elevations.py 13 | python ./02-attach-elevation/02-google/04-choose-best-elevation.py 14 | python 
./03-calculate-indicators/01-calculate-node-bc.py 15 | python ./03-calculate-indicators/02-calculate-indicators.py 16 | python ./03-calculate-indicators/03-merge-indicators.py 17 | python ./03-calculate-indicators/04-create-metadata.py 18 | python ./04-upload-repository/01-save-files.py 19 | python ./04-upload-repository/02-stage-files.py 20 | python ./04-upload-repository/03-upload-dataverse.py 21 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Citation 2 | 3 | Boeing, G. 2025. Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World. Working paper. https://github.com/gboeing/street-network-models 4 | -------------------------------------------------------------------------------- /paper/latex/main.tex: -------------------------------------------------------------------------------- 1 | % !TeX program = pdflatex 2 | % Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World 3 | % Author: Geoff Boeing 4 | % Web: https://geoffboeing.com/ 5 | % Repo: https://github.com/gboeing/street-network-models 6 | 7 | \RequirePackage[l2tabu,orthodox]{nag} % warn if using any obsolete commands 8 | \documentclass[12pt,letterpaper]{article} % document style 9 | 10 | % load encoding and font packages for pdflatex, in order 11 | \usepackage[T1]{fontenc} % output 8-bit encoded fonts 12 | \usepackage[utf8]{inputenc} % allow input of utf-8 encoded characters 13 | \usepackage{ebgaramond} % document's serif font 14 | \usepackage{tgheros} % document's sans serif font 15 | 16 | % load babel, csquotes, and microtype in order 17 | \usepackage[USenglish]{babel} % auto-regionalize hyphens, quote marks, etc 18 | \usepackage[strict,autostyle]{csquotes} % smart and nestable quote marks 19 | \usepackage[babel=true]{microtype} % enable micro-typographic adjustments 20 | 21 | % load everything else 22 | \usepackage{amsmath} % additional mathematical typesetting features 23 | \usepackage{authblk} % footnote-style author/affiliation info 24 | \usepackage{booktabs} % better looking tables 25 | \usepackage{caption} % custom figure/table caption styles 26 | \usepackage{datetime} % enable formatting of date output 27 | \usepackage[final]{draftwatermark} % watermark paper as a draft 28 | \usepackage{endnotes} % enable endnotes 29 | \usepackage{geometry} % configure page dimensions and margins 30 | \usepackage{graphicx} % better inclusion of graphics 31 | \usepackage{natbib} % textual/parenthetical author-year citations w/bibtex 32 | \usepackage{rotating} % rotate wide tables/figures to make them landscape 33 | \usepackage{setspace} % configure spacing between lines 34 | \usepackage{titlesec} % custom section and subsection heading 35 | \usepackage{url} % make nice line-breakable urls 36 | 37 | % load hyperref/orcidlink last for compatibility 38 | \usepackage{hyperref} % enable hyperlinks and pdf metadata 39 | \usepackage{orcidlink} % provide orcid logo and link 40 | 41 | % print only the month and year when using \today 42 | \newdateformat{monthyeardate}{\monthname[\THEMONTH] \THEYEAR} 43 | 44 | \newcommand{\myname}{Geoff Boeing} 45 | \newcommand{\myemail}{boeing@usc.edu} 46 | \newcommand{\myorcid}{0000-0003-1851-6411} % chktex 8 47 | \newcommand{\myaffiliation}{Department of Urban Planning and Spatial Analysis\\University of Southern California} 48 | \newcommand{\paperdate}{April 2025} 49 | 
\newcommand{\papertitle}{Urban Science Beyond Samples: Updated Street Network Models and Indicators for Every Urban Area in the World} 50 | \newcommand{\papercitation}{Boeing, G. 2025. \papertitle. Under review at \textit{Journal Name}.} 51 | \newcommand{\paperkeywords}{Urban Planning, Transportation, Data Science} 52 | 53 | % location of figure files, via graphicx package 54 | \graphicspath{{./figures/}} 55 | 56 | % configure the page layout, via geometry package 57 | \geometry{ 58 | paper=letterpaper, % paper size 59 | top=3.8cm, % margin sizes 60 | bottom=3.8cm, 61 | left=4cm, 62 | right=4cm} 63 | \setstretch{1} % line spacing 64 | \clubpenalty=10000 % prevent orphans 65 | \widowpenalty=10000 % prevent widows 66 | 67 | % set section/subsection headings as the sans serif font, via titlesec package 68 | \titleformat{\section}{\normalfont\sffamily\large\bfseries\color{black}}{\thesection.}{0.3em}{} 69 | \titleformat{\subsection}{\normalfont\sffamily\small\bfseries\color{black}}{\thesubsection.}{0.3em}{} 70 | \titleformat{\subsubsection}{\normalfont\sffamily\small\color{black}}{\thesubsubsection.}{0.3em}{} 71 | 72 | % make figure/table captions sans-serif small font 73 | \captionsetup{font={footnotesize,sf},labelfont=bf,labelsep=period} 74 | 75 | % configure pdf metadata and link handling, via hyperref package 76 | \hypersetup{ 77 | pdfauthor={\myname}, 78 | pdftitle={\papertitle}, 79 | pdfsubject={\papertitle}, 80 | pdfkeywords={\paperkeywords}, 81 | pdffitwindow=true, % window fit to page when opened 82 | breaklinks=true, % break links that overflow horizontally 83 | colorlinks=false, % remove link color 84 | pdfborder={0 0 0} % remove link border 85 | } 86 | 87 | \begin{document} 88 | 89 | \title{\papertitle}%\footnote{{Citation info: \papercitation}}} 90 | \author[]{Redacted for review}%\myname~\orcidlink{\myorcid}} 91 | \affil[]{Redacted for review}%\myaffiliation} 92 | \date{}%\paperdate} 93 | 94 | \maketitle 95 | 96 | \begin{abstract} 97 | 98 | In this era of rapid urbanization and change, planners need up-to-date, global, and consistent street network models and indicators to measure resilience and performance, model accessibility, and target local quality-of-life interventions. This article presents up-to-date street network models and indicators for every urban area in the world. It uses 2025 urban area boundaries from the Global Human Settlement Layer, allowing users to join these data with hundreds of other urban attributes. Its workflow ingests 180 million OpenStreetMap nodes and 360 million OpenStreetMap edges across 10,351 urban areas in 189 countries. The code, models, and indicators are publicly available for reuse. These resources unlock worldwide urban street network science without samples as well as local analyses in under-resourced regions where models and indicators are otherwise less-accessible. 99 | 100 | \end{abstract} 101 | 102 | 103 | \section{Introduction} 104 | 105 | Street networks structure the urban fabric and the flow of people and goods through cities \citep{barrington-leigh_global_2020}. Scholars and practitioners commonly use spatial graphs to model street networks to understand or predict many phenomena, including traffic dynamics, accessibility to daily living needs, and the resilience and sustainability of urban forms \citep{barthelemy_spatial_2022}. These spatial graphs are defined by both their topology (connections and configuration) and geometry (positions, lengths, areas, and angles) \citep{fischer_spatial_2014}. 
Various topological and geometric indicators exist throughout the literature to measure important street network characteristics: node degrees reveal streets' connectedness, weighted betweenness centralities identify relatively important parts of the network, circuity suggests its efficiency or lack thereof, etc. These indicators then inform downstream urban analytics to target planning interventions or benchmark and monitor cities' progress toward stated sustainability goals. 106 | 107 | Up-to-date, global, consistent urban street network models and indicators are needed more today than ever before as planners face intertwined sustainability and public health crises in cities around the world \citep{giles-corti_creating_2022}. Meanwhile, urban science seeks to expand beyond the limits of traditional sampling to build universal theory and better understand understudied regions, such as the Global South. Yet traditional data sources and methods present headwinds to these efforts. Data on urban streets are often digitized inconsistently from place to place, thwarting apples-to-apples global comparisons and making analyses particularly difficult in under-resourced regions \citep{liu_generalized_2022}. Popular data sources such as OpenStreetMap offer reasonably high-quality data around the world, but do not package those data in graph-theoretic form nor provide statistics or indicators \citep{boeing_modeling_2025}. Tools like OSMnx aim to fill this gap, but still require coding knowledge to conduct the analysis and potentially extensive computational resources for anyone conducting global urban science. 108 | 109 | This article presents a resource to fill this gap by offering street network models and indicators worldwide for scholars and practitioners to easily reuse without reinventing the wheel. Using data from OpenStreetMap and boundaries from the 2025 Global Human Settlement Layer (GHSL), this study models and analyzes the street networks of every urban area in the world. This workflow ingests 180 million OpenStreetMap nodes and 360 million OpenStreetMap edges across 10,351 urban areas in 189 countries. This article describes this open data repository of street network models and indicators, as well as the open-source software repository containing the code to generate them. The next section describes these reproducible methods. Then we discuss the work's lineage, present contribution, and future. Finally we conclude with suggestions for getting started with these data and code. 110 | 111 | \section{Reproducible Methods} 112 | 113 | The following computational workflow, written in the Python programming language, generates these models and calculates these indicators. 114 | 115 | \subsection{Urban Boundaries} 116 | 117 | The workflow begins by extracting the boundary polygons of each urban area in the world from the 2025 GHSL Urban Centre Database (UCD), which contains 11,422 entities.\ \citet{mari_rivero_urban_2025} describe this input dataset in detail, but to summarize, the GHSL integrates a vast array of census data, remote sensing data, and volunteered geographic information to delineate the world's urbanized areas' boundaries and attach corresponding attribute data. We retain urban areas with >1 km\textsuperscript{2} built-up area and a \enquote{high} GHSL quality control score, resulting in 10,351 urban areas. This provides us with basic filtering to ensure we are modeling true urbanized areas rather than false positives or tiny villages.
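As an illustrative sketch of this filtering step, the selection can be expressed in a few lines of GeoPandas (the column names below are placeholders, not the actual UCD R2024A field names used by the repository's \texttt{01-prep-ghsl.py} script; the input and output paths follow \texttt{config.json}):

\begin{verbatim}
import geopandas as gpd

# load the GHSL UCD urban center polygons (input path from config.json)
ucs = gpd.read_file(
    "/data/snm/inputs/GHS_UCDB_GLOBE_R2024A_V1_0/GHS_UCDB_GLOBE_R2024A.gpkg"
)

# keep urban centers with >1 km2 of built-up area and a "high" QC score
# ("built_up_km2" and "qc_score" are placeholder column names)
keep = (ucs["built_up_km2"] > 1) & (ucs["qc_score"] == "high")
ucs.loc[keep].to_file("/data/snm/ucs.gpkg", driver="GPKG")
\end{verbatim}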
118 | 119 | \subsection{Network Modeling} 120 | 121 | We used OSMnx v2.0.2 to download OpenStreetMap raw data in February 2025 and construct a spatial graph model of the drivable street network within each urban area. These models are nonplanar directed multigraphs with possible self-loops. They have node/edge attribute data from OpenStreetMap plus geographic coordinates and geometries \citep{boeing_modeling_2025}. We parameterize OSMnx to use its \enquote{drive} network type, retain all graph components, and run its edge simplification algorithm \citep{boeing_topological_2025}. Each urban area's graph is saved as a GraphML file, a standard graph serialization format. 122 | 123 | \subsection{Elevation} 124 | 125 | We attach elevation, in meters above sea level, to each node in each urban area's graph using two global digital elevation models (GDEMs): the Advanced Spaceborne Thermal Emission and Reflection Radiometer (ASTER) v3 GDEM, and the Shuttle Radar Topography Mission (SRTM) version 3.0 GDEM with voids filled. Both are 1 arcsecond (approximately 30-meter) resolution. First we download all the GDEM rasters for ASTER (45,824 tiles) and SRTM (14,297 tiles) from NASA EarthData. Next we build a virtual raster for each source. Then we use OSMnx to load each GraphML file and attach the elevation from ASTER and SRTM to each graph node. 126 | 127 | As each node has both an ASTER and an SRTM elevation value, we choose one to use as the \enquote{official} node elevation by comparing both to a \enquote{tie-breaker} value from Google. To do so, we download each node's elevation from the Google Maps Elevation API, then choose between ASTER and SRTM based on whichever is nearer to Google's value. Then we calculate edge grades and re-save each GraphML file with these node/edge attributes. 128 | 129 | \begin{table}[bth!] 130 | \centering 131 | \scriptsize 132 | \caption{The indicators dataset contents.
Variables carried over from GHSL are noted.}\label{tab:indicators} 133 | \begin{tabular}{p{3.0cm} p{1.0cm} p{8.2cm}} 134 | \toprule 135 | Variable & Type & Description \\ 136 | \midrule 137 | area\_km2 & integer & Area within urban center boundary polygon, km\textsuperscript{2} (GHSL) \\ 138 | bc\_gini & decimal & Gini coefficient of normalized distance-weighted node betweenness centralities \\ 139 | bc\_max & decimal & Max normalized distance-weighted node betweenness centrality \\ 140 | built\_up\_area\_m2 & integer & Built-up surface area, square meters (GHSL) \\ 141 | cc\_avg\_dir & decimal & Average clustering coefficient (unweighted/directed) \\ 142 | cc\_avg\_undir & decimal & Average clustering coefficient (unweighted/undirected) \\ 143 | cc\_wt\_avg\_dir & decimal & Average clustering coefficient (weighted/directed) \\ 144 | cc\_wt\_avg\_undir & decimal & Average clustering coefficient (weighted/undirected) \\ 145 | circuity & decimal & Ratio of street lengths to straightline distances \\ 146 | core\_city & string & Urban center core city name \\ 147 | country & string & Primary country name \\ 148 | country\_iso & string & Primary country ISO 3166--1 alpha--3 code \\ 149 | elev\_iqr & decimal & Interquartile range of node elevations, meters \\ 150 | elev\_mean & decimal & Mean node elevation, meters \\ 151 | elev\_median & decimal & Median node elevation, meters \\ 152 | elev\_range & decimal & Range of node elevations, meters \\ 153 | elev\_std & decimal & Standard deviation of node elevations, meters \\ 154 | grade\_mean & decimal & Mean absolute street grade (incline) \\ 155 | grade\_median & decimal & Median absolute street grade (incline) \\ 156 | intersect\_count & integer & Count of (undirected) edge intersections \\ 157 | intersect\_count\_clean & integer & Count of street intersections (merged within 10 meters geometrically) \\ 158 | intersect\_count\_clean\_topo & integer & Count of street intersections (merged within 10 meters topologically) \\ 159 | k\_avg & decimal & Average node degree (undirected) \\ 160 | length\_mean & decimal & Mean street segment length (undirected edges), meters \\ 161 | length\_median & decimal & Median street segment length (undirected edges), meters \\ 162 | length\_total & decimal & Total street length (undirected edges), meters \\ 163 | node\_count & integer & Count of nodes \\ 164 | orientation\_entropy & decimal & Entropy of street network bearings \\ 165 | pagerank\_max & decimal & The maximum PageRank value of any node \\ 166 | prop\_4way & decimal & Proportion of nodes that represent 4-way street intersections \\ 167 | prop\_3way & decimal & Proportion of nodes that represent 3-way street intersections \\ 168 | prop\_deadend & decimal & Proportion of nodes that represent dead-ends \\ 169 | resident\_pop & integer & Total resident population (GHSL) \\ 170 | self\_loop\_proportion & decimal & Proportion of edges that are self-loops \\ 171 | straightness & decimal & Inverse of circuity \\ 172 | street\_segment\_count & integer & Count of streets (undirected edges) \\ 173 | uc\_id & integer & Urban center unique ID (GHSL) \\ 174 | uc\_names & string & List of city names within this urban center (GHSL) \\ 175 | world\_region & string & UN SDG geographic region \\ 176 | \bottomrule 177 | \end{tabular} 178 | \end{table} 179 | 180 | \subsection{Indicator Calculation} 181 | 182 | For each graph, we calculate the various street network indicators described in Table~\ref{tab:indicators}.
These include geometric and topological measures common in transport planning, urban design, and statistical physics. We report node counts, intersection counts (i.e., non-dead-end nodes), and both geometrically and topologically consolidated intersection counts, using the algorithm described in \citet{boeing_topological_2025}. However, the most important contribution here is the calculation of node betweenness centrality for every node in every graph. A node's betweenness centrality measures the share of all possible shortest paths in a graph that use that node. High centrality values indicate \enquote{important} nodes relied on by many shortest paths. The maximum betweenness centrality represents the highest relative value in a graph (and thus identifies the most important node), and their Gini coefficient measures the concentration of importance in a network, indicating the presence and severity of chokepoints. 183 | 184 | \subsection{Data Repository Preparation} 185 | 186 | We convert each GraphML file to a GeoPackage and node/edge list files. The former allows users to work with these spatial networks in any GIS software. The latter provides a minimal, lightweight, highly compressible version of the models. Then we perform a series of file verification checks and create metadata files for the graphs' node and edge attributes and all of the indicators. Finally we compress and upload all model files (GeoPackages, GraphML, and node/edge lists), indicators, and metadata to the Harvard Dataverse. 187 | 188 | \section{Code and Data Products} 189 | 190 | \subsection{Code Repository} 191 | 192 | The preceding methods are fully reproducible by running the modeling and analytics workflow, which is publicly available in the source code repository\endnote{Code repository: https://github.com/gboeing/street-network-models} on Github. A well-equipped personal computer can execute this workflow, but given the resource requirements it may be better (and faster) to run it in a high-performance computing cluster, where available. The code is written in Python and is operating system agnostic. The input data, dependencies, and resources required to run it are documented in the repository's readme file. 193 | 194 | \subsection{Data Repository} 195 | 196 | The data repository comprises five datasets nested within a top-level Dataverse\endnote{Top-level Dataverse: https://dataverse.harvard.edu/dataverse/global-urban-street-networks} data repository: 197 | 198 | \begin{itemize} 199 | \item Global Urban Street Networks GeoPackages\endnote{Global Urban Street Networks GeoPackages: https://doi.org/10.7910/DVN/E5TPDQ} 200 | \item Global Urban Street Networks GraphML files\endnote{Global Urban Street Networks GraphML files: https://doi.org/10.7910/DVN/KA5HJ3} 201 | \item Global Urban Street Networks Node/Edge lists\endnote{Global Urban Street Networks Node/Edge lists: https://doi.org/10.7910/DVN/DC7U0A} 202 | \item Global Urban Street Networks Indicators \endnote{Global Urban Street Networks Indicators: https://doi.org/10.7910/DVN/ZTFPTB} 203 | \item Global Urban Street Networks Metadata \endnote{Global Urban Street Networks Metadata: https://doi.org/10.7910/DVN/WMPPF9} 204 | \end{itemize} 205 | 206 | The model files are zipped at the country level, and each file (and indicators row) is identified by its urban area name and UCD ID.\ The latter allows users to join them to GHSL attribute data. 
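For example, the indicators table can be re-joined to the full set of GHSL UCD attributes on this ID. A minimal sketch follows (assuming the indicators CSV and the UCD GeoPackage have been downloaded locally; the file paths are placeholders, while the \texttt{uc\_id}/\texttt{ID\_UC\_G0} join keys mirror the repository's merge script):

\begin{verbatim}
import geopandas as gpd
import pandas as pd

ind = pd.read_csv("indicators.csv")  # from the indicators dataset
ucs = gpd.read_file("GHS_UCDB_GLOBE_R2024A.gpkg")  # GHSL UCD release

# join street network indicators to GHSL urban center attributes
df = ind.merge(ucs, how="inner", left_on="uc_id", right_on="ID_UC_G0")
\end{verbatim}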
207 | 
208 | \section{Discussion: Lineage and Contribution}
209 | 
210 | In an era of rapid urbanization, scholars and practitioners need models and indicators that keep up with the pace of transformational urban change. This project builds on prior work initially conducted in 2019--2020 that generated a preliminary version of the data repository \citep{boeing_street_2022}. That initial version was based on the 2015 version of the GHSL UCD and 2020 OpenStreetMap data. This new version takes advantage of the intervening years of data and methodological advances, using the 2025 GHSL UCD and 2025 OpenStreetMap data to make six primary contributions.
211 | 
212 | First, it includes over 1,400 more urban areas and 11 more countries than the earlier version, substantially expanding worldwide coverage in an era of rapid urban expansion.
213 | 
214 | Second, these new models incorporate 10 years of recent urbanization in their updated urban area boundaries and 5 years of new community additions to OpenStreetMap. Accordingly, this workflow modeled approximately 20 million more street network nodes and 40 million more edges than the earlier version. The new urban boundaries allow users to link these street network models and indicators to hundreds of new, up-to-date GHSL attributes on urban climate, land use, economic conditions, and more.
215 | 
216 | Third, it adds new attributes and indicators to the repository---most consequentially the betweenness centrality of every node in every urban area's street network, which is extremely time- and resource-intensive to calculate, yet unlocks powerful analyses of network structure and resilience for urban science.
217 | 
218 | Fourth, it uses finer-grained SRTM data (30\,m resolution instead of the previous 90\,m) for more precise elevation attribute values.
219 | 
220 | Fifth, from a \textit{code product} perspective, the workflow's code base has been wholly refactored and rewritten from the ground up to significantly reduce its cyclomatic complexity, memory use, and runtime. This makes the workflow more maintainable and sustainable, and easier to re-run in the future to periodically update the data repository whenever new GHSL data are released.
221 | 
222 | Sixth, and finally, these models and indicators themselves unlock other researchers' work. This project provides a global dataset for conducting both worldwide urban street network science beyond limited samples and local analyses, particularly in less-resourced regions where such models and indicators are most needed yet most scarce.
223 | 
224 | \section{Getting Started}
225 | 
226 | To get started, users may download the models or indicators directly from the Dataverse datasets listed above, or access the source code and documentation in the GitHub repository noted earlier.
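For example, after unzipping a country-level archive, an individual urban area's model can be loaded and explored with OSMnx in a few lines of Python; the filename below is a placeholder for one of the extracted GraphML files.

\begin{verbatim}
# Sketch: load one downloaded street network model and inspect it.
# The filename is a placeholder for a GraphML file extracted from a
# country-level zip archive downloaded from the Dataverse.
import osmnx as ox

G = ox.load_graphml("example-urban-center.graphml")
print(len(G.nodes), len(G.edges))

# convert to GeoDataFrames for mapping or GIS-style analysis
nodes, edges = ox.graph_to_gdfs(G)
print(edges.head())
\end{verbatim}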
227 | 228 | % print the footnotes as endnotes, if any exist 229 | \IfFileExists{\jobname.ent}{\theendnotes}{} 230 | 231 | % print the bibliography 232 | \setlength{\bibsep}{0.00cm plus 0.05cm} % no space between items 233 | \bibliographystyle{apalike} 234 | \bibliography{references} 235 | 236 | \end{document} 237 | -------------------------------------------------------------------------------- /paper/latex/references.bib: -------------------------------------------------------------------------------- 1 | 2 | @Article{ barrington-leigh_global_2020, 3 | author = {Barrington-Leigh, Christopher and Millard-Ball, Adam}, 4 | journal = {Proceedings of the National Academy of Sciences}, 5 | month = jan, 6 | number = {4}, 7 | pages = {1941--1950}, 8 | title = {Global trends toward urban street-network sprawl}, 9 | volume = {117}, 10 | year = {2020}, 11 | doi = {10.1073/pnas.1905232116}, 12 | issn = {0027-8424, 1091-6490}, 13 | language = {en} 14 | } 15 | 16 | @Book{ barthelemy_spatial_2022, 17 | address = {Cham}, 18 | author = {Barthelemy, Marc}, 19 | publisher = {Springer International Publishing}, 20 | title = {Spatial {Networks}: {A} {Complete} {Introduction}}, 21 | year = {2022}, 22 | isbn = {978-3-030-94105-5 978-3-030-94106-2}, 23 | language = {en} 24 | } 25 | 26 | @InCollection{ fischer_spatial_2014, 27 | address = {Berlin, Germany}, 28 | author = {O'Sullivan, David}, 29 | booktitle = {Handbook of {Regional} {Science}}, 30 | editor = {Fischer, Manfred M. and Nijkamp, Peter}, 31 | pages = {1253--1273}, 32 | publisher = {Springer-Verlag}, 33 | title = {Spatial {Network} {Analysis}}, 34 | year = {2014}, 35 | isbn = {978-3-642-23429-3}, 36 | language = {en} 37 | } 38 | 39 | @Article{ giles-corti_creating_2022, 40 | author = {Giles-Corti, Billie and Moudon, Anne Vernez and Lowe, 41 | Melanie and Adlakha, Deepti and Cerin, Ester and Boeing, 42 | Geoff and Higgs, Carl and Arundel, Jonathan and Liu, Shiqin 43 | and Hinckson, Erica and Salvo, Deborah and Adams, Marc A 44 | and Badland, Hannah and Florindo, Alex A and Gebel, Klaus 45 | and Hunter, Ruth F and Mitáš, Josef and Oyeyemi, Adewale 46 | L and Puig-Ribera, Anna and Queralt, Ana and Santos, Maria 47 | Paula and Schipperijn, Jasper and Stevenson, Mark and Dyck, 48 | Delfien Van and Vich, Guillem and Sallis, James F}, 49 | journal = {The Lancet Global Health}, 50 | month = jun, 51 | number = {6}, 52 | pages = {e782--e785}, 53 | title = {Creating healthy and sustainable cities: what gets 54 | measured, gets done}, 55 | volume = {10}, 56 | year = {2022}, 57 | doi = {10.1016/S2214-109X(22)00070-5}, 58 | issn = {2214109X}, 59 | language = {en} 60 | } 61 | 62 | @Article{ liu_generalized_2022, 63 | author = {Liu, Shiqin and Higgs, Carl and Arundel, Jonathan and 64 | Boeing, Geoff and Cerdera, Nicholas and Moctezuma, David 65 | and Cerin, Ester and Adlakha, Deepti and Lowe, Melanie and 66 | Giles‐Corti, Billie}, 67 | journal = {Geographical Analysis}, 68 | month = jul, 69 | number = {3}, 70 | pages = {559--582}, 71 | title = {A {Generalized} {Framework} for {Measuring} {Pedestrian} 72 | {Accessibility} around the {World} {Using} {Open} {Data}}, 73 | volume = {54}, 74 | year = {2022}, 75 | doi = {10.1111/gean.12290}, 76 | issn = {0016-7363, 1538-4632}, 77 | language = {en} 78 | } 79 | 80 | @Article{ boeing_modeling_2025, 81 | author = {Boeing, Geoff}, 82 | journal = {Geographical Analysis}, 83 | volume = {published online ahead of print}, 84 | title = {Modeling and {Analyzing} {Urban} {Networks} and 85 | {Amenities} with {OSMnx}}, 86 | year = {2025} 
87 | }
88 | 
89 | @Misc{ mari_rivero_urban_2025,
90 |   address = {http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd},
91 |   author = {Mari Rivero, I. and Melchiorri, M. and Florio, P. and
92 |             Schiavina, M. and others},
93 |   publisher = {European Commission, Joint Research Centre (JRC)},
94 |   title = {Urban {Centre} {Database} 2025},
95 |   year = {2025},
96 |   doi = {10.2905/1A338BE6-7EAF-480C-9664-3A8ADE88CBCD},
97 |   url = {http://data.europa.eu/89h/1a338be6-7eaf-480c-9664-3a8ade88cbcd}
98 | }
99 | 
100 | 
101 | @Article{ boeing_topological_2025,
102 |   author = {Boeing, Geoff},
103 |   journal = {Transactions in GIS},
104 |   volume = {published online ahead of print},
105 |   title = {Topological {Graph} {Simplification} {Solutions} to the
106 |            {Street} {Intersection} {Miscount} {Problem}},
107 |   year = {2025}
108 | }
109 | 
110 | @Article{ boeing_street_2022,
111 |   author = {Boeing, Geoff},
112 |   journal = {Geographical Analysis},
113 |   month = jul,
114 |   number = {3},
115 |   pages = {519--535},
116 |   title = {Street {Network} {Models} and {Indicators} for {Every}
117 |            {Urban} {Area} in the {World}},
118 |   volume = {54},
119 |   year = {2022},
120 |   doi = {10.1111/gean.12281},
121 |   issn = {0016-7363, 1538-4632},
122 |   language = {en}
123 | }
124 | 
--------------------------------------------------------------------------------