├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md └── workflows │ ├── docs-build.yml │ ├── docs.yml │ ├── macos.yml │ ├── pypi.yml │ ├── ubuntu.yml │ └── windows.yml ├── .gitignore ├── Dockerfile.dev ├── LICENSE ├── MANIFEST.in ├── README.md ├── dev-container.sh ├── docs ├── changelog.md ├── common.md ├── contributing.md ├── examples │ ├── download_buildings.ipynb │ └── intro.ipynb ├── faq.md ├── index.md ├── installation.md ├── overrides │ └── main.html └── usage.md ├── mkdocs.yml ├── open_buildings ├── __init__.py ├── cli.py ├── common.py ├── download_buildings.py ├── google │ ├── __init.py__ │ ├── add_columns.py │ ├── partition.py │ ├── process.py │ └── stac-geoparquet.py ├── overture │ ├── __init.py__ │ ├── add_columns.py │ ├── partition.py │ └── places_add_columns.py └── settings.py ├── pytest.ini ├── requirements.txt ├── requirements_dev.txt ├── requirements_docs.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── test_open_buildings.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a bug report to help us improve 4 | labels: bug 5 | --- 6 | 7 | 8 | 9 | ### Environment Information 10 | 11 | - open_buildings version: 12 | - Python version: 13 | - Operating System: 14 | 15 | ### Description 16 | 17 | Describe what you were trying to get done. 18 | Tell us what happened, what went wrong, and what you expected to happen. 19 | 20 | ### What I Did 21 | 22 | ``` 23 | Paste the command(s) you ran and the output. 24 | If there was a crash, please include the traceback here. 25 | ``` 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Ask questions 3 | url: https://github.com/opengeos/open-buildings/discussions/categories/q-a 4 | about: Please ask and answer questions here. 5 | - name: Ideas 6 | url: https://github.com/opengeos/open-buildings/discussions/categories/ideas 7 | about: Please share your ideas here. 8 | - name: Ask questions from the GIS community 9 | url: https://gis.stackexchange.com 10 | about: To get answers from questions in the GIS community, please ask and answer questions here. 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Submit a feature request to help us improve 4 | labels: Feature Request 5 | --- 6 | 7 | 8 | 9 | ### Description 10 | 11 | Describe the feature (e.g., new functions/tutorials) you would like to propose. 12 | Tell us what can be achieved with this new feature and what's the expected outcome. 
13 | 14 | ### Source code 15 | 16 | ``` 17 | Paste your source code here if have sample code to share. 18 | ``` 19 | -------------------------------------------------------------------------------- /.github/workflows/docs-build.yml: -------------------------------------------------------------------------------- 1 | name: docs-build 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | with: 13 | fetch-depth: 0 14 | - uses: actions/setup-python@v4 15 | with: 16 | python-version: "3.10" 17 | - name: Install GDAL 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install --find-links=https://girder.github.io/large_image_wheels --no-cache GDAL pyproj 21 | - name: Test GDAL installation 22 | run: | 23 | python -c "from osgeo import gdal" 24 | gdalinfo --version 25 | - name: Install dependencies 26 | run: | 27 | pip install .[dev] 28 | - name: Discover typos with codespell 29 | run: codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,*.pdf,./.git" --ignore-words-list="aci,acount,acounts,fallow,hart,hist,nd,ned,ois,wqs" 30 | - name: PKG-TEST 31 | run: | 32 | python3 -m pytest . -n 4 33 | - name: Build docs 34 | run: | 35 | pip install -r requirements_docs.txt 36 | mkdocs build 37 | # - name: Deploy to Netlify 38 | # uses: nwtgck/actions-netlify@v2.0 39 | # with: 40 | # publish-dir: "./site" 41 | # production-branch: master 42 | # github-token: ${{ secrets.GITHUB_TOKEN }} 43 | # deploy-message: "Deploy from GitHub Actions" 44 | # enable-pull-request-comment: true 45 | # enable-commit-comment: false 46 | # overwrites-pull-request-comment: true 47 | # env: 48 | # NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 49 | # NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 50 | # timeout-minutes: 10 51 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.9 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install --user --no-cache-dir Cython 18 | pip install .[dev] 19 | - name: Discover typos with codespell 20 | run: | 21 | pip install codespell 22 | codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,./.git" --ignore-words-list="aci,acount,acounts,fallow,hart,hist,nd,ned,ois,wqs,watermask" 23 | - name: PKG-TEST 24 | run: | 25 | python3 -m pytest . 
-n 4 26 | - run: pip install -r requirements_docs.txt 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /.github/workflows/macos.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: macOS build 10 | jobs: 11 | test-macOS: 12 | runs-on: ${{ matrix.os }} 13 | name: ${{ matrix.os }} (${{ matrix.python-version}}) 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: ["macOS-latest"] 18 | python-version: ["3.10"] 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v3 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version}} 27 | - name: Install GDAL 28 | run: | 29 | brew install gdal 30 | - name: Test GDAL installation 31 | run: | 32 | gdalinfo --version 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install --no-cache-dir Cython 37 | pip install -r requirements.txt 38 | pip install . 39 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: pypi 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: "3.x" 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 | - name: Build and publish 25 | env: 26 | TWINE_USERNAME: ${{ secrets.PYPI_USERS }} 27 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 28 | run: | 29 | python setup.py sdist bdist_wheel 30 | twine upload dist/* 31 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: Linux build 10 | jobs: 11 | py-check: 12 | runs-on: ${{ matrix.config.os }} 13 | name: ${{ matrix.config.os }} (${{ matrix.config.py }}) 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | config: 18 | - { os: ubuntu-latest, py: "3.8" } 19 | - { os: ubuntu-latest, py: "3.9" } 20 | - { os: ubuntu-latest, py: "3.10" } 21 | - { os: ubuntu-latest, py: "3.11" } 22 | steps: 23 | - name: Checkout Code 24 | uses: actions/checkout@v3 25 | - name: Setup Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.config.py }} 29 | - name: Install GDAL 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install --no-cache-dir Cython 33 | pip install --find-links=https://girder.github.io/large_image_wheels --no-cache GDAL 34 | - name: Test GDAL installation 35 | run: | 36 | python -c "from osgeo import gdal" 37 | gdalinfo --version 38 | - name: Install dependencies 39 | run: | 40 | pip install .[dev] 41 | - name: PKG-TEST 42 | run: | 43 | python3 -m pytest . 
-n 4 -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: Windows build 10 | jobs: 11 | test-windows: 12 | runs-on: windows-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Install miniconda 16 | uses: conda-incubator/setup-miniconda@v2 17 | with: 18 | auto-activate-base: true 19 | python-version: "3.10" 20 | - name: Install GDAL 21 | run: conda install -c conda-forge gdal --yes 22 | - name: Test GDAL installation 23 | run: | 24 | python -c "from osgeo import gdal" 25 | gdalinfo --version 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install --no-cache-dir Cython 30 | pip install -r requirements.txt 31 | pip install . 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | private/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | cache 8 | issues.txt 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # IDE settings 109 | .vscode/ -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye 2 | 3 | ARG USER 4 | ARG UID 5 | 6 | # install git 7 | # create current user in container and link it to host UID 8 | RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* \ 9 | && useradd -u $UID $USER -p '' -l -m && chown $UID /home/$USER 10 | 11 | # $PWD needs to be mounted to /workspace on run time 12 | WORKDIR /workspace 13 | COPY . . 
14 | 15 | # install package in editable mode, install [dev] dependencies (see setup.py -> extras_require arg) 16 | RUN pip install -e '.[dev]' 17 | 18 | USER $USER -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache Software License 2.0 2 | 3 | Copyright (c) 2023, Chris Holmes 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | 5 | recursive-exclude * __pycache__ 6 | recursive-exclude * *.py[co] 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # open-buildings 2 | 3 | [![image](https://img.shields.io/pypi/v/open_buildings.svg)](https://pypi.python.org/pypi/open_buildings) 4 | 5 | **Tools for working with open building datasets** 6 | 7 | ***Warning**: This code has fallen out of maintenance, and isn't keeping up with the latest release. [Overture](https://overturemaps.org) has incorporated 8 | the Google and Microsoft datasets, and has fully adopted GeoParquet & partitioned it, and provides a [nice cli](https://github.com/OvertureMaps/overturemaps-py) to download data. So I'd recommend using that, but will keep this code up for anyone interested.* 9 | 10 | - Free software: Apache Software License 2.0 11 | - Documentation: 12 | - Creator: [Chris Holmes](https://github.com/cholmes) 13 | 14 | ## Introduction 15 | 16 | This repo is intended to be a set of useful scripts for getting and converting Open Building Datasets using [Cloud Native Geospatial](https://cloudnativegeo.org) formats. 17 | Initially the focus is on Google's [Open Buildings](https://sites.research.google/open-buildings/) dataset and Overture's building dataset. 18 | 19 | The main tool that most people will be interested in is the `get_buildings` command, that 20 | lets you supply a GeoJSON file to a command-line interface and it'll download all buildings 21 | in the area supplied, output in common GIS formats (GeoPackage, FlatGeobuf, Shapefile, GeoJSON and GeoParquet). 22 | 23 | The tool works by leveraging partitioned [GeoParquet](https://geoparquet.org) files, using [DuckDB](https://duckdb.org) 24 | to just query exactly what is needed. This is done without any server - DuckDB on your computer queries, filters and downloads 25 | just the rows that you want. Right now you can query two datasets, that live on [Source Cooperative](https://beta.source.coop), see [here for Google](https://beta.source.coop/cholmes/google-open-buildings) and [here for Overture](https://beta.source.coop/cholmes/overture/).
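To make that concrete, here is a minimal sketch of the kind of query DuckDB can run directly against remote partitioned GeoParquet. It is illustrative only - the S3 path, partition layout and column names are assumptions for the example, not the actual layout of the Source Cooperative datasets or the exact SQL that `get_buildings` runs:

```python
# Illustrative sketch only: the bucket path, partition layout and column names
# below are assumptions, not the real layout of the hosted datasets.
import duckdb

con = duckdb.connect()
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")
con.execute("INSTALL spatial;")
con.execute("LOAD spatial;")

# Area of interest as WKT (a small box in Rwanda). Pointing the read at the
# country_iso=RW partition is what lets the engine skip every other country.
aoi = "POLYGON((30.0 -2.0, 30.2 -2.0, 30.2 -1.8, 30.0 -1.8, 30.0 -2.0))"

con.execute(f"""
    COPY (
        SELECT *
        FROM read_parquet('s3://example-bucket/buildings/country_iso=RW/*.parquet')
        WHERE ST_Intersects(geometry, ST_GeomFromText('{aoi}'))
    ) TO 'my-buildings.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON');
""")
```

Because the path already narrows the scan to one country partition and DuckDB reads Parquet over HTTP range requests, only a small slice of the remote data actually gets downloaded.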
The rest of the CLI's and scripts were used to create those datasets, with some 26 | additions for benchmarking performance. 27 | 28 | This is basically my first Python project, and certainly my first open source one. It is only possible due to ChatGPT, as I'm not a python 29 | programmer, and not a great programmer in general (coded professionally for about 2 years, then shifted to doing lots of other stuff). So 30 | it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others. And contributions are welcome! 31 | I'm working on making the issue tracker accessible, so anyone who wants to try out some open source coding can jump in. 32 | 33 | ## Installation 34 | 35 | Install with pip: 36 | 37 | ```bash 38 | pip install open-buildings 39 | ``` 40 | 41 | This should add a CLI that you can then use. If it's working then: 42 | 43 | ```bash 44 | ob 45 | ``` 46 | 47 | Will print out a help message. You then will be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 48 | 49 | 50 | ```bash 51 | ob tools get_buildings 1.json --dst my-buildings.geojson --country_iso RW 52 | ``` 53 | 54 | You can also stream the json in directly in one line: 55 | 56 | ``` 57 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - --dst my-buildings.geojson --country_iso RW 58 | ``` 59 | 60 | 61 | ## Functionality 62 | 63 | ### get_buildings 64 | 65 | The main tool for most people is `get_buildings`. It queries complete global 66 | building datasets for the GeoJSON provided, outputting results in common geospatial formats. The 67 | full options and explanation can be found in the `--help` command: 68 | 69 | ``` 70 | % ob get_buildings --help 71 | Usage: ob get_buildings [OPTIONS] [GEOJSON_INPUT] [DST] 72 | 73 | Tool to extract buildings in common geospatial formats from large archives 74 | of GeoParquet data online. GeoJSON input can be provided as a file or piped 75 | in from stdin. If no GeoJSON input is provided, the tool will read from 76 | stdin. 77 | 78 | Right now the tool supports two sources of data: Google and Overture. The 79 | data comes from Cloud-Native Geospatial distributions on 80 | https://source.coop, that are partitioned by admin boundaries and use a 81 | quadkey for the spatial index. In time this tool will generalize to support 82 | any admin boundary partitioned GeoParquet data, but for now it is limited to 83 | the Google and Overture datasets. 84 | 85 | The default output is GeoJSON, in a file called buildings.json. Changing the 86 | suffix will change the output format - .shp for shapefile .gpkg for 87 | GeoPackage, .fgb for FlatGeobuf and .parquet for GeoParquet, and .json or 88 | .geojson for GeoJSON. If your query is all within one country it is strongly 89 | recommended to use country_iso to hint to the query engine which country to 90 | query, as this will speed up the query significantly (5-10x). Expect query 91 | times of 5-10 seconds for queries with country_iso and 30-60 seconds 92 | without country_iso. 93 | 94 | You can look up the country_iso for a country here: 95 | https://github.com/lukes/ISO-3166-Countries-with-Regional- 96 | Codes/blob/master/all/all.csv If you get the country wrong you will get zero 97 | results. Currently you can only query one country, so if your query crosses 98 | country boundaries you should not use country_iso. In future versions of 99 | this tool we hope to eliminate the need to hint with the country_iso.
100 | 101 | Options: 102 | --dst TEXT The path to write the output to. Can be a 103 | directory or file. 104 | --location TEXT Use city or region name instead of providing an 105 | AOI as file. 106 | --source [google|overture] Dataset to query, defaults to Overture 107 | --country_iso TEXT A 2 character country ISO code to filter the 108 | data by. 109 | -s, --silent Suppress all print outputs. 110 | --overwrite Overwrite the destination file if it already 111 | exists. 112 | -v, --verbose Print detailed logs with timestamps. 113 | --help Show this message and exit. 114 | ``` 115 | 116 | Note that the `get_buildings` operation is not very robust, there are likely a number of ways to break it. #13 117 | is used to track it, but if you have any problems please report them in the [issue tracker](https://github.com/opengeos/open-buildings/issues) 118 | to help guide how we improve it. 119 | 120 | We do hope to eliminate the need to supply an iso_country for fast querying, see #29 for that tracking issue. We also 121 | hope to add more building datasets, starting with the [Google-Microsoft Open Buildings by VIDA](https://beta.source.coop/vida/google-microsoft-open-buildings/geoparquet/by_country_s2), 122 | see #26 for more info. 123 | 124 | ### Google Building processings 125 | 126 | In the google portion of the CLI there are two functions: 127 | 128 | - `convert` takes as input either a single CSV file or a directory of CSV files, downloaded locally from the Google Buildings dataset. It can write out as GeoParquet, FlatGeobuf, GeoPackage and Shapefile, and can process the data using DuckDB, GeoPandas or OGR. 129 | - `benchmark` runs the convert command against one or more different formats, and one or more different processes, and reports out how long each took. 130 | 131 | A sample output for `benchmark`, run on 219_buildings.csv, a 101 mb CSV file is: 132 | 133 | ``` 134 | Table for file: 219_buildings.csv 135 | ╒═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕ 136 | │ process │ fgb │ gpkg │ parquet │ shp │ 137 | ╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡ 138 | │ duckdb │ 00:02.330 │ 00:00.000 │ 00:01.866 │ 00:03.119 │ 139 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 140 | │ ogr │ 00:02.034 │ 00:07.456 │ 00:01.423 │ 00:02.491 │ 141 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 142 | │ pandas │ 00:18.184 │ 00:24.096 │ 00:02.710 │ 00:20.032 │ 143 | ╘═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛ 144 | ``` 145 | 146 | The full options can be found with `--help` after each command, and I'll put them here for reference: 147 | 148 | ``` 149 | Usage: open_buildings convert [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 150 | 151 | Converts a CSV or a directory of CSV's to an alternate format. Input CSV's 152 | are assumed to be from Google's Open Buildings 153 | 154 | Options: 155 | --format [fgb|parquet|gpkg|shp] 156 | The output format. The default is FlatGeobuf (fgb) 157 | --overwrite Whether to overwrite any existing output files. 158 | --process [duckdb|pandas|ogr] The processing method to use. The default is 159 | pandas. 160 | --skip-split-multis Whether to keep multipolygons as they are 161 | without splitting into their component polygons. 162 | --verbose Whether to print detailed processing 163 | information. 164 | --help Show this message and exit. 
165 | ``` 166 | 167 | ``` 168 | Usage: open_buildings benchmark [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 169 | 170 | Runs the convert function on each of the supplied processes and formats, 171 | printing the timing of each as a table 172 | 173 | Options: 174 | --processes TEXT The processing methods to use. One or more of duckdb, 175 | pandas or ogr, in a comma-separated list. Default is 176 | duckdb,pandas,ogr. 177 | --formats TEXT The output formats to benchmark. One or more of fgb, 178 | parquet, shp or gpkg, in a comma-separated list. 179 | Default is fgb,parquet,shp,gpkg. 180 | --skip-split-multis Whether to keep multipolygons as they are without 181 | splitting into their component polygons. 182 | --no-gpq Disable GPQ conversion. Timing will be faster, but not 183 | valid GeoParquet (until DuckDB adds support) 184 | --verbose Whether to print detailed processing information. 185 | --output-format TEXT The format of the output. Options: ascii, csv, json, 186 | chart. 187 | --help Show this message and exit. 188 | ``` 189 | 190 | **Warning** - note that `--no-gpq` doesn't actually work right now, see https://github.com/opengeos/open-buildings/issues/4 to track. It is just always set to true, so DuckDB times with Parquet will be inflated (you can change it in the Python code via a global variable). Note also that the `ogr` process does not work with `--skip-split-multis`, but will just report very minimal times since it skips doing anything, see https://github.com/opengeos/open-buildings/issues/5 to track. 191 | 192 | #### Format Notes 193 | 194 | I'm mostly focused on GeoParquet and FlatGeobuf, as good cloud-native geo formats. I included GeoPackage and Shapefile mostly for benchmarking purposes. GeoPackage I think is a good option for Esri and other more legacy software that is slow to adopt new formats. Shapefile is total crap for this use case - it fails on files bigger than 4 gigabytes, and lots of the source S2 Google Building CSV's are bigger, so it's not useful for translating. The truncation of field names is also annoying, since the CSV file didn't try to make short names (nor should it, the limit is silly). 195 | 196 | GeoPackage is particularly slow with DuckDB - it's likely got a bit of a bug in it. But it works well with Pandas and OGR. 197 | 198 | ## Process Notes 199 | 200 | When I was processing V2 of the Google Buildings dataset I did most of the initial work with GeoPandas, which was awesome, and has the best GeoParquet implementation. But the size of the data made its all-in-memory processing untenable. I ended up using PostGIS a decent bit, but near the end of that process I discovered DuckDB, and was blown away by its speed and ability to manage memory well. So for this tool I was mostly focused on those two. 201 | 202 | Note also that currently DuckDB fgb, gpkg and shp output don't include projection information, so if you want to use the output then you'd need to run ogr2ogr on the output. It sounds like that may get fixed pretty soon, so I'm not going to add a step that includes the ogr conversion. 203 | 204 | OGR was added later, and does not yet do the key step of splitting multi-polygons, since it's just using ogr2ogr as a sub-process and I've yet to find a way to do that from the CLI (though knowing GDAL/OGR there probably is one - please let me know). To run the benchmark with it you need to do --skip-split-multis or else the times on it will be 0 (except for Shapefile, since it doesn't differentiate between multipolygons and regular polygons).
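On splitting multi-polygons straight from the ogr2ogr CLI: one candidate worth testing (untested here, and not something this package currently wires up) is ogr2ogr's `-explodecollections` flag, which writes one feature per part of a multi-part geometry. Run as a subprocess it might look roughly like this - paths are placeholders, and the open options the convert command uses to parse the CSV geometry column are omitted:

```python
# Untested sketch: relies on ogr2ogr's -explodecollections flag to split
# multi-part geometries into one feature per part. Paths are placeholders and
# the CSV geometry-parsing options used by the real convert command are omitted.
import subprocess

subprocess.run(
    [
        "ogr2ogr",
        "-f", "FlatGeobuf",      # output driver
        "-explodecollections",   # one output feature per polygon part
        "219_buildings.fgb",     # destination file (placeholder)
        "219_buildings.csv",     # source Google Open Buildings CSV (placeholder)
    ],
    check=True,
)
```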
I hope to add that functionality and get it on par, which may mean using Fiona. But it seems like that may affect performance, since Fiona doesn't use the [GDAL/OGR column-oriented API](https://gdal.org/development/rfc/rfc86_column_oriented_api.html). 205 | 206 | ### Code customizations 207 | 208 | There are 3 options that you can set as global variables in the Python code, but are not yet CLI options. These are: 209 | 210 | * `RUN_GPQ_CONVERSION` - whether GeoParquet from DuckDB by default runs [gpq](https://github.com/planetlabs/gpq) on the DuckDB Parquet output, which adds a good chunk of processing time. This makes it so the DuckDB processing output is slower than it would be if DuckDB natively wrote GeoParquet metadata, which I believe is on their roadmap. So that will likely emerge as the fastest benchmark time. In the code you can set `RUN_GPQ_CONVERSION` in the python code to false if you want to get a sense of it. In the above benchmark running the Parquet with DuckDB without GPQ conversion at the end resulted in a time of .76 seconds. 211 | * `PARQUET_COMPRESSION` - which compression to use for Parquet encoding. Note that not all processes support all compression options, and also the OGR converter currently ignores this option. 212 | * `SKIP_DUCK_GPKG` - whether to skip the GeoPackage conversion option on DuckDB, since it takes a long time to run. 213 | 214 | ## Contributing 215 | 216 | All contributions are welcome, I love running open source projects. I'm clearly just learning to code Python, so there's no judgement about crappy code. And I'm super happy to learn from others about better code. Feel free to sound in on [the issues](https://github.com/opengeos/open-buildings/issues), make new ones, grab one, or make a PR. There's lots of low hanging fruit of things to add. And if you're just starting out programming don't hesitate to ask even basic things in the [discussions](https://github.com/opengeos/open-buildings/discussions). 217 | -------------------------------------------------------------------------------- /dev-container.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ################################################################# 4 | # # 5 | # Simple Bash script to simplify development in a container. # 6 | # Builds Dockerfile.dev if it does not already exists. # 7 | # The CWD is mounted in the container. # 8 | # # 9 | # Use --rebuild flag to force rebuild of the image even if # 10 | # (a potentially older version of) the image already exists. # 11 | # This is necessary if e.g. the dependencies are updated. # 12 | # # 13 | ################################################################# 14 | 15 | 16 | rebuild=$([[ $1 == '--rebuild' ]] && echo true || echo false) 17 | 18 | if [[ $rebuild == true ]] 19 | then 20 | docker build --build-arg UID=$UID --build-arg USER=$USER -t ob-dev -f Dockerfile.dev . 21 | else 22 | docker inspect --type=image ob-dev &> /dev/null || { 23 | echo "Image doesn't exist locally, building ..."; 24 | docker build --build-arg UID=$UID --build-arg USER=$USER -t ob-dev -f Dockerfile.dev . 
25 | } 26 | fi 27 | 28 | docker run -it --workdir /workspace -v $PWD:/workspace -v $HOME/.gitconfig:$HOME/.gitconfig -v $HOME/.ssh:$HOME/.ssh ob-dev bash -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.0.1 - Date 4 | 5 | **Improvement**: 6 | 7 | - TBD 8 | 9 | **New Features**: 10 | 11 | - TBD 12 | -------------------------------------------------------------------------------- /docs/common.md: -------------------------------------------------------------------------------- 1 | # common module 2 | 3 | ::: open_buildings.common -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at . 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in troubleshooting. 18 | - Detailed steps to reproduce the bug. 19 | 20 | ### Fix Bugs 21 | 22 | Look through the GitHub issues for bugs. Anything tagged with `bug` and 23 | `help wanted` is open to whoever wants to implement it. 24 | 25 | ### Implement Features 26 | 27 | Look through the GitHub issues for features. Anything tagged with 28 | `enhancement` and `help wanted` is open to whoever wants to implement it. 29 | 30 | ### Write Documentation 31 | 32 | open-buildings could always use more documentation, 33 | whether as part of the official open-buildings docs, 34 | in docstrings, or even on the web in blog posts, articles, and such. 35 | 36 | ### Submit Feedback 37 | 38 | The best way to send feedback is to file an issue at 39 | . 40 | 41 | If you are proposing a feature: 42 | 43 | - Explain in detail how it would work. 44 | - Keep the scope as narrow as possible, to make it easier to implement. 45 | - Remember that this is a volunteer-driven project, and that contributions are welcome :) 46 | 47 | ## Get Started! 48 | 49 | Ready to contribute? Here's how to set up open-buildings for local development. 50 | 51 | 1. Fork the open-buildings repo on GitHub. 52 | 53 | 2. Clone your fork locally: 54 | 55 | ```shell 56 | $ git clone git@github.com:your_name_here/open-buildings.git 57 | ``` 58 | 59 | 3. Install your local copy into a virtualenv. Assuming you have 60 | virtualenvwrapper installed, this is how you set up your fork for 61 | local development: 62 | 63 | ```shell 64 | $ mkvirtualenv open-buildings 65 | $ cd open-buildings/ 66 | $ python setup.py develop 67 | ``` 68 | 69 | 4. Create a branch for local development: 70 | 71 | ```shell 72 | $ git checkout -b name-of-your-bugfix-or-feature 73 | ``` 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 78 | and the tests, including testing other Python versions with tox: 79 | 80 | ```shell 81 | $ flake8 open-buildings tests 82 | $ python setup.py test or pytest 83 | $ tox 84 | ``` 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub: 89 | 90 | ```shell 91 | $ git add . 
92 | $ git commit -m "Your detailed description of your changes." 93 | $ git push origin name-of-your-bugfix-or-feature 94 | ``` 95 | 96 | 7. Submit a pull request through the GitHub website. 97 | 98 | ## Pull Request Guidelines 99 | 100 | Before you submit a pull request, check that it meets these guidelines: 101 | 102 | 1. The pull request should include tests. 103 | 2. If the pull request adds functionality, the docs should be updated. 104 | Put your new functionality into a function with a docstring, and add 105 | the feature to the list in README.rst. 106 | 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and 107 | for PyPy. Check and make sure that the tests pass for all 108 | supported Python versions. 109 | -------------------------------------------------------------------------------- /docs/examples/download_buildings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %pip install open-buildings" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Import libraries" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import leafmap.foliumap as leafmap\n", 27 | "import geopandas as gpd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Read the tile geojson." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "url = 'https://sites.research.google/open-buildings/tiles.geojson'\n", 44 | "gdf = gpd.read_file(url)\n", 45 | "gdf.sort_values(by='size_mb', ascending=True, inplace=True)\n", 46 | "gdf.head()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "print(f\"Number of tiles: {len(gdf)}\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "m = leafmap.Map()\n", 65 | "m.add_gdf(gdf, layer_name=\"Open Buildings\")\n", 66 | "m" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Get the tile URLs." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "urls = gdf['tile_url'].tolist()\n", 83 | "urls[:5]" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Specify the output directory." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "out_dir = os.path.expanduser('~/Downloads/')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Downloading all the tiles might take a while. Let's download only the first 10 tiles. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "leafmap.download_files(urls[:10], out_dir=out_dir)" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "geo", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.10.9" 136 | }, 137 | "orig_nbformat": 4 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /docs/examples/intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import open_buildings as ob" 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "geo", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.10.9" 30 | }, 31 | "orig_nbformat": 4 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 2 35 | } 36 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # open-buildings 2 | 3 | [![image](https://img.shields.io/pypi/v/open_buildings.svg)](https://pypi.python.org/pypi/open_buildings) 4 | 5 | **Tools for working with open building datasets** 6 | 7 | - Free software: Apache Software License 2.0 8 | - Documentation: 9 | - Creator: [Chris Holmes](https://github.com/cholmes) 10 | 11 | ## Introduction 12 | 13 | This repo is intended to be a set of useful scripts for working with Open Building Datasets, Initially Google's [Open Buildings](https://sites.research.google/open-buildings/) 14 | dataset and Overture's building dataset, specifically to help translate them into [Cloud Native Geospatial](https://cloudnativegeo.org) formats and then use those. The outputs will live 15 | on , [here for Google](https://beta.source.coop/cholmes/google-open-buildings) and [here for Overture](https://beta.source.coop/cholmes/overture/) so most people can just make use of those directly. 16 | 17 | The main operation that most people will be interested in is the 'get-buildings' command, that 18 | lets you supply a GeoJSON file to a command-line interface and it'll download all buildings 19 | in the area supplied, output in common GIS formats (GeoPackage, FlatGeobuf, Shapefile, GeoJSON and GeoParquet). 20 | 21 | The rest of the CLI's and scripts are intended to show the process of transforming the data, 22 | and then they've expanded to be a way to benchmark performance. 23 | 24 | This is basically my first Python project, and certainly my first open source one. 
It is only possible due to ChatGPT, as I'm not a python 25 | programmer, and not a great programmer in general (coded professionally for about 2 years, then shifted to doing lots of other stuff). So 26 | it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others. And contributions are welcome! I'm working on making the issue tracker accessible, so anyone who wants to try out some open source coding can jump in. 27 | 28 | ## Installation 29 | 30 | Install with pip: 31 | 32 | ```bash 33 | pip install open-buildings 34 | ``` 35 | 36 | This should add a CLI that you can then use. If it's working then: 37 | 38 | ```bash 39 | ob 40 | ``` 41 | 42 | Should print out a help message. You then should be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 43 | 44 | 45 | ```bash 46 | ob tools get_buildings 1.json my-buildings.geojson --country_iso RW 47 | ``` 48 | 49 | You can also stream the json in directly in one line: 50 | 51 | ``` 52 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - my-buildings.geojson --country_iso RW 53 | ``` 54 | 55 | 56 | ## Functionality 57 | 58 | ### get_buildings 59 | 60 | The main tool for most people is `get_buildings`. It queries complete global 61 | building datasets for the GeoJSON provided, outputting results in common geospatial formats. The 62 | full options and explanation can be found in the `--help` command: 63 | 64 | ``` 65 | % ob get_buildings --help 66 | Usage: ob get_buildings [OPTIONS] [GEOJSON_INPUT] [DST] 67 | 68 | Tool to extract buildings in common geospatial formats from large archives 69 | of GeoParquet data online. GeoJSON input can be provided as a file or piped 70 | in from stdin. If no GeoJSON input is provided, the tool will read from 71 | stdin. 72 | 73 | Right now the tool supports two sources of data: Google and Overture. The 74 | data comes from Cloud-Native Geospatial distributions on 75 | https://source.coop, that are partitioned by admin boundaries and use a 76 | quadkey for the spatial index. In time this tool will generalize to support 77 | any admin boundary partitioned GeoParquet data, but for now it is limited to 78 | the Google and Overture datasets. 79 | 80 | The default output is GeoJSON, in a file called buildings.json. Changing the 81 | suffix will change the output format - .shp for shapefile .gpkg for 82 | GeoPackage, .fgb for FlatGeobuf and .parquet for GeoParquet, and .json or 83 | .geojson for GeoJSON. If your query is all within one country it is strongly 84 | recommended to use country_iso to hint to the query engine which country to 85 | query, as this will speed up the query significantly (5-10x). Expect query 86 | times of 5-10 seconds for queries with country_iso and 30-60 seconds 87 | without country_iso. 88 | 89 | You can look up the country_iso for a country here: 90 | https://github.com/lukes/ISO-3166-Countries-with-Regional- 91 | Codes/blob/master/all/all.csv If you get the country wrong you will get zero 92 | results. Currently you can only query one country, so if your query crosses 93 | country boundaries you should not use country_iso. In future versions of 94 | this tool we hope to eliminate the need to hint with the country_iso. 95 | 96 | Options: 97 | --source [google|overture] Dataset to query, defaults to Overture 98 | --country_iso TEXT A 2 character country ISO code to filter the 99 | data by. 100 | -s, --silent Suppress all print outputs. 
101 | --overwrite Overwrite the destination file if it already 102 | exists. 103 | --verbose Print detailed logs with timestamps. 104 | --help Show this message and exit. 105 | ``` 106 | 107 | ### Google Building processings 108 | 109 | In the google portion of the CLI there are two functions: 110 | 111 | - `convert` takes as input either a single CSV file or a directory of CSV files, downloaded locally from the Google Buildings dataset. It can write out as GeoParquet, FlatGeobuf, GeoPackage and Shapefile, and can process the data using DuckDB, GeoPandas or OGR. 112 | - `benchmark` runs the convert command against one or more different formats, and one or more different processes, and reports out how long each took. 113 | 114 | A sample output for `benchmark`, run on 219_buildings.csv, a 101 mb CSV file is: 115 | 116 | ``` 117 | Table for file: 219_buildings.csv 118 | ╒═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕ 119 | │ process │ fgb │ gpkg │ parquet │ shp │ 120 | ╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡ 121 | │ duckdb │ 00:02.330 │ 00:00.000 │ 00:01.866 │ 00:03.119 │ 122 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 123 | │ ogr │ 00:02.034 │ 00:07.456 │ 00:01.423 │ 00:02.491 │ 124 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 125 | │ pandas │ 00:18.184 │ 00:24.096 │ 00:02.710 │ 00:20.032 │ 126 | ╘═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛ 127 | ``` 128 | 129 | The full options can be found with `--help` after each command, and I'll put them here for reference: 130 | 131 | ``` 132 | Usage: open_buildings convert [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 133 | 134 | Converts a CSV or a directory of CSV's to an alternate format. Input CSV's 135 | are assumed to be from Google's Open Buildings 136 | 137 | Options: 138 | --format [fgb|parquet|gpkg|shp] 139 | The output format. The default is FlatGeobuf (fgb) 140 | --overwrite Whether to overwrite any existing output files. 141 | --process [duckdb|pandas|ogr] The processing method to use. The default is 142 | pandas. 143 | --skip-split-multis Whether to keep multipolygons as they are 144 | without splitting into their component polygons. 145 | --verbose Whether to print detailed processing 146 | information. 147 | --help Show this message and exit. 148 | ``` 149 | 150 | ``` 151 | Usage: open_buildings benchmark [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 152 | 153 | Runs the convert function on each of the supplied processes and formats, 154 | printing the timing of each as a table 155 | 156 | Options: 157 | --processes TEXT The processing methods to use. One or more of duckdb, 158 | pandas or ogr, in a comma-separated list. Default is 159 | duckdb,pandas,ogr. 160 | --formats TEXT The output formats to benchmark. One or more of fgb, 161 | parquet, shp or gpkg, in a comma-separated list. 162 | Default is fgb,parquet,shp,gpkg. 163 | --skip-split-multis Whether to keep multipolygons as they are without 164 | splitting into their component polygons. 165 | --no-gpq Disable GPQ conversion. Timing will be faster, but not 166 | valid GeoParquet (until DuckDB adds support) 167 | --verbose Whether to print detailed processing information. 168 | --output-format TEXT The format of the output. Options: ascii, csv, json, 169 | chart. 170 | --help Show this message and exit. 171 | ``` 172 | 173 | **Warning** - note that `--no-gpq` doesn't actually work right now, see https://github.com/opengeos/open-buildings/issues/4 to track. 
It is just always set to true, so DuckDB times with Parquet will be inflated (you can change it in the Python code via a global variable). Note also that the `ogr` process does not work with `--skip-split-multis`, but will just report very minimal times since it skips doing anything, see https://github.com/opengeos/open-buildings/issues/5 to track. 174 | 175 | #### Format Notes 176 | 177 | I'm mostly focused on GeoParquet and FlatGeobuf, as good cloud-native geo formats. I included GeoPackage and Shapefile mostly for benchmarking purposes. GeoPackage I think is a good option for Esri and other more legacy software that is slow to adopt new formats. Shapefile is total crap for this use case - it fails on files bigger than 4 gigabytes, and lots of the source S2 Google Building CSV's are bigger, so it's not useful for translating. The truncation of field names is also annoying, since the CSV file didn't try to make short names (nor should it, the limit is silly). 178 | 179 | GeoPackage is particularly slow with DuckDB - it's likely got a bit of a bug in it. But it works well with Pandas and OGR. 180 | 181 | ## Process Notes 182 | 183 | When I was processing V2 of the Google Buildings dataset I did most of the initial work with GeoPandas, which was awesome, and has the best GeoParquet implementation. But the size of the data made its all-in-memory processing untenable. I ended up using PostGIS a decent bit, but near the end of that process I discovered DuckDB, and was blown away by its speed and ability to manage memory well. So for this tool I was mostly focused on those two. 184 | 185 | Note also that currently DuckDB fgb, gpkg and shp output don't include projection information, so if you want to use the output then you'd need to run ogr2ogr on the output. It sounds like that may get fixed pretty soon, so I'm not going to add a step that includes the ogr conversion. 186 | 187 | OGR was added later, and does not yet do the key step of splitting multi-polygons, since it's just using ogr2ogr as a sub-process and I've yet to find a way to do that from the CLI (though knowing GDAL/OGR there probably is one - please let me know). To run the benchmark with it you need to do --skip-split-multis or else the times on it will be 0 (except for Shapefile, since it doesn't differentiate between multipolygons and regular polygons). I hope to add that functionality and get it on par, which may mean using Fiona. But it seems like that may affect performance, since Fiona doesn't use the [GDAL/OGR column-oriented API](https://gdal.org/development/rfc/rfc86_column_oriented_api.html). 188 | 189 | ### Code customizations 190 | 191 | There are 3 options that you can set as global variables in the Python code, but are not yet CLI options. These are: 192 | 193 | * `RUN_GPQ_CONVERSION` - whether GeoParquet from DuckDB by default runs [gpq](https://github.com/planetlabs/gpq) on the DuckDB Parquet output, which adds a good chunk of processing time. This makes it so the DuckDB processing output is slower than it would be if DuckDB natively wrote GeoParquet metadata, which I believe is on their roadmap. So that will likely emerge as the fastest benchmark time. In the code you can set `RUN_GPQ_CONVERSION` in the python code to false if you want to get a sense of it. In the above benchmark running the Parquet with DuckDB without GPQ conversion at the end resulted in a time of .76 seconds. 194 | * `PARQUET_COMPRESSION` - which compression to use for Parquet encoding. 
Note that not all processes support all compression options, and also the OGR converter currently ignores this option. 195 | * `SKIP_DUCK_GPKG` - whether to skip the GeoPackage conversion option on DuckDB, since it takes a long time to run. 196 | 197 | ## Contributing 198 | 199 | All contributions are welcome, I love running open source projects. I'm clearly just learning to code Python, so there's no judgement about crappy code. And I'm super happy to learn from others about better code. Feel free to sound in on [the issues](https://github.com/opengeos/open-buildings/issues), make new ones, grab one, or make a PR. There's lots of low hanging fruit of things to add. And if you're just starting out programming don't hesitate to ask even basic things in the [discussions](https://github.com/opengeos/open-buildings/discussions). 200 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | To install open-buildings, run this command in your terminal: 4 | 5 | ```bash 6 | pip install open-buildings 7 | ``` 8 | 9 | This is the preferred method to install open-buildings, as it will always install the most recent stable release. 10 | 11 | If you don't have [pip](https://pip.pypa.io) installed, this [Python installation guide](http://docs.python-guide.org/en/latest/starting/installation/) can guide you through the process. 12 | 13 | This should add a CLI that you can then use. If it's working then: 14 | 15 | ```bash 16 | ob 17 | ``` 18 | 19 | Should print out a help message. You then should be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 20 | 21 | 22 | ```bash 23 | ob tools get_buildings 1.json my-buildings.geojson --country_iso RW 24 | ``` 25 | 26 | You can also stream the json in directly in one line: 27 | 28 | ``` 29 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - my-buildings.geojson --country_iso RW 30 | ``` 31 | 32 | ## Install From sources 33 | 34 | To install open-buildings from sources, run this command in your terminal: 35 | 36 | ``` 37 | pip install git+https://github.com/opengeos/open-buildings 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | {% if page.nb_url %} 5 | 6 | {% include ".icons/material/download.svg" %} 7 | 8 | {% endif %} 9 | 10 | {{ super() }} 11 | {% endblock content %} 12 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | To use open-buildings in a project: 4 | 5 | ``` 6 | import open_buildings 7 | ``` 8 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: open-buildings 2 | site_description: Tools for working with open building datasets 3 | site_author: cholmes 4 | site_url: https://opengeos.github.io/open-buildings 5 | repo_url: https://github.com/opengeos/open-buildings 6 | 7 | copyright: "Copyright © 2023 - 2023 Chris Holmes" 8 | 9 | theme: 10 | palette: 11 | - scheme: default 12 | # primary: blue 13 | # accent: indigo 14 | toggle: 15 | icon: 
material/toggle-switch-off-outline 16 | name: Switch to dark mode 17 | - scheme: slate 18 | primary: indigo 19 | accent: indigo 20 | toggle: 21 | icon: material/toggle-switch 22 | name: Switch to light mode 23 | name: material 24 | icon: 25 | repo: fontawesome/brands/github 26 | # logo: assets/logo.png 27 | favicon: assets/favicon.png 28 | features: 29 | - navigation.instant 30 | - navigation.tracking 31 | - navigation.top 32 | - search.highlight 33 | - search.share 34 | custom_dir: "docs/overrides" 35 | font: 36 | text: Google Sans 37 | code: Regular 38 | 39 | plugins: 40 | - search 41 | - mkdocstrings 42 | - git-revision-date 43 | - git-revision-date-localized: 44 | enable_creation_date: true 45 | type: timeago 46 | # - pdf-export 47 | - mkdocs-jupyter: 48 | include_source: True 49 | ignore_h1_titles: True 50 | execute: true 51 | allow_errors: false 52 | ignore: ["conf.py"] 53 | execute_ignore: ["*ignore.ipynb"] 54 | 55 | markdown_extensions: 56 | - admonition 57 | - abbr 58 | - attr_list 59 | - def_list 60 | - footnotes 61 | - meta 62 | - md_in_html 63 | - pymdownx.superfences 64 | - pymdownx.highlight: 65 | linenums: true 66 | - toc: 67 | permalink: true 68 | 69 | # extra: 70 | # analytics: 71 | # provider: google 72 | # property: UA-XXXXXXXXX-X 73 | 74 | nav: 75 | - Home: index.md 76 | - Installation: installation.md 77 | - Usage: usage.md 78 | - Contributing: contributing.md 79 | - FAQ: faq.md 80 | - Changelog: changelog.md 81 | - Report Issues: https://github.com/opengeos/open-buildings/issues 82 | - Examples: 83 | - examples/download_buildings.ipynb 84 | - API Reference: 85 | - common module: common.md 86 | -------------------------------------------------------------------------------- /open_buildings/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for open-buildings.""" 2 | 3 | __author__ = """Chris Holmes""" 4 | __email__ = 'cholmes@9eo.org' 5 | __version__ = '0.10.0' 6 | -------------------------------------------------------------------------------- /open_buildings/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import click 4 | import json 5 | import pandas as pd 6 | import osmnx 7 | from shapely.geometry import shape, box, mapping 8 | import matplotlib.pyplot as plt 9 | from open_buildings.google.process import process_benchmark, process_geometries 10 | from open_buildings.download_buildings import download as download_buildings 11 | from open_buildings.overture.add_columns import process_parquet_files 12 | from open_buildings.overture.partition import process_db 13 | from open_buildings.settings import Source 14 | from datetime import datetime, timedelta 15 | from tabulate import tabulate 16 | import boto3 # Required for S3 operations 17 | 18 | @click.group() 19 | def main(): 20 | """CLI for Open Buildings operations.""" 21 | pass 22 | 23 | @click.group() 24 | def google(): 25 | """Commands related to Google operations.""" 26 | pass 27 | 28 | @click.group() 29 | def overture(): 30 | """Commands related to Overture operations.""" 31 | pass 32 | 33 | main.add_command(google) 34 | main.add_command(overture) 35 | 36 | def handle_comma_separated(ctx, param, value): 37 | return value.split(',') 38 | 39 | def geocode(data: str): 40 | location = osmnx.geocode_to_gdf(data) 41 | geom = location.geometry[0] 42 | geojson = json.loads(json.dumps({"type": "Feature", "geometry": mapping(geom)})) # turn geom tuple into list by (de-)serialising 43 
| return geojson
44 | 
45 | @main.command(name="get_buildings")
46 | @click.argument('geojson_input', type=click.File('r'), required=False)
47 | @click.option('--dst', type=str, default="buildings.json", help='The path to write the output to. Can be a directory or file.')
48 | @click.option('--location', type=str, default=None, help='Use a city or region name instead of providing an AOI as a file.')
49 | @click.option('--source', default="overture", type=click.Choice(['google', 'overture']), help='Dataset to query, defaults to Overture.')
50 | @click.option('--country_iso', type=str, default=None, help='A 2-character country ISO code to filter the data by.')
51 | @click.option('-s', '--silent', is_flag=True, default=False, help='Suppress all print outputs.')
52 | @click.option('--overwrite', default=False, is_flag=True, help='Overwrite the destination file if it already exists.')
53 | @click.option('-v', '--verbose', default=False, is_flag=True, help='Print detailed logs with timestamps.')
54 | def get_buildings(geojson_input, dst, location, source, country_iso, silent, overwrite, verbose):
55 |     """Tool to extract buildings in common geospatial formats from large archives of GeoParquet data online. GeoJSON
56 |     input can be provided as a file or piped in from stdin; if neither a file nor --location is given, the tool reads the AOI from stdin.
57 | 
58 |     The tool currently supports two sources of data: Google and Overture. The data comes from Cloud-Native Geospatial distributions
59 |     on https://source.coop that are partitioned by admin boundaries and use a quadkey for the spatial index. In time this tool will generalize
60 |     to support any admin-boundary-partitioned GeoParquet data, but for now it is limited to the Google and Overture datasets.
61 | 
62 |     The default output is GeoJSON, in a file called buildings.json. Changing the suffix changes the output format: .shp for Shapefile,
63 |     .gpkg for GeoPackage, .fgb for FlatGeobuf, .parquet for GeoParquet, and .json or .geojson for GeoJSON. If your query is
64 |     entirely within one country it is strongly recommended to pass country_iso to hint to the query engine which country to query, as this
65 |     speeds the query up significantly (5-10x). Expect query times of 5-10 seconds for small queries with country_iso and 30-60 seconds without it.
66 |     Large queries will take longer, as they have to download more data.
67 | 
68 |     You can look up the country_iso for a country here: https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
69 |     If you get the country wrong you will get zero results. Currently you can only query one country, so if your query crosses country boundaries you should
70 |     not use country_iso. Future versions of this tool aim to eliminate the need for the country_iso hint.
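    Example invocations (a sketch only; the file names, AOI, and ISO code below are illustrative, and they
    assume the package is installed so the CLI module can be run with "python -m"):

        python -m open_buildings.cli get_buildings nairobi.geojson --dst nairobi_buildings.parquet --country_iso KE
        python -m open_buildings.cli get_buildings --location "Nairobi, Kenya" --dst nairobi.fgb --source google --country_iso KE
        cat nairobi.geojson | python -m open_buildings.cli get_buildings --dst nairobi.json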
71 | """ 72 | # map source of google and overture to values for data_path and hive 73 | # case insensitive matching 74 | if source.lower() == "google": 75 | source = Source.GOOGLE 76 | elif source.lower() == "overture": 77 | source = Source.OVERTURE 78 | else: 79 | raise ValueError(f"Invalid source '{source}', accepted values are {', '.join(v.name.lower() for v in Source)}.") 80 | 81 | if geojson_input: 82 | geojson_data = json.load(geojson_input) 83 | elif location: 84 | geojson_data = geocode(location) 85 | else: 86 | geojson_data = json.load(click.get_text_stream('stdin')) 87 | 88 | download_buildings(geojson_data, source=source, generate_sql=False, dst=dst, silent=silent, overwrite=overwrite, verbose=verbose, country_iso=country_iso) 89 | 90 | @google.command('benchmark') 91 | @click.argument('input_path', type=click.Path(exists=True)) 92 | @click.argument('output_directory', type=click.Path(exists=True)) 93 | @click.option( 94 | '--processes', 95 | callback=handle_comma_separated, 96 | default='duckdb,pandas,ogr', 97 | help="The processing methods to use. One or more of duckdb, pandas or ogr, in a comma-separated list. Default is duckdb,pandas,ogr.", 98 | ) 99 | @click.option( 100 | '--formats', 101 | callback=handle_comma_separated, 102 | default='fgb,parquet,shp,gpkg', 103 | help="The output formats to benchmark. One or more of fgb, parquet, shp or gpkg, in a comma-separated list. Default is fgb,parquet,shp,gpkg.", 104 | ) 105 | @click.option( 106 | '--skip-split-multis', 107 | is_flag=True, 108 | help="Whether to keep multipolygons as they are without splitting into their component polygons.", 109 | ) 110 | @click.option('--no-gpq', is_flag=True, help="Disable GPQ conversion. Timing will be faster, but not valid GeoParquet (until DuckDB adds support)") 111 | @click.option( 112 | '--verbose', is_flag=True, help="Whether to print detailed processing information." 113 | ) 114 | @click.option( 115 | '--output-format', 116 | callback=handle_comma_separated, 117 | default='ascii', 118 | help="The format of the output. 
Options: ascii, csv, json, chart.", 119 | ) 120 | def benchmark( 121 | input_path, 122 | output_directory, 123 | processes, 124 | formats, 125 | skip_split_multis, 126 | no_gpq, 127 | verbose, 128 | output_format, 129 | ): 130 | """Runs the convert function on each of the supplied processes and formats, printing the timing of each as a table""" 131 | results = process_benchmark( 132 | input_path, output_directory, processes, formats, not skip_split_multis, verbose 133 | ) 134 | 135 | df = pd.DataFrame(results) 136 | df = df.pivot(index='process', columns='format', values='execution_time') 137 | 138 | base_name = os.path.basename(input_path) 139 | file_name, file_ext = os.path.splitext(base_name) 140 | 141 | for format in output_format: 142 | if format == 'csv': 143 | df.to_csv(f"{output_directory}/{file_name}_benchmark.csv", index=False) 144 | elif format == 'json': 145 | df.to_json(f"{output_directory}/{file_name}_benchmark.json", orient='split', indent=4) 146 | elif format == 'chart': 147 | df.plot(kind='bar', rot=0) 148 | plt.title(f'Benchmark for file: {base_name}') 149 | plt.xlabel('Process') 150 | plt.ylabel('Execution Time (in seconds)') 151 | plt.tight_layout() 152 | plt.savefig(f"{output_directory}/{file_name}_benchmark.png") 153 | plt.clf() 154 | elif format == 'ascii': 155 | df_formatted = df.copy() 156 | for column in df_formatted.columns: 157 | df_formatted[column] = df_formatted[column].apply(lambda x: (datetime.min + timedelta(seconds=x)).strftime('%M:%S.%f')[:-3]) 158 | 159 | print(f"\nTable for file: {base_name}") 160 | print(tabulate(df_formatted, headers="keys", tablefmt="fancy_grid")) 161 | else: 162 | raise ValueError('Invalid output format') 163 | 164 | @google.command('convert') 165 | @click.argument('input_path', type=click.Path(exists=True)) 166 | @click.argument('output_directory', type=click.Path(exists=True)) 167 | @click.option( 168 | '--format', 169 | type=click.Choice(['fgb', 'parquet', 'gpkg', 'shp']), 170 | default='fgb', 171 | help="The output format. The default is FlatGeobuf (fgb)", 172 | ) 173 | @click.option( 174 | '--overwrite', is_flag=True, help="Whether to overwrite any existing output files." 175 | ) 176 | @click.option( 177 | '--process', 178 | type=click.Choice(['duckdb', 'pandas', 'ogr']), 179 | default='pandas', 180 | help="The processing method to use. The default is pandas.", 181 | ) 182 | @click.option( 183 | '--skip-split-multis', 184 | is_flag=True, 185 | help="Whether to keep multipolygons as they are without splitting into their component polygons.", 186 | ) 187 | @click.option( 188 | '--verbose', is_flag=True, help="Whether to print detailed processing information." 189 | ) 190 | def convert( 191 | input_path, output_directory, format, overwrite, process, skip_split_multis, verbose 192 | ): 193 | """Converts a CSV or a directory of CSV's to an alternate format. 
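    For example, a run over a folder of downloaded CSVs might look like this (paths are illustrative, assuming
    the package is installed so the CLI module can be run with "python -m"):
    python -m open_buildings.cli google convert ./google-csvs ./converted --format parquet --process duckdb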
Input CSV's are assumed to be from Google's Open Buildings""" 194 | process_geometries( 195 | input_path, 196 | output_directory, 197 | format, 198 | overwrite, 199 | process, 200 | not skip_split_multis, 201 | verbose, 202 | ) 203 | 204 | @overture.command('add_columns') 205 | @click.argument('input_folder', type=click.Path(exists=True)) 206 | @click.argument('output_folder', type=click.Path()) 207 | @click.argument('country_parquet_path', type=click.Path(exists=True)) 208 | @click.option('--overwrite', is_flag=True, help="Whether to overwrite any existing output files.") 209 | @click.option('--no-quadkey', is_flag=True, help="Whether to add a quadkey column to the output.") 210 | @click.option('--no-country-iso', is_flag=True, help="Whether to add a country_iso column to the output.") 211 | @click.option('--verbose', is_flag=True, help="Whether to print detailed processing information.") 212 | def add_columns( 213 | input_folder, output_folder, country_parquet_path, overwrite, no_quadkey, no_country_iso, verbose 214 | ): 215 | """Adds columns to the input Overture parquet files, using Overture country for admin boundaries, outputting GeoParquet ordered by quadkey the output folder""" 216 | add_quadkey = not no_quadkey 217 | add_country_iso = not no_country_iso 218 | """Adds columns to the input parquet files, outputting to the output folder""" 219 | process_parquet_files( 220 | input_folder, output_folder, country_parquet_path, overwrite, add_quadkey, add_country_iso, verbose 221 | ) 222 | 223 | @overture.command('download') 224 | @click.argument('destination_folder', type=click.Path()) 225 | @click.option( 226 | '--theme', 227 | type=click.Choice(['buildings', 'admins', 'places', 'transportation']), 228 | default='buildings', 229 | help="Theme option for the files to download from S3. 
Default is buildings.", 230 | ) 231 | def overture_download(destination_folder, theme): 232 | """Download building files from S3 (can change theme for other overture data).""" 233 | 234 | os.makedirs(destination_folder, exist_ok=True) 235 | 236 | s3 = boto3.client('s3') 237 | bucket = 'overturemaps-us-west-2' 238 | prefix = f"release/2023-07-26-alpha.0/theme={theme}/" 239 | 240 | objects = s3.list_objects(Bucket=bucket, Prefix=prefix) 241 | 242 | for obj in objects.get('Contents', []): 243 | file_name = os.path.basename(obj['Key']) 244 | local_file_path = os.path.join(destination_folder, file_name) 245 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 246 | print(f"[{timestamp}] Downloading {file_name} to {destination_folder}") 247 | s3.download_file(bucket, obj['Key'], local_file_path) 248 | 249 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 250 | print(f"[{timestamp}] Downloaded {file_name}") 251 | 252 | @overture.command('partition') 253 | @click.argument('duckdb-path', type=click.Path(exists=True)) 254 | @click.option('--output-folder', default=os.getcwd(), type=click.Path(), help='Folder to store the output files') 255 | @click.option('--geo-conversion', default='gpq', type=click.Choice(['gpq', 'none', 'pandas', 'ogr'], case_sensitive=False)) 256 | @click.option('--verbose', is_flag=True, default=False, help='Print verbose output') 257 | @click.option('--max-per-file', default=10000000, type=int, help='Maximum number of rows per file') 258 | @click.option('--row-group-size', default=10000, type=int, help='Row group size for Parquet files') 259 | @click.option('--hive', is_flag=True, default=False, help='Output files in Hive format (folder structure)') 260 | @click.option('--table-name', default='buildings', type=str, help='Name of the table to process') 261 | def partition(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name): 262 | """Partition a DuckDB database of all overture data by country_iso""" 263 | process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name) 264 | 265 | 266 | if __name__ == "__main__": 267 | sys.exit(main()) -------------------------------------------------------------------------------- /open_buildings/common.py: -------------------------------------------------------------------------------- 1 | """The common module contains common functions and classes used by the other modules. 2 | """ 3 | 4 | def hello_world(): 5 | """Prints "Hello World!" to the console. 
6 | """ 7 | print("Hello World!") -------------------------------------------------------------------------------- /open_buildings/download_buildings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import click 3 | from math import tan, cos, log, pi 4 | from shapely.geometry import shape, box, mapping 5 | from typing import Dict, Any, Union 6 | import mercantile 7 | import duckdb 8 | import time 9 | from pathlib import Path 10 | import datetime 11 | import os 12 | from typing import Literal, Optional 13 | import pandas as pd 14 | import geopandas as gpd 15 | import subprocess 16 | import shapely 17 | import geojson 18 | import shutil 19 | import osmnx 20 | from open_buildings.settings import Source, Format, settings 21 | 22 | def geojson_to_quadkey(data: dict) -> str: 23 | geom = shape(data["geometry"]) 24 | min_lon, min_lat, max_lon, max_lat = geom.bounds 25 | 26 | for zoom in range(12, -1, -1): 27 | tiles = list(mercantile.tiles(min_lon, min_lat, max_lon, max_lat, zooms=zoom)) 28 | if len(tiles) == 1: 29 | return mercantile.quadkey(tiles[0]) 30 | 31 | return '' 32 | 33 | def geojson_to_wkt(data: dict) -> str: 34 | geometry = shape(data['geometry']) 35 | return geometry.wkt 36 | 37 | def quadkey_to_geojson(quadkey: str) -> dict: 38 | # Convert the quadkey to tile coordinates 39 | tile = mercantile.quadkey_to_tile(quadkey) 40 | 41 | # Get the bounding box of the tile 42 | bbox = mercantile.bounds(tile) 43 | 44 | # Construct a GeoJSON Polygon representation of the bounding box 45 | geojson = { 46 | "type": "Feature", 47 | "geometry": { 48 | "type": "Polygon", 49 | "coordinates": [[ 50 | [bbox.west, bbox.south], 51 | [bbox.east, bbox.south], 52 | [bbox.east, bbox.north], 53 | [bbox.west, bbox.north], 54 | [bbox.west, bbox.south] 55 | ]] 56 | } 57 | } 58 | 59 | return geojson 60 | 61 | @click.group() 62 | def cli(): 63 | pass 64 | 65 | @cli.command() 66 | @click.argument('geojson_input', type=click.File('r'), required=False) 67 | def quadkey(geojson_input): 68 | """Convert GeoJSON to quadkey.""" 69 | if geojson_input: 70 | geojson_data = json.load(geojson_input) 71 | else: 72 | geojson_data = json.load(click.get_text_stream('stdin')) 73 | result = geojson_to_quadkey(geojson_data) 74 | click.echo(result) 75 | 76 | @cli.command() 77 | @click.argument('geojson_input', type=click.File('r'), required=False) 78 | def WKT(geojson_input): 79 | """Convert GeoJSON to Well Known Text.""" 80 | if geojson_input: 81 | geojson_data = json.load(geojson_input) 82 | else: 83 | geojson_data = json.load(click.get_text_stream('stdin')) 84 | 85 | result = geojson_to_wkt(geojson_data) 86 | click.echo(result) 87 | 88 | 89 | @click.command() 90 | @click.argument('geojson_input', type=click.File('r'), required=False) 91 | @click.option('--only-quadkey', is_flag=True, help='Include only the quadkey in the WHERE clause.') 92 | @click.option('--local', is_flag=True, help='Use local path for parquet files instead of the S3 URL.') 93 | def sql(geojson_input, only_quadkey, local): 94 | """Generate an SQL query based on the input GeoJSON.""" 95 | 96 | # Read the GeoJSON 97 | if geojson_input: 98 | geojson_data = json.load(geojson_input) 99 | else: 100 | geojson_data = json.load(click.get_text_stream('stdin')) 101 | 102 | quadkey = geojson_to_quadkey(geojson_data) 103 | wkt = geojson_to_wkt(geojson_data) 104 | 105 | # Adjust the path in read_parquet based on the --local flag 106 | path = '*.parquet' if local else 
's3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-2/*.parquet' 107 | base_sql = f"select * from read_parquet('{path}')" 108 | 109 | # Construct the WHERE clause based on the options 110 | where_clause = f"WHERE quadkey LIKE '{quadkey}%'" 111 | if not only_quadkey: 112 | where_clause += f" AND\nST_Within(ST_GeomFromWKB(geometry), ST_GeomFromText('{wkt}'))" 113 | 114 | sql_query = f"{base_sql},\n{where_clause}" 115 | full_sql_query = f"COPY ('{sql_query}' TO 'buildings.fgb' WITH (FORMAT GDAL, DRIVER 'FlatGeobuf')" 116 | click.echo(full_sql_query) 117 | 118 | @cli.command() 119 | @click.argument('quadkey_input', type=str) 120 | def quad2json(quadkey_input): 121 | """Convert quadkey to GeoJSON.""" 122 | result = quadkey_to_geojson(quadkey_input) 123 | click.echo(json.dumps(result, indent=2)) 124 | 125 | def download( 126 | geojson_data: Dict[str, Any], 127 | dst: Union[Path, str] = "buildings.json", 128 | source: Union[Source, str] = Source.OVERTURE, 129 | format: Optional[Union[Format, str]] = None, 130 | country_iso: Optional[str] = None, 131 | *, 132 | generate_sql: bool = False, # whether to actually perform actions or just generate sql 133 | verbose: bool = False, # print detailed logs 134 | silent: bool = False, # no log output 135 | overwrite: bool = False # whether to overwrite existing output file 136 | ) -> None: 137 | """ 138 | Extract buildings from online sources. 139 | 140 | Parameters 141 | ---------- 142 | geojson_input : Dict[str, Any] 143 | GeoJSON dictionary 144 | dst : Path | str 145 | The path to write the output to. Can be either a file or a directory. 146 | If a directory is provided, a file "buildings." will be created at that location. 147 | format : string, default "geojson" 148 | The output format, alternatively can be extracted from "dst". Explicitly naming the format can be useful if 149 | used in combination with a directory as "dst". If both file path and format param is provided, the format param takes 150 | precedence. 151 | country_iso : str, optional 152 | A two-letter ISO-3166 code for the country the AOI (geojson_input) is in. Not required but massively speeds up queries. 153 | generate_sql : bool, default False 154 | Whether to actually perform DuckDB queries or only generate the SQL. 155 | verbose : bool, default False 156 | Print more detailed log messages. 157 | silent : bool, default False 158 | Suppress log messages. 159 | overwrite : bool, default False 160 | Overwrite existing output files. 161 | """ 162 | # type conversion 163 | if type(source) == str: 164 | try: 165 | source = Source(source.upper()) 166 | except ValueError: 167 | raise ValueError(f"Source {source} is unknown. Please choose one of {' ,'.join([s.name.lower() for s in Source])}.") from e 168 | 169 | if type(format) == str: 170 | try: 171 | format = Format(format.upper()) 172 | except ValueError: 173 | raise ValueError(f"Format {format} is unknown. 
Please choose one of {', '.join(f.name.lower() for f in Format)}.") from e 174 | 175 | if type(dst) == str: 176 | dst = Path(dst) 177 | 178 | # validate path and extension 179 | if os.path.isdir(dst): 180 | dst = dst.joinpath("buildings.json") 181 | 182 | if format and dst: 183 | # format takes precedence 184 | dst = dst.joinpath(f"{dst.stem}.{settings.extensions[format]}") 185 | 186 | if not format and dst: 187 | for fmt, ext in settings.extensions.items(): 188 | if dst.name.endswith(ext): 189 | format = fmt 190 | break 191 | else: # The for-else structure means the else block runs if the loop completes normally, without a break. 192 | raise ValueError(f"Can't identify file extension of {dst}. Please choose one of {', '.join([f.name.lower() for f in Format])}.") 193 | 194 | # utils (should be in separate utils file?) 195 | def print_timestamped_message(message): 196 | if not silent: 197 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 198 | click.echo(f"[{current_time}] {message}") 199 | 200 | def print_elapsed_time(start_time): 201 | end_time = time.time() 202 | 203 | elapsed_time = end_time - start_time 204 | print_timestamped_message(f"Operation took {elapsed_time:.2f} seconds.") 205 | 206 | # main program 207 | start_time = time.time() 208 | if verbose: 209 | print_timestamped_message("Reading GeoJSON input...") 210 | 211 | if os.path.exists(dst) and not generate_sql: 212 | if overwrite: 213 | if verbose: 214 | print_timestamped_message(f"Deleting existing file at {dst}.") 215 | os.remove(dst) 216 | else: 217 | # Print message that the file already exists and cleanly exit the program 218 | print_timestamped_message(f"File at {dst} already exists. Use --overwrite to overwrite it.") 219 | return 220 | 221 | if verbose: 222 | print_timestamped_message("Converting GeoJSON to quadkey and WKT...") 223 | quadkey = geojson_to_quadkey(geojson_data) 224 | wkt = geojson_to_wkt(geojson_data) 225 | 226 | country_info = "" 227 | if country_iso is not None: 228 | country_info = f"in country {country_iso}" 229 | print_timestamped_message(f"Querying and downloading data for quadkey {quadkey} {country_info}...") 230 | if verbose: 231 | print_timestamped_message(f"WKT: {wkt}") 232 | if country_info != "": 233 | print_timestamped_message(f"Expect query times of at least 5-10 seconds") 234 | else: 235 | print_timestamped_message(f"Expect query times of at least 30 seconds - this can be lessened by using the --country-iso option") 236 | 237 | # download data into DuckDB 238 | hive_partitioning = settings.sources[source].hive_partitioning 239 | hive_value = 1 if hive_partitioning else 0 240 | select_values = "* EXCLUDE geometry" 241 | # if source is overture and the output is not parquet, then name the values to get 242 | # so we don't get the crazy structs that gis formats barf on 243 | if source == Source.OVERTURE and format != Format.PARQUET: 244 | select_values = "id, level, height, numfloors, class, country_iso, quadkey" 245 | base_sql = f"select {select_values}, ST_AsWKB(ST_GeomFromWKB(geometry)) AS geometry from read_parquet('{settings.sources[source].base_url}', hive_partitioning={hive_value})" 246 | where_clause = "WHERE " 247 | if country_iso: 248 | where_clause += f"country_iso = '{country_iso}' AND " 249 | where_clause += f"quadkey LIKE '{quadkey}%'" 250 | where_clause += f" AND\nST_Within(ST_GeomFromWKB(geometry), ST_GeomFromText('{wkt}'))" 251 | 252 | create_clause = f"CREATE TABLE buildings AS ({base_sql},\n{where_clause});" 253 | if generate_sql or verbose: 254 | 
print_timestamped_message(create_clause)
255 | 
256 |     if not generate_sql:
257 |         conn = duckdb.connect(database=':memory:')
258 | 
259 |         spatial_extension_query = conn.execute("SELECT * FROM duckdb_extensions() WHERE installed IS TRUE AND extension_name = 'spatial';").fetchone()
260 |         if spatial_extension_query is None:
261 |             print_timestamped_message("Installing DuckDB spatial extension...")
262 |             conn.execute("INSTALL spatial;")
263 |         conn.execute("LOAD spatial;")
264 |         conn.execute(create_clause)
265 | 
266 |         count = conn.execute("SELECT COUNT(*) FROM buildings;").fetchone()[0]
267 | 
268 |         print_timestamped_message(f"Downloaded {count} features into DuckDB.")
269 |         if count == 0:
270 |             if country_iso is not None:
271 |                 print_timestamped_message(f"If you are sure that your GeoJSON should have buildings, then check to be sure that {country_iso} is the right code.")
272 |             if verbose:
273 |                 print_elapsed_time(start_time)
274 |             return
275 | 
276 |     # export to dst
277 |     if not generate_sql:
278 |         print_timestamped_message(f"Writing to {dst}...")
279 | 
280 |     if format == Format.PARQUET:
281 |         copy_statement = f"COPY buildings TO '{dst}' WITH (FORMAT Parquet);"
282 |         if generate_sql or verbose:
283 |             print_timestamped_message(copy_statement)
284 |         if not generate_sql:
285 |             conn.execute(f"COPY buildings TO '{dst}' WITH (FORMAT Parquet);")
286 |             try:
287 |                 df = pd.read_parquet(dst)
288 | 
289 |                 # Convert the WKB geometry column to geopandas geometry (shapely.wkb is not imported in this module, so use geopandas' from_wkb)
290 |                 df['geometry'] = gpd.GeoSeries.from_wkb(df['geometry'])
291 |                 gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
292 |                 # Write to a sibling file with .parquet replaced by _geo.parquet
293 |                 output_filename = dst.with_name(dst.name.replace(".parquet", "_geo.parquet"))
294 | 
295 |                 gdf.to_parquet(output_filename)
296 |                 # delete the original file
297 |                 os.remove(dst)
298 |                 # Rename (move) the output file to the input filename
299 |                 shutil.move(output_filename, dst)
300 |                 if verbose:
301 |                     print_timestamped_message(f"Finished processing {dst} at {time.ctime()}")
302 |             except Exception as e:
303 |                 print(f"Error processing {dst} to geoparquet: {e}")
304 |     else:
305 |         gdal_format = {
306 |             Format.SHAPEFILE: 'ESRI Shapefile',
307 |             Format.GEOJSON: 'GeoJSON',
308 |             Format.GEOPACKAGE: 'GPKG',
309 |             Format.FLATGEOBUF: 'FlatGeobuf'
310 |         }
311 |         conn.execute(f"COPY buildings TO '{dst}' WITH (FORMAT GDAL, DRIVER '{gdal_format[format]}');")
312 | 
313 |     if verbose:
314 |         print_elapsed_time(start_time)
315 | 
316 | # Register the commands with the cli group
317 | cli.add_command(quadkey)
318 | cli.add_command(WKT)
319 | cli.add_command(sql)
320 | cli.add_command(quad2json)
321 | #cli.add_command(download)
322 | 
323 | if __name__ == '__main__':
324 |     cli()
325 | 
--------------------------------------------------------------------------------
/open_buildings/google/__init.py__:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/open_buildings/google/add_columns.py:
--------------------------------------------------------------------------------
1 | # This script is a slightly more generic version of the overture add_columns.py. There's
2 | # some chance it could be completely generic, but I was just trying to work on google buildings
3 | # so I put it under there. The main difference is that it doesn't use the midpoint of the
4 | # bbox struct, since that's unique to overture. It just uses the centroid of the geometry.
5 | # That could likely work just as well if not better for overture too, so we likely can just 6 | # get rid of that. 7 | # The other thing that would be nice to make it truly generic is to be able to supply the 8 | # table name, since this should work fine with other types of data. Could also just call it 9 | # 'features' by default, the table name doesn't really matter in these processings. Should probably check 10 | # to be sure it works with lines and points too. So this could use clean up, also just 11 | # removing the 'midpoint' code. 12 | 13 | import os 14 | import duckdb 15 | import time 16 | import tempfile 17 | import subprocess 18 | import glob 19 | from duckdb.typing import * 20 | import mercantile 21 | from shapely import wkt 22 | import shutil 23 | 24 | def lat_lon_to_quadkey(wkt_point: VARCHAR, level: INTEGER) -> VARCHAR: 25 | 26 | geom = wkt.loads(wkt_point) 27 | 28 | # convert geom to tile 29 | tile = mercantile.tile(geom.x, geom.y, level) 30 | 31 | # Convert the tile to a quadkey 32 | quadKey = mercantile.quadkey(tile) 33 | return quadKey 34 | 35 | def midpoint(minval: DOUBLE, maxval: DOUBLE) -> DOUBLE: 36 | return (minval + maxval) / 2.0 37 | 38 | def add_quadkey(con): 39 | 40 | # Register Python UDFs 41 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [VARCHAR, INTEGER], VARCHAR) 42 | con.create_function('midpoint', midpoint, [DOUBLE, DOUBLE], DOUBLE) 43 | 44 | # Add a quadkey column to the table if it doesn't exist 45 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 46 | 47 | # Update the quadkey column 48 | con.execute(""" 49 | UPDATE buildings 50 | SET quadkey = lat_lon_to_quadkey(ST_Centroid(ST_GeomFromWKB(geometry)), 51 | 12 52 | ); 53 | """) 54 | 55 | def add_country_iso(con, country_parquet_path): 56 | # Load country parquet file into duckdb 57 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 58 | 59 | # Add a country_iso column to the buildings table 60 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 61 | 62 | # Update the country_iso column in the buildings table 63 | con.execute(""" 64 | UPDATE buildings 65 | SET country_iso = countries.isocountrycodealpha2 66 | FROM countries 67 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(buildings.geometry)) 68 | """) 69 | 70 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 71 | # Ensure output_folder exists 72 | os.makedirs(output_folder, exist_ok=True) 73 | 74 | # Get unique identifier from file name 75 | file_id = os.path.basename(input_parquet_path) 76 | 77 | # Define output paths 78 | output_db_path = os.path.join(output_folder, f'{file_id}.duckdb') 79 | output_parquet_path = os.path.join(output_folder, f'{file_id}') 80 | 81 | # Check if output files exist 82 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 83 | print(f'Files with ID {file_id} already exist. 
Skipping...') 84 | return 85 | 86 | # Overwrite mode: remove existing files 87 | if overwrite: 88 | for file_path in [output_db_path, output_parquet_path]: 89 | if os.path.exists(file_path): 90 | os.remove(file_path) 91 | timestamp = time.time() 92 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 93 | 94 | # Connect to DuckDB 95 | con = duckdb.connect(output_db_path) 96 | 97 | con.execute('LOAD spatial;') 98 | 99 | # Load parquet file into duckdb 100 | con.execute(f"CREATE TABLE buildings AS SELECT * FROM read_parquet('{input_parquet_path}')") 101 | 102 | if add_quadkey_option: 103 | add_quadkey(con) 104 | 105 | if add_country_iso_option: 106 | add_country_iso(con, country_parquet_path) 107 | 108 | # Write out to Parquet 109 | con.execute(f"COPY (SELECT * FROM buildings ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 110 | 111 | if (False): 112 | print(f"Converting to geoparquet: {output_parquet_path}") 113 | # Create a temporary file 114 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 115 | temp_file.close() # Close the file so gpq can open it 116 | 117 | # Convert the Parquet file to a GeoParquet file using gpq 118 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 119 | subprocess.run(gpq_cmd, check=True) 120 | 121 | # Rename the temp file to the final filename 122 | shutil.move(temp_file.name, f'{output_parquet_path}') 123 | #os.rename(temp_file.name, f'{output_parquet_path}') 124 | 125 | print(f"Processing complete for file {input_parquet_path}") 126 | 127 | remove_duckdb = False 128 | 129 | # remove duckdb file 130 | if (remove_duckdb): 131 | os.remove(output_db_path) 132 | 133 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 134 | # If input_path is a directory, process all Parquet files in it 135 | if os.path.isdir(input_path): 136 | for file in glob.glob(os.path.join(input_path, "*.parquet")): 137 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 138 | else: 139 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 140 | 141 | # Call the function 142 | input_path = '/Users/cholmes/geodata/google-buildings-v3/geoparquet/' 143 | output_folder = '/Users/cholmes/geodata/google-buildings-v3/geoparquet-columns' 144 | country_parquet_path = '/Volumes/fastdata/overture/countries.parquet' 145 | process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True) -------------------------------------------------------------------------------- /open_buildings/google/partition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script takes a DuckDB database with a buildings table and converts it to GeoParquet 3 | files partitioned on first country and then quadkey. The buildings table must have a 4 | country_iso field and quadkey field, populated by overture-buildings-parquet-add-columns.py. 5 | The main function is process_db(), and it will take as input a maximum number of rows per 6 | file and a row group size for the Parquet files. It will then iterate through the countries 7 | in the database and partition the buildings table into GeoParquet files for each country. 
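For example, a small country may come out as a single file such as RW.parquet, while a populous one is
split into quadkey-suffixed files such as US_0231.parquet (names illustrative; the actual patterns are
'{country_code}.parquet' and '{country_code}_{qk_str}.parquet' in the code below).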
8 | If the number of rows for a country is greater than the maximum number of rows per file, 9 | it will partition the country into quadkeys and create GeoParquet files for each quadkey. 10 | Those quadkeys will be further partitioned if necessary until the number of rows for a 11 | quadkey is less than or equal to the maximum number of rows per file. 12 | """ 13 | 14 | import duckdb 15 | import datetime 16 | import subprocess 17 | import tempfile 18 | import os 19 | import click 20 | import shutil 21 | import geopandas as gpd 22 | from shapely import wkb 23 | import pandas as pd 24 | import time 25 | 26 | def current_time_str(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 28 | 29 | def print_verbose(msg, verbose): 30 | if verbose: 31 | print(f"[{current_time_str()}] {msg}") 32 | 33 | def convert_gpq(input_filename, row_group_size, verbose): 34 | print_verbose(f"Starting conversion for {input_filename} using gpq (row_group_size ignored).", verbose) 35 | 36 | # Create a temporary file 37 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 38 | temp_file.close() # Close the file so gpq can open it 39 | 40 | # Convert the Parquet file to a GeoParquet file using gpq 41 | gpq_cmd = ['gpq', 'convert', input_filename, temp_file.name] 42 | subprocess.run(gpq_cmd, check=True) 43 | 44 | print_verbose(f"Conversion for {input_filename} using gpq finished.", verbose) 45 | 46 | # Rename (move) the temp file to the final filename 47 | shutil.move(temp_file.name, input_filename) 48 | 49 | # Delete the initial temp file if it still exists 50 | #initial_temp_filename = f'{country_code}_temp.parquet' 51 | #if os.path.exists(initial_temp_filename): 52 | # os.remove(initial_temp_filename) 53 | 54 | def convert_pandas(input_filename, rg_size, verbose): 55 | # Placeholder function to be fleshed out 56 | print_verbose("Starting conversion using pandas.", verbose) 57 | try: 58 | df = pd.read_parquet(input_filename) 59 | 60 | # Convert WKB geometry to geopandas geometry 61 | df['geometry'] = df['geometry'].apply(wkb.loads, hex=True) 62 | gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") 63 | # Change output file the input_filename with .parquet replaced with _geo.parquet 64 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 65 | 66 | gdf.to_parquet(output_filename, row_group_size=rg_size) 67 | # delete the original file 68 | os.remove(input_filename) 69 | # Rename (move) the output file to the input filename 70 | shutil.move(output_filename, input_filename) 71 | print(f"Finished processing {input_filename} at {time.ctime()}") 72 | except Exception as e: 73 | print(f"Error processing {input_filename}: {e}") 74 | 75 | #not quite working yet - not sure what's wrong. Should go faster than pandas. 
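# A hedged guess at the issue (untested): ROW_GROUP_SIZE is a layer *creation* option of GDAL's
# Parquet driver, so it would be passed to ogr2ogr as '-lco', f'ROW_GROUP_SIZE={rg_size}' rather
# than via '-oo', which sets open options on the input datasource.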
76 | def convert_ogr(input_filename, rg_size, verbose): 77 | fields_to_keep = ['confidence', 'area_in_meters', 'full_plus_code', 'country_iso', 'quadkey'] 78 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 79 | rg_cmd = f"ROW_GROUP_SIZE={rg_size}" 80 | cmd = [ 81 | 'ogr2ogr', 82 | '-f', 83 | 'Parquet', 84 | '-select', 85 | ','.join(fields_to_keep), 86 | output_filename, 87 | input_filename, 88 | # '-oo', 89 | # rg_cmd, 90 | '-oo', 91 | 'GEOM_POSSIBLE_NAMES=geometry', 92 | '-a_srs', 93 | 'EPSG:4326', ] 94 | 95 | # print the ogr2ogr command that will be run 96 | if verbose: 97 | print("ogr2ogr command:") 98 | print(' '.join(cmd)) 99 | 100 | # Run the command 101 | subprocess.run(cmd, check=True) 102 | 103 | # delete the original file 104 | os.remove(input_filename) 105 | # Rename (move) the output file to the input filename 106 | shutil.move(output_filename, input_filename) 107 | print(f"Finished processing {input_filename} at {time.ctime()}") 108 | 109 | if verbose: 110 | print(f"Converted {input_filename} to {output_filename} using ogr2ogr.") 111 | 112 | 113 | 114 | def fetch_quadkeys(conn, table_name, country_code, length, verbose, prev_qk=""): 115 | query = f"SELECT DISTINCT SUBSTR(quadkey, 1, {length}) FROM {table_name} WHERE country_iso = '{country_code}'" 116 | if prev_qk: 117 | query += f" AND SUBSTR(quadkey, 1, {len(prev_qk)}) = '{prev_qk}'" 118 | print_verbose(f'Executing: {query}', verbose) 119 | return conn.execute(query).fetchall() 120 | 121 | def convert_to_geoparquet(parquet_path, geo_conversion, row_group_size, verbose): 122 | if geo_conversion == 'gpq': 123 | convert_gpq(parquet_path, row_group_size, verbose) 124 | print_verbose(f"File: {parquet_path} written with gpq", verbose) 125 | elif geo_conversion == 'pandas': 126 | convert_pandas(parquet_path, row_group_size, verbose) 127 | print_verbose(f"File: {parquet_path} written with pandas", verbose) 128 | elif geo_conversion == 'ogr': 129 | convert_ogr(parquet_path, row_group_size, verbose) 130 | print_verbose(f"File: {parquet_path} written with ogr", verbose) 131 | else: 132 | print_verbose(f"File: {parquet_path} written without converting to GeoParquet", verbose) 133 | 134 | #TODO: go all the way into the quad to find the smallest quadkey that contains less than max_per_file rows 135 | def process_quadkey_recursive(conn, table_name, country_code, output_folder, length, geo_conversion, row_group_size, verbose, max_per_file, current_qk=""): 136 | distinct_quadkeys = fetch_quadkeys(conn, table_name, country_code, length, verbose, current_qk) 137 | print_verbose(f"The list of quadkeys for country {country_code} and length {length} is {distinct_quadkeys}", verbose) 138 | #num_distinct_qk = len(distinct_quadkeys) 139 | for qk in distinct_quadkeys: 140 | qk_str = qk[0] 141 | qk_count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}'" 142 | print_verbose(f'Executing: {qk_count_query}', verbose) 143 | qk_count = conn.execute(qk_count_query).fetchone()[0] 144 | print_verbose(f"Quadkey {qk_str} has {qk_count} rows", verbose) 145 | if qk_count > max_per_file: 146 | process_quadkey_recursive(conn, table_name, country_code, output_folder, length + 1, geo_conversion, row_group_size, verbose, max_per_file, qk_str) 147 | else: 148 | quad_output_filename = os.path.join(output_folder, f'{country_code}_{qk_str}.parquet') 149 | if os.path.exists(quad_output_filename): 150 | print_verbose(f"Output file {quad_output_filename} already exists, 
skipping...", verbose) 151 | else: 152 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}' ORDER BY quadkey) TO '{quad_output_filename}' WITH (FORMAT PARQUET);" 153 | print_verbose(f'Executing: {copy_cmd}', verbose) 154 | conn.execute(copy_cmd) 155 | convert_to_geoparquet(quad_output_filename, geo_conversion, row_group_size, verbose) 156 | 157 | 158 | # TODO: add option for 'hive' output (put things in folder) 159 | # TODO: add option to read duckdb path from an environment variable 160 | # TODO: add row group size option (first works with duckdb) 161 | 162 | @click.command() 163 | @click.argument('duckdb-path', type=click.Path(exists=True)) 164 | @click.option('--output-folder', default=os.getcwd(), type=click.Path(), help='Folder to store the output files') 165 | @click.option('--geo-conversion', default='gpq', type=click.Choice(['gpq', 'none', 'pandas', 'ogr'], case_sensitive=False)) 166 | @click.option('--verbose', is_flag=True, default=False, help='Print verbose output') 167 | @click.option('--max-per-file', default=10000000, type=int, help='Maximum number of rows per file') 168 | @click.option('--row-group-size', default=10000, type=int, help='Row group size for Parquet files') 169 | @click.option('--hive', is_flag=True, default=False, help='Output files in Hive format (folder structure)') 170 | def process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive): 171 | table_name = 'buildings' 172 | # create output folder if it does not exist 173 | os.makedirs(output_folder, exist_ok=True) 174 | conn = duckdb.connect(duckdb_path) 175 | conn.execute('LOAD spatial;') 176 | cursor = conn.execute('SELECT DISTINCT country_iso FROM buildings') 177 | countries = cursor.fetchall() 178 | 179 | print_verbose(f'Found {len(countries)} unique countries', verbose) 180 | countries.reverse() 181 | for country in countries: 182 | country_code = country[0] 183 | write_folder = output_folder 184 | if (hive): 185 | write_folder = os.path.join(output_folder, f'country_iso={country_code}') 186 | os.makedirs(write_folder, exist_ok=True) 187 | output_filename = os.path.join(write_folder, f'{country_code}.parquet') 188 | if os.path.exists(output_filename): 189 | print_verbose(f"Output file for country {country_code} already exists, skipping...", verbose) 190 | continue 191 | 192 | count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}'" 193 | print_verbose(f'Executing: {count_query}', verbose) 194 | count = conn.execute(count_query).fetchone()[0] 195 | print_verbose(f"Country {country_code} has {count} rows", verbose) 196 | 197 | if count <= max_per_file: 198 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' ORDER BY quadkey) TO '{output_filename}' WITH (FORMAT PARQUET);" 199 | print_verbose(f'Executing: {copy_cmd}', verbose) 200 | conn.execute(copy_cmd) 201 | convert_to_geoparquet(output_filename, geo_conversion, row_group_size, verbose) 202 | else: 203 | process_quadkey_recursive(conn, table_name, country_code, write_folder, 1, geo_conversion, row_group_size, verbose, max_per_file) 204 | 205 | if __name__ == "__main__": 206 | process_db() -------------------------------------------------------------------------------- /open_buildings/google/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from datetime import datetime, timedelta 5 
| import json 6 | 7 | import click 8 | import glob 9 | import duckdb 10 | import pandas as pd 11 | import geopandas as gpd 12 | from shapely import wkt 13 | from shapely.geometry import mapping 14 | from openlocationcode import openlocationcode as olc 15 | 16 | # Global variable, that runs GPQ (https://github.com/planetlabs/gpq) after DuckDB writes the Parquet file. 17 | # This is necessary because DuckDB does not write the GeoParquet metadata (yet). Once DuckDB implements 18 | # this feature can be removed. Setting it to false will give a sense of how fast DuckDB will be, but 19 | # if you want to actually use the output GeoParquet files, set it to True. 20 | RUN_GPQ_CONVERSION = True 21 | 22 | # Global variable, that sets the compression type for the Parquet files. The two options that 23 | # will work for both DuckDB and pandas are 'snappy' and 'gzip'. 'snappy' is the default. You can 24 | # try out brotli with pandas, it seems to give the most compression. DuckDB additional supports 25 | # zstd, but pandas does not. Note that GPQ conversion on DuckDB output likely keeps the same 26 | # compression, but I have not tested this. GPQ conversion from Parquet does not yet support 27 | # the other GPQ compression options. 28 | PARQUET_COMPRESSION = 'snappy' 29 | 30 | # Don't run the DuckDB GPKG conversion if set to true, as it takes a long time, likely due to a bug. 31 | # It means longer runs and puts one big time on the graphs. 32 | SKIP_DUCK_GPKG = True 33 | 34 | @click.group() 35 | def cli(): 36 | pass 37 | 38 | 39 | def define_output_paths(input_file_path, output_directory, format): 40 | output_file_name = os.path.basename(input_file_path)[:-3] + format 41 | output_file_path = os.path.join(output_directory, output_file_name) 42 | # TODO: the -3 doesn't work with .parquet, leads to a weird file name, but duck doesn't care. 
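    # A suffix-safe sketch (not wired in) that avoids the hard-coded -3 slice by using os.path.splitext,
    # so parquet output and duckdb side-files get sensible names too:
    #   stem = os.path.splitext(os.path.basename(input_file_path))[0]
    #   output_file_path = os.path.join(output_directory, f"{stem}.{format}")
    #   duckdb_file_path = os.path.join(output_directory, f"{stem}.duckdb")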
43 | duckdb_file_path = output_file_path[:-3] + 'duckdb' 44 | return output_file_path, duckdb_file_path 45 | 46 | 47 | def remove_existing_files(output_file_path, duckdb_file_path, overwrite): 48 | if overwrite: 49 | if os.path.exists(output_file_path): 50 | os.remove(output_file_path) 51 | if os.path.exists(duckdb_file_path): 52 | os.remove(duckdb_file_path) 53 | 54 | 55 | def process_with_duckdb( 56 | input_file_path, 57 | duckdb_file_path, 58 | split_multipolygons, 59 | verbose, 60 | format, 61 | output_file_path, 62 | ): 63 | # new duckdb at input file path but with .duckdb 64 | conn = duckdb.connect(duckdb_file_path) 65 | c = conn.cursor() 66 | c.execute(f"install spatial;") 67 | c.execute(f"load spatial;") 68 | c.execute( 69 | f"create table buildings as (select * EXCLUDE (latitude, longitude) from '{input_file_path}');" 70 | ) 71 | 72 | if verbose: 73 | c.execute("SELECT COUNT(*) FROM buildings") 74 | print(f"Original rows: {c.fetchone()[0]}") 75 | 76 | if split_multipolygons: 77 | # Fetch the multipolygons 78 | c.execute("SELECT * FROM buildings WHERE geometry LIKE 'MULTIPOLYGON%'") 79 | results = c.fetchall() 80 | columns = [desc[0] for desc in c.description] 81 | 82 | multipolygon_count = 0 83 | 84 | # Process each multipolygon 85 | for row in results: 86 | multipolygon_count += 1 87 | row_dict = dict(zip(columns, row)) 88 | multipolygon = wkt.loads(row_dict['geometry']) 89 | 90 | if verbose: 91 | # Print the original MultiPolygon 92 | feature = { 93 | "type": "Feature", 94 | "properties": { 95 | k: v for k, v in row_dict.items() if k != 'geometry' 96 | }, 97 | "geometry": multipolygon.__geo_interface__, 98 | } 99 | print("Original MultiPolygon:") 100 | print(json.dumps(feature)) 101 | 102 | for polygon in multipolygon.geoms: 103 | # Convert the polygon to a GeoSeries in order to project it 104 | polygon_projected = gpd.GeoSeries([polygon], crs="EPSG:4326").to_crs( 105 | 'EPSG:6933' 106 | ) 107 | 108 | # Compute the new area (geopandas calculates area in square meters for projected CRS) 109 | new_area = polygon_projected.area.values[0] 110 | 111 | # Compute the centroid and encode it into a Plus Code 112 | centroid = polygon.centroid 113 | new_plus_code = olc.encode(centroid.y, centroid.x, codeLength=12) 114 | 115 | # Create new properties for the polygon 116 | properties = {k: v for k, v in row_dict.items() if k != 'geometry'} 117 | properties['area_in_meters'] = new_area 118 | properties['full_plus_code'] = new_plus_code 119 | 120 | if verbose: 121 | # Print the new Polygon 122 | feature = { 123 | "type": "Feature", 124 | "properties": properties, 125 | "geometry": polygon.__geo_interface__, 126 | } 127 | print("Component Polygon:") 128 | print(json.dumps(feature)) 129 | 130 | # Insert new polygon into buildings table 131 | columns_str = ', '.join( 132 | [f'"{k}"' for k in properties.keys()] + ['geometry'] 133 | ) 134 | values_str = ', '.join( 135 | [ 136 | f"'{v}'" if isinstance(v, str) else str(v) 137 | for v in properties.values() 138 | ] 139 | + [f"'{polygon.wkt}'"] 140 | ) 141 | c.execute( 142 | f"INSERT INTO buildings ({columns_str}) VALUES ({values_str})" 143 | ) 144 | 145 | if verbose: 146 | print(f"Processed {multipolygon_count} multipolygons.") 147 | 148 | # Delete the original multipolygons 149 | c.execute("DELETE FROM buildings WHERE geometry LIKE 'MULTIPOLYGON%'") 150 | 151 | if verbose: 152 | c.execute("SELECT COUNT(*) FROM buildings") 153 | print(f"Output rows: {c.fetchone()[0]}") 154 | 155 | c.execute("SELECT COUNT(*) FROM buildings WHERE geometry LIKE 
'MULTIPOLYGON%'") 156 | print(f"Output multipolygons: {c.fetchone()[0]}") 157 | 158 | c.execute("SELECT COUNT(*) FROM buildings WHERE geometry LIKE 'POLYGON%'") 159 | print(f"Output polygons: {c.fetchone()[0]}") 160 | 161 | if format == 'fgb': 162 | c.execute( 163 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 164 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'FlatGeobuf');" 165 | ) 166 | elif format == 'parquet': 167 | c.execute( 168 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 169 | TO '{output_file_path}' WITH (FORMAT PARQUET, COMPRESSION '{PARQUET_COMPRESSION}');" 170 | ) 171 | if RUN_GPQ_CONVERSION: 172 | print( 173 | f"Running gpq convert on {output_file_path}. This takes extra time but ensures the output is valid GeoParquet." 174 | ) 175 | base_name, ext = os.path.splitext(output_file_path) 176 | temp_output_file_path = base_name + '_temp' + ext 177 | 178 | # convert from parquet file with a geometry column named wkb to GeoParquet 179 | command = ['gpq', 'convert', output_file_path, temp_output_file_path] 180 | gpq_start_time = time.time() 181 | subprocess.run(command, check=True) 182 | os.rename(temp_output_file_path, output_file_path) 183 | gpq_end_time = time.time() 184 | gpq_elapsed_time = gpq_end_time - gpq_start_time 185 | print(f"Time taken to run gpq: {gpq_elapsed_time:.2f} seconds") 186 | else: 187 | print( 188 | f"Skipping gpq convert on {output_file_path}. This means the output will be WKB, but it will need to be converted to GeoParquet." 189 | ) 190 | elif format == 'gpkg': 191 | if SKIP_DUCK_GPKG: 192 | print( 193 | f"Skipping duckdb-gpkg conversion on {output_file_path}, since SKIP_DUCK_GPKG is set to True. There is likely a bug, since it takes way longer and skews the graphs" 194 | ) 195 | else: 196 | c.execute( 197 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 198 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'GPKG');" 199 | ) 200 | elif format == 'shp': 201 | c.execute( 202 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 203 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile');" 204 | ) 205 | 206 | conn.close() 207 | 208 | 209 | def process_with_pandas( 210 | input_file_path, split_multipolygons, verbose, format, output_file_path 211 | ): 212 | df = pd.read_csv(input_file_path) 213 | gs = gpd.GeoSeries.from_wkt(df['geometry']) 214 | 215 | # Drop the 'latitude', 'longitude' and 'geometry' columns 216 | df = df.drop(['latitude', 'longitude', 'geometry'], axis=1) 217 | 218 | # Convert the DataFrame to a GeoDataFrame 219 | gdf = gpd.GeoDataFrame(df, geometry=gs, crs="EPSG:4326") 220 | 221 | # Create an empty GeoDataFrame for the output 222 | output_gdf = gpd.GeoDataFrame(columns=list(gdf.columns), crs=gdf.crs) 223 | 224 | if split_multipolygons: 225 | multipolygons = gdf[gdf.geometry.type == 'MultiPolygon'] 226 | multipolygon_count = 0 227 | for i, row in multipolygons.iterrows(): 228 | multipolygon_count += 1 229 | # Print the original MultiPolygon 230 | feature = { 231 | "type": "Feature", 232 | "properties": row.drop('geometry').to_dict(), 233 | "geometry": row.geometry.__geo_interface__, 234 | } 235 | if verbose: 236 | print("Original MultiPolygon:") 237 | print(json.dumps(feature)) 238 | 239 | # Print each component Polygon 240 | for polygon in row.geometry.geoms: 241 | # Convert the polygon to a GeoSeries in order 
to project it 242 | polygon_projected = gpd.GeoSeries([polygon], crs=gdf.crs).to_crs( 243 | 'EPSG:6933' 244 | ) 245 | 246 | # Compute the new area (geopandas calculates area in square meters for projected CRS) 247 | new_area = polygon_projected.area.values[0] 248 | 249 | # Compute the centroid and encode it into a Plus Code 250 | centroid = polygon.centroid 251 | new_plus_code = olc.encode(centroid.y, centroid.x, codeLength=12) 252 | 253 | # Create new properties for the polygon 254 | properties = row.drop('geometry').to_dict() 255 | properties['area_in_meters'] = new_area 256 | properties['full_plus_code'] = new_plus_code 257 | 258 | # Append to the output GeoDataFrame 259 | output_gdf = pd.concat( 260 | [ 261 | output_gdf, 262 | gpd.GeoDataFrame([properties], geometry=[polygon], crs=gdf.crs), 263 | ], 264 | ignore_index=True, 265 | ) 266 | 267 | # Print the new Polygon 268 | feature = { 269 | "type": "Feature", 270 | "properties": properties, 271 | "geometry": polygon.__geo_interface__, 272 | } 273 | if verbose: 274 | print("Component Polygon:") 275 | print(json.dumps(feature)) 276 | 277 | print(f"Processed {multipolygon_count} multipolygons.") 278 | # Add the original Polygons to the output 279 | polygons = gdf[gdf.geometry.type == 'Polygon'] 280 | output_gdf = pd.concat([output_gdf, polygons], ignore_index=True) 281 | else: 282 | output_gdf = gdf 283 | 284 | if verbose: 285 | # Print the number of original rows in the datafram, and the number of rows in the output 286 | print(f"Original rows: {len(gdf)}") 287 | print(f"Output rows: {len(output_gdf)}") 288 | # Print number of multipolygons and polygons for the output_gdf 289 | print( 290 | f"Output multipolygons: {len(output_gdf[output_gdf.geometry.type == 'MultiPolygon'])}" 291 | ) 292 | print( 293 | f"Output polygons: {len(output_gdf[output_gdf.geometry.type == 'Polygon'])}" 294 | ) 295 | # Write the output GeoDataFrame to a file 296 | if format == 'fgb': 297 | output_gdf.to_file(output_file_path, driver="FlatGeobuf", engine="pyogrio") 298 | elif format == 'parquet': 299 | output_gdf.to_parquet(output_file_path, compression=PARQUET_COMPRESSION) 300 | elif format == 'gpkg': 301 | output_gdf.to_file( 302 | output_file_path, driver='GPKG', engine="pyogrio", spatial_index=False 303 | ) 304 | elif format == 'shp': 305 | output_gdf.to_file(output_file_path, driver='ESRI Shapefile', engine="pyogrio") 306 | 307 | 308 | def process_with_ogr2ogr( 309 | input_file_path, split_multipolygons, verbose, format, output_file_path 310 | ): 311 | # Define the SQL query to select specific columns 312 | table_name = os.path.splitext(os.path.basename(input_file_path))[0] 313 | 314 | if format == 'fgb': 315 | format_string = "FlatGeobuf" 316 | elif format == 'parquet': 317 | format_string = "Parquet" 318 | elif format == 'gpkg': 319 | format_string = "GPKG" 320 | elif format == 'shp': 321 | format_string = "ESRI Shapefile" 322 | 323 | fields_to_keep = ['confidence', 'area_in_meters', 'full_plus_code'] 324 | 325 | # Define the ogr2ogr command 326 | cmd = [ 327 | 'ogr2ogr', 328 | '-f', 329 | format_string, 330 | '-select', 331 | ','.join(fields_to_keep), 332 | output_file_path, 333 | input_file_path, 334 | '-oo', 335 | 'GEOM_POSSIBLE_NAMES=geometry', 336 | '-a_srs', 337 | 'EPSG:4326', 338 | ] 339 | 340 | # If split_multipolygons is True, print a message and return. 341 | # But skip this if the output format is Shapefile, because shapefiles don't have a difference between polygons and multipolygons. 
342 | if split_multipolygons and format != 'shp': 343 | print("OGR processing doesn't yet support multi polygons") 344 | return 345 | 346 | # print the ogr2ogr command that will be run 347 | if verbose: 348 | print("ogr2ogr command:") 349 | print(' '.join(cmd)) 350 | 351 | # Run the command 352 | subprocess.run(cmd, check=True) 353 | 354 | if verbose: 355 | print(f"Converted {input_file_path} to {output_file_path} using ogr2ogr.") 356 | 357 | 358 | def process_csv_file( 359 | input_file_path, 360 | output_directory, 361 | format, 362 | overwrite, 363 | process, 364 | split_multipolygons, 365 | verbose, 366 | ): 367 | output_file_path, duckdb_file_path = define_output_paths( 368 | input_file_path, output_directory, format 369 | ) 370 | remove_existing_files(output_file_path, duckdb_file_path, overwrite) 371 | 372 | if os.path.exists(output_file_path): 373 | print(f'Skipping {input_file_path} as {output_file_path} already exists.') 374 | return 375 | else: 376 | print( 377 | f'Started converting {input_file_path} with {process} to {format} at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}...' 378 | ) 379 | 380 | start_time = time.time() 381 | 382 | if process == 'duckdb': 383 | process_with_duckdb( 384 | input_file_path, 385 | duckdb_file_path, 386 | split_multipolygons, 387 | verbose, 388 | format, 389 | output_file_path, 390 | ) 391 | elif process == 'pandas': 392 | process_with_pandas( 393 | input_file_path, split_multipolygons, verbose, format, output_file_path 394 | ) 395 | elif process == 'ogr': 396 | process_with_ogr2ogr( 397 | input_file_path, split_multipolygons, verbose, format, output_file_path 398 | ) 399 | 400 | execution_time = time.time() - start_time 401 | print( 402 | f'Finished processing {output_file_path} at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}. 
Execution time: {str(timedelta(seconds=execution_time))}' 403 | ) 404 | 405 | 406 | def process_geometries( 407 | input_path, 408 | output_directory, 409 | format, 410 | overwrite, 411 | process, 412 | split_multipolygons, 413 | verbose, 414 | ): 415 | # Check if the provided path is a directory or a file 416 | if os.path.isdir(input_path): 417 | # List all csv files in the directory 418 | csv_files = glob.glob(os.path.join(input_path, '*.csv')) 419 | 420 | # Sort files by size in ascending order 421 | csv_files.sort(key=lambda x: os.path.getsize(x)) 422 | 423 | # Process each csv file 424 | for input_file_path in csv_files: 425 | process_csv_file( 426 | input_file_path, 427 | output_directory, 428 | format, 429 | overwrite, 430 | process, 431 | split_multipolygons, 432 | verbose, 433 | ) 434 | elif os.path.isfile(input_path) and input_path.endswith('.csv'): 435 | # Process the single csv file 436 | process_csv_file( 437 | input_path, 438 | output_directory, 439 | format, 440 | overwrite, 441 | process, 442 | split_multipolygons, 443 | verbose, 444 | ) 445 | else: 446 | raise ValueError(f"Invalid input path: {input_path}") 447 | 448 | 449 | def process_benchmark( 450 | input_path, output_directory, processes, formats, split_multipolygons, verbose 451 | ): 452 | results = [] 453 | for process in processes: 454 | for format in formats: 455 | start_time = time.time() 456 | process_geometries( 457 | input_path, 458 | output_directory, 459 | format, 460 | True, 461 | process, 462 | split_multipolygons, 463 | verbose, 464 | ) 465 | execution_time = time.time() - start_time 466 | if process == 'duckdb' and format == 'gpkg' and SKIP_DUCK_GPKG: 467 | execution_time = 0 468 | results.append( 469 | { 470 | 'process': process, 471 | 'format': format, 472 | #'execution_time': str(timedelta(seconds=execution_time)), 473 | 'execution_time': execution_time, 474 | } 475 | ) 476 | return results 477 | 478 | if __name__ == "__main__": 479 | cli() 480 | -------------------------------------------------------------------------------- /open_buildings/google/stac-geoparquet.py: -------------------------------------------------------------------------------- 1 | # WARNING - Work in progress 2 | # This isn't working yet, but it's close. The main issue is that the catalog 3 | # and collections aren't getting formed right - I want them in the hive partitions, but 4 | # pystac keeps trying to move them in the recommended STAC structure. Committing in case 5 | # its useful. 6 | # Next approach may just be to form the items individually, as that part seems to be fine, 7 | # and then place them in the catalog and collection manually (maybe pystac can help, but 8 | # may be easier to just use python to adjust the links) 9 | 10 | 11 | import os 12 | import pystac 13 | from pystac import Catalog, Collection, Item, Asset, CatalogType 14 | import geopandas as gpd 15 | from datetime import datetime 16 | import click 17 | from shapely.geometry import box 18 | from dateutil.parser import parse 19 | 20 | def read_geoparquet_bounds(filepath): 21 | """ 22 | Reads a Geoparquet file and returns its bounds and EPSG. 
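    Returns a (bounds, epsg) tuple, where bounds is [minx, miny, maxx, maxy] from GeoDataFrame.total_bounds
    and epsg is the integer code of the file's CRS, e.g. ([-70.07, 12.41, -69.86, 12.64], 4326) for an
    illustrative file in EPSG:4326.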
23 | """ 24 | gdf = gpd.read_parquet(filepath) 25 | bounds = gdf.total_bounds.tolist() 26 | epsg = gdf.crs.to_epsg() # Extract the EPSG code 27 | return bounds, epsg 28 | 29 | def create_stac_item_for_geoparquet(filepath, collection, item_datetime): 30 | filename = os.path.basename(filepath) 31 | file_id, _ = os.path.splitext(filename) 32 | title = filename 33 | 34 | # Get the bounds and CRS 35 | bbox, epsg = read_geoparquet_bounds(filepath) 36 | 37 | # Use the bounds as the geometry too 38 | geometry = box(*bbox).__geo_interface__ 39 | 40 | item = Item(id=file_id, 41 | geometry=geometry, 42 | bbox=bbox, 43 | datetime=item_datetime, 44 | properties={'title': title, 'proj:epsg': epsg}, 45 | collection=collection.id) 46 | 47 | pystac.extensions.projection.ProjectionExtension.add_to(item) 48 | item.add_asset(key="data", asset=Asset(href=filepath, media_type="application/parquet")) 49 | 50 | return item 51 | 52 | @click.command() 53 | @click.argument('directory', type=click.Path(exists=True)) 54 | @click.option('--collection-path', default='collection.json', help='Path to the collection.json file relative to the directory.') 55 | @click.option('--item-datetime', default='2023-05-30T00:00:00Z', help='Datetime for the STAC items.') 56 | @click.option('--catalog-type', type=click.Choice(['SELF_CONTAINED', 'ABSOLUTE_PUBLISHED'], case_sensitive=False), default='SELF_CONTAINED', help='Type of the catalog.') 57 | @click.option('--root-path', default=None, help='Root path for the catalog. Relevant for ABSOLUTE_PUBLISHED catalog type.') 58 | # ... [other necessary imports and functions] 59 | 60 | def main(directory, collection_path, item_datetime, catalog_type, root_path): 61 | catalog_id = 'my-catalog' 62 | catalog_description = 'A catalog of geoparquet files.' 
63 | item_datetime = parse(item_datetime) 64 | collection = Collection.from_file(collection_path) 65 | 66 | # Create the catalog first 67 | catalog = Catalog(id=catalog_id, description=catalog_description, catalog_type=CatalogType[catalog_type]) 68 | 69 | items = [] 70 | for root, _, files in os.walk(directory): 71 | for filename in files: 72 | if filename.endswith(".parquet"): 73 | filepath = os.path.join(root, filename) 74 | item = create_stac_item_for_geoparquet(filepath, collection, item_datetime) 75 | 76 | # Save the item alongside the parquet file 77 | item_path = os.path.join(root, f"{item.id}.json") 78 | item.set_self_href(item_path) 79 | item.save_object() 80 | items.append(item) 81 | 82 | # Create and save the catalog 83 | catalog_path = os.path.join(directory, 'catalog.json') 84 | catalog.set_self_href(catalog_path) 85 | catalog.save_object() 86 | 87 | # Reload the catalog from file 88 | catalog = Catalog.from_file(catalog_path) 89 | 90 | # Add items to the catalog 91 | for item in items: 92 | catalog.add_item(item) 93 | item.add_link(pystac.Link("parent", os.path.relpath(catalog.get_self_href(), os.path.dirname(item.get_self_href())))) 94 | 95 | # Save the updated catalog 96 | catalog.save_object() 97 | 98 | # Load the collection and set its links 99 | collection_path_new = os.path.join(directory, "collection.json") 100 | collection.set_self_href(collection_path_new) 101 | collection.add_child(catalog) 102 | collection.save_object() 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /open_buildings/overture/__init.py__: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /open_buildings/overture/add_columns.py: -------------------------------------------------------------------------------- 1 | # This script is used to take an Overture Parquet file and add columns 2 | # useful for partitioning - it can put in both a quadkey and the country 3 | # ISO code. And then it will write out parquet and use gpq to convert the 4 | # parquet to geoparquet. 
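# --- Editor's illustrative sketch (added commentary, not part of the original
# module): what the quadkey column added below contains. mercantile maps a
# lon/lat pair to a web-mercator tile at a given zoom level, and that tile's
# quadkey is a string with one digit per level, so level 12 yields a
# 12-character key. The coordinates are arbitrary example values.
import mercantile as _mercantile_demo
_demo_tile = _mercantile_demo.tile(-122.4194, 37.7749, 12)  # (lon, lat, zoom)
assert len(_mercantile_demo.quadkey(_demo_tile)) == 12
# -----------------------------------------------------------------------------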
5 | 6 | 7 | import glob 8 | import os 9 | import shutil 10 | import subprocess 11 | import tempfile 12 | import time 13 | 14 | import duckdb 15 | import mercantile 16 | from duckdb.typing import * 17 | 18 | 19 | def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR: 20 | # Convert latitude and longitude to tile using mercantile 21 | tile = mercantile.tile(lon, lat, level) 22 | 23 | # Convert the tile to a quadkey 24 | quadKey = mercantile.quadkey(tile) 25 | return quadKey 26 | 27 | def midpoint(minval: DOUBLE, maxval: DOUBLE) -> DOUBLE: 28 | return (minval + maxval) / 2.0 29 | 30 | def add_quadkey(con): 31 | 32 | # Register Python UDFs 33 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [DOUBLE, DOUBLE, INTEGER], VARCHAR) 34 | con.create_function('midpoint', midpoint, [DOUBLE, DOUBLE], DOUBLE) 35 | 36 | # Add a quadkey column to the table if it doesn't exist 37 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 38 | 39 | # Update the quadkey column 40 | con.execute(""" 41 | UPDATE buildings 42 | SET quadkey = lat_lon_to_quadkey( 43 | midpoint(bbox.miny, bbox.maxy), 44 | midpoint(bbox.minx, bbox.maxx), 45 | 12 46 | ); 47 | """) 48 | 49 | 50 | def add_country_iso(con, country_parquet_path): 51 | # Load country parquet file into duckdb 52 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 53 | 54 | # Add a country_iso column to the buildings table 55 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 56 | 57 | # Update the country_iso column in the buildings table 58 | con.execute(""" 59 | UPDATE buildings 60 | SET country_iso = countries.isocountrycodealpha2 61 | FROM countries 62 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(buildings.geometry)) 63 | """) 64 | 65 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False, verbose=False): 66 | # Ensure output_folder exists 67 | os.makedirs(output_folder, exist_ok=True) 68 | 69 | # Get unique identifier from file name 70 | unique_id = os.path.basename(input_parquet_path).split('_')[-1] 71 | 72 | # Define output paths 73 | output_db_path = os.path.join(output_folder, f'{unique_id}.duckdb') 74 | output_parquet_path = os.path.join(output_folder, f'{unique_id}.parquet') 75 | 76 | # Check if output files exist 77 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 78 | print(f'Files with ID {unique_id} already exist. Skipping...') 79 | return 80 | 81 | # Overwrite mode: remove existing files 82 | if overwrite: 83 | for file_path in [output_db_path, output_parquet_path]: 84 | if os.path.exists(file_path): 85 | os.remove(file_path) 86 | timestamp = time.time() 87 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 88 | 89 | # Connect to DuckDB 90 | con = duckdb.connect(output_db_path) 91 | 92 | con.execute('LOAD spatial;') 93 | 94 | # NOTE: exclude names column because it's all NULL and causes InternalException: INTERNAL Error: Attempted to dereference unique_ptr that is NULL! 
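    # Editor's note (added commentary): `SELECT * EXCLUDE(names)` is DuckDB's
    # star-modifier syntax for selecting every column except `names`, which is
    # how the problematic all-NULL column described above gets dropped here.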
95 | con.execute(f"CREATE OR REPLACE TABLE buildings AS SELECT * EXCLUDE(names) FROM read_parquet('{input_parquet_path}')") 96 | 97 | if add_quadkey_option: 98 | add_quadkey(con) 99 | 100 | if add_country_iso_option: 101 | add_country_iso(con, country_parquet_path) 102 | 103 | # Write out to Parquet 104 | con.execute(f"COPY (SELECT * FROM buildings ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 105 | 106 | #TODO: turn this into an option to convert to geoparquet or not 107 | if (True): 108 | print(f"Converting to geoparquet: {output_parquet_path}") 109 | # Create a temporary file 110 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 111 | temp_file.close() # Close the file so gpq can open it 112 | 113 | # Convert the Parquet file to a GeoParquet file using gpq 114 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 115 | subprocess.run(gpq_cmd, check=True) 116 | 117 | # Rename the temp file to the final filename 118 | shutil.move(temp_file.name, f'{output_parquet_path}') 119 | #os.rename(temp_file.name, f'{output_parquet_path}') 120 | 121 | print(f"Processing complete for file {input_parquet_path}") 122 | 123 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False, verbose=False): 124 | # If input_path is a directory, process all Parquet files in it 125 | if os.path.isdir(input_path): 126 | for file in glob.glob(os.path.join(input_path, "*")): 127 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose) 128 | else: 129 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose) 130 | 131 | # Call the function - uncomment if you want to call this directly from python and put values in here. 132 | # OVERTURE_DIR = pathlib.Path('~/data/src/overture/2024-02-15-alpha.0').expanduser() 133 | # OUT_DIR = pathlib.Path('~/data/prc/overture/2024-02-15') 134 | # ADMIN_BOUNDARIES_LEVEL_1_FP = pathlib.Path("~/data/prc/overture/2024-02-15/admin_boundaries_level_1.parquet") 135 | 136 | # process_parquet_files(str(OVERTURE_DIR), str(OUT_DIR), str(ADMIN_BOUNDARIES_LEVEL_1_FP), overwrite=False, add_quadkey_option=True, add_country_iso_option=False) 137 | 138 | -------------------------------------------------------------------------------- /open_buildings/overture/partition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script takes a DuckDB database with a buildings table and converts it to GeoParquet 3 | files partitioned on first country and then quadkey. The buildings table must have a 4 | country_iso field and quadkey field, populated by overture-buildings-parquet-add-columns.py. 5 | The main function is process_db(), and it will take as input a maximum number of rows per 6 | file and a row group size for the Parquet files. It will then iterate through the countries 7 | in the database and partition the buildings table into GeoParquet files for each country. 8 | If the number of rows for a country is greater than the maximum number of rows per file, 9 | it will partition the country into quadkeys and create GeoParquet files for each quadkey. 10 | Those quadkeys will be further partitioned if necessary until the number of rows for a 11 | quadkey is less than or equal to the maximum number of rows per file. 
12 | """ 13 | 14 | import duckdb 15 | import datetime 16 | import subprocess 17 | import tempfile 18 | import os 19 | import click 20 | import shutil 21 | import geopandas as gpd 22 | from shapely import wkb 23 | import pandas as pd 24 | import time 25 | 26 | def current_time_str(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 28 | 29 | def print_verbose(msg, verbose): 30 | if verbose: 31 | print(f"[{current_time_str()}] {msg}") 32 | 33 | def convert_gpq(input_filename, row_group_size, verbose): 34 | print_verbose(f"Starting conversion for {input_filename} using gpq (row_group_size ignored).", verbose) 35 | 36 | # Create a temporary file 37 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 38 | temp_file.close() # Close the file so gpq can open it 39 | 40 | # Convert the Parquet file to a GeoParquet file using gpq 41 | gpq_cmd = ['gpq', 'convert', input_filename, temp_file.name] 42 | subprocess.run(gpq_cmd, check=True) 43 | 44 | print_verbose(f"Conversion for {input_filename} using gpq finished.", verbose) 45 | 46 | # Rename (move) the temp file to the final filename 47 | shutil.move(temp_file.name, input_filename) 48 | 49 | # Delete the initial temp file if it still exists 50 | #initial_temp_filename = f'{country_code}_temp.parquet' 51 | #if os.path.exists(initial_temp_filename): 52 | # os.remove(initial_temp_filename) 53 | 54 | def convert_pandas(input_filename, rg_size, verbose): 55 | # Placeholder function to be fleshed out 56 | print_verbose("Starting conversion using pandas.", verbose) 57 | try: 58 | df = pd.read_parquet(input_filename) 59 | 60 | # Convert WKB geometry to geopandas geometry 61 | df['geometry'] = df['geometry'].apply(wkb.loads, hex=True) 62 | gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") 63 | # Change output file the input_filename with .parquet replaced with _geo.parquet 64 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 65 | 66 | gdf.to_parquet(output_filename, row_group_size=rg_size) 67 | # delete the original file 68 | os.remove(input_filename) 69 | # Rename (move) the output file to the input filename 70 | shutil.move(output_filename, input_filename) 71 | print(f"Finished processing {input_filename} at {time.ctime()}") 72 | except Exception as e: 73 | print(f"Error processing {input_filename}: {e}") 74 | 75 | # Note, this doesn't work, but I'm not sure why. May be that ogr doesn't really support 76 | # compatible geospatial parquet, but it really looks like it should. Maybe there's something 77 | # weird with the ones written out. 
78 | def convert_ogr(input_filename, rg_size, verbose): 79 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 80 | rg_cmd = f"ROW_GROUP_SIZE={rg_size}" 81 | cmd = [ 82 | 'ogr2ogr', 83 | '-f', 84 | 'Parquet', 85 | output_filename, 86 | input_filename, 87 | # '-oo', 88 | # rg_cmd, 89 | '-oo', 90 | 'GEOM_POSSIBLE_NAMES=geometry', ] 91 | 92 | # print the ogr2ogr command that will be run 93 | if verbose: 94 | print("ogr2ogr command:") 95 | print(' '.join(cmd)) 96 | 97 | # Run the command 98 | subprocess.run(cmd, check=True) 99 | 100 | # delete the original file 101 | os.remove(input_filename) 102 | # Rename (move) the output file to the input filename 103 | shutil.move(output_filename, input_filename) 104 | print(f"Finished processing {input_filename} at {time.ctime()}") 105 | 106 | if verbose: 107 | print(f"Converted {input_filename} to {output_filename} using ogr2ogr.") 108 | 109 | 110 | 111 | def fetch_quadkeys(conn, table_name, country_code, length, verbose, prev_qk=""): 112 | query = f"SELECT DISTINCT SUBSTR(quadkey, 1, {length}) FROM {table_name} WHERE country_iso = '{country_code}'" 113 | if prev_qk: 114 | query += f" AND SUBSTR(quadkey, 1, {len(prev_qk)}) = '{prev_qk}'" 115 | print_verbose(f'Executing: {query}', verbose) 116 | return conn.execute(query).fetchall() 117 | 118 | def convert_to_geoparquet(parquet_path, geo_conversion, row_group_size, verbose): 119 | if geo_conversion == 'gpq': 120 | convert_gpq(parquet_path, row_group_size, verbose) 121 | print_verbose(f"File: {parquet_path} written with gpq", verbose) 122 | elif geo_conversion == 'pandas': 123 | convert_pandas(parquet_path, row_group_size, verbose) 124 | print_verbose(f"File: {parquet_path} written with pandas", verbose) 125 | elif geo_conversion == 'ogr': 126 | convert_ogr(parquet_path, row_group_size, verbose) 127 | print_verbose(f"File: {parquet_path} written with ogr", verbose) 128 | else: 129 | print_verbose(f"File: {parquet_path} written without converting to GeoParquet", verbose) 130 | 131 | #TODO: go all the way into the quad to find the smallest quadkey that contains less than max_per_file rows 132 | def process_quadkey_recursive(conn, table_name, country_code, output_folder, length, geo_conversion, row_group_size, verbose, max_per_file, current_qk=""): 133 | distinct_quadkeys = fetch_quadkeys(conn, table_name, country_code, length, verbose, current_qk) 134 | print_verbose(f"The list of quadkeys for country {country_code} and length {length} is {distinct_quadkeys}", verbose) 135 | #num_distinct_qk = len(distinct_quadkeys) 136 | for qk in distinct_quadkeys: 137 | qk_str = qk[0] 138 | qk_count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}'" 139 | print_verbose(f'Executing: {qk_count_query}', verbose) 140 | qk_count = conn.execute(qk_count_query).fetchone()[0] 141 | print_verbose(f"Quadkey {qk_str} has {qk_count} rows", verbose) 142 | if qk_count > max_per_file: 143 | process_quadkey_recursive(conn, table_name, country_code, output_folder, length + 1, geo_conversion, row_group_size, verbose, max_per_file, qk_str) 144 | else: 145 | quad_output_filename = os.path.join(output_folder, f'{country_code}_{qk_str}.parquet') 146 | if os.path.exists(quad_output_filename): 147 | print_verbose(f"Output file {quad_output_filename} already exists, skipping...", verbose) 148 | else: 149 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}' ORDER BY quadkey) 
TO '{quad_output_filename}' WITH (FORMAT PARQUET);" 150 | print_verbose(f'Executing: {copy_cmd}', verbose) 151 | conn.execute(copy_cmd) 152 | convert_to_geoparquet(quad_output_filename, geo_conversion, row_group_size, verbose) 153 | 154 | 155 | def process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name): 156 | # create output folder if it does not exist 157 | os.makedirs(output_folder, exist_ok=True) 158 | conn = duckdb.connect(duckdb_path) 159 | conn.execute('LOAD spatial;') 160 | cursor = conn.execute(f'SELECT DISTINCT country_iso FROM {table_name}') 161 | countries = cursor.fetchall() 162 | 163 | print_verbose(f'Found {len(countries)} unique countries', verbose) 164 | #countries.reverse() 165 | for country in countries: 166 | country_code = country[0] 167 | write_folder = output_folder 168 | if (hive): 169 | write_folder = os.path.join(output_folder, f'country_iso={country_code}') 170 | os.makedirs(write_folder, exist_ok=True) 171 | output_filename = os.path.join(write_folder, f'{country_code}.parquet') 172 | if os.path.exists(output_filename): 173 | print_verbose(f"Output file for country {country_code} already exists, skipping...", verbose) 174 | continue 175 | 176 | count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}'" 177 | print_verbose(f'Executing: {count_query}', verbose) 178 | count = conn.execute(count_query).fetchone()[0] 179 | print_verbose(f"Country {country_code} has {count} rows", verbose) 180 | 181 | if count <= max_per_file: 182 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' ORDER BY quadkey) TO '{output_filename}' WITH (FORMAT PARQUET);" 183 | print_verbose(f'Executing: {copy_cmd}', verbose) 184 | conn.execute(copy_cmd) 185 | convert_to_geoparquet(output_filename, geo_conversion, row_group_size, verbose) 186 | else: 187 | process_quadkey_recursive(conn, table_name, country_code, output_folder, 1, geo_conversion, row_group_size, verbose, max_per_file) 188 | 189 | if __name__ == "__main__": 190 | process_db() -------------------------------------------------------------------------------- /open_buildings/overture/places_add_columns.py: -------------------------------------------------------------------------------- 1 | # This script is used to take an Overture Parquet file and add columns 2 | # useful for partitioning - it can put in both a quadkey and the country 3 | # ISO code. And then it will write out parquet and use gpq to convert the 4 | # parquet to geoparquet. 5 | # 6 | # There is much more to do, my plan is to incorporate it into the open_buildings 7 | # CLI and let people pick which of the columns they want to add. Also could 8 | # be nice to add the ability to get the data downloaded - this just assumes 9 | # you've already got it. Also need to add the command to create the 10 | # countries.parquet, but it's basically the one in https://github.com/OvertureMaps/data/blob/main/duckdb_queries/admins.sql 11 | # but saved to parquet. You also could just use that command to pull it 12 | # directly into your duckdb database, and change this code (perhaps we 13 | # add an option to pull it remote if not present). This also would 14 | # ideally work with any of the Overture data types, and let you choose 15 | # your table names. 
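# --- Editor's sketch (hedged, not part of the original file) ------------------
# The note above says countries.parquet is "basically" the admins query from
# https://github.com/OvertureMaps/data/blob/main/duckdb_queries/admins.sql but
# saved to Parquet. With DuckDB that wrapping step would look roughly like the
# line below, where <admins query> stands in for the SELECT from that file
# (not reproduced here) and the output path is an arbitrary example:
#
#   con.execute("COPY (<admins query>) TO 'countries.parquet' WITH (FORMAT PARQUET)")
# ------------------------------------------------------------------------------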
16 | import os 17 | import duckdb 18 | import time 19 | import tempfile 20 | import subprocess 21 | import glob 22 | from duckdb.typing import * 23 | import mercantile 24 | import shutil 25 | 26 | def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR: 27 | # Convert latitude and longitude to tile using mercantile 28 | tile = mercantile.tile(lon, lat, level) 29 | 30 | # Convert the tile to a quadkey 31 | quadKey = mercantile.quadkey(tile) 32 | return quadKey 33 | 34 | def add_quadkey(con): 35 | 36 | # Register Python UDFs 37 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [DOUBLE, DOUBLE, INTEGER], VARCHAR) 38 | 39 | # Add a quadkey column to the table if it doesn't exist 40 | con.execute("ALTER TABLE places ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 41 | 42 | # Update the quadkey column 43 | # (no need to use midpoint as places is just points, so maxy and miny are the same) 44 | con.execute(""" 45 | UPDATE places 46 | SET quadkey = lat_lon_to_quadkey( 47 | bbox.maxy, 48 | bbox.maxx, 49 | 12 50 | ); 51 | """) 52 | 53 | def add_country_iso(con, country_parquet_path): 54 | # Load country parquet file into duckdb 55 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 56 | 57 | # Add a country_iso column to the buildings table 58 | con.execute("ALTER TABLE places ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 59 | 60 | # Update the country_iso column in the buildings table 61 | con.execute(""" 62 | UPDATE places 63 | SET country_iso = countries.isocountrycodealpha2 64 | FROM countries 65 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(places.geometry)) 66 | """) 67 | 68 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 69 | # Ensure output_folder exists 70 | os.makedirs(output_folder, exist_ok=True) 71 | 72 | # Get unique identifier from file name 73 | unique_id = os.path.basename(input_parquet_path).split('_')[-1] 74 | 75 | # Define output paths 76 | output_db_path = os.path.join(output_folder, f'{unique_id}.duckdb') 77 | output_parquet_path = os.path.join(output_folder, f'{unique_id}.parquet') 78 | 79 | # Check if output files exist 80 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 81 | print(f'Files with ID {unique_id} already exist. 
Skipping...') 82 | return 83 | 84 | # Overwrite mode: remove existing files 85 | if overwrite: 86 | for file_path in [output_db_path, output_parquet_path]: 87 | if os.path.exists(file_path): 88 | os.remove(file_path) 89 | timestamp = time.time() 90 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 91 | 92 | # Connect to DuckDB 93 | con = duckdb.connect(output_db_path) 94 | 95 | con.execute('LOAD spatial;') 96 | 97 | # Load parquet file into duckdb 98 | con.execute(f"CREATE TABLE places AS SELECT * FROM read_parquet('{input_parquet_path}')") 99 | 100 | if add_quadkey_option: 101 | add_quadkey(con) 102 | 103 | if add_country_iso_option: 104 | add_country_iso(con, country_parquet_path) 105 | 106 | # Write out to Parquet 107 | con.execute(f"COPY (SELECT * FROM places ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 108 | 109 | if (True): 110 | print(f"Converting to geoparquet: {output_parquet_path}") 111 | # Create a temporary file 112 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 113 | temp_file.close() # Close the file so gpq can open it 114 | 115 | # Convert the Parquet file to a GeoParquet file using gpq 116 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 117 | subprocess.run(gpq_cmd, check=True) 118 | 119 | # Rename the temp file to the final filename 120 | shutil.move(temp_file.name, f'{output_parquet_path}') 121 | #os.rename(temp_file.name, f'{output_parquet_path}') 122 | 123 | print(f"Processing complete for file {input_parquet_path}") 124 | 125 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 126 | # If output_folder doesn't exist, create it 127 | os.makedirs(output_folder, exist_ok=True) 128 | # If input_path is a directory, process all Parquet files in it 129 | if os.path.isdir(input_path): 130 | for file in glob.glob(os.path.join(input_path, "*")): 131 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 132 | else: 133 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 134 | 135 | # Call the function 136 | input_path = '/Volumes/fastdata/overture/s3-data/places/' 137 | output_folder = '/Volumes/fastdata/overture/refined-places-geoparquet/' 138 | country_parquet_path = '/Volumes/fastdata/overture/countries.parquet' 139 | process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True) -------------------------------------------------------------------------------- /open_buildings/settings.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict 3 | from pydantic import BaseModel 4 | 5 | class Source(Enum): 6 | GOOGLE = 1 7 | OVERTURE = 2 8 | 9 | class Format(Enum): 10 | SHAPEFILE = 1 11 | GEOJSON = 2 12 | GEOPACKAGE = 3 13 | FLATGEOBUF = 4 14 | PARQUET = 5 15 | 16 | 17 | class SourceSettings(BaseModel): 18 | base_url: str 19 | hive_partitioning: bool 20 | 21 | class SettingsSchema(BaseModel): 22 | sources: Dict[Source, SourceSettings] 23 | extensions: Dict[Format, str] 24 | 25 | settings = SettingsSchema( 26 | sources={ 27 | Source.GOOGLE: SourceSettings( 28 | base_url="s3://us-west-2.opendata.source.coop/google-research-open-buildings/geoparquet-by-country/*/*.parquet", 29 
| hive_partitioning=True 30 | ), 31 | Source.OVERTURE: SourceSettings( 32 | base_url="s3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-hive/*/*.parquet", 33 | hive_partitioning=True 34 | ) 35 | }, 36 | extensions={ 37 | Format.SHAPEFILE: 'shp', 38 | Format.GEOJSON: 'json', 39 | Format.GEOPACKAGE: 'gpkg', 40 | Format.FLATGEOBUF: 'fgb', 41 | Format.PARQUET: 'parquet' 42 | } 43 | ) 44 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | integration: marks tests as integration tests that span network and DB I/O (deselect with '-m "not integration"') -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | duckdb 3 | pandas 4 | geopandas 5 | pyogrio 6 | osmnx 7 | shapely 8 | openlocationcode 9 | tabulate 10 | leafmap 11 | boto3 12 | mercantile 13 | pydantic==2.4.2 -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | black[jupyter] 3 | pip 4 | bump2version 5 | wheel 6 | watchdog 7 | flake8 8 | tox 9 | coverage 10 | Sphinx 11 | twine 12 | Click 13 | codespell 14 | pydantic==2.4.2 15 | pytest==7.4.2 16 | pytest-rerunfailures==12.0 -------------------------------------------------------------------------------- /requirements_docs.txt: -------------------------------------------------------------------------------- 1 | bump2version 2 | coverage 3 | flake8 4 | ipykernel 5 | livereload 6 | nbconvert 7 | nbformat 8 | pip 9 | sphinx 10 | tox 11 | twine 12 | watchdog 13 | wheel 14 | mkdocs 15 | mkdocs-git-revision-date-plugin 16 | mkdocs-git-revision-date-localized-plugin 17 | mkdocs-jupyter>=0.24.0 18 | mkdocs-material>=9.1.3 19 | mkdocs-pdf-export-plugin 20 | mkdocstrings 21 | mkdocstrings-crystal 22 | mkdocstrings-python-legacy 23 | pygments 24 | pymdown-extensions -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.10.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:open_buildings/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4 | 5 | import io 6 | from os import path as op 7 | from setuptools import setup, find_packages 8 | 9 | with open('README.md', encoding="utf-8") as readme_file: 10 | readme = readme_file.read() 11 | 12 | here = op.abspath(op.dirname(__file__)) 13 | 14 | # get the dependencies and installs 15 | with io.open(op.join(here, "requirements.txt"), encoding="utf-8") as f: 16 | all_reqs = f.read().split("\n") 17 | 18 | install_requires = [x.strip() for x in all_reqs if "git+" not in x] 19 | dependency_links = [x.strip().replace("git+", "") for x in 
all_reqs if "git+" not in x] 20 | 21 | setup_requirements = [] 22 | 23 | test_requirements = ["codespell==2.2.6", "pytest==7.4.2", "pytest-rerunfailures==12.0", "pytest-xdist==3.3.1"] 24 | 25 | setup( 26 | author="Chris Holmes", 27 | author_email='cholmes@9eo.org', 28 | python_requires='>=3.8', 29 | classifiers=[ 30 | 'Intended Audience :: Developers', 31 | 'License :: OSI Approved :: Apache Software License', 32 | 'Natural Language :: English', 33 | 'Programming Language :: Python :: 3', 34 | 'Programming Language :: Python :: 3.8', 35 | 'Programming Language :: Python :: 3.9', 36 | 'Programming Language :: Python :: 3.10', 37 | 'Programming Language :: Python :: 3.11', 38 | ], 39 | description="Tools for working with open building datasets", 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'ob=open_buildings.cli:main', 43 | ], 44 | }, 45 | install_requires=install_requires, 46 | dependency_links=dependency_links, 47 | license="Apache Software License 2.0", 48 | long_description=readme, 49 | long_description_content_type='text/markdown', 50 | include_package_data=True, 51 | keywords='open_buildings', 52 | name='open-buildings', 53 | packages=find_packages(), 54 | package_data={ 55 | 'open_buildings': ['google/*', 'overture/*' ], 56 | }, 57 | setup_requires=setup_requirements, 58 | test_suite='tests', 59 | tests_require=test_requirements, 60 | url='https://github.com/opengeos/open-buildings', 61 | version='0.10.0', 62 | zip_safe=False, 63 | extras_require={ 64 | 'dev': test_requirements, 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for open_buildings.""" 2 | -------------------------------------------------------------------------------- /tests/test_open_buildings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Dict, Any 3 | from pathlib import Path 4 | import os 5 | import json 6 | from shapely.geometry import shape, box, mapping 7 | import re 8 | import subprocess 9 | 10 | from open_buildings.download_buildings import download, geojson_to_wkt, geojson_to_quadkey, quadkey_to_geojson 11 | from open_buildings.cli import geocode 12 | from open_buildings.settings import Source, Format, settings 13 | 14 | ########################################################################### 15 | # # 16 | # RUN TESTS with `python3 -m pytest . -n ` # 17 | # # 18 | ########################################################################### 19 | 20 | 21 | NUM_RERUNS = 2 # number of re-runs for integration tests 22 | 23 | @pytest.fixture 24 | def aoi() -> Dict[str, Any]: 25 | """ Sample AOI over Seychelles. """ 26 | return { 27 | "type": "Feature", 28 | "properties": {}, 29 | "geometry": { 30 | "coordinates": [ 31 | [ 32 | [ 33 | 55.45280573412927, 34 | -4.6227964300457245 35 | ], 36 | [ 37 | 55.45280573412927, 38 | -4.623440862045413 39 | ], 40 | [ 41 | 55.453376761871795, 42 | -4.623440862045413 43 | ], 44 | [ 45 | 55.453376761871795, 46 | -4.6227964300457245 47 | ], 48 | [ 49 | 55.45280573412927, 50 | -4.6227964300457245 51 | ] 52 | ] 53 | ], 54 | "type": "Polygon" 55 | } 56 | } 57 | 58 | def test_geojson_to_wkt(aoi: Dict[str, Any]): 59 | """ Tests the geojson_to_wkt() function. 
""" 60 | assert geojson_to_wkt(aoi) == 'POLYGON ((55.45280573412927 -4.6227964300457245, 55.45280573412927 -4.623440862045413, 55.453376761871795 -4.623440862045413, 55.453376761871795 -4.6227964300457245, 55.45280573412927 -4.6227964300457245))' 61 | 62 | def test_geojson_to_quadkey(aoi: Dict[str, Any]): 63 | """ Tests geojson_to_quadkey() using a pre-established true value. """ 64 | assert geojson_to_quadkey(aoi) == '301001330310' 65 | 66 | def test_quadkey_to_geojson(): 67 | """ Tests quadkey_to_geojson() using a pre-established true value. """ 68 | assert quadkey_to_geojson('031313131112') == {'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[-0.17578125, 51.50874245880333], [-0.087890625, 51.50874245880333], [-0.087890625, 51.56341232867588], [-0.17578125, 51.56341232867588], [-0.17578125, 51.50874245880333]]]}} 69 | 70 | def test_geocode(): 71 | """ Tests geocode() using a pre-established true value. Verifies the bbox of the returned geometry. """ 72 | geocoding_result = geocode('plymouth') 73 | assert geocoding_result["type"] == "Feature" 74 | assert shape(geocoding_result["geometry"]).bounds == (-4.2055324, 50.3327426, -4.0196056, 50.4441737) 75 | 76 | @pytest.mark.integration 77 | @pytest.mark.flaky(reruns=NUM_RERUNS) 78 | @pytest.mark.parametrize("source", [s for s in Source]) 79 | def test_download(source: Source, aoi: Dict[str, Any], tmp_path: Path): 80 | """ Tests that the download function successfully downloads a GeoJSON file from all sources (parametrised test) into a temporary directory (teardown after test). """ 81 | output_file = tmp_path.joinpath(f"output_{source.name}.json") 82 | download(aoi, source=source, dst=output_file, country_iso="SC") 83 | assert os.path.exists(output_file) 84 | assert os.path.getsize(output_file) != 0 85 | 86 | @pytest.mark.integration 87 | @pytest.mark.flaky(reruns=NUM_RERUNS) 88 | def test_download_no_output(aoi: Dict[str, Any], tmp_path: Path): 89 | """ Test that no empty output file gets created if a query doesn't return anything (in this case because a wrong country_iso argument is given.) """ 90 | output_file = tmp_path.joinpath("no_output.json") 91 | download(aoi, dst=output_file, country_iso="AI") # wrong country, aoi is in SC, not Anguilla 92 | assert not os.path.exists(output_file) 93 | 94 | @pytest.mark.integration 95 | @pytest.mark.flaky(reruns=NUM_RERUNS) 96 | def test_download_directory(aoi: Dict[str, Any], tmp_path: Path): 97 | """ Test that, if a directory is passed, the output gets downloaded to a default file name in that directory. """ 98 | download(aoi, dst=tmp_path, country_iso="SC") 99 | assert os.path.exists(tmp_path.joinpath("buildings.json")) 100 | assert os.path.getsize(tmp_path.joinpath("buildings.json")) != 0 101 | 102 | @pytest.mark.integration 103 | @pytest.mark.flaky(reruns=NUM_RERUNS) 104 | def test_download_overwrite(aoi: Dict[str, Any], tmp_path: Path): 105 | """ Tests that, if the "overwrite" option is set to True, an existing file does indeed get overwritten. 
""" 106 | output_path = tmp_path.joinpath("file_exists.json") 107 | with open(output_path, "w") as f: 108 | f.write("Foo bar") 109 | 110 | download(aoi, dst=output_path, country_iso="SC", overwrite=True) 111 | assert os.path.exists(output_path) 112 | with open(output_path, "r") as f: 113 | assert f.read() != "Foo bar" # verify that the file was updated 114 | 115 | @pytest.mark.integration 116 | @pytest.mark.flaky(reruns=NUM_RERUNS) 117 | @pytest.mark.parametrize("format", [f for f in Format if f != Format.SHAPEFILE]) # fails for shapefile! 118 | def test_download_format(format: Format, aoi: Dict[str, Any], tmp_path: Path): 119 | """ Requests data in all file formats defined in the settings. Attempts to validate the output for each of those too. """ 120 | output_file = tmp_path.joinpath(f"output.{settings.extensions[format]}") 121 | download(aoi, dst=output_file, country_iso="SC") 122 | assert os.path.exists(output_file) 123 | assert os.path.getsize(output_file) != 0 124 | 125 | # validate output 126 | if format == Format.GEOJSON: 127 | with open(output_file, "r") as f: 128 | json.load(f) 129 | elif format == Format.FLATGEOBUF: 130 | pass 131 | elif format == Format.SHAPEFILE: 132 | pass 133 | elif format == Format.PARQUET: 134 | pass 135 | elif format == Format.GEOPACKAGE: 136 | pass 137 | else: 138 | raise NotImplementedError(f"Test not implemented for {format} - please add.") 139 | 140 | def test_download_unknown_format(aoi: Dict[str, Any]): 141 | """ Tests that an unknown format (.abc) raises an Exception. """ 142 | with pytest.raises(ValueError): 143 | download(aoi, dst="buildings.abc") 144 | 145 | @pytest.mark.integration 146 | @pytest.mark.flaky(reruns=NUM_RERUNS) 147 | def test_cli_get_buildings_from_file_to_directory(aoi: Dict[str, Any], tmp_path: Path): 148 | """ 149 | Tests the CLI for get_buildings - provides the path to a GeoJSON file as input and a directory as output path. 150 | Verifies that the output gets written to a default file name in the given directory. 151 | """ 152 | # write aoi dict to geojson file in temporary directory 153 | input_path = tmp_path.joinpath("input.json") 154 | with open(input_path, "w") as f: 155 | json.dump(aoi, f) 156 | subprocess.run(["ob", "get_buildings", str(input_path), "--dst", str(tmp_path), "--country_iso", "SC"], check=True) 157 | output_path = tmp_path.joinpath("buildings.json") # default file name 158 | assert os.path.exists(output_path) 159 | assert os.path.getsize(output_path) != 0 160 | 161 | 162 | @pytest.mark.integration 163 | @pytest.mark.flaky(reruns=NUM_RERUNS) 164 | def test_cli_get_buildings_from_stdin_to_directory(aoi: Dict[str, Any], tmp_path: Path): 165 | """ 166 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and a directory as output path. 167 | Verifies that a log message with timestamp gets written to stdout. 168 | """ 169 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 170 | process = subprocess.run([ "ob", "get_buildings", "-", "--dst", str(tmp_path), "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 171 | dt_regex = re.compile(r"^\[[0-9]{4}(-[0-9]{2}){2} ([0-9]{2}:){2}[0-9]{2}\] ") # match timestamp format e.g. 
"[2023-10-18 19:08:24]" 172 | assert dt_regex.search(process.stdout) # ensure that stdout contains at least one timestamped message 173 | output_path = tmp_path.joinpath("buildings.json") # default file name 174 | assert os.path.exists(output_path) 175 | assert os.path.getsize(output_path) != 0 176 | 177 | @pytest.mark.integration 178 | @pytest.mark.flaky(reruns=NUM_RERUNS) 179 | def test_cli_get_buildings_from_stdin_to_file_silent(aoi: Dict[str, Any], tmp_path: Path): 180 | """ 181 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and an exact filepath to write the output to. 182 | Verifies that nothing gets written to stdout. 183 | """ 184 | output_path = tmp_path.joinpath("test123.json") 185 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 186 | process = subprocess.run(["ob", "get_buildings", "-", "--dst", str(output_path), "--silent", "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 187 | assert process.stdout == "" # assert that nothing gets printed to stdout 188 | assert process.stderr == "" # assert that nothing gets printed to stdout 189 | assert os.path.exists(output_path) 190 | assert os.path.getsize(output_path) != 0 191 | 192 | 193 | @pytest.mark.integration 194 | @pytest.mark.flaky(reruns=NUM_RERUNS) 195 | def test_cli_get_buildings_from_stdin_to_file_overwrite_false(aoi: Dict[str, Any], tmp_path: Path): 196 | """ 197 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and an exact filepath to write the output to. 198 | Verifies that, if the output file already exists, nothing happens and the user is notified of this. 199 | """ 200 | output_path = tmp_path.joinpath("file_exists.json") 201 | with open(output_path, "w") as f: 202 | f.write("Foo bar") 203 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 204 | process = subprocess.run(["ob", "get_buildings", "-", "--dst", str(output_path), "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 205 | assert os.path.exists(output_path) 206 | with open(output_path, "r") as f: 207 | assert f.read() == "Foo bar" # verify that the file still has the same content as before 208 | assert "exists" in process.stdout # verify that the user has been warned about the existing file 209 | 210 | @pytest.mark.integration 211 | @pytest.mark.flaky(reruns=NUM_RERUNS) 212 | def test_cli_get_buildings_geocode(tmp_path: Path): 213 | """ 214 | Tests the geocoding functionality, implemented as the argument "location". 215 | """ 216 | output_path = tmp_path.joinpath("geocode_test.json") 217 | subprocess.run(["ob", "get_buildings", "--dst", str(output_path), "--location", "oxford uk", "--country_iso", "GB"], check=True) 218 | assert os.path.exists(output_path) 219 | assert os.path.getsize(output_path) != 0 220 | 221 | @pytest.mark.integration 222 | @pytest.mark.flaky(reruns=NUM_RERUNS) 223 | def test_cli_get_buildings_geocode_multipolygon(tmp_path: Path): 224 | """ 225 | Tests the geocoding functionality, implemented as the argument "location". Makes sure that a MultiPolygon geometry (the outline of Dubrovnik) 226 | is simplified to a polygon (convex hull). 
227 | """ 228 | output_path = tmp_path.joinpath("geocode_test.json") 229 | subprocess.run(["ob", "get_buildings", "--dst", str(output_path), "--location", "dubrovnik", "--country_iso", "HR"], check=True) 230 | assert os.path.exists(output_path) 231 | assert os.path.getsize(output_path) != 0 --------------------------------------------------------------------------------