├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md └── workflows │ ├── docs-build.yml │ ├── docs.yml │ ├── macos.yml │ ├── pypi.yml │ ├── ubuntu.yml │ └── windows.yml ├── .gitignore ├── Dockerfile.dev ├── LICENSE ├── MANIFEST.in ├── README.md ├── dev-container.sh ├── docs ├── changelog.md ├── common.md ├── contributing.md ├── examples │ ├── download_buildings.ipynb │ └── intro.ipynb ├── faq.md ├── index.md ├── installation.md ├── overrides │ └── main.html └── usage.md ├── mkdocs.yml ├── open_buildings ├── __init__.py ├── cli.py ├── common.py ├── download_buildings.py ├── google │ ├── __init.py__ │ ├── add_columns.py │ ├── partition.py │ ├── process.py │ └── stac-geoparquet.py ├── overture │ ├── __init.py__ │ ├── add_columns.py │ ├── partition.py │ └── places_add_columns.py └── settings.py ├── pytest.ini ├── requirements.txt ├── requirements_dev.txt ├── requirements_docs.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── test_open_buildings.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a bug report to help us improve 4 | labels: bug 5 | --- 6 | 7 | 8 | 9 | ### Environment Information 10 | 11 | - open_buildings version: 12 | - Python version: 13 | - Operating System: 14 | 15 | ### Description 16 | 17 | Describe what you were trying to get done. 18 | Tell us what happened, what went wrong, and what you expected to happen. 19 | 20 | ### What I Did 21 | 22 | ``` 23 | Paste the command(s) you ran and the output. 24 | If there was a crash, please include the traceback here. 25 | ``` 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Ask questions 3 | url: https://github.com/opengeos/open-buildings/discussions/categories/q-a 4 | about: Please ask and answer questions here. 5 | - name: Ideas 6 | url: https://github.com/opengeos/open-buildings/discussions/categories/ideas 7 | about: Please share your ideas here. 8 | - name: Ask questions from the GIS community 9 | url: https://gis.stackexchange.com 10 | about: To get answers from questions in the GIS community, please ask and answer questions here. 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Submit a feature request to help us improve 4 | labels: Feature Request 5 | --- 6 | 7 | 8 | 9 | ### Description 10 | 11 | Describe the feature (e.g., new functions/tutorials) you would like to propose. 12 | Tell us what can be achieved with this new feature and what's the expected outcome. 
13 | 14 | ### Source code 15 | 16 | ``` 17 | Paste your source code here if have sample code to share. 18 | ``` 19 | -------------------------------------------------------------------------------- /.github/workflows/docs-build.yml: -------------------------------------------------------------------------------- 1 | name: docs-build 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | with: 13 | fetch-depth: 0 14 | - uses: actions/setup-python@v4 15 | with: 16 | python-version: "3.10" 17 | - name: Install GDAL 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install --find-links=https://girder.github.io/large_image_wheels --no-cache GDAL pyproj 21 | - name: Test GDAL installation 22 | run: | 23 | python -c "from osgeo import gdal" 24 | gdalinfo --version 25 | - name: Install dependencies 26 | run: | 27 | pip install .[dev] 28 | - name: Discover typos with codespell 29 | run: codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,*.pdf,./.git" --ignore-words-list="aci,acount,acounts,fallow,hart,hist,nd,ned,ois,wqs" 30 | - name: PKG-TEST 31 | run: | 32 | python3 -m pytest . -n 4 33 | - name: Build docs 34 | run: | 35 | pip install -r requirements_docs.txt 36 | mkdocs build 37 | # - name: Deploy to Netlify 38 | # uses: nwtgck/actions-netlify@v2.0 39 | # with: 40 | # publish-dir: "./site" 41 | # production-branch: master 42 | # github-token: ${{ secrets.GITHUB_TOKEN }} 43 | # deploy-message: "Deploy from GitHub Actions" 44 | # enable-pull-request-comment: true 45 | # enable-commit-comment: false 46 | # overwrites-pull-request-comment: true 47 | # env: 48 | # NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 49 | # NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 50 | # timeout-minutes: 10 51 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.9 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install --user --no-cache-dir Cython 18 | pip install .[dev] 19 | - name: Discover typos with codespell 20 | run: | 21 | pip install codespell 22 | codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,./.git" --ignore-words-list="aci,acount,acounts,fallow,hart,hist,nd,ned,ois,wqs,watermask" 23 | - name: PKG-TEST 24 | run: | 25 | python3 -m pytest . 
-n 4 26 | - run: pip install -r requirements_docs.txt 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /.github/workflows/macos.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: macOS build 10 | jobs: 11 | test-macOS: 12 | runs-on: ${{ matrix.os }} 13 | name: ${{ matrix.os }} (${{ matrix.python-version}}) 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: ["macOS-latest"] 18 | python-version: ["3.10"] 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v3 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version}} 27 | - name: Install GDAL 28 | run: | 29 | brew install gdal 30 | - name: Test GDAL installation 31 | run: | 32 | gdalinfo --version 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install --no-cache-dir Cython 37 | pip install -r requirements.txt 38 | pip install . 39 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: pypi 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: "3.x" 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 | - name: Build and publish 25 | env: 26 | TWINE_USERNAME: ${{ secrets.PYPI_USERS }} 27 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 28 | run: | 29 | python setup.py sdist bdist_wheel 30 | twine upload dist/* 31 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: Linux build 10 | jobs: 11 | py-check: 12 | runs-on: ${{ matrix.config.os }} 13 | name: ${{ matrix.config.os }} (${{ matrix.config.py }}) 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | config: 18 | - { os: ubuntu-latest, py: "3.8" } 19 | - { os: ubuntu-latest, py: "3.9" } 20 | - { os: ubuntu-latest, py: "3.10" } 21 | - { os: ubuntu-latest, py: "3.11" } 22 | steps: 23 | - name: Checkout Code 24 | uses: actions/checkout@v3 25 | - name: Setup Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.config.py }} 29 | - name: Install GDAL 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install --no-cache-dir Cython 33 | pip install --find-links=https://girder.github.io/large_image_wheels --no-cache GDAL 34 | - name: Test GDAL installation 35 | run: | 36 | python -c "from osgeo import gdal" 37 | gdalinfo --version 38 | - name: Install dependencies 39 | run: | 40 | pip install .[dev] 41 | - name: PKG-TEST 42 | run: | 43 | python3 -m pytest . 
-n 4 -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | name: Windows build 10 | jobs: 11 | test-windows: 12 | runs-on: windows-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Install miniconda 16 | uses: conda-incubator/setup-miniconda@v2 17 | with: 18 | auto-activate-base: true 19 | python-version: "3.10" 20 | - name: Install GDAL 21 | run: conda install -c conda-forge gdal --yes 22 | - name: Test GDAL installation 23 | run: | 24 | python -c "from osgeo import gdal" 25 | gdalinfo --version 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install --no-cache-dir Cython 30 | pip install -r requirements.txt 31 | pip install . 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | private/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | cache 8 | issues.txt 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # IDE settings 109 | .vscode/ -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye 2 | 3 | ARG USER 4 | ARG UID 5 | 6 | # install git 7 | # create current user in container and link it to host UID 8 | RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* \ 9 | && useradd -u $UID $USER -p '' -l -m && chown $UID /home/$USER 10 | 11 | # $PWD needs to be mounted to /workspace on run time 12 | WORKDIR /workspace 13 | COPY . . 
14 | 15 | # install package in editable mode, install [dev] dependencies (see setup.py -> extras_require arg) 16 | RUN pip install -e '.[dev]' 17 | 18 | USER $USER -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache Software License 2.0 2 | 3 | Copyright (c) 2023, Chris Holmes 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | 5 | recursive-exclude * __pycache__ 6 | recursive-exclude * *.py[co] 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # open-buildings 2 | 3 | [![image](https://img.shields.io/pypi/v/open_buildings.svg)](https://pypi.python.org/pypi/open_buildings) 4 | 5 | **Tools for working with open building datasets** 6 | 7 | ***Warning**: This code has fallen out of maintenance, and isn't keeping up with the latest release. [Overture](https://overturemaps.org) has incorporated 8 | the Google and Microsoft datasets, and has fully adopted GeoParquet & partitioned it, and provides a [nice cli](https://github.com/OvertureMaps/overturemaps-py) to download data. So I'd recommend using that, but will keep this code up for anyone interested.* 9 | 10 | - Free software: Apache Software License 2.0 11 | - Documentation: 12 | - Creator: [Chris Holmes](https://github.com/cholmes) 13 | 14 | ## Introduction 15 | 16 | This repo is intended to be a set of useful scripts for getting and converting Open Building Datasets using [Cloud Native Geospatial](https://cloudnativegeo.org) formats. 17 | Initially the focus is on Google's [Open Buildings](https://sites.research.google/open-buildings/) dataset and Overture's building dataset. 18 | 19 | The main tool that most people will be interested in is the `get_buildings` command, that 20 | lets you supply a GeoJSON file to a command-line interface and it'll download all buildings 21 | in the area supplied, output in common GIS formats (GeoPackage, FlatGeobuf, Shapefile, GeoJSON and GeoParquet). 22 | 23 | The tool works by leveraging partitioned [GeoParquet](https://geoparquet.org) files, using [DuckDB](https://duckdb.org) 24 | to just query exactly what is needed. This is done without any server - DuckDB on your computer queries, filters and downloads 25 | just the rows that you want. Right now you can query two datasets, that live on [Source Cooperative](https://beta.source.coop), see [here for Google](https://beta.source.coop/cholmes/google-open-buildings) and [here for Overture](https://beta.source.coop/cholmes/overture/).
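To make that concrete, here is a minimal sketch of the kind of query DuckDB can run directly against remote partitioned GeoParquet. It is illustrative only - the S3 path, partition layout and column names are assumptions for the example, not the actual layout of the Source Cooperative datasets or the exact SQL that `get_buildings` runs:

```python
# Illustrative sketch only: the bucket path, partition layout and column names
# below are assumptions, not the real layout of the hosted datasets.
import duckdb

con = duckdb.connect()
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")
con.execute("INSTALL spatial;")
con.execute("LOAD spatial;")

# Area of interest as WKT (a small box in Rwanda). Pointing the read at the
# country_iso=RW partition is what lets the engine skip every other country.
aoi = "POLYGON((30.0 -2.0, 30.2 -2.0, 30.2 -1.8, 30.0 -1.8, 30.0 -2.0))"

con.execute(f"""
    COPY (
        SELECT *
        FROM read_parquet('s3://example-bucket/buildings/country_iso=RW/*.parquet')
        WHERE ST_Intersects(geometry, ST_GeomFromText('{aoi}'))
    ) TO 'my-buildings.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON');
""")
```

Because the path already narrows the scan to one country partition and DuckDB reads Parquet over HTTP range requests, only a small slice of the remote data actually gets downloaded.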
The rest of the CLI's and scripts were used to create those datasets, with some 26 | additions for benchmarking performance. 27 | 28 | This is basically my first Python project, and certainly my first open source one. It is only possible due to ChatGPT, as I'm not a python 29 | programmer, and not a great programmer in general (coded professionally for about 2 years, then shifted to doing lots of other stuff). So 30 | it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others. And contributions are welcome! 31 | I'm working on making the issue tracker accessible, so anyone who wants to try out some open source coding can jump in. 32 | 33 | ## Installation 34 | 35 | Install with pip: 36 | 37 | ```bash 38 | pip install open-buildings 39 | ``` 40 | 41 | This should add a CLI that you can then use. If it's working then: 42 | 43 | ```bash 44 | ob 45 | ``` 46 | 47 | Will print out a help message. You then will be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 48 | 49 | 50 | ```bash 51 | ob tools get_buildings 1.json --dst my-buildings.geojson --country_iso RW 52 | ``` 53 | 54 | You can also stream the json in directly in one line: 55 | 56 | ``` 57 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - --dst my-buildings.geojson --country_iso RW 58 | ``` 59 | 60 | 61 | ## Functionality 62 | 63 | ### get_buildings 64 | 65 | The main tool for most people is `get_buildings`. It queries complete global 66 | building datasets for the GeoJSON provided, outputting results in common geospatial formats. The 67 | full options and explanation can be found in the `--help` command: 68 | 69 | ``` 70 | % ob get_buildings --help 71 | Usage: ob get_buildings [OPTIONS] [GEOJSON_INPUT] [DST] 72 | 73 | Tool to extract buildings in common geospatial formats from large archives 74 | of GeoParquet data online. GeoJSON input can be provided as a file or piped 75 | in from stdin. If no GeoJSON input is provided, the tool will read from 76 | stdin. 77 | 78 | Right now the tool supports two sources of data: Google and Overture. The 79 | data comes from Cloud-Native Geospatial distributions on 80 | https://source.coop, that are partitioned by admin boundaries and use a 81 | quadkey for the spatial index. In time this tool will generalize to support 82 | any admin boundary partitioned GeoParquet data, but for now it is limited to 83 | the Google and Overture datasets. 84 | 85 | The default output is GeoJSON, in a file called buildings.json. Changing the 86 | suffix will change the output format - .shp for shapefile .gpkg for 87 | GeoPackage, .fgb for FlatGeobuf and .parquet for GeoParquet, and .json or 88 | .geojson for GeoJSON. If your query is all within one country it is strongly 89 | recommended to use country_iso to hint to the query engine which country to 90 | query, as this will speed up the query significantly (5-10x). Expect query 91 | times of 5-10 seconds for queries with country_iso and 30-60 seconds 92 | without country_iso. 93 | 94 | You can look up the country_iso for a country here: 95 | https://github.com/lukes/ISO-3166-Countries-with-Regional- 96 | Codes/blob/master/all/all.csv If you get the country wrong you will get zero 97 | results. Currently you can only query one country, so if your query crosses 98 | country boundaries you should not use country_iso. In future versions of 99 | this tool we hope to eliminate the need to hint with the country_iso.
100 | 101 | Options: 102 | --dst TEXT The path to write the output to. Can be a 103 | directory or file. 104 | --location TEXT Use city or region name instead of providing an 105 | AOI as file. 106 | --source [google|overture] Dataset to query, defaults to Overture 107 | --country_iso TEXT A 2 character country ISO code to filter the 108 | data by. 109 | -s, --silent Suppress all print outputs. 110 | --overwrite Overwrite the destination file if it already 111 | exists. 112 | -v, --verbose Print detailed logs with timestamps. 113 | --help Show this message and exit. 114 | ``` 115 | 116 | Note that the `get_buildings` operation is not very robust, there are likely a number of ways to break it. #13 117 | is used to track it, but if you have any problems please report them in the [issue tracker](https://github.com/opengeos/open-buildings/issues) 118 | to help guide how we improve it. 119 | 120 | We do hope to eliminate the need to supply an iso_country for fast querying, see #29 for that tracking issue. We also 121 | hope to add more building datasets, starting with the [Google-Microsoft Open Buildings by VIDA](https://beta.source.coop/vida/google-microsoft-open-buildings/geoparquet/by_country_s2), 122 | see #26 for more info. 123 | 124 | ### Google Building processings 125 | 126 | In the google portion of the CLI there are two functions: 127 | 128 | - `convert` takes as input either a single CSV file or a directory of CSV files, downloaded locally from the Google Buildings dataset. It can write out as GeoParquet, FlatGeobuf, GeoPackage and Shapefile, and can process the data using DuckDB, GeoPandas or OGR. 129 | - `benchmark` runs the convert command against one or more different formats, and one or more different processes, and reports out how long each took. 130 | 131 | A sample output for `benchmark`, run on 219_buildings.csv, a 101 mb CSV file is: 132 | 133 | ``` 134 | Table for file: 219_buildings.csv 135 | ╒═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕ 136 | │ process │ fgb │ gpkg │ parquet │ shp │ 137 | ╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡ 138 | │ duckdb │ 00:02.330 │ 00:00.000 │ 00:01.866 │ 00:03.119 │ 139 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 140 | │ ogr │ 00:02.034 │ 00:07.456 │ 00:01.423 │ 00:02.491 │ 141 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 142 | │ pandas │ 00:18.184 │ 00:24.096 │ 00:02.710 │ 00:20.032 │ 143 | ╘═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛ 144 | ``` 145 | 146 | The full options can be found with `--help` after each command, and I'll put them here for reference: 147 | 148 | ``` 149 | Usage: open_buildings convert [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 150 | 151 | Converts a CSV or a directory of CSV's to an alternate format. Input CSV's 152 | are assumed to be from Google's Open Buildings 153 | 154 | Options: 155 | --format [fgb|parquet|gpkg|shp] 156 | The output format. The default is FlatGeobuf (fgb) 157 | --overwrite Whether to overwrite any existing output files. 158 | --process [duckdb|pandas|ogr] The processing method to use. The default is 159 | pandas. 160 | --skip-split-multis Whether to keep multipolygons as they are 161 | without splitting into their component polygons. 162 | --verbose Whether to print detailed processing 163 | information. 164 | --help Show this message and exit. 
165 | ``` 166 | 167 | ``` 168 | Usage: open_buildings benchmark [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 169 | 170 | Runs the convert function on each of the supplied processes and formats, 171 | printing the timing of each as a table 172 | 173 | Options: 174 | --processes TEXT The processing methods to use. One or more of duckdb, 175 | pandas or ogr, in a comma-separated list. Default is 176 | duckdb,pandas,ogr. 177 | --formats TEXT The output formats to benchmark. One or more of fgb, 178 | parquet, shp or gpkg, in a comma-separated list. 179 | Default is fgb,parquet,shp,gpkg. 180 | --skip-split-multis Whether to keep multipolygons as they are without 181 | splitting into their component polygons. 182 | --no-gpq Disable GPQ conversion. Timing will be faster, but not 183 | valid GeoParquet (until DuckDB adds support) 184 | --verbose Whether to print detailed processing information. 185 | --output-format TEXT The format of the output. Options: ascii, csv, json, 186 | chart. 187 | --help Show this message and exit. 188 | ``` 189 | 190 | **Warning** - note that `--no-gpq` doesn't actually work right now, see https://github.com/opengeos/open-buildings/issues/4 to track. It is just always set to true, so DuckDB times with Parquet will be inflated (you can change it in the Python code via a global variable). Note also that the `ogr` process does not work with `--skip-split-multis`, but will just report very minimal times since it skips doing anything, see https://github.com/opengeos/open-buildings/issues/5 to track. 191 | 192 | #### Format Notes 193 | 194 | I'm mostly focused on GeoParquet and FlatGeobuf, as good cloud-native geo formats. I included GeoPackage and Shapefile mostly for benchmarking purposes. GeoPackage I think is a good option for Esri and other more legacy software that is slow to adopt new formats. Shapefile is total crap for this use case - it fails on files bigger than 4 gigabytes, and lots of the source S2 Google Building CSV's are bigger, so it's not useful for translating. The truncation of field names is also annoying, since the CSV file didn't try to make short names (nor should it, the limit is silly). 195 | 196 | GeoPackage is particularly slow with DuckDB - it's likely got a bit of a bug in it. But it works well with Pandas and OGR. 197 | 198 | ## Process Notes 199 | 200 | When I was processing V2 of the Google Buildings dataset I did most of the initial work with GeoPandas, which was awesome, and has the best GeoParquet implementation. But the size of the data made its all-in-memory processing untenable. I ended up using PostGIS a decent bit, but near the end of that process I discovered DuckDB, and was blown away by its speed and ability to manage memory well. So for this tool I was mostly focused on those two. 201 | 202 | Note also that currently DuckDB fgb, gpkg and shp output don't include projection information, so if you want to use the output then you'd need to run ogr2ogr on the output. It sounds like that may get fixed pretty soon, so I'm not going to add a step that includes the ogr conversion. 203 | 204 | OGR was added later, and does not yet do the key step of splitting multi-polygons, since it's just using ogr2ogr as a sub-process and I've yet to find a way to do that from the CLI (though knowing GDAL/OGR there probably is one - please let me know). To run the benchmark with it you need to do --skip-split-multis or else the times on it will be 0 (except for Shapefile, since it doesn't differentiate between multipolygons and regular polygons).
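On splitting multi-polygons straight from the ogr2ogr CLI: one candidate worth testing (untested here, and not something this package currently wires up) is ogr2ogr's `-explodecollections` flag, which writes one feature per part of a multi-part geometry. Run as a subprocess it might look roughly like this - paths are placeholders, and the open options the convert command uses to parse the CSV geometry column are omitted:

```python
# Untested sketch: relies on ogr2ogr's -explodecollections flag to split
# multi-part geometries into one feature per part. Paths are placeholders and
# the CSV geometry-parsing options used by the real convert command are omitted.
import subprocess

subprocess.run(
    [
        "ogr2ogr",
        "-f", "FlatGeobuf",      # output driver
        "-explodecollections",   # one output feature per polygon part
        "219_buildings.fgb",     # destination file (placeholder)
        "219_buildings.csv",     # source Google Open Buildings CSV (placeholder)
    ],
    check=True,
)
```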
I hope to add that functionality and get it on par, which may mean using Fiona. But it seems like that may affect performance, since Fiona doesn't use the [GDAL/OGR column-oriented API](https://gdal.org/development/rfc/rfc86_column_oriented_api.html). 205 | 206 | ### Code customizations 207 | 208 | There are 3 options that you can set as global variables in the Python code, but are not yet CLI options. These are: 209 | 210 | * `RUN_GPQ_CONVERSION` - whether GeoParquet from DuckDB by default runs [gpq](https://github.com/planetlabs/gpq) on the DuckDB Parquet output, which adds a good chunk of processing time. This makes it so the DuckDB processing output is slower than it would be if DuckDB natively wrote GeoParquet metadata, which I believe is on their roadmap. So that will likely emerge as the fastest benchmark time. In the code you can set `RUN_GPQ_CONVERSION` in the python code to false if you want to get a sense of it. In the above benchmark running the Parquet with DuckDB without GPQ conversion at the end resulted in a time of .76 seconds. 211 | * `PARQUET_COMPRESSION` - which compression to use for Parquet encoding. Note that not all processes support all compression options, and also the OGR converter currently ignores this option. 212 | * `SKIP_DUCK_GPKG` - whether to skip the GeoPackage conversion option on DuckDB, since it takes a long time to run. 213 | 214 | ## Contributing 215 | 216 | All contributions are welcome, I love running open source projects. I'm clearly just learning to code Python, so there's no judgement about crappy code. And I'm super happy to learn from others about better code. Feel free to sound in on [the issues](https://github.com/opengeos/open-buildings/issues), make new ones, grab one, or make a PR. There's lots of low hanging fruit of things to add. And if you're just starting out programming don't hesitate to ask even basic things in the [discussions](https://github.com/opengeos/open-buildings/discussions). 217 | -------------------------------------------------------------------------------- /dev-container.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ################################################################# 4 | # # 5 | # Simple Bash script to simplify development in a container. # 6 | # Builds Dockerfile.dev if it does not already exists. # 7 | # The CWD is mounted in the container. # 8 | # # 9 | # Use --rebuild flag to force rebuild of the image even if # 10 | # (a potentially older version of) the image already exists. # 11 | # This is necessary if e.g. the dependencies are updated. # 12 | # # 13 | ################################################################# 14 | 15 | 16 | rebuild=$([[ $1 == '--rebuild' ]] && echo true || echo false) 17 | 18 | if [[ $rebuild == true ]] 19 | then 20 | docker build --build-arg UID=$UID --build-arg USER=$USER -t ob-dev -f Dockerfile.dev . 21 | else 22 | docker inspect --type=image ob-dev &> /dev/null || { 23 | echo "Image doesn't exist locally, building ..."; 24 | docker build --build-arg UID=$UID --build-arg USER=$USER -t ob-dev -f Dockerfile.dev . 
25 | } 26 | fi 27 | 28 | docker run -it --workdir /workspace -v $PWD:/workspace -v $HOME/.gitconfig:$HOME/.gitconfig -v $HOME/.ssh:$HOME/.ssh ob-dev bash -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.0.1 - Date 4 | 5 | **Improvement**: 6 | 7 | - TBD 8 | 9 | **New Features**: 10 | 11 | - TBD 12 | -------------------------------------------------------------------------------- /docs/common.md: -------------------------------------------------------------------------------- 1 | # common module 2 | 3 | ::: open_buildings.common -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at . 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in troubleshooting. 18 | - Detailed steps to reproduce the bug. 19 | 20 | ### Fix Bugs 21 | 22 | Look through the GitHub issues for bugs. Anything tagged with `bug` and 23 | `help wanted` is open to whoever wants to implement it. 24 | 25 | ### Implement Features 26 | 27 | Look through the GitHub issues for features. Anything tagged with 28 | `enhancement` and `help wanted` is open to whoever wants to implement it. 29 | 30 | ### Write Documentation 31 | 32 | open-buildings could always use more documentation, 33 | whether as part of the official open-buildings docs, 34 | in docstrings, or even on the web in blog posts, articles, and such. 35 | 36 | ### Submit Feedback 37 | 38 | The best way to send feedback is to file an issue at 39 | . 40 | 41 | If you are proposing a feature: 42 | 43 | - Explain in detail how it would work. 44 | - Keep the scope as narrow as possible, to make it easier to implement. 45 | - Remember that this is a volunteer-driven project, and that contributions are welcome :) 46 | 47 | ## Get Started! 48 | 49 | Ready to contribute? Here's how to set up open-buildings for local development. 50 | 51 | 1. Fork the open-buildings repo on GitHub. 52 | 53 | 2. Clone your fork locally: 54 | 55 | ```shell 56 | $ git clone git@github.com:your_name_here/open-buildings.git 57 | ``` 58 | 59 | 3. Install your local copy into a virtualenv. Assuming you have 60 | virtualenvwrapper installed, this is how you set up your fork for 61 | local development: 62 | 63 | ```shell 64 | $ mkvirtualenv open-buildings 65 | $ cd open-buildings/ 66 | $ python setup.py develop 67 | ``` 68 | 69 | 4. Create a branch for local development: 70 | 71 | ```shell 72 | $ git checkout -b name-of-your-bugfix-or-feature 73 | ``` 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 78 | and the tests, including testing other Python versions with tox: 79 | 80 | ```shell 81 | $ flake8 open-buildings tests 82 | $ python setup.py test or pytest 83 | $ tox 84 | ``` 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub: 89 | 90 | ```shell 91 | $ git add . 
92 | $ git commit -m "Your detailed description of your changes." 93 | $ git push origin name-of-your-bugfix-or-feature 94 | ``` 95 | 96 | 7. Submit a pull request through the GitHub website. 97 | 98 | ## Pull Request Guidelines 99 | 100 | Before you submit a pull request, check that it meets these guidelines: 101 | 102 | 1. The pull request should include tests. 103 | 2. If the pull request adds functionality, the docs should be updated. 104 | Put your new functionality into a function with a docstring, and add 105 | the feature to the list in README.rst. 106 | 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and 107 | for PyPy. Check and make sure that the tests pass for all 108 | supported Python versions. 109 | -------------------------------------------------------------------------------- /docs/examples/download_buildings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %pip install open-buildings" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Import libraries" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import leafmap.foliumap as leafmap\n", 27 | "import geopandas as gpd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Read the tile geojson." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "url = 'https://sites.research.google/open-buildings/tiles.geojson'\n", 44 | "gdf = gpd.read_file(url)\n", 45 | "gdf.sort_values(by='size_mb', ascending=True, inplace=True)\n", 46 | "gdf.head()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "print(f\"Number of tiles: {len(gdf)}\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "m = leafmap.Map()\n", 65 | "m.add_gdf(gdf, layer_name=\"Open Buildings\")\n", 66 | "m" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Get the tile URLs." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "urls = gdf['tile_url'].tolist()\n", 83 | "urls[:5]" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Specify the output directory." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "out_dir = os.path.expanduser('~/Downloads/')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Downloading all the tiles might take a while. Let's download only the first 10 tiles. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "leafmap.download_files(urls[:10], out_dir=out_dir)" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "geo", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.10.9" 136 | }, 137 | "orig_nbformat": 4 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /docs/examples/intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import open_buildings as ob" 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "geo", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.10.9" 30 | }, 31 | "orig_nbformat": 4 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 2 35 | } 36 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # open-buildings 2 | 3 | [![image](https://img.shields.io/pypi/v/open_buildings.svg)](https://pypi.python.org/pypi/open_buildings) 4 | 5 | **Tools for working with open building datasets** 6 | 7 | - Free software: Apache Software License 2.0 8 | - Documentation: 9 | - Creator: [Chris Holmes](https://github.com/cholmes) 10 | 11 | ## Introduction 12 | 13 | This repo is intended to be a set of useful scripts for working with Open Building Datasets, Initially Google's [Open Buildings](https://sites.research.google/open-buildings/) 14 | dataset and Overture's building dataset, specifically to help translate them into [Cloud Native Geospatial](https://cloudnativegeo.org) formats and then use those. The outputs will live 15 | on , [here for Google](https://beta.source.coop/cholmes/google-open-buildings) and [here for Overture](https://beta.source.coop/cholmes/overture/) so most people can just make use of those directly. 16 | 17 | The main operation that most people will be interested in is the 'get-buildings' command, that 18 | lets you supply a GeoJSON file to a command-line interface and it'll download all buildings 19 | in the area supplied, output in common GIS formats (GeoPackage, FlatGeobuf, Shapefile, GeoJSON and GeoParquet). 20 | 21 | The rest of the CLI's and scripts are intended to show the process of transforming the data, 22 | and then they've expanded to be a way to benchmark performance. 23 | 24 | This is basically my first Python project, and certainly my first open source one. 
It is only possible due to ChatGPT, as I'm not a python 25 | programmer, and not a great programmer in general (coded professionally for about 2 years, then shifted to doing lots of other stuff). So 26 | it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others. And contributions are welcome! I'm working on making the issue tracker accessible, so anyone who wants to try out some open source coding can jump in. 27 | 28 | ## Installation 29 | 30 | Install with pip: 31 | 32 | ```bash 33 | pip install open-buildings 34 | ``` 35 | 36 | This should add a CLI that you can then use. If it's working then: 37 | 38 | ```bash 39 | ob 40 | ``` 41 | 42 | Should print out a help message. You then should be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 43 | 44 | 45 | ```bash 46 | ob tools get_buildings 1.json my-buildings.geojson --country_iso RW 47 | ``` 48 | 49 | You can also stream the json in directly in one line: 50 | 51 | ``` 52 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - my-buildings.geojson --country_iso RW 53 | ``` 54 | 55 | 56 | ## Functionality 57 | 58 | ### get_buildings 59 | 60 | The main tool for most people is `get_buildings`. It queries complete global 61 | building datasets for the GeoJSON provided, outputting results in common geospatial formats. The 62 | full options and explanation can be found in the `--help` command: 63 | 64 | ``` 65 | % ob get_buildings --help 66 | Usage: ob get_buildings [OPTIONS] [GEOJSON_INPUT] [DST] 67 | 68 | Tool to extract buildings in common geospatial formats from large archives 69 | of GeoParquet data online. GeoJSON input can be provided as a file or piped 70 | in from stdin. If no GeoJSON input is provided, the tool will read from 71 | stdin. 72 | 73 | Right now the tool supports two sources of data: Google and Overture. The 74 | data comes from Cloud-Native Geospatial distributions on 75 | https://source.coop, that are partitioned by admin boundaries and use a 76 | quadkey for the spatial index. In time this tool will generalize to support 77 | any admin boundary partitioned GeoParquet data, but for now it is limited to 78 | the Google and Overture datasets. 79 | 80 | The default output is GeoJSON, in a file called buildings.json. Changing the 81 | suffix will change the output format - .shp for shapefile .gpkg for 82 | GeoPackage, .fgb for FlatGeobuf and .parquet for GeoParquet, and .json or 83 | .geojson for GeoJSON. If your query is all within one country it is strongly 84 | recommended to use country_iso to hint to the query engine which country to 85 | query, as this will speed up the query significantly (5-10x). Expect query 86 | times of 5-10 seconds for queries with country_iso and 30-60 seconds 87 | without country_iso. 88 | 89 | You can look up the country_iso for a country here: 90 | https://github.com/lukes/ISO-3166-Countries-with-Regional- 91 | Codes/blob/master/all/all.csv If you get the country wrong you will get zero 92 | results. Currently you can only query one country, so if your query crosses 93 | country boundaries you should not use country_iso. In future versions of 94 | this tool we hope to eliminate the need to hint with the country_iso. 95 | 96 | Options: 97 | --source [google|overture] Dataset to query, defaults to Overture 98 | --country_iso TEXT A 2 character country ISO code to filter the 99 | data by. 100 | -s, --silent Suppress all print outputs. 
101 | --overwrite Overwrite the destination file if it already 102 | exists. 103 | --verbose Print detailed logs with timestamps. 104 | --help Show this message and exit. 105 | ``` 106 | 107 | ### Google Building processings 108 | 109 | In the google portion of the CLI there are two functions: 110 | 111 | - `convert` takes as input either a single CSV file or a directory of CSV files, downloaded locally from the Google Buildings dataset. It can write out as GeoParquet, FlatGeobuf, GeoPackage and Shapefile, and can process the data using DuckDB, GeoPandas or OGR. 112 | - `benchmark` runs the convert command against one or more different formats, and one or more different processes, and reports out how long each took. 113 | 114 | A sample output for `benchmark`, run on 219_buildings.csv, a 101 mb CSV file is: 115 | 116 | ``` 117 | Table for file: 219_buildings.csv 118 | ╒═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕ 119 | │ process │ fgb │ gpkg │ parquet │ shp │ 120 | ╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡ 121 | │ duckdb │ 00:02.330 │ 00:00.000 │ 00:01.866 │ 00:03.119 │ 122 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 123 | │ ogr │ 00:02.034 │ 00:07.456 │ 00:01.423 │ 00:02.491 │ 124 | ├───────────┼───────────┼───────────┼───────────┼───────────┤ 125 | │ pandas │ 00:18.184 │ 00:24.096 │ 00:02.710 │ 00:20.032 │ 126 | ╘═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛ 127 | ``` 128 | 129 | The full options can be found with `--help` after each command, and I'll put them here for reference: 130 | 131 | ``` 132 | Usage: open_buildings convert [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 133 | 134 | Converts a CSV or a directory of CSV's to an alternate format. Input CSV's 135 | are assumed to be from Google's Open Buildings 136 | 137 | Options: 138 | --format [fgb|parquet|gpkg|shp] 139 | The output format. The default is FlatGeobuf (fgb) 140 | --overwrite Whether to overwrite any existing output files. 141 | --process [duckdb|pandas|ogr] The processing method to use. The default is 142 | pandas. 143 | --skip-split-multis Whether to keep multipolygons as they are 144 | without splitting into their component polygons. 145 | --verbose Whether to print detailed processing 146 | information. 147 | --help Show this message and exit. 148 | ``` 149 | 150 | ``` 151 | Usage: open_buildings benchmark [OPTIONS] INPUT_PATH OUTPUT_DIRECTORY 152 | 153 | Runs the convert function on each of the supplied processes and formats, 154 | printing the timing of each as a table 155 | 156 | Options: 157 | --processes TEXT The processing methods to use. One or more of duckdb, 158 | pandas or ogr, in a comma-separated list. Default is 159 | duckdb,pandas,ogr. 160 | --formats TEXT The output formats to benchmark. One or more of fgb, 161 | parquet, shp or gpkg, in a comma-separated list. 162 | Default is fgb,parquet,shp,gpkg. 163 | --skip-split-multis Whether to keep multipolygons as they are without 164 | splitting into their component polygons. 165 | --no-gpq Disable GPQ conversion. Timing will be faster, but not 166 | valid GeoParquet (until DuckDB adds support) 167 | --verbose Whether to print detailed processing information. 168 | --output-format TEXT The format of the output. Options: ascii, csv, json, 169 | chart. 170 | --help Show this message and exit. 171 | ``` 172 | 173 | **Warning** - note that `--no-gpq` doesn't actually work right now, see https://github.com/opengeos/open-buildings/issues/4 to track. 
It is just always set to true, so DuckDB times with Parquet will be inflated (you can change it in the Python code via a global variable). Note also that the `ogr` process does not work with `--skip-split-multis`, but will just report very minimal times since it skips doing anything, see https://github.com/opengeos/open-buildings/issues/5 to track. 174 | 175 | #### Format Notes 176 | 177 | I'm mostly focused on GeoParquet and FlatGeobuf, as good cloud-native geo formats. I included GeoPackage and Shapefile mostly for benchmarking purposes. GeoPackage I think is a good option for Esri and other more legacy software that is slow to adopt new formats. Shapefile is total crap for this use case - it fails on files bigger than 4 gigabytes, and lots of the source S2 Google Building CSV's are bigger, so it's not useful for translating. The truncation of field names is also annoying, since the CSV file didn't try to make short names (nor should it, the limit is silly). 178 | 179 | GeoPackage is particularly slow with DuckDB - it's likely got a bit of a bug in it. But it works well with Pandas and OGR. 180 | 181 | ## Process Notes 182 | 183 | When I was processing V2 of the Google Buildings dataset I did most of the initial work with GeoPandas, which was awesome, and has the best GeoParquet implementation. But the size of the data made its all-in-memory processing untenable. I ended up using PostGIS a decent bit, but near the end of that process I discovered DuckDB, and was blown away by its speed and ability to manage memory well. So for this tool I was mostly focused on those two. 184 | 185 | Note also that currently DuckDB fgb, gpkg and shp output don't include projection information, so if you want to use the output then you'd need to run ogr2ogr on the output. It sounds like that may get fixed pretty soon, so I'm not going to add a step that includes the ogr conversion. 186 | 187 | OGR was added later, and does not yet do the key step of splitting multi-polygons, since it's just using ogr2ogr as a sub-process and I've yet to find a way to do that from the CLI (though knowing GDAL/OGR there probably is one - please let me know). To run the benchmark with it you need to do --skip-split-multis or else the times on it will be 0 (except for Shapefile, since it doesn't differentiate between multipolygons and regular polygons). I hope to add that functionality and get it on par, which may mean using Fiona. But it seems like that may affect performance, since Fiona doesn't use the [GDAL/OGR column-oriented API](https://gdal.org/development/rfc/rfc86_column_oriented_api.html). 188 | 189 | ### Code customizations 190 | 191 | There are 3 options that you can set as global variables in the Python code, but are not yet CLI options. These are: 192 | 193 | * `RUN_GPQ_CONVERSION` - whether GeoParquet from DuckDB by default runs [gpq](https://github.com/planetlabs/gpq) on the DuckDB Parquet output, which adds a good chunk of processing time. This makes it so the DuckDB processing output is slower than it would be if DuckDB natively wrote GeoParquet metadata, which I believe is on their roadmap. So that will likely emerge as the fastest benchmark time. In the code you can set `RUN_GPQ_CONVERSION` in the python code to false if you want to get a sense of it. In the above benchmark running the Parquet with DuckDB without GPQ conversion at the end resulted in a time of .76 seconds. 194 | * `PARQUET_COMPRESSION` - which compression to use for Parquet encoding. 
Note that not all processes support all compression options, and also the OGR converter currently ignores this option. 195 | * `SKIP_DUCK_GPKG` - whether to skip the GeoPackage conversion option on DuckDB, since it takes a long time to run. 196 | 197 | ## Contributing 198 | 199 | All contributions are welcome, I love running open source projects. I'm clearly just learning to code Python, so there's no judgement about crappy code. And I'm super happy to learn from others about better code. Feel free to sound in on [the issues](https://github.com/opengeos/open-buildings/issues), make new ones, grab one, or make a PR. There's lots of low hanging fruit of things to add. And if you're just starting out programming don't hesitate to ask even basic things in the [discussions](https://github.com/opengeos/open-buildings/discussions). 200 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | To install open-buildings, run this command in your terminal: 4 | 5 | ```bash 6 | pip install open-buildings 7 | ``` 8 | 9 | This is the preferred method to install open-buildings, as it will always install the most recent stable release. 10 | 11 | If you don't have [pip](https://pip.pypa.io) installed, this [Python installation guide](http://docs.python-guide.org/en/latest/starting/installation/) can guide you through the process. 12 | 13 | This should add a CLI that you can then use. If it's working then: 14 | 15 | ```bash 16 | ob 17 | ``` 18 | 19 | Should print out a help message. You then should be able to run the CLI (download [1.json](https://data.source.coop/cholmes/aois/1.json)): 20 | 21 | 22 | ```bash 23 | ob tools get_buildings 1.json my-buildings.geojson --country_iso RW 24 | ``` 25 | 26 | You can also stream the json in directly in one line: 27 | 28 | ``` 29 | curl https://data.source.coop/cholmes/aois/1.json | ob get_buildings - my-buildings.geojson --country_iso RW 30 | ``` 31 | 32 | ## Install From sources 33 | 34 | To install open-buildings from sources, run this command in your terminal: 35 | 36 | ``` 37 | pip install git+https://github.com/opengeos/open-buildings 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | {% if page.nb_url %} 5 | 6 | {% include ".icons/material/download.svg" %} 7 | 8 | {% endif %} 9 | 10 | {{ super() }} 11 | {% endblock content %} 12 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | To use open-buildings in a project: 4 | 5 | ``` 6 | import open_buildings 7 | ``` 8 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: open-buildings 2 | site_description: Tools for working with open building datasets 3 | site_author: cholmes 4 | site_url: https://opengeos.github.io/open-buildings 5 | repo_url: https://github.com/opengeos/open-buildings 6 | 7 | copyright: "Copyright © 2023 - 2023 Chris Holmes" 8 | 9 | theme: 10 | palette: 11 | - scheme: default 12 | # primary: blue 13 | # accent: indigo 14 | toggle: 15 | icon: 
material/toggle-switch-off-outline 16 | name: Switch to dark mode 17 | - scheme: slate 18 | primary: indigo 19 | accent: indigo 20 | toggle: 21 | icon: material/toggle-switch 22 | name: Switch to light mode 23 | name: material 24 | icon: 25 | repo: fontawesome/brands/github 26 | # logo: assets/logo.png 27 | favicon: assets/favicon.png 28 | features: 29 | - navigation.instant 30 | - navigation.tracking 31 | - navigation.top 32 | - search.highlight 33 | - search.share 34 | custom_dir: "docs/overrides" 35 | font: 36 | text: Google Sans 37 | code: Regular 38 | 39 | plugins: 40 | - search 41 | - mkdocstrings 42 | - git-revision-date 43 | - git-revision-date-localized: 44 | enable_creation_date: true 45 | type: timeago 46 | # - pdf-export 47 | - mkdocs-jupyter: 48 | include_source: True 49 | ignore_h1_titles: True 50 | execute: true 51 | allow_errors: false 52 | ignore: ["conf.py"] 53 | execute_ignore: ["*ignore.ipynb"] 54 | 55 | markdown_extensions: 56 | - admonition 57 | - abbr 58 | - attr_list 59 | - def_list 60 | - footnotes 61 | - meta 62 | - md_in_html 63 | - pymdownx.superfences 64 | - pymdownx.highlight: 65 | linenums: true 66 | - toc: 67 | permalink: true 68 | 69 | # extra: 70 | # analytics: 71 | # provider: google 72 | # property: UA-XXXXXXXXX-X 73 | 74 | nav: 75 | - Home: index.md 76 | - Installation: installation.md 77 | - Usage: usage.md 78 | - Contributing: contributing.md 79 | - FAQ: faq.md 80 | - Changelog: changelog.md 81 | - Report Issues: https://github.com/opengeos/open-buildings/issues 82 | - Examples: 83 | - examples/download_buildings.ipynb 84 | - API Reference: 85 | - common module: common.md 86 | -------------------------------------------------------------------------------- /open_buildings/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for open-buildings.""" 2 | 3 | __author__ = """Chris Holmes""" 4 | __email__ = 'cholmes@9eo.org' 5 | __version__ = '0.10.0' 6 | -------------------------------------------------------------------------------- /open_buildings/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import click 4 | import json 5 | import pandas as pd 6 | import osmnx 7 | from shapely.geometry import shape, box, mapping 8 | import matplotlib.pyplot as plt 9 | from open_buildings.google.process import process_benchmark, process_geometries 10 | from open_buildings.download_buildings import download as download_buildings 11 | from open_buildings.overture.add_columns import process_parquet_files 12 | from open_buildings.overture.partition import process_db 13 | from open_buildings.settings import Source 14 | from datetime import datetime, timedelta 15 | from tabulate import tabulate 16 | import boto3 # Required for S3 operations 17 | 18 | @click.group() 19 | def main(): 20 | """CLI for Open Buildings operations.""" 21 | pass 22 | 23 | @click.group() 24 | def google(): 25 | """Commands related to Google operations.""" 26 | pass 27 | 28 | @click.group() 29 | def overture(): 30 | """Commands related to Overture operations.""" 31 | pass 32 | 33 | main.add_command(google) 34 | main.add_command(overture) 35 | 36 | def handle_comma_separated(ctx, param, value): 37 | return value.split(',') 38 | 39 | def geocode(data: str): 40 | location = osmnx.geocode_to_gdf(data) 41 | geom = location.geometry[0] 42 | geojson = json.loads(json.dumps({"type": "Feature", "geometry": mapping(geom)})) # turn geom tuple into list by (de-)serialising 43 
| return geojson
44 | 
45 | @main.command(name="get_buildings")
46 | @click.argument('geojson_input', type=click.File('r'), required=False)
47 | @click.option('--dst', type=str, default="buildings.json", help='The path to write the output to. Can be a directory or file.')
48 | @click.option('--location', type=str, default=None, help='Use a city or region name instead of providing an AOI as a file.')
49 | @click.option('--source', default="overture", type=click.Choice(['google', 'overture']), help='Dataset to query, defaults to Overture.')
50 | @click.option('--country_iso', type=str, default=None, help='A 2-character country ISO code to filter the data by.')
51 | @click.option('-s', '--silent', is_flag=True, default=False, help='Suppress all print outputs.')
52 | @click.option('--overwrite', default=False, is_flag=True, help='Overwrite the destination file if it already exists.')
53 | @click.option('-v', '--verbose', default=False, is_flag=True, help='Print detailed logs with timestamps.')
54 | def get_buildings(geojson_input, dst, location, source, country_iso, silent, overwrite, verbose):
55 |     """Tool to extract buildings in common geospatial formats from large archives of GeoParquet data online. GeoJSON
56 |     input can be provided as a file or piped in from stdin; if neither a file nor --location is given, the tool reads the AOI from stdin.
57 | 
58 |     The tool currently supports two sources of data: Google and Overture. The data comes from Cloud-Native Geospatial distributions
59 |     on https://source.coop that are partitioned by admin boundaries and use a quadkey for the spatial index. In time this tool will generalize
60 |     to support any admin-boundary-partitioned GeoParquet data, but for now it is limited to the Google and Overture datasets.
61 | 
62 |     The default output is GeoJSON, in a file called buildings.json. Changing the suffix changes the output format: .shp for Shapefile,
63 |     .gpkg for GeoPackage, .fgb for FlatGeobuf, .parquet for GeoParquet, and .json or .geojson for GeoJSON. If your query is
64 |     entirely within one country it is strongly recommended to pass country_iso to hint to the query engine which country to query, as this
65 |     speeds the query up significantly (5-10x). Expect query times of 5-10 seconds for small queries with country_iso and 30-60 seconds without it.
66 |     Large queries will take longer, as they have to download more data.
67 | 
68 |     You can look up the country_iso for a country here: https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
69 |     If you get the country wrong you will get zero results. Currently you can only query one country, so if your query crosses country boundaries you should
70 |     not use country_iso. Future versions of this tool aim to eliminate the need for the country_iso hint.
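    Example invocations (a sketch only; the file names, AOI, and ISO code below are illustrative, and they
    assume the package is installed so the CLI module can be run with "python -m"):

        python -m open_buildings.cli get_buildings nairobi.geojson --dst nairobi_buildings.parquet --country_iso KE
        python -m open_buildings.cli get_buildings --location "Nairobi, Kenya" --dst nairobi.fgb --source google --country_iso KE
        cat nairobi.geojson | python -m open_buildings.cli get_buildings --dst nairobi.json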
71 | """ 72 | # map source of google and overture to values for data_path and hive 73 | # case insensitive matching 74 | if source.lower() == "google": 75 | source = Source.GOOGLE 76 | elif source.lower() == "overture": 77 | source = Source.OVERTURE 78 | else: 79 | raise ValueError(f"Invalid source '{source}', accepted values are {', '.join(v.name.lower() for v in Source)}.") 80 | 81 | if geojson_input: 82 | geojson_data = json.load(geojson_input) 83 | elif location: 84 | geojson_data = geocode(location) 85 | else: 86 | geojson_data = json.load(click.get_text_stream('stdin')) 87 | 88 | download_buildings(geojson_data, source=source, generate_sql=False, dst=dst, silent=silent, overwrite=overwrite, verbose=verbose, country_iso=country_iso) 89 | 90 | @google.command('benchmark') 91 | @click.argument('input_path', type=click.Path(exists=True)) 92 | @click.argument('output_directory', type=click.Path(exists=True)) 93 | @click.option( 94 | '--processes', 95 | callback=handle_comma_separated, 96 | default='duckdb,pandas,ogr', 97 | help="The processing methods to use. One or more of duckdb, pandas or ogr, in a comma-separated list. Default is duckdb,pandas,ogr.", 98 | ) 99 | @click.option( 100 | '--formats', 101 | callback=handle_comma_separated, 102 | default='fgb,parquet,shp,gpkg', 103 | help="The output formats to benchmark. One or more of fgb, parquet, shp or gpkg, in a comma-separated list. Default is fgb,parquet,shp,gpkg.", 104 | ) 105 | @click.option( 106 | '--skip-split-multis', 107 | is_flag=True, 108 | help="Whether to keep multipolygons as they are without splitting into their component polygons.", 109 | ) 110 | @click.option('--no-gpq', is_flag=True, help="Disable GPQ conversion. Timing will be faster, but not valid GeoParquet (until DuckDB adds support)") 111 | @click.option( 112 | '--verbose', is_flag=True, help="Whether to print detailed processing information." 113 | ) 114 | @click.option( 115 | '--output-format', 116 | callback=handle_comma_separated, 117 | default='ascii', 118 | help="The format of the output. 
Options: ascii, csv, json, chart.", 119 | ) 120 | def benchmark( 121 | input_path, 122 | output_directory, 123 | processes, 124 | formats, 125 | skip_split_multis, 126 | no_gpq, 127 | verbose, 128 | output_format, 129 | ): 130 | """Runs the convert function on each of the supplied processes and formats, printing the timing of each as a table""" 131 | results = process_benchmark( 132 | input_path, output_directory, processes, formats, not skip_split_multis, verbose 133 | ) 134 | 135 | df = pd.DataFrame(results) 136 | df = df.pivot(index='process', columns='format', values='execution_time') 137 | 138 | base_name = os.path.basename(input_path) 139 | file_name, file_ext = os.path.splitext(base_name) 140 | 141 | for format in output_format: 142 | if format == 'csv': 143 | df.to_csv(f"{output_directory}/{file_name}_benchmark.csv", index=False) 144 | elif format == 'json': 145 | df.to_json(f"{output_directory}/{file_name}_benchmark.json", orient='split', indent=4) 146 | elif format == 'chart': 147 | df.plot(kind='bar', rot=0) 148 | plt.title(f'Benchmark for file: {base_name}') 149 | plt.xlabel('Process') 150 | plt.ylabel('Execution Time (in seconds)') 151 | plt.tight_layout() 152 | plt.savefig(f"{output_directory}/{file_name}_benchmark.png") 153 | plt.clf() 154 | elif format == 'ascii': 155 | df_formatted = df.copy() 156 | for column in df_formatted.columns: 157 | df_formatted[column] = df_formatted[column].apply(lambda x: (datetime.min + timedelta(seconds=x)).strftime('%M:%S.%f')[:-3]) 158 | 159 | print(f"\nTable for file: {base_name}") 160 | print(tabulate(df_formatted, headers="keys", tablefmt="fancy_grid")) 161 | else: 162 | raise ValueError('Invalid output format') 163 | 164 | @google.command('convert') 165 | @click.argument('input_path', type=click.Path(exists=True)) 166 | @click.argument('output_directory', type=click.Path(exists=True)) 167 | @click.option( 168 | '--format', 169 | type=click.Choice(['fgb', 'parquet', 'gpkg', 'shp']), 170 | default='fgb', 171 | help="The output format. The default is FlatGeobuf (fgb)", 172 | ) 173 | @click.option( 174 | '--overwrite', is_flag=True, help="Whether to overwrite any existing output files." 175 | ) 176 | @click.option( 177 | '--process', 178 | type=click.Choice(['duckdb', 'pandas', 'ogr']), 179 | default='pandas', 180 | help="The processing method to use. The default is pandas.", 181 | ) 182 | @click.option( 183 | '--skip-split-multis', 184 | is_flag=True, 185 | help="Whether to keep multipolygons as they are without splitting into their component polygons.", 186 | ) 187 | @click.option( 188 | '--verbose', is_flag=True, help="Whether to print detailed processing information." 189 | ) 190 | def convert( 191 | input_path, output_directory, format, overwrite, process, skip_split_multis, verbose 192 | ): 193 | """Converts a CSV or a directory of CSV's to an alternate format. 
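    For example, a run over a folder of downloaded CSVs might look like this (paths are illustrative, assuming
    the package is installed so the CLI module can be run with "python -m"):
    python -m open_buildings.cli google convert ./google-csvs ./converted --format parquet --process duckdb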
Input CSV's are assumed to be from Google's Open Buildings""" 194 | process_geometries( 195 | input_path, 196 | output_directory, 197 | format, 198 | overwrite, 199 | process, 200 | not skip_split_multis, 201 | verbose, 202 | ) 203 | 204 | @overture.command('add_columns') 205 | @click.argument('input_folder', type=click.Path(exists=True)) 206 | @click.argument('output_folder', type=click.Path()) 207 | @click.argument('country_parquet_path', type=click.Path(exists=True)) 208 | @click.option('--overwrite', is_flag=True, help="Whether to overwrite any existing output files.") 209 | @click.option('--no-quadkey', is_flag=True, help="Whether to add a quadkey column to the output.") 210 | @click.option('--no-country-iso', is_flag=True, help="Whether to add a country_iso column to the output.") 211 | @click.option('--verbose', is_flag=True, help="Whether to print detailed processing information.") 212 | def add_columns( 213 | input_folder, output_folder, country_parquet_path, overwrite, no_quadkey, no_country_iso, verbose 214 | ): 215 | """Adds columns to the input Overture parquet files, using Overture country for admin boundaries, outputting GeoParquet ordered by quadkey the output folder""" 216 | add_quadkey = not no_quadkey 217 | add_country_iso = not no_country_iso 218 | """Adds columns to the input parquet files, outputting to the output folder""" 219 | process_parquet_files( 220 | input_folder, output_folder, country_parquet_path, overwrite, add_quadkey, add_country_iso, verbose 221 | ) 222 | 223 | @overture.command('download') 224 | @click.argument('destination_folder', type=click.Path()) 225 | @click.option( 226 | '--theme', 227 | type=click.Choice(['buildings', 'admins', 'places', 'transportation']), 228 | default='buildings', 229 | help="Theme option for the files to download from S3. 
Default is buildings.", 230 | ) 231 | def overture_download(destination_folder, theme): 232 | """Download building files from S3 (can change theme for other overture data).""" 233 | 234 | os.makedirs(destination_folder, exist_ok=True) 235 | 236 | s3 = boto3.client('s3') 237 | bucket = 'overturemaps-us-west-2' 238 | prefix = f"release/2023-07-26-alpha.0/theme={theme}/" 239 | 240 | objects = s3.list_objects(Bucket=bucket, Prefix=prefix) 241 | 242 | for obj in objects.get('Contents', []): 243 | file_name = os.path.basename(obj['Key']) 244 | local_file_path = os.path.join(destination_folder, file_name) 245 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 246 | print(f"[{timestamp}] Downloading {file_name} to {destination_folder}") 247 | s3.download_file(bucket, obj['Key'], local_file_path) 248 | 249 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 250 | print(f"[{timestamp}] Downloaded {file_name}") 251 | 252 | @overture.command('partition') 253 | @click.argument('duckdb-path', type=click.Path(exists=True)) 254 | @click.option('--output-folder', default=os.getcwd(), type=click.Path(), help='Folder to store the output files') 255 | @click.option('--geo-conversion', default='gpq', type=click.Choice(['gpq', 'none', 'pandas', 'ogr'], case_sensitive=False)) 256 | @click.option('--verbose', is_flag=True, default=False, help='Print verbose output') 257 | @click.option('--max-per-file', default=10000000, type=int, help='Maximum number of rows per file') 258 | @click.option('--row-group-size', default=10000, type=int, help='Row group size for Parquet files') 259 | @click.option('--hive', is_flag=True, default=False, help='Output files in Hive format (folder structure)') 260 | @click.option('--table-name', default='buildings', type=str, help='Name of the table to process') 261 | def partition(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name): 262 | """Partition a DuckDB database of all overture data by country_iso""" 263 | process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name) 264 | 265 | 266 | if __name__ == "__main__": 267 | sys.exit(main()) -------------------------------------------------------------------------------- /open_buildings/common.py: -------------------------------------------------------------------------------- 1 | """The common module contains common functions and classes used by the other modules. 2 | """ 3 | 4 | def hello_world(): 5 | """Prints "Hello World!" to the console. 
6 | """ 7 | print("Hello World!") -------------------------------------------------------------------------------- /open_buildings/download_buildings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import click 3 | from math import tan, cos, log, pi 4 | from shapely.geometry import shape, box, mapping 5 | from typing import Dict, Any, Union 6 | import mercantile 7 | import duckdb 8 | import time 9 | from pathlib import Path 10 | import datetime 11 | import os 12 | from typing import Literal, Optional 13 | import pandas as pd 14 | import geopandas as gpd 15 | import subprocess 16 | import shapely 17 | import geojson 18 | import shutil 19 | import osmnx 20 | from open_buildings.settings import Source, Format, settings 21 | 22 | def geojson_to_quadkey(data: dict) -> str: 23 | geom = shape(data["geometry"]) 24 | min_lon, min_lat, max_lon, max_lat = geom.bounds 25 | 26 | for zoom in range(12, -1, -1): 27 | tiles = list(mercantile.tiles(min_lon, min_lat, max_lon, max_lat, zooms=zoom)) 28 | if len(tiles) == 1: 29 | return mercantile.quadkey(tiles[0]) 30 | 31 | return '' 32 | 33 | def geojson_to_wkt(data: dict) -> str: 34 | geometry = shape(data['geometry']) 35 | return geometry.wkt 36 | 37 | def quadkey_to_geojson(quadkey: str) -> dict: 38 | # Convert the quadkey to tile coordinates 39 | tile = mercantile.quadkey_to_tile(quadkey) 40 | 41 | # Get the bounding box of the tile 42 | bbox = mercantile.bounds(tile) 43 | 44 | # Construct a GeoJSON Polygon representation of the bounding box 45 | geojson = { 46 | "type": "Feature", 47 | "geometry": { 48 | "type": "Polygon", 49 | "coordinates": [[ 50 | [bbox.west, bbox.south], 51 | [bbox.east, bbox.south], 52 | [bbox.east, bbox.north], 53 | [bbox.west, bbox.north], 54 | [bbox.west, bbox.south] 55 | ]] 56 | } 57 | } 58 | 59 | return geojson 60 | 61 | @click.group() 62 | def cli(): 63 | pass 64 | 65 | @cli.command() 66 | @click.argument('geojson_input', type=click.File('r'), required=False) 67 | def quadkey(geojson_input): 68 | """Convert GeoJSON to quadkey.""" 69 | if geojson_input: 70 | geojson_data = json.load(geojson_input) 71 | else: 72 | geojson_data = json.load(click.get_text_stream('stdin')) 73 | result = geojson_to_quadkey(geojson_data) 74 | click.echo(result) 75 | 76 | @cli.command() 77 | @click.argument('geojson_input', type=click.File('r'), required=False) 78 | def WKT(geojson_input): 79 | """Convert GeoJSON to Well Known Text.""" 80 | if geojson_input: 81 | geojson_data = json.load(geojson_input) 82 | else: 83 | geojson_data = json.load(click.get_text_stream('stdin')) 84 | 85 | result = geojson_to_wkt(geojson_data) 86 | click.echo(result) 87 | 88 | 89 | @click.command() 90 | @click.argument('geojson_input', type=click.File('r'), required=False) 91 | @click.option('--only-quadkey', is_flag=True, help='Include only the quadkey in the WHERE clause.') 92 | @click.option('--local', is_flag=True, help='Use local path for parquet files instead of the S3 URL.') 93 | def sql(geojson_input, only_quadkey, local): 94 | """Generate an SQL query based on the input GeoJSON.""" 95 | 96 | # Read the GeoJSON 97 | if geojson_input: 98 | geojson_data = json.load(geojson_input) 99 | else: 100 | geojson_data = json.load(click.get_text_stream('stdin')) 101 | 102 | quadkey = geojson_to_quadkey(geojson_data) 103 | wkt = geojson_to_wkt(geojson_data) 104 | 105 | # Adjust the path in read_parquet based on the --local flag 106 | path = '*.parquet' if local else 
's3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-2/*.parquet' 107 | base_sql = f"select * from read_parquet('{path}')" 108 | 109 | # Construct the WHERE clause based on the options 110 | where_clause = f"WHERE quadkey LIKE '{quadkey}%'" 111 | if not only_quadkey: 112 | where_clause += f" AND\nST_Within(ST_GeomFromWKB(geometry), ST_GeomFromText('{wkt}'))" 113 | 114 | sql_query = f"{base_sql},\n{where_clause}" 115 | full_sql_query = f"COPY ('{sql_query}' TO 'buildings.fgb' WITH (FORMAT GDAL, DRIVER 'FlatGeobuf')" 116 | click.echo(full_sql_query) 117 | 118 | @cli.command() 119 | @click.argument('quadkey_input', type=str) 120 | def quad2json(quadkey_input): 121 | """Convert quadkey to GeoJSON.""" 122 | result = quadkey_to_geojson(quadkey_input) 123 | click.echo(json.dumps(result, indent=2)) 124 | 125 | def download( 126 | geojson_data: Dict[str, Any], 127 | dst: Union[Path, str] = "buildings.json", 128 | source: Union[Source, str] = Source.OVERTURE, 129 | format: Optional[Union[Format, str]] = None, 130 | country_iso: Optional[str] = None, 131 | *, 132 | generate_sql: bool = False, # whether to actually perform actions or just generate sql 133 | verbose: bool = False, # print detailed logs 134 | silent: bool = False, # no log output 135 | overwrite: bool = False # whether to overwrite existing output file 136 | ) -> None: 137 | """ 138 | Extract buildings from online sources. 139 | 140 | Parameters 141 | ---------- 142 | geojson_input : Dict[str, Any] 143 | GeoJSON dictionary 144 | dst : Path | str 145 | The path to write the output to. Can be either a file or a directory. 146 | If a directory is provided, a file "buildings." will be created at that location. 147 | format : string, default "geojson" 148 | The output format, alternatively can be extracted from "dst". Explicitly naming the format can be useful if 149 | used in combination with a directory as "dst". If both file path and format param is provided, the format param takes 150 | precedence. 151 | country_iso : str, optional 152 | A two-letter ISO-3166 code for the country the AOI (geojson_input) is in. Not required but massively speeds up queries. 153 | generate_sql : bool, default False 154 | Whether to actually perform DuckDB queries or only generate the SQL. 155 | verbose : bool, default False 156 | Print more detailed log messages. 157 | silent : bool, default False 158 | Suppress log messages. 159 | overwrite : bool, default False 160 | Overwrite existing output files. 161 | """ 162 | # type conversion 163 | if type(source) == str: 164 | try: 165 | source = Source(source.upper()) 166 | except ValueError: 167 | raise ValueError(f"Source {source} is unknown. Please choose one of {' ,'.join([s.name.lower() for s in Source])}.") from e 168 | 169 | if type(format) == str: 170 | try: 171 | format = Format(format.upper()) 172 | except ValueError: 173 | raise ValueError(f"Format {format} is unknown. 
Please choose one of {', '.join(f.name.lower() for f in Format)}.") from e 174 | 175 | if type(dst) == str: 176 | dst = Path(dst) 177 | 178 | # validate path and extension 179 | if os.path.isdir(dst): 180 | dst = dst.joinpath("buildings.json") 181 | 182 | if format and dst: 183 | # format takes precedence 184 | dst = dst.joinpath(f"{dst.stem}.{settings.extensions[format]}") 185 | 186 | if not format and dst: 187 | for fmt, ext in settings.extensions.items(): 188 | if dst.name.endswith(ext): 189 | format = fmt 190 | break 191 | else: # The for-else structure means the else block runs if the loop completes normally, without a break. 192 | raise ValueError(f"Can't identify file extension of {dst}. Please choose one of {', '.join([f.name.lower() for f in Format])}.") 193 | 194 | # utils (should be in separate utils file?) 195 | def print_timestamped_message(message): 196 | if not silent: 197 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 198 | click.echo(f"[{current_time}] {message}") 199 | 200 | def print_elapsed_time(start_time): 201 | end_time = time.time() 202 | 203 | elapsed_time = end_time - start_time 204 | print_timestamped_message(f"Operation took {elapsed_time:.2f} seconds.") 205 | 206 | # main program 207 | start_time = time.time() 208 | if verbose: 209 | print_timestamped_message("Reading GeoJSON input...") 210 | 211 | if os.path.exists(dst) and not generate_sql: 212 | if overwrite: 213 | if verbose: 214 | print_timestamped_message(f"Deleting existing file at {dst}.") 215 | os.remove(dst) 216 | else: 217 | # Print message that the file already exists and cleanly exit the program 218 | print_timestamped_message(f"File at {dst} already exists. Use --overwrite to overwrite it.") 219 | return 220 | 221 | if verbose: 222 | print_timestamped_message("Converting GeoJSON to quadkey and WKT...") 223 | quadkey = geojson_to_quadkey(geojson_data) 224 | wkt = geojson_to_wkt(geojson_data) 225 | 226 | country_info = "" 227 | if country_iso is not None: 228 | country_info = f"in country {country_iso}" 229 | print_timestamped_message(f"Querying and downloading data for quadkey {quadkey} {country_info}...") 230 | if verbose: 231 | print_timestamped_message(f"WKT: {wkt}") 232 | if country_info != "": 233 | print_timestamped_message(f"Expect query times of at least 5-10 seconds") 234 | else: 235 | print_timestamped_message(f"Expect query times of at least 30 seconds - this can be lessened by using the --country-iso option") 236 | 237 | # download data into DuckDB 238 | hive_partitioning = settings.sources[source].hive_partitioning 239 | hive_value = 1 if hive_partitioning else 0 240 | select_values = "* EXCLUDE geometry" 241 | # if source is overture and the output is not parquet, then name the values to get 242 | # so we don't get the crazy structs that gis formats barf on 243 | if source == Source.OVERTURE and format != Format.PARQUET: 244 | select_values = "id, level, height, numfloors, class, country_iso, quadkey" 245 | base_sql = f"select {select_values}, ST_AsWKB(ST_GeomFromWKB(geometry)) AS geometry from read_parquet('{settings.sources[source].base_url}', hive_partitioning={hive_value})" 246 | where_clause = "WHERE " 247 | if country_iso: 248 | where_clause += f"country_iso = '{country_iso}' AND " 249 | where_clause += f"quadkey LIKE '{quadkey}%'" 250 | where_clause += f" AND\nST_Within(ST_GeomFromWKB(geometry), ST_GeomFromText('{wkt}'))" 251 | 252 | create_clause = f"CREATE TABLE buildings AS ({base_sql},\n{where_clause});" 253 | if generate_sql or verbose: 254 | 
print_timestamped_message(create_clause)
255 | 
256 |     if not generate_sql:
257 |         conn = duckdb.connect(database=':memory:')
258 | 
259 |         spatial_extension_query = conn.execute("SELECT * FROM duckdb_extensions() WHERE installed IS TRUE AND extension_name = 'spatial';").fetchone()
260 |         if spatial_extension_query is None:
261 |             print_timestamped_message("Installing DuckDB spatial extension...")
262 |             conn.execute("INSTALL spatial;")
263 |         conn.execute("LOAD spatial;")
264 |         conn.execute(create_clause)
265 | 
266 |         count = conn.execute("SELECT COUNT(*) FROM buildings;").fetchone()[0]
267 | 
268 |         print_timestamped_message(f"Downloaded {count} features into DuckDB.")
269 |         if count == 0:
270 |             if country_iso is not None:
271 |                 print_timestamped_message(f"If you are sure that your GeoJSON should have buildings, then check to be sure that {country_iso} is the right code.")
272 |             if verbose:
273 |                 print_elapsed_time(start_time)
274 |             return
275 | 
276 |     # export to dst
277 |     if not generate_sql:
278 |         print_timestamped_message(f"Writing to {dst}...")
279 | 
280 |     if format == Format.PARQUET:
281 |         copy_statement = f"COPY buildings TO '{dst}' WITH (FORMAT Parquet);"
282 |         if generate_sql or verbose:
283 |             print_timestamped_message(copy_statement)
284 |         if not generate_sql:
285 |             conn.execute(f"COPY buildings TO '{dst}' WITH (FORMAT Parquet);")
286 |             try:
287 |                 df = pd.read_parquet(dst)
288 | 
289 |                 # Convert the WKB geometry column to geopandas geometry (shapely.wkb is not imported in this module, so use geopandas' from_wkb)
290 |                 df['geometry'] = gpd.GeoSeries.from_wkb(df['geometry'])
291 |                 gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
292 |                 # Write to a sibling file with .parquet replaced by _geo.parquet
293 |                 output_filename = dst.with_name(dst.name.replace(".parquet", "_geo.parquet"))
294 | 
295 |                 gdf.to_parquet(output_filename)
296 |                 # delete the original file
297 |                 os.remove(dst)
298 |                 # Rename (move) the output file to the input filename
299 |                 shutil.move(output_filename, dst)
300 |                 if verbose:
301 |                     print_timestamped_message(f"Finished processing {dst} at {time.ctime()}")
302 |             except Exception as e:
303 |                 print(f"Error processing {dst} to geoparquet: {e}")
304 |     else:
305 |         gdal_format = {
306 |             Format.SHAPEFILE: 'ESRI Shapefile',
307 |             Format.GEOJSON: 'GeoJSON',
308 |             Format.GEOPACKAGE: 'GPKG',
309 |             Format.FLATGEOBUF: 'FlatGeobuf'
310 |         }
311 |         conn.execute(f"COPY buildings TO '{dst}' WITH (FORMAT GDAL, DRIVER '{gdal_format[format]}');")
312 | 
313 |     if verbose:
314 |         print_elapsed_time(start_time)
315 | 
316 | # Register the commands with the cli group
317 | cli.add_command(quadkey)
318 | cli.add_command(WKT)
319 | cli.add_command(sql)
320 | cli.add_command(quad2json)
321 | #cli.add_command(download)
322 | 
323 | if __name__ == '__main__':
324 |     cli()
325 | 
--------------------------------------------------------------------------------
/open_buildings/google/__init.py__:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/open_buildings/google/add_columns.py:
--------------------------------------------------------------------------------
1 | # This script is a slightly more generic version of the overture add_columns.py. There's
2 | # some chance it could be completely generic, but I was just trying to work on google buildings
3 | # so I put it under there. The main difference is that it doesn't use the midpoint of the
4 | # bbox struct, since that's unique to overture. It just uses the centroid of the geometry.
5 | # That could likely work just as well if not better for overture too, so we likely can just 6 | # get rid of that. 7 | # The other thing that would be nice to make it truly generic is to be able to supply the 8 | # table name, since this should work fine with other types of data. Could also just call it 9 | # 'features' by default, the table name doesn't really matter in these processings. Should probably check 10 | # to be sure it works with lines and points too. So this could use clean up, also just 11 | # removing the 'midpoint' code. 12 | 13 | import os 14 | import duckdb 15 | import time 16 | import tempfile 17 | import subprocess 18 | import glob 19 | from duckdb.typing import * 20 | import mercantile 21 | from shapely import wkt 22 | import shutil 23 | 24 | def lat_lon_to_quadkey(wkt_point: VARCHAR, level: INTEGER) -> VARCHAR: 25 | 26 | geom = wkt.loads(wkt_point) 27 | 28 | # convert geom to tile 29 | tile = mercantile.tile(geom.x, geom.y, level) 30 | 31 | # Convert the tile to a quadkey 32 | quadKey = mercantile.quadkey(tile) 33 | return quadKey 34 | 35 | def midpoint(minval: DOUBLE, maxval: DOUBLE) -> DOUBLE: 36 | return (minval + maxval) / 2.0 37 | 38 | def add_quadkey(con): 39 | 40 | # Register Python UDFs 41 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [VARCHAR, INTEGER], VARCHAR) 42 | con.create_function('midpoint', midpoint, [DOUBLE, DOUBLE], DOUBLE) 43 | 44 | # Add a quadkey column to the table if it doesn't exist 45 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 46 | 47 | # Update the quadkey column 48 | con.execute(""" 49 | UPDATE buildings 50 | SET quadkey = lat_lon_to_quadkey(ST_Centroid(ST_GeomFromWKB(geometry)), 51 | 12 52 | ); 53 | """) 54 | 55 | def add_country_iso(con, country_parquet_path): 56 | # Load country parquet file into duckdb 57 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 58 | 59 | # Add a country_iso column to the buildings table 60 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 61 | 62 | # Update the country_iso column in the buildings table 63 | con.execute(""" 64 | UPDATE buildings 65 | SET country_iso = countries.isocountrycodealpha2 66 | FROM countries 67 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(buildings.geometry)) 68 | """) 69 | 70 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 71 | # Ensure output_folder exists 72 | os.makedirs(output_folder, exist_ok=True) 73 | 74 | # Get unique identifier from file name 75 | file_id = os.path.basename(input_parquet_path) 76 | 77 | # Define output paths 78 | output_db_path = os.path.join(output_folder, f'{file_id}.duckdb') 79 | output_parquet_path = os.path.join(output_folder, f'{file_id}') 80 | 81 | # Check if output files exist 82 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 83 | print(f'Files with ID {file_id} already exist. 
Skipping...') 84 | return 85 | 86 | # Overwrite mode: remove existing files 87 | if overwrite: 88 | for file_path in [output_db_path, output_parquet_path]: 89 | if os.path.exists(file_path): 90 | os.remove(file_path) 91 | timestamp = time.time() 92 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 93 | 94 | # Connect to DuckDB 95 | con = duckdb.connect(output_db_path) 96 | 97 | con.execute('LOAD spatial;') 98 | 99 | # Load parquet file into duckdb 100 | con.execute(f"CREATE TABLE buildings AS SELECT * FROM read_parquet('{input_parquet_path}')") 101 | 102 | if add_quadkey_option: 103 | add_quadkey(con) 104 | 105 | if add_country_iso_option: 106 | add_country_iso(con, country_parquet_path) 107 | 108 | # Write out to Parquet 109 | con.execute(f"COPY (SELECT * FROM buildings ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 110 | 111 | if (False): 112 | print(f"Converting to geoparquet: {output_parquet_path}") 113 | # Create a temporary file 114 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 115 | temp_file.close() # Close the file so gpq can open it 116 | 117 | # Convert the Parquet file to a GeoParquet file using gpq 118 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 119 | subprocess.run(gpq_cmd, check=True) 120 | 121 | # Rename the temp file to the final filename 122 | shutil.move(temp_file.name, f'{output_parquet_path}') 123 | #os.rename(temp_file.name, f'{output_parquet_path}') 124 | 125 | print(f"Processing complete for file {input_parquet_path}") 126 | 127 | remove_duckdb = False 128 | 129 | # remove duckdb file 130 | if (remove_duckdb): 131 | os.remove(output_db_path) 132 | 133 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 134 | # If input_path is a directory, process all Parquet files in it 135 | if os.path.isdir(input_path): 136 | for file in glob.glob(os.path.join(input_path, "*.parquet")): 137 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 138 | else: 139 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 140 | 141 | # Call the function 142 | input_path = '/Users/cholmes/geodata/google-buildings-v3/geoparquet/' 143 | output_folder = '/Users/cholmes/geodata/google-buildings-v3/geoparquet-columns' 144 | country_parquet_path = '/Volumes/fastdata/overture/countries.parquet' 145 | process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True) -------------------------------------------------------------------------------- /open_buildings/google/partition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script takes a DuckDB database with a buildings table and converts it to GeoParquet 3 | files partitioned on first country and then quadkey. The buildings table must have a 4 | country_iso field and quadkey field, populated by overture-buildings-parquet-add-columns.py. 5 | The main function is process_db(), and it will take as input a maximum number of rows per 6 | file and a row group size for the Parquet files. It will then iterate through the countries 7 | in the database and partition the buildings table into GeoParquet files for each country. 
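For example, a small country may come out as a single file such as RW.parquet, while a populous one is
split into quadkey-suffixed files such as US_0231.parquet (names illustrative; the actual patterns are
'{country_code}.parquet' and '{country_code}_{qk_str}.parquet' in the code below).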
8 | If the number of rows for a country is greater than the maximum number of rows per file, 9 | it will partition the country into quadkeys and create GeoParquet files for each quadkey. 10 | Those quadkeys will be further partitioned if necessary until the number of rows for a 11 | quadkey is less than or equal to the maximum number of rows per file. 12 | """ 13 | 14 | import duckdb 15 | import datetime 16 | import subprocess 17 | import tempfile 18 | import os 19 | import click 20 | import shutil 21 | import geopandas as gpd 22 | from shapely import wkb 23 | import pandas as pd 24 | import time 25 | 26 | def current_time_str(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 28 | 29 | def print_verbose(msg, verbose): 30 | if verbose: 31 | print(f"[{current_time_str()}] {msg}") 32 | 33 | def convert_gpq(input_filename, row_group_size, verbose): 34 | print_verbose(f"Starting conversion for {input_filename} using gpq (row_group_size ignored).", verbose) 35 | 36 | # Create a temporary file 37 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 38 | temp_file.close() # Close the file so gpq can open it 39 | 40 | # Convert the Parquet file to a GeoParquet file using gpq 41 | gpq_cmd = ['gpq', 'convert', input_filename, temp_file.name] 42 | subprocess.run(gpq_cmd, check=True) 43 | 44 | print_verbose(f"Conversion for {input_filename} using gpq finished.", verbose) 45 | 46 | # Rename (move) the temp file to the final filename 47 | shutil.move(temp_file.name, input_filename) 48 | 49 | # Delete the initial temp file if it still exists 50 | #initial_temp_filename = f'{country_code}_temp.parquet' 51 | #if os.path.exists(initial_temp_filename): 52 | # os.remove(initial_temp_filename) 53 | 54 | def convert_pandas(input_filename, rg_size, verbose): 55 | # Placeholder function to be fleshed out 56 | print_verbose("Starting conversion using pandas.", verbose) 57 | try: 58 | df = pd.read_parquet(input_filename) 59 | 60 | # Convert WKB geometry to geopandas geometry 61 | df['geometry'] = df['geometry'].apply(wkb.loads, hex=True) 62 | gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") 63 | # Change output file the input_filename with .parquet replaced with _geo.parquet 64 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 65 | 66 | gdf.to_parquet(output_filename, row_group_size=rg_size) 67 | # delete the original file 68 | os.remove(input_filename) 69 | # Rename (move) the output file to the input filename 70 | shutil.move(output_filename, input_filename) 71 | print(f"Finished processing {input_filename} at {time.ctime()}") 72 | except Exception as e: 73 | print(f"Error processing {input_filename}: {e}") 74 | 75 | #not quite working yet - not sure what's wrong. Should go faster than pandas. 
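# A hedged guess at the issue (untested): ROW_GROUP_SIZE is a layer *creation* option of GDAL's
# Parquet driver, so it would be passed to ogr2ogr as '-lco', f'ROW_GROUP_SIZE={rg_size}' rather
# than via '-oo', which sets open options on the input datasource.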
76 | def convert_ogr(input_filename, rg_size, verbose): 77 | fields_to_keep = ['confidence', 'area_in_meters', 'full_plus_code', 'country_iso', 'quadkey'] 78 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 79 | rg_cmd = f"ROW_GROUP_SIZE={rg_size}" 80 | cmd = [ 81 | 'ogr2ogr', 82 | '-f', 83 | 'Parquet', 84 | '-select', 85 | ','.join(fields_to_keep), 86 | output_filename, 87 | input_filename, 88 | # '-oo', 89 | # rg_cmd, 90 | '-oo', 91 | 'GEOM_POSSIBLE_NAMES=geometry', 92 | '-a_srs', 93 | 'EPSG:4326', ] 94 | 95 | # print the ogr2ogr command that will be run 96 | if verbose: 97 | print("ogr2ogr command:") 98 | print(' '.join(cmd)) 99 | 100 | # Run the command 101 | subprocess.run(cmd, check=True) 102 | 103 | # delete the original file 104 | os.remove(input_filename) 105 | # Rename (move) the output file to the input filename 106 | shutil.move(output_filename, input_filename) 107 | print(f"Finished processing {input_filename} at {time.ctime()}") 108 | 109 | if verbose: 110 | print(f"Converted {input_filename} to {output_filename} using ogr2ogr.") 111 | 112 | 113 | 114 | def fetch_quadkeys(conn, table_name, country_code, length, verbose, prev_qk=""): 115 | query = f"SELECT DISTINCT SUBSTR(quadkey, 1, {length}) FROM {table_name} WHERE country_iso = '{country_code}'" 116 | if prev_qk: 117 | query += f" AND SUBSTR(quadkey, 1, {len(prev_qk)}) = '{prev_qk}'" 118 | print_verbose(f'Executing: {query}', verbose) 119 | return conn.execute(query).fetchall() 120 | 121 | def convert_to_geoparquet(parquet_path, geo_conversion, row_group_size, verbose): 122 | if geo_conversion == 'gpq': 123 | convert_gpq(parquet_path, row_group_size, verbose) 124 | print_verbose(f"File: {parquet_path} written with gpq", verbose) 125 | elif geo_conversion == 'pandas': 126 | convert_pandas(parquet_path, row_group_size, verbose) 127 | print_verbose(f"File: {parquet_path} written with pandas", verbose) 128 | elif geo_conversion == 'ogr': 129 | convert_ogr(parquet_path, row_group_size, verbose) 130 | print_verbose(f"File: {parquet_path} written with ogr", verbose) 131 | else: 132 | print_verbose(f"File: {parquet_path} written without converting to GeoParquet", verbose) 133 | 134 | #TODO: go all the way into the quad to find the smallest quadkey that contains less than max_per_file rows 135 | def process_quadkey_recursive(conn, table_name, country_code, output_folder, length, geo_conversion, row_group_size, verbose, max_per_file, current_qk=""): 136 | distinct_quadkeys = fetch_quadkeys(conn, table_name, country_code, length, verbose, current_qk) 137 | print_verbose(f"The list of quadkeys for country {country_code} and length {length} is {distinct_quadkeys}", verbose) 138 | #num_distinct_qk = len(distinct_quadkeys) 139 | for qk in distinct_quadkeys: 140 | qk_str = qk[0] 141 | qk_count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}'" 142 | print_verbose(f'Executing: {qk_count_query}', verbose) 143 | qk_count = conn.execute(qk_count_query).fetchone()[0] 144 | print_verbose(f"Quadkey {qk_str} has {qk_count} rows", verbose) 145 | if qk_count > max_per_file: 146 | process_quadkey_recursive(conn, table_name, country_code, output_folder, length + 1, geo_conversion, row_group_size, verbose, max_per_file, qk_str) 147 | else: 148 | quad_output_filename = os.path.join(output_folder, f'{country_code}_{qk_str}.parquet') 149 | if os.path.exists(quad_output_filename): 150 | print_verbose(f"Output file {quad_output_filename} already exists, 
skipping...", verbose) 151 | else: 152 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}' ORDER BY quadkey) TO '{quad_output_filename}' WITH (FORMAT PARQUET);" 153 | print_verbose(f'Executing: {copy_cmd}', verbose) 154 | conn.execute(copy_cmd) 155 | convert_to_geoparquet(quad_output_filename, geo_conversion, row_group_size, verbose) 156 | 157 | 158 | # TODO: add option for 'hive' output (put things in folder) 159 | # TODO: add option to read duckdb path from an environment variable 160 | # TODO: add row group size option (first works with duckdb) 161 | 162 | @click.command() 163 | @click.argument('duckdb-path', type=click.Path(exists=True)) 164 | @click.option('--output-folder', default=os.getcwd(), type=click.Path(), help='Folder to store the output files') 165 | @click.option('--geo-conversion', default='gpq', type=click.Choice(['gpq', 'none', 'pandas', 'ogr'], case_sensitive=False)) 166 | @click.option('--verbose', is_flag=True, default=False, help='Print verbose output') 167 | @click.option('--max-per-file', default=10000000, type=int, help='Maximum number of rows per file') 168 | @click.option('--row-group-size', default=10000, type=int, help='Row group size for Parquet files') 169 | @click.option('--hive', is_flag=True, default=False, help='Output files in Hive format (folder structure)') 170 | def process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive): 171 | table_name = 'buildings' 172 | # create output folder if it does not exist 173 | os.makedirs(output_folder, exist_ok=True) 174 | conn = duckdb.connect(duckdb_path) 175 | conn.execute('LOAD spatial;') 176 | cursor = conn.execute('SELECT DISTINCT country_iso FROM buildings') 177 | countries = cursor.fetchall() 178 | 179 | print_verbose(f'Found {len(countries)} unique countries', verbose) 180 | countries.reverse() 181 | for country in countries: 182 | country_code = country[0] 183 | write_folder = output_folder 184 | if (hive): 185 | write_folder = os.path.join(output_folder, f'country_iso={country_code}') 186 | os.makedirs(write_folder, exist_ok=True) 187 | output_filename = os.path.join(write_folder, f'{country_code}.parquet') 188 | if os.path.exists(output_filename): 189 | print_verbose(f"Output file for country {country_code} already exists, skipping...", verbose) 190 | continue 191 | 192 | count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}'" 193 | print_verbose(f'Executing: {count_query}', verbose) 194 | count = conn.execute(count_query).fetchone()[0] 195 | print_verbose(f"Country {country_code} has {count} rows", verbose) 196 | 197 | if count <= max_per_file: 198 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' ORDER BY quadkey) TO '{output_filename}' WITH (FORMAT PARQUET);" 199 | print_verbose(f'Executing: {copy_cmd}', verbose) 200 | conn.execute(copy_cmd) 201 | convert_to_geoparquet(output_filename, geo_conversion, row_group_size, verbose) 202 | else: 203 | process_quadkey_recursive(conn, table_name, country_code, write_folder, 1, geo_conversion, row_group_size, verbose, max_per_file) 204 | 205 | if __name__ == "__main__": 206 | process_db() -------------------------------------------------------------------------------- /open_buildings/google/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from datetime import datetime, timedelta 5 
| import json 6 | 7 | import click 8 | import glob 9 | import duckdb 10 | import pandas as pd 11 | import geopandas as gpd 12 | from shapely import wkt 13 | from shapely.geometry import mapping 14 | from openlocationcode import openlocationcode as olc 15 | 16 | # Global variable, that runs GPQ (https://github.com/planetlabs/gpq) after DuckDB writes the Parquet file. 17 | # This is necessary because DuckDB does not write the GeoParquet metadata (yet). Once DuckDB implements 18 | # this feature can be removed. Setting it to false will give a sense of how fast DuckDB will be, but 19 | # if you want to actually use the output GeoParquet files, set it to True. 20 | RUN_GPQ_CONVERSION = True 21 | 22 | # Global variable, that sets the compression type for the Parquet files. The two options that 23 | # will work for both DuckDB and pandas are 'snappy' and 'gzip'. 'snappy' is the default. You can 24 | # try out brotli with pandas, it seems to give the most compression. DuckDB additional supports 25 | # zstd, but pandas does not. Note that GPQ conversion on DuckDB output likely keeps the same 26 | # compression, but I have not tested this. GPQ conversion from Parquet does not yet support 27 | # the other GPQ compression options. 28 | PARQUET_COMPRESSION = 'snappy' 29 | 30 | # Don't run the DuckDB GPKG conversion if set to true, as it takes a long time, likely due to a bug. 31 | # It means longer runs and puts one big time on the graphs. 32 | SKIP_DUCK_GPKG = True 33 | 34 | @click.group() 35 | def cli(): 36 | pass 37 | 38 | 39 | def define_output_paths(input_file_path, output_directory, format): 40 | output_file_name = os.path.basename(input_file_path)[:-3] + format 41 | output_file_path = os.path.join(output_directory, output_file_name) 42 | # TODO: the -3 doesn't work with .parquet, leads to a weird file name, but duck doesn't care. 
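    # A suffix-safe sketch (not wired in) that avoids the hard-coded -3 slice by using os.path.splitext,
    # so parquet output and duckdb side-files get sensible names too:
    #   stem = os.path.splitext(os.path.basename(input_file_path))[0]
    #   output_file_path = os.path.join(output_directory, f"{stem}.{format}")
    #   duckdb_file_path = os.path.join(output_directory, f"{stem}.duckdb")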
43 | duckdb_file_path = output_file_path[:-3] + 'duckdb' 44 | return output_file_path, duckdb_file_path 45 | 46 | 47 | def remove_existing_files(output_file_path, duckdb_file_path, overwrite): 48 | if overwrite: 49 | if os.path.exists(output_file_path): 50 | os.remove(output_file_path) 51 | if os.path.exists(duckdb_file_path): 52 | os.remove(duckdb_file_path) 53 | 54 | 55 | def process_with_duckdb( 56 | input_file_path, 57 | duckdb_file_path, 58 | split_multipolygons, 59 | verbose, 60 | format, 61 | output_file_path, 62 | ): 63 | # new duckdb at input file path but with .duckdb 64 | conn = duckdb.connect(duckdb_file_path) 65 | c = conn.cursor() 66 | c.execute(f"install spatial;") 67 | c.execute(f"load spatial;") 68 | c.execute( 69 | f"create table buildings as (select * EXCLUDE (latitude, longitude) from '{input_file_path}');" 70 | ) 71 | 72 | if verbose: 73 | c.execute("SELECT COUNT(*) FROM buildings") 74 | print(f"Original rows: {c.fetchone()[0]}") 75 | 76 | if split_multipolygons: 77 | # Fetch the multipolygons 78 | c.execute("SELECT * FROM buildings WHERE geometry LIKE 'MULTIPOLYGON%'") 79 | results = c.fetchall() 80 | columns = [desc[0] for desc in c.description] 81 | 82 | multipolygon_count = 0 83 | 84 | # Process each multipolygon 85 | for row in results: 86 | multipolygon_count += 1 87 | row_dict = dict(zip(columns, row)) 88 | multipolygon = wkt.loads(row_dict['geometry']) 89 | 90 | if verbose: 91 | # Print the original MultiPolygon 92 | feature = { 93 | "type": "Feature", 94 | "properties": { 95 | k: v for k, v in row_dict.items() if k != 'geometry' 96 | }, 97 | "geometry": multipolygon.__geo_interface__, 98 | } 99 | print("Original MultiPolygon:") 100 | print(json.dumps(feature)) 101 | 102 | for polygon in multipolygon.geoms: 103 | # Convert the polygon to a GeoSeries in order to project it 104 | polygon_projected = gpd.GeoSeries([polygon], crs="EPSG:4326").to_crs( 105 | 'EPSG:6933' 106 | ) 107 | 108 | # Compute the new area (geopandas calculates area in square meters for projected CRS) 109 | new_area = polygon_projected.area.values[0] 110 | 111 | # Compute the centroid and encode it into a Plus Code 112 | centroid = polygon.centroid 113 | new_plus_code = olc.encode(centroid.y, centroid.x, codeLength=12) 114 | 115 | # Create new properties for the polygon 116 | properties = {k: v for k, v in row_dict.items() if k != 'geometry'} 117 | properties['area_in_meters'] = new_area 118 | properties['full_plus_code'] = new_plus_code 119 | 120 | if verbose: 121 | # Print the new Polygon 122 | feature = { 123 | "type": "Feature", 124 | "properties": properties, 125 | "geometry": polygon.__geo_interface__, 126 | } 127 | print("Component Polygon:") 128 | print(json.dumps(feature)) 129 | 130 | # Insert new polygon into buildings table 131 | columns_str = ', '.join( 132 | [f'"{k}"' for k in properties.keys()] + ['geometry'] 133 | ) 134 | values_str = ', '.join( 135 | [ 136 | f"'{v}'" if isinstance(v, str) else str(v) 137 | for v in properties.values() 138 | ] 139 | + [f"'{polygon.wkt}'"] 140 | ) 141 | c.execute( 142 | f"INSERT INTO buildings ({columns_str}) VALUES ({values_str})" 143 | ) 144 | 145 | if verbose: 146 | print(f"Processed {multipolygon_count} multipolygons.") 147 | 148 | # Delete the original multipolygons 149 | c.execute("DELETE FROM buildings WHERE geometry LIKE 'MULTIPOLYGON%'") 150 | 151 | if verbose: 152 | c.execute("SELECT COUNT(*) FROM buildings") 153 | print(f"Output rows: {c.fetchone()[0]}") 154 | 155 | c.execute("SELECT COUNT(*) FROM buildings WHERE geometry LIKE 
'MULTIPOLYGON%'") 156 | print(f"Output multipolygons: {c.fetchone()[0]}") 157 | 158 | c.execute("SELECT COUNT(*) FROM buildings WHERE geometry LIKE 'POLYGON%'") 159 | print(f"Output polygons: {c.fetchone()[0]}") 160 | 161 | if format == 'fgb': 162 | c.execute( 163 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 164 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'FlatGeobuf');" 165 | ) 166 | elif format == 'parquet': 167 | c.execute( 168 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 169 | TO '{output_file_path}' WITH (FORMAT PARQUET, COMPRESSION '{PARQUET_COMPRESSION}');" 170 | ) 171 | if RUN_GPQ_CONVERSION: 172 | print( 173 | f"Running gpq convert on {output_file_path}. This takes extra time but ensures the output is valid GeoParquet." 174 | ) 175 | base_name, ext = os.path.splitext(output_file_path) 176 | temp_output_file_path = base_name + '_temp' + ext 177 | 178 | # convert from parquet file with a geometry column named wkb to GeoParquet 179 | command = ['gpq', 'convert', output_file_path, temp_output_file_path] 180 | gpq_start_time = time.time() 181 | subprocess.run(command, check=True) 182 | os.rename(temp_output_file_path, output_file_path) 183 | gpq_end_time = time.time() 184 | gpq_elapsed_time = gpq_end_time - gpq_start_time 185 | print(f"Time taken to run gpq: {gpq_elapsed_time:.2f} seconds") 186 | else: 187 | print( 188 | f"Skipping gpq convert on {output_file_path}. This means the output will be WKB, but it will need to be converted to GeoParquet." 189 | ) 190 | elif format == 'gpkg': 191 | if SKIP_DUCK_GPKG: 192 | print( 193 | f"Skipping duckdb-gpkg conversion on {output_file_path}, since SKIP_DUCK_GPKG is set to True. There is likely a bug, since it takes way longer and skews the graphs" 194 | ) 195 | else: 196 | c.execute( 197 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 198 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'GPKG');" 199 | ) 200 | elif format == 'shp': 201 | c.execute( 202 | f"COPY (SELECT * EXCLUDE geometry, ST_AsWKB(ST_GeomFromText(geometry)) AS geometry from buildings) \ 203 | TO '{output_file_path}' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile');" 204 | ) 205 | 206 | conn.close() 207 | 208 | 209 | def process_with_pandas( 210 | input_file_path, split_multipolygons, verbose, format, output_file_path 211 | ): 212 | df = pd.read_csv(input_file_path) 213 | gs = gpd.GeoSeries.from_wkt(df['geometry']) 214 | 215 | # Drop the 'latitude', 'longitude' and 'geometry' columns 216 | df = df.drop(['latitude', 'longitude', 'geometry'], axis=1) 217 | 218 | # Convert the DataFrame to a GeoDataFrame 219 | gdf = gpd.GeoDataFrame(df, geometry=gs, crs="EPSG:4326") 220 | 221 | # Create an empty GeoDataFrame for the output 222 | output_gdf = gpd.GeoDataFrame(columns=list(gdf.columns), crs=gdf.crs) 223 | 224 | if split_multipolygons: 225 | multipolygons = gdf[gdf.geometry.type == 'MultiPolygon'] 226 | multipolygon_count = 0 227 | for i, row in multipolygons.iterrows(): 228 | multipolygon_count += 1 229 | # Print the original MultiPolygon 230 | feature = { 231 | "type": "Feature", 232 | "properties": row.drop('geometry').to_dict(), 233 | "geometry": row.geometry.__geo_interface__, 234 | } 235 | if verbose: 236 | print("Original MultiPolygon:") 237 | print(json.dumps(feature)) 238 | 239 | # Print each component Polygon 240 | for polygon in row.geometry.geoms: 241 | # Convert the polygon to a GeoSeries in order 
to project it 242 | polygon_projected = gpd.GeoSeries([polygon], crs=gdf.crs).to_crs( 243 | 'EPSG:6933' 244 | ) 245 | 246 | # Compute the new area (geopandas calculates area in square meters for projected CRS) 247 | new_area = polygon_projected.area.values[0] 248 | 249 | # Compute the centroid and encode it into a Plus Code 250 | centroid = polygon.centroid 251 | new_plus_code = olc.encode(centroid.y, centroid.x, codeLength=12) 252 | 253 | # Create new properties for the polygon 254 | properties = row.drop('geometry').to_dict() 255 | properties['area_in_meters'] = new_area 256 | properties['full_plus_code'] = new_plus_code 257 | 258 | # Append to the output GeoDataFrame 259 | output_gdf = pd.concat( 260 | [ 261 | output_gdf, 262 | gpd.GeoDataFrame([properties], geometry=[polygon], crs=gdf.crs), 263 | ], 264 | ignore_index=True, 265 | ) 266 | 267 | # Print the new Polygon 268 | feature = { 269 | "type": "Feature", 270 | "properties": properties, 271 | "geometry": polygon.__geo_interface__, 272 | } 273 | if verbose: 274 | print("Component Polygon:") 275 | print(json.dumps(feature)) 276 | 277 | print(f"Processed {multipolygon_count} multipolygons.") 278 | # Add the original Polygons to the output 279 | polygons = gdf[gdf.geometry.type == 'Polygon'] 280 | output_gdf = pd.concat([output_gdf, polygons], ignore_index=True) 281 | else: 282 | output_gdf = gdf 283 | 284 | if verbose: 285 | # Print the number of original rows in the datafram, and the number of rows in the output 286 | print(f"Original rows: {len(gdf)}") 287 | print(f"Output rows: {len(output_gdf)}") 288 | # Print number of multipolygons and polygons for the output_gdf 289 | print( 290 | f"Output multipolygons: {len(output_gdf[output_gdf.geometry.type == 'MultiPolygon'])}" 291 | ) 292 | print( 293 | f"Output polygons: {len(output_gdf[output_gdf.geometry.type == 'Polygon'])}" 294 | ) 295 | # Write the output GeoDataFrame to a file 296 | if format == 'fgb': 297 | output_gdf.to_file(output_file_path, driver="FlatGeobuf", engine="pyogrio") 298 | elif format == 'parquet': 299 | output_gdf.to_parquet(output_file_path, compression=PARQUET_COMPRESSION) 300 | elif format == 'gpkg': 301 | output_gdf.to_file( 302 | output_file_path, driver='GPKG', engine="pyogrio", spatial_index=False 303 | ) 304 | elif format == 'shp': 305 | output_gdf.to_file(output_file_path, driver='ESRI Shapefile', engine="pyogrio") 306 | 307 | 308 | def process_with_ogr2ogr( 309 | input_file_path, split_multipolygons, verbose, format, output_file_path 310 | ): 311 | # Define the SQL query to select specific columns 312 | table_name = os.path.splitext(os.path.basename(input_file_path))[0] 313 | 314 | if format == 'fgb': 315 | format_string = "FlatGeobuf" 316 | elif format == 'parquet': 317 | format_string = "Parquet" 318 | elif format == 'gpkg': 319 | format_string = "GPKG" 320 | elif format == 'shp': 321 | format_string = "ESRI Shapefile" 322 | 323 | fields_to_keep = ['confidence', 'area_in_meters', 'full_plus_code'] 324 | 325 | # Define the ogr2ogr command 326 | cmd = [ 327 | 'ogr2ogr', 328 | '-f', 329 | format_string, 330 | '-select', 331 | ','.join(fields_to_keep), 332 | output_file_path, 333 | input_file_path, 334 | '-oo', 335 | 'GEOM_POSSIBLE_NAMES=geometry', 336 | '-a_srs', 337 | 'EPSG:4326', 338 | ] 339 | 340 | # If split_multipolygons is True, print a message and return. 341 | # But skip this if the output format is Shapefile, because shapefiles don't have a difference between polygons and multipolygons. 
342 | if split_multipolygons and format != 'shp': 343 | print("OGR processing doesn't yet support multi polygons") 344 | return 345 | 346 | # print the ogr2ogr command that will be run 347 | if verbose: 348 | print("ogr2ogr command:") 349 | print(' '.join(cmd)) 350 | 351 | # Run the command 352 | subprocess.run(cmd, check=True) 353 | 354 | if verbose: 355 | print(f"Converted {input_file_path} to {output_file_path} using ogr2ogr.") 356 | 357 | 358 | def process_csv_file( 359 | input_file_path, 360 | output_directory, 361 | format, 362 | overwrite, 363 | process, 364 | split_multipolygons, 365 | verbose, 366 | ): 367 | output_file_path, duckdb_file_path = define_output_paths( 368 | input_file_path, output_directory, format 369 | ) 370 | remove_existing_files(output_file_path, duckdb_file_path, overwrite) 371 | 372 | if os.path.exists(output_file_path): 373 | print(f'Skipping {input_file_path} as {output_file_path} already exists.') 374 | return 375 | else: 376 | print( 377 | f'Started converting {input_file_path} with {process} to {format} at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}...' 378 | ) 379 | 380 | start_time = time.time() 381 | 382 | if process == 'duckdb': 383 | process_with_duckdb( 384 | input_file_path, 385 | duckdb_file_path, 386 | split_multipolygons, 387 | verbose, 388 | format, 389 | output_file_path, 390 | ) 391 | elif process == 'pandas': 392 | process_with_pandas( 393 | input_file_path, split_multipolygons, verbose, format, output_file_path 394 | ) 395 | elif process == 'ogr': 396 | process_with_ogr2ogr( 397 | input_file_path, split_multipolygons, verbose, format, output_file_path 398 | ) 399 | 400 | execution_time = time.time() - start_time 401 | print( 402 | f'Finished processing {output_file_path} at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}. 
Execution time: {str(timedelta(seconds=execution_time))}' 403 | ) 404 | 405 | 406 | def process_geometries( 407 | input_path, 408 | output_directory, 409 | format, 410 | overwrite, 411 | process, 412 | split_multipolygons, 413 | verbose, 414 | ): 415 | # Check if the provided path is a directory or a file 416 | if os.path.isdir(input_path): 417 | # List all csv files in the directory 418 | csv_files = glob.glob(os.path.join(input_path, '*.csv')) 419 | 420 | # Sort files by size in ascending order 421 | csv_files.sort(key=lambda x: os.path.getsize(x)) 422 | 423 | # Process each csv file 424 | for input_file_path in csv_files: 425 | process_csv_file( 426 | input_file_path, 427 | output_directory, 428 | format, 429 | overwrite, 430 | process, 431 | split_multipolygons, 432 | verbose, 433 | ) 434 | elif os.path.isfile(input_path) and input_path.endswith('.csv'): 435 | # Process the single csv file 436 | process_csv_file( 437 | input_path, 438 | output_directory, 439 | format, 440 | overwrite, 441 | process, 442 | split_multipolygons, 443 | verbose, 444 | ) 445 | else: 446 | raise ValueError(f"Invalid input path: {input_path}") 447 | 448 | 449 | def process_benchmark( 450 | input_path, output_directory, processes, formats, split_multipolygons, verbose 451 | ): 452 | results = [] 453 | for process in processes: 454 | for format in formats: 455 | start_time = time.time() 456 | process_geometries( 457 | input_path, 458 | output_directory, 459 | format, 460 | True, 461 | process, 462 | split_multipolygons, 463 | verbose, 464 | ) 465 | execution_time = time.time() - start_time 466 | if process == 'duckdb' and format == 'gpkg' and SKIP_DUCK_GPKG: 467 | execution_time = 0 468 | results.append( 469 | { 470 | 'process': process, 471 | 'format': format, 472 | #'execution_time': str(timedelta(seconds=execution_time)), 473 | 'execution_time': execution_time, 474 | } 475 | ) 476 | return results 477 | 478 | if __name__ == "__main__": 479 | cli() 480 | -------------------------------------------------------------------------------- /open_buildings/google/stac-geoparquet.py: -------------------------------------------------------------------------------- 1 | # WARNING - Work in progress 2 | # This isn't working yet, but it's close. The main issue is that the catalog 3 | # and collections aren't getting formed right - I want them in the hive partitions, but 4 | # pystac keeps trying to move them in the recommended STAC structure. Committing in case 5 | # its useful. 6 | # Next approach may just be to form the items individually, as that part seems to be fine, 7 | # and then place them in the catalog and collection manually (maybe pystac can help, but 8 | # may be easier to just use python to adjust the links) 9 | 10 | 11 | import os 12 | import pystac 13 | from pystac import Catalog, Collection, Item, Asset, CatalogType 14 | import geopandas as gpd 15 | from datetime import datetime 16 | import click 17 | from shapely.geometry import box 18 | from dateutil.parser import parse 19 | 20 | def read_geoparquet_bounds(filepath): 21 | """ 22 | Reads a Geoparquet file and returns its bounds and EPSG. 
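    Returns a (bounds, epsg) tuple, where bounds is [minx, miny, maxx, maxy] from GeoDataFrame.total_bounds
    and epsg is the integer code of the file's CRS, e.g. ([-70.07, 12.41, -69.86, 12.64], 4326) for an
    illustrative file in EPSG:4326.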
23 | """ 24 | gdf = gpd.read_parquet(filepath) 25 | bounds = gdf.total_bounds.tolist() 26 | epsg = gdf.crs.to_epsg() # Extract the EPSG code 27 | return bounds, epsg 28 | 29 | def create_stac_item_for_geoparquet(filepath, collection, item_datetime): 30 | filename = os.path.basename(filepath) 31 | file_id, _ = os.path.splitext(filename) 32 | title = filename 33 | 34 | # Get the bounds and CRS 35 | bbox, epsg = read_geoparquet_bounds(filepath) 36 | 37 | # Use the bounds as the geometry too 38 | geometry = box(*bbox).__geo_interface__ 39 | 40 | item = Item(id=file_id, 41 | geometry=geometry, 42 | bbox=bbox, 43 | datetime=item_datetime, 44 | properties={'title': title, 'proj:epsg': epsg}, 45 | collection=collection.id) 46 | 47 | pystac.extensions.projection.ProjectionExtension.add_to(item) 48 | item.add_asset(key="data", asset=Asset(href=filepath, media_type="application/parquet")) 49 | 50 | return item 51 | 52 | @click.command() 53 | @click.argument('directory', type=click.Path(exists=True)) 54 | @click.option('--collection-path', default='collection.json', help='Path to the collection.json file relative to the directory.') 55 | @click.option('--item-datetime', default='2023-05-30T00:00:00Z', help='Datetime for the STAC items.') 56 | @click.option('--catalog-type', type=click.Choice(['SELF_CONTAINED', 'ABSOLUTE_PUBLISHED'], case_sensitive=False), default='SELF_CONTAINED', help='Type of the catalog.') 57 | @click.option('--root-path', default=None, help='Root path for the catalog. Relevant for ABSOLUTE_PUBLISHED catalog type.') 58 | # ... [other necessary imports and functions] 59 | 60 | def main(directory, collection_path, item_datetime, catalog_type, root_path): 61 | catalog_id = 'my-catalog' 62 | catalog_description = 'A catalog of geoparquet files.' 
63 | item_datetime = parse(item_datetime) 64 | collection = Collection.from_file(collection_path) 65 | 66 | # Create the catalog first 67 | catalog = Catalog(id=catalog_id, description=catalog_description, catalog_type=CatalogType[catalog_type]) 68 | 69 | items = [] 70 | for root, _, files in os.walk(directory): 71 | for filename in files: 72 | if filename.endswith(".parquet"): 73 | filepath = os.path.join(root, filename) 74 | item = create_stac_item_for_geoparquet(filepath, collection, item_datetime) 75 | 76 | # Save the item alongside the parquet file 77 | item_path = os.path.join(root, f"{item.id}.json") 78 | item.set_self_href(item_path) 79 | item.save_object() 80 | items.append(item) 81 | 82 | # Create and save the catalog 83 | catalog_path = os.path.join(directory, 'catalog.json') 84 | catalog.set_self_href(catalog_path) 85 | catalog.save_object() 86 | 87 | # Reload the catalog from file 88 | catalog = Catalog.from_file(catalog_path) 89 | 90 | # Add items to the catalog 91 | for item in items: 92 | catalog.add_item(item) 93 | item.add_link(pystac.Link("parent", os.path.relpath(catalog.get_self_href(), os.path.dirname(item.get_self_href())))) 94 | 95 | # Save the updated catalog 96 | catalog.save_object() 97 | 98 | # Load the collection and set its links 99 | collection_path_new = os.path.join(directory, "collection.json") 100 | collection.set_self_href(collection_path_new) 101 | collection.add_child(catalog) 102 | collection.save_object() 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /open_buildings/overture/__init.py__: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /open_buildings/overture/add_columns.py: -------------------------------------------------------------------------------- 1 | # This script is used to take an Overture Parquet file and add columns 2 | # useful for partitioning - it can put in both a quadkey and the country 3 | # ISO code. And then it will write out parquet and use gpq to convert the 4 | # parquet to geoparquet. 
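# --- Editor's illustrative sketch (added commentary, not part of the original
# module): what the quadkey column added below contains. mercantile maps a
# lon/lat pair to a web-mercator tile at a given zoom level, and that tile's
# quadkey is a string with one digit per level, so level 12 yields a
# 12-character key. The coordinates are arbitrary example values.
import mercantile as _mercantile_demo
_demo_tile = _mercantile_demo.tile(-122.4194, 37.7749, 12)  # (lon, lat, zoom)
assert len(_mercantile_demo.quadkey(_demo_tile)) == 12
# -----------------------------------------------------------------------------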
5 | 6 | 7 | import glob 8 | import os 9 | import shutil 10 | import subprocess 11 | import tempfile 12 | import time 13 | 14 | import duckdb 15 | import mercantile 16 | from duckdb.typing import * 17 | 18 | 19 | def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR: 20 | # Convert latitude and longitude to tile using mercantile 21 | tile = mercantile.tile(lon, lat, level) 22 | 23 | # Convert the tile to a quadkey 24 | quadKey = mercantile.quadkey(tile) 25 | return quadKey 26 | 27 | def midpoint(minval: DOUBLE, maxval: DOUBLE) -> DOUBLE: 28 | return (minval + maxval) / 2.0 29 | 30 | def add_quadkey(con): 31 | 32 | # Register Python UDFs 33 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [DOUBLE, DOUBLE, INTEGER], VARCHAR) 34 | con.create_function('midpoint', midpoint, [DOUBLE, DOUBLE], DOUBLE) 35 | 36 | # Add a quadkey column to the table if it doesn't exist 37 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 38 | 39 | # Update the quadkey column 40 | con.execute(""" 41 | UPDATE buildings 42 | SET quadkey = lat_lon_to_quadkey( 43 | midpoint(bbox.miny, bbox.maxy), 44 | midpoint(bbox.minx, bbox.maxx), 45 | 12 46 | ); 47 | """) 48 | 49 | 50 | def add_country_iso(con, country_parquet_path): 51 | # Load country parquet file into duckdb 52 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 53 | 54 | # Add a country_iso column to the buildings table 55 | con.execute("ALTER TABLE buildings ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 56 | 57 | # Update the country_iso column in the buildings table 58 | con.execute(""" 59 | UPDATE buildings 60 | SET country_iso = countries.isocountrycodealpha2 61 | FROM countries 62 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(buildings.geometry)) 63 | """) 64 | 65 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False, verbose=False): 66 | # Ensure output_folder exists 67 | os.makedirs(output_folder, exist_ok=True) 68 | 69 | # Get unique identifier from file name 70 | unique_id = os.path.basename(input_parquet_path).split('_')[-1] 71 | 72 | # Define output paths 73 | output_db_path = os.path.join(output_folder, f'{unique_id}.duckdb') 74 | output_parquet_path = os.path.join(output_folder, f'{unique_id}.parquet') 75 | 76 | # Check if output files exist 77 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 78 | print(f'Files with ID {unique_id} already exist. Skipping...') 79 | return 80 | 81 | # Overwrite mode: remove existing files 82 | if overwrite: 83 | for file_path in [output_db_path, output_parquet_path]: 84 | if os.path.exists(file_path): 85 | os.remove(file_path) 86 | timestamp = time.time() 87 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 88 | 89 | # Connect to DuckDB 90 | con = duckdb.connect(output_db_path) 91 | 92 | con.execute('LOAD spatial;') 93 | 94 | # NOTE: exclude names column because it's all NULL and causes InternalException: INTERNAL Error: Attempted to dereference unique_ptr that is NULL! 
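    # Editor's note (added commentary): `SELECT * EXCLUDE(names)` is DuckDB's
    # star-modifier syntax for selecting every column except `names`, which is
    # how the problematic all-NULL column described above gets dropped here.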
95 | con.execute(f"CREATE OR REPLACE TABLE buildings AS SELECT * EXCLUDE(names) FROM read_parquet('{input_parquet_path}')") 96 | 97 | if add_quadkey_option: 98 | add_quadkey(con) 99 | 100 | if add_country_iso_option: 101 | add_country_iso(con, country_parquet_path) 102 | 103 | # Write out to Parquet 104 | con.execute(f"COPY (SELECT * FROM buildings ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 105 | 106 | #TODO: turn this into an option to convert to geoparquet or not 107 | if (True): 108 | print(f"Converting to geoparquet: {output_parquet_path}") 109 | # Create a temporary file 110 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 111 | temp_file.close() # Close the file so gpq can open it 112 | 113 | # Convert the Parquet file to a GeoParquet file using gpq 114 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 115 | subprocess.run(gpq_cmd, check=True) 116 | 117 | # Rename the temp file to the final filename 118 | shutil.move(temp_file.name, f'{output_parquet_path}') 119 | #os.rename(temp_file.name, f'{output_parquet_path}') 120 | 121 | print(f"Processing complete for file {input_parquet_path}") 122 | 123 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False, verbose=False): 124 | # If input_path is a directory, process all Parquet files in it 125 | if os.path.isdir(input_path): 126 | for file in glob.glob(os.path.join(input_path, "*")): 127 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose) 128 | else: 129 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose) 130 | 131 | # Call the function - uncomment if you want to call this directly from python and put values in here. 132 | # OVERTURE_DIR = pathlib.Path('~/data/src/overture/2024-02-15-alpha.0').expanduser() 133 | # OUT_DIR = pathlib.Path('~/data/prc/overture/2024-02-15') 134 | # ADMIN_BOUNDARIES_LEVEL_1_FP = pathlib.Path("~/data/prc/overture/2024-02-15/admin_boundaries_level_1.parquet") 135 | 136 | # process_parquet_files(str(OVERTURE_DIR), str(OUT_DIR), str(ADMIN_BOUNDARIES_LEVEL_1_FP), overwrite=False, add_quadkey_option=True, add_country_iso_option=False) 137 | 138 | -------------------------------------------------------------------------------- /open_buildings/overture/partition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script takes a DuckDB database with a buildings table and converts it to GeoParquet 3 | files partitioned on first country and then quadkey. The buildings table must have a 4 | country_iso field and quadkey field, populated by overture-buildings-parquet-add-columns.py. 5 | The main function is process_db(), and it will take as input a maximum number of rows per 6 | file and a row group size for the Parquet files. It will then iterate through the countries 7 | in the database and partition the buildings table into GeoParquet files for each country. 8 | If the number of rows for a country is greater than the maximum number of rows per file, 9 | it will partition the country into quadkeys and create GeoParquet files for each quadkey. 10 | Those quadkeys will be further partitioned if necessary until the number of rows for a 11 | quadkey is less than or equal to the maximum number of rows per file. 
12 | """ 13 | 14 | import duckdb 15 | import datetime 16 | import subprocess 17 | import tempfile 18 | import os 19 | import click 20 | import shutil 21 | import geopandas as gpd 22 | from shapely import wkb 23 | import pandas as pd 24 | import time 25 | 26 | def current_time_str(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 28 | 29 | def print_verbose(msg, verbose): 30 | if verbose: 31 | print(f"[{current_time_str()}] {msg}") 32 | 33 | def convert_gpq(input_filename, row_group_size, verbose): 34 | print_verbose(f"Starting conversion for {input_filename} using gpq (row_group_size ignored).", verbose) 35 | 36 | # Create a temporary file 37 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 38 | temp_file.close() # Close the file so gpq can open it 39 | 40 | # Convert the Parquet file to a GeoParquet file using gpq 41 | gpq_cmd = ['gpq', 'convert', input_filename, temp_file.name] 42 | subprocess.run(gpq_cmd, check=True) 43 | 44 | print_verbose(f"Conversion for {input_filename} using gpq finished.", verbose) 45 | 46 | # Rename (move) the temp file to the final filename 47 | shutil.move(temp_file.name, input_filename) 48 | 49 | # Delete the initial temp file if it still exists 50 | #initial_temp_filename = f'{country_code}_temp.parquet' 51 | #if os.path.exists(initial_temp_filename): 52 | # os.remove(initial_temp_filename) 53 | 54 | def convert_pandas(input_filename, rg_size, verbose): 55 | # Placeholder function to be fleshed out 56 | print_verbose("Starting conversion using pandas.", verbose) 57 | try: 58 | df = pd.read_parquet(input_filename) 59 | 60 | # Convert WKB geometry to geopandas geometry 61 | df['geometry'] = df['geometry'].apply(wkb.loads, hex=True) 62 | gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") 63 | # Change output file the input_filename with .parquet replaced with _geo.parquet 64 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 65 | 66 | gdf.to_parquet(output_filename, row_group_size=rg_size) 67 | # delete the original file 68 | os.remove(input_filename) 69 | # Rename (move) the output file to the input filename 70 | shutil.move(output_filename, input_filename) 71 | print(f"Finished processing {input_filename} at {time.ctime()}") 72 | except Exception as e: 73 | print(f"Error processing {input_filename}: {e}") 74 | 75 | # Note, this doesn't work, but I'm not sure why. May be that ogr doesn't really support 76 | # compatible geospatial parquet, but it really looks like it should. Maybe there's something 77 | # weird with the ones written out. 
78 | def convert_ogr(input_filename, rg_size, verbose): 79 | output_filename = input_filename.replace(".parquet", "_geo.parquet") 80 | rg_cmd = f"ROW_GROUP_SIZE={rg_size}" 81 | cmd = [ 82 | 'ogr2ogr', 83 | '-f', 84 | 'Parquet', 85 | output_filename, 86 | input_filename, 87 | # '-oo', 88 | # rg_cmd, 89 | '-oo', 90 | 'GEOM_POSSIBLE_NAMES=geometry', ] 91 | 92 | # print the ogr2ogr command that will be run 93 | if verbose: 94 | print("ogr2ogr command:") 95 | print(' '.join(cmd)) 96 | 97 | # Run the command 98 | subprocess.run(cmd, check=True) 99 | 100 | # delete the original file 101 | os.remove(input_filename) 102 | # Rename (move) the output file to the input filename 103 | shutil.move(output_filename, input_filename) 104 | print(f"Finished processing {input_filename} at {time.ctime()}") 105 | 106 | if verbose: 107 | print(f"Converted {input_filename} to {output_filename} using ogr2ogr.") 108 | 109 | 110 | 111 | def fetch_quadkeys(conn, table_name, country_code, length, verbose, prev_qk=""): 112 | query = f"SELECT DISTINCT SUBSTR(quadkey, 1, {length}) FROM {table_name} WHERE country_iso = '{country_code}'" 113 | if prev_qk: 114 | query += f" AND SUBSTR(quadkey, 1, {len(prev_qk)}) = '{prev_qk}'" 115 | print_verbose(f'Executing: {query}', verbose) 116 | return conn.execute(query).fetchall() 117 | 118 | def convert_to_geoparquet(parquet_path, geo_conversion, row_group_size, verbose): 119 | if geo_conversion == 'gpq': 120 | convert_gpq(parquet_path, row_group_size, verbose) 121 | print_verbose(f"File: {parquet_path} written with gpq", verbose) 122 | elif geo_conversion == 'pandas': 123 | convert_pandas(parquet_path, row_group_size, verbose) 124 | print_verbose(f"File: {parquet_path} written with pandas", verbose) 125 | elif geo_conversion == 'ogr': 126 | convert_ogr(parquet_path, row_group_size, verbose) 127 | print_verbose(f"File: {parquet_path} written with ogr", verbose) 128 | else: 129 | print_verbose(f"File: {parquet_path} written without converting to GeoParquet", verbose) 130 | 131 | #TODO: go all the way into the quad to find the smallest quadkey that contains less than max_per_file rows 132 | def process_quadkey_recursive(conn, table_name, country_code, output_folder, length, geo_conversion, row_group_size, verbose, max_per_file, current_qk=""): 133 | distinct_quadkeys = fetch_quadkeys(conn, table_name, country_code, length, verbose, current_qk) 134 | print_verbose(f"The list of quadkeys for country {country_code} and length {length} is {distinct_quadkeys}", verbose) 135 | #num_distinct_qk = len(distinct_quadkeys) 136 | for qk in distinct_quadkeys: 137 | qk_str = qk[0] 138 | qk_count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}'" 139 | print_verbose(f'Executing: {qk_count_query}', verbose) 140 | qk_count = conn.execute(qk_count_query).fetchone()[0] 141 | print_verbose(f"Quadkey {qk_str} has {qk_count} rows", verbose) 142 | if qk_count > max_per_file: 143 | process_quadkey_recursive(conn, table_name, country_code, output_folder, length + 1, geo_conversion, row_group_size, verbose, max_per_file, qk_str) 144 | else: 145 | quad_output_filename = os.path.join(output_folder, f'{country_code}_{qk_str}.parquet') 146 | if os.path.exists(quad_output_filename): 147 | print_verbose(f"Output file {quad_output_filename} already exists, skipping...", verbose) 148 | else: 149 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}' ORDER BY quadkey) 
TO '{quad_output_filename}' WITH (FORMAT PARQUET);" 150 | print_verbose(f'Executing: {copy_cmd}', verbose) 151 | conn.execute(copy_cmd) 152 | convert_to_geoparquet(quad_output_filename, geo_conversion, row_group_size, verbose) 153 | 154 | 155 | def process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive, table_name): 156 | # create output folder if it does not exist 157 | os.makedirs(output_folder, exist_ok=True) 158 | conn = duckdb.connect(duckdb_path) 159 | conn.execute('LOAD spatial;') 160 | cursor = conn.execute(f'SELECT DISTINCT country_iso FROM {table_name}') 161 | countries = cursor.fetchall() 162 | 163 | print_verbose(f'Found {len(countries)} unique countries', verbose) 164 | #countries.reverse() 165 | for country in countries: 166 | country_code = country[0] 167 | write_folder = output_folder 168 | if (hive): 169 | write_folder = os.path.join(output_folder, f'country_iso={country_code}') 170 | os.makedirs(write_folder, exist_ok=True) 171 | output_filename = os.path.join(write_folder, f'{country_code}.parquet') 172 | if os.path.exists(output_filename): 173 | print_verbose(f"Output file for country {country_code} already exists, skipping...", verbose) 174 | continue 175 | 176 | count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}'" 177 | print_verbose(f'Executing: {count_query}', verbose) 178 | count = conn.execute(count_query).fetchone()[0] 179 | print_verbose(f"Country {country_code} has {count} rows", verbose) 180 | 181 | if count <= max_per_file: 182 | copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' ORDER BY quadkey) TO '{output_filename}' WITH (FORMAT PARQUET);" 183 | print_verbose(f'Executing: {copy_cmd}', verbose) 184 | conn.execute(copy_cmd) 185 | convert_to_geoparquet(output_filename, geo_conversion, row_group_size, verbose) 186 | else: 187 | process_quadkey_recursive(conn, table_name, country_code, output_folder, 1, geo_conversion, row_group_size, verbose, max_per_file) 188 | 189 | if __name__ == "__main__": 190 | process_db() -------------------------------------------------------------------------------- /open_buildings/overture/places_add_columns.py: -------------------------------------------------------------------------------- 1 | # This script is used to take an Overture Parquet file and add columns 2 | # useful for partitioning - it can put in both a quadkey and the country 3 | # ISO code. And then it will write out parquet and use gpq to convert the 4 | # parquet to geoparquet. 5 | # 6 | # There is much more to do, my plan is to incorporate it into the open_buildings 7 | # CLI and let people pick which of the columns they want to add. Also could 8 | # be nice to add the ability to get the data downloaded - this just assumes 9 | # you've already got it. Also need to add the command to create the 10 | # countries.parquet, but it's basically the one in https://github.com/OvertureMaps/data/blob/main/duckdb_queries/admins.sql 11 | # but saved to parquet. You also could just use that command to pull it 12 | # directly into your duckdb database, and change this code (perhaps we 13 | # add an option to pull it remote if not present). This also would 14 | # ideally work with any of the Overture data types, and let you choose 15 | # your table names. 
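# --- Editor's sketch (hedged, not part of the original file) ------------------
# The note above says countries.parquet is "basically" the admins query from
# https://github.com/OvertureMaps/data/blob/main/duckdb_queries/admins.sql but
# saved to Parquet. With DuckDB that wrapping step would look roughly like the
# line below, where <admins query> stands in for the SELECT from that file
# (not reproduced here) and the output path is an arbitrary example:
#
#   con.execute("COPY (<admins query>) TO 'countries.parquet' WITH (FORMAT PARQUET)")
# ------------------------------------------------------------------------------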
16 | import os 17 | import duckdb 18 | import time 19 | import tempfile 20 | import subprocess 21 | import glob 22 | from duckdb.typing import * 23 | import mercantile 24 | import shutil 25 | 26 | def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR: 27 | # Convert latitude and longitude to tile using mercantile 28 | tile = mercantile.tile(lon, lat, level) 29 | 30 | # Convert the tile to a quadkey 31 | quadKey = mercantile.quadkey(tile) 32 | return quadKey 33 | 34 | def add_quadkey(con): 35 | 36 | # Register Python UDFs 37 | con.create_function('lat_lon_to_quadkey', lat_lon_to_quadkey, [DOUBLE, DOUBLE, INTEGER], VARCHAR) 38 | 39 | # Add a quadkey column to the table if it doesn't exist 40 | con.execute("ALTER TABLE places ADD COLUMN IF NOT EXISTS quadkey VARCHAR") 41 | 42 | # Update the quadkey column 43 | # (no need to use midpoint as places is just points, so maxy and miny are the same) 44 | con.execute(""" 45 | UPDATE places 46 | SET quadkey = lat_lon_to_quadkey( 47 | bbox.maxy, 48 | bbox.maxx, 49 | 12 50 | ); 51 | """) 52 | 53 | def add_country_iso(con, country_parquet_path): 54 | # Load country parquet file into duckdb 55 | con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") 56 | 57 | # Add a country_iso column to the buildings table 58 | con.execute("ALTER TABLE places ADD COLUMN IF NOT EXISTS country_iso VARCHAR") 59 | 60 | # Update the country_iso column in the buildings table 61 | con.execute(""" 62 | UPDATE places 63 | SET country_iso = countries.isocountrycodealpha2 64 | FROM countries 65 | WHERE ST_Intersects(ST_GeomFromWKB(countries.geometry), ST_GeomFromWKB(places.geometry)) 66 | """) 67 | 68 | def process_parquet_file(input_parquet_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 69 | # Ensure output_folder exists 70 | os.makedirs(output_folder, exist_ok=True) 71 | 72 | # Get unique identifier from file name 73 | unique_id = os.path.basename(input_parquet_path).split('_')[-1] 74 | 75 | # Define output paths 76 | output_db_path = os.path.join(output_folder, f'{unique_id}.duckdb') 77 | output_parquet_path = os.path.join(output_folder, f'{unique_id}.parquet') 78 | 79 | # Check if output files exist 80 | if (os.path.exists(output_db_path) or os.path.exists(output_parquet_path)) and not overwrite: 81 | print(f'Files with ID {unique_id} already exist. 
Skipping...') 82 | return 83 | 84 | # Overwrite mode: remove existing files 85 | if overwrite: 86 | for file_path in [output_db_path, output_parquet_path]: 87 | if os.path.exists(file_path): 88 | os.remove(file_path) 89 | timestamp = time.time() 90 | print(f"Starting processing for file {input_parquet_path} at {time.ctime(timestamp)}") 91 | 92 | # Connect to DuckDB 93 | con = duckdb.connect(output_db_path) 94 | 95 | con.execute('LOAD spatial;') 96 | 97 | # Load parquet file into duckdb 98 | con.execute(f"CREATE TABLE places AS SELECT * FROM read_parquet('{input_parquet_path}')") 99 | 100 | if add_quadkey_option: 101 | add_quadkey(con) 102 | 103 | if add_country_iso_option: 104 | add_country_iso(con, country_parquet_path) 105 | 106 | # Write out to Parquet 107 | con.execute(f"COPY (SELECT * FROM places ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)") 108 | 109 | if (True): 110 | print(f"Converting to geoparquet: {output_parquet_path}") 111 | # Create a temporary file 112 | temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) 113 | temp_file.close() # Close the file so gpq can open it 114 | 115 | # Convert the Parquet file to a GeoParquet file using gpq 116 | gpq_cmd = ['gpq', 'convert', f'{output_parquet_path}', temp_file.name] 117 | subprocess.run(gpq_cmd, check=True) 118 | 119 | # Rename the temp file to the final filename 120 | shutil.move(temp_file.name, f'{output_parquet_path}') 121 | #os.rename(temp_file.name, f'{output_parquet_path}') 122 | 123 | print(f"Processing complete for file {input_parquet_path}") 124 | 125 | def process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=False, add_country_iso_option=False): 126 | # If output_folder doesn't exist, create it 127 | os.makedirs(output_folder, exist_ok=True) 128 | # If input_path is a directory, process all Parquet files in it 129 | if os.path.isdir(input_path): 130 | for file in glob.glob(os.path.join(input_path, "*")): 131 | process_parquet_file(file, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 132 | else: 133 | process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option) 134 | 135 | # Call the function 136 | input_path = '/Volumes/fastdata/overture/s3-data/places/' 137 | output_folder = '/Volumes/fastdata/overture/refined-places-geoparquet/' 138 | country_parquet_path = '/Volumes/fastdata/overture/countries.parquet' 139 | process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True) -------------------------------------------------------------------------------- /open_buildings/settings.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict 3 | from pydantic import BaseModel 4 | 5 | class Source(Enum): 6 | GOOGLE = 1 7 | OVERTURE = 2 8 | 9 | class Format(Enum): 10 | SHAPEFILE = 1 11 | GEOJSON = 2 12 | GEOPACKAGE = 3 13 | FLATGEOBUF = 4 14 | PARQUET = 5 15 | 16 | 17 | class SourceSettings(BaseModel): 18 | base_url: str 19 | hive_partitioning: bool 20 | 21 | class SettingsSchema(BaseModel): 22 | sources: Dict[Source, SourceSettings] 23 | extensions: Dict[Format, str] 24 | 25 | settings = SettingsSchema( 26 | sources={ 27 | Source.GOOGLE: SourceSettings( 28 | base_url="s3://us-west-2.opendata.source.coop/google-research-open-buildings/geoparquet-by-country/*/*.parquet", 29 
| hive_partitioning=True 30 | ), 31 | Source.OVERTURE: SourceSettings( 32 | base_url="s3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-hive/*/*.parquet", 33 | hive_partitioning=True 34 | ) 35 | }, 36 | extensions={ 37 | Format.SHAPEFILE: 'shp', 38 | Format.GEOJSON: 'json', 39 | Format.GEOPACKAGE: 'gpkg', 40 | Format.FLATGEOBUF: 'fgb', 41 | Format.PARQUET: 'parquet' 42 | } 43 | ) 44 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | integration: marks tests as integration tests that span network and DB I/O (deselect with '-m "not integration"') -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | duckdb 3 | pandas 4 | geopandas 5 | pyogrio 6 | osmnx 7 | shapely 8 | openlocationcode 9 | tabulate 10 | leafmap 11 | boto3 12 | mercantile 13 | pydantic==2.4.2 -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | black[jupyter] 3 | pip 4 | bump2version 5 | wheel 6 | watchdog 7 | flake8 8 | tox 9 | coverage 10 | Sphinx 11 | twine 12 | Click 13 | codespell 14 | pydantic==2.4.2 15 | pytest==7.4.2 16 | pytest-rerunfailures==12.0 -------------------------------------------------------------------------------- /requirements_docs.txt: -------------------------------------------------------------------------------- 1 | bump2version 2 | coverage 3 | flake8 4 | ipykernel 5 | livereload 6 | nbconvert 7 | nbformat 8 | pip 9 | sphinx 10 | tox 11 | twine 12 | watchdog 13 | wheel 14 | mkdocs 15 | mkdocs-git-revision-date-plugin 16 | mkdocs-git-revision-date-localized-plugin 17 | mkdocs-jupyter>=0.24.0 18 | mkdocs-material>=9.1.3 19 | mkdocs-pdf-export-plugin 20 | mkdocstrings 21 | mkdocstrings-crystal 22 | mkdocstrings-python-legacy 23 | pygments 24 | pymdown-extensions -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.10.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:open_buildings/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4 | 5 | import io 6 | from os import path as op 7 | from setuptools import setup, find_packages 8 | 9 | with open('README.md', encoding="utf-8") as readme_file: 10 | readme = readme_file.read() 11 | 12 | here = op.abspath(op.dirname(__file__)) 13 | 14 | # get the dependencies and installs 15 | with io.open(op.join(here, "requirements.txt"), encoding="utf-8") as f: 16 | all_reqs = f.read().split("\n") 17 | 18 | install_requires = [x.strip() for x in all_reqs if "git+" not in x] 19 | dependency_links = [x.strip().replace("git+", "") for x in 
all_reqs if "git+" not in x] 20 | 21 | setup_requirements = [] 22 | 23 | test_requirements = ["codespell==2.2.6", "pytest==7.4.2", "pytest-rerunfailures==12.0", "pytest-xdist==3.3.1"] 24 | 25 | setup( 26 | author="Chris Holmes", 27 | author_email='cholmes@9eo.org', 28 | python_requires='>=3.8', 29 | classifiers=[ 30 | 'Intended Audience :: Developers', 31 | 'License :: OSI Approved :: Apache Software License', 32 | 'Natural Language :: English', 33 | 'Programming Language :: Python :: 3', 34 | 'Programming Language :: Python :: 3.8', 35 | 'Programming Language :: Python :: 3.9', 36 | 'Programming Language :: Python :: 3.10', 37 | 'Programming Language :: Python :: 3.11', 38 | ], 39 | description="Tools for working with open building datasets", 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'ob=open_buildings.cli:main', 43 | ], 44 | }, 45 | install_requires=install_requires, 46 | dependency_links=dependency_links, 47 | license="Apache Software License 2.0", 48 | long_description=readme, 49 | long_description_content_type='text/markdown', 50 | include_package_data=True, 51 | keywords='open_buildings', 52 | name='open-buildings', 53 | packages=find_packages(), 54 | package_data={ 55 | 'open_buildings': ['google/*', 'overture/*' ], 56 | }, 57 | setup_requires=setup_requirements, 58 | test_suite='tests', 59 | tests_require=test_requirements, 60 | url='https://github.com/opengeos/open-buildings', 61 | version='0.10.0', 62 | zip_safe=False, 63 | extras_require={ 64 | 'dev': test_requirements, 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for open_buildings.""" 2 | -------------------------------------------------------------------------------- /tests/test_open_buildings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Dict, Any 3 | from pathlib import Path 4 | import os 5 | import json 6 | from shapely.geometry import shape, box, mapping 7 | import re 8 | import subprocess 9 | 10 | from open_buildings.download_buildings import download, geojson_to_wkt, geojson_to_quadkey, quadkey_to_geojson 11 | from open_buildings.cli import geocode 12 | from open_buildings.settings import Source, Format, settings 13 | 14 | ########################################################################### 15 | # # 16 | # RUN TESTS with `python3 -m pytest . -n ` # 17 | # # 18 | ########################################################################### 19 | 20 | 21 | NUM_RERUNS = 2 # number of re-runs for integration tests 22 | 23 | @pytest.fixture 24 | def aoi() -> Dict[str, Any]: 25 | """ Sample AOI over Seychelles. """ 26 | return { 27 | "type": "Feature", 28 | "properties": {}, 29 | "geometry": { 30 | "coordinates": [ 31 | [ 32 | [ 33 | 55.45280573412927, 34 | -4.6227964300457245 35 | ], 36 | [ 37 | 55.45280573412927, 38 | -4.623440862045413 39 | ], 40 | [ 41 | 55.453376761871795, 42 | -4.623440862045413 43 | ], 44 | [ 45 | 55.453376761871795, 46 | -4.6227964300457245 47 | ], 48 | [ 49 | 55.45280573412927, 50 | -4.6227964300457245 51 | ] 52 | ] 53 | ], 54 | "type": "Polygon" 55 | } 56 | } 57 | 58 | def test_geojson_to_wkt(aoi: Dict[str, Any]): 59 | """ Tests the geojson_to_wkt() function. 
""" 60 | assert geojson_to_wkt(aoi) == 'POLYGON ((55.45280573412927 -4.6227964300457245, 55.45280573412927 -4.623440862045413, 55.453376761871795 -4.623440862045413, 55.453376761871795 -4.6227964300457245, 55.45280573412927 -4.6227964300457245))' 61 | 62 | def test_geojson_to_quadkey(aoi: Dict[str, Any]): 63 | """ Tests geojson_to_quadkey() using a pre-established true value. """ 64 | assert geojson_to_quadkey(aoi) == '301001330310' 65 | 66 | def test_quadkey_to_geojson(): 67 | """ Tests quadkey_to_geojson() using a pre-established true value. """ 68 | assert quadkey_to_geojson('031313131112') == {'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[-0.17578125, 51.50874245880333], [-0.087890625, 51.50874245880333], [-0.087890625, 51.56341232867588], [-0.17578125, 51.56341232867588], [-0.17578125, 51.50874245880333]]]}} 69 | 70 | def test_geocode(): 71 | """ Tests geocode() using a pre-established true value. Verifies the bbox of the returned geometry. """ 72 | geocoding_result = geocode('plymouth') 73 | assert geocoding_result["type"] == "Feature" 74 | assert shape(geocoding_result["geometry"]).bounds == (-4.2055324, 50.3327426, -4.0196056, 50.4441737) 75 | 76 | @pytest.mark.integration 77 | @pytest.mark.flaky(reruns=NUM_RERUNS) 78 | @pytest.mark.parametrize("source", [s for s in Source]) 79 | def test_download(source: Source, aoi: Dict[str, Any], tmp_path: Path): 80 | """ Tests that the download function successfully downloads a GeoJSON file from all sources (parametrised test) into a temporary directory (teardown after test). """ 81 | output_file = tmp_path.joinpath(f"output_{source.name}.json") 82 | download(aoi, source=source, dst=output_file, country_iso="SC") 83 | assert os.path.exists(output_file) 84 | assert os.path.getsize(output_file) != 0 85 | 86 | @pytest.mark.integration 87 | @pytest.mark.flaky(reruns=NUM_RERUNS) 88 | def test_download_no_output(aoi: Dict[str, Any], tmp_path: Path): 89 | """ Test that no empty output file gets created if a query doesn't return anything (in this case because a wrong country_iso argument is given.) """ 90 | output_file = tmp_path.joinpath("no_output.json") 91 | download(aoi, dst=output_file, country_iso="AI") # wrong country, aoi is in SC, not Anguilla 92 | assert not os.path.exists(output_file) 93 | 94 | @pytest.mark.integration 95 | @pytest.mark.flaky(reruns=NUM_RERUNS) 96 | def test_download_directory(aoi: Dict[str, Any], tmp_path: Path): 97 | """ Test that, if a directory is passed, the output gets downloaded to a default file name in that directory. """ 98 | download(aoi, dst=tmp_path, country_iso="SC") 99 | assert os.path.exists(tmp_path.joinpath("buildings.json")) 100 | assert os.path.getsize(tmp_path.joinpath("buildings.json")) != 0 101 | 102 | @pytest.mark.integration 103 | @pytest.mark.flaky(reruns=NUM_RERUNS) 104 | def test_download_overwrite(aoi: Dict[str, Any], tmp_path: Path): 105 | """ Tests that, if the "overwrite" option is set to True, an existing file does indeed get overwritten. 
""" 106 | output_path = tmp_path.joinpath("file_exists.json") 107 | with open(output_path, "w") as f: 108 | f.write("Foo bar") 109 | 110 | download(aoi, dst=output_path, country_iso="SC", overwrite=True) 111 | assert os.path.exists(output_path) 112 | with open(output_path, "r") as f: 113 | assert f.read() != "Foo bar" # verify that the file was updated 114 | 115 | @pytest.mark.integration 116 | @pytest.mark.flaky(reruns=NUM_RERUNS) 117 | @pytest.mark.parametrize("format", [f for f in Format if f != Format.SHAPEFILE]) # fails for shapefile! 118 | def test_download_format(format: Format, aoi: Dict[str, Any], tmp_path: Path): 119 | """ Requests data in all file formats defined in the settings. Attempts to validate the output for each of those too. """ 120 | output_file = tmp_path.joinpath(f"output.{settings.extensions[format]}") 121 | download(aoi, dst=output_file, country_iso="SC") 122 | assert os.path.exists(output_file) 123 | assert os.path.getsize(output_file) != 0 124 | 125 | # validate output 126 | if format == Format.GEOJSON: 127 | with open(output_file, "r") as f: 128 | json.load(f) 129 | elif format == Format.FLATGEOBUF: 130 | pass 131 | elif format == Format.SHAPEFILE: 132 | pass 133 | elif format == Format.PARQUET: 134 | pass 135 | elif format == Format.GEOPACKAGE: 136 | pass 137 | else: 138 | raise NotImplementedError(f"Test not implemented for {format} - please add.") 139 | 140 | def test_download_unknown_format(aoi: Dict[str, Any]): 141 | """ Tests that an unknown format (.abc) raises an Exception. """ 142 | with pytest.raises(ValueError): 143 | download(aoi, dst="buildings.abc") 144 | 145 | @pytest.mark.integration 146 | @pytest.mark.flaky(reruns=NUM_RERUNS) 147 | def test_cli_get_buildings_from_file_to_directory(aoi: Dict[str, Any], tmp_path: Path): 148 | """ 149 | Tests the CLI for get_buildings - provides the path to a GeoJSON file as input and a directory as output path. 150 | Verifies that the output gets written to a default file name in the given directory. 151 | """ 152 | # write aoi dict to geojson file in temporary directory 153 | input_path = tmp_path.joinpath("input.json") 154 | with open(input_path, "w") as f: 155 | json.dump(aoi, f) 156 | subprocess.run(["ob", "get_buildings", str(input_path), "--dst", str(tmp_path), "--country_iso", "SC"], check=True) 157 | output_path = tmp_path.joinpath("buildings.json") # default file name 158 | assert os.path.exists(output_path) 159 | assert os.path.getsize(output_path) != 0 160 | 161 | 162 | @pytest.mark.integration 163 | @pytest.mark.flaky(reruns=NUM_RERUNS) 164 | def test_cli_get_buildings_from_stdin_to_directory(aoi: Dict[str, Any], tmp_path: Path): 165 | """ 166 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and a directory as output path. 167 | Verifies that a log message with timestamp gets written to stdout. 168 | """ 169 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 170 | process = subprocess.run([ "ob", "get_buildings", "-", "--dst", str(tmp_path), "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 171 | dt_regex = re.compile(r"^\[[0-9]{4}(-[0-9]{2}){2} ([0-9]{2}:){2}[0-9]{2}\] ") # match timestamp format e.g. 
"[2023-10-18 19:08:24]" 172 | assert dt_regex.search(process.stdout) # ensure that stdout contains at least one timestamped message 173 | output_path = tmp_path.joinpath("buildings.json") # default file name 174 | assert os.path.exists(output_path) 175 | assert os.path.getsize(output_path) != 0 176 | 177 | @pytest.mark.integration 178 | @pytest.mark.flaky(reruns=NUM_RERUNS) 179 | def test_cli_get_buildings_from_stdin_to_file_silent(aoi: Dict[str, Any], tmp_path: Path): 180 | """ 181 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and an exact filepath to write the output to. 182 | Verifies that nothing gets written to stdout. 183 | """ 184 | output_path = tmp_path.joinpath("test123.json") 185 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 186 | process = subprocess.run(["ob", "get_buildings", "-", "--dst", str(output_path), "--silent", "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 187 | assert process.stdout == "" # assert that nothing gets printed to stdout 188 | assert process.stderr == "" # assert that nothing gets printed to stdout 189 | assert os.path.exists(output_path) 190 | assert os.path.getsize(output_path) != 0 191 | 192 | 193 | @pytest.mark.integration 194 | @pytest.mark.flaky(reruns=NUM_RERUNS) 195 | def test_cli_get_buildings_from_stdin_to_file_overwrite_false(aoi: Dict[str, Any], tmp_path: Path): 196 | """ 197 | Tests the CLI for get_buildings - provides a GeoJSON string via stdin and an exact filepath to write the output to. 198 | Verifies that, if the output file already exists, nothing happens and the user is notified of this. 199 | """ 200 | output_path = tmp_path.joinpath("file_exists.json") 201 | with open(output_path, "w") as f: 202 | f.write("Foo bar") 203 | # we can't use pipes (e.g. f"echo {json.dumps(aoi)} | ...") in subprocess.run, instead we pass the json as stdin using the input/text arguments, 204 | process = subprocess.run(["ob", "get_buildings", "-", "--dst", str(output_path), "--country_iso", "SC"], input=json.dumps(aoi), text=True, check=True, capture_output=True) 205 | assert os.path.exists(output_path) 206 | with open(output_path, "r") as f: 207 | assert f.read() == "Foo bar" # verify that the file still has the same content as before 208 | assert "exists" in process.stdout # verify that the user has been warned about the existing file 209 | 210 | @pytest.mark.integration 211 | @pytest.mark.flaky(reruns=NUM_RERUNS) 212 | def test_cli_get_buildings_geocode(tmp_path: Path): 213 | """ 214 | Tests the geocoding functionality, implemented as the argument "location". 215 | """ 216 | output_path = tmp_path.joinpath("geocode_test.json") 217 | subprocess.run(["ob", "get_buildings", "--dst", str(output_path), "--location", "oxford uk", "--country_iso", "GB"], check=True) 218 | assert os.path.exists(output_path) 219 | assert os.path.getsize(output_path) != 0 220 | 221 | @pytest.mark.integration 222 | @pytest.mark.flaky(reruns=NUM_RERUNS) 223 | def test_cli_get_buildings_geocode_multipolygon(tmp_path: Path): 224 | """ 225 | Tests the geocoding functionality, implemented as the argument "location". Makes sure that a MultiPolygon geometry (the outline of Dubrovnik) 226 | is simplified to a polygon (convex hull). 
227 | """ 228 | output_path = tmp_path.joinpath("geocode_test.json") 229 | subprocess.run(["ob", "get_buildings", "--dst", str(output_path), "--location", "dubrovnik", "--country_iso", "HR"], check=True) 230 | assert os.path.exists(output_path) 231 | assert os.path.getsize(output_path) != 0 --------------------------------------------------------------------------------