├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md ├── release-template.yml └── workflows │ ├── ci-tests.yml │ ├── publish-to-pypi.yml │ └── release-drafter.yml ├── .gitignore ├── LICENSE.md ├── README.md ├── docs ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── _config.yml ├── _toc.yml ├── api.md ├── changelog.md ├── chipping.md ├── index.md ├── multi-resolution.md ├── object-detection-boxes.md ├── stacking.md ├── vector-segmentation-masks.md └── walkthrough.md ├── poetry.lock ├── pyproject.toml └── zen3geo ├── __init__.py ├── datapipes ├── __init__.py ├── datashader.py ├── geopandas.py ├── pyogrio.py ├── pystac.py ├── pystac_client.py ├── rioxarray.py ├── stackstac.py ├── xbatcher.py └── xpystac.py └── tests ├── test_datapipes_datashader.py ├── test_datapipes_geopandas.py ├── test_datapipes_pyogrio.py ├── test_datapipes_pystac.py ├── test_datapipes_pystac_client.py ├── test_datapipes_rioxarray.py ├── test_datapipes_stackstac.py ├── test_datapipes_xbatcher.py ├── test_datapipes_xpystac.py └── test_zen3geo.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go do '...' 16 | 2. Run the following code 17 | 18 | ```python 19 | # Insert your code here 20 | ``` 21 | 22 | 3. See error `...` 23 | 24 | **Expected behavior** 25 | A clear and concise description of what you expected to happen. 26 | 27 | **System details (please complete the following information):** 28 | - OS: [e.g. Linux, macOS, Windows] 29 | - Python Version [e.g. 3.11] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 
33 | -------------------------------------------------------------------------------- /.github/release-template.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$RESOLVED_VERSION 🌈' 2 | tag-template: 'v$RESOLVED_VERSION' 3 | categories: 4 | - title: '🚀 Features' 5 | label: 'feature' 6 | - title: '🐛 Bug Fixes' 7 | label: 'bug' 8 | - title: '📖 Documentation' 9 | label: 'documentation' 10 | - title: '🧰 Maintenance' 11 | label: 'maintenance' 12 | version-resolver: 13 | minor: 14 | labels: 15 | - 'feature' 16 | default: patch 17 | exclude-labels: 18 | - 'skip-changelog' 19 | category-template: '### $TITLE' 20 | change-template: '* $TITLE ([#$NUMBER]($URL))' 21 | template: | 22 | ## Release v$RESOLVED_VERSION (20YY/MM/DD) 23 | 24 | ### 💫 Highlights 25 | 26 | * 27 | 28 | $CHANGES 29 | 30 | ### 🧑‍🤝‍🧑 Contributors 31 | 32 | $CONTRIBUTORS 33 | -------------------------------------------------------------------------------- /.github/workflows/ci-tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | types: [opened, ready_for_review, reopened, synchronize] 11 | branches: [ "main" ] 12 | 13 | permissions: 14 | contents: read 15 | 16 | jobs: 17 | test: 18 | name: ${{ matrix.os }} - Python ${{ matrix.python-version }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | python-version: ["3.8", "3.10", "3.11.8"] 24 | os: [ubuntu-22.04] 25 | # Is it a draft Pull Request (true or false)? 
26 | isDraft: 27 | - ${{ github.event.pull_request.draft }} 28 | # Exclude Ubuntu + Python 3.8 and 3.11 jobs for draft PRs 29 | exclude: 30 | - python-version: '3.8' 31 | isDraft: true 32 | - python-version: '3.11.8' 33 | isDraft: true 34 | # Only install optional packages on Ubuntu-22.04/Python 3.10 and 3.11 35 | include: 36 | - os: 'ubuntu-22.04' 37 | python-version: '3.10' 38 | extra-packages: '--extras "raster spatial stac vector"' 39 | - os: 'ubuntu-22.04' 40 | python-version: '3.11.8' 41 | extra-packages: '--extras "raster spatial stac vector"' 42 | 43 | steps: 44 | # Checkout current git repository 45 | - name: Checkout 46 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 47 | 48 | # Install Python 49 | - name: Set up Python ${{ matrix.python-version }} 50 | uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 51 | with: 52 | python-version: ${{ matrix.python-version }} 53 | 54 | # Install poetry package manager and dependencies from poetry.lock 55 | - name: Install Poetry python dependencies 56 | run: | 57 | pip install poetry==1.6.1 58 | poetry install ${{ matrix.extra-packages }} 59 | poetry self add poetry-dynamic-versioning[plugin] 60 | poetry show 61 | 62 | # Run the unit tests and doctests 63 | - name: Test with pytest 64 | run: poetry run pytest --verbose --doctest-modules zen3geo/ 65 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | # Publish archives to PyPI and TestPyPI using GitHub Actions 2 | # https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 3 | 4 | name: Publish to PyPI 5 | 6 | # Only run for pushes to the main branch and releases. 
7 | on: 8 | push: 9 | branches: 10 | - main 11 | release: 12 | types: 13 | - published 14 | # Runs for pull requests should be disabled other than for testing purposes 15 | #pull_request: 16 | # branches: 17 | # - main 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | publish-pypi: 24 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 25 | runs-on: ubuntu-22.04 26 | permissions: 27 | # This permission is mandatory for OIDC publishing 28 | id-token: write 29 | if: github.repository == 'weiji14/zen3geo' 30 | 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 34 | with: 35 | # fetch all history so that poetry-dynamic-versioning works 36 | fetch-depth: 0 37 | 38 | - name: Set up Python 3.11 39 | uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 40 | with: 41 | python-version: '3.11.8' 42 | 43 | - name: Install Poetry and dynamic-versioning plugin 44 | run: | 45 | pip install poetry==1.6.1 46 | poetry self add poetry-dynamic-versioning[plugin] 47 | poetry show 48 | 49 | - name: Fix up version string for TestPyPI and PyPI 50 | run: | 51 | # Change poetry-dynamic-versioning to use metadata=false so that the 52 | # local part of the version isn't included, making the version string 53 | # compatible with PyPI. 
54 | sed --in-place "s/metadata = true/metadata = false/g" pyproject.toml 55 | 56 | - name: Build a binary wheel and a source tarball 57 | run: | 58 | poetry build -vvv 59 | echo "" 60 | echo "Generated files:" 61 | ls -lh dist/ 62 | 63 | - name: Publish distribution 📦 to Test PyPI 64 | uses: pypa/gh-action-pypi-publish@a56da0b891b3dc519c7ee3284aff1fad93cc8598 # v1.8.6 65 | with: 66 | repository-url: https://test.pypi.org/legacy/ 67 | skip-existing: true 68 | 69 | - name: Publish distribution 📦 to PyPI 70 | if: startsWith(github.ref, 'refs/tags') 71 | uses: pypa/gh-action-pypi-publish@a56da0b891b3dc519c7ee3284aff1fad93cc8598 # v1.8.6 72 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | update_release_draft: 13 | permissions: 14 | contents: write # for release-drafter/release-drafter to create a github release 15 | runs-on: ubuntu-22.04 16 | steps: 17 | # Drafts your next Release notes as Pull Requests are merged into "main" 18 | - uses: release-drafter/release-drafter@569eb7ee3a85817ab916c8f8ff03a5bd96c9c83e # v5.23.0 19 | with: 20 | # (Optional) specify config name to use, relative to .github/. 
Default: release-drafter.yml 21 | config-name: release-template.yml 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Distribution / packaging 6 | build/ 7 | dist/ 8 | *.egg 9 | *.egg-info/ 10 | .eggs/ 11 | MANIFEST 12 | 13 | # Unit test / coverage reports 14 | .pytest_cache/ 15 | 16 | # Jupyter Book 17 | /docs/_build/ 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | GNU LESSER GENERAL PUBLIC LICENSE 3 | Version 3, 29 June 2007 4 | 5 | Copyright (C) 2007 Free Software Foundation, Inc. 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | 10 | This version of the GNU Lesser General Public License incorporates 11 | the terms and conditions of version 3 of the GNU General Public 12 | License, supplemented by the additional permissions listed below. 13 | 14 | 0. Additional Definitions. 15 | 16 | As used herein, "this License" refers to version 3 of the GNU Lesser 17 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 18 | General Public License. 19 | 20 | "The Library" refers to a covered work governed by this License, 21 | other than an Application or a Combined Work as defined below. 22 | 23 | An "Application" is any work that makes use of an interface provided 24 | by the Library, but which is not otherwise based on the Library. 25 | Defining a subclass of a class defined by the Library is deemed a mode 26 | of using an interface provided by the Library. 27 | 28 | A "Combined Work" is a work produced by combining or linking an 29 | Application with the Library. 
The particular version of the Library 30 | with which the Combined Work was made is also called the "Linked 31 | Version". 32 | 33 | The "Minimal Corresponding Source" for a Combined Work means the 34 | Corresponding Source for the Combined Work, excluding any source code 35 | for portions of the Combined Work that, considered in isolation, are 36 | based on the Application, and not on the Linked Version. 37 | 38 | The "Corresponding Application Code" for a Combined Work means the 39 | object code and/or source code for the Application, including any data 40 | and utility programs needed for reproducing the Combined Work from the 41 | Application, but excluding the System Libraries of the Combined Work. 42 | 43 | 1. Exception to Section 3 of the GNU GPL. 44 | 45 | You may convey a covered work under sections 3 and 4 of this License 46 | without being bound by section 3 of the GNU GPL. 47 | 48 | 2. Conveying Modified Versions. 49 | 50 | If you modify a copy of the Library, and, in your modifications, a 51 | facility refers to a function or data to be supplied by an Application 52 | that uses the facility (other than as an argument passed when the 53 | facility is invoked), then you may convey a copy of the modified 54 | version: 55 | 56 | a) under this License, provided that you make a good faith effort to 57 | ensure that, in the event an Application does not supply the 58 | function or data, the facility still operates, and performs 59 | whatever part of its purpose remains meaningful, or 60 | 61 | b) under the GNU GPL, with none of the additional permissions of 62 | this License applicable to that copy. 63 | 64 | 3. Object Code Incorporating Material from Library Header Files. 65 | 66 | The object code form of an Application may incorporate material from 67 | a header file that is part of the Library. 
You may convey such object 68 | code under terms of your choice, provided that, if the incorporated 69 | material is not limited to numerical parameters, data structure 70 | layouts and accessors, or small macros, inline functions and templates 71 | (ten or fewer lines in length), you do both of the following: 72 | 73 | a) Give prominent notice with each copy of the object code that the 74 | Library is used in it and that the Library and its use are 75 | covered by this License. 76 | 77 | b) Accompany the object code with a copy of the GNU GPL and this license 78 | document. 79 | 80 | 4. Combined Works. 81 | 82 | You may convey a Combined Work under terms of your choice that, 83 | taken together, effectively do not restrict modification of the 84 | portions of the Library contained in the Combined Work and reverse 85 | engineering for debugging such modifications, if you also do each of 86 | the following: 87 | 88 | a) Give prominent notice with each copy of the Combined Work that 89 | the Library is used in it and that the Library and its use are 90 | covered by this License. 91 | 92 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 93 | document. 94 | 95 | c) For a Combined Work that displays copyright notices during 96 | execution, include the copyright notice for the Library among 97 | these notices, as well as a reference directing the user to the 98 | copies of the GNU GPL and this license document. 99 | 100 | d) Do one of the following: 101 | 102 | 0) Convey the Minimal Corresponding Source under the terms of this 103 | License, and the Corresponding Application Code in a form 104 | suitable for, and under terms that permit, the user to 105 | recombine or relink the Application with a modified version of 106 | the Linked Version to produce a modified Combined Work, in the 107 | manner specified by section 6 of the GNU GPL for conveying 108 | Corresponding Source. 
109 | 110 | 1) Use a suitable shared library mechanism for linking with the 111 | Library. A suitable mechanism is one that (a) uses at run time 112 | a copy of the Library already present on the user's computer 113 | system, and (b) will operate properly with a modified version 114 | of the Library that is interface-compatible with the Linked 115 | Version. 116 | 117 | e) Provide Installation Information, but only if you would otherwise 118 | be required to provide such information under section 6 of the 119 | GNU GPL, and only to the extent that such information is 120 | necessary to install and execute a modified version of the 121 | Combined Work produced by recombining or relinking the 122 | Application with a modified version of the Linked Version. (If 123 | you use option 4d0, the Installation Information must accompany 124 | the Minimal Corresponding Source and Corresponding Application 125 | Code. If you use option 4d1, you must provide the Installation 126 | Information in the manner specified by section 6 of the GNU GPL 127 | for conveying Corresponding Source.) 128 | 129 | 5. Combined Libraries. 130 | 131 | You may place library facilities that are a work based on the 132 | Library side by side in a single library together with other library 133 | facilities that are not Applications and are not covered by this 134 | License, and convey such a combined library under terms of your 135 | choice, if you do both of the following: 136 | 137 | a) Accompany the combined library with a copy of the same work based 138 | on the Library, uncombined with any other library facilities, 139 | conveyed under the terms of this License. 140 | 141 | b) Give prominent notice with the combined library that part of it 142 | is a work based on the Library, and explaining where to find the 143 | accompanying uncombined form of the same work. 144 | 145 | 6. Revised Versions of the GNU Lesser General Public License. 
146 | 147 | The Free Software Foundation may publish revised and/or new versions 148 | of the GNU Lesser General Public License from time to time. Such new 149 | versions will be similar in spirit to the present version, but may 150 | differ in detail to address new problems or concerns. 151 | 152 | Each version is given a distinguishing version number. If the 153 | Library as you received it specifies that a certain numbered version 154 | of the GNU Lesser General Public License "or any later version" 155 | applies to it, you have the option of following the terms and 156 | conditions either of that published version or of any later version 157 | published by the Free Software Foundation. If the Library as you 158 | received it does not specify a version number of the GNU Lesser 159 | General Public License, you may choose any version of the GNU Lesser 160 | General Public License ever published by the Free Software Foundation. 161 | 162 | If the Library as you received it specifies that a proxy can decide 163 | whether future versions of the GNU Lesser General Public License shall 164 | apply, that proxy's public statement of acceptance of any version is 165 | permanent authorization for you to choose that version for the 166 | Library. 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zen3geo 2 | 3 | The 🌏 data science library you've been waiting for~ 4 | 5 | > 君の前前前世から僕は 君を探しはじめたよ 6 | > 7 | > Since your past life, I have been searching for you 8 | 9 | ## 公案 10 | 11 | ``` 12 | Geography is difficult, but easy it can also be 13 | Deep Learning, you hope, has an answer to all 14 | Too this, too that, where to though, where to? 
15 | Look out, sense within, and now you must know 16 | ``` 17 | 18 | ## Installation 19 | 20 | To install the development version from GitHub, do: 21 | 22 | pip install git+https://github.com/weiji14/zen3geo.git 23 | 24 | Or the stable version from [PyPI](https://pypi.org/project/zen3geo): 25 | 26 | pip install zen3geo 27 | 28 | If instead, [conda-forge](https://anaconda.org/conda-forge/zen3geo) you desire: 29 | 30 | mamba install --channel conda-forge zen3geo 31 | 32 | Other instructions, see https://zen3geo.readthedocs.io/en/latest/#installation 33 | -------------------------------------------------------------------------------- /docs/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | apt_packages: 13 | - graphviz 14 | jobs: 15 | pre_build: 16 | # Generate the Sphinx configuration for this Jupyter Book so it builds. 
17 | # https://jupyterbook.org/en/stable/publish/readthedocs.html 18 | - "jupyter-book config sphinx docs/" 19 | post_install: 20 | # Install stackstac=0.4.4 instead of 0.5.0 to prevent 21 | # TypeError: Unsupported data type float16 22 | # because stackstac casts to float16 at read-in instead of post-read 23 | # see https://github.com/gjoseph92/stackstac/pull/208 24 | # Need to wait for rasterio/GDAL to support float16 25 | # see https://gdal.org/api/raster_c_api.html#_CPPv412GDALDataType 26 | # Install dask<2024.3.0 to prevent 27 | # ModuleNotFoundError: No module named 'dask_expr' 28 | # ImportError: Dask dataframe requirements are not installed 29 | # https://github.com/holoviz/datashader/issues/1319 30 | - "pip install stackstac==0.4.4 dask==2024.2.1" 31 | 32 | # Optional but recommended, declare the Python requirements required 33 | # to build your documentation 34 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 35 | python: 36 | install: 37 | - method: pip 38 | path: . 39 | extra_requirements: 40 | - docs 41 | 42 | sphinx: 43 | builder: html 44 | fail_on_warning: true 45 | -------------------------------------------------------------------------------- /docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | te6s3z67 at duck dot com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 
127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | We accept different types of contributions, 4 | including some that don't require you to write a single line of code. 5 | 6 | ## 📝 Types of contributions 7 | 8 | ### Discussions 🎉 9 | 10 | Discussions are where we have conversations. 11 | 12 | If you have a great new idea, or want to share something amazing with the community, 13 | join us in [discussions](https://github.com/weiji14/zen3geo/discussions). 14 | 15 | ### Issues 🐞 16 | 17 | [Issues](https://docs.github.com/en/github/managing-your-work-on-github/about-issues) 18 | are used to track tasks that contributors can help with. 19 | 20 | If you've found something in the content or the website that should be updated, 21 | search open issues to see if someone else has reported the same thing. If it's 22 | something new, [open an issue](https://github.com/weiji14/zen3geo/issues/new/choose)! 23 | We'll use the issue to have a conversation about the problem you want to fix. 24 | 25 | ### Pull requests 🛠️ 26 | 27 | A [pull request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) 28 | is a way to suggest changes in our repository. 29 | 30 | When we merge those changes, they should be deployed to the live site within a few minutes. 31 | To learn more about opening a pull request in this repo, 32 | see [Opening a pull request](#opening-a-pull-request) below. 
33 | 34 | ### Translations 🌏 35 | 36 | 人虽有南北之分,但佛性本无南北。 37 | 38 | Yes, the source content in this repository is mostly written in English, 39 | but we welcome folks from across the world! Please reach out if you have experience in translations and are interested in contributing! 40 | 41 | --- 42 | 43 | ## 👐 Opening a Pull Request 44 | 45 | 1. [Login](https://github.com/login) to your GitHub account, 46 | or sign up for a new one at https://github.com/signup. 47 | 48 | 2. Navigate to the file you want to modify, e.g. the 49 | [API docs file](https://github.com/weiji14/zen3geo/blob/main/docs/api.md). 50 | 51 | 3. Click on the pen 🖊️ icon on the top right corner that says "Edit this file" 52 | 53 | 4. This should bring you to a page similar to 54 | https://github.com/weiji14/zen3geo/edit/main/docs/api.md 55 | where you can make edits to the text using a web-based editor. 56 | Feel free to switch between the "Edit file" and "Preview changes" tabs as 57 | you modify the content to make sure things look ok. 58 | 59 | 5. Once you're happy with your changes, scroll down to the bottom where it says 60 | **Commit changes**. This is where you will add a short summary of the 61 | changes you have made. 62 | 63 | ![The place to commit changes](https://user-images.githubusercontent.com/23487320/172029885-947e4e24-675a-4498-a2d8-f1fa4c26b934.png) 64 | 65 | Specifically, in the first box, you will need to give a short title (e.g. 66 | "Fixed typo in api.md file") that describes the changes you've made. 67 | Optionally, you can write a few extra sentences in the second box to explain 68 | things in more detail. 69 | 70 | 6. Select the "Create a new branch for this commit and start a pull request" 71 | option and provide a new branch name (e.g. "fix-api-typo"). 
What this 72 | does is to ensure your changes are made in an independent manner or 'branch' 73 | away from the main trunk, and those changes will have the opportunity to be 74 | double checked and openly reviewed by other people. 75 | 76 | 7. Click on the green 'Propose changes' button. This will bring you to a new 77 | page. 78 | 79 | 8. Almost there! This "Open a pull request" page is where you can finalize 80 | things for the 'pull request' (a request to make changes) you will be 81 | opening soon. Again you will need to provide a title (e.g. 'Minor changes to 82 | the API markdown file') and a description. 83 | 84 | ![Pull request dialog page](https://user-images.githubusercontent.com/23487320/172030066-63dbdaa3-c7d4-403f-a3b6-5bccd966d038.png) 85 | 86 | Be sure to provide any context on **why** you are making the changes, and 87 | **how** you are doing so. This will make it easier for other people to 88 | know what is happening when they review your changes. 89 | 90 | 9. Ready? Click on the green 'Create pull request' button! This will make your 91 | changes available for everyone to see and review publicly. The maintainers 92 | will be notified about your great new addition and will get back to you on 93 | the next steps. 94 | 95 | --- 96 | 97 | (contributing:running:locally)= 98 | ## 🏠 Running things locally 99 | 100 | This project uses [``poetry``](https://python-poetry.org/docs/master/) for 101 | installing Python dependencies required in ``zen3geo``, as well as the 102 | development and documentation-related dependencies. 
103 | 104 | ### Cloning the repository ♊ 105 | 106 | ``` 107 | git clone git@github.com:weiji14/zen3geo.git 108 | cd zen3geo 109 | ``` 110 | 111 | ### Setup virtual environment ☁️ 112 | 113 | ``` 114 | mamba create --name zen3geo python=3.11 115 | mamba activate zen3geo 116 | 117 | pip install poetry==1.6.1 118 | poetry install --extras "raster spatial stac vector" 119 | ``` 120 | 121 | ### Building documentation 📖 122 | 123 | ``` 124 | poetry install --extras=docs # or `pip install .[docs]` 125 | sudo apt install graphviz # if rendering graphviz plots 126 | jupyter-book build docs/ 127 | ``` 128 | 129 | Then open ``docs/_build/html/index.html`` in your browser to see the docs. 130 | 131 | --- 132 | 133 | ## 🥳 And that's it! 134 | 135 | You're now part of the zen3geo community ✨ 136 | 137 | ```{admonition} Credits 138 | :class: seealso 139 | *This contributing guide was adapted from* 140 | [GitHub docs](https://github.com/github/docs/blob/main/contributing/types-of-contributions.md) 141 | and the [APECS-Earth-Observation/Polar-EO-Database](https://github.com/APECS-Earth-Observation/Polar-EO-Database/blob/main/CONTRIBUTING.md) project. 142 | ``` 143 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: zen3geo 5 | author: The zen3geo Team 6 | 7 | # Cache execution outputs of notebooks on each build. 
8 | # See https://jupyterbook.org/content/execute.html 9 | execute: 10 | execute_notebooks: cache 11 | # https://jupyterbook.org/en/latest/content/execute.html#setting-execution-timeout 12 | timeout: 300 13 | 14 | # Define the name of the latex output file for PDF builds 15 | latex: 16 | latex_documents: 17 | targetname: zen3geo.tex 18 | 19 | # Information about where the book exists on the web 20 | repository: 21 | url: https://github.com/weiji14/zen3geo # Online location of your book 22 | path_to_book: docs # Optional path to your book, relative to the repository root 23 | branch: main # Which branch of the repository should be used when creating links (optional) 24 | 25 | # Add GitHub buttons to your book 26 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 27 | html: 28 | use_edit_page_button: true 29 | use_issues_button: true 30 | use_repository_button: true 31 | 32 | sphinx: 33 | config: 34 | autodoc_typehints: 'description' 35 | myst_all_links_external: true 36 | nb_execution_show_tb: true 37 | html_show_copyright: false 38 | html_theme_options: 39 | # https://sphinx-book-theme.readthedocs.io/en/stable/customize/sidebar-secondary.html 40 | show_toc_level: 3 41 | intersphinx_mapping: 42 | contextily: 43 | - 'https://contextily.readthedocs.io/en/latest/' 44 | - null 45 | dask: 46 | - 'https://docs.dask.org/en/latest/' 47 | - null 48 | datashader: 49 | - 'https://datashader.org/' 50 | - null 51 | datatree: 52 | - 'https://xarray-datatree.readthedocs.io/en/latest/' 53 | - null 54 | geopandas: 55 | - 'https://geopandas.org/en/latest/' 56 | - null 57 | mmdetection: 58 | - 'https://mmdetection.readthedocs.io/zh_CN/latest/' 59 | - null 60 | numpy: 61 | - 'https://numpy.org/doc/stable/' 62 | - null 63 | pyogrio: 64 | - 'https://pyogrio.readthedocs.io/en/latest/' 65 | - null 66 | pystac: 67 | - 'https://pystac.readthedocs.io/en/latest/' 68 | - null 69 | pystac_client: 70 | - 'https://pystac-client.readthedocs.io/en/latest/' 71 | - 
null 72 | python: 73 | - 'https://docs.python.org/3/' 74 | - null 75 | rasterio: 76 | - 'https://rasterio.readthedocs.io/en/stable/' 77 | - null 78 | rioxarray: 79 | - 'https://corteva.github.io/rioxarray/stable/' 80 | - null 81 | shapely: 82 | - 'https://shapely.readthedocs.io/en/latest/' 83 | - null 84 | stackstac: 85 | - 'https://stackstac.readthedocs.io/en/latest/' 86 | - null 87 | torch: 88 | - 'https://pytorch.org/docs/stable/' 89 | - null 90 | torchdata: 91 | - 'https://pytorch.org/data/main/' 92 | - null 93 | torchvision: 94 | - 'https://pytorch.org/vision/main/' 95 | - null 96 | xarray: 97 | - 'https://docs.xarray.dev/en/stable/' 98 | - null 99 | xbatcher: 100 | - 'https://xbatcher.readthedocs.io/en/latest/' 101 | - null 102 | zarr: 103 | - 'https://zarr.readthedocs.io/en/latest/' 104 | - null 105 | extra_extensions: 106 | - 'sphinx.ext.autodoc' 107 | - 'sphinx.ext.intersphinx' 108 | - 'sphinx.ext.napoleon' 109 | - 'sphinx.ext.viewcode' 110 | -------------------------------------------------------------------------------- /docs/_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | # Learn more at https://jupyterbook.org/customize/toc.html 3 | 4 | format: jb-book 5 | root: index 6 | chapters: 7 | - title: 🦮 Walkthrough 8 | file: walkthrough 9 | sections: 10 | - title: 🀄 Chipping and Batching 11 | file: chipping 12 | - title: 🫧 Vector Segmentation Masks 13 | file: vector-segmentation-masks 14 | - title: 🥡 Object Detection Boxes 15 | file: object-detection-boxes 16 | - title: 🏳️‍🌈 Stacking layers 17 | file: stacking 18 | - title: 📶 Multi-resolution 19 | file: multi-resolution 20 | - title: 📖 API Reference 21 | file: api 22 | - title: 📆 Changelog 23 | file: changelog 24 | - title: 🫶 Code of Conduct 25 | file: CODE_OF_CONDUCT 26 | - title: 🧑‍🤝‍🧑 Contributing 27 | file: CONTRIBUTING 28 | -------------------------------------------------------------------------------- /docs/api.md: 
-------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | ## DataPipes 4 | 5 | ```{eval-rst} 6 | .. automodule:: zen3geo.datapipes 7 | :members: 8 | ``` 9 | 10 | ### Datashader 11 | 12 | ```{eval-rst} 13 | .. automodule:: zen3geo.datapipes.datashader 14 | .. autoclass:: zen3geo.datapipes.DatashaderRasterizer 15 | .. autoclass:: zen3geo.datapipes.datashader.DatashaderRasterizerIterDataPipe 16 | .. autoclass:: zen3geo.datapipes.XarrayCanvas 17 | .. autoclass:: zen3geo.datapipes.datashader.XarrayCanvasIterDataPipe 18 | :show-inheritance: 19 | ``` 20 | 21 | ### Geopandas 22 | 23 | ```{eval-rst} 24 | .. automodule:: zen3geo.datapipes.geopandas 25 | .. autoclass:: zen3geo.datapipes.GeoPandasRectangleClipper 26 | .. autoclass:: zen3geo.datapipes.geopandas.GeoPandasRectangleClipperIterDataPipe 27 | :show-inheritance: 28 | ``` 29 | 30 | ### Pyogrio 31 | 32 | ```{eval-rst} 33 | .. automodule:: zen3geo.datapipes.pyogrio 34 | .. autoclass:: zen3geo.datapipes.PyogrioReader 35 | .. autoclass:: zen3geo.datapipes.pyogrio.PyogrioReaderIterDataPipe 36 | :show-inheritance: 37 | ``` 38 | 39 | ### PySTAC 40 | 41 | ```{eval-rst} 42 | .. automodule:: zen3geo.datapipes.pystac 43 | .. autoclass:: zen3geo.datapipes.PySTACItemReader 44 | .. autoclass:: zen3geo.datapipes.pystac.PySTACItemReaderIterDataPipe 45 | :show-inheritance: 46 | ``` 47 | 48 | ### PySTAC Client 49 | 50 | ```{eval-rst} 51 | .. automodule:: zen3geo.datapipes.pystac_client 52 | .. autoclass:: zen3geo.datapipes.PySTACAPISearcher 53 | .. autoclass:: zen3geo.datapipes.pystac_client.PySTACAPISearcherIterDataPipe 54 | .. autoclass:: zen3geo.datapipes.PySTACAPIItemLister 55 | .. autoclass:: zen3geo.datapipes.pystac_client.PySTACAPIItemListerIterDataPipe 56 | :show-inheritance: 57 | ``` 58 | 59 | ### Rioxarray 60 | 61 | ```{eval-rst} 62 | .. automodule:: zen3geo.datapipes.rioxarray 63 | .. autoclass:: zen3geo.datapipes.RioXarrayReader 64 | .. 
autoclass:: zen3geo.datapipes.rioxarray.RioXarrayReaderIterDataPipe 65 | :show-inheritance: 66 | ``` 67 | 68 | ### Stackstac 69 | 70 | ```{eval-rst} 71 | .. automodule:: zen3geo.datapipes.stackstac 72 | .. autoclass:: zen3geo.datapipes.StackSTACMosaicker 73 | .. autoclass:: zen3geo.datapipes.stackstac.StackSTACMosaickerIterDataPipe 74 | .. autoclass:: zen3geo.datapipes.StackSTACStacker 75 | .. autoclass:: zen3geo.datapipes.stackstac.StackSTACStackerIterDataPipe 76 | :show-inheritance: 77 | ``` 78 | 79 | ### Xbatcher 80 | 81 | ```{eval-rst} 82 | .. automodule:: zen3geo.datapipes.xbatcher 83 | .. autoclass:: zen3geo.datapipes.XbatcherSlicer 84 | .. autoclass:: zen3geo.datapipes.xbatcher.XbatcherSlicerIterDataPipe 85 | :show-inheritance: 86 | ``` 87 | 88 | ### XpySTAC 89 | 90 | ```{eval-rst} 91 | .. automodule:: zen3geo.datapipes.xpystac 92 | .. autoclass:: zen3geo.datapipes.XpySTACAssetReader 93 | .. autoclass:: zen3geo.datapipes.xpystac.XpySTACAssetReaderIterDataPipe 94 | :show-inheritance: 95 | ``` 96 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Release v0.6.2 (2023/06/29) 4 | 5 | ### 💫 Highlights 6 | 7 | * 🎉 **Patch release for zen3geo** 🎉 8 | * 🚀 Quick addition of a new PySTACAPIItemLister DataPipe 9 | 10 | ### 🚀 Features 11 | 12 | * ✨ PySTACAPIItemLister to list STAC Items matching STAC API search ([#111](https://github.com/weiji14/zen3geo/pull/111)) 13 | 14 | ### 🧰 Maintenance 15 | 16 | * ⬆️ Bump poetry from 1.4.2 to 1.5.1 ([#110](https://github.com/weiji14/zen3geo/pull/110)) 17 | 18 | ### 🧑‍🤝‍🧑 Contributors 19 | 20 | [@dependabot[bot]](https://github.com/dependabot-bot) and [@weiji14](https://github.com/weiji14) 21 | 22 | --- 23 | 24 | ## Release v0.6.1 (2023/05/31) 25 | 26 | ### 💫 Highlights 27 | 28 | * 🎉 **Patch release for zen3geo** 🎉 29 | * 😎 Full Python 3.11 support and a couple of bug fixes 
for DatashaderRasterizer 30 | 31 | ### 🚀 Features 32 | 33 | * 🥚 Allow using XpySTACAssetReader without xpystac when engine!=stac ([#100](https://github.com/weiji14/zen3geo/pull/100)) 34 | 35 | ### 🐛 Bug Fixes 36 | 37 | * 🐛 Fix DatashaderRasterizer for GeoDataFrame wrapped in StreamWrapper ([#104](https://github.com/weiji14/zen3geo/pull/104)) 38 | * 🐛 Fix DatashaderRasterizer to allow N:1 instead of just 1:1 ([#98](https://github.com/weiji14/zen3geo/pull/98)) 39 | 40 | ### 📖 Documentation 41 | 42 | * 👽️ Handle ms-buildings 20230425 update in Object Detection tutorial ([#106](https://github.com/weiji14/zen3geo/pull/106)) 43 | 44 | ### 🧰 Maintenance 45 | 46 | * 👷 NEP29: Run CI and Docs build on Python 3.11 ([#103](https://github.com/weiji14/zen3geo/pull/103)) 47 | * ⬆️ Bump poetry from 1.3.0 to 1.4.2 ([#99](https://github.com/weiji14/zen3geo/pull/99)) 48 | 49 | ### 🧑‍🤝‍🧑 Contributors 50 | 51 | [@dependabot[bot]](https://github.com/dependabot-bot) and [@weiji14](https://github.com/weiji14) 52 | 53 | --- 54 | 55 | ## Release v0.6.0 (2023/04/18) 56 | 57 | ### 💫 Highlights 58 | 59 | * 🎉 **Sixth release of zen3geo** 🎉 60 | * 🚸 Walkthrough on handling multi-resolution climate data ([#91](https://github.com/weiji14/zen3geo/pull/91)) 61 | 62 | ### 🚀 Features 63 | 64 | * ✨ XpySTACAssetReader for reading COG, NetCDF & Zarr STAC assets ([#87](https://github.com/weiji14/zen3geo/pull/87)) 65 | * ✨ Implement len function for XbatcherSlicerIterDataPipe ([#75](https://github.com/weiji14/zen3geo/pull/75)) 66 | 67 | ### 📖 Documentation 68 | 69 | * ♻️ Use xarray.merge with join="override" in collate functions ([#72](https://github.com/weiji14/zen3geo/pull/72)) 70 | 71 | ### 🧰 Maintenance 72 | 73 | * ⬆️ Bump jupyter-book from 0.14.0 to 0.15.1 ([#94](https://github.com/weiji14/zen3geo/pull/94)) 74 | * 📦️ Publish to TestPyPI and PyPI via OpenID Connect token ([#90](https://github.com/weiji14/zen3geo/pull/90)) 75 | * 👷 NEP29: Run Continuous Integration on Python 3.11 
([#89](https://github.com/weiji14/zen3geo/pull/89)) 76 | * ⬆️ Bump jupyter-book from 0.13.0 to 0.14.0 ([#85](https://github.com/weiji14/zen3geo/pull/85)) 77 | * 📌 Pin maximum python version to <4.0 ([#78](https://github.com/weiji14/zen3geo/pull/78)) 78 | * ⬆️ Bump poetry from 1.2.0 to 1.3.0 ([#77](https://github.com/weiji14/zen3geo/pull/77)) 79 | * 📌 Pin minimum xbatcher version to 0.2.0 ([#73](https://github.com/weiji14/zen3geo/pull/73)) 80 | 81 | ### 🧑‍🤝‍🧑 Contributors 82 | 83 | [@dependabot[bot]](https://github.com/dependabot-bot) and [@weiji14](https://github.com/weiji14) 84 | 85 | --- 86 | 87 | ## Release v0.5.0 (2022/09/26) 88 | 89 | ### 💫 Highlights 90 | 91 | * 🎉 **Fifth release of zen3geo** 🎉 92 | * 🚸 Walkthrough on stacking time-series earth observation data ([#62](https://github.com/weiji14/zen3geo/pull/62)) 93 | 94 | ### 🚀 Features 95 | 96 | * ✨ StackSTACMosaicIterDataPipe to mosaic tiles into one piece ([#63](https://github.com/weiji14/zen3geo/pull/63)) 97 | * ✨ StackSTACStackerIterDataPipe for stacking STAC items ([#61](https://github.com/weiji14/zen3geo/pull/61)) 98 | * ✨ PySTACAPISearchIterDataPipe to query dynamic STAC Catalogs ([#59](https://github.com/weiji14/zen3geo/pull/59)) 99 | * ✨ PySTACItemReaderIterDataPipe for reading STAC Items ([#46](https://github.com/weiji14/zen3geo/pull/46)) 100 | 101 | ### 📖 Documentation 102 | 103 | * 🚚 Rename to PySTACAPISearcher and StackSTACMosaicker ([#64](https://github.com/weiji14/zen3geo/pull/64)) 104 | 105 | ### 🧰 Maintenance 106 | 107 | * 📌 Pin min pystac-client and stackstac to v0.4.0, pystac to 1.4.0 ([#66](https://github.com/weiji14/zen3geo/pull/66)) 108 | * 📦️ Exclude tests from source distribution and binary wheel ([#58](https://github.com/weiji14/zen3geo/pull/58)) 109 | 110 | ### 🧑‍🤝‍🧑 Contributors 111 | 112 | [@dependabot[bot]](https://github.com/dependabot-bot) and [@weiji14](https://github.com/weiji14) 113 | 114 | --- 115 | 116 | ## Release v0.4.0 (2022/09/08) 117 | 118 | ### 💫 Highlights 119 | 120 
| * 🎉 **Fourth release of zen3geo** 🎉 121 | * 🚸 Walkthrough on object detection with bounding boxes ([#49](https://github.com/weiji14/zen3geo/pull/49)) 122 | 123 | ### 🚀 Features 124 | 125 | * ✨ GeoPandasRectangleClipper for spatially subsetting vectors ([#52](https://github.com/weiji14/zen3geo/pull/52)) 126 | 127 | ### 📖 Documentation 128 | 129 | * 📝 Add install from conda-forge instructions ([#55](https://github.com/weiji14/zen3geo/pull/55)) 130 | * ✏️ Edit docs to use OGC:CRS84 lon/lat instead of EPSG:4326 ([#45](https://github.com/weiji14/zen3geo/pull/45)) 131 | * 💡 Warn about overlapping strides if followed by train/val split ([#43](https://github.com/weiji14/zen3geo/pull/43)) 132 | 133 | ### 🧰 Maintenance 134 | 135 | * ⬆️ Bump poetry from 1.2.0rc1 to 1.2.0 ([#47](https://github.com/weiji14/zen3geo/pull/47)) 136 | * ⬆️ Bump poetry from 1.2.0b3 to 1.2.0rc1 ([#44](https://github.com/weiji14/zen3geo/pull/44)) 137 | 138 | ### 🧑‍🤝‍🧑 Contributors 139 | 140 | [@dependabot[bot]](https://github.com/dependabot-bot) and [@weiji14](https://github.com/weiji14) 141 | 142 | --- 143 | 144 | ## Release v0.3.0 (2022/08/19) 145 | 146 | ### 💫 Highlights 147 | 148 | * 🎉 **Third release of zen3geo** 🎉 149 | * 🚸 Walkthrough on rasterizing vector polygons into label masks ([#31](https://github.com/weiji14/zen3geo/pull/31)) 150 | 151 | ### 🚀 Features 152 | 153 | * ✨ DatashaderRasterizer for burning vector shapes to xarray grids ([#35](https://github.com/weiji14/zen3geo/pull/35)) 154 | * ✨ XarrayCanvasIterDataPipe for creating blank datashader canvas ([#34](https://github.com/weiji14/zen3geo/pull/34)) 155 | * ♻️ Let PyogrioReader return geodataframe only instead of tuple ([#33](https://github.com/weiji14/zen3geo/pull/33)) 156 | 157 | ### 🐛 Bug Fixes 158 | 159 | * ♻️ Refactor DatashaderRasterizer to be up front about datapipe lengths ([#39](https://github.com/weiji14/zen3geo/pull/39)) 160 | * 🩹 Raise ModuleNotFoundError when xbatcher not installed 
([#37](https://github.com/weiji14/zen3geo/pull/37)) 161 | 162 | ### 📖 Documentation 163 | 164 | * 📝 Improve pip install zen3geo instructions with extras dependencies ([#40](https://github.com/weiji14/zen3geo/pull/40)) 165 | * 🔍 Show more levels for the in-page table of contents ([#36](https://github.com/weiji14/zen3geo/pull/36)) 166 | 167 | ### 🧑‍🤝‍🧑 Contributors 168 | 169 | [@weiji14](https://github.com/weiji14) 170 | 171 | --- 172 | 173 | ## Release v0.2.0 (2022/07/17) 174 | 175 | ### 💫 Highlights 176 | 177 | * 🎉 **Second release of zen3geo** 🎉 178 | * 🚸 Walkthrough on creating batches of data chips ([#20](https://github.com/weiji14/zen3geo/pull/20)) 179 | 180 | ### 🚀 Features 181 | 182 | * ♻️ Let RioXarrayReader return dataarray only instead of tuple ([#24](https://github.com/weiji14/zen3geo/pull/24)) 183 | * ✨ XbatcherSlicerIterDataPipe for slicing xarray.DataArray ([#22](https://github.com/weiji14/zen3geo/pull/22)) 184 | * ✨ PyogrioReaderIterDataPipe for reading vector OGR files ([#19](https://github.com/weiji14/zen3geo/pull/19)) 185 | 186 | ### 📖 Documentation 187 | 188 | * 🎨 Extra subsection for rioxarray datapipes ([#18](https://github.com/weiji14/zen3geo/pull/18)) 189 | 190 | ### 🧰 Maintenance 191 | 192 | * 👷 NEP29: Run CI and Docs build on Python 3.10 ([#29](https://github.com/weiji14/zen3geo/pull/29)) 193 | * ⬆️ Bump poetry from 1.2.0b2 to 1.2.0b3 ([#28](https://github.com/weiji14/zen3geo/pull/28)) 194 | * 📌 Pin minimum torchdata version to 0.4.0 ([#25](https://github.com/weiji14/zen3geo/pull/25)) 195 | * 📌 Pin minimum pyogrio version to 0.4.0 ([#21](https://github.com/weiji14/zen3geo/pull/21)) 196 | 197 | ### 🧑‍🤝‍🧑 Contributors 198 | 199 | [@weiji14](https://github.com/weiji14) 200 | 201 | --- 202 | 203 | ## Release v0.1.0 (2022/06/08) 204 | 205 | ### 💫 Highlights 206 | 207 | * 🎉 **First release of zen3geo** 🎉 208 | * 🚸 Walkthrough on using RioXarray IterDataPipes at https://zen3geo.readthedocs.io/en/latest/walkthrough.html 
([#8](https://github.com/weiji14/zen3geo/pull/8)) 209 | 210 | ### 🚀 Features 211 | 212 | * ✨ Introducing RioXarrayReaderIterDataPipe for reading GeoTIFFs ([#6](https://github.com/weiji14/zen3geo/pull/6)) 213 | 214 | ### 📖 Documentation 215 | 216 | * 🔧 Configure readthedocs documentation build ([#13](https://github.com/weiji14/zen3geo/pull/13)) 217 | * 💬 Show how to convert xarray.DataArray to torch.Tensor ([#9](https://github.com/weiji14/zen3geo/pull/9)) 218 | * 📝 Add basic installation instructions ([#7](https://github.com/weiji14/zen3geo/pull/7)) 219 | * 👥 Healthy community standards ([#4](https://github.com/weiji14/zen3geo/pull/4)) 220 | 221 | ### 🧰 Maintenance 222 | 223 | * 📦 Publish to TestPyPI and PyPI using GitHub Actions ([#14](https://github.com/weiji14/zen3geo/pull/14)) 224 | * 🧑‍💻 Draft changelog with Release Drafter GitHub Actions ([#11](https://github.com/weiji14/zen3geo/pull/11)) 225 | * 👷 Setup GitHub Actions Continuous Integration tests ([#2](https://github.com/weiji14/zen3geo/pull/2)) 226 | * 🌱 Initialize pyproject.toml file ([#1](https://github.com/weiji14/zen3geo/pull/1)) 227 | 228 | ### 🧑‍🤝‍🧑 Contributors 229 | 230 | [@weiji14](https://github.com/weiji14) 231 | -------------------------------------------------------------------------------- /docs/chipping.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | --- 12 | 13 | # Chipping and batching data 14 | 15 | > What is separation? 16 | > 17 | > What isn't? 18 | 19 | Following on from the previous tutorial, 20 | let's 🧑‍🎓 learn more about creating a more complicated 🌈 raster data pipeline. 
21 | Specifically, we'll go through the following: 22 | - Loading Cloud-Optimized GeoTIFFs (COGs) from different geographic regions 🌏 23 | - Cut up each large GeoTIFF into several 512 x 512 pixel chips 🥨 24 | - Create batches of chips/tensors to feed into a DataLoader 🏋️ 25 | 26 | Some terminology 📜 disambiguation: 27 | - scene - the big image (e.g. 10000x10000 pixels) from a satellite 🛰️ (e.g. a GeoTIFF) 28 | - chip - the small image (e.g. 512x512 pixels) cut ✂️ out from a satellite scene to be loaded as a tensor 29 | 30 | See also: 31 | - https://github.com/microsoft/torchgeo/wiki/Design-Decisions#chip-vs-tile-vs-region 32 | - https://github.com/cogeotiff/cog-spec/blob/master/spec.md 33 | 34 | ## 🎉 **Getting started** 35 | 36 | Load up them libraries! 37 | 38 | ```{code-cell} 39 | import pystac 40 | import planetary_computer 41 | import rioxarray 42 | 43 | import torch 44 | import torchdata 45 | import zen3geo 46 | ``` 47 | 48 | ## 0️⃣ Find [Cloud-Optimized GeoTIFFs](https://www.cogeo.org) ☁️ 49 | 50 | Synthetic-Aperture Radar (SAR) from a [STAC](https://stacspec.org) catalog! 51 | We'll get some Sentinel-1 Ground-Range Detected (GRD) data over Osaka and Tokyo 52 | in Japan 🇯🇵. 
53 | 54 | 🔗 Links: 55 | - [Official Sentinel-1 description page at ESA](https://sentinel.esa.int/web/sentinel/missions/sentinel-1) 56 | - [Microsoft Planetary Computer STAC Explorer](https://planetarycomputer.microsoft.com/explore?c=137.4907%2C35.0014&z=7.94&v=2&d=sentinel-1-grd&s=false%3A%3A100%3A%3Atrue&ae=0&m=cql%3A08211c0dd907a5066c41422c75629d5f&r=VV%2C+VH+False-color+composite) 57 | - [AWS Sentinel-1 Cloud-Optimized GeoTIFFs](https://registry.opendata.aws/sentinel-1) 58 | 59 | 60 | ```{code-cell} 61 | item_urls = [ 62 | # Osaka 63 | "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-1-grd/items/S1A_IW_GRDH_1SDV_20220614T210034_20220614T210059_043664_05368A", 64 | # Tokyo 65 | "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-1-grd/items/S1A_IW_GRDH_1SDV_20220616T204349_20220616T204414_043693_053764", 66 | ] 67 | 68 | # Load each STAC item's metadata and sign the assets 69 | items = [pystac.Item.from_file(item_url) for item_url in item_urls] 70 | signed_items = [planetary_computer.sign(item) for item in items] 71 | signed_items 72 | ``` 73 | 74 | ### Inspect one of the data assets 🍱 75 | 76 | The Sentinel-1 STAC item contains several assets. 77 | These include different 〰️ polarizations (e.g. 'VH', 'VV'). 78 | Let's just take the 'thumbnail' product for now which is an RGB preview, with 79 | the red 🟥 channel (R) representing the co-polarization (VV or HH), the green 80 | 🟩 channel (G) representing the cross-polarization (VH or HV) and the blue 🟦 81 | channel (B) representing the ratio of the cross and co-polarizations. 82 | 83 | ```{code-cell} 84 | url: str = signed_items[0].assets["thumbnail"].href 85 | da = rioxarray.open_rasterio(filename=url) 86 | da 87 | ``` 88 | 89 | This is how the Sentinel-1 radar image looks over Osaka on 14 June 2022.
90 | 91 | ![Sentinel-1 GRD image over Osaka, Japan on 20220614](https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=sentinel-1-grd&item=S1A_IW_GRDH_1SDV_20220614T210034_20220614T210059_043664_05368A&assets=vv&assets=vh&expression=vv%3Bvh%3Bvv%2Fvh&rescale=0%2C600&rescale=0%2C270&rescale=0%2C9&asset_as_band=True&tile_format=png&format=png) 92 | 93 | ## 1️⃣ Creating 512x512 chips from large satellite scenes 🪟 94 | 95 | Unless you have a lot of RAM, it is common to cut ✂️ a large satellite scene 96 | into multiple smaller chips (or patches, tiles 🀄, etc) first. 97 | This is typically done in a rolling or sliding window 🪟 fashion, 98 | via a nested loop through the y-dimension and x-dimension in strides of say, 99 | 512 pixels x 512 pixels. 100 | 101 | Let's begin by setting up the first part of the DataPipe, 102 | which is to read the satellite scene 🖼️ using `rioxarray`. 103 | 104 | ```{code-cell} 105 | # Just get the VV polarization for now from Sentinel-1 106 | urls = [item.assets["vv"].href for item in signed_items] 107 | dp = torchdata.datapipes.iter.IterableWrapper(iterable=urls) 108 | dp_rioxarray = dp.read_from_rioxarray(overview_level=3) 109 | dp_rioxarray 110 | ``` 111 | 112 | ### Slicing with XbatcherSlicer 🍕 113 | 114 | To create the chips, we'll be using ``xbatcher`` which allows slicing 🔪 of an 115 | n-dimensional datacube along any dimension (e.g. longitude, latitude, time 🕛). 116 | This ``xbatcher`` library is integrated into ☯ ``zen3geo`` as a DataPipe called 117 | {py:class}`zen3geo.datapipes.XbatcherSlicer` (functional name: 118 | `slice_with_xbatcher`), which can be used as follows: 119 | 120 | ```{code-cell} 121 | dp_xbatcher = dp_rioxarray.slice_with_xbatcher(input_dims={"y": 512, "x": 512}) 122 | dp_xbatcher 123 | ``` 124 | 125 | This should give us about 12 chips in total, 6 from each of the 2 Sentinel-1 126 | images that were passed in. 
127 | 128 | ```{code-cell} 129 | print(f"Number of chips: {len(dp_xbatcher)}") 130 | ``` 131 | 132 | Now, if you want to customize the sliding window (e.g. do overlapping strides), 133 | pass in extra parameters to ``slice_with_xbatcher``, and it will be handled by 134 | {py:class}`xbatcher.BatchGenerator`. 135 | 136 | ```{code-cell} 137 | dp_xbatcher = dp_rioxarray.slice_with_xbatcher( 138 | input_dims={"y": 512, "x": 512}, input_overlap={"y": 256, "x": 256} 139 | ) 140 | dp_xbatcher 141 | ``` 142 | 143 | Great, and this overlapping stride method should give us more 512x512 chips 🧮 144 | than before. 145 | 146 | ```{code-cell} 147 | print(f"Number of chips: {len(dp_xbatcher)}") 148 | ``` 149 | 150 | Double-check that single chips are of the correct dimensions 151 | (band: 1, y: 512, x: 512). 152 | 153 | ```{code-cell} 154 | chips = list(dp_xbatcher) 155 | sample = chips[0] 156 | sample 157 | ``` 158 | 159 | ```{danger} 160 | Please do not use overlapping strides (i.e. `input_overlap` < `input_dim`) if 161 | you will be 🪓 splitting your chips into training, validation and test sets 162 | later! If you have say 60 overlapping chips and then go on to divide those 🍪 163 | chips randomly into train/val/test sets of 30/20/10, you will have information 164 | leakage 🚰 between the 30 training chips and 20 validation plus 10 test chips, 165 | so your model's reported validation and test metrics 📈 will be overestimating 166 | the actual performance 😲! 167 | 168 | Ideally, your train/val/test chips should be situated independently within 169 | spatially contiguous blocks 🧱. See these links for more information on why: 170 | 171 | - Kattenborn, T., Schiefer, F., Frey, J., Feilhauer, H., Mahecha, M. D., & 172 | Dormann, C. F. (2022). Spatially autocorrelated training and validation 173 | samples inflate performance assessment of convolutional neural networks. 174 | ISPRS Open Journal of Photogrammetry and Remote Sensing, 5, 100018. 
175 | https://doi.org/10.1016/j.ophoto.2022.100018 176 | - https://github.com/pangeo-data/xbatcher/discussions/78#discussioncomment-3387295 177 | 178 | Yes, spatial statistics 🧮 matter, geography is special 🤓. 179 | ``` 180 | 181 | 182 | ## 2️⃣ Pool chips into mini-batches ⚙️ 183 | 184 | In total, we now have a set of 30 🍪 chips of size 512 x 512 pixels each. 185 | These chips can be divided into batches that are of a reasonable size. 186 | Let's use {py:class}`torchdata.datapipes.iter.Batcher` 187 | (functional name: `batch`) to do so. 188 | 189 | ```{code-cell} 190 | dp_batch = dp_xbatcher.batch(batch_size=10) 191 | print(f"Number of items in first batch: {len(list(dp_batch)[0])}") 192 | ``` 193 | 194 | Now each batch will have 10 chips of size 512 x 512, with 195 | each chip being an {py:class}``xarray.DataArray``. 196 | 197 | ```{note} 198 | Notice how neither mosaicking nor reprojection was done for the two satellite 199 | scenes. This is the beauty of zen3geo - full flexibility of combining 200 | geospatial datasets 😎. Respect the native coordinate system and let the data 201 | flow directly into your models! 202 | 203 | Oh, and to be super clear, of the 3 batches of 10 chips each: 204 | - The first batch has 10 chips from the 1st satellite scene over Osaka 205 | - The second batch has 5 chips over Osaka, and 5 chips over Tokyo 206 | - The third batch has 10 chips from the 2nd satellite scene over Tokyo 207 | ``` 208 | 209 | ### Stack many chips in mini-batches into a single tensor 🥞 210 | 211 | Let's now stack all these chips into a single tensor per batch, with a 212 | (number, channel, height, width) shape like (10, 1, 512, 512). We'll need a 213 | custom 🪄 collate function to do the conversion 214 | (from {py:class}``xarray.DataArray`` to {py:class}``torch.Tensor``) and 215 | stacking.
216 | 217 | ```{code-cell} 218 | def xr_collate_fn(samples) -> torch.Tensor: 219 | """ 220 | Converts individual xarray.DataArray objects to a torch.Tensor (int16 221 | dtype), and stacks them all into a single torch.Tensor. 222 | """ 223 | tensors = [ 224 | torch.as_tensor(data=sample.data.astype(dtype="int16")) for sample in samples 225 | ] 226 | return torch.stack(tensors=tensors) 227 | ``` 228 | 229 | Then, pass this collate function to 230 | {py:class}`torchdata.datapipes.iter.Collator` (functional name: `collate`). 231 | 232 | ```{code-cell} 233 | dp_collate = dp_batch.collate(collate_fn=xr_collate_fn) 234 | print(f"Number of mini-batches: {len(dp_collate)}") 235 | print(f"Mini-batch tensor shape: {list(dp_collate)[0].shape}") 236 | ``` 237 | 238 | ### Into a DataLoader 🏋️ 239 | 240 | One more thing 🍎, throw the DataPipe into 241 | {py:class}`torch.utils.data.DataLoader`! 242 | Set `batch_size` to `None`, since we've handled the batching manually in the 243 | above sections already. 244 | 245 | ```{code-cell} 246 | dataloader = torch.utils.data.DataLoader(dataset=dp_collate, batch_size=None) 247 | for i, batch in enumerate(dataloader): 248 | tensor = batch 249 | print(f"Batch {i}: {tensor.shape}") 250 | ``` 251 | 252 | Lights, camera, action 💥 253 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | #
☯ *zen3geo* - The 🌏 data science library you've been waiting for
2 | 3 | ## Installation 4 | 5 | Get what you need, not more, not less: 6 | 7 | | Command | Dependencies | 8 | |:-------------------------------|---------------| 9 | | `pip install zen3geo` | rioxarray, torchdata | 10 | | `pip install zen3geo[raster]` | rioxarray, torchdata, xbatcher, zarr | 11 | | `pip install zen3geo[spatial]` | rioxarray, torchdata, datashader, spatialpandas | 12 | | `pip install zen3geo[stac]` | rioxarray, torchdata, pystac, pystac-client, stackstac, xpystac | 13 | | `pip install zen3geo[vector]` | rioxarray, torchdata, pyogrio[geopandas] | 14 | 15 | Retrieve more ['extras'](https://github.com/weiji14/zen3geo/blob/main/pyproject.toml) using 16 | 17 | pip install zen3geo[raster,spatial,stac,vector] 18 | 19 | To install the development version from [TestPyPI](https://test.pypi.org/project/zen3geo), do: 20 | 21 | pip install --pre --extra-index-url https://test.pypi.org/simple/ zen3geo 22 | 23 | May [conda-forge](https://anaconda.org/conda-forge/zen3geo) be with you, 24 | though optional dependencies it has not. 25 | 26 | mamba install --channel conda-forge zen3geo 27 | 28 | For the eager ones, {ref}`contributing ` will take you further. 29 | -------------------------------------------------------------------------------- /docs/multi-resolution.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | --- 12 | 13 | # Multi-resolution 14 | 15 | > On top of a hundred foot pole you linger 16 | > 17 | > Clinging to the first mark of the scale 18 | > 19 | > How do you proceed higher? 20 | > 21 | > It will take more than a leap of faith 22 | 23 | Earth Observation 🛰️ and climate projection 🌡️ data can be captured at 24 | different levels of detail. 
In this lesson, we'll work with a multitude of 25 | spatial resolutions 📏, learning to respect the ground sampling distance or 26 | native resolution 🔬 of the physical variable being measured, while 🪶 27 | minimizing memory usage. By the end of the lesson, you should be able to: 28 | 29 | - Find 🔍 low and high spatial resolution climate datasets and load them from 30 | {doc}`Zarr ` stores 31 | - Stack 🥞 and subset time-series datasets with different spatial resolutions 32 | stored in a hierarchical {py:class}`datatree.DataTree` structure 33 | - Slice 🔪 the multi-resolution dataset along the time-axis into monthly bins 34 | 35 | 🔗 Links: 36 | - https://carbonplan.org/research/cmip6-downscaling-explainer 37 | - https://github.com/carbonplan/cmip6-downscaling/blob/1.0/notebooks/accessing_data_example.ipynb 38 | - https://github.com/xarray-contrib/xbatcher/issues/93 39 | 40 | 41 | ## 🎉 **Getting started** 42 | 43 | These are the tools 🛠️ you'll need. 44 | 45 | ```{code-cell} 46 | import matplotlib.pyplot as plt 47 | import pandas as pd 48 | import torchdata.dataloader2 49 | import xarray as xr 50 | import xpystac 51 | import zen3geo 52 | 53 | from datatree import DataTree 54 | ``` 55 | 56 | ## 0️⃣ Find climate model datasets 🪸 57 | 58 | The two datasets we'll be working with are 🌐 gridded climate projections, one 59 | that is in its original low 🔅 spatial resolution, and another one of a 60 | higher 🔆 spatial resolution. Specifically, we'll be looking at the maximum 61 | temperature 🌡️ (tasmax) variable from one of the Coupled Model Intercomparison 62 | Project Phase 6 (CMIP6) global coupled ocean-atmosphere general circulation 63 | model (GCM) 💨 outputs that is of low-resolution (67.5 arcminute), and a 64 | super-resolution product from DeepSD 🤔 that is of a higher resolution (15 65 | arcminute). 66 | 67 | ```{note} 68 | The following tutorial will mostly use the term super-resolution 🔭 from 69 | Computer Vision instead of downscaling ⏬. 
It's just that the term 70 | downscaling ⏬ (going from low to high resolution) can get confused with 71 | downsampling 🙃 (going from high to low resolution), whereas 72 | super-resolution 🔭 is unambiguously about going from low 🔅 to high 🔆 73 | resolution. 74 | ``` 75 | 76 | 🔖 References: 77 | - https://carbonplan.org/research/cmip6-downscaling 78 | - https://github.com/tjvandal/deepsd 79 | - https://tutorial.xarray.dev/intermediate/cmip6-cloud.html 80 | 81 | ```{code-cell} 82 | lowres_raw = "https://cpdataeuwest.blob.core.windows.net/cp-cmip/cmip6/ScenarioMIP/MRI/MRI-ESM2-0/ssp585/r1i1p1f1/Amon/tasmax/gn/v20191108" 83 | highres_deepsd = "https://cpdataeuwest.blob.core.windows.net/cp-cmip/version1/data/DeepSD/ScenarioMIP.MRI.MRI-ESM2-0.ssp585.r1i1p1f1.month.DeepSD.tasmax.zarr" 84 | ``` 85 | 86 | This is how the projected maximum temperature 🥵 for August 2089 looks like over 87 | South Asia 🪷 for a low-resolution 🔅 Global Climate Model (left) and a 88 | high-resolution 🔆 downscaled product (right). 
89 | 90 | ```{code-cell} 91 | :tags: [hide-input] 92 | # Zarr datasets from https://github.com/carbonplan/research/blob/d05d148fd716ba6304e3833d765069dd890eaf4a/articles/cmip6-downscaling-explainer/components/downscaled-data.js#L97-L122 93 | ds_gcm = xr.open_dataset( 94 | filename_or_obj="https://cmip6downscaling.blob.core.windows.net/vis/article/fig1/regions/india/gcm-tasmax.zarr" 95 | ) 96 | ds_gcm -= 273.15 # convert from Kelvin to Celsius 97 | ds_downscaled = xr.open_dataset( 98 | filename_or_obj="https://cmip6downscaling.blob.core.windows.net/vis/article/fig1/regions/india/downscaled-tasmax.zarr" 99 | ) 100 | ds_downscaled -= 273.15 # convert from Kelvin to Celsius 101 | 102 | # Plot projected maximum temperature over South Asia from GCM and GARD-MV 103 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 3), sharey=True) 104 | 105 | img1 = ds_gcm.tasmax.plot.imshow( 106 | ax=ax[0], cmap="inferno", vmin=16, vmax=48, add_colorbar=False 107 | ) 108 | ax[0].set_title("Global Climate Model (67.5 arcminute)") 109 | 110 | img2 = ds_downscaled.tasmax.plot.imshow( 111 | ax=ax[1], cmap="inferno", vmin=16, vmax=48, add_colorbar=False 112 | ) 113 | ax[1].set_title("Downscaled result (15 arcminute)") 114 | 115 | cbar = fig.colorbar(mappable=img1, ax=ax.ravel().tolist(), extend="both") 116 | cbar.set_label(label="Daily Max Near-Surface Air\nTemperature in Aug 2089 (°C)") 117 | 118 | plt.show() 119 | ``` 120 | 121 | ### Load Zarr stores 📦 122 | 123 | The {doc}`Zarr ` stores 🧊 can be loaded into an 124 | {py:class}`xarray.Dataset` via {py:class}`zen3geo.datapipes.XpySTACAssetReader` 125 | (functional name: ``read_from_xpystac``) with the `engine="zarr"` keyword 126 | argument. 
127 | 128 | ```{code-cell} 129 | dp_lowres = torchdata.datapipes.iter.IterableWrapper(iterable=[lowres_raw]) 130 | dp_highres = torchdata.datapipes.iter.IterableWrapper(iterable=[highres_deepsd]) 131 | 132 | dp_lowres_dataset = dp_lowres.read_from_xpystac(engine="zarr", chunks="auto") 133 | dp_highres_dataset = dp_highres.read_from_xpystac(engine="zarr", chunks="auto") 134 | ``` 135 | 136 | ### Inspect the climate datasets 🔥 137 | 138 | Let's now preview 👀 the low-resolution 🔅 and high-resolution 🔆 temperature 139 | datasets. 140 | 141 | ```{code-cell} 142 | it = iter(dp_lowres_dataset) 143 | ds_lowres = next(it) 144 | ds_lowres 145 | ``` 146 | 147 | ```{code-cell} 148 | it = iter(dp_highres_dataset) 149 | ds_highres = next(it) 150 | ds_highres 151 | ``` 152 | 153 | Notice that the low-resolution 🔅 dataset has lon/lat pixels of shape 154 | (320, 160), whereas the high-resolution 🔆 dataset is of shape (1440, 720). So 155 | there has been a 4.5x increase 📈 in spatial resolution going from the raw GCM 156 | 🌐 grid to the super-resolution 🔭 DeepSD grid. 157 | 158 | ### Shift from 0-360 to -180-180 🌐 159 | 160 | A sharp eye 👁️ would have noticed that the longitudinal range of the 161 | low-resolution 🔅 and high-resolution 🔆 dataset are offset ↔️ by 180°, going 162 | from 0° to 360° and -180° to +180° respectively. Let's shift the coordinates 📍 163 | of the low-resolution grid 🌍 from 0-360 to -180-180 using a custom 164 | {py:class}`torchdata.datapipes.iter.Mapper` (functional name: `map`) function. 
165 | 166 | 🔖 References: 167 | - https://discourse.pangeo.io/t/handling-slicing-with-circular-longitude-coordinates-in-xarray/1608/3 168 | - https://gis.stackexchange.com/questions/416091/converting-a-netcdf-from-0-to-360-to-180-to-180-via-xarray 169 | 170 | ```{code-cell} 171 | def shift_longitude_360_to_180(ds: xr.Dataset) -> xr.Dataset: 172 | ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180)) 173 | ds = ds.roll(lon=int(len(ds.lon) / 2), roll_coords=True) 174 | return ds 175 | ``` 176 | 177 | ```{code-cell} 178 | dp_lowres_dataset_180 = dp_lowres_dataset.map(fn=shift_longitude_360_to_180) 179 | dp_lowres_dataset_180 180 | ``` 181 | 182 | Double check that the low-resolution 🔆 grid's longitude coordinates 🔢 are now 183 | in the -180° to +180° range. 184 | 185 | ```{code-cell} 186 | it = iter(dp_lowres_dataset_180) 187 | ds_lowres_180 = next(it) 188 | ds_lowres_180 189 | ``` 190 | 191 | 192 | ## Spatiotemporal stack and subset 🍱 193 | 194 | Following on from {doc}`./stacking` where multiple 🥞 layers with the **same** 195 | spatial resolution were stacked together into an {py:class}`xarray.DataArray` 196 | object, this section will teach 🧑‍🏫 you about stacking datasets with 197 | **different** spatial resolutions 📶 into a {py:class}`datatree.DataTree` 198 | object that has a nested/hierarchical structure. That 199 | {py:class}`datatree.DataTree` can then be subsetted 🥮 to the desired spatial 200 | and temporal extent in one go 😎. 201 | 202 | ### Stack multi-resolution datasets 📚 203 | 204 | First, we'll need to combine 🪢 the low-resolution GCM and high-resolution 205 | DeepSD {py:class}`xarray.Dataset` objects into a tuple 🎵 using 206 | {py:class}`torchdata.datapipes.iter.Zipper` (functional name: zip). 
207 | 208 | ```{code-cell} 209 | dp_lowres_highres = dp_lowres_dataset_180.zip(dp_highres_dataset) 210 | dp_lowres_highres 211 | ``` 212 | 213 | Next, use {py:class}`torchdata.datapipes.iter.Collator` (functional name: 214 | `collate`) to convert 🤸 the tuple of {py:class}`xarray.Dataset` objects into 215 | an {py:class}`datatree.DataTree` 🎋, similar to what was done in 216 | {doc}`./stacking`. Note that we'll only take the 'tasmax' ♨️ (Daily Maximum 217 | Near-Surface Air Temperature) {py:class}`xarray.DataArray` variable from each 218 | of the {py:class}`xarray.Dataset` objects. 219 | 220 | ```{code-cell} 221 | def multires_collate_fn(lowres_and_highres: tuple) -> DataTree: 222 | """ 223 | Combine a pair of xarray.Dataset (lowres, highres) inputs into a 224 | datatree.DataTree with groups named 'lowres' and 'highres'. 225 | """ 226 | # Turn 2 xr.Dataset objects into 1 xr.DataTree with multiple groups 227 | ds_lowres, ds_highres = lowres_and_highres 228 | 229 | # Create DataTree with lowres and highres groups 230 | datatree: DataTree = DataTree.from_dict( 231 | d={"lowres": ds_lowres.tasmax, "highres": ds_highres.tasmax} 232 | ) 233 | 234 | return datatree 235 | ``` 236 | 237 | ```{code-cell} 238 | dp_datatree = dp_lowres_highres.collate(collate_fn=multires_collate_fn) 239 | dp_datatree 240 | ``` 241 | 242 | See the nested 🪆 structure of the {py:class}`datatree.DataTree`. The 243 | low-resolution 🔅 GCM and high-resolution 🔆 DeepSD outputs have been placed in 244 | separate groups 🖖. 245 | 246 | ```{code-cell} 247 | it = iter(dp_datatree) 248 | datatree = next(it) 249 | datatree 250 | ``` 251 | 252 | ### Subset multi-resolution layers 🥮 253 | 254 | The climate model outputs above are a global 🗺️ one covering a timespan from 255 | January 2015 to December 2100 📅. If you're only interested in a particular 256 | region 🌏 or timespan ⌚, then the {py:class}`datatree.DataTree` will need to 257 | be trimmed 💇 down. 
Let's use {py:meth}`datatree.DataTree.sel` to subset the 258 | multi-resolution data to just the Philippines 🇵🇭 for the period 2015 to 2030. 259 | 260 | ```{code-cell} 261 | def spatiotemporal_subset(dt: DataTree) -> DataTree: 262 | dt_subset = dt.sel( 263 | lon=slice(116.4375, 126.5625), 264 | lat=slice(5.607445, 19.065325), 265 | time=slice("2015-01-01", "2030-12-31"), 266 | ) 267 | return dt_subset 268 | ``` 269 | 270 | ```{code-cell} 271 | dp_datatree_subset = dp_datatree.map(fn=spatiotemporal_subset) 272 | dp_datatree_subset 273 | ``` 274 | 275 | Inspect the subsetted climate dataset 🕵️ 276 | 277 | ```{code-cell} 278 | it = iter(dp_datatree_subset) 279 | datatree_subset = next(it) 280 | datatree_subset 281 | ``` 282 | 283 | Let's plot the projected temperature 🌡️ for Dec 2030 over the Philippine 284 | Archipelago to ensure things look ok. 285 | 286 | ```{code-cell} 287 | ds_lowres = ( 288 | datatree_subset["lowres/tasmax"] 289 | .sel(time=slice("2030-12-01", "2030-12-31")) 290 | .squeeze() 291 | ) 292 | ds_lowres -= 273.15 # convert from Kelvin to Celsius 293 | ds_highres = ( 294 | datatree_subset["highres/tasmax"] 295 | .sel(time=slice("2030-12-01", "2030-12-31")) 296 | .squeeze() 297 | ) 298 | ds_highres -= 273.15 # convert from Kelvin to Celsius 299 | 300 | # Plot projected maximum temperature over the Philippines from GCM and DeepSD 301 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 8), sharey=True) 302 | 303 | img1 = ds_lowres.plot.imshow( 304 | ax=ax[0], cmap="inferno", vmin=22, vmax=33, add_colorbar=False 305 | ) 306 | ax[0].set_title("Global Climate Model (67.5 arcminute)") 307 | 308 | img2 = ds_highres.plot.imshow( 309 | ax=ax[1], cmap="inferno", vmin=22, vmax=33, add_colorbar=False 310 | ) 311 | ax[1].set_title("DeepSD output (15 arcminute)") 312 | 313 | cbar = fig.colorbar(mappable=img1, ax=ax.ravel().tolist(), extend="max") 314 | cbar.set_label(label="Daily Max Near-Surface Air\nTemperature in Dec 2030 (°C)") 315 | 316 | plt.show() 317 | 
``` 318 | 319 | ```{important} 320 | When slicing ✂️ different spatial resolution grids, put some 🧠 thought into the 321 | process. Do some 🧮 math to ensure the coordinates of the bounding box (min/max 322 | lon/lat) cut through the pixels exactly at the 📐 pixel boundaries whenever 323 | possible. 324 | 325 | If your multi-resolution 📶 layers have spatial resolutions that are 326 | round multiples ✖️ of each other (e.g. 10m, 20m, 60m), it is advisable to align 327 | 🎯 the pixel corners, such that the high-resolution 🔆 pixels fit within the 328 | low-resolution 🔅 pixels (e.g. one 20m pixel should contain four 10m pixels). 329 | This can be done by resampling 🖌️ or interpolating the grid (typically the 330 | higher resolution one) onto a new reference frame 🖼️. 331 | 332 | For datasets ℹ️ that come from different sources and need to be reprojected 🔁, 333 | you can do the reprojection and pixel alignment in a single step 🔂. Be extra 334 | careful about resampling, as certain datasets (e.g. complex SAR 📡 data that 335 | has been collected off-nadir) may require special 🌷 treatment. 336 | ``` 337 | 338 | 339 | ## Time to slice again ⌛ 340 | 341 | So, we now have a {py:class}`datatree.DataTree` with two 💕 groups/nodes called 342 | 'lowres' and 'highres' that have tensor shapes `(lat: 12, lon: 9, time: 192)` 343 | and `(lat: 54, lon: 40, time: 192)` respectively. While the time dimension ⏱️ 344 | is of the same length, the timestamp values between the low-resolution 🔅 GCM 345 | and high-resolution 🔆 DeepSD output are different. Specifically, the GCM 346 | output dates at the middle of the month 📅, while the DeepSD output has dates 347 | at the start of the month. Let's see how this can be handled 🫖. 348 | 349 | ### Slicing by month 🗓️ 350 | 351 | Assuming that the roughly two week offset ↔️ between the monthly resolution GCM 352 | and DeepSD time-series is negligible 🤏, we can split the dataset on the time 353 | dimension at the start/end of each month 📆. 
Let's write a function and use 354 | {py:class}`torchdata.datapipes.iter.FlatMapper` (functional name: `flatmap`) 355 | for this. 356 | 357 | ```{code-cell} 358 | def split_on_month(dt: DataTree, node:str = "highres/tasmax") -> DataTree: 359 | """ 360 | Return a slice of data for every month in a datatree.DataTree time-series. 361 | """ 362 | for t in dt[node].time.to_pandas(): 363 | dt_slice = dt.sel( 364 | time=slice(t + pd.offsets.MonthBegin(0), t + pd.offsets.MonthEnd(0)) 365 | ) 366 | yield dt_slice.squeeze(dim="time") 367 | ``` 368 | 369 | ```{code-cell} 370 | dp_datatree_timeslices = dp_datatree_subset.flatmap(fn=split_on_month) 371 | dp_datatree_timeslices 372 | ``` 373 | 374 | The datapipe should yield a {py:class}`datatree.DataTree` with just one 375 | month's 📅 worth of temperature 🌡️ data per iteration. 376 | 377 | ```{code-cell} 378 | it = iter(dp_datatree_timeslices) 379 | datatree_timeslice = next(it) 380 | datatree_timeslice 381 | ``` 382 | 383 | ```{seealso} 384 | Those interested in slicing multi-resolution arrays spatially can keep an eye 385 | on the 🚧 ongoing implementation at 386 | https://github.com/xarray-contrib/xbatcher/pull/171 and the discussion at 387 | https://github.com/xarray-contrib/xbatcher/issues/93. This 🧑‍🏫 tutorial will be 388 | updated ♻️ once there's a clean way to generate multi-resolution 389 | {py:class}`datatree.DataTree` slices in a newer release of 390 | {doc}`xbatcher ` 😉 391 | ``` 392 | 393 | Visualize the final DataPipe graph ⛓️. 394 | 395 | ```{code-cell} 396 | torchdata.datapipes.utils.to_graph(dp=dp_datatree_timeslices) 397 | ``` 398 | 399 | ### Into a DataLoader 🏋️ 400 | 401 | Ready to populate the {py:class}`torchdata.dataloader2.DataLoader2` 🏭! 
402 | 403 | ```{code-cell} 404 | dataloader = torchdata.dataloader2.DataLoader2(datapipe=dp_datatree_timeslices) 405 | for i, batch in enumerate(dataloader): 406 | ds_lowres = batch["lowres/tasmax"] 407 | ds_highres = batch["highres/tasmax"] 408 | print(f"Batch {i} - lowres: {ds_lowres.shape}, highres: {ds_highres.shape}") 409 | if i > 8: 410 | break 411 | ``` 412 | 413 | Do super-resolution, but make no illusion 🧚 414 | 415 | ```{seealso} 416 | Credits to [CarbonPlan](https://github.com/carbonplan) for making the code and 417 | data for their 418 | [CMIP6 downscaling](https://github.com/carbonplan/cmip6-downscaling) work 419 | openly available. Find out more at 420 | https://docs.carbonplan.org/cmip6-downscaling! 421 | ``` 422 | -------------------------------------------------------------------------------- /docs/object-detection-boxes.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | --- 12 | 13 | # Object detection boxes 14 | 15 | > *You shouldn't set up limits in boundless openness, 16 | > but if you set up limitlessness as boundless openness, 17 | > you've trapped yourself* 18 | 19 | Boxes are quick to draw ✏️, but finicky to train a neural network with. 20 | This time, we'll show you a geospatial object detection 🕵️ problem, where the 21 | objects are defined by a bounding box 🔲 with a specific class. 
22 | By the end of this lesson, you should be able to: 23 | 24 | - Read OGR supported vector files and obtain the bounding boxes 🟨 of each 25 | geometry 26 | - Convert bounding boxes from geographic coordinates to 🖼️ image coordinates 27 | while clipping to the image extent 28 | - Use an affine transform to convert boxes in image coordinates to 🌐 29 | geographic coordinates 30 | 31 | 🔗 Links: 32 | - https://planetarycomputer.microsoft.com/dataset/ms-buildings#Example-Notebook 33 | - https://github.com/microsoft/GlobalMLBuildingFootprints/ 34 | - https://mlhub.earth/datasets?tags=object+detection 35 | 36 | ## 🎉 **Getting started** 37 | 38 | These are the tools 🛠️ you'll need. 39 | 40 | ```{code-cell} 41 | import contextily 42 | import numpy as np 43 | import geopandas as gpd 44 | import matplotlib.patches 45 | import matplotlib.pyplot as plt 46 | import pandas as pd 47 | import planetary_computer 48 | import pystac_client 49 | import rioxarray 50 | import shapely.affinity 51 | import shapely.geometry 52 | import torch 53 | import torchdata 54 | import torchdata.dataloader2 55 | import xarray as xr 56 | import zen3geo 57 | ``` 58 | 59 | ## 0️⃣ Find high-resolution imagery and building footprints 🌇 60 | 61 | Let's take a look at buildings over 62 | [Kampong Ayer](https://en.wikipedia.org/wiki/Kampong_Ayer), Brunei 🇧🇳! We'll 63 | use {py:func}`contextily.bounds2img` to get some 4-band RGBA 64 | 🌈 [optical imagery](https://www.arcgis.com/home/item.html?id=10df2279f9684e4a9f6a7f08febac2a9) 65 | in a {py:class}`numpy.ndarray` format. 66 | 67 | ```{code-cell} 68 | image, extent = contextily.bounds2img( 69 | w=114.94, 70 | s=4.88, 71 | e=114.95, 72 | n=4.89, 73 | ll=True, 74 | source=contextily.providers.Esri.WorldImagery, 75 | ) 76 | print(f"Spatial extent in EPSG:3857: {extent}") 77 | print(f"Image dimensions (height, width, channels): {image.shape}") 78 | ``` 79 | 80 | This is how Brunei's 🚣 Venice of the East looks like from above. 
81 | 82 | ```{code-cell} 83 | fig, ax = plt.subplots(nrows=1, figsize=(9, 9)) 84 | plt.imshow(X=image, extent=extent) 85 | ``` 86 | 87 | ```{tip} 88 | For more raster basemaps, check out: 89 | - https://xyzservices.readthedocs.io/en/stable/introduction.html#overview-of-built-in-providers 90 | - https://leaflet-extras.github.io/leaflet-providers/preview/ 91 | ``` 92 | 93 | ### Georeference image using rioxarray 🌐 94 | 95 | To enable slicing 🔪 with xbatcher later, we'll need to turn the 96 | {py:class}`numpy.ndarray` image 🖼️ into an {py:class}`xarray.DataArray` grid 97 | with coordinates 🖼️. If you already have a georeferenced grid (e.g. from 98 | {py:class}`zen3geo.datapipes.RioXarrayReader`), this step can be skipped ⏭️. 99 | 100 | 101 | ```{code-cell} 102 | # Turn RGBA image from channel-last to channel-first and get 3-band RGB only 103 | _image = image.transpose(2, 0, 1) # Change image from (H, W, C) to (C, H, W) 104 | rgb_image = _image[0:3, :, :] # Get just RGB by dropping RGBA's alpha channel 105 | print(f"RGB image shape: {rgb_image.shape}") 106 | ``` 107 | 108 | Georeferencing is done by putting the 🚦 RGB image into an 109 | {py:class}`xarray.DataArray` object with (band, y, x) coordinates, and then 110 | setting a coordinate reference system 📐 using 111 | {py:meth}`rioxarray.rioxarray.XRasterBase.set_crs`. 112 | 113 | ```{code-cell} 114 | left, right, bottom, top = extent # xmin, xmax, ymin, ymax 115 | dataarray = xr.DataArray( 116 | data=rgb_image, 117 | coords=dict( 118 | band=[0, 1, 2], # Red, Green, Blue 119 | y=np.linspace(start=top, stop=bottom, num=rgb_image.shape[1]), 120 | x=np.linspace(start=left, stop=right, num=rgb_image.shape[2]), 121 | ), 122 | dims=("band", "y", "x"), 123 | ) 124 | dataarray = dataarray.rio.write_crs(input_crs="EPSG:3857") 125 | dataarray 126 | ``` 127 | 128 | ### Load cloud-native vector files 💠 129 | 130 | Now to pull in some building footprints 🛖. 
Let's make a STAC API query to get 131 | a [GeoParquet](https://github.com/opengeospatial/geoparquet) file (a 132 | cloud-native columnar 🀤 geospatial vector file format) that intersects our 133 | study area. 134 | 135 | ```{code-cell} 136 | catalog = pystac_client.Client.open( 137 | url="https://planetarycomputer.microsoft.com/api/stac/v1", 138 | modifier=planetary_computer.sign_inplace, 139 | ) 140 | search = catalog.search( 141 | collections=["ms-buildings"], 142 | query={"msbuildings:region": {"eq": "Brunei"}}, 143 | intersects=shapely.geometry.box(minx=114.94, miny=4.88, maxx=114.95, maxy=4.89), 144 | ) 145 | item = next(search.items()) 146 | item 147 | ``` 148 | 149 | ```{note} 150 | Accessing the building footprint STAC Assets from Planetary Computer will 151 | require signing 🔏 the URL. This can be done with a `modifier` function in the 152 | {py:meth}`pystac_client.Client.open` call. See also 'Automatically modifying 153 | results' under {doc}`PySTAC-Client Usage `). 154 | ``` 155 | 156 | Next, we'll load ⤵️ the GeoParquet file using 157 | {py:func}`geopandas.read_parquet`. 158 | 159 | ```{code-cell} 160 | asset = item.assets["data"] 161 | 162 | geodataframe = gpd.read_parquet( 163 | path=asset.href, storage_options=asset.extra_fields["table:storage_options"] 164 | ) 165 | geodataframe 166 | ``` 167 | 168 | This {py:class}`geopandas.GeoDataFrame` contains building outlines across 169 | Brunei 🇧🇳 that intersects and extends beyond our study area. Let's do a spatial 170 | subset ✂️ to just the Kampong Ayer study area using 171 | {py:attr}`geopandas.GeoDataFrame.cx`, and reproject the polygon coordinates 172 | using {py:meth}`geopandas.GeoDataFrame.to_crs` to match the coordinate 173 | reference system of the optical image. 
174 | 175 | ```{code-cell} 176 | _gdf_kpgayer = geodataframe.cx[114.94:114.95, 4.88:4.89] 177 | gdf_kpgayer = _gdf_kpgayer.to_crs(crs="EPSG:3857") 178 | gdf_kpgayer 179 | ``` 180 | 181 | Preview 👀 the building footprints to check that things are in the right place. 182 | 183 | ```{code-cell} 184 | ax = gdf_kpgayer.plot(figsize=(9, 9)) 185 | contextily.add_basemap( 186 | ax=ax, 187 | source=contextily.providers.CartoDB.Voyager, 188 | crs=gdf_kpgayer.crs.to_string(), 189 | ) 190 | ax 191 | ``` 192 | 193 | Cool, we see that there are some building are on water as expected 😁. 194 | 195 | 196 | ## 1️⃣ Pair image chips with bounding boxes 🧑‍🤝‍🧑 197 | 198 | Here comes the fun 🛝 part! This section is all about generating 128x128 chips 199 | 🫶 paired with bounding boxes. Let's go 🚲! 200 | 201 | ### Create 128x128 raster chips and clip vector geometries with it ✂️ 202 | 203 | From the large 1280x1280 scene 🖽️, we will first slice out a hundred 128x128 204 | chips 🍕 using {py:class}`zen3geo.datapipes.XbatcherSlicer` (functional name: 205 | `slice_with_xbatcher`). 206 | 207 | ```{code-cell} 208 | dp_raster = torchdata.datapipes.iter.IterableWrapper(iterable=[dataarray]) 209 | dp_xbatcher = dp_raster.slice_with_xbatcher(input_dims={"y": 128, "x": 128}) 210 | dp_xbatcher 211 | ``` 212 | 213 | For each 128x128 chip 🍕, we'll then find the vector geometries 🌙 that fit 214 | within the chip's spatial extent. This will be 🤸 done using 215 | {py:class}`zen3geo.datapipes.GeoPandasRectangleClipper` (functional name: 216 | `clip_vector_with_rectangle`). 
217 | 218 | ```{code-cell} 219 | dp_vector = torchdata.datapipes.iter.IterableWrapper(iterable=[gdf_kpgayer]) 220 | dp_clipped = dp_vector.clip_vector_with_rectangle(mask_datapipe=dp_xbatcher) 221 | dp_clipped 222 | ``` 223 | 224 | ```{important} 225 | When using {py:class}`zen3geo.datapipes.GeoPandasRectangleClipper` 💇, there 226 | should only be one 'global' 🌐 vector {py:class}`geopandas.GeoSeries` or 227 | {py:class}`geopandas.GeoDataFrame`. 228 | 229 | If your raster DataPipe has chips 🍕 with different coordinate reference 230 | systems (e.g. multiple UTM Zones 🌏🌍🌎), 231 | {py:class}`zen3geo.datapipes.GeoPandasRectangleClipper` will actually reproject 232 | 🔄 the 'global' vector to the coordinate reference system of each chip, and 233 | clip ✂️ the geometries accordingly to the chip's bounding box extent 😎. 234 | ``` 235 | 236 | This ``dp_clipped`` DataPipe will yield 🤲 a tuple of ``(vector, raster)`` 237 | objects for each 128x128 chip. Let's inspect 🧐 one to see how they look like. 238 | 239 | ```{code-cell} 240 | # Get one chip with over 10 building footprint geometries 241 | for vector, raster in dp_clipped: 242 | if len(vector) > 10: 243 | break 244 | ``` 245 | 246 | These are the spatially subsetted vector geometries 🌙 in one 128x128 chip. 247 | 248 | ```{code-cell} 249 | vector 250 | ``` 251 | 252 | This is the raster chip/mask 🤿 used to clip the vector. 253 | 254 | ```{code-cell} 255 | raster 256 | ``` 257 | 258 | And here's a side by side visualization of the 🌈 RGB chip image (left) and 259 | 🔷 vector building footprint polygons (right). 260 | 261 | ```{code-cell} 262 | fig, ax = plt.subplots(ncols=2, figsize=(18, 9), sharex=True, sharey=True) 263 | raster.plot.imshow(ax=ax[0]) 264 | vector.plot(ax=ax[1]) 265 | ``` 266 | 267 | Cool, these buildings are part of the 🏬 268 | [Yayasan Shopping Complex](https://web.archive.org/web/20220906020248/http://www.yayasancomplex.com) 269 | in Bandar Seri Begawan 🌆. 
We can see that the raster image 🖼️ on the left 270 | aligns ok with the vector polygons 💠 on the right. 271 | 272 | ```{note} 273 | The optical 🛰️ imagery shown here is **not** the imagery used to digitize the 274 | [building footprints](https://planetarycomputer.microsoft.com/dataset/ms-buildings) 275 | 🏢! This is an example tutorial using two different data sources, that we just 276 | so happened to have plotted in the same geographic space 😝. 277 | ``` 278 | 279 | ### From polygons in geographic coordinates to boxes in image coordinates ↕️ 280 | 281 | Up to this point, we still have the actual 🛖 building footprint polygons. In 282 | this step 📶, we'll convert these polygons into a format suitable for 'basic' 283 | object detection 🥅 models in computer vision. Specifically: 284 | 285 | 1. The polygons 🌙 (with multiple vertices) will be simplified to a horizontal 286 | bounding box 🔲 with 4 corner vertices only. 287 | 2. The 🌐 geographic coordinates of the box which use lower left corner and 288 | upper right corner (i.e. y increases from South to North ⬆️) will be 289 | converted to 🖼️ image coordinates (0-128) which use the top left corner and 290 | bottom right corner (i.e y increases from Top to Bottom ⬇️). 291 | 292 | Let's start by using {py:attr}`geopandas.GeoSeries.bounds` to get the 293 | geographic bounds 🗺️ of each building footprint geometry 📐 in each 128x128 294 | chip. 295 | 296 | ```{code-cell} 297 | def polygon_to_bbox(geom_and_chip) -> (gpd.GeoDataFrame, xr.DataArray): 298 | """ 299 | Get bounding box (minx, miny, maxx, maxy) coordinates for each geometry in 300 | a geopandas.GeoDataFrame. 
301 | 302 | (maxx,maxy) 303 | ul-------ur 304 | ^ | | 305 | | | geo | y increases going up, x increases going right 306 | y | | 307 | ll-------lr 308 | (minx,miny) x--> 309 | 310 | """ 311 | gdf, chip = geom_and_chip 312 | bounds: gpd.GeoDataFrame = gdf.bounds 313 | assert tuple(bounds.columns) == ("minx", "miny", "maxx", "maxy") 314 | 315 | return bounds, chip 316 | ``` 317 | 318 | ```{code-cell} 319 | dp_bbox = dp_clipped.map(fn=polygon_to_bbox) 320 | ``` 321 | 322 | Next, the geographic 🗺️ bounding box coordinates (in EPSG:3857) will be 323 | converted to image 🖼️ or pixel coordinates (0-128 scale). The y-direction will 324 | be flipped 🤸 upside down, and we'll be using the spatial bounds (or corner 325 | coordinates) of the 128x128 image chip as a reference 📍. 326 | 327 | ```{code-cell} 328 | def geobox_to_imgbox(bbox_and_chip) -> (pd.DataFrame, xr.DataArray): 329 | """ 330 | Convert bounding boxes in a pandas.DataFrame from geographic coordinates 331 | (minx, miny, maxx, maxy) to image coordinates (x1, y1, x2, y2) based on the 332 | spatial extent of a raster image chip. 
333 | 334 | (x1,y1) 335 | ul-------ur 336 | y | | 337 | | | img | y increases going down, x increases going right 338 | v | | 339 | ll-------lr 340 | x--> (x2,y2) 341 | 342 | """ 343 | geobox, chip = bbox_and_chip 344 | 345 | x_res, y_res = chip.rio.resolution() 346 | assert y_res < 0 347 | 348 | left, bottom, right, top = chip.rio.bounds() 349 | assert top > bottom 350 | 351 | imgbox = pd.DataFrame() 352 | imgbox["x1"] = (geobox.minx - left) / x_res # left 353 | imgbox["y1"] = (top - geobox.maxy) / -y_res # top 354 | imgbox["x2"] = (geobox.maxx - left) / x_res # right 355 | imgbox["y2"] = (top - geobox.miny) / -y_res # bottom 356 | 357 | assert all(imgbox.x2 > imgbox.x1) 358 | assert all(imgbox.y2 > imgbox.y1) 359 | 360 | return imgbox, chip 361 | ``` 362 | 363 | ```{code-cell} 364 | dp_ibox = dp_bbox.map(fn=geobox_to_imgbox) 365 | ``` 366 | 367 | Now to plot 🎨 and double check that the boxes are positioned correctly in 368 | 0-128 image space 🌌. 369 | 370 | ```{code-cell} 371 | # Get one chip with over 10 building footprint geometries 372 | for ibox, ichip in dp_ibox: 373 | if len(ibox) > 10: 374 | break 375 | ibox 376 | ``` 377 | 378 | ```{code-cell} 379 | fig, ax = plt.subplots(ncols=2, figsize=(18, 9), sharex=True, sharey=True) 380 | ax[0].imshow(X=ichip.transpose("y", "x", "band")) 381 | for i, row in ibox.iterrows(): 382 | rectangle = matplotlib.patches.Rectangle( 383 | xy=(row.x1, row.y1), 384 | width=row.x2 - row.x1, 385 | height=row.y2 - row.y1, 386 | edgecolor="blue", 387 | linewidth=1, 388 | facecolor="none", 389 | ) 390 | ax[1].add_patch(rectangle) 391 | ``` 392 | 393 | Cool, the 🟦 bounding boxes on the right subplot are correctly positioned 🧭 394 | (compare it with the figure in the previous subsection). 395 | 396 | ```{hint} 397 | Instead of a bounding box 🥡 object detection task, you can also use the 398 | building polygons 🏘️ for a segmentation task 🧑‍🎨 following 399 | {doc}`./vector-segmentation-masks`. 
400 | 401 | If you still prefer doing object detection 🕵️, but want a different box format 402 | (see options in {py:func}`torchvision.ops.box_convert`), 403 | like 🎌 centre-based coordinates with width and height (`cxcywh`), or 404 | 📨 oriented/rotated bounding box coordinates, feel free to implement your own 405 | function and DataPipe for it 🤗! 406 | ``` 407 | 408 | 409 | ## 2️⃣ There and back again 🧙 410 | 411 | What follows on from here requires focus 🤫. To start, we'll pool the hundred 412 | 💯 128x128 chips into 10 batches (10 chips per batch) using 413 | {py:class}`torchdata.datapipes.iter.Batcher` (functional name: `batch`). 414 | 415 | ```{code-cell} 416 | dp_batch = dp_ibox.batch(batch_size=10) 417 | print(f"Number of items in first batch: {len(list(dp_batch)[0])}") 418 | ``` 419 | 420 | ### Batch boxes with variable lengths 📏 421 | 422 | Next, we'll stack 🥞 all the image chips into a single tensor (recall 423 | {doc}`./chipping`), and concatenate 📚 the bounding boxes into a list of 424 | tensors using {py:class}`torchdata.datapipes.iter.Collator` (functional name: 425 | `collate`). 426 | 427 | ```{code-cell} 428 | def boximg_collate_fn(samples) -> (list[torch.Tensor], torch.Tensor, list[dict]): 429 | """ 430 | Converts bounding boxes and raster images to tensor objects and keeps 431 | geographic metadata (spatial extent, coordinate reference system and 432 | spatial resolution). 433 | 434 | Specifically, the bounding boxes in pandas.DataFrame format are each 435 | converted to a torch.Tensor and collated into a list, while the raster 436 | images in xarray.DataArray format are converted to a torch.Tensor (int16 437 | dtype) and stacked into a single torch.Tensor. 
438 | """ 439 | box_tensors: list[torch.Tensor] = [ 440 | torch.as_tensor(sample[0].to_numpy(dtype=np.float32)) for sample in samples 441 | ] 442 | 443 | tensors: list[torch.Tensor] = [ 444 | torch.as_tensor(data=sample[1].data.astype(dtype="int16")) for sample in samples 445 | ] 446 | img_tensors = torch.stack(tensors=tensors) 447 | 448 | metadata: list[dict] = [ 449 | { 450 | "bbox": sample[1].rio.bounds(), 451 | "crs": sample[1].rio.crs, 452 | "resolution": sample[1].rio.resolution(), 453 | } 454 | for sample in samples 455 | ] 456 | 457 | return box_tensors, img_tensors, metadata 458 | ``` 459 | 460 | ```{code-cell} 461 | dp_collate = dp_batch.collate(collate_fn=boximg_collate_fn) 462 | print(f"Number of mini-batches: {len(dp_collate)}") 463 | mini_batch_box, mini_batch_img, mini_batch_metadata = list(dp_collate)[1] 464 | print(f"Mini-batch image tensor shape: {mini_batch_img.shape}") 465 | print(f"Mini-batch box tensors: {mini_batch_box}") 466 | print(f"Mini-batch metadata: {mini_batch_metadata}") 467 | ``` 468 | 469 | The DataPipe is complete 🙌, let's visualize the entire data pipeline graph. 470 | 471 | ```{code-cell} 472 | torchdata.datapipes.utils.to_graph(dp=dp_collate) 473 | ``` 474 | 475 | ### Into a DataLoader 🏋️ 476 | 477 | Loop over the DataPipe using {py:class}`torch.utils.data.DataLoader` ⚙️! 478 | 479 | ```{code-cell} 480 | dataloader = torchdata.dataloader2.DataLoader2(datapipe=dp_collate) 481 | for i, batch in enumerate(dataloader): 482 | box, img, metadata = batch 483 | print(f"Batch {i} - img: {img.shape}, box sizes: {[len(b) for b in box]}") 484 | ``` 485 | 486 | There's probably hundreds of models you can 🍜 feed this data into, from 487 | mmdetection's {doc}`mmdetection:model_zoo` 🐼 to torchvision's 488 | {doc}`torchvision:models`). But are we out of the woods yet? 
489 | 490 | ### Georeference image boxes 📍 491 | 492 | To turn the model's predicted bounding boxes in image space 🌌 back to 493 | geographic coordinates 🌐, you'll need to use an 494 | [affine transform](https://web.archive.org/web/20210506173651/https://www.perrygeo.com/python-affine-transforms.html). 495 | Assuming you've kept your 🏷️ metadata intact, here's an example on how to do 496 | the georeferencing: 497 | 498 | ```{code-cell} 499 | for batch in dataloader: 500 | pred_boxes, images, metadata = batch 501 | 502 | objs: list = [] 503 | for idx in range(0, len(images)): 504 | left, bottom, right, top = metadata[idx]["bbox"] 505 | crs = metadata[idx]["crs"] 506 | x_res, y_res = metadata[idx]["resolution"] 507 | 508 | gdf = gpd.GeoDataFrame( 509 | geometry=[ 510 | shapely.affinity.affine_transform( 511 | geom=shapely.geometry.box(*coords), 512 | matrix=[x_res, 0, 0, y_res, left, top], 513 | ) 514 | for coords in pred_boxes[idx] 515 | ], 516 | crs=crs, 517 | ) 518 | objs.append(gdf.to_crs(crs=crs)) 519 | 520 | geodataframe: gpd.GeoDataFrame = pd.concat(objs=objs, ignore_index=True) 521 | geodataframe.set_crs(crs=crs, inplace=True) 522 | break 523 | 524 | geodataframe 525 | ``` 526 | 527 | Back at square one, or are we? 528 | -------------------------------------------------------------------------------- /docs/vector-segmentation-masks.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | --- 12 | 13 | # Vector segmentation masks 14 | 15 | > *Clouds float by, water flows on; 16 | > in movement there is no grasping, in Chan there is no settling* 17 | 18 | For 🧑‍🏫 supervised machine learning, labels 🏷️ are needed in addition to the 19 | input image 🖼️. 
Here, we'll step through an example workflow on matching vector 20 | 🚏 label data (points, lines, polygons) to 🛰️ Earth Observation data inputs. 21 | Specifically, this tutorial will cover: 22 | 23 | - Reading shapefiles 📁 directly from the web via {doc}`pyogrio ` 24 | - Rasterizing vector polygons from a {py:class}`geopandas.GeoDataFrame` to an {py:class}`xarray.DataArray` 25 | - Pairing 🛰️ satellite images with the rasterized label masks and feeding them into a DataLoader 26 | 27 | 28 | ## 🎉 **Getting started** 29 | 30 | These are the tools 🛠️ you'll need. 31 | 32 | ```{code-cell} 33 | import matplotlib.pyplot as plt 34 | import numpy as np 35 | import planetary_computer 36 | import pyogrio 37 | import pystac 38 | import torch 39 | import torchdata 40 | import xarray as xr 41 | import zen3geo 42 | ``` 43 | 44 | ## 0️⃣ Find cloud-hosted raster and vector data ⛳ 45 | 46 | In this case study, we'll look at the flood water extent over the Narathiwat Province 47 | in Thailand 🇹🇭 and the Northern Kelantan State in Malaysia 🇲🇾 on 04 Jan 2017 that were 48 | digitized by 🇺🇳 UNITAR-UNOSAT's rapid mapping service over Synthetic Aperture Radar 49 | (SAR) 🛰️ images. Specifically, we'll be using the 🇪🇺 Sentinel-1 Ground Range Detected 50 | (GRD) product's VV polarization channel. 51 | 52 | 🔗 Links: 53 | - https://www.unitar.org/maps 54 | - https://unitar.org/maps/all-maps 55 | - [Microsoft Planetary Computer STAC Explorer](https://planetarycomputer.microsoft.com/explore?c=102.7555%2C5.7222&z=7.92&v=2&d=sentinel-1-grd&m=cql%3Afdba821238c1a390e7c75d7ced805b2e&r=VV%2C+VH+False-color+composite&s=false%3A%3A100%3A%3Atrue&sr=desc&ae=0) 56 | 57 | To start, let's get the 🛰️ satellite scene we'll be using for this tutorial. 
58 | 59 | ```{code-cell} 60 | item_url = "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-1-grd/items/S1A_IW_GRDH_1SDV_20170104T225443_20170104T225512_014688_017E5D" 61 | 62 | # Load the individual item metadata and sign the assets 63 | item = pystac.Item.from_file(item_url) 64 | signed_item = planetary_computer.sign(item) 65 | signed_item 66 | ``` 67 | 68 | This is how the Sentinel-1 🩻 image looks like over Southern Thailand / Northern 69 | Peninsular Malaysia on 04 Jan 2017. 70 | 71 | ![Sentinel-1 GRD image over Southern Thailand and Northern Peninsular Malaysia on 20170104](https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=sentinel-1-grd&item=S1A_IW_GRDH_1SDV_20170104T225443_20170104T225512_014688_017E5D&assets=vv&assets=vh&expression=vv%3Bvh%3Bvv%2Fvh&rescale=0%2C600&rescale=0%2C270&rescale=0%2C9&asset_as_band=True&tile_format=png&format=png) 72 | 73 | ### Load and reproject image data 🔄 74 | 75 | To keep things simple, we'll load just the VV channel into a DataPipe via 76 | {py:class}`zen3geo.datapipes.RioXarrayReader` (functional name: 77 | `read_from_rioxarray`) 😀. 78 | 79 | ```{code-cell} 80 | url = signed_item.assets["vv"].href 81 | dp = torchdata.datapipes.iter.IterableWrapper(iterable=[url]) 82 | # Reading lower resolution grid using overview_level=3 83 | dp_rioxarray = dp.read_from_rioxarray(overview_level=3) 84 | dp_rioxarray 85 | ``` 86 | 87 | The Sentinel-1 image from Planetary Computer comes in longitude/latitude 🌐 88 | geographic coordinates by default (OGC:CRS84). To make the pixels more equal 🔲 89 | area, we can project it to a 🌏 local projected coordinate system instead. 90 | 91 | ```{code-cell} 92 | def reproject_to_local_utm(dataarray: xr.DataArray, resolution: float=80.0) -> xr.DataArray: 93 | """ 94 | Reproject an xarray.DataArray grid from OGC:CRS84 to a local UTM coordinate 95 | reference system. 
96 | """ 97 | # Estimate UTM coordinate reference from a single pixel 98 | pixel = dataarray.isel(y=slice(0, 1), x=slice(0,1)) 99 | new_crs = pixel.rio.reproject(dst_crs="OGC:CRS84").rio.estimate_utm_crs() 100 | 101 | return dataarray.rio.reproject(dst_crs=new_crs, resolution=resolution) 102 | ``` 103 | 104 | ```{code-cell} 105 | dp_reprojected = dp_rioxarray.map(fn=reproject_to_local_utm) 106 | ``` 107 | 108 | ```{note} 109 | Universal Transverse Mercator (UTM) isn't actually an equal-area projection 110 | system. However, Sentinel-1 🛰️ satellite scenes from Copernicus are usually 111 | distributed in a UTM coordinate reference system, and UTM is typically a close 112 | enough 🤏 approximation to the local geographic area, or at least it won't 113 | matter much when we're looking at spatial resolutions over several 10s of 114 | metres 🙂. 115 | ``` 116 | 117 | ```{hint} 118 | For those wondering what `OGC:CRS84` is, it is the longitude/latitude version 119 | of [`EPSG:4326`](https://epsg.io/4326) 🌐 (latitude/longitude). I.e., it's a 120 | matter of axis order, with `OGC:CRS84` being x/y and `EPSG:4326` being y/x. 121 | 122 | 🔖 References: 123 | - https://gis.stackexchange.com/questions/54073/what-is-crs84-projection 124 | - https://github.com/opengeospatial/geoparquet/issues/52 125 | ``` 126 | 127 | ### Transform and visualize raster data 🔎 128 | 129 | Let's visualize 👀 the Sentinel-1 image, but before that, we'll transform 🔄 130 | the VV data from linear to [decibel](https://en.wikipedia.org/wiki/Decibel) 131 | scale. 132 | 133 | ```{code-cell} 134 | def linear_to_decibel(dataarray: xr.DataArray) -> xr.DataArray: 135 | """ 136 | Transforming the input xarray.DataArray's VV or VH values from linear to 137 | decibel scale using the formula ``10 * log_10(x)``.
138 | """ 139 | # Mask out areas with 0 so that np.log10 is not undefined 140 | da_linear = dataarray.where(cond=dataarray != 0) 141 | da_decibel = 10 * np.log10(da_linear) 142 | return da_decibel 143 | ``` 144 | 145 | ```{code-cell} 146 | dp_decibel = dp_reprojected.map(fn=linear_to_decibel) 147 | dp_decibel 148 | ``` 149 | 150 | As an aside, we'll be using the Sentinel-1 image datapipe twice later, once as 151 | a template to create a blank canvas 🎞️, and another time by itself 🪞. This 152 | requires forking 🍴 the DataPipe into two branches, which can be achieved using 153 | {py:class}`torchdata.datapipes.iter.Forker` (functional name: `fork`). 154 | 155 | ```{code-cell} 156 | dp_decibel_canvas, dp_decibel_image = dp_decibel.fork(num_instances=2) 157 | dp_decibel_canvas, dp_decibel_image 158 | ``` 159 | 160 | Now to visualize the transformed Sentinel-1 image 🖼️. Let's zoom in 🔭 to one 161 | of the analysis extent areas we'll be working on later. 162 | 163 | ```{code-cell} 164 | it = iter(dp_decibel_image) 165 | dataarray = next(it) 166 | 167 | da_clip = dataarray.rio.clip_box(minx=125718, miny=523574, maxx=326665, maxy=722189) 168 | da_clip.isel(band=0).plot.imshow(figsize=(11.5, 9), cmap="Blues_r", vmin=18, vmax=26) 169 | ``` 170 | 171 | Notice how the darker blue areas 🔵 tend to correlate more with water features 172 | like the meandering rivers and the 🐚 sea on the NorthEast. This is because the 173 | SAR 🛰️ signal which is side looking reflects off flat water bodies like a 174 | mirror 🪞, with little energy getting reflected 🙅 back directly to the sensor 175 | (hence why it looks darker ⚫). 176 | 177 | ### Load and visualize cloud-hosted vector files 💠 178 | 179 | Let's now load some vector data from the web 🕸️. These are polygons of the 180 | segmented 🌊 water extent digitized by UNOSAT's AI Based Rapid Mapping Service. 181 | We'll be converting these vector polygons to 🌈 raster masks later. 
182 | 183 | 🔗 Links: 184 | - https://github.com/UNITAR-UNOSAT/UNOSAT-AI-Based-Rapid-Mapping-Service 185 | - [UNOSAT link to polygon dataset](https://unosat.org/products/2460) 186 | - [Disaster Risk Monitoring Using Satellite Imagery online course](https://courses.nvidia.com/courses/course-v1:DLI+S-ES-01+V1) 187 | 188 | ```{code-cell} 189 | # https://gdal.org/user/virtual_file_systems.html#vsizip-zip-archives 190 | shape_url = "/vsizip/vsicurl/https://web.archive.org/web/20240411214446/https://unosat.org/static/unosat_filesystem/2460/FL20170106THA_SHP.zip/ST20170104_SatelliteDetectedWaterAndSaturatedSoil.shp" 191 | ``` 192 | 193 | This is a shapefile containing 🔷 polygons of the mapped water extent. Let's 194 | put it into a DataPipe called {py:class}`zen3geo.datapipes.PyogrioReader` 195 | (functional name: ``read_from_pyogrio``). 196 | 197 | ```{code-cell} 198 | dp_shapes = torchdata.datapipes.iter.IterableWrapper(iterable=[shape_url]) 199 | dp_pyogrio = dp_shapes.read_from_pyogrio() 200 | dp_pyogrio 201 | ``` 202 | 203 | This will take care of loading the shapefile into a 204 | {py:class}`geopandas.GeoDataFrame` object. Let's take a look at the data table 205 | 📊 to see what attributes are inside. 206 | 207 | ```{code-cell} 208 | it = iter(dp_pyogrio) 209 | geodataframe = next(it) 210 | geodataframe.dropna(axis="columns") 211 | ``` 212 | 213 | Cool, and we can also visualize the polygons 🔷 on a 2D map. To align the 214 | coordinates with the 🛰️ Sentinel-1 image above, we'll first use 215 | {py:meth}`geopandas.GeoDataFrame.to_crs` to reproject the vector from 🌐 216 | EPSG:9707 (WGS 84 + EGM96 height, latitude/longitude) to 🌏 EPSG:32648 (UTM 217 | Zone 48N). 218 | 219 | ```{code-cell} 220 | print(f"Original bounds in EPSG:9707:\n{geodataframe.bounds}") 221 | gdf = geodataframe.to_crs(crs="EPSG:32648") 222 | print(f"New bounds in EPSG:32648:\n{gdf.bounds}") 223 | ``` 224 | 225 | Plot it with {py:meth}`geopandas.GeoDataFrame.plot`. 
This vector map 🗺️ should 226 | correspond to the zoomed in Sentinel-1 image plotted earlier above. 227 | 228 | ```{code-cell} 229 | gdf.plot(figsize=(11.5, 9)) 230 | ``` 231 | 232 | ```{tip} 233 | Make sure to understand your raster and vector datasets well first! Open the 234 | files up in your favourite 🌐 Geographic Information System (GIS) tool, see how 235 | they actually look like spatially. Then you'll have a better idea to decide on 236 | how to create your data pipeline. The zen3geo way puts you as the Master 🧙 in 237 | control. 238 | ``` 239 | 240 | 241 | ## 1️⃣ Create a canvas to paint on 🎨 242 | 243 | In this section, we'll work on converting the flood water 🌊 polygons above 244 | from a 🚩 vector to a 🌈 raster format, i.e. rasterization. This will be done 245 | in two steps 📶: 246 | 247 | 1. Defining a blank canvas 🎞️ 248 | 2. Paint the polygons onto this blank canvas 🧑‍🎨 249 | 250 | For this, we'll be using tools from {py:meth}`zen3geo.datapipes.datashader`. 251 | Let's see how this can be done. 252 | 253 | ### Blank canvas from template raster 🖼️ 254 | 255 | A canvas represents a 2D area with a height and a width 📏. For us, we'll be 256 | using a {py:class}`datashader.Canvas`, which also defines the range of y-values 257 | (ymin to ymax) and x-values (xmin to xmax), essentially coordinates for 258 | every unit 🇾 height and 🇽 width. 259 | 260 | Since we already have a Sentinel-1 🛰️ raster grid with defined height/width 261 | and y/x coordinates, let's use it as a 📄 template to define our canvas. This 262 | is done via {py:class}`zen3geo.datapipes.XarrayCanvas` (functional name: 263 | ``canvas_from_xarray``). 264 | 265 | ```{code-cell} 266 | dp_canvas = dp_decibel_canvas.canvas_from_xarray() 267 | dp_canvas 268 | ``` 269 | 270 | Cool, and here's a quick inspection 👀 of the canvas dimensions and metadata. 
271 | 272 | ```{code-cell} 273 | it = iter(dp_canvas) 274 | canvas = next(it) 275 | print(f"Canvas height: {canvas.plot_height}, width: {canvas.plot_width}") 276 | print(f"Y-range: {canvas.y_range}") 277 | print(f"X-range: {canvas.x_range}") 278 | print(f"Coordinate reference system: {canvas.crs}") 279 | ``` 280 | 281 | This information should match the template Sentinel-1 dataarray 🏁. 282 | 283 | ```{code-cell} 284 | print(f"Dimensions: {dict(dataarray.sizes)}") 285 | print(f"Affine transform: {dataarray.rio.transform()}") 286 | print(f"Bounding box: {dataarray.rio.bounds()}") 287 | print(f"Coordinate reference system: {dataarray.rio.crs}") 288 | ``` 289 | 290 | ### Rasterize vector polygons onto canvas 🖌️ 291 | 292 | Now's the time to paint or rasterize the 293 | vector {py:class}`geopandas.GeoDataFrame` polygons 🔷 onto the blank 294 | {py:class}`datashader.Canvas`! This would enable us to have a direct pixel-wise 295 | X -> Y mapping ↔️ between the Sentinel-1 image (X) and target flood label (Y). 296 | 297 | The vector polygons can be rasterized or painted 🖌️ onto the template canvas 298 | using {py:class}`zen3geo.datapipes.DatashaderRasterizer` (functional name: 299 | ``rasterize_with_datashader``). 300 | 301 | ```{code-cell} 302 | dp_datashader = dp_canvas.rasterize_with_datashader(vector_datapipe=dp_pyogrio) 303 | dp_datashader 304 | ``` 305 | 306 | This will turn the vector {py:class}`geopandas.GeoDataFrame` into a 307 | raster {py:class}`xarray.DataArray` grid, with the spatial coordinates and 308 | bounds matching exactly with the template Sentinel-1 image 😎. 309 | 310 | ```{note} 311 | Since we have just one Sentinel-1 🛰️ image and one raster 💧 flood 312 | mask, we have an easy 1:1 mapping. There are two other scenarios supported by 313 | {py:class}`zen3geo.datapipes.DatashaderRasterizer`: 314 | 315 | 1. N:1 - Many {py:class}`datashader.Canvas` objects to one vector 316 | {py:class}`geopandas.GeoDataFrame`. 
The single vector geodataframe will be 317 | broadcasted to match the length of the canvas list. This is useful for 318 | situations when you have a 🌐 'global' vector database that you want to pair 319 | with multiple 🛰️ satellite images. 320 | 2. N:N - Many {py:class}`datashader.Canvas` objects to many vector 321 | {py:class}`geopandas.GeoDataFrame` objects. In this case, the list of grids 322 | **must** ❗ have the same length as the list of vector geodataframes. E.g. 323 | if you have 5 grids, there must also be 5 vector files. This is so that a 324 | 1:1 pairing can be done, useful when each raster tile 🖽 has its own 325 | associated vector annotation. 326 | ``` 327 | 328 | ```{seealso} 329 | For more details on how rasterization of polygons work behind the scenes 🎦, 330 | check out {doc}`Datashader `'s documentation on: 331 | 332 | - {doc}`The datashader pipeline ` 333 | (especially the section on Aggregation). 334 | - {doc}`Rendering large collections of polygons ` 335 | ``` 336 | 337 | 338 | ## 2️⃣ Combine and conquer ⚔️ 339 | 340 | So far, we've got two datapipes that should be 🧑‍🤝‍🧑 paired up in an X -> Y 341 | manner: 342 | 343 | 1. The pre-processed Sentinel-1 🌈 raster image in ``dp_decibel_image`` 344 | 2. The rasterized 💧 flood segmentation masks in ``dp_datashader`` 345 | 346 | One way to get these two pieces in a Machine Learning ready chip format is via 347 | a stack, slice and split ™️ approach. Think of it like a sandwich 🥪, we first 348 | stack the bread 🍞 and lettuce 🥬, and then slice the pieces 🍕 through the 349 | layers once. Ok, that was a bad analogy, let's just stick with tensors 🤪. 350 | 351 | ### Stacking the raster layers 🥞 352 | 353 | Each of our 🌈 raster inputs are {py:class}`xarray.DataArray` objects with the 354 | same spatial resolution and extent 🪟, so these can be stacked into an 355 | {py:class}`xarray.Dataset` with multiple data variables. 
First, we'll zip 🤐 356 | the two datapipes together using {py:class}`torchdata.datapipes.iter.Zipper` 357 | (functional name: ``zip``) 358 | 359 | ```{code-cell} 360 | dp_zip = dp_decibel_image.zip(dp_datashader) 361 | dp_zip 362 | ``` 363 | 364 | This will result in a DataPipe where each item is a tuple of (X, Y) pairs 🧑‍🤝‍🧑. 365 | Just to illustrate what we've done so far, we can use 366 | {py:class}`torchdata.datapipes.utils.to_graph` to visualize the data pipeline 367 | ⛓️. 368 | 369 | ```{code-cell} 370 | torchdata.datapipes.utils.to_graph(dp=dp_zip) 371 | ``` 372 | 373 | Next, let's combine 🖇️ the two (X, Y) {py:class}`xarray.DataArray` objects in 374 | the tuple into an {py:class}`xarray.Dataset` using 375 | {py:class}`torchdata.datapipes.iter.Collator` (functional name: `collate`). 376 | We'll also ✂️ clip the dataset to a bounding box area where the target water 377 | mask has no 0 or NaN values. 378 | 379 | ```{code-cell} 380 | def xr_collate_fn(image_and_mask: tuple) -> xr.Dataset: 381 | """ 382 | Combine a pair of xarray.DataArray (image, mask) inputs into an 383 | xarray.Dataset with two data variables named 'image' and 'mask'. 384 | """ 385 | # Turn 2 xr.DataArray objects into 1 xr.Dataset with multiple data vars 386 | image, mask = image_and_mask 387 | dataset: xr.Dataset = xr.merge( 388 | objects=[image.isel(band=0).rename("image"), mask.rename("mask")], 389 | join="override", 390 | ) 391 | 392 | # Clip dataset to bounding box extent of where labels are 393 | mask_extent: tuple = mask.where(cond=mask == 1, drop=True).rio.bounds() 394 | clipped_dataset: xr.Dataset = dataset.rio.clip_box(*mask_extent) 395 | 396 | return clipped_dataset 397 | ``` 398 | 399 | ```{code-cell} 400 | dp_dataset = dp_zip.collate(collate_fn=xr_collate_fn) 401 | dp_dataset 402 | ``` 403 | 404 | Double check to see that resulting {py:class}`xarray.Dataset`'s image and mask 405 | looks ok 🙆‍♂️. 
406 | 407 | ```{code-cell} 408 | it = iter(dp_dataset) 409 | dataset = next(it) 410 | 411 | # Create subplot with VV image on the left and Water mask on the right 412 | fig, axs = plt.subplots(ncols=2, figsize=(11.5, 4.5), sharey=True) 413 | dataset.image.plot.imshow(ax=axs[0], cmap="Blues_r") 414 | axs[0].set_title("Sentinel-1 VV channel") 415 | dataset.mask.plot.imshow(ax=axs[1], cmap="Blues") 416 | axs[1].set_title("Water mask") 417 | plt.show() 418 | ``` 419 | 420 | ### Slice into chips and turn into tensors 🗡️ 421 | 422 | To cut 🔪 the {py:class}`xarray.Dataset` into 512x512 sized chips, we'll use 423 | {py:class}`zen3geo.datapipes.XbatcherSlicer` (functional name: 424 | `slice_with_xbatcher`). Refer to {doc}`./chipping` if you need a 🧑‍🎓 refresher. 425 | 426 | ```{code-cell} 427 | dp_xbatcher = dp_dataset.slice_with_xbatcher(input_dims={"y": 512, "x": 512}) 428 | dp_xbatcher 429 | ``` 430 | 431 | Next step is to convert the 512x512 chips into a {py:class}`torch.Tensor` via 432 | {py:class}`torchdata.datapipes.iter.Mapper` (functional name: `map`). The 🛰️ 433 | Sentinel-1 image and 💧 water mask will be split out at this point too. 434 | 435 | ```{code-cell} 436 | def dataset_to_tensors(chip: xr.Dataset) -> (torch.Tensor, torch.Tensor): 437 | """ 438 | Converts an xarray.Dataset into to two torch.Tensor objects, the first one 439 | being the satellite image, and the second one being the target mask. 440 | """ 441 | image: torch.Tensor = torch.as_tensor(chip.image.data) 442 | mask: torch.Tensor = torch.as_tensor(chip.mask.data.astype("uint8")) 443 | 444 | return image, mask 445 | ``` 446 | 447 | ```{code-cell} 448 | dp_map = dp_xbatcher.map(fn=dataset_to_tensors) 449 | dp_map 450 | ``` 451 | 452 | At this point, we could do some batching and collating, but we'll point you 453 | again to {doc}`./chipping` to figure it out 😝. Let's take a look at a graph 454 | of the complete data pipeline. 
455 | 456 | ```{code-cell} 457 | torchdata.datapipes.utils.to_graph(dp=dp_map) 458 | ``` 459 | 460 | Sweet, time for the final step ⏩. 461 | 462 | ### Into a DataLoader 🏋️ 463 | 464 | Pass the DataPipe into {py:class}`torch.utils.data.DataLoader` 🤾! 465 | 466 | ```{code-cell} 467 | dataloader = torch.utils.data.DataLoader(dataset=dp_map) 468 | for i, batch in enumerate(dataloader): 469 | image, mask = batch 470 | print(f"Batch {i} - image: {image.shape}, mask: {mask.shape}") 471 | ``` 472 | 473 | Now go train some flood water detection models 🌊🌊🌊 474 | 475 | ```{seealso} 476 | To learn more about AI-based flood mapping with SAR, check out these resources: 477 | 478 | - [UNOSAT/NVIDIA Disaster Risk Monitoring Using Satellite Imagery online course](https://event.unitar.org/full-catalog/disaster-risk-monitoring-using-satellite-imagery) 479 | - [Code to train a Convolutional Neural Network for flood segmentation](https://github.com/UNITAR-UNOSAT/UNOSAT-AI-Based-Rapid-Mapping-Service/blob/master/Fastai%20training.ipynb) 480 | ``` 481 | -------------------------------------------------------------------------------- /docs/walkthrough.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | --- 12 | 13 | # Walkthrough 14 | 15 | > *To get it, you first see it, and then let it go* 16 | 17 | In this tutorial 🧑‍🏫, we'll step through an Earth Observation 🛰️ data pipeline 18 | using ``torchdata`` and by the end of this lesson, you should be able to: 19 | - Find Cloud-Optimized GeoTIFFs (COGs) from STAC catalogs 🥞 20 | - Construct a DataPipe that iteratively reads several COGs in a stream 🌊 21 | - Loop through batches of images in a DataPipe with a DataLoader 🏋️ 22 | 23 | ## 🎉 **Getting started** 24 | 25 | These are the tools 🛠️ you'll need. 
26 | 27 | ```{code-cell} 28 | # Geospatial libraries 29 | import pystac 30 | import planetary_computer 31 | import rioxarray 32 | # Deep Learning libraries 33 | import torch 34 | import torchdata 35 | import zen3geo 36 | ``` 37 | 38 | Just to make sure we’re on the same page 📃, 39 | let’s check that we’ve got compatible versions installed. 40 | 41 | ```{code-cell} 42 | print(f"pystac version: {pystac.__version__}") 43 | print(f"planetary-computer version: {planetary_computer.__version__}") 44 | print(f"torch version: {torch.__version__}") 45 | 46 | print(f"torchdata version: {torchdata.__version__}") 47 | print(f"zen3geo version: {zen3geo.__version__}") 48 | rioxarray.show_versions() 49 | ``` 50 | 51 | ## 0️⃣ Find [Cloud-Optimized GeoTIFFs](https://www.cogeo.org) 🗺️ 52 | 53 | Let's get some optical satellite data using [STAC](https://stacspec.org)! 54 | How about Sentinel-2 L2A data over Singapore 🇸🇬? 55 | 56 | 🔗 Links: 57 | - [Official Sentinel-2 description page at ESA](https://sentinel.esa.int/web/sentinel/missions/sentinel-2) 58 | - [Microsoft Planetary Computer STAC Explorer](https://planetarycomputer.microsoft.com/explore?c=103.8152%2C1.3338&z=10.08&v=2&d=sentinel-2-l2a&s=false%3A%3A100%3A%3Atrue&ae=0&m=cql%3A2ff1401acb50731fa0a6d1e2a46f3064&r=Natural+color) 59 | - [AWS Sentinel-2 Cloud-Optimized GeoTIFFs](https://registry.opendata.aws/sentinel-2-l2a-cogs) 60 | 61 | 62 | ```{code-cell} 63 | item_url = "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items/S2A_MSIL2A_20220115T032101_R118_T48NUG_20220115T170435" 64 | 65 | # Load the individual item metadata and sign the assets 66 | item = pystac.Item.from_file(item_url) 67 | signed_item = planetary_computer.sign(item) 68 | signed_item 69 | ``` 70 | 71 | ### Inspect one of the data assets 🍱 72 | 73 | The Sentinel-2 STAC item contains several assets. 74 | These include different 🌈 bands (e.g. 'B02', 'B03', 'B04'). 
75 | Let's just use the 'visual' product for now which includes the RGB bands. 76 | 77 | ```{code-cell} 78 | url: str = signed_item.assets["visual"].href 79 | da = rioxarray.open_rasterio(filename=url) 80 | da 81 | ``` 82 | 83 | This is how the Sentinel-2 image looks like over Singapore on 15 Jan 2022. 84 | 85 | ![Sentinel-2 L2A image over Singapore on 20220115](https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=sentinel-2-l2a&item=S2A_MSIL2A_20220115T032101_R118_T48NUG_20220115T170435&assets=visual&asset_bidx=visual%7C1%2C2%2C3&nodata=0&format=png) 86 | 87 | ## 1️⃣ Construct [DataPipe](https://github.com/pytorch/data/tree/v0.6.1#what-are-datapipes) 📡 88 | 89 | A torch `DataPipe` is a way of composing data (rather than inheriting data). 90 | Yes, I don't know what it really means either, so here's some extra reading. 91 | 92 | 🔖 References: 93 | - https://pytorch.org/blog/pytorch-1.11-released/#introducing-torchdata 94 | - https://github.com/pytorch/data/tree/v0.6.1#what-are-datapipes 95 | - https://realpython.com/inheritance-composition-python 96 | 97 | ### Create an Iterable 📏 98 | 99 | Start by wrapping a list of URLs to the Cloud-Optimized GeoTIFF files. 100 | We only have 1 item so we'll use ``[url]``, but if you have more, you can do 101 | ``[url1, url2, url3]``, etc. Pass this iterable list into 102 | {py:class}`torchdata.datapipes.iter.IterableWrapper`: 103 | 104 | ```{code-cell} 105 | dp = torchdata.datapipes.iter.IterableWrapper(iterable=[url]) 106 | dp 107 | ``` 108 | 109 | The ``dp`` variable is the DataPipe! 110 | Now to apply some more transformations/functions on it. 111 | 112 | ### Read using RioXarrayReader 🌐 113 | 114 | This is where ☯ ``zen3geo`` comes in. We'll be using the 115 | {py:class}`zen3geo.datapipes.rioxarray.RioXarrayReaderIterDataPipe` class, or 116 | rather, the short alias {py:class}`zen3geo.datapipes.RioXarrayReader`. 
117 | 118 | Confusingly, there are two ways or forms of applying ``RioXarrayReader``, 119 | a class-based method and a functional method. 120 | 121 | ```{code-cell} 122 | # Using class constructors 123 | dp_rioxarray = zen3geo.datapipes.RioXarrayReader(source_datapipe=dp) 124 | dp_rioxarray 125 | ``` 126 | 127 | ```{code-cell} 128 | # Using functional form (recommended) 129 | dp_rioxarray = dp.read_from_rioxarray() 130 | dp_rioxarray 131 | ``` 132 | 133 | Note that both ways are equivalent (they produce the same IterDataPipe output), 134 | but the latter (functional) form is preferred, see also 135 | https://pytorch.org/data/0.4/tutorial.html#registering-datapipes-with-the-functional-api 136 | 137 | What if you don't want the whole Sentinel-2 scene at the full 10m resolution? 138 | Since we're using Cloud-Optimized GeoTIFFs, you could set an ``overview_level`` 139 | (following https://corteva.github.io/rioxarray/stable/examples/COG.html). 140 | 141 | ```{code-cell} 142 | dp_rioxarray_zoom3 = dp.read_from_rioxarray(overview_level=3) 143 | dp_rioxarray_zoom3 144 | ``` 145 | 146 | Extra keyword arguments will be handled by {py:func}`rioxarray.open_rasterio` 147 | or {py:func}`rasterio.open`. 148 | 149 | ```{note} 150 | Other DataPipe classes/functions can be stacked or joined to this basic GeoTIFF 151 | reader. For example, clipping by bounding box or reprojecting to a certain 152 | Coordinate Reference System. If you would like to implement this, check out the 153 | [Contributing Guidelines](./CONTRIBUTING) to get started! 154 | ``` 155 | 156 | ## 2️⃣ Loop through DataPipe ⚙️ 157 | 158 | A DataPipe describes a flow of information. 159 | Through a series of steps it goes, 160 | as one piece comes in, another might follow. 
161 | 162 | ### Basic iteration ♻️ 163 | 164 | At the most basic level, you could iterate through the DataPipe like so: 165 | 166 | ```{code-cell} 167 | it = iter(dp_rioxarray_zoom3) 168 | dataarray = next(it) 169 | dataarray 170 | ``` 171 | 172 | Or if you're more familiar with a for-loop, here it is: 173 | 174 | ```{code-cell} 175 | for dataarray in dp_rioxarray_zoom3: 176 | print(dataarray) 177 | # Run model on this data batch 178 | ``` 179 | 180 | ### Into a DataLoader 🏋️ 181 | 182 | For the deep learning folks, you might need one extra step. 183 | The {py:class}``xarray.DataArray`` needs to be converted to a tensor. 184 | In the Pytorch world, that can happen via {py:func}``torch.as_tensor``. 185 | 186 | ```{code-cell} 187 | def fn(da): 188 | return torch.as_tensor(da.data) 189 | ``` 190 | 191 | Using {py:class}`torchdata.datapipes.iter.Mapper` (functional name: `map`), 192 | we'll apply the tensor conversion function to each dataarray in the DataPipe. 193 | 194 | ```{code-cell} 195 | dp_tensor = dp_rioxarray_zoom3.map(fn=fn) 196 | dp_tensor 197 | ``` 198 | 199 | Finally, let's put our DataPipe into a {py:class}`torch.utils.data.DataLoader`! 200 | 201 | ```{code-cell} 202 | dataloader = torch.utils.data.DataLoader(dataset=dp_tensor) 203 | for batch in dataloader: 204 | tensor = batch 205 | print(tensor) 206 | ``` 207 | 208 | And so it begins 🌄 209 | 210 | --- 211 | 212 | That’s all 🎉! For more information on how to use DataPipes, check out: 213 | 214 | - {doc}`TorchData DataPipe Tutorial ` 215 | - {doc}`TorchData Usage Examples ` 216 | 217 | If you have any questions 🙋, feel free to ask us anything at 218 | https://github.com/weiji14/zen3geo/discussions or visit the Pytorch forums at 219 | https://discuss.pytorch.org/c/data/37. 220 | 221 | Cheers! 
222 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "zen3geo" 3 | version = "0.6.2" 4 | description = "The 🌏 data science library you've been waiting for~" 5 | authors = ["Wei Ji <23487320+weiji14@users.noreply.github.com>"] 6 | license = "LGPL-3.0-or-later" 7 | readme = "README.md" 8 | classifiers = [ 9 | "Development Status :: 4 - Beta", 10 | "Intended Audience :: Science/Research", 11 | "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)", 12 | "Topic :: Scientific/Engineering", 13 | "Topic :: Scientific/Engineering :: GIS", 14 | "Topic :: Scientific/Engineering :: Image Processing", 15 | "Topic :: Software Development :: Libraries", 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | ] 21 | exclude = ["zen3geo/tests"] 22 | 23 | [tool.poetry.urls] 24 | "Homepage" = "https://github.com/weiji14/zen3geo/discussions" 25 | "Changelog" = "https://zen3geo.readthedocs.io/en/latest/changelog.html" 26 | "Documentation" = "https://zen3geo.readthedocs.io" 27 | "Download" = "https://anaconda.org/conda-forge/zen3geo" 28 | "Source Code" = "https://github.com/weiji14/zen3geo" 29 | "Sponsor" = "https://github.com/sponsors/weiji14" 30 | 31 | [tool.poetry.dependencies] 32 | # Required 33 | python = ">=3.8, <4.0" 34 | rioxarray = ">=0.10.0" 35 | torchdata = ">=0.4.0" 36 | # Optional 37 | datashader = {version = ">=0.14.0", optional = true} 38 | pyogrio = {version = ">=0.4.0", extras = ["geopandas"], optional = true} 39 | pystac = {version=">=1.4.0", optional=true} 40 | pystac-client = {version = ">=0.4.0", optional = true} 41 | spatialpandas = {version = ">=0.4.0", optional = true} 42 | stackstac = {version = ">=0.4.0", optional = true} 43 | xbatcher = 
{version = ">=0.2.0", optional = true} 44 | xpystac = {version = ">=0.0.1", optional = true} 45 | zarr = {version = ">=2.13.0", optional = true} 46 | # Docs 47 | adlfs = {version = "*", optional = true} 48 | contextily = {version = "*", optional = true} 49 | graphviz = {version = "*", optional = true} 50 | jupyter-book = {version="*", optional=true} 51 | matplotlib = {version = "*", optional = true} 52 | planetary-computer = {version="*", optional=true} 53 | xarray-datatree = {version="*", optional=true} 54 | 55 | [tool.poetry.group.dev.dependencies] 56 | aiohttp = "*" 57 | black = "*" 58 | pytest = "*" 59 | 60 | [tool.poetry.extras] 61 | docs = [ 62 | "adlfs", 63 | "contextily", 64 | "datashader", 65 | "graphviz", 66 | "jupyter-book", 67 | "matplotlib", 68 | "planetary-computer", 69 | "pyogrio", 70 | "pystac", 71 | "pystac_client", 72 | "spatialpandas", 73 | "stackstac", 74 | "xarray-datatree", 75 | "xbatcher", 76 | "xpystac", 77 | "zarr" 78 | ] 79 | raster = [ 80 | "xbatcher", 81 | "zarr" 82 | ] 83 | spatial = [ 84 | "datashader", 85 | "spatialpandas" 86 | ] 87 | stac = [ 88 | "pystac", 89 | "pystac_client", 90 | "stackstac", 91 | "xpystac" 92 | ] 93 | vector = ["pyogrio"] 94 | 95 | [tool.poetry-dynamic-versioning] 96 | bump = true 97 | enable = true 98 | metadata = true 99 | style = "pep440" 100 | 101 | [build-system] 102 | requires = ["poetry-core>=1.7.0", "poetry-dynamic-versioning"] 103 | build-backend = "poetry.core.masonry.api" 104 | -------------------------------------------------------------------------------- /zen3geo/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | zen3geo - The 🌏 data science library you've been waiting for~ 3 | """ 4 | 5 | from importlib.metadata import version 6 | 7 | from zen3geo import datapipes 8 | 9 | __version__ = version("zen3geo") # e.g. 
0.1.2.dev3+g0ab3cd78 10 | -------------------------------------------------------------------------------- /zen3geo/datapipes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iterable-style DataPipes for geospatial raster 🌈 and vector 🚏 data. 3 | """ 4 | 5 | from zen3geo.datapipes.datashader import ( 6 | DatashaderRasterizerIterDataPipe as DatashaderRasterizer, 7 | XarrayCanvasIterDataPipe as XarrayCanvas, 8 | ) 9 | from zen3geo.datapipes.geopandas import ( 10 | GeoPandasRectangleClipperIterDataPipe as GeoPandasRectangleClipper, 11 | ) 12 | from zen3geo.datapipes.pyogrio import PyogrioReaderIterDataPipe as PyogrioReader 13 | from zen3geo.datapipes.pystac import PySTACItemReaderIterDataPipe as PySTACItemReader 14 | from zen3geo.datapipes.pystac_client import ( 15 | PySTACAPIItemListerIterDataPipe as PySTACAPIItemLister, 16 | PySTACAPISearcherIterDataPipe as PySTACAPISearcher, 17 | ) 18 | from zen3geo.datapipes.rioxarray import RioXarrayReaderIterDataPipe as RioXarrayReader 19 | from zen3geo.datapipes.stackstac import ( 20 | StackSTACMosaickerIterDataPipe as StackSTACMosaicker, 21 | StackSTACStackerIterDataPipe as StackSTACStacker, 22 | ) 23 | from zen3geo.datapipes.xbatcher import XbatcherSlicerIterDataPipe as XbatcherSlicer 24 | from zen3geo.datapipes.xpystac import ( 25 | XpySTACAssetReaderIterDataPipe as XpySTACAssetReader, 26 | ) 27 | -------------------------------------------------------------------------------- /zen3geo/datapipes/datashader.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPipes for :doc:`datashader `. 
@functional_datapipe("rasterize_with_datashader")
class DatashaderRasterizerIterDataPipe(IterDataPipe):
    """
    Takes vector :py:class:`geopandas.GeoSeries` or
    :py:class:`geopandas.GeoDataFrame` geometries and rasterizes them using
    :py:class:`datashader.Canvas` to yield an :py:class:`xarray.DataArray`
    raster with the input geometries aggregated into a fixed-sized grid
    (functional name: ``rasterize_with_datashader``).

    Parameters
    ----------
    source_datapipe : IterDataPipe[datashader.Canvas]
        A DataPipe that contains :py:class:`datashader.Canvas` objects with a
        ``.crs`` attribute. This will be the template defining the output
        raster's spatial extent and x/y range.

    vector_datapipe : IterDataPipe[geopandas.GeoDataFrame]
        A DataPipe that contains :py:class:`geopandas.GeoSeries` or
        :py:class:`geopandas.GeoDataFrame` vector geometries with a
        :py:attr:`.crs <geopandas.GeoDataFrame.crs>` property.

    agg : Optional[datashader.reductions.Reduction]
        Reduction operation to compute. Default depends on the input vector
        type:

        - For points, default is :py:class:`datashader.reductions.count`
        - For lines, default is :py:class:`datashader.reductions.any`
        - For polygons, default is :py:class:`datashader.reductions.any`

        For more information, refer to the section on Aggregation under
        datashader's :doc:`datashader:getting_started/Pipeline` docs.

    kwargs : Optional
        Extra keyword arguments to pass to the :py:class:`datashader.Canvas`
        class's aggregation methods such as ``datashader.Canvas.points``.

    Yields
    ------
    raster : xarray.DataArray
        An :py:class:`xarray.DataArray` object containing the raster data. This
        raster will have a :py:attr:`rioxarray.rioxarray.XRasterBase.crs`
        property and a proper affine transform viewable with
        :py:meth:`rioxarray.rioxarray.XRasterBase.transform`.

    Raises
    ------
    ModuleNotFoundError
        If ``spatialpandas`` is not installed. Please install it (e.g. via
        ``pip install spatialpandas``) before using this class.

    ValueError
        If either the length of the ``vector_datapipe`` is not 1, or if the
        length of the ``vector_datapipe`` is not equal to the length of the
        ``source_datapipe``. I.e. the ratio of vector:canvas must be 1:N or
        be exactly N:N.

    AttributeError
        If either the canvas in ``source_datapipe`` or vector geometry in
        ``vector_datapipe`` is missing a ``.crs`` attribute. Please set the
        coordinate reference system (e.g. using ``canvas.crs = 'OGC:CRS84'``
        for the :py:class:`datashader.Canvas` input or
        ``vector = vector.set_crs(crs='OGC:CRS84')`` for the
        :py:class:`geopandas.GeoSeries` or :py:class:`geopandas.GeoDataFrame`
        input) before passing them into the datapipe.

    NotImplementedError
        If the input vector geometry type to ``vector_datapipe`` is not
        supported, typically when a
        :py:class:`shapely.geometry.GeometryCollection` is used. Supported
        types include `Point`, `LineString`, and `Polygon`, plus their
        multipart equivalents `MultiPoint`, `MultiLineString`, and
        `MultiPolygon`.
    """

    def __init__(
        self,
        source_datapipe: IterDataPipe,
        vector_datapipe: IterDataPipe,
        agg: Optional[Any] = None,
        **kwargs: Optional[Dict[str, Any]],
    ) -> None:
        if spatialpandas is None:
            raise ModuleNotFoundError(
                "Package `spatialpandas` is required to be installed to use this datapipe. "
                "Please use `pip install spatialpandas` or "
                "`conda install -c conda-forge spatialpandas` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe = source_datapipe  # datashader.Canvas
        self.vector_datapipe: IterDataPipe = vector_datapipe  # geopandas.GeoDataFrame
        # Datashader Aggregation/Reduction function (bare `Optional` replaced
        # with `Optional[Any]`, which is what the annotation actually means)
        self.agg: Optional[Any] = agg
        self.kwargs = kwargs

        # Fail fast if lengths cannot broadcast (vector:canvas must be 1:N or N:N)
        len_vector_datapipe: int = len(self.vector_datapipe)
        len_canvas_datapipe: int = len(self.source_datapipe)
        if len_vector_datapipe != 1 and len_vector_datapipe != len_canvas_datapipe:
            raise ValueError(
                f"Unmatched lengths for the canvas datapipe ({self.source_datapipe}) "
                f"and vector datapipe ({self.vector_datapipe}). \n"
                f"The vector datapipe's length ({len_vector_datapipe}) should either "
                f"be (1) to allow for broadcasting, or match the canvas datapipe's "
                f"length of ({len_canvas_datapipe})."
            )

    def __iter__(self) -> Iterator[xr.DataArray]:
        # Broadcast vector iterator to match length of raster iterator; when
        # the vector datapipe holds a single element, that element is reused
        # as the fill value for every extra canvas.
        fill_vector = list(self.vector_datapipe).pop()
        for canvas, vector in self.source_datapipe.zip_longest(
            self.vector_datapipe, fill_value=fill_vector
        ):
            # If canvas has no CRS attribute, set one to prevent AttributeError
            canvas.crs = getattr(canvas, "crs", None)
            if canvas.crs is None:
                raise AttributeError(
                    "Missing crs information for datashader.Canvas with "
                    f"x_range: {canvas.x_range} and y_range: {canvas.y_range}. "
                    "Please set crs using e.g. `canvas.crs = 'OGC:CRS84'`."
                )

            # Reproject vector geometries to coordinate reference system
            # of the raster canvas if both are different
            try:
                if vector.crs != canvas.crs:
                    vector = vector.to_crs(crs=canvas.crs)
            except (AttributeError, ValueError) as e:
                raise AttributeError(
                    f"Missing crs information for input {vector.__class__} object "
                    f"with the following bounds: \n {vector.bounds} \n"
                    f"Please set crs using e.g. `vector = vector.set_crs(crs='OGC:CRS84')`."
                ) from e

            # Convert vector to spatialpandas format to allow datashader's
            # rasterization methods to work
            try:
                _vector = spatialpandas.GeoDataFrame(data=vector.geometry)
            except ValueError as e:
                if str(e) == "Unable to convert data argument to a GeometryList array":
                    raise NotImplementedError(
                        f"Unsupported geometry type(s) {set(vector.geom_type)} detected, "
                        "only point, line or polygon vector geometry types "
                        "(or their multi- equivalents) are supported."
                    ) from e
                else:
                    raise e

            # Determine geometry type to know which rasterization method to use
            vector_dtype: spatialpandas.geometry.GeometryDtype = _vector.geometry.dtype

            if isinstance(vector_dtype, (PointDtype, MultiPointDtype)):
                raster: xr.DataArray = canvas.points(
                    source=_vector, agg=self.agg, geometry="geometry", **self.kwargs
                )
            elif isinstance(vector_dtype, (LineDtype, MultiLineDtype)):
                raster: xr.DataArray = canvas.line(
                    source=_vector, agg=self.agg, geometry="geometry", **self.kwargs
                )
            elif isinstance(vector_dtype, (PolygonDtype, MultiPolygonDtype)):
                raster: xr.DataArray = canvas.polygons(
                    source=_vector, agg=self.agg, geometry="geometry", **self.kwargs
                )
            else:
                # Guard against `raster` being unbound (NameError) should
                # spatialpandas ever yield a geometry dtype not handled above;
                # raise the documented NotImplementedError instead.
                raise NotImplementedError(
                    f"Unsupported geometry dtype {vector_dtype} detected, "
                    "only point, line or polygon vector geometry types "
                    "(or their multi- equivalents) are supported."
                )

            # Convert boolean dtype rasters to uint8 to enable reprojection
            if raster.dtype == "bool":
                raster: xr.DataArray = raster.astype(dtype="uint8")
            # Set coordinate transform for raster and ensure affine
            # transform is correct (the y-coordinate goes from North to South)
            raster: xr.DataArray = raster.rio.set_crs(input_crs=canvas.crs)
            # assert raster.rio.transform().e > 0  # y goes South to North
            _raster: xr.DataArray = raster.rio.reproject(
                dst_crs=canvas.crs, shape=raster.rio.shape
            )
            # assert _raster.rio.transform().e < 0  # y goes North to South

            yield _raster

    def __len__(self) -> int:
        return len(self.source_datapipe)
@functional_datapipe("canvas_from_xarray")
class XarrayCanvasIterDataPipe(IterDataPipe[Union[xr.DataArray, xr.Dataset]]):
    """
    Takes an :py:class:`xarray.DataArray` or :py:class:`xarray.Dataset`
    and creates a blank :py:class:`datashader.Canvas` based on the spatial
    extent and coordinates of the input (functional name:
    ``canvas_from_xarray``).

    Parameters
    ----------
    source_datapipe : IterDataPipe[xarray.DataArray]
        A DataPipe that contains :py:class:`xarray.DataArray` or
        :py:class:`xarray.Dataset` objects. These data objects need to have
        both a ``.rio.x_dim`` and ``.rio.y_dim`` attribute, which is present
        if the original dataset was opened using
        :py:func:`rioxarray.open_rasterio`, or by setting it manually using
        :py:meth:`rioxarray.rioxarray.XRasterBase.set_spatial_dims`.

    kwargs : Optional
        Extra keyword arguments to pass to :py:class:`datashader.Canvas`.

    Yields
    ------
    canvas : datashader.Canvas
        A :py:class:`datashader.Canvas` object covering the same spatial
        extent and x/y coordinates as the input raster grid. The canvas also
        carries a ``.crs`` attribute copied from the input xarray object's
        :py:attr:`rioxarray.rioxarray.XRasterBase.crs` property.

    Raises
    ------
    ModuleNotFoundError
        If ``datashader`` is not installed. Follow
        :doc:`install instructions for datashader <datashader:getting_started/index>`
        before using this class.
    """

    def __init__(
        self,
        source_datapipe: IterDataPipe[Union[xr.DataArray, xr.Dataset]],
        **kwargs: Optional[Dict[str, Any]],
    ) -> None:
        if datashader is None:
            raise ModuleNotFoundError(
                "Package `datashader` is required to be installed to use this datapipe. "
                "Please use `pip install datashader` or "
                "`conda install -c conda-forge datashader` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe[
            Union[xr.DataArray, xr.Dataset]
        ] = source_datapipe
        self.kwargs = kwargs

    def __iter__(self) -> Iterator:
        for grid in self.source_datapipe:
            # Canvas pixel dimensions mirror the raster grid's x/y sizes
            width: int = len(grid[grid.rio.x_dim])
            height: int = len(grid[grid.rio.y_dim])
            left, bottom, right, top = grid.rio.bounds()

            blank_canvas = datashader.Canvas(
                plot_width=width,
                plot_height=height,
                x_range=(left, right),
                y_range=(bottom, top),
                **self.kwargs,
            )
            # Carry the CRS over so downstream rasterizers can georeference
            blank_canvas.crs = grid.rio.crs
            yield blank_canvas

    def __len__(self) -> int:
        return len(self.source_datapipe)
@functional_datapipe("clip_vector_with_rectangle")
class GeoPandasRectangleClipperIterDataPipe(IterDataPipe):
    """
    Takes vector :py:class:`geopandas.GeoSeries` or
    :py:class:`geopandas.GeoDataFrame` geometries and clips them with the
    rectangular extent of an :py:class:`xarray.DataArray` or
    :py:class:`xarray.Dataset` grid to yield tuples of spatially subsetted
    :py:class:`geopandas.GeoSeries` or :py:class:`geopandas.GeoDataFrame`
    vectors and the corresponding :py:class:`xarray.DataArray` or
    :py:class:`xarray.Dataset` raster object used as the clip mask (functional
    name: ``clip_vector_with_rectangle``).

    Uses the rectangular clip algorithm of :py:func:`geopandas.clip`, with the
    bounding box rectangle (minx, miny, maxx, maxy) derived from input raster
    mask's bounding box extent.

    Note
    ----
    If the input vector's coordinate reference system (``crs``) is different to
    the raster mask's coordinate reference system (``rio.crs``), the vector
    will be reprojected using :py:meth:`geopandas.GeoDataFrame.to_crs` to match
    the raster's coordinate reference system.

    Parameters
    ----------
    source_datapipe : IterDataPipe[geopandas.GeoDataFrame]
        A DataPipe that contains :py:class:`geopandas.GeoSeries` or
        :py:class:`geopandas.GeoDataFrame` vector geometries with a
        :py:attr:`.crs <geopandas.GeoDataFrame.crs>` property.

    mask_datapipe : IterDataPipe[xarray.DataArray]
        A DataPipe that contains :py:class:`xarray.DataArray` or
        :py:class:`xarray.Dataset` objects with a
        :py:attr:`.rio.crs <rioxarray.rioxarray.XRasterBase.crs>` property and
        :py:meth:`.rio.bounds <rioxarray.rioxarray.XRasterBase.bounds>` method.

    kwargs : Optional
        Extra keyword arguments to pass to :py:func:`geopandas.clip`.

    Yields
    ------
    paired_obj : Tuple[geopandas.GeoDataFrame, xarray.DataArray]
        A tuple consisting of the spatially subsetted
        :py:class:`geopandas.GeoSeries` or :py:class:`geopandas.GeoDataFrame`
        vector, and the corresponding :py:class:`xarray.DataArray` or
        :py:class:`xarray.Dataset` raster used as the clip mask.

    Raises
    ------
    ModuleNotFoundError
        If ``geopandas`` is not installed. See
        :doc:`install instructions for geopandas <geopandas:getting_started/install>`
        (e.g. via ``pip install geopandas``) before using this class.

    NotImplementedError
        If the length of the vector ``source_datapipe`` is not 1. Currently,
        all of the vector geometries have to be merged into a single
        :py:class:`geopandas.GeoSeries` or :py:class:`geopandas.GeoDataFrame`.
        Refer to the section on Appending under geopandas'
        :doc:`geopandas:docs/user_guide/mergingdata` docs.
    """

    def __init__(
        self,
        source_datapipe: IterDataPipe,
        mask_datapipe: IterDataPipe[Union[xr.DataArray, xr.Dataset]],
        **kwargs: Optional[Dict[str, Any]],
    ) -> None:
        if gpd is None:
            raise ModuleNotFoundError(
                "Package `geopandas` is required to be installed to use this datapipe. "
                "Please use `pip install geopandas` or "
                "`conda install -c conda-forge geopandas` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe = source_datapipe
        self.mask_datapipe: IterDataPipe[xr.DataArray] = mask_datapipe
        self.kwargs = kwargs

        # Only a single (broadcastable) vector is supported for now
        len_vector_datapipe: int = len(self.source_datapipe)
        if len_vector_datapipe != 1:
            raise NotImplementedError(
                f"The vector datapipe's length can only be (1) for now, but got "
                f"({len_vector_datapipe}) instead. Consider merging your vector data "
                f"into a single `geopandas.GeoSeries` or `geopandas.GeoDataFrame`, "
                f"e.g. using `geodataframe0.append(geodataframe2)`."
            )

    def __iter__(self) -> Iterator:
        # Materialize the single vector element; it is reused for every mask
        geodataframe = list(self.source_datapipe).pop()

        for raster in self.mask_datapipe:
            # Bounding box (minx, miny, maxx, maxy) of the raster mask
            mask = raster.rio.bounds()

            # Reproject the vector to the raster's CRS only when they differ.
            # Note: an explicit comparison is used instead of the previous
            # `try/assert/except AssertionError` pattern, which silently
            # skipped reprojection when running under `python -O` (asserts
            # are stripped by the optimizer).
            if geodataframe.crs != raster.rio.crs:
                _geodataframe = geodataframe.to_crs(crs=raster.rio.crs)
            else:
                _geodataframe = geodataframe

            clipped_geodataframe = _geodataframe.clip(mask=mask, **self.kwargs)

            yield clipped_geodataframe, raster

    def __len__(self) -> int:
        return len(self.mask_datapipe)
@functional_datapipe("read_from_pyogrio")
class PyogrioReaderIterDataPipe(IterDataPipe[StreamWrapper]):
    """
    Takes vector files (e.g. FlatGeoBuf, GeoPackage, GeoJSON) from local disk
    or URLs (as long as they can be read by pyogrio) and yields
    :py:class:`geopandas.GeoDataFrame` objects (functional name:
    ``read_from_pyogrio``).

    Based on
    https://github.com/pytorch/data/blob/v0.4.0/torchdata/datapipes/iter/load/iopath.py#L42-L97

    Parameters
    ----------
    source_datapipe : IterDataPipe[str]
        A DataPipe that contains filepaths or URL links to vector files such as
        FlatGeoBuf, GeoPackage, GeoJSON, etc.

    kwargs : Optional
        Extra keyword arguments to pass to :py:func:`pyogrio.read_dataframe`.

    Yields
    ------
    stream_obj : geopandas.GeoDataFrame
        A :py:class:`geopandas.GeoDataFrame` object containing the vector
        data, wrapped in a :py:class:`torchdata.datapipes.utils.StreamWrapper`.

    Raises
    ------
    ModuleNotFoundError
        If ``pyogrio`` is not installed. See
        :doc:`install instructions for pyogrio <pyogrio:install>`, and ensure
        that ``geopandas`` is installed too (e.g. via
        ``pip install pyogrio[geopandas]``) before using this class.
    """

    def __init__(
        self, source_datapipe: IterDataPipe[str], **kwargs: Optional[Dict[str, Any]]
    ) -> None:
        if pyogrio is None:
            raise ModuleNotFoundError(
                "Package `pyogrio` is required to be installed to use this datapipe. "
                "Please use `pip install pyogrio[geopandas]` or "
                "`conda install -c conda-forge pyogrio` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe[str] = source_datapipe
        self.kwargs = kwargs

    def __iter__(self) -> Iterator[StreamWrapper]:
        for vector_path in self.source_datapipe:
            # Delegate the actual parsing to pyogrio; extra keyword arguments
            # (e.g. layer selection) are forwarded unchanged
            geodataframe = pyogrio.read_dataframe(vector_path, **self.kwargs)
            yield StreamWrapper(geodataframe)

    def __len__(self) -> int:
        return len(self.source_datapipe)
@functional_datapipe("read_to_pystac_item")
class PySTACItemReaderIterDataPipe(IterDataPipe):
    """
    Takes files from local disk or URLs (as long as they can be read by pystac)
    and yields :py:class:`pystac.Item` objects (functional name:
    ``read_to_pystac_item``).

    Parameters
    ----------
    source_datapipe : IterDataPipe[str]
        A DataPipe that contains filepaths or URL links to STAC items.

    kwargs : Optional
        Extra keyword arguments to pass to :py:meth:`pystac.Item.from_file`.

    Yields
    ------
    stac_item : pystac.Item
        A :py:class:`pystac.Item` object containing the specific
        :py:class:`pystac.STACObject` implementation class represented in a
        JSON format.

    Raises
    ------
    ModuleNotFoundError
        If ``pystac`` is not installed. See
        :doc:`install instructions for pystac <pystac:installation>`, (e.g. via
        ``pip install pystac``) before using this class.
    """

    def __init__(
        self, source_datapipe: IterDataPipe[str], **kwargs: Optional[Dict[str, Any]]
    ) -> None:
        if pystac is None:
            raise ModuleNotFoundError(
                "Package `pystac` is required to be installed to use this datapipe. "
                "Please use `pip install pystac` or "
                "`conda install -c conda-forge pystac` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe[str] = source_datapipe
        self.kwargs = kwargs

    def __iter__(self) -> Iterator:
        for item_href in self.source_datapipe:
            # Each href is resolved lazily, one STAC Item at a time
            stac_item = pystac.Item.from_file(href=item_href, **self.kwargs)
            yield stac_item

    def __len__(self) -> int:
        return len(self.source_datapipe)
Results will be filtered to only those 31 | intersecting the bounding box. 32 | - **datetime** - Either a single datetime or datetime range used to 33 | filter results. You may express a single datetime using a 34 | :py:class:`datetime.datetime` instance, a 35 | `RFC 3339-compliant `_ 36 | timestamp, or a simple date string. 37 | - **collections** - List of one or more Collection IDs or 38 | :py:class:`pystac.Collection` instances. Only Items in one of the 39 | provided Collections will be searched. 40 | 41 | catalog_url : str 42 | The URL of a STAC Catalog. 43 | 44 | kwargs : Optional 45 | Extra keyword arguments to pass to 46 | :py:meth:`pystac_client.Client.open`. For example: 47 | 48 | - **headers** - A dictionary of additional headers to use in all 49 | requests made to any part of this Catalog/API. 50 | - **parameters** - Optional dictionary of query string parameters to 51 | include in all requests. 52 | - **modifier** - A callable that modifies the children collection and 53 | items returned by this Client. This can be useful for injecting 54 | authentication parameters into child assets to access data from 55 | non-public sources. 56 | 57 | Yields 58 | ------ 59 | item_search : pystac_client.ItemSearch 60 | A :py:class:`pystac_client.ItemSearch` object instance that represents 61 | a deferred query to a STAC search endpoint as described in the 62 | `STAC API - Item Search spec `_. 63 | 64 | Raises 65 | ------ 66 | ModuleNotFoundError 67 | If ``pystac_client`` is not installed. See 68 | :doc:`install instructions for pystac-client `, 69 | (e.g. via ``pip install pystac-client``) before using this class. 70 | 71 | Example 72 | ------- 73 | >>> import pytest 74 | >>> pystac_client = pytest.importorskip("pystac_client") 75 | ... 76 | >>> from torchdata.datapipes.iter import IterableWrapper 77 | >>> from zen3geo.datapipes import PySTACAPISearcher 78 | ... 79 | >>> # Peform STAC API query using DataPipe 80 | >>> query = dict( 81 | ... 
bbox=[174.5, -41.37, 174.9, -41.19], # xmin, ymin, xmax, ymax 82 | ... datetime=["2012-02-20T00:00:00Z", "2022-12-22T00:00:00Z"], 83 | ... collections=["cop-dem-glo-30"], 84 | ... ) 85 | >>> dp = IterableWrapper(iterable=[query]) 86 | >>> dp_pystac_client = dp.search_for_pystac_item( 87 | ... catalog_url="https://planetarycomputer.microsoft.com/api/stac/v1", 88 | ... # modifier=planetary_computer.sign_inplace, 89 | ... ) 90 | >>> # Loop or iterate over the DataPipe stream 91 | >>> it = iter(dp_pystac_client) 92 | >>> stac_item_search = next(it) 93 | >>> stac_items = list(stac_item_search.items()) 94 | >>> stac_items 95 | [] 96 | >>> stac_items[0].properties # doctest: +NORMALIZE_WHITESPACE 97 | {'gsd': 30, 98 | 'datetime': '2021-04-22T00:00:00Z', 99 | 'platform': 'TanDEM-X', 100 | 'proj:epsg': 4326, 101 | 'proj:shape': [3600, 3600], 102 | 'proj:transform': [0.0002777777777777778, 103 | 0.0, 104 | 173.9998611111111, 105 | 0.0, 106 | -0.0002777777777777778, 107 | -40.99986111111111]} 108 | """ 109 | 110 | def __init__( 111 | self, 112 | source_datapipe: IterDataPipe[dict], 113 | catalog_url: str, 114 | **kwargs: Optional[Dict[str, Any]] 115 | ) -> None: 116 | if pystac_client is None: 117 | raise ModuleNotFoundError( 118 | "Package `pystac_client` is required to be installed to use this datapipe. 
" 119 | "Please use `pip install pystac-client` or " 120 | "`conda install -c conda-forge pystac-client` " 121 | "to install the package" 122 | ) 123 | self.source_datapipe: IterDataPipe[dict] = source_datapipe 124 | self.catalog_url: str = catalog_url 125 | self.kwargs = kwargs 126 | 127 | def __iter__(self) -> Iterator: 128 | catalog = pystac_client.Client.open(url=self.catalog_url, **self.kwargs) 129 | 130 | for query in self.source_datapipe: 131 | search = catalog.search(**query) 132 | yield search 133 | 134 | def __len__(self) -> int: 135 | return len(self.source_datapipe) 136 | 137 | 138 | @functional_datapipe("list_pystac_items_by_search") 139 | class PySTACAPIItemListerIterDataPipe(IterDataPipe): 140 | """ 141 | Lists the :py:class:`pystac.Item` objects that match the provided STAC API 142 | search parameters (functional name: ``list_pystac_items_by_search``). 143 | 144 | Parameters 145 | ---------- 146 | source_datapipe : IterDataPipe[pystac_client.ItemSearch] 147 | A DataPipe that contains :py:class:`pystac_client.ItemSearch` object 148 | instances that represents 149 | a deferred query to a STAC search endpoint as described in the 150 | `STAC API - Item Search spec `_. 151 | 152 | Yields 153 | ------ 154 | stac_item : pystac.Item 155 | A :py:class:`pystac.Item` object containing the specific 156 | :py:class:`pystac.STACObject` implementation class represented in a 157 | JSON format. 158 | 159 | Raises 160 | ------ 161 | ModuleNotFoundError 162 | If ``pystac_client`` is not installed. See 163 | :doc:`install instructions for pystac-client `, 164 | (e.g. via ``pip install pystac-client``) before using this class. 165 | 166 | Example 167 | ------- 168 | >>> import pytest 169 | >>> pystac_client = pytest.importorskip("pystac_client") 170 | ... 171 | >>> from torchdata.datapipes.iter import IterableWrapper 172 | >>> from zen3geo.datapipes import PySTACAPIItemLister 173 | ... 
174 | >>> # List STAC Items from a STAC API query 175 | >>> catalog = pystac_client.Client.open( 176 | ... url="https://explorer.digitalearth.africa/stac/" 177 | ... ) 178 | >>> search = catalog.search( 179 | ... bbox=[57.2, -20.6, 57.9, -19.9], # xmin, ymin, xmax, ymax 180 | ... datetime=["2023-01-01T00:00:00Z", "2023-01-31T00:00:00Z"], 181 | ... collections=["s2_l2a"], 182 | ... ) 183 | >>> dp = IterableWrapper(iterable=[search]) 184 | >>> dp_pystac_item_list = dp.list_pystac_items_by_search() 185 | ... 186 | >>> # Loop or iterate over the DataPipe stream 187 | >>> it = iter(dp_pystac_item_list) 188 | >>> stac_item = next(it) 189 | >>> stac_item 190 | 191 | >>> stac_item.properties # doctest: +NORMALIZE_WHITESPACE 192 | {'title': 'S2B_MSIL2A_20230103T062449_N0509_R091_T40KED_20230103T075000', 193 | 'gsd': 10, 194 | 'proj:epsg': 32740, 195 | 'platform': 'sentinel-2b', 196 | 'view:off_nadir': 0, 197 | 'instruments': ['msi'], 198 | 'eo:cloud_cover': 0.02, 199 | 'odc:file_format': 'GeoTIFF', 200 | 'odc:region_code': '40KED', 201 | 'constellation': 'sentinel-2', 202 | 'sentinel:sequence': '0', 203 | 'sentinel:utm_zone': 40, 204 | 'sentinel:product_id': 'S2B_MSIL2A_20230103T062449_N0509_R091_T40KED_20230103T075000', 205 | 'sentinel:grid_square': 'ED', 206 | 'sentinel:data_coverage': 28.61, 207 | 'sentinel:latitude_band': 'K', 208 | 'created': '2023-01-03T06:24:53Z', 209 | 'sentinel:valid_cloud_cover': True, 210 | 'sentinel:boa_offset_applied': True, 211 | 'sentinel:processing_baseline': '05.09', 212 | 'proj:shape': [10980, 10980], 213 | 'proj:transform': [10.0, 0.0, 499980.0, 0.0, -10.0, 7900000.0, 0.0, 0.0, 1.0], 214 | 'cubedash:region_code': '40KED', 215 | 'datetime': '2023-01-03T06:24:53Z'} 216 | """ 217 | 218 | def __init__(self, source_datapipe): 219 | if pystac_client is None: 220 | raise ModuleNotFoundError( 221 | "Package `pystac_client` is required to be installed to use this datapipe. 
" 222 | "Please use `pip install pystac-client` or " 223 | "`conda install -c conda-forge pystac-client` " 224 | "to install the package" 225 | ) 226 | self.source_datapipe = source_datapipe 227 | 228 | def __iter__(self): 229 | for item_search in self.source_datapipe: 230 | yield from item_search.items() 231 | 232 | def __len__(self): 233 | return sum(item_search.matched() for item_search in self.source_datapipe) 234 | -------------------------------------------------------------------------------- /zen3geo/datapipes/rioxarray.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPipes for :doc:`rioxarray `. 3 | """ 4 | from typing import Any, Dict, Iterator, Optional 5 | 6 | import rioxarray 7 | from torchdata.datapipes import functional_datapipe 8 | from torchdata.datapipes.iter import IterDataPipe 9 | from torchdata.datapipes.utils import StreamWrapper 10 | 11 | 12 | @functional_datapipe("read_from_rioxarray") 13 | class RioXarrayReaderIterDataPipe(IterDataPipe[StreamWrapper]): 14 | """ 15 | Takes raster files (e.g. GeoTIFFs) from local disk or URLs 16 | (as long as they can be read by rioxarray and/or rasterio) 17 | and yields :py:class:`xarray.DataArray` objects (functional name: 18 | ``read_from_rioxarray``). 19 | 20 | Based on 21 | https://github.com/pytorch/data/blob/v0.4.0/torchdata/datapipes/iter/load/online.py#L55-L96 22 | 23 | Parameters 24 | ---------- 25 | source_datapipe : IterDataPipe[str] 26 | A DataPipe that contains filepaths or URL links to raster files such as 27 | GeoTIFFs. 28 | 29 | kwargs : Optional 30 | Extra keyword arguments to pass to :py:func:`rioxarray.open_rasterio` 31 | and/or :py:func:`rasterio.open`. 32 | 33 | Yields 34 | ------ 35 | stream_obj : xarray.DataArray 36 | An :py:class:`xarray.DataArray` object containing the raster data. 
37 | 38 | Example 39 | ------- 40 | >>> from torchdata.datapipes.iter import IterableWrapper 41 | >>> from zen3geo.datapipes import RioXarrayReader 42 | ... 43 | >>> # Read in GeoTIFF data using DataPipe 44 | >>> file_url: str = "https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif" 45 | >>> dp = IterableWrapper(iterable=[file_url]) 46 | >>> dp_rioxarray = dp.read_from_rioxarray() 47 | ... 48 | >>> # Loop or iterate over the DataPipe stream 49 | >>> it = iter(dp_rioxarray) 50 | >>> dataarray = next(it) 51 | >>> dataarray.encoding["source"] 52 | 'https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif' 53 | >>> dataarray 54 | StreamWrapper< 55 | [1843200 values with dtype=uint8] 56 | Coordinates: 57 | * band (band) int64 1 58 | * x (x) float64 -179.9 -179.7 -179.5 -179.3 ... 179.5 179.7 179.9 59 | * y (y) float64 89.91 89.72 89.53 89.34 ... -89.53 -89.72 -89.91 60 | spatial_ref int64 0 61 | ... 62 | """ 63 | 64 | def __init__( 65 | self, source_datapipe: IterDataPipe[str], **kwargs: Optional[Dict[str, Any]] 66 | ) -> None: 67 | self.source_datapipe: IterDataPipe[str] = source_datapipe 68 | self.kwargs = kwargs 69 | 70 | def __iter__(self) -> Iterator[StreamWrapper]: 71 | for filename in self.source_datapipe: 72 | yield StreamWrapper( 73 | rioxarray.open_rasterio(filename=filename, **self.kwargs) 74 | ) 75 | 76 | def __len__(self) -> int: 77 | return len(self.source_datapipe) 78 | -------------------------------------------------------------------------------- /zen3geo/datapipes/stackstac.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPipes for :doc:`stackstac `. 
3 | """ 4 | from typing import Any, Dict, Iterator, Optional 5 | 6 | import xarray as xr 7 | 8 | try: 9 | import stackstac 10 | except ImportError: 11 | stackstac = None 12 | from torchdata.datapipes import functional_datapipe 13 | from torchdata.datapipes.iter import IterDataPipe 14 | 15 | 16 | @functional_datapipe("mosaic_dataarray") 17 | class StackSTACMosaickerIterDataPipe(IterDataPipe[xr.DataArray]): 18 | """ 19 | Takes :py:class:`xarray.DataArray` objects, flattens a dimension by picking 20 | the first valid pixel, to yield mosaicked :py:class:`xarray.DataArray` 21 | objects (functional name: ``mosaic_dataarray``). 22 | 23 | Parameters 24 | ---------- 25 | source_datapipe : IterDataPipe[xarray.DataArray] 26 | A DataPipe that contains :py:class:`xarray.DataArray` objects, with 27 | e.g. dimensions ("time", "band", "y", "x"). 28 | 29 | kwargs : Optional 30 | Extra keyword arguments to pass to :py:func:`stackstac.mosaic`. 31 | 32 | Yields 33 | ------ 34 | dataarray : xarray.DataArray 35 | An :py:class:`xarray.DataArray` that has been mosaicked with e.g. 36 | dimensions ("band", "y", "x"). 37 | 38 | Raises 39 | ------ 40 | ModuleNotFoundError 41 | If ``stackstac`` is not installed. See 42 | :doc:`install instructions for stackstac `, (e.g. via 43 | ``pip install stackstac``) before using this class. 44 | 45 | Example 46 | ------- 47 | >>> import pytest 48 | >>> import xarray as xr 49 | >>> pystac = pytest.importorskip("pystac") 50 | >>> stackstac = pytest.importorskip("stackstac") 51 | ... 52 | >>> from torchdata.datapipes.iter import IterableWrapper 53 | >>> from zen3geo.datapipes import StackSTACMosaicker 54 | ... 55 | >>> # Get list of ALOS DEM tiles to mosaic together later 56 | >>> item_urls = [ 57 | ... "https://planetarycomputer.microsoft.com/api/stac/v1/collections/alos-dem/items/ALPSMLC30_N022E113_DSM", 58 | ... "https://planetarycomputer.microsoft.com/api/stac/v1/collections/alos-dem/items/ALPSMLC30_N022E114_DSM", 59 | ... 
] 60 | >>> stac_items = [pystac.Item.from_file(href=url) for url in item_urls] 61 | >>> dataarray = stackstac.stack(items=stac_items) 62 | >>> assert dataarray.sizes == {'time': 2, 'band': 1, 'y': 3600, 'x': 7200} 63 | ... 64 | >>> # Mosaic different tiles in an xarray.DataArray using DataPipe 65 | >>> dp = IterableWrapper(iterable=[dataarray]) 66 | >>> dp_mosaic = dp.mosaic_dataarray() 67 | ... 68 | >>> # Loop or iterate over the DataPipe stream 69 | >>> it = iter(dp_mosaic) 70 | >>> dataarray = next(it) 71 | >>> print(dataarray.sizes) 72 | Frozen({'band': 1, 'y': 3600, 'x': 7200}) 73 | >>> print(dataarray.coords) 74 | Coordinates: 75 | * band (band) >> print(dataarray.attrs["spec"]) 80 | RasterSpec(epsg=4326, bounds=(113.0, 22.0, 115.0, 23.0), resolutions_xy=(0.0002777777777777778, 0.0002777777777777778)) 81 | """ 82 | 83 | def __init__( 84 | self, 85 | source_datapipe: IterDataPipe[xr.DataArray], 86 | **kwargs: Optional[Dict[str, Any]] 87 | ) -> None: 88 | if stackstac is None: 89 | raise ModuleNotFoundError( 90 | "Package `stackstac` is required to be installed to use this datapipe. " 91 | "Please use `pip install stackstac` or " 92 | "`conda install -c conda-forge stackstac` " 93 | "to install the package" 94 | ) 95 | self.source_datapipe: IterDataPipe = source_datapipe 96 | self.kwargs = kwargs 97 | 98 | def __iter__(self) -> Iterator[xr.DataArray]: 99 | for dataarray in self.source_datapipe: 100 | yield stackstac.mosaic(arr=dataarray, **self.kwargs) 101 | 102 | def __len__(self) -> int: 103 | return len(self.source_datapipe) 104 | 105 | 106 | @functional_datapipe("stack_stac_items") 107 | class StackSTACStackerIterDataPipe(IterDataPipe[xr.DataArray]): 108 | """ 109 | Takes :py:class:`pystac.Item` objects, reprojects them to the same grid 110 | and stacks them along time, to yield :py:class:`xarray.DataArray` objects 111 | (functional name: ``stack_stac_items``). 
112 | 113 | Parameters 114 | ---------- 115 | source_datapipe : IterDataPipe[pystac.Item] 116 | A DataPipe that contains :py:class:`pystac.Item` objects. 117 | 118 | kwargs : Optional 119 | Extra keyword arguments to pass to :py:func:`stackstac.stack`. 120 | 121 | Yields 122 | ------ 123 | datacube : xarray.DataArray 124 | An :py:class:`xarray.DataArray` backed by a 125 | :py:class:`dask.array.Array` containing the time-series datacube. The 126 | dimensions will be ("time", "band", "y", "x"). 127 | 128 | Raises 129 | ------ 130 | ModuleNotFoundError 131 | If ``stackstac`` is not installed. See 132 | :doc:`install instructions for stackstac `, (e.g. via 133 | ``pip install stackstac``) before using this class. 134 | 135 | Example 136 | ------- 137 | >>> import pytest 138 | >>> pystac = pytest.importorskip("pystac") 139 | >>> stacstac = pytest.importorskip("stackstac") 140 | ... 141 | >>> from torchdata.datapipes.iter import IterableWrapper 142 | >>> from zen3geo.datapipes import StackSTACStacker 143 | ... 144 | >>> # Stack different bands in a STAC Item using DataPipe 145 | >>> item_url: str = "https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-1-grd/items/S1A_IW_GRDH_1SDV_20220914T093226_20220914T093252_044999_056053" 146 | >>> stac_item = pystac.Item.from_file(href=item_url) 147 | >>> dp = IterableWrapper(iterable=[stac_item]) 148 | >>> dp_stackstac = dp.stack_stac_items( 149 | ... assets=["vh", "vv"], epsg=32652, resolution=10 150 | ... ) 151 | ... 152 | >>> # Loop or iterate over the DataPipe stream 153 | >>> it = iter(dp_stackstac) 154 | >>> dataarray = next(it) 155 | >>> print(dataarray.sizes) 156 | Frozen({'time': 1, 'band': 2, 'y': 20686, 'x': 28043}) 157 | >>> print(dataarray.coords) 158 | Coordinates: 159 | * time (time) datetime64[ns] 2022-09-14T0... 
160 | id (time) >> print(dataarray.attrs["spec"]) 166 | RasterSpec(epsg=32652, bounds=(135370, 4098080, 415800, 4304940), resolutions_xy=(10, 10)) 167 | """ 168 | 169 | def __init__( 170 | self, source_datapipe: IterDataPipe, **kwargs: Optional[Dict[str, Any]] 171 | ) -> None: 172 | if stackstac is None: 173 | raise ModuleNotFoundError( 174 | "Package `stackstac` is required to be installed to use this datapipe. " 175 | "Please use `pip install stackstac` or " 176 | "`conda install -c conda-forge stackstac` " 177 | "to install the package" 178 | ) 179 | self.source_datapipe: IterDataPipe = source_datapipe 180 | self.kwargs = kwargs 181 | 182 | def __iter__(self) -> Iterator[xr.DataArray]: 183 | for stac_items in self.source_datapipe: 184 | yield stackstac.stack(items=stac_items, **self.kwargs) 185 | 186 | def __len__(self) -> int: 187 | return len(self.source_datapipe) 188 | -------------------------------------------------------------------------------- /zen3geo/datapipes/xbatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPipes for :doc:`xbatcher `. 3 | """ 4 | from typing import Any, Dict, Hashable, Iterator, Optional, Tuple, Union 5 | 6 | import xarray as xr 7 | 8 | try: 9 | import xbatcher 10 | except ImportError: 11 | xbatcher = None 12 | from torchdata.datapipes import functional_datapipe 13 | from torchdata.datapipes.iter import IterDataPipe 14 | 15 | 16 | @functional_datapipe("slice_with_xbatcher") 17 | class XbatcherSlicerIterDataPipe(IterDataPipe[Union[xr.DataArray, xr.Dataset]]): 18 | """ 19 | Takes an :py:class:`xarray.DataArray` or :py:class:`xarray.Dataset` 20 | and creates a sliced window view (also known as a chip or tile) of the 21 | n-dimensional array (functional name: ``slice_with_xbatcher``). 22 | 23 | Parameters 24 | ---------- 25 | source_datapipe : IterDataPipe[xarray.DataArray] 26 | A DataPipe that contains :py:class:`xarray.DataArray` or 27 | :py:class:`xarray.Dataset` objects. 
28 | 29 | input_dims : dict 30 | A dictionary specifying the size of the inputs in each dimension to 31 | slice along, e.g. ``{'lon': 64, 'lat': 64}``. These are the dimensions 32 | the machine learning library will see. All other dimensions will be 33 | stacked into one dimension called ``batch``. 34 | 35 | kwargs : Optional 36 | Extra keyword arguments to pass to :py:class:`xbatcher.BatchGenerator`. 37 | 38 | Yields 39 | ------ 40 | chip : xarray.DataArray 41 | An :py:class:`xarray.DataArray` or :py:class:`xarray.Dataset` object 42 | containing the sliced raster data, with the size/shape defined by the 43 | ``input_dims`` parameter. 44 | 45 | Raises 46 | ------ 47 | ModuleNotFoundError 48 | If ``xbatcher`` is not installed. Follow 49 | :doc:`install instructions for xbatcher ` 50 | before using this class. 51 | 52 | Example 53 | ------- 54 | >>> import pytest 55 | >>> import numpy as np 56 | >>> import xarray as xr 57 | >>> xbatcher = pytest.importorskip("xbatcher") 58 | ... 59 | >>> from torchdata.datapipes.iter import IterableWrapper 60 | >>> from zen3geo.datapipes import XbatcherSlicer 61 | ... 62 | >>> # Sliced window view of xarray.DataArray using DataPipe 63 | >>> dataarray: xr.DataArray = xr.DataArray( 64 | ... data=np.ones(shape=(3, 64, 64)), 65 | ... name="foo", 66 | ... dims=["band", "y", "x"] 67 | ... ) 68 | >>> dp = IterableWrapper(iterable=[dataarray]) 69 | >>> dp_xbatcher = dp.slice_with_xbatcher(input_dims={"y": 2, "x": 2}) 70 | ... 
71 | >>> # Loop or iterate over the DataPipe stream 72 | >>> it = iter(dp_xbatcher) 73 | >>> dataarray_chip = next(it) 74 | >>> dataarray_chip 75 | 76 | array([[[1., 1.], 77 | [1., 1.]], 78 | 79 | [[1., 1.], 80 | [1., 1.]], 81 | 82 | [[1., 1.], 83 | [1., 1.]]]) 84 | Dimensions without coordinates: band, y, x 85 | """ 86 | 87 | def __init__( 88 | self, 89 | source_datapipe: IterDataPipe[Union[xr.DataArray, xr.Dataset]], 90 | input_dims: Dict[Hashable, int], 91 | **kwargs: Optional[Dict[str, Any]], 92 | ) -> None: 93 | if xbatcher is None: 94 | raise ModuleNotFoundError( 95 | "Package `xbatcher` is required to be installed to use this datapipe. " 96 | "Please use `pip install xbatcher` " 97 | "to install the package" 98 | ) 99 | self.source_datapipe: IterDataPipe[ 100 | Union[xr.DataArray, xr.Dataset] 101 | ] = source_datapipe 102 | self.input_dims: Dict[Hashable, int] = input_dims 103 | self.kwargs = kwargs 104 | 105 | def __iter__(self) -> Iterator[Union[xr.DataArray, xr.Dataset]]: 106 | for dataarray in self.source_datapipe: 107 | for chip in dataarray.batch.generator( 108 | input_dims=self.input_dims, **self.kwargs 109 | ): 110 | yield chip 111 | 112 | def __len__(self) -> int: 113 | return sum( 114 | len(dataarray.batch.generator(input_dims=self.input_dims, **self.kwargs)) 115 | for dataarray in self.source_datapipe 116 | ) 117 | -------------------------------------------------------------------------------- /zen3geo/datapipes/xpystac.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPipes for `xpystac `__. 
3 | """ 4 | from typing import Any, Dict, Iterator, Optional 5 | 6 | import xarray as xr 7 | 8 | try: 9 | import pystac 10 | import xpystac 11 | except ImportError: 12 | pystac = None 13 | xpystac = None 14 | from torchdata.datapipes import functional_datapipe 15 | from torchdata.datapipes.iter import IterDataPipe 16 | from torchdata.datapipes.utils import StreamWrapper 17 | 18 | 19 | @functional_datapipe("read_from_xpystac") 20 | class XpySTACAssetReaderIterDataPipe(IterDataPipe[StreamWrapper]): 21 | """ 22 | Takes a :py:class:`pystac.Asset` object containing n-dimensional data (e.g. 23 | :doc:`Zarr `, 24 | `NetCDF `__, 25 | `Cloud-Optimized GeoTIFF `__, etc) from local disk 26 | or URLs (as long as they can be read by xpystac) and yields 27 | :py:class:`xarray.Dataset` objects (functional name: 28 | ``read_from_xpystac``). 29 | 30 | Based on 31 | https://github.com/pytorch/data/blob/v0.5.1/torchdata/datapipes/iter/load/iopath.py#L42-L97 32 | 33 | Parameters 34 | ---------- 35 | source_datapipe : IterDataPipe[pystac.Asset] 36 | A DataPipe that contains :py:class:`pystac.Asset` objects to 37 | n-dimensional files such as :doc:`Zarr `, 38 | `NetCDF `__, 39 | `Cloud-Optimized GeoTIFF `__, etc. 40 | 41 | engine : str or xarray.backends.BackendEntrypoint 42 | Engine to use when reading files. If not provided, the default engine 43 | will be the "stac" backend from ``xpystac``. Alternatively, set 44 | ``engine=None`` to let ``xarray`` choose the default engine based on 45 | available dependencies, with a preference for "netcdf4". See also 46 | :py:func:`xarray.open_dataset` for details about other engine options. 47 | 48 | kwargs : Optional 49 | Extra keyword arguments to pass to :py:func:`xarray.open_dataset`. 50 | 51 | Yields 52 | ------ 53 | stream_obj : xarray.Dataset 54 | An :py:class:`xarray.Dataset` object containing the n-dimensional data. 55 | 56 | Raises 57 | ------ 58 | ModuleNotFoundError 59 | If ``xpystac`` is not installed. 
See 60 | `install instructions for xpystac 61 | `__, 62 | (e.g. via ``pip install xpystac``) before using this class. 63 | 64 | Example 65 | ------- 66 | >>> import pytest 67 | >>> pystac = pytest.importorskip("pystac") 68 | >>> xpystac = pytest.importorskip("xpystac") 69 | >>> zarr = pytest.importorskip("zarr") 70 | ... 71 | >>> from torchdata.datapipes.iter import IterableWrapper 72 | >>> from zen3geo.datapipes import XpySTACAssetReader 73 | ... 74 | >>> # Read in STAC Asset using DataPipe 75 | >>> collection_url: str = "https://planetarycomputer.microsoft.com/api/stac/v1/collections/nasa-nex-gddp-cmip6" 76 | >>> asset: pystac.Asset = pystac.Collection.from_file(href=collection_url).assets[ 77 | ... "ACCESS-CM2.historical" 78 | ... ] 79 | >>> dp = IterableWrapper(iterable=[asset]) 80 | >>> dp_xpystac = dp.read_from_xpystac() 81 | ... 82 | >>> # Loop or iterate over the DataPipe stream 83 | >>> it = iter(dp_xpystac) 84 | >>> dataset = next(it) 85 | >>> dataset.sizes 86 | Frozen({'time': 23741, 'lat': 600, 'lon': 1440}) 87 | >>> print(dataset.data_vars) 88 | Data variables: 89 | hurs (time, lat, lon) float32 ... 90 | huss (time, lat, lon) float32 ... 91 | pr (time, lat, lon) float32 ... 92 | rlds (time, lat, lon) float32 ... 93 | rsds (time, lat, lon) float32 ... 94 | sfcWind (time, lat, lon) float32 ... 95 | tas (time, lat, lon) float32 ... 96 | tasmax (time, lat, lon) float32 ... 97 | tasmin (time, lat, lon) float32 ... 98 | >>> dataset.attrs # doctest: +NORMALIZE_WHITESPACE 99 | {'Conventions': 'CF-1.7', 100 | 'activity': 'NEX-GDDP-CMIP6', 101 | 'cmip6_institution_id': 'CSIRO-ARCCSS', 102 | 'cmip6_license': 'CC-BY-SA 4.0', 103 | 'cmip6_source_id': 'ACCESS-CM2', 104 | ... 105 | 'history': '2021-10-04T13:59:21.654137+00:00: install global attributes', 106 | 'institution': 'NASA Earth Exchange, NASA Ames Research Center, ... 107 | 'product': 'output', 108 | 'realm': 'atmos', 109 | 'references': 'BCSD method: Thrasher et al., 2012, ... 
110 | 'resolution_id': '0.25 degree', 111 | 'scenario': 'historical', 112 | 'source': 'BCSD', 113 | 'title': 'ACCESS-CM2, r1i1p1f1, historical, global downscaled CMIP6 ... 114 | 'tracking_id': '16d27564-470f-41ea-8077-f4cc3efa5bfe', 115 | 'variant_label': 'r1i1p1f1', 116 | 'version': '1.0'} 117 | """ 118 | 119 | def __init__( 120 | self, 121 | source_datapipe: IterDataPipe, 122 | engine: str = "stac", 123 | **kwargs: Optional[Dict[str, Any]] 124 | ) -> None: 125 | if xpystac is None and engine == "stac": 126 | raise ModuleNotFoundError( 127 | "Package `xpystac` is required to be installed to use this datapipe. " 128 | "Please use `pip install xpystac` " 129 | "to install the package" 130 | ) 131 | self.source_datapipe: IterDataPipe = source_datapipe 132 | self.engine: str = engine 133 | self.kwargs = kwargs 134 | 135 | def __iter__(self) -> Iterator[StreamWrapper]: 136 | for asset in self.source_datapipe: 137 | yield StreamWrapper( 138 | xr.open_dataset(asset, engine=self.engine, **self.kwargs) 139 | ) 140 | 141 | def __len__(self) -> int: 142 | return len(self.source_datapipe) 143 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_datashader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for datashader datapipes. 3 | """ 4 | import numpy as np 5 | import pytest 6 | import xarray as xr 7 | from torchdata.datapipes.iter import IterableWrapper 8 | 9 | from zen3geo.datapipes import DatashaderRasterizer, XarrayCanvas 10 | 11 | datashader = pytest.importorskip("datashader") 12 | 13 | 14 | # %% 15 | @pytest.fixture(scope="function", name="canvas") 16 | def fixture_canvas(): 17 | """ 18 | The blank datashader.Canvas to use in the tests. 
19 | """ 20 | canvas = datashader.Canvas( 21 | plot_width=14, plot_height=10, x_range=(1, 8), y_range=(0, 5) 22 | ) 23 | canvas.crs = "OGC:CRS84" 24 | return canvas 25 | 26 | 27 | @pytest.fixture(scope="module", name="geodataframe") 28 | def fixture_geodataframe(): 29 | """ 30 | A geopandas.GeoDataFrame containing a collection of shapely.geometry 31 | objects to use in the tests. 32 | """ 33 | gpd = pytest.importorskip("geopandas") 34 | shapely = pytest.importorskip("shapely") 35 | 36 | geometries: list = [ 37 | shapely.geometry.MultiPoint([(4.5, 4.5), (3.5, 1), (6, 3.5)]), 38 | shapely.geometry.LineString([(3, 5), (5, 3), (3, 2), (5, 0)]), 39 | shapely.geometry.Polygon([(6, 5), (3.5, 2.5), (6, 0), (6, 2.5), (5, 2.5)]), 40 | ] 41 | geodataframe = gpd.GeoDataFrame(data={"geometry": geometries}) 42 | geodataframe = geodataframe.set_crs(crs="OGC:CRS84") 43 | 44 | return geodataframe 45 | 46 | 47 | # %% 48 | def test_datashader_canvas_dataset(): 49 | """ 50 | Ensure that XarrayCanvas works to create a blank datashader.Canvas object 51 | from an xarray.Dataset. 
52 | """ 53 | dataset: xr.Dataset = xr.Dataset( 54 | data_vars={"temperature": (["y", "x"], 15 * np.ones(shape=(12, 8)))}, 55 | coords={ 56 | "y": (["y"], np.linspace(start=6, stop=0, num=12)), 57 | "x": (["x"], np.linspace(start=0, stop=4, num=8)), 58 | }, 59 | ) 60 | dp = IterableWrapper(iterable=[dataset]) 61 | 62 | # Using class constructors 63 | dp_canvas = XarrayCanvas(source_datapipe=dp) 64 | # Using functional form (recommended) 65 | dp_canvas = dp.canvas_from_xarray() 66 | 67 | assert len(dp_canvas) == 1 68 | it = iter(dp_canvas) 69 | canvas = next(it) 70 | 71 | assert canvas.plot_height == 12 72 | assert canvas.plot_width == 8 73 | assert hasattr(canvas, "crs") 74 | assert hasattr(canvas, "raster") 75 | 76 | 77 | @pytest.mark.parametrize( 78 | ("geom_type", "sum_val"), [("Point", 3), ("Line", 13), ("Polygon", 15)] 79 | ) 80 | def test_datashader_rasterize_vector_geometry(canvas, geodataframe, geom_type, sum_val): 81 | """ 82 | Ensure that DatashaderRasterizer works to rasterize a 83 | geopandas.GeoDataFrame of point, line or polygon type into an 84 | xarray.DataArray grid. 
85 | """ 86 | dp = IterableWrapper(iterable=[canvas, canvas]) 87 | 88 | vector = geodataframe[geodataframe.type.str.contains(geom_type)] 89 | dp_vector = IterableWrapper(iterable=[vector]) 90 | 91 | # Using class constructors 92 | dp_datashader = DatashaderRasterizer(source_datapipe=dp, vector_datapipe=dp_vector) 93 | # Using functional form (recommended) 94 | dp_datashader = dp.rasterize_with_datashader(vector_datapipe=dp_vector) 95 | 96 | assert len(dp_datashader) == 2 97 | it = iter(dp_datashader) 98 | dataarray = next(it) 99 | 100 | assert dataarray.data.sum() == sum_val 101 | assert dataarray.dims == ("y", "x") 102 | assert dataarray.rio.crs == "OGC:CRS84" 103 | assert dataarray.rio.shape == (10, 14) 104 | assert dataarray.rio.transform().e == -0.5 105 | 106 | 107 | def test_datashader_rasterize_canvas_missing_crs(canvas, geodataframe): 108 | """ 109 | Ensure that DatashaderRasterizer raises an AttributeError when the 110 | input datashader.Canvas has no crs attribute. 111 | """ 112 | canvas.crs = None 113 | dp_canvas = IterableWrapper(iterable=[canvas]) 114 | dp_vector = IterableWrapper(iterable=[geodataframe.geometry]) 115 | dp_datashader = dp_canvas.rasterize_with_datashader(vector_datapipe=dp_vector) 116 | 117 | assert len(dp_datashader) == 1 118 | it = iter(dp_datashader) 119 | with pytest.raises( 120 | AttributeError, match="Missing crs information for datashader.Canvas" 121 | ): 122 | raster = next(it) 123 | 124 | 125 | def test_datashader_rasterize_vector_missing_crs(canvas, geodataframe): 126 | """ 127 | Ensure that DatashaderRasterizer raises an AttributeError when the 128 | input geopandas.GeoSeries has no crs attribute. 
129 | """ 130 | vector = geodataframe.geometry 131 | vector.crs = None 132 | dp_canvas = IterableWrapper(iterable=[canvas]) 133 | dp_vector = IterableWrapper(iterable=[vector]) 134 | dp_datashader = dp_canvas.rasterize_with_datashader(vector_datapipe=dp_vector) 135 | 136 | assert len(dp_datashader) == 1 137 | it = iter(dp_datashader) 138 | with pytest.raises(AttributeError, match="Missing crs information for input"): 139 | raster = next(it) 140 | 141 | 142 | def test_datashader_rasterize_unmatched_lengths(canvas, geodataframe): 143 | """ 144 | Ensure that DatashaderRasterizer raises a ValueError when the length of the 145 | canvas datapipe is unmatched with the length of the vector datapipe. 146 | """ 147 | # Canvas:Vector ratio of 3:2 148 | dp_canvas = IterableWrapper(iterable=[canvas, canvas, canvas]) 149 | dp_vector = IterableWrapper(iterable=[geodataframe, geodataframe]) 150 | 151 | with pytest.raises(ValueError, match="Unmatched lengths for the"): 152 | dp_datashader = dp_canvas.rasterize_with_datashader(vector_datapipe=dp_vector) 153 | 154 | 155 | def test_datashader_rasterize_vector_geometrycollection(canvas, geodataframe): 156 | """ 157 | Ensure that DatashaderRasterizer raises a NotImplementedError when an 158 | unsupported vector type like GeometryCollection is used. 
159 | """ 160 | gpd = pytest.importorskip("geopandas") 161 | 162 | # Merge points, lines and polygons into a single GeometryCollection 163 | geocollection = gpd.GeoSeries(data=geodataframe.unary_union) 164 | geocollection = geocollection.set_crs(crs="OGC:CRS84") 165 | 166 | dp = IterableWrapper(iterable=[canvas]) 167 | dp_vector = IterableWrapper(iterable=[geocollection]) 168 | dp_datashader = dp.rasterize_with_datashader(vector_datapipe=dp_vector) 169 | 170 | assert len(dp_datashader) == 1 171 | it = iter(dp_datashader) 172 | with pytest.raises(NotImplementedError, match="Unsupported geometry type"): 173 | raster = next(it) 174 | 175 | 176 | def test_datashader_rasterize_invalid_vector(canvas, geodataframe): 177 | """ 178 | Ensure that DatashaderRasterizer raises a ValueError when an invalid 179 | geopandas.GeoDataFrame without a geometry is passed in as input. 180 | 181 | Regression test for https://github.com/weiji14/zen3geo/pull/104. 182 | """ 183 | # GeoDataFrame with empty data 184 | gdf_none = geodataframe.loc[5:] 185 | gdf_none = gdf_none.set_crs(crs="OGC:CRS84") 186 | 187 | dp = IterableWrapper(iterable=[canvas]) 188 | dp_vector = IterableWrapper(iterable=[gdf_none]) 189 | dp_datashader = dp.rasterize_with_datashader(vector_datapipe=dp_vector) 190 | 191 | assert len(dp_datashader) == 1 192 | it = iter(dp_datashader) 193 | with pytest.raises(ValueError, match="Cannot infer spatialpandas geometry type"): 194 | raster = next(it) 195 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_geopandas.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for geopandas datapipes. 
3 | """ 4 | import numpy as np 5 | import pytest 6 | import xarray as xr 7 | from torchdata.datapipes.iter import IterableWrapper 8 | 9 | from zen3geo.datapipes import GeoPandasRectangleClipper 10 | 11 | gpd = pytest.importorskip("geopandas") 12 | shapely = pytest.importorskip("shapely") 13 | 14 | # %% 15 | @pytest.fixture(scope="module", name="geodataframe") 16 | def fixture_geodataframe(): 17 | """ 18 | A geopandas.GeoDataFrame containing a collection of shapely.geometry 19 | objects to use in the tests. 20 | """ 21 | geometries: list = [ 22 | shapely.geometry.box(minx=0.0, miny=0.0, maxx=2.0, maxy=2.0), 23 | shapely.geometry.box(minx=2.0, miny=2.0, maxx=4.0, maxy=4.0), 24 | ] 25 | geodataframe = gpd.GeoDataFrame(data={"geometry": geometries}) 26 | geodataframe = geodataframe.set_crs(crs="OGC:CRS84") 27 | 28 | return geodataframe 29 | 30 | 31 | @pytest.fixture(scope="function", name="dataset") 32 | def fixture_dataset(): 33 | """ 34 | The sample xarray.Dataset to use in the tests. 35 | """ 36 | dataarray = xr.DataArray( 37 | data=np.ones(shape=(1, 5, 7)), 38 | coords=dict( 39 | band=[0], 40 | y=np.linspace(start=4.0, stop=0.0, num=5), 41 | x=np.linspace(start=-1.0, stop=5, num=7), 42 | ), 43 | dims=("band", "y", "x"), 44 | name="foo", 45 | ) 46 | dataset: xr.Dataset = dataarray.to_dataset() 47 | dataset: xr.Dataset = dataset.rio.write_crs(input_crs="OGC:CRS84") 48 | 49 | return dataset 50 | 51 | 52 | # %% 53 | def test_geopandas_rectangle_clipper_geoseries_dataset(geodataframe, dataset): 54 | """ 55 | Ensure that GeoPandasRectangleClipper works to clip a geopandas.GeoSeries 56 | vector with xarray.Dataset rasters and outputs a tuple made up of a 57 | spatially subsetted geopandas.GeoSeries and an xarray.Dataset raster mask. 
58 | """ 59 | dp_vector = IterableWrapper(iterable=[geodataframe.geometry]) 60 | dp_raster = IterableWrapper( 61 | iterable=[ 62 | dataset.rio.clip_box(minx=-1, miny=0, maxx=1, maxy=1), 63 | dataset.rio.clip_box(minx=3, miny=3, maxx=5, maxy=4), 64 | ] 65 | ) 66 | 67 | # Using class constructors 68 | dp_clipped = GeoPandasRectangleClipper( 69 | source_datapipe=dp_vector, mask_datapipe=dp_raster 70 | ) 71 | # Using functional form (recommended) 72 | dp_clipped = dp_vector.clip_vector_with_rectangle(mask_datapipe=dp_raster) 73 | 74 | assert len(dp_clipped) == 2 75 | it = iter(dp_clipped) 76 | 77 | clipped_geoseries, raster_chip = next(it) 78 | assert clipped_geoseries.crs == "OGC:CRS84" 79 | assert all(clipped_geoseries.geom_type == "Polygon") 80 | assert clipped_geoseries.shape == (1,) 81 | assert clipped_geoseries[0].bounds == (0.0, 0.0, 1.5, 1.5) 82 | assert raster_chip.dims == {"band": 1, "y": 2, "x": 3} 83 | assert raster_chip.rio.bounds() == (-1.5, -0.5, 1.5, 1.5) 84 | 85 | clipped_geoseries, raster_chip = next(it) 86 | assert clipped_geoseries.shape == (1,) 87 | assert clipped_geoseries[1].bounds == (2.5, 2.5, 4.0, 4.0) 88 | assert raster_chip.dims == {"band": 1, "y": 2, "x": 3} 89 | assert raster_chip.rio.bounds() == (2.5, 2.5, 5.5, 4.5) 90 | assert raster_chip.rio.crs == "OGC:CRS84" 91 | 92 | 93 | def test_geopandas_rectangle_clipper_different_crs(geodataframe, dataset): 94 | """ 95 | Ensure that GeoPandasRectangleClipper works to clip a geopandas.GeoSeries 96 | vector with xarray.Dataset rasters which have different coordinate 97 | reference systems, and outputs a tuple made up of a spatially subsetted 98 | geopandas.GeoSeries and an xarray.Dataset raster mask that both have the 99 | same coordinate reference system. 
100 | """ 101 | dp_vector = IterableWrapper(iterable=[geodataframe.geometry]) 102 | 103 | dataset_3857 = dataset.rio.clip_box(minx=-1, miny=0, maxx=1, maxy=1).rio.reproject( 104 | "EPSG:3857" 105 | ) 106 | dataset_32631 = dataset.rio.clip_box(minx=3, miny=3, maxx=5, maxy=4).rio.reproject( 107 | "EPSG:32631" 108 | ) 109 | dp_raster = IterableWrapper(iterable=[dataset_3857, dataset_32631]) 110 | 111 | # Using class constructors 112 | dp_clipped = GeoPandasRectangleClipper( 113 | source_datapipe=dp_vector, mask_datapipe=dp_raster 114 | ) 115 | # Using functional form (recommended) 116 | dp_clipped = dp_vector.clip_vector_with_rectangle(mask_datapipe=dp_raster) 117 | 118 | assert len(dp_clipped) == 2 119 | it = iter(dp_clipped) 120 | 121 | clipped_geoseries, raster_chip = next(it) 122 | assert clipped_geoseries.crs == "EPSG:3857" 123 | assert all(clipped_geoseries.geom_type == "Polygon") 124 | assert clipped_geoseries.shape == (1,) 125 | assert clipped_geoseries[0].bounds == ( 126 | 0.0, 127 | 0.0, 128 | 166988.3675623712, 129 | 166998.31375292226, 130 | ) 131 | assert raster_chip.dims == {"band": 1, "y": 2, "x": 3} 132 | assert raster_chip.rio.bounds() == ( 133 | -166979.23618991036, 134 | -55646.75541526544, 135 | 166988.3675623712, 136 | 166998.31375292226, 137 | ) 138 | assert raster_chip.rio.crs == "EPSG:3857" 139 | 140 | clipped_geoseries, raster_chip = next(it) 141 | assert clipped_geoseries.crs == "EPSG:32631" 142 | assert clipped_geoseries.shape == (1,) 143 | assert clipped_geoseries[1].bounds == ( 144 | 444414.4114896285, 145 | 276009.81064532325, 146 | 611163.137304327, 147 | 442194.9725083875, 148 | ) 149 | assert raster_chip.dims == {"band": 1, "y": 2, "x": 3} 150 | assert raster_chip.rio.bounds() == ( 151 | 444414.4114896285, 152 | 276009.81064532325, 153 | 777205.5384580799, 154 | 497870.56195762416, 155 | ) 156 | assert raster_chip.rio.crs == "EPSG:32631" 157 | 158 | 159 | def test_geopandas_rectangle_clipper_incorrect_length(geodataframe, dataset): 160 
| """ 161 | Ensure that GeoPandasRectangleClipper raises a NotImplementedError when the 162 | length of the vector datapipe is not equal to 1. 163 | """ 164 | dp_vector = IterableWrapper(iterable=[geodataframe, geodataframe]) 165 | dp_raster = IterableWrapper(iterable=[dataset, dataset, dataset]) 166 | 167 | with pytest.raises(NotImplementedError, match="The vector datapipe's length can"): 168 | dp_clipped = dp_vector.clip_vector_with_rectangle(mask_datapipe=dp_raster) 169 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_pyogrio.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for pyogrio datapipes. 3 | """ 4 | import pytest 5 | from torchdata.datapipes.iter import IterableWrapper 6 | 7 | from zen3geo.datapipes import PyogrioReader 8 | 9 | pyogrio = pytest.importorskip("pyogrio") 10 | 11 | # %% 12 | def test_pyogrio_reader(): 13 | """ 14 | Ensure that PyogrioReader works to read in a GeoPackage file and outputs a 15 | geopandas.GeoDataFrame object. 16 | """ 17 | file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg" 18 | dp = IterableWrapper(iterable=[file_url]) 19 | 20 | # Using class constructors 21 | dp_pyogrio = PyogrioReader(source_datapipe=dp) 22 | # Using functional form (recommended) 23 | dp_pyogrio = dp.read_from_pyogrio() 24 | 25 | assert len(dp_pyogrio) == 1 26 | it = iter(dp_pyogrio) 27 | geodataframe = next(it) 28 | 29 | assert geodataframe.shape == (4, 12) 30 | assert any(geodataframe.isna()) 31 | assert all(geodataframe.geom_type == "Point") 32 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_pystac.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for pystac datapipes. 
3 | """ 4 | import pytest 5 | from torchdata.datapipes.iter import IterableWrapper 6 | 7 | from zen3geo.datapipes import PySTACItemReader 8 | 9 | pystac = pytest.importorskip("pystac") 10 | 11 | # %% 12 | def test_pystac_item_reader(): 13 | """ 14 | Ensure that PySTACItemReader works to read in a JSON STAC item and outputs 15 | to a pystac.Item object. 16 | """ 17 | item_url: str = "https://github.com/stac-utils/pystac/raw/v1.6.1/tests/data-files/item/sample-item.json" 18 | dp = IterableWrapper(iterable=[item_url]) 19 | 20 | # Using class constructors 21 | dp_pystac = PySTACItemReader(source_datapipe=dp) 22 | # Using functional form (recommended) 23 | dp_pystac = dp.read_to_pystac_item() 24 | 25 | assert len(dp_pystac) == 1 26 | it = iter(dp_pystac) 27 | stac_item = next(it) 28 | 29 | assert stac_item.bbox == [-122.59750209, 37.48803556, -122.2880486, 37.613537207] 30 | assert stac_item.datetime.isoformat() == "2016-05-03T13:22:30.040000+00:00" 31 | assert stac_item.geometry["type"] == "Polygon" 32 | assert stac_item.properties == { 33 | "datetime": "2016-05-03T13:22:30.040000Z", 34 | "title": "A CS3 item", 35 | "license": "PDDL-1.0", 36 | "providers": [ 37 | { 38 | "name": "CoolSat", 39 | "roles": ["producer", "licensor"], 40 | "url": "https://cool-sat.com/", 41 | } 42 | ], 43 | } 44 | assert ( 45 | stac_item.assets["analytic"].extra_fields["product"] 46 | == "http://cool-sat.com/catalog/products/analytic.json" 47 | ) 48 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_pystac_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for pystac-client datapipes. 
3 | """ 4 | import pytest 5 | from torchdata.datapipes.iter import IterableWrapper 6 | 7 | from zen3geo.datapipes import PySTACAPIItemLister, PySTACAPISearcher 8 | 9 | pystac_client = pytest.importorskip("pystac_client") 10 | 11 | 12 | # %% 13 | def test_pystac_client_item_search(): 14 | """ 15 | Ensure that PySTACAPISearcher works to query a STAC API /search/ endpoint 16 | and outputs a pystac_client.ItemSearch object. 17 | """ 18 | query: dict = dict( 19 | bbox=[150.9, -34.36, 151.3, -33.46], 20 | datetime=["2000-01-01T00:00:00Z", "2020-12-31T00:00:00Z"], 21 | collections=["nidem"], 22 | ) 23 | dp = IterableWrapper(iterable=[query]) 24 | 25 | # Using class constructors 26 | dp_pystac_client = PySTACAPISearcher( 27 | source_datapipe=dp, catalog_url="https://explorer.sandbox.dea.ga.gov.au/stac/" 28 | ) 29 | # Using functional form (recommended) 30 | dp_pystac_client = dp.search_for_pystac_item( 31 | catalog_url="https://explorer.sandbox.dea.ga.gov.au/stac/" 32 | ) 33 | 34 | assert len(dp_pystac_client) == 1 35 | it = iter(dp_pystac_client) 36 | stac_item_search = next(it) 37 | assert stac_item_search.client.title == "AWS Explorer" 38 | assert stac_item_search.matched() == 2 39 | 40 | stac_items = list(stac_item_search.items()) 41 | stac_item = stac_items[0] 42 | 43 | assert stac_item.bbox == [ 44 | 149.965907628116, 45 | -35.199398016548116, 46 | 152.1053101683708, 47 | -32.97280658665687, 48 | ] 49 | assert stac_item.datetime.isoformat() == "2001-07-02T00:00:00+00:00" 50 | assert stac_item.geometry["type"] == "Polygon" 51 | assert stac_item.properties == { 52 | "title": "NIDEM_104_151.29_-34.22", 53 | "created": "2018-10-15T10:00:00Z", 54 | "proj:epsg": 4326, 55 | "datetime": "2001-07-02T00:00:00Z", 56 | "cubedash:region_code": None, 57 | } 58 | assert stac_item.assets["nidem"].extra_fields["eo:bands"] == [{"name": "nidem"}] 59 | 60 | 61 | def test_pystac_client_item_search_open_headers(): 62 | """ 63 | Ensure that PySTACAPISearcher works to query a STAC API 
/search/ endpoint 64 | with headers passed to pystac_client.Client.open. 65 | """ 66 | query: dict = dict( 67 | bbox=[150.9, -34.36, 151.3, -33.46], 68 | datetime=["2020-01-01T00:00:00Z", "2022-12-31T00:00:00Z"], 69 | collections=["HLSS30.v2.0"], 70 | ) 71 | dp = IterableWrapper(iterable=[query]) 72 | 73 | # Using class constructors 74 | dp_pystac_client = PySTACAPISearcher( 75 | source_datapipe=dp, 76 | catalog_url="https://cmr.earthdata.nasa.gov/cloudstac/LPCLOUD", 77 | headers={"Authorization": "Bearer "}, 78 | ) 79 | # Using functional form (recommended) 80 | dp_pystac_client = dp.search_for_pystac_item( 81 | catalog_url="https://cmr.earthdata.nasa.gov/cloudstac/LPCLOUD", 82 | headers={"Authorization": "Bearer "}, 83 | ) 84 | 85 | assert len(dp_pystac_client) == 1 86 | it = iter(dp_pystac_client) 87 | stac_item_search = next(it) 88 | assert stac_item_search.client.title == "LPCLOUD" 89 | assert stac_item_search.client.description == "Root catalog for LPCLOUD" 90 | 91 | 92 | def test_pystac_client_item_lister(): 93 | """ 94 | Ensure that PySTACAPIItemLister works to yield pystac.Item instances for 95 | each item matching the given search parameters in a 96 | pystac_client.ItemSearch query. 
97 | """ 98 | catalog = pystac_client.Client.open( 99 | url="https://earth-search.aws.element84.com/v1/" 100 | ) 101 | search = catalog.search( 102 | bbox=[134.2, 6.9, 134.8, 8.5], 103 | datetime=["2023-01-01T00:00:00Z", "2023-01-31T00:00:00Z"], 104 | collections=["sentinel-2-l1c"], 105 | ) 106 | dp = IterableWrapper(iterable=[search]) 107 | 108 | # Using class constructors 109 | dp_pystac_item_list = PySTACAPIItemLister(source_datapipe=dp) 110 | # Using functional form (recommended) 111 | dp_pystac_item_list = dp.list_pystac_items_by_search() 112 | 113 | assert len(dp_pystac_item_list) == 14 114 | it = iter(dp_pystac_item_list) 115 | stac_item = next(it) 116 | assert stac_item.bbox == [ 117 | 134.093840347073, 118 | 6.2442879900058115, 119 | 135.08840137750929, 120 | 7.237809826458827, 121 | ] 122 | assert stac_item.datetime.isoformat() == "2023-01-29T01:35:24.640000+00:00" 123 | assert stac_item.geometry["type"] == "Polygon" 124 | assert stac_item.properties == { 125 | "created": "2023-01-29T06:01:33.679Z", 126 | "platform": "sentinel-2b", 127 | "constellation": "sentinel-2", 128 | "instruments": ["msi"], 129 | "eo:cloud_cover": 92.7676417582305, 130 | "proj:epsg": 32653, 131 | "mgrs:utm_zone": 53, 132 | "mgrs:latitude_band": "N", 133 | "mgrs:grid_square": "MH", 134 | "grid:code": "MGRS-53NMH", 135 | "view:sun_azimuth": 135.719785438016, 136 | "view:sun_elevation": 55.1713941690268, 137 | "s2:degraded_msi_data_percentage": 0.2816, 138 | "s2:product_type": "S2MSI1C", 139 | "s2:processing_baseline": "05.09", 140 | "s2:product_uri": "S2B_MSIL1C_20230129T013449_N0509_R031_T53NMH_20230129T025811.SAFE", 141 | "s2:generation_time": "2023-01-29T02:58:11.000000Z", 142 | "s2:datatake_id": "GS2B_20230129T013449_030802_N05.09", 143 | "s2:datatake_type": "INS-NOBS", 144 | "s2:datastrip_id": "S2B_OPER_MSI_L1C_DS_2BPS_20230129T025811_S20230129T013450_N05.09", 145 | "s2:granule_id": "S2B_OPER_MSI_L1C_TL_2BPS_20230129T025811_A030802_T53NMH_N05.09", 146 | 
"s2:reflectance_conversion_factor": 1.03193080888673, 147 | "datetime": "2023-01-29T01:35:24.640000Z", 148 | "s2:sequence": "0", 149 | "earthsearch:s3_path": "s3://earthsearch-data/sentinel-2-l1c/53/N/MH/2023/1/S2B_53NMH_20230129_0_L1C", 150 | "earthsearch:payload_id": "roda-sentinel2/workflow-sentinel2-to-stac/15626e44fb54c2182e5ed5d3aec4a209", 151 | "processing:software": {"sentinel2-to-stac": "0.1.0"}, 152 | "updated": "2023-01-29T06:01:33.679Z", 153 | } 154 | assert stac_item.assets["visual"].extra_fields["eo:bands"] == [ 155 | { 156 | "name": "red", 157 | "common_name": "red", 158 | "description": "Red (band 4)", 159 | "center_wavelength": 0.665, 160 | "full_width_half_max": 0.038, 161 | }, 162 | { 163 | "name": "green", 164 | "common_name": "green", 165 | "description": "Green (band 3)", 166 | "center_wavelength": 0.56, 167 | "full_width_half_max": 0.045, 168 | }, 169 | { 170 | "name": "blue", 171 | "common_name": "blue", 172 | "description": "Blue (band 2)", 173 | "center_wavelength": 0.49, 174 | "full_width_half_max": 0.098, 175 | }, 176 | ] 177 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_rioxarray.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for rioxarray datapipes. 3 | """ 4 | from torchdata.datapipes.iter import IterableWrapper 5 | 6 | from zen3geo.datapipes import RioXarrayReader 7 | 8 | 9 | # %% 10 | def test_rioxarray_reader(): 11 | """ 12 | Ensure that RioXarrayReader works to read in a GeoTIFF file and outputs an 13 | xarray.DataArray object. 
14 | """ 15 | file_url: str = "https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif" 16 | dp = IterableWrapper(iterable=[file_url]) 17 | 18 | # Using class constructors 19 | dp_rioxarray = RioXarrayReader(source_datapipe=dp) 20 | # Using functional form (recommended) 21 | dp_rioxarray = dp.read_from_rioxarray() 22 | 23 | assert len(dp_rioxarray) == 1 24 | it = iter(dp_rioxarray) 25 | dataarray = next(it) 26 | 27 | assert dataarray.shape == (1, 960, 1920) 28 | assert dataarray.dims == ("band", "y", "x") 29 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_stackstac.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for stackstac datapipes. 3 | """ 4 | import numpy as np 5 | import pytest 6 | import xarray as xr 7 | from torchdata.datapipes.iter import IterableWrapper 8 | 9 | from zen3geo.datapipes import StackSTACStacker 10 | 11 | pystac = pytest.importorskip("pystac") 12 | stackstac = pytest.importorskip("stackstac") 13 | 14 | # %% 15 | def test_stackstac_mosaicker(): 16 | """ 17 | Ensure that StackSTACMosaicker works to mosaic tiles within a 4D 18 | xarray.DataArray to a 3D xarray.DataArray. 19 | """ 20 | datacube: xr.DataArray = xr.DataArray( 21 | data=np.ones(shape=(3, 1, 32, 32)), dims=["tile", "band", "y", "x"] 22 | ) 23 | dataarray = stackstac.mosaic(arr=datacube, dim="tile") 24 | assert dataarray.sizes == {"band": 1, "y": 32, "x": 32} 25 | assert dataarray.sum() == 1 * 32 * 32 26 | 27 | 28 | def test_stackstac_stacker(): 29 | """ 30 | Ensure that StackSTACStacker works to stack multiple bands within a STAC 31 | item and outputs an xarray.DataArray object. 
32 | """ 33 | item_url: str = "https://github.com/stac-utils/pystac/raw/v1.6.1/tests/data-files/raster/raster-sentinel2-example.json" 34 | stac_item = pystac.Item.from_file(href=item_url) 35 | dp = IterableWrapper(iterable=[stac_item]) 36 | 37 | # Using class constructors 38 | dp_stackstac = StackSTACStacker(source_datapipe=dp, assets=["B02", "B03", "B04"]) 39 | # Using functional form (recommended) 40 | dp_stackstac = dp.stack_stac_items(assets=["B02", "B03", "B04"]) 41 | 42 | assert len(dp_stackstac) == 1 43 | it = iter(dp_stackstac) 44 | dataarray = next(it) 45 | 46 | assert dataarray.shape == (1, 3, 10980, 10980) 47 | assert dataarray.dims == ("time", "band", "y", "x") 48 | assert dataarray.rio.bounds() == (399955.0, 4090205.0, 509755.0, 4200005.0) 49 | assert dataarray.rio.resolution() == (10.0, -10.0) 50 | assert dataarray.rio.crs == "EPSG:32633" 51 | -------------------------------------------------------------------------------- /zen3geo/tests/test_datapipes_xbatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for xbatcher datapipes. 3 | """ 4 | import numpy as np 5 | import pytest 6 | import xarray as xr 7 | from torchdata.datapipes.iter import IterableWrapper 8 | 9 | from zen3geo.datapipes import XbatcherSlicer 10 | 11 | xbatcher = pytest.importorskip("xbatcher") 12 | 13 | 14 | # %% 15 | def test_xbatcher_slicer_dataarray(): 16 | """ 17 | Ensure that XbatcherSlicer works to slice an xarray.DataArray object and 18 | outputs a smaller xarray.DataArray chip. 
19 |     """
20 | 
21 |     dataarray: xr.DataArray = xr.DataArray(
22 |         data=np.ones(shape=(3, 128, 128)), dims=["band", "y", "x"]
23 |     ).chunk({"band": 1})
24 |     dp = IterableWrapper(iterable=[dataarray])
25 | 
26 |     # Using class constructors
27 |     dp_xbatcher = XbatcherSlicer(source_datapipe=dp, input_dims={"y": 64, "x": 64})
28 |     # Using functional form (recommended)
29 |     dp_xbatcher = dp.slice_with_xbatcher(input_dims={"y": 64, "x": 64})
30 | 
31 |     assert len(dp_xbatcher) == 4
32 |     it = iter(dp_xbatcher)
33 |     dataarray_chip = next(it)
34 | 
35 |     assert dataarray_chip.sizes == {"band": 3, "y": 64, "x": 64}
36 |     assert dataarray_chip.sum() == 3 * 64 * 64
37 | 
38 | 
39 | def test_xbatcher_slicer_dataset():
40 |     """
41 |     Ensure that XbatcherSlicer works to slice an xarray.Dataset object and
42 |     outputs a smaller xarray.Dataset chip.
43 |     """
44 | 
45 |     dataset: xr.Dataset = xr.Dataset(
46 |         data_vars={"temperature": (["x", "y"], 15 * np.ones(shape=(32, 32)))},
47 |         coords={
48 |             "lon": (["x"], np.linspace(start=0, stop=32, num=32)),
49 |             "lat": (["y"], np.linspace(start=64, stop=32, num=32)),
50 |         },
51 |     )
52 |     dp = IterableWrapper(iterable=[dataset])
53 | 
54 |     # Using class constructors
55 |     dp_xbatcher = XbatcherSlicer(source_datapipe=dp, input_dims={"y": 16, "x": 16})
56 |     # Using functional form (recommended)
57 |     dp_xbatcher = dp.slice_with_xbatcher(input_dims={"y": 16, "x": 16})
58 | 
59 |     assert len(dp_xbatcher) == 4
60 |     it = iter(dp_xbatcher)
61 |     dataset_chip = next(it)
62 | 
63 |     assert dataset_chip.temperature.sizes == {"y": 16, "x": 16}
64 |     assert dataset_chip.temperature.sum() == 15 * 16 * 16
65 | 
--------------------------------------------------------------------------------
/zen3geo/tests/test_datapipes_xpystac.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for xpystac datapipes.
3 | """ 4 | import pytest 5 | from torchdata.datapipes.iter import IterableWrapper 6 | 7 | from zen3geo.datapipes import XpySTACAssetReader 8 | 9 | 10 | # %% 11 | def test_xpystac_asset_reader_cog(): 12 | """ 13 | Ensure that XpySTACAssetReader works to read in a pystac.Asset object 14 | stored as a Cloud-Optimized GeoTIFF and output to an xarray.Dataset object. 15 | """ 16 | pystac = pytest.importorskip("pystac") 17 | xpystac = pytest.importorskip("xpystac") 18 | 19 | item_url: str = "https://github.com/stac-utils/pystac/raw/v1.7.1/tests/data-files/raster/raster-sentinel2-example.json" 20 | asset: pystac.Asset = pystac.Item.from_file(href=item_url).assets["overview"] 21 | assert asset.media_type == pystac.MediaType.COG 22 | 23 | dp = IterableWrapper(iterable=[asset]) 24 | 25 | # Using class constructors 26 | dp_xpystac = XpySTACAssetReader(source_datapipe=dp) 27 | # Using functional form (recommended) 28 | dp_xpystac = dp.read_from_xpystac() 29 | 30 | assert len(dp_xpystac) == 1 31 | it = iter(dp_xpystac) 32 | dataset = next(it) 33 | 34 | assert dataset.sizes == {"band": 3, "x": 343, "y": 343} 35 | assert dataset.band_data.dtype == "float32" 36 | assert dataset.rio.bounds() == (399960.0, 4090240.0, 509720.0, 4200000.0) 37 | assert dataset.rio.resolution() == (320.0, -320.0) 38 | assert dataset.rio.crs == "EPSG:32633" 39 | 40 | 41 | def test_xpystac_asset_reader_zarr(): 42 | """ 43 | Ensure that XpySTACAssetReader works to read in a pystac.Asset object 44 | stored as a Zarr file and output to an xarray.Dataset object. 
45 | """ 46 | pystac = pytest.importorskip("pystac") 47 | xpystac = pytest.importorskip("xpystac") 48 | 49 | collection_url: str = "https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-daily-hi" 50 | asset: pystac.Asset = pystac.Collection.from_file(href=collection_url).assets[ 51 | "zarr-https" 52 | ] 53 | assert asset.media_type == "application/vnd+zarr" 54 | 55 | dp = IterableWrapper(iterable=[asset]) 56 | 57 | # Using class constructors 58 | dp_xpystac = XpySTACAssetReader(source_datapipe=dp) 59 | # Using functional form (recommended) 60 | dp_xpystac = dp.read_from_xpystac() 61 | 62 | assert len(dp_xpystac) == 1 63 | it = iter(dp_xpystac) 64 | dataset = next(it) 65 | 66 | assert dataset.sizes == {"time": 14965, "y": 584, "x": 284, "nv": 2} 67 | assert dataset.prcp.dtype == "float32" 68 | assert dataset.rio.bounds() == (-5802750.0, -622500.0, -5518750.0, -38500.0) 69 | assert dataset.rio.resolution() == (1000.0, -1000.0) 70 | assert dataset.rio.grid_mapping == "lambert_conformal_conic" 71 | 72 | 73 | def test_xpystac_asset_reader_geotiff_without_xpystac(): 74 | """ 75 | Ensure that XpySTACAssetReader works to read in a GeoTIFF file and output 76 | to an xarray.Dataset object, even when xpystac is not installed. 77 | 78 | Note that `engine="rasterio"` has been removed in xarray v2023.04.0, see 79 | https://github.com/pydata/xarray/pull/7671. So, this test will need to be 80 | updated once we change to require an xarray verson greater than 2023.04.0. 81 | Only included this test to check an alternative to `engine="stac"` that 82 | did not require installing extra required dependencies like `netcdf4` or 83 | `h5netcdf`. 
84 | """ 85 | tif_url: str = "https://github.com/corteva/rioxarray/raw/0.14.1/test/test_data/input/cint16.tif" 86 | 87 | dp = IterableWrapper(iterable=[tif_url]) 88 | 89 | # Using class constructors 90 | dp_xpystac = XpySTACAssetReader(source_datapipe=dp, engine="rasterio") 91 | # Using functional form (recommended) 92 | dp_xpystac = dp.read_from_xpystac(engine="rasterio") 93 | 94 | assert len(dp_xpystac) == 1 95 | it = iter(dp_xpystac) 96 | dataset = next(it) 97 | 98 | assert dataset.sizes == {"band": 1, "x": 100, "y": 100} 99 | assert dataset.band_data.dtype == "complex64" 100 | assert dataset.rio.bounds() == (0.0, 100.0, 100.0, 0.0) 101 | assert dataset.rio.resolution() == (1.0, 1.0) 102 | assert dataset.rio.crs == "EPSG:4326" 103 | -------------------------------------------------------------------------------- /zen3geo/tests/test_zen3geo.py: -------------------------------------------------------------------------------- 1 | from packaging.version import Version 2 | 3 | from zen3geo import __version__ 4 | 5 | 6 | def test_version(): 7 | assert Version(version=__version__) >= Version(version="0.0.0") 8 | --------------------------------------------------------------------------------