├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── nightly-build.yaml
│       ├── publish-book.yaml
│       ├── trigger-book-build.yaml
│       ├── trigger-delete-preview.yaml
│       ├── trigger-link-check.yaml
│       └── trigger-preview.yaml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── _config.yml
├── _gallery_info.yml
├── _static
│   ├── custom.css
│   └── footer-logo-nsf.png
├── _templates
│   └── footer-extra.html
├── _toc.yml
├── environment.yml
├── notebooks
│   ├── 0.0_Intro_Landsat.ipynb
│   ├── 1.0_Data_Ingestion-Geospatial.ipynb
│   ├── 1.1_Data_Ingestion-General.ipynb
│   ├── 2.0_Spectral_Clustering_PC.ipynb
│   ├── data
│   │   ├── catalog.yml
│   │   ├── landsat5_bands.csv
│   │   ├── landsat5_crop.nc
│   │   └── landsat8_bands.csv
│   └── images
│       ├── L-Next-SpectralBands-stack.png
│       ├── ProjectPythia_Logo_Final-01-Blue.svg
│       ├── icons
│       │   └── favicon.ico
│       ├── intake_landsat.png
│       ├── landsat_8_rend-sm1.png
│       ├── landsat_timeline.png
│       ├── logos
│       │   ├── NSF-NCAR_Lockup-UCAR-Dark_102523.svg
│       │   ├── UAlbany-A2-logo-purple-gold.svg
│       │   ├── Unidata_logo_horizontal_1200x300.svg
│       │   ├── pythia_logo-white-notext.svg
│       │   └── pythia_logo-white-rtext.svg
│       ├── nasa_bands.png
│       ├── nasa_landsat8.jpg
│       ├── planetary_computer_header_800w.png
│       ├── pystac.png
│       ├── spectral_clustering.png
│       └── spectral_clustering_lake.png
└── thumbnail.png

/.github/dependabot.yml:
--------------------------------------------------------------------------------
version: 2
updates:
  # - package-ecosystem: pip
  #   directory: "/"
  #   schedule:
  #     interval: daily
  - package-ecosystem: 'github-actions'
    directory: '/'
    schedule:
      # Check for updates once a week
      interval: 'weekly'

--------------------------------------------------------------------------------
/.github/workflows/nightly-build.yaml:
--------------------------------------------------------------------------------
name: nightly-build

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *' # Daily "At 00:00"

jobs:
  build:
    if: ${{ github.repository_owner == 'ProjectPythia' }}
    uses: ProjectPythia/cookbook-actions/.github/workflows/build-book.yaml@main
    with:
      environment_name: cookbook-dev

  link-check:
    if: ${{ github.repository_owner == 'ProjectPythia' }}
    uses: ProjectPythia/cookbook-actions/.github/workflows/link-checker.yaml@main

--------------------------------------------------------------------------------
/.github/workflows/publish-book.yaml:
--------------------------------------------------------------------------------
name: publish-book

on:
  # Trigger the workflow on push to main branch
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  build:
    uses: ProjectPythia/cookbook-actions/.github/workflows/build-book.yaml@main
    with:
      environment_name: cookbook-dev

  deploy:
    needs: build
    uses: ProjectPythia/cookbook-actions/.github/workflows/deploy-book.yaml@main

--------------------------------------------------------------------------------
/.github/workflows/trigger-book-build.yaml:
--------------------------------------------------------------------------------
name: trigger-book-build
on:
  pull_request:

jobs:
  build:
    uses: ProjectPythia/cookbook-actions/.github/workflows/build-book.yaml@main
    with:
      environment_name: cookbook-dev
      artifact_name: book-zip-${{ github.event.number }}
      # Other input options are possible, see ProjectPythia/cookbook-actions/.github/workflows/build-book.yaml
--------------------------------------------------------------------------------
/.github/workflows/trigger-delete-preview.yaml:
--------------------------------------------------------------------------------
name: trigger-delete-preview

on:
  pull_request_target:
    types: closed

jobs:
  delete:
    uses: ProjectPythia/cookbook-actions/.github/workflows/delete-preview.yaml@main

--------------------------------------------------------------------------------
/.github/workflows/trigger-link-check.yaml:
--------------------------------------------------------------------------------
name: trigger-link-check
on:
  pull_request:

jobs:
  link-check:
    uses: ProjectPythia/cookbook-actions/.github/workflows/link-checker.yaml@main

--------------------------------------------------------------------------------
/.github/workflows/trigger-preview.yaml:
--------------------------------------------------------------------------------
name: trigger-preview
on:
  workflow_run:
    workflows:
      - trigger-book-build
    types:
      - requested
      - completed

jobs:
  find-pull-request:
    uses: ProjectPythia/cookbook-actions/.github/workflows/find-pull-request.yaml@main
  deploy-preview:
    needs: find-pull-request
    if: github.event.workflow_run.conclusion == 'success'
    uses: ProjectPythia/cookbook-actions/.github/workflows/deploy-book.yaml@main
    with:
      artifact_name: book-zip-${{ needs.find-pull-request.outputs.number }}
      destination_dir: _preview/${{ needs.find-pull-request.outputs.number }} # deploy to subdirectory labeled with PR number
      is_preview: 'true'

  preview-comment:
    needs: find-pull-request
    uses: ProjectPythia/cookbook-actions/.github/workflows/preview-comment.yaml@main
    with:
      pull_request_number: ${{ needs.find-pull-request.outputs.number }}
      sha: ${{ needs.find-pull-request.outputs.sha }}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
_build/
notebooks/_build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# DS store
**/.DS_Store

# IDEs
.vscode/

--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
cff-version: 1.2.0
message: "If you use this cookbook, please cite it as below."
authors:
  # add additional entries for each author -- see https://github.com/citation-file-format/citation-file-format/blob/main/schema-guide.md
  - family-names: Roumis
    given-names: Demetris
    website: https://github.com/droumis
    orcid: https://orcid.org/0000-0003-4670-1657
  - name: "Landsat ML Cookbook contributors" # use the 'name' field to acknowledge organizations
    website: "https://github.com/ProjectPythia/landsat-ml-cookbook/graphs/contributors"
title: "Landsat ML Cookbook"
abstract: "Machine learning on Landsat data."

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity.
      For the purposes of this definition, "control" means (i) the power,
      direct or indirect, to cause the direction or management of such
      entity, whether by contract or otherwise, or (ii) ownership of
      fifty percent (50%) or more of the outstanding shares, or
      (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License.
      Subject to the terms and conditions of this License, each Contributor
      hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
      royalty-free, irrevocable (except as stated in this section) patent
      license to make, have made, use, offer to sell, sell, import, and
      otherwise transfer the Work, where such license applies only to those
      patent claims licensable by such Contributor that are necessarily
      infringed by their Contribution(s) alone or by combination of their
      Contribution(s) with the Work to which such Contribution(s) was
      submitted. If You institute patent litigation against any entity
      (including a cross-claim or counterclaim in a lawsuit) alleging that
      the Work or a Contribution incorporated within the Work constitutes
      direct or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![Landsat 8](./notebooks/images/nasa_landsat8.jpg "Landsat 8")

# Landsat ML Cookbook

[![nightly-build](https://github.com/ProjectPythia/landsat-ml-cookbook/actions/workflows/nightly-build.yaml/badge.svg)](https://github.com/ProjectPythia/landsat-ml-cookbook/actions/workflows/nightly-build.yaml)
[![Binder](http://binder.projectpythia.org/badge_logo.svg)](http://binder.projectpythia.org/v2/gh/ProjectPythia/landsat-ml-cookbook/main?labpath=notebooks)
[![DOI](https://zenodo.org/badge/563445694.svg)](https://zenodo.org/badge/latestdoi/563445694)

This Project Pythia Cookbook covers the essential materials for working with Landsat data in the context of machine learning workflows.

## Motivation

Once you complete this cookbook, you will have the skills to access, resample, regrid, reshape, and rescale satellite data, as well as the foundation for applying machine learning to it. You will also learn how to interactively visualize your data at every step in the process.

## Authors

[Demetris Roumis](https://github.com/droumis)
[Andrew Huang](https://github.com/ahuang11)

### Contributors

This cookbook was initially inspired by the [EarthML](https://github.com/pyviz-topics/EarthML) project. See a list of the EarthML contributors [here](https://github.com/pyviz-topics/EarthML/graphs/contributors).

## Structure
This cookbook is broken up into two main sections - "Foundations" and "Example Workflows."

### Foundations
The foundational content includes:
- Start Here - Introduction to Landsat data.
- Data Ingestion - Geospatial-Specific Tooling - Demonstrating a method for loading and accessing Landsat data from Microsoft's Planetary Computer platform with tooling from pystac and odc.
- Data Ingestion - General Purpose Tooling - Demonstrating approaches for domain-independent data access using Intake.

### Example Workflows
Example workflows include:
- Spectral Clustering - Demonstrating a machine learning approach to cluster pixels of satellite data and comparing cluster results across time.

## Running the Notebooks
You can either run the notebooks using [Binder](https://binder.projectpythia.org/) or on your local machine.

### Running on Binder

The simplest way to interact with a Jupyter Notebook is through
[Binder](https://binder.projectpythia.org/), which enables the execution of a
[Jupyter Book](https://jupyterbook.org) in the cloud. The details of how this works are not
important for now. All you need to know is how to launch a Pythia
Cookbook chapter via Binder.
Simply navigate your mouse to
the top right corner of the book chapter you are viewing, click
on the rocket ship icon (see the figure below), and be sure to select
“launch Binder”. After a moment you should be presented with a
notebook that you can interact with. That is, you'll be able to execute
and even change the example programs. You'll see that the code cells
have no output at first, until you execute them by pressing
{kbd}`Shift`\+{kbd}`Enter`. Complete details on how to interact with
a live Jupyter notebook are described in [Getting Started with
Jupyter](https://foundations.projectpythia.org/foundations/getting-started-jupyter.html).

### Running on Your Own Machine
If you are interested in running this material locally on your computer, you will need to follow this workflow:

1. Clone the Landsat ML Cookbook repository:

   ```bash
    git clone https://github.com/ProjectPythia/landsat-ml-cookbook.git
   ```
1. Move into the `landsat-ml-cookbook` directory
   ```bash
   cd landsat-ml-cookbook
   ```
1. Create and activate your conda environment from the `environment.yml` file
   ```bash
   conda env create -f environment.yml
   conda activate landsat-ml-cookbook
   ```
1. Move into the `notebooks` directory and start up JupyterLab
   ```bash
   cd notebooks/
   jupyter lab
   ```

--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
# Book settings
# Learn more at https://jupyterbook.org/customize/config.html

title: Landsat ML Cookbook
author: Demetris Roumis
logo: notebooks/images/logos/pythia_logo-white-rtext.svg
copyright: '2024'

execute:
  # To execute notebooks via a binder instead, replace 'cache' with 'binder'
  execute_notebooks: force
  timeout: 600
  allow_errors: True

# Add a few extensions to help with parsing content
parse:
  myst_enable_extensions: # default extensions to enable in the myst parser. See https://myst-parser.readthedocs.io/en/latest/using/syntax-optional.html
    - amsmath
    - colon_fence
    - deflist
    - dollarmath
    - html_admonition
    - html_image
    - replacements
    - smartquotes
    - substitution

sphinx:
  config:
    html_favicon: notebooks/images/icons/favicon.ico
    html_last_updated_fmt: '%-d %B %Y'
    html_theme: sphinx_pythia_theme
    html_permalinks_icon: ''
    html_theme_options:
      home_page_in_toc: true
      repository_url: https://github.com/ProjectPythia/landsat-ml-cookbook/ # Online location of your book
      repository_branch: main # Which branch of the repository should be used when creating links (optional)
      use_issues_button: true
      use_repository_button: true
      use_edit_page_button: true
      use_fullscreen_button: true
      analytics:
        google_analytics_id: G-T52X8HNYE8
      github_url: https://github.com/ProjectPythia
      twitter_url: https://twitter.com/project_pythia
      icon_links:
        - name: YouTube
          url: https://www.youtube.com/channel/UCoZPBqJal5uKpO8ZiwzavCw
          icon: fab fa-youtube-square
          type: fontawesome
      launch_buttons:
        binderhub_url: https://binder.projectpythia.org
        notebook_interface: jupyterlab
      logo:
        link: https://projectpythia.org
      navbar_start:
        - navbar-logo
      navbar_end:
        - navbar-icon-links
      navbar_links:
        - name: Home
          url: https://projectpythia.org
        - name: Foundations
          url: https://foundations.projectpythia.org
        - name: Cookbooks
          url: https://cookbooks.projectpythia.org
        - name: Resources
          url: https://projectpythia.org/resource-gallery.html
        - name: Community
          url: https://projectpythia.org/index.html#join-us
      footer_logos:
        NCAR: notebooks/images/logos/NSF-NCAR_Lockup-UCAR-Dark_102523.svg
        Unidata: notebooks/images/logos/Unidata_logo_horizontal_1200x300.svg
        UAlbany: notebooks/images/logos/UAlbany-A2-logo-purple-gold.svg
      footer_start:
        - footer-logos
        - footer-info
        - footer-extra

--------------------------------------------------------------------------------
/_gallery_info.yml:
--------------------------------------------------------------------------------
thumbnail: thumbnail.png
tags:
  domains:
    - satellite
    - ml
    - climate
  packages:
    - hvPlot
    - intake
    - xarray
    - dask

--------------------------------------------------------------------------------
/_static/custom.css:
--------------------------------------------------------------------------------
.bd-main .bd-content .bd-article-container {
  max-width: 100%; /* default is 60em */
}
.bd-page-width {
  max-width: 100%; /* default is 88rem */
}

--------------------------------------------------------------------------------
/_static/footer-logo-nsf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/_static/footer-logo-nsf.png

--------------------------------------------------------------------------------
/_templates/footer-extra.html:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/_toc.yml:
--------------------------------------------------------------------------------
format: jb-book
root: README
parts:
  - caption: Preamble
    chapters:
      - file: notebooks/how-to-cite
  - caption: Foundations
    chapters:
      - file: notebooks/0.0_Intro_Landsat
      - file: notebooks/1.0_Data_Ingestion-Geospatial
      - file: notebooks/1.1_Data_Ingestion-General
  - caption: Example Workflows
    chapters:
      - file: notebooks/2.0_Spectral_Clustering_PC

--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: landsat-ml-cookbook
channels:
  - nodefaults
  - conda-forge
dependencies:
  - python=3.10
  - shapely<2.0.0
  - pandas
  - xarray-datatree
  - planetary-computer
  - pystac
  - pystac-client
  - odc-stac
  - ipykernel
  - hvplot
  - panel<1.4.0
  - geoviews
  - datashader
  - colorcet
  - intake-xarray
  - xarray<2023.04
  - bokeh<3.4.0
  - dask
  - dask-ml
  - numpy
  - cartopy
  - rioxarray
  - rasterio
  - s3fs
  - jupyter-book
  - jupyterlab
  - jupyter_server<2
  - pyopenssl>22
  - adlfs
  - pip
  - pip:
      - sphinx-pythia-theme
      - stac_geoparquet
      - dask_geopandas
      - jupyter_bokeh
      - pygeos
      - intake>=2.0.4

--------------------------------------------------------------------------------
/notebooks/0.0_Intro_Landsat.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e9a66402-d146-40a2-a013-ef1078026efa",
   "metadata": {},
   "source": [
    "# Start Here - Intro to Landsat Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "477b00c6-6565-4a16-b302-46dac2fff9de",
   "metadata": {},
   "source": [
    "![Landsat8](./images/nasa_landsat8.jpg \"Landsat8\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a8416c4-c56b-4b53-bc3f-8eb6285dfcac",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6587d1b-8e22-4404-abf0-0da0bfde096e",
   "metadata": {},
   "source": [
    "## Overview\n",
    "\n",
    "In this cookbook, you will access, process, analyze, and visualize satellite data in the context of machine learning workflows. This first notebook provides an introduction to Landsat data to build our intuition as we move toward data ingestion, processing, and analysis."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8cb6848-e976-4390-9195-64ffb9049d81",
   "metadata": {},
   "source": [
    "- **Time to learn**: 5 minutes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ecd112d-3666-4412-acce-05a6dd770de9",
   "metadata": {},
   "source": [
    "## Landsat Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b8e2b5e-9a96-4161-96b6-f0bbe6b17f47",
   "metadata": {},
   "source": [
    "The data in this cookbook originally come from the [Landsat](https://en.wikipedia.org/wiki/Landsat_program) program, which is the longest record of moderate resolution multispectral data of the Earth’s surface. This program has launched several different satellites spanning many years, which are designated as Landsat 1-9.\n",
    "\n",
    "![USGS Landsat Timeline](images/landsat_timeline.png \"USGS Landsat Timeline\")\n",
    "\n",
    "\n",
    "When accessing the data, it's important to keep in mind a couple of key points.\n",
    "First, the instruments on different Landsat missions (1-9) varied in certain aspects. Second, Landsat data is available from multiple providers (USGS, NASA, Google, Microsoft, AWS, etc.) but may vary in completeness and the level of processing applied. For the dataset that you end up using, it is crucial to review the relevant information from the particular data provider and the specific Landsat mission to understand the details, especially if you are comparing data across providers or missions.\n",
    "\n",
    "In general, a common aspect of Landsat data is the use of different wavelength-bands to capture multiple images of the same area - together providing much more information about different features on the ground than a single image alone. This provides us with a stack of images for each spatial region that we might be interested in.\n",
    "\n",
    "![Spectral Bands](images/nasa_bands.png \"Spectral Bands\")\n",
    "\n",
    "Additionally, whenever we are looking at changes in satellite images over time, we will have an additional time dimension. For example, we will consider two stacks of images from different years to look at the change in the water level around a lake.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7b29113-9cf8-40d8-b03b-0b7f0a6a6604",
   "metadata": {},
   "source": [
    "___"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb228292-bbda-437e-a6e8-2133cbb8e18b",
   "metadata": {},
   "source": [
    "## Summary\n",
    "Before accessing any data, it's a good idea to start by learning about the context and details of the dataset. This will give you the intuition to make informed decisions as you form a processing and analysis pipeline.\n",
    "\n",
    "### What's next?\n",
    "Next, we'll learn about loading the data using the Microsoft Planetary Computer platform.\n",
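    "\n",
    "If you'd like a quick, concrete look at the spectral bands before moving on, this repository bundles small CSV tables of band metadata under `notebooks/data/`. Here is a minimal sketch that simply previews the Landsat 8 table with pandas - the exact column names are whatever the bundled file provides:\n",
    "\n",
    "```python\n",
    "import pandas as pd\n",
    "\n",
    "# Preview the bundled Landsat 8 band metadata table (shipped with this cookbook)\n",
    "bands = pd.read_csv('data/landsat8_bands.csv')\n",
    "print(bands.head())\n",
    "```"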
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e54a0ab-93e4-4299-a036-0fa7c3e23ec5",
   "metadata": {},
   "source": [
    "## Resources and references\n",
    "- The Landsat timeline image is originally from [USGS](https://www.usgs.gov/landsat-missions/landsat-satellite-missions?qt-science_support_page_related_con=2) but discovered through [earthsciencedata.org](https://www.earthdatascience.org/courses/use-data-open-source-python/multispectral-remote-sensing/landsat-in-Python/)\n",
    "- The Landsat 8 banner image is from NASA\n",
    "- The Landsat spectral bands image is from [NASA](https://landsat.gsfc.nasa.gov/satellites/landsat-next/)\n",
    "- This page was authored by Demetris Roumis circa Jan 2023"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "vscode": {
   "interpreter": {
    "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

--------------------------------------------------------------------------------
/notebooks/1.0_Data_Ingestion-Geospatial.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "65c844ec-98e2-40e0-9312-9d6bcd30e4a4",
   "metadata": {},
   "source": [
    "# Data Ingestion - Geospatial-Specific Tooling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53b39a59-2225-4406-9136-65b0a4956a6c",
   "metadata": {},
   "source": [
    "![PySTAC](images/pystac.png \"PySTAC\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27bd740d-8c47-4843-9899-40a282eb5a18",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "091f838a-f459-4c41-957d-5f04083f95da",
   "metadata": {},
   "source": [
    "## Overview"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "488398aa-5d16-4a7a-b074-c2de5f6fac24",
   "metadata": {},
   "source": [
    "In this notebook, you will ingest Landsat data for use in machine learning. Machine learning tasks often involve a lot of data, and in Python, data is typically stored in memory as simple [NumPy](https://foundations.projectpythia.org/core/numpy.html) arrays. However, higher-level containers built on top of NumPy arrays provide more functionality for multidimensional gridded data ([xarray](http://xarray.pydata.org)) or out-of-core and distributed data ([Dask](http://dask.pydata.org)). Our goal for data ingestion will be to load specific Landsat data of interest into one of these higher-level containers.\n",
    "\n",
    "[Microsoft Planetary Computer](https://planetarycomputer.microsoft.com/docs/overview/about) is one of several providers of [Landsat Data](https://planetarycomputer.microsoft.com/dataset/group/landsat).\n",
    "We are using it together with [pystac-client](https://pystac-client.readthedocs.io/en/stable/index.html) and [odc-stac](https://odc-stac.readthedocs.io/en/latest/index.html) because together they provide a nice Python API for searching and loading with specific criteria such as spatial area, datetime, Landsat mission, and cloud coverage.\n",
    "\n",
    "Earth science datasets are often stored on remote servers that may be too large to download locally. Therefore, in this cookbook, we will focus primarily on ingestion approaches that load small portions of data from a remote source, as needed. However, the approach for your own work will depend not only on data size and location but also the intended analysis, so in a follow-up notebook, you will see an alternative approach for generalized data access and management."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "836bb190-f0a1-4cb4-b599-386ed72a63ff",
   "metadata": {},
   "source": [
    "## Prerequisites\n",
    "\n",
    "| Concepts | Importance | Notes |\n",
    "| --- | --- | --- |\n",
    "| [Intro to Landsat](./0.0_Intro_Landsat.ipynb) | Necessary | Background |\n",
    "| [About the Microsoft Planetary Computer](https://planetarycomputer.microsoft.com/docs/overview/about) | Helpful | Background |\n",
    "| [pystac-client Usage](https://pystac-client.readthedocs.io/en/stable/usage.html) | Helpful | Consult as needed |\n",
    "| [odc.stac.load Reference](https://odc-stac.readthedocs.io/en/latest/_api/odc.stac.load.html) | Helpful | Consult as needed |\n",
    "| [xarray](https://foundations.projectpythia.org/core/xarray.html) | Necessary | |\n",
    "| [Intro to Dask Array](https://docs.dask.org/en/stable/array.html) | Helpful | |\n",
    "| [Panel Getting Started Guide](https://panel.holoviz.org/getting_started/build_app.html) | Helpful | |\n",
    "\n",
    "- **Time to learn**: 10 minutes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70f621c3-8cdc-401b-a21c-79f31abd7bbf",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28f179f7-1dbc-4127-b284-8ebabe3eff72",
   "metadata": {},
   "outputs": [],
   "source": [
    "import odc.stac\n",
    "import pandas as pd\n",
    "import planetary_computer\n",
    "import pystac_client\n",
    "import xarray as xr\n",
    "from pystac.extensions.eo import EOExtension as eo\n",
    "\n",
    "# Viz\n",
    "import hvplot.xarray\n",
    "import panel as pn\n",
    "\n",
    "pn.extension()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec00475e-01fa-48f4-9323-0deaff92086b",
   "metadata": {},
   "source": [
    "## Open and read the root of the STAC catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4007807-b3b1-40fe-8aae-a01a0d01b03b",
   "metadata": {},
   "outputs": [],
   "source": [
    "catalog = pystac_client.Client.open(\n",
    "    \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n",
    "    modifier=planetary_computer.sign_inplace,\n",
    ")\n",
    "catalog.title"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ec64999-76f4-41db-b89b-b8c55c1ba788",
   "metadata": {},
   "source": [
    "Microsoft Planetary Computer's STAC metadata is public, but the actual data assets are in private Azure Blob Storage containers and require authentication.\n",
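    "\n",
    "If you ever end up holding an unsigned item (for example, one fetched without a modifier), a signed copy can be produced explicitly. This is a minimal sketch, assuming the `planetary_computer.sign` helper described in the Planetary Computer documentation:\n",
    "\n",
    "```python\n",
    "import planetary_computer\n",
    "\n",
    "# Returns a copy of the item whose asset HREFs carry short-lived access tokens\n",
    "signed_item = planetary_computer.sign(item)\n",
    "```\n",
    "\n",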
    "In practice, we don't need to do this by hand: `pystac-client` provides a `modifier` keyword, which we passed to `Client.open` above, that signs each item for us automatically. Otherwise, we'd get an error when trying to access the assets."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21cb2e57-3fd5-4fae-8fa3-beefc415478e",
   "metadata": {},
   "source": [
    "# Search for Landsat Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8444b837-8826-45f7-841e-4b053a4bea86",
   "metadata": {},
   "source": [
    "Let's say that an analysis we want to run requires Landsat data over a specific region and from a specific time period. We can use our catalog to search for assets that fit our search criteria."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42052634-9c2f-4073-9080-a346703b2081",
   "metadata": {},
   "source": [
    "First, let's find the name of the Landsat dataset. [This page](https://planetarycomputer.microsoft.com/catalog) is a nice resource for browsing the available collections, but we can also just search the catalog for 'landsat':"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba583769-59d7-49bd-b506-79c744430a42",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_collections = [i.id for i in catalog.get_collections()]\n",
    "landsat_collections = [\n",
    "    collection for collection in all_collections if \"landsat\" in collection\n",
    "]\n",
    "landsat_collections"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a15872e0-c837-48a0-833b-9815ef4262f6",
   "metadata": {},
   "source": [
    "We'll use the `landsat-c2-l2` dataset, which stands for Collection 2 Level-2. It contains data from several Landsat missions and has better data quality than Level-1 (`landsat-c2-l1`). Microsoft Planetary Computer has descriptions of [Level-1](https://planetarycomputer.microsoft.com/dataset/landsat-c2-l1) and [Level-2](https://planetarycomputer.microsoft.com/dataset/landsat-c2-l2), but a direct and succinct comparison can be found in [this community post](https://gis.stackexchange.com/questions/439767/landsat-collections), and the information can be verified with [USGS](https://www.usgs.gov/landsat-missions/landsat-collection-2)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a05e403e-92ae-46b4-b925-28c4f9628aa2",
   "metadata": {},
   "source": [
    "Now, let's set our search parameters. You may already know the bounding box (region/area of interest) coordinates, but if you don't, there are many useful tools like [bboxfinder.com](http://bboxfinder.com/) that can help.\n",
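    "\n",
    "Alternatively, if all you have is a point of interest, a bounding box can be built by buffering it. A tiny sketch (the center point and buffer size here are just illustrative):\n",
    "\n",
    "```python\n",
    "# Build a [min_lon, min_lat, max_lon, max_lat] box around a center point\n",
    "lon, lat = -118.73, 38.69  # hypothetical center point\n",
    "half_width = 0.16          # degrees\n",
    "bbox = [lon - half_width, lat - half_width, lon + half_width, lat + half_width]\n",
    "```"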
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77b40b78-c7f5-493d-b6a2-e5c765d2aaae",
   "metadata": {},
   "outputs": [],
   "source": [
    "bbox = [-118.89, 38.54, -118.57, 38.84]  # Region over a lake in Nevada, USA\n",
    "datetime = \"2017-06-01/2017-09-30\"  # Summer months of 2017\n",
    "collection = \"landsat-c2-l2\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f3f7c56-4cfb-4764-b7ee-c01f58667d13",
   "metadata": {},
   "source": [
    "We can also specify other parameters in the query, such as a specific Landsat mission and the maximum percentage of cloud cover:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce29099d-1a57-447b-83dc-957f3b9d0096",
   "metadata": {},
   "outputs": [],
   "source": [
    "platform = \"landsat-8\"\n",
    "cloudy_less_than = 1  # percent"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e44b0124-aaeb-461a-a48c-9e576da3bc54",
   "metadata": {},
   "source": [
    "Now we run the search and list the results:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d40e83c-d293-43d6-8f54-75ee67f66b99",
   "metadata": {},
   "outputs": [],
   "source": [
    "search = catalog.search(\n",
    "    collections=[\"landsat-c2-l2\"],\n",
    "    bbox=bbox,\n",
    "    datetime=datetime,\n",
    "    query={\"eo:cloud_cover\": {\"lt\": cloudy_less_than}, \"platform\": {\"in\": [platform]}},\n",
    ")\n",
    "items = search.item_collection()\n",
    "print(f\"Returned {len(items)} Items:\")\n",
    "item_id = {(i, item.id): i for i, item in enumerate(items)}\n",
    "item_id"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c246504f-f36c-4e9c-a740-8041a4d13612",
   "metadata": {},
   "source": [
    "It looks like there were three image stacks taken by Landsat 8 over this spatial region during the summer months of 2017 that have less than 1 percent cloud cover."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ec703af-b574-42eb-b97c-756fd6f8f909",
   "metadata": {},
   "source": [
    "## Preview Results and Select a Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "478b863e-e32c-499c-8a39-79c66ca1fb33",
   "metadata": {},
   "source": [
    "Before loading one of the available image stacks, it would be useful to get a visual check of the results. Many datasets have a rendered preview or thumbnail image that can be accessed without having to load the full-resolution data.\n",
    "\n",
    "We can create a simple interactive application using the [Panel](https://panel.holoviz.org/index.html) library to access and display rendered PNG previews of our search results. Note that these pre-rendered images are of large tiles that span beyond our bounding box of interest. In the next steps, we will only be loading in a small area around the lake.\n",
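    "\n",
    "If you prefer a quick text-only summary before building the app, a small sketch like the following (relying only on standard `pystac` Item attributes) prints each result's acquisition time and cloud cover:\n",
    "\n",
    "```python\n",
    "# One line per search result: item ID, acquisition datetime, and cloud cover percentage\n",
    "for item in items:\n",
    "    print(item.id, item.datetime, item.properties.get(\"eo:cloud_cover\"))\n",
    "```"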
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f8e05bf-1c36-446b-8bc2-58bbecf6b68a",
   "metadata": {},
   "outputs": [],
   "source": [
    "item_sel = pn.widgets.Select(value=1, options=item_id, name=\"item\")\n",
    "\n",
    "\n",
    "def get_preview(i):\n",
    "    return pn.panel(items[i].assets[\"rendered_preview\"].href, height=300)\n",
    "\n",
    "\n",
    "pn.Row(item_sel, pn.bind(get_preview, item_sel))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18f2d557-f274-4076-b328-39a06dc066c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_item = items[1]\n",
    "selected_item"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "311bb64c-230c-4a09-acd0-1c6d48dc927e",
   "metadata": {},
   "source": [
    "## Access the Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e253896-b4cb-4111-8094-8572e2c6d2ff",
   "metadata": {},
   "source": [
    "Now that we have selected a dataset from our catalog, we can proceed to access the data. We want to be very selective about the data that we read and when we read it because the amount of downloaded data can quickly get out of hand. Therefore, let's select only a subset of images.\n",
    "\n",
    "First, we'll preview the different image assets (or [Bands](https://github.com/stac-extensions/eo)) available in the Landsat item."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d49a164f-3d5d-4a7d-a5cb-029d6bede1db",
   "metadata": {},
   "outputs": [],
   "source": [
    "assets = []\n",
    "for _, asset in selected_item.assets.items():\n",
    "    try:\n",
    "        assets.append(asset.extra_fields[\"eo:bands\"][0])\n",
    "    except:\n",
    "        pass\n",
    "\n",
    "cols_ordered = [\n",
    "    \"common_name\",\n",
    "    \"description\",\n",
    "    \"name\",\n",
    "    \"center_wavelength\",\n",
    "    \"full_width_half_max\",\n",
    "]\n",
    "bands = pd.DataFrame.from_dict(assets)[cols_ordered]\n",
    "bands"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8cbc4e1-7754-47b2-8477-e9ed3fef00f7",
   "metadata": {},
   "source": [
    "Then we will select a few bands (images) of interest:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1831d5f4-f068-476e-80fe-de32a851d12a",
   "metadata": {},
   "outputs": [],
   "source": [
    "bands_of_interest = [\"red\", \"green\", \"blue\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "764ecd52-b893-4aed-b021-02d25b257c12",
   "metadata": {},
   "source": [
    "Finally, we lazily load the selected data. We will use the `odc-stac` package, which allows us to load only a specific region of interest (bounding box, or 'bbox') and specific bands (images) of interest. We will also use the `chunks` argument to load the data as Dask arrays; this will load the metadata now and delay the loading until we actually use the data, or until we force the data to be loaded by using `.compute()`.\n",
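    "\n",
    "Passing `chunks={}` (as in the next cell) asks odc-stac to use Dask with its default chunking. If you expect much larger scenes, you can also request explicit chunk sizes - a sketch, where the sizes and the 'x'/'y' dimension names are illustrative assumptions rather than requirements:\n",
    "\n",
    "```python\n",
    "# Same lazy load, but with explicit Dask chunk sizes per spatial dimension\n",
    "ds = odc.stac.stac_load(\n",
    "    [selected_item],\n",
    "    bands=bands_of_interest,\n",
    "    bbox=bbox,\n",
    "    chunks={\"x\": 1024, \"y\": 1024},\n",
    ")\n",
    "```"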
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eca1a4a4-c0bc-4c81-960f-fb3550048a33",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = odc.stac.stac_load(\n",
    "    [selected_item],\n",
    "    bands=bands_of_interest,\n",
    "    bbox=bbox,\n",
    "    chunks={},  # <-- use Dask\n",
    ").isel(time=0)\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e410e59-8f71-4b0c-b2db-177b7ce00278",
   "metadata": {},
   "source": [
    "Let's combine the bands of the dataset into a single DataArray that has the band names as coordinates of a new 'band' dimension, and also call `.compute()` to finally load the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e63d16a2-a36d-4967-808c-ffcb1479088b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "da = ds.to_array(dim=\"band\").compute()\n",
    "da"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ac0f3c1-3ff8-46c5-aa51-b06c73d96c66",
   "metadata": {},
   "source": [
    "## Visualize the data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce573b8c-815d-439b-a49f-fa258d0e07a7",
   "metadata": {},
   "source": [
    "Often, data ingestion involves quickly visualizing your raw data to get a sense that things are proceeding accordingly. As we have created an array with red, blue, and green bands, we can quickly display a natural color image of the lake using the `.plot.imshow()` function of `xarray`. We'll use the `robust=True` argument because the data values are outside the range of typical RGB images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30c9eec0-6975-4b1e-9a7c-318b98c332b2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "da.plot.imshow(robust=True, size=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c188d2c2-9eb9-4030-8fd0-54f88bb252c3",
   "metadata": {},
   "source": [
    "Now, let's use `hvplot` to provide an interactive visualization of the individual bands in our array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31d88fa3-dbb9-4c14-b086-51ee20b52602",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0cc8aa2-2878-4d2e-8bf3-6d9f1fcf9a58",
   "metadata": {},
   "outputs": [],
   "source": [
    "da.hvplot.image(x=\"x\", y=\"y\", cmap=\"viridis\", aspect=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9724bab-42c1-4275-a6dc-562c16b907f7",
   "metadata": {},
   "source": [
    "Let's plot the bands as separate columns by specifying a dimension to expand with `col='band'`. We can also set `rasterize=True` to use [Datashader](https://datashader.org/) (another HoloViz tool) to render large data into a 2D histogram, where every array cell counts the data points falling into that pixel, as set by the resolution of your screen. This is especially important for large and high-resolution images that would otherwise cause issues when attempting to render in a browser."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ef8c6d8-3717-44ca-8096-216e00116d45",
   "metadata": {},
   "outputs": [],
   "source": [
    "da.hvplot.image(\n",
    "    x=\"x\", y=\"y\", col=\"band\", cmap=\"viridis\", xaxis=False, yaxis=False, colorbar=False, rasterize=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "651f855c-0b9b-49d8-a851-5717a08532e7",
   "metadata": {},
   "source": [
    "Select the zoom tool and zoom in on one of the plots to see that all the images are automatically linked!"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbb72422-b9d6-4eee-a070-7f5ae5bca468",
   "metadata": {},
   "source": [
    "## Retain Attributes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32708bb1-adf1-423c-b76b-d2c2e0ced5cd",
   "metadata": {},
   "source": [
    "When working with many image arrays, it's critical to retain the data properties as xarray attributes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c746f5e-c68b-40ac-9672-8c184c6b2d30",
   "metadata": {},
   "outputs": [],
   "source": [
    "da.attrs = selected_item.properties\n",
    "da"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "146c063b-563a-46fb-b964-676adb4a09c2",
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "source": [
    "Notice that you can now expand the `Attributes:` dropdown to see the properties of this data."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63747da5-e126-4104-a27c-0501da98121b",
   "metadata": {},
   "source": [
    "## Set the `crs` attribute"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f315ec7-c868-4946-afe8-0a9fb5632aea",
   "metadata": {},
   "source": [
    "As the data is in 'meter' units from a reference point, we can plot in commonly used longitude, latitude coordinates with `.hvplot(geo=True)` if our array has a valid coordinate reference system (CRS) attribute. This value is provided by Microsoft Planetary Computer as the `proj:epsg` property, so we just need to copy it to a new attribute `crs` so that hvPlot can automatically find it, without us having to further specify anything in our plotting code.\n",
    "\n",
    "Note, this CRS is referenced by an EPSG code that can be accessed from the metadata of our selected catalog search result. We can see more about this dataset's specific code at [EPSG.io/32611](https://epsg.io/32611). You can also read more about EPSG codes in general in this [Coordinate Reference Systems: EPSG codes](https://pygis.io/docs/d_understand_crs_codes.html#epsg-codes) online book chapter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfb957fb-94e7-458f-a4dd-cff04d6331ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "da.attrs[\"crs\"] = f\"epsg:{selected_item.properties['proj:epsg']}\"\n",
    "da.attrs[\"crs\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36637826-b5de-41bf-86fc-5d8d659a8217",
   "metadata": {},
   "source": [
    "Now we can use `.hvplot(geo=True)` to plot in longitude and latitude coordinates. Informing `hvPlot` that this is geographic data also allows us to overlay data on aligned geographic tiles using the `tiles` parameter.\n",
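    "\n",
    "If you need the coordinates themselves converted (not just the plot), reprojection is possible with `rioxarray`, which is included in this cookbook's environment. A minimal sketch, assuming the `crs` attribute set above and the standard `.rio` accessor API:\n",
    "\n",
    "```python\n",
    "import rioxarray  # noqa: registers the .rio accessor on xarray objects\n",
    "\n",
    "# Tag the array with its CRS, then reproject to plain lon/lat (WGS84)\n",
    "da_lonlat = da.rio.write_crs(da.attrs[\"crs\"]).rio.reproject(\"EPSG:4326\")\n",
    "```"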
556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "id": "87dbdd45-6a65-48c1-b0b0-559f31cb873f", 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "da.hvplot.image(\n", 566 | " x=\"x\", y=\"y\", cmap=\"viridis\", geo=True, alpha=.9, tiles=\"ESRI\", xlabel=\"Longitude\", ylabel=\"Latitude\", colorbar=False, aspect=1,\n", 567 | ")" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "id": "54214e63-420e-436c-aa95-c806c74c4c02", 573 | "metadata": {}, 574 | "source": [ 575 | "___" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "id": "356f1670-ffea-4e9c-a7df-632864aecff0", 581 | "metadata": {}, 582 | "source": [ 583 | "## Summary\n", 584 | "The data access approach should adapt to features of the data and your intended analysis. As Landsat data is large and multidimensional, a good approach is to use [Microsoft Planetary Computer](https://planetarycomputer.microsoft.com/docs/overview/about), [pystac-client](https://pystac-client.readthedocs.io/en/stable/index.html), and [odc-stac](https://odc-stac.readthedocs.io/en/latest/index.html) together for searching the metadata catalog and lazily loading specific data chunks. Once you have accessed data, visualize it with hvPlot to ensure that it matches your expectations.\n", 585 | "\n", 586 | "### What's next?\n", 587 | "Before we proceed to workflow examples, we can explore an alternate way of accessing data using generalized tooling." 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "id": "d200eed4-b3be-4a09-ae4a-671fa6f22e23", 593 | "metadata": {}, 594 | "source": [ 595 | "## Resources and References\n", 596 | "- Authored by Demetris Roumis circa Jan, 2023\n", 597 | "- Guidance for parts of this notebook was provided by Microsoft in ['Reading Data from the STAC API'](https://planetarycomputer.microsoft.com/docs/quickstarts/reading-stac/)\n", 598 | "- The image used in the banner is from an announcement about PySTAC from Azavea" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "id": "22affb0b-715d-4520-9d5b-2631bf1105c3", 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [] 608 | } 609 | ], 610 | "metadata": { 611 | "kernelspec": { 612 | "display_name": "Python 3 (ipykernel)", 613 | "language": "python", 614 | "name": "python3" 615 | }, 616 | "language_info": { 617 | "codemirror_mode": { 618 | "name": "ipython", 619 | "version": 3 620 | }, 621 | "file_extension": ".py", 622 | "mimetype": "text/x-python", 623 | "name": "python", 624 | "nbconvert_exporter": "python", 625 | "pygments_lexer": "ipython3", 626 | "version": "3.10.13" 627 | }, 628 | "vscode": { 629 | "interpreter": { 630 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 631 | } 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 5 636 | } 637 | -------------------------------------------------------------------------------- /notebooks/1.1_Data_Ingestion-General.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion - General Purpose Tooling" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | " ![intake landsat](images/intake_landsat.png \"intake landsat\")" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "---" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27
| "source": [ 28 | "## Overview\n", 29 | "\n", 30 | "If the specialized geospatial tools discussed in the previous notebook suit your needs, feel free to proceed to explore a workflow example, such as [Spectral Clustering](2.0_Spectral_Clustering_PC.ipynb). However, if you're seeking a tool that is adaptable across a wider range of data types and sources, welcome to this introduction to [Intake V2](https://intake.readthedocs.io), a general-purpose data ingestion and management library.\n", 31 | "\n", 32 | "Intake is a high-level library designed for data ingestion and management. While the [geospatial-specific tooling](1.0_Data_Ingestion-Geospatial.ipynb) approach is optimized for satellite data, Intake offers a broader and potentially more flexible approach for multimodal data workflows, characterized by:\n", 33 | "\n", 34 | "- **Unified Interface**: Abstracts the details of data sources, enabling users to interact with a consistent API regardless of the data's underlying format.\n", 35 | "- **Dynamic and Shareable Catalogs**: Facilitates the creation and sharing of data catalogs that can be version-controlled, updated, and maintained.\n", 36 | "- **Extensible**: Facilitates the addition of new data sources and formats through its plugin system.\n", 37 | "\n", 38 | "In the following sections, we will guide you through an introduction to various Intake functionalities that simplify data access and enhance both modularity and reproducibility in geospatial workflows.\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Prerequisites\n", 46 | "\n", 47 | "| Concepts | Importance | Notes |\n", 48 | "| --- | --- | --- |\n", 49 | "| [Intro to Landsat](./0.0_Intro_Landsat.ipynb) | Necessary | Background |\n", 50 | "| [Data Ingestion - Geospatial-Specific Tooling](1.0_Data_Ingestion-Geospatial.ipynb) | Helpful | |\n", 51 | "| [Pandas Cookbook](https://foundations.projectpythia.org/core/pandas.html) | Helpful | |\n", 52 | "| [xarray Cookbook](https://foundations.projectpythia.org/core/xarray.html) | Necessary | |\n", 53 | "| [Intake Quickstart](https://intake.readthedocs.io/en/latest/index.html) | Helpful | |\n", 54 | "|[Intake Cookbook](https://projectpythia.org/intake-cookbook/README.html)| Necessary | |\n", 55 | "\n", 56 | "- **Time to learn**: 20 minutes" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "---" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Imports" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "import intake\n", 82 | "import planetary_computer\n", 83 | "from pprint import pprint\n", 84 | "\n", 85 | "# Viz\n", 86 | "import hvplot.xarray\n", 87 | "import panel as pn\n", 88 | "\n", 89 | "pn.extension()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Connecting to Data Sources" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "To get started, we need to provide a STAC URL (or any other data source URL) to intake, and we can ask intake to recommend some suitable datatypes." 
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "url = \"https://planetarycomputer.microsoft.com/api/stac/v1\"\n", 115 | "data_types = intake.readers.datatypes.recommend(url)\n", 116 | "pprint(data_types)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Selecting the Appropriate Data Type\n", 124 | "After identifying the possible data types, we choose the one that best suits our needs. For handling STAC formatted JSON data from our URL, we will proceed with `STACJSON`." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "data_type = intake.datatypes.STACJSON(url)\n", 134 | "data_type" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "This object now represents the specific data type we will work with, allowing us to streamline subsequent data operations." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Initializing Data Readers\n", 149 | "\n", 150 | "With the `STACJSON` data type specified, we explore available methods to read the data." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "readers = data_type.possible_readers\n", 160 | "pprint(readers)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "This output presents us with options that can interpret the `STACJSON` data format effectively. The `StacCatalogReader` is probably the most suitable for our use case. We can use it to read the STAC catalog and explore the available contents." 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Reading the Catalog\n", 175 | "Next, we can access the data catalog through our reader." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "reader = intake.catalogs.StacCatalogReader(\n", 185 | " data_type, signer=planetary_computer.sign_inplace\n", 186 | ")\n", 187 | "reader" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "This reader is now configured to handle interactions with the data catalog." 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## List Catalog Contents\n", 202 | "Once the catalog is accessible, we `read()` it and then collect each dataset's `description` to identify datasets of interest. For our purposes, we will just print the entries that include the word `'landsat'`." 
203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "tags": [] 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "stac_cat = reader.read()\n", 214 | "\n", 215 | "description = {}\n", 216 | "for data_description in stac_cat.data.values():\n", 217 | " data = data_description.kwargs[\"data\"]\n", 218 | " description[data[\"id\"]] = data[\"description\"]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "# Print only keys that include the word 'landsat'\n", 230 | "pprint([key for key in description.keys() if 'landsat' in key.lower()])" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Detailed Dataset Examination" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "By examining specific datasets more closely, we understand their content and relevance to our project goals. We can now print the description of the desired landsat IDs." 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "scrolled": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "print(\"1:\", description[\"landsat-c2-l1\"])\n", 256 | "print('-------------------------------\\n')\n", 257 | "print(\"2:\", description[\"landsat-c2-l2\"])" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Selecting and Accessing Data\n", 265 | "\n", 266 | "We want `\"landsat-c2-l2\"`, so with a chosen dataset, we can now access it directly and view the `metadata` specific to this dataset - key details that are important for analysis and interpretation. Since the output is long, we'll utilize the HoloViz Panel library to wrap the output in a scrollable element." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "tags": [] 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "landsat_reader = stac_cat[\"landsat-c2-l2\"]\n", 278 | "landsat_metadata = landsat_reader.read().metadata\n", 279 | "\n", 280 | "# View extensive metadata in scrollable block\n", 281 | "json_pane = pn.pane.JSON(landsat_metadata, name='Metadata', max_height=400, sizing_mode='stretch_width', depth=-1, theme='light')\n", 282 | "scrollable_output = pn.Column(json_pane, height=400, sizing_mode='stretch_width', scroll=True, styles={'background': 'lightgrey'})\n", 283 | "scrollable_output" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Visual Preview\n", 291 | "\n", 292 | "To get a visual preview of the dataset, particularly to check its quality and relevance, we use the following commands:" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": { 299 | "scrolled": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "landsat_reader[\"thumbnail\"].read()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Accessing Geospatial Data Items\n", 311 | "\n", 312 | "Once we have selected the appropriate dataset, the next step is to access the specific data items. 
These items typically represent individual data files or collections that are part of the dataset.\n", 313 | "\n", 314 | "The following code retrieves a handle to the 'geoparquet-items' from the Landsat dataset, which are optimized for efficient geospatial operations and queries." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "landsat_items = landsat_reader[\"geoparquet-items\"]\n", 324 | "landsat_items" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Converting Data for Analysis\n", 332 | "\n", 333 | "To facilitate analysis, the following code selects the last few entries (`tail`) of the dataset, converts them into a GeoDataFrame, and reads it back into a STAC catalog format. This format is particularly suited for geospatial data and necessary for compatibility with geospatial analysis tools and libraries like Geopandas." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "tags": [] 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "cat = landsat_items.tail(output_instance=\"geopandas:GeoDataFrame\").GeoDataFrameToSTACCatalog.read()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## Exploring Data Collections\n", 352 | "\n", 353 | "After conversion, we explore the structure of the data collection. Each \"item\" in this collection corresponds to a set of assets, providing a structured way to access multiple related data files. We'll simply print the structure of the catalog to understand the available items and their organization.\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "cat" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Accessing Sub-Collections\n", 370 | "\n", 371 | "To dive deeper into the data, we access a specific sub-collection based on its key. This allows us to focus on a particular geographic area or time period. We'll select the first item in the catalog for now." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "item_key = list(cat.entries.keys())[0]\n", 381 | "subcat = cat[item_key].read()\n", 382 | "subcat" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "## Reading Specific Data Bands\n", 390 | "\n", 391 | "For detailed analysis, especially in remote sensing, accessing specific spectral bands is crucial. Here, we read the red spectral band, which is often used in vegetation analysis and other remote sensing applications." 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "subcat.red.read()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "## Preparing for Multiband Analysis\n", 408 | "To analyze true color imagery, we need to stack multiple spectral bands. Here, we prepare for this by setting up a band-stacking operation. Note, re-signing might be necessary at this point." 
409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "catbands = cat[item_key].to_reader(reader=\"StackBands\", bands=[\"red\", \"green\", \"blue\"], signer=planetary_computer.sign_inplace)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## Loading and Visualizing True Color Imagery\n", 425 | "\n", 426 | "After setting up the band-stacking, we read the multiband data and prepare it for visualization." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "data = catbands.read(dim=\"band\")\n", 436 | "data" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## Visualizing Data\n", 444 | "Finally, we visualize the true color imagery. This visualization helps in assessing the quality of the data and the appropriateness of the bands used." 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "data.plot.imshow(robust=True, figsize=(10, 10))" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "## Summary\n", 461 | "As earth science data becomes integrated with other types of data, a powerful approach is to utilize a general-purpose set of tools, including Intake and Xarray. Once you have accessed data, visualize it with hvPlot to ensure that it matches your expectations.\n", 462 | "\n", 463 | "\n", 464 | "\n", 465 | "### What's next?\n", 466 | "Now that we know how to access the data, it’s time to proceed to analysis, where we will explore some simple machine learning approaches.\n", 467 | "\n", 468 | "\n", 469 | "## Resources and references\n", 470 | "Authored by Demetris Roumis and Andrew Huang circa April, 2024, with guidance from [Martin Durant](https://github.com/martindurant).\n", 471 | "\n", 472 | "The banner image is a mashup of a Landsat 8 image from NASA and the Intake logo.\n" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3 (ipykernel)", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.10.13" 493 | }, 494 | "nbdime-conflicts": { 495 | "local_diff": [ 496 | { 497 | "diff": [ 498 | { 499 | "diff": [ 500 | { 501 | "key": 0, 502 | "op": "addrange", 503 | "valuelist": [ 504 | "Python 3" 505 | ] 506 | }, 507 | { 508 | "key": 0, 509 | "length": 1, 510 | "op": "removerange" 511 | } 512 | ], 513 | "key": "display_name", 514 | "op": "patch" 515 | } 516 | ], 517 | "key": "kernelspec", 518 | "op": "patch" 519 | } 520 | ], 521 | "remote_diff": [ 522 | { 523 | "diff": [ 524 | { 525 | "diff": [ 526 | { 527 | "key": 0, 528 | "op": "addrange", 529 | "valuelist": [ 530 | "Python3" 531 | ] 532 | }, 533 | { 534 | "key": 0, 535 | "length": 1, 536 | "op": "removerange" 537 | } 538 | ], 539 | "key": "display_name", 540 | "op": "patch" 541 | } 542 | ], 543 | "key": "kernelspec", 544 | "op": "patch" 545 | } 546 | ] 547 | }, 548 | "toc-autonumbering": false, 549 | "vscode": { 550 | "interpreter": { 551 | "hash": 
"d2ed0a8e3e051554a0b51e3917f81e884b169a97835ad70210b3681eb3cb39c7" 552 | } 553 | } 554 | }, 555 | "nbformat": 4, 556 | "nbformat_minor": 4 557 | } 558 | -------------------------------------------------------------------------------- /notebooks/2.0_Spectral_Clustering_PC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4257bf18-ba7c-42a6-81e3-2b5e48b3bc8b", 6 | "metadata": {}, 7 | "source": [ 8 | "# Spectral Clustering" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "95d6fc25-545a-4bcc-97ff-a17df7f6082e", 14 | "metadata": {}, 15 | "source": [ 16 | "![Spectral Clustering](images/spectral_clustering.png \"Spectral Clustering\")" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "796926b4-a134-4daf-ae19-46978acf5e89", 22 | "metadata": {}, 23 | "source": [ 24 | "---" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "9e7946b7-d7c6-4d60-bfb2-474188cfeb54", 30 | "metadata": {}, 31 | "source": [ 32 | "## Overview" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "09b5645f-ea45-4f49-9811-18bff4192034", 38 | "metadata": {}, 39 | "source": [ 40 | "The current notebook will demonstrate a simplified machine learning approach to observe the change in a lake water's extent across time. In order to identify the water, we can use spectral clustering to classify each grid cell into a category based on the similarity of the combined set of pixels across [wavelength-bands](./0.0_Intro_Landsat) in our image stacks.\n", 41 | "\n", 42 | "Our example approach uses a version of spectral clustering from [dask_ml](http://ml.dask.org/clustering.html#spectral-clustering) that is a scalable equivalent of what is available in [scikit-learn](https://scikit-learn.org/stable/modules/clustering.html#spectral-clustering). We will begin this approach with a single image stack and then conduct a direct comparison on the results from different time points.\n", 43 | "\n", 44 | "This workflow uses data from Microsoft Planetary Computer but it can be adapted to work with any data ingestion approach from this cookbook." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "f2d4e405-956d-4400-86f5-219664ef7b79", 50 | "metadata": {}, 51 | "source": [ 52 | "## Prerequisites\n", 53 | "\n", 54 | "| Concepts | Importance | Notes |\n", 55 | "| --- | --- | --- |\n", 56 | "| [Data Ingestion - Geospatial-Specific Tooling](1.0_Data_Ingestion-Geospatial.ipynb) | Necessary | |\n", 57 | "|[scikit-learn](https://scikit-learn.org/stable/modules/clustering.html#spectral-clustering) | Helpful | Spectral clustering |\n", 58 | "| [dask_ml](http://ml.dask.org/clustering.html#spectral-clustering) | Helpful | Spectral clustering at scale | \n", 59 | "\n", 60 | "\n", 61 | "- **Time to learn**: 20 minutes." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "dd99cc55-fb0f-4bdf-bc7a-82044188a2f2", 67 | "metadata": {}, 68 | "source": [ 69 | "## Imports" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "e106b4ce-e682-4a71-817b-966bbf926989", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import numpy as np\n", 80 | "import odc.stac\n", 81 | "import pandas as pd\n", 82 | "import planetary_computer\n", 83 | "import pystac_client\n", 84 | "import xarray as xr\n", 85 | "from dask.distributed import Client\n", 86 | "from pystac.extensions.eo import EOExtension as eo\n", 87 | "from dask_ml.cluster import SpectralClustering\n", 88 | "import pyproj\n", 89 | "\n", 90 | "# Viz\n", 91 | "import hvplot.xarray" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "2eef62e1-2df6-4e67-9e10-92c9bf74c136", 97 | "metadata": {}, 98 | "source": [ 99 | "## Loading Data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "41855f82-661f-4c02-9c48-6893a694bfe4", 105 | "metadata": {}, 106 | "source": [ 107 | "Let's start by loading some Landsat data. These steps are covered in the [Data Ingestion - Geospatial-Specific Tooling](1.0_Data_Ingestion-Geospatial.ipynb) prerequisite." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "5dbc14ea-a560-4b01-81c0-4fc01f767de9", 113 | "metadata": {}, 114 | "source": [ 115 | "### Search the catalog" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "c5059488-8f7e-446f-97d4-992eab1e7928", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "catalog = pystac_client.Client.open(\n", 126 | " \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n", 127 | " modifier=planetary_computer.sign_inplace,\n", 128 | ")\n", 129 | "\n", 130 | "bbox = [-118.89, 38.54, -118.57, 38.84] # Region over a lake in Nevada, USA\n", 131 | "datetime = \"2017-06-01/2017-09-30\" # Summer months of 2017\n", 132 | "collection = \"landsat-c2-l2\"\n", 133 | "platform = \"landsat-8\"\n", 134 | "cloudy_less_than = 1 # percent\n", 135 | "\n", 136 | "search = catalog.search(\n", 137 | " collections=[\"landsat-c2-l2\"],\n", 138 | " bbox=bbox,\n", 139 | " datetime=datetime,\n", 140 | " query={\"eo:cloud_cover\": {\"lt\": cloudy_less_than}, \"platform\": {\"in\": [platform]}},\n", 141 | ")\n", 142 | "items = search.get_all_items()\n", 143 | "print(f\"Returned {len(items)} Items:\")\n", 144 | "[[i, item.id] for i, item in enumerate(items)]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "e89ef2a4-8b5d-4799-a529-3d9adbc61a89", 150 | "metadata": {}, 151 | "source": [ 152 | "### Load a dataset" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "4fbc1c60-ab38-49b8-beb2-167dc4b6f298", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "item = items[1] # select one of the results" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "d5f6b4f4-80aa-4e82-bc9b-bd70b6d63d49", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "assets = []\n", 173 | "for _, asset in item.assets.items():\n", 174 | " try:\n", 175 | " assets.append(asset.extra_fields[\"eo:bands\"][0])\n", 176 | " except (KeyError, IndexError): # asset has no 'eo:bands' metadata\n", 177 | " pass\n", 178 | "\n", 179 | "cols_ordered = [\n", 180 | " \"common_name\",\n", 181 | " \"description\",\n", 182 | " \"name\",\n", 183 | " \"center_wavelength\",\n", 184 | " \"full_width_half_max\",\n", 185 | "]\n", 186 | "bands = 
pd.DataFrame.from_dict(assets)[cols_ordered]\n", 187 | "bands" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "id": "d17bcf9e-116c-4471-90ca-754a3b3fb305", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "ds_2017 = odc.stac.stac_load(\n", 198 | " [item],\n", 199 | " bands=bands.common_name.values,\n", 200 | " bbox=bbox,\n", 201 | " chunks={}, # <-- use Dask\n", 202 | ").isel(time=0)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "03e8f2cb-31ef-4e42-a16f-ffa7f2e79d78", 208 | "metadata": {}, 209 | "source": [ 210 | "### Retain CRS Attribute" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "a8f8e88f-a517-4a13-b823-e8dff53b0e47", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "epsg = item.properties[\"proj:epsg\"]\n", 221 | "ds_2017.attrs[\"crs\"] = f\"epsg:{epsg}\"" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "7b7d3201-bb41-48d0-90b7-565c2e454f9c", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "da_2017 = ds_2017.to_array(dim=\"band\")\n", 232 | "da_2017" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "865eb3d1-59a9-401a-b1f1-7588bb5c1700", 238 | "metadata": {}, 239 | "source": [ 240 | "## Reshaping Data\n", 241 | "\n", 242 | "The shape of our data is currently `n_bands`, `n_y`, `n_x`. In order for dask-ml / scikit-learn to consume our data, we'll need to reshape our image stacks into `n_samples, n_features`, where `n_features` is the number of wavelength-bands and `n_samples` is the total number of pixels in each wavelength-band image. Essentially, we'll be creating a vector of pixels out of each image, where each pixel has multiple features (bands), but the ordering of the pixels is no longer relevant to the computation. " 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "042bfffb-c979-4958-9086-646a83918d61", 248 | "metadata": {}, 249 | "source": [ 250 | "By using xarray methods to flatten the data, we can keep track of the coordinate labels 'x' and 'y' along the way. This means that we have the ability to reshape back to our original array at any time with no information loss!" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "653adf8b-da16-4eb6-ae64-d172c8eae75f", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "flattened_xda = da_2017.stack(z=(\"x\", \"y\")) # flatten each band\n", 261 | "flattened_t_xda = flattened_xda.transpose(\"z\", \"band\")\n", 262 | "flattened_t_xda" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "id": "e427b1dd-08c3-4657-84e7-09b0269edad9", 268 | "metadata": {}, 269 | "source": [ 270 | "## Standardize Data\n", 271 | "\n", 272 | "Now that we have the data in the correct shape, let's standardize (or rescale) the values of the data. We do this to get all the flattened image vectors onto a common scale while preserving the differences in the ranges of values. We'll sketch the z-score operation on a small NumPy array first, and then use xarray to apply the same rescaling lazily across the whole stack."
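]
},
{
"cell_type": "markdown",
"id": "numpy-zscore-sketch-note",
"metadata": {},
"source": [
"As a toy illustration (the values below are made up, and nothing here is used later in the pipeline), z-scoring a tiny 3-pixel, 2-band matrix looks like this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "numpy-zscore-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical 3-pixel x 2-band matrix to illustrate the z-score rescaling\n",
"toy = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.9]])\n",
"rescaled_toy = (toy - toy.mean()) / toy.std()\n",
"\n",
"# After rescaling, the values have (approximately) zero mean and unit std\n",
"print(rescaled_toy.mean().round(6), rescaled_toy.std().round(6))"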
273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "d8cd5c3c-1256-400a-9cea-32debd51ec4d", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "with xr.set_options(keep_attrs=True):\n", 283 | " rescaled_xda = (flattened_t_xda - flattened_t_xda.mean()) / flattened_t_xda.std()\n", 284 | "rescaled_xda" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "37a47e0b-7f74-404f-8f16-bf80c3bb1995", 290 | "metadata": {}, 291 | "source": [ 292 | "
\n", 293 | "

Info

\n", 294 | " Above, we are using a context manager \"with xr.set_options(keep_attrs=True):\" to retain the array's attributes through the operations. That is, we want any metadata like 'crs' to stay with our result so we can use 'geo=True' in our plotting.\n", 295 | "
" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "27d69080-eff4-471e-9536-02f99420bd0a", 301 | "metadata": {}, 302 | "source": [ 303 | "As `rescaled_xda` is still a Dask object, if we wanted to actually run the rescaling at this point (provided that all the data can fit into memory), we would use `rescaled_xda.compute()`." 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "07b01093-da37-419c-93f2-9bba60be1578", 309 | "metadata": {}, 310 | "source": [ 311 | "\n", 312 | "## ML pipeline\n", 313 | "Now that our data is in the proper shape and value range, we are ready to conduct spectral clustering. Here we will use a version of [spectral clustering from dask_ml](https://ml.dask.org/modules/generated/dask_ml.cluster.SpectralClustering.html) that is a scalable equivalent to operations from Scikit-learn that cluster pixels based on similarity (across all wavelength-bands, which makes it spectral clustering by spectra!)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "62075e17-55d1-470c-8df6-55be6ab895bd", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "client = Client(processes=False)\n", 324 | "client" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "2d68c9d8-9b46-4123-8ab1-cef625ad9ea6", 330 | "metadata": {}, 331 | "source": [ 332 | "Now we will compute and persist the rescaled data to feed into the ML pipeline. Notice that our `X` matrix below has the shape: `n_samples, n_features` as discussed earlier. " 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "083a3ea0-1ae2-42d0-ac83-31d66a01b11d", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "X = client.persist(rescaled_xda)\n", 343 | "X.shape" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "c0f22024-87fd-49d2-8e06-6e6eff714708", 349 | "metadata": {}, 350 | "source": [ 351 | "First we will set up the model with the number of clusters, and other options." 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "id": "4afcf563-8d46-4b35-9936-3d330e226d26", 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "clf = SpectralClustering(\n", 362 | " n_clusters=4,\n", 363 | " random_state=0,\n", 364 | " gamma=None,\n", 365 | " kmeans_params={\"init_max_iter\": 5},\n", 366 | " persist_embedding=True,\n", 367 | ")" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "id": "0666c6af-b948-41ed-a789-7f7cae9459f0", 373 | "metadata": {}, 374 | "source": [ 375 | "**This next step is the slow part.** We'll fit the model to our matrix `X`. Depending on your setup, it could take seconds to minutes to run depending on the size of our data." 
376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "372c890c-8ce7-4ce8-958b-43909c3de0e9", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "%time clf.fit(X)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "60e746bd-5aa6-436a-84de-3b93e1bed1dd", 391 | "metadata": {}, 392 | "source": [ 393 | "Let's check the shape of the result:" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "73e9af39-dadf-4956-b23e-5dc3feb65c56", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "labels = clf.assign_labels_.labels_.compute()\n", 404 | "labels.shape" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "f9e1a56c-0d6e-4c89-bbe8-0c8d9d0dd05f", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "labels" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "e8e59036-e8c0-4a0e-8d78-9037509b8c90", 420 | "metadata": {}, 421 | "source": [ 422 | "The result is a single vector of cluster labels." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "id": "084207cc-3e00-419e-a61d-300ad0868a6e", 428 | "metadata": { 429 | "tags": [] 430 | }, 431 | "source": [ 432 | "## Un-flattening\n", 433 | "\n", 434 | "Once the computation is done, we can use the coordinates of our input array to restack our output array back into an image. Again, one of the main benefits of using `xarray` for this stacking and unstacking is that it keeps track of the coordinate information for us. " 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "id": "625ae4c4-64cd-46fd-b6ec-43810c8be359", 440 | "metadata": {}, 441 | "source": [ 442 | "Since the original array is n_samples by n_features (90000, 6) and the cluster label output is (90000,), we just need the coordinates from one of the original features in the shape of n_samples. We can just copy the coordinates from the first input feature and populate it with our output data:" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "id": "642cf283-64d8-4c9e-91c3-32be8ec35c4b", 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "template = flattened_t_xda[:, 0]\n", 453 | "output_array = template.copy(data=labels)\n", 454 | "output_array" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "id": "d321a58f-9617-4540-98c7-8a7bccd414ae", 460 | "metadata": {}, 461 | "source": [ 462 | "With this new output array with coordinates copied from the input array, we can unstack back to the original `x` and `y` image dimensions by just using `.unstack()`." 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "id": "d16ec3f0-c6dc-470b-9e56-7c3540066af7", 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "unstacked_2017 = output_array.unstack()\n", 473 | "unstacked_2017" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "id": "588814fe-3776-4ef5-9392-dd60e6e6776c", 479 | "metadata": {}, 480 | "source": [ 481 | "Finally, we can visualize the results! By hovering over the resulting image, we can see that the lake water has been clustered with a certain label or 'value'."
482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "a9e6863f-0cec-436b-bdcc-9aa533e6df9f", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "raw_plot_2017 = da_2017.sel(band=\"red\").hvplot.image(\n", 492 | " x=\"x\", y=\"y\", geo=True, xlabel=\"lon\", ylabel=\"lat\", datashade=True, cmap=\"greys\", title=\"Raw Image 2017\",\n", 493 | ")\n", 494 | "\n", 495 | "result_plot_2017 = unstacked_2017.hvplot(\n", 496 | " x=\"x\", y=\"y\", cmap=\"Set3\", geo=True, xlabel=\"lon\", ylabel=\"lat\", colorbar=False, title=\"Spectral Clustering 2017\",\n", 497 | ")\n", 498 | "\n", 499 | "raw_plot_2017 + result_plot_2017" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "id": "ba47bdd3-995b-4b98-8cf0-efeea65688da", 505 | "metadata": {}, 506 | "source": [ 507 | "## Spectral Clustering for 1988" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "id": "bea4b2b5-5e74-4f8c-b171-89a24e57b8ae", 513 | "metadata": {}, 514 | "source": [ 515 | "We have conducted the spectral clustering for 2017 and now we want to compare this result to the lake in 1988. Let's load data from 1988 and run the same analysis as above." 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "id": "f4d284d4-23b9-4c77-9275-58aeb9c6e563", 521 | "metadata": {}, 522 | "source": [ 523 | "We will use the same catalog, but we will search it for a different point in time and a different Landsat mission." 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "id": "dcb18049-fb4c-44e4-9d72-90bd24094bd7", 529 | "metadata": {}, 530 | "source": [ 531 | "### Load the data" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "id": "e00ab9e6-b36b-47ec-a5a3-4b6bbf6a9ca6", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "bbox = [-118.89, 38.54, -118.57, 38.84] # Region over a lake in Nevada, USA\n", 542 | "datetime = \"1988-06-01/1988-09-30\" # Summer months of 1988\n", 543 | "collection = \"landsat-c2-l2\"\n", 544 | "platform = \"landsat-5\" # Searching through an earlier landsat mission\n", 545 | "cloudy_less_than = 1 # percent\n", 546 | "\n", 547 | "search = catalog.search(\n", 548 | " collections=[\"landsat-c2-l2\"],\n", 549 | " bbox=bbox,\n", 550 | " datetime=datetime,\n", 551 | " query={\"eo:cloud_cover\": {\"lt\": cloudy_less_than}, \"platform\": {\"in\": [platform]}},\n", 552 | ")\n", 553 | "\n", 554 | "items = search.get_all_items()\n", 555 | "item = items[1] # select one of the results" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "id": "1428fff4-fc72-4692-9c20-e9e168337fc2", 561 | "metadata": {}, 562 | "source": [ 563 | "Notice that Landsat 5 data from 1988 has slightly different spectral bands than Landsat 8 data from 2017. Details like this are important to keep in mind when performing analyses that directly compare across missions. The quick comparison below makes this concrete."
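]
},
{
"cell_type": "markdown",
"id": "band-table-comparison-note",
"metadata": {},
"source": [
"For a quick side-by-side of the two missions' band definitions, we can load the small reference tables shipped in this cookbook's `data/` folder (this sketch assumes the notebook is run from the `notebooks/` directory, where those CSVs live):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "band-table-comparison-code",
"metadata": {},
"outputs": [],
"source": [
"# Band reference tables included with this cookbook\n",
"l5_bands = pd.read_csv(\"data/landsat5_bands.csv\", skipinitialspace=True)\n",
"l8_bands = pd.read_csv(\"data/landsat8_bands.csv\", skipinitialspace=True)\n",
"\n",
"# Align on the band description to compare the wavelength ranges\n",
"l5_bands.merge(l8_bands, on=\"Description\", how=\"outer\", suffixes=(\" (L5)\", \" (L8)\"))"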
564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "665e250d-6790-402d-b877-bbe967556089", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "assets = []\n", 574 | "for _, asset in item.assets.items():\n", 575 | " try:\n", 576 | " assets.append(asset.extra_fields[\"eo:bands\"][0])\n", 577 | " except (KeyError, IndexError): # asset has no 'eo:bands' metadata\n", 578 | " pass\n", 579 | "\n", 580 | "cols_ordered = [\n", 581 | " \"common_name\",\n", 582 | " \"description\",\n", 583 | " \"name\",\n", 584 | " \"center_wavelength\",\n", 585 | " \"full_width_half_max\",\n", 586 | "]\n", 587 | "bands = pd.DataFrame.from_dict(assets)[cols_ordered]\n", 588 | "bands" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "cef800e2-55f5-4777-90a5-63d466ce5f6f", 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "ds_1988 = odc.stac.stac_load(\n", 599 | " [item],\n", 600 | " bands=bands.common_name.values,\n", 601 | " bbox=bbox,\n", 602 | " chunks={}, # <-- use Dask\n", 603 | ").isel(time=0)\n", 604 | "\n", 605 | "epsg = item.properties[\"proj:epsg\"]\n", 606 | "ds_1988.attrs[\"crs\"] = f\"epsg:{epsg}\"\n", 607 | "\n", 608 | "da_1988 = ds_1988.to_array(dim=\"band\")\n", 609 | "da_1988" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "057bd00b-193a-45b1-97ac-62a77fa0a322", 615 | "metadata": {}, 616 | "source": [ 617 | "### Reshape and Standardize" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "id": "ceb4ee87-9a8b-4278-9721-c2af8a56f6b8", 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "flattened_xda = da_1988.stack(z=(\"x\", \"y\"))\n", 628 | "flattened_t_xda = flattened_xda.transpose(\"z\", \"band\")\n", 629 | "with xr.set_options(keep_attrs=True):\n", 630 | " rescaled_xda = (flattened_t_xda - flattened_t_xda.mean()) / flattened_t_xda.std()\n", 631 | "rescaled_xda" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "id": "61bf4b05-3266-4bbd-b618-f4c01b56eccd", 637 | "metadata": {}, 638 | "source": [ 639 | "### Spectral Clustering" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "id": "4a4a50c6-f6dd-46c2-afb3-95196b341e21", 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "X = client.persist(rescaled_xda)\n", 650 | "clf = SpectralClustering(\n", 651 | " n_clusters=4,\n", 652 | " random_state=0,\n", 653 | " gamma=None,\n", 654 | " kmeans_params={\"init_max_iter\": 5},\n", 655 | " persist_embedding=True,\n", 656 | ")" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "id": "229368ab-ed3a-4fc1-b328-d84b7e7ebebb", 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "%time clf.fit(X)" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "id": "ec353001-4e0e-44af-b38d-f602f5a1dab8", 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "labels = clf.assign_labels_.labels_.compute()\n", 677 | "labels.shape" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "id": "937d7db4-b848-499b-8da2-8d00287b5d81", 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "labels" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "id": "1c690262-bb13-48cd-9e24-d3be8ed8fb91", 693 | "metadata": {}, 694 | "source": [ 695 | "### Unstack and Visualize" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "id": 
"ba40cfc1-2da7-4be2-80f8-4bd0838dd64a", 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "template = flattened_t_xda[:, 0]\n", 706 | "output_array = template.copy(data=labels)\n", 707 | "unstacked_1988 = output_array.unstack()" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "b3be0319-7f51-4384-befe-f0afcd2fecff", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "unstacked_1988" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "id": "d41ed729-af6c-4817-aad1-154671e743c5", 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "raw_plot_1988 = da_1988.sel(band=\"red\").hvplot.image(\n", 728 | " x=\"x\", y=\"y\", geo=True, xlabel=\"lon\", ylabel=\"lat\", datashade=True, cmap=\"greys\", title=\"Raw 1988\"\n", 729 | ")\n", 730 | "\n", 731 | "result_plot_1988 = unstacked_1988.hvplot(\n", 732 | " x=\"x\", y=\"y\", cmap=\"Set3\", geo=True, xlabel=\"lon\", ylabel=\"lat\", colorbar=False, title=\"Spectral Clustering 1988\",\n", 733 | ")\n", 734 | "\n", 735 | "raw_plot_1988 + result_plot_1988" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "id": "6ff424f8-b243-4a20-a069-ae45f2490370", 741 | "metadata": {}, 742 | "source": [ 743 | "## Spectral Clustering Over Time" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "id": "c872fea9-55bc-4c60-8979-91796ef649a9", 749 | "metadata": {}, 750 | "source": [ 751 | "Our hypothesis is that the lake's area is receding over time and so we want to visualize the potential change. Let's first visually compare the plot of the clustering results from the different time points." 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "id": "5b3f7a6d-3651-4324-8bed-a044034e3e67", 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "result_plot_1988 + result_plot_2017" 762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "id": "4b170f41-3442-4fce-91be-40a41317f38d", 767 | "metadata": {}, 768 | "source": [ 769 | "By hovering over the lake in each image, we can see that the water was labeled ('value') with a certain cluster number in both images. We will programmatically grab the water cluster value from the middle of the lake using pyproj to convert from longtitude/latitude coordinates." 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "59dc881c-0ef8-425c-b18d-b11581c8bbcf", 776 | "metadata": { 777 | "tags": [] 778 | }, 779 | "outputs": [], 780 | "source": [ 781 | "lon_lake_center = -118.71\n", 782 | "lat_lake_center = 38.7\n", 783 | "\n", 784 | "proj = pyproj.Proj(unstacked_1988.crs)\n", 785 | "lake_center_x, lake_center_y = proj(lon_lake_center, lat_lake_center)\n", 786 | "\n", 787 | "water_cluster_1988 = int(unstacked_1988.sel(x=lake_center_x, y=lake_center_y, method='nearest'))\n", 788 | "water_cluster_2017 = int(unstacked_2017.sel(x=lake_center_x, y=lake_center_y, method='nearest'))\n", 789 | "\n", 790 | "print('water cluster values:', water_cluster_1988, water_cluster_2017)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "id": "0b51d984-1b83-408b-9758-c843e9d8c58e", 796 | "metadata": {}, 797 | "source": [ 798 | "Now, let's set any value that isn't our water cluster label to 0." 
799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "f20df2e1-b66d-4857-a39d-9a16ee50ad22", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "with xr.set_options(keep_attrs=True):\n", 809 | " water_1988 = (unstacked_1988 == water_cluster_1988).astype(int)\n", 810 | " water_2017 = (unstacked_2017 == water_cluster_2017).astype(int)\n" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "id": "b90cabc5-cac6-4e53-8730-9b808c688104", 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [ 820 | "water_1988_plot = water_1988.hvplot(\n", 821 | " x=\"x\", y=\"y\", cmap=\"greys\", geo=True, colorbar=False, title=\"1988 Water\"\n", 822 | ")\n", 823 | "\n", 824 | "water_2017_plot = water_2017.hvplot(\n", 825 | " x=\"x\", y=\"y\", cmap=\"greys\", geo=True, colorbar=False, title=\"2017 Water\"\n", 826 | ")\n", 827 | "\n", 828 | "water_1988_plot + water_2017_plot" 829 | ] 830 | }, 831 | { 832 | "cell_type": "markdown", 833 | "id": "313e08b0-bbec-43cf-94fa-a4b8a84e1177", 834 | "metadata": {}, 835 | "source": [ 836 | "Now we can take the difference of these water label arrays to see exactly where the water level has changed." 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "id": "4af122d1-3044-48cc-a0be-3922839474c1", 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "with xr.set_options(keep_attrs=True):\n", 847 | " water_diff = water_1988 - water_2017" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "id": "d5517bf1-bced-455e-a053-528b47809e0a", 853 | "metadata": {}, 854 | "source": [ 855 | "Red pixels (array value '1') of our image below are where water was lost from 1988 to 2017." 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "id": "2b9ac483-0240-4ffc-b167-f84eac832a32", 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "water_diff.hvplot(\n", 866 | " x=\"x\", y=\"y\", cmap='coolwarm', geo=True, xlabel=\"long\", ylabel=\"lat\", colorbar=False, title=\"Water Change 1988-2017\",\n", 867 | ")" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "id": "1466477f-19b5-4866-940c-5cc52c979b83", 873 | "metadata": {}, 874 | "source": [ 875 | "We did it! We are observing the change in the lake shoreline over time using a simple spectral clustering approach." 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "id": "d14ad11a-18df-4082-a596-a0be684e75a7", 881 | "metadata": {}, 882 | "source": [ 883 | "Let's finish things off by adding some geo tiles as a background. To only display the colored pixels overlaid on geo tiles, we could either set the array's background value ('0') to 'Not a Number' (NaN), or we could just inform hvPlot that we want the background valued pixels to be transparent with `.redim.nodata(value=0)`. We'll take the `.redim.nodata` route in the final cell; a sketch of the NaN alternative follows."
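]
},
{
"cell_type": "markdown",
"id": "nan-masking-alternative-note",
"metadata": {},
"source": [
"For reference, here is a minimal sketch of the NaN-masking alternative (the name `water_diff_nan` is hypothetical and is not used elsewhere in this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nan-masking-alternative-code",
"metadata": {},
"outputs": [],
"source": [
"# Alternative (sketch): replace the background 0s with NaN so that\n",
"# hvPlot would leave those pixels unshaded when overlaid on map tiles\n",
"water_diff_nan = water_diff.where(water_diff != 0)\n",
"water_diff_nan"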
884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "id": "17ad0e8a-c880-4c67-9994-4e1425da2829", 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "water_diff.hvplot(\n", 894 | " x=\"x\", y=\"y\", width=400, height=400, cmap='coolwarm', geo=True, xlabel=\"lon\", ylabel=\"lat\", alpha=1, colorbar=False, title=\"Water Loss from 1988 to 2017\", tiles=\"ESRI\",\n", 895 | ").redim.nodata(value=0)\n", 896 | "\n" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "id": "46b24b86-d262-4ce3-8cae-5b4185574dc7", 902 | "metadata": {}, 903 | "source": [ 904 | "___" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "id": "072e479f-7b69-495c-83a3-9b77352be007", 910 | "metadata": {}, 911 | "source": [ 912 | "## Summary\n", 913 | "Starting from raw Landsat data, we have used a simple spectral clustering approach to observe the change in a lake water's extent across time.\n", 914 | "\n", 915 | "### What's next?\n", 916 | "Adapt this notebook for your own use case or select another workflow example notebook." 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "id": "ebee9cbd-ed88-4827-809c-c820b8af509a", 922 | "metadata": {}, 923 | "source": [ 924 | "## Resources and References\n", 925 | "- Authored by Demetris Roumis circa Jan, 2023" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "id": "3993488c-4dcf-4613-8c21-b02431112866", 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [] 935 | } 936 | ], 937 | "metadata": { 938 | "kernelspec": { 939 | "display_name": "Python 3 (ipykernel)", 940 | "language": "python", 941 | "name": "python3" 942 | }, 943 | "language_info": { 944 | "codemirror_mode": { 945 | "name": "ipython", 946 | "version": 3 947 | }, 948 | "file_extension": ".py", 949 | "mimetype": "text/x-python", 950 | "name": "python", 951 | "nbconvert_exporter": "python", 952 | "pygments_lexer": "ipython3", 953 | "version": "3.10.13" 954 | }, 955 | "vscode": { 956 | "interpreter": { 957 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 958 | } 959 | } 960 | }, 961 | "nbformat": 4, 962 | "nbformat_minor": 5 963 | } 964 | -------------------------------------------------------------------------------- /notebooks/data/catalog.yml: -------------------------------------------------------------------------------- 1 | sources: 2 | landsat_5_small: 3 | description: Small version of Landsat 5 Surface Reflectance Level-2 Science Product. 4 | driver: rasterio 5 | cache: 6 | - argkey: urlpath 7 | regex: 'earth-data/landsat' 8 | type: file 9 | args: 10 | urlpath: 's3://earth-data/landsat/small/LT05_L1TP_042033_19881022_20161001_01_T1_sr_band{band:d}.tif' 11 | chunks: 12 | band: 1 13 | x: 50 14 | y: 50 15 | concat_dim: band 16 | storage_options: {'anon': True} 17 | metadata: 18 | plots: 19 | band_image: 20 | kind: 'image' 21 | x: 'x' 22 | y: 'y' 23 | groupby: 'band' 24 | rasterize: True 25 | width: 400 26 | dynamic: False 27 | 28 | landsat_8_small: 29 | description: Small version of Landsat 8 Surface Reflectance Level-2 Science Product. 
30 | driver: rasterio 31 | cache: 32 | - argkey: urlpath 33 | regex: 'earth-data/landsat' 34 | type: file 35 | args: 36 | urlpath: 's3://earth-data/landsat/small/LC08_L1TP_042033_20171022_20171107_01_T1_sr_band{band:d}.tif' 37 | chunks: 38 | band: 1 39 | x: 50 40 | y: 50 41 | concat_dim: band 42 | storage_options: {'anon': True} 43 | 44 | landsat_5: 45 | description: Images contain Landsat 5 Surface Reflectance Level-2 Science Product. 46 | driver: rasterio 47 | cache: 48 | - argkey: urlpath 49 | regex: 'earth-data/landsat' 50 | type: file 51 | args: 52 | urlpath: 's3://earth-data/landsat/LT05_L1TP_042033_19881022_20161001_01_T1_sr_band{band:d}.tif' 53 | chunks: 54 | band: 1 55 | x: 256 56 | y: 256 57 | concat_dim: band 58 | storage_options: {'anon': True} 59 | metadata: 60 | plots: 61 | band_image: 62 | kind: 'image' 63 | x: 'x' 64 | y: 'y' 65 | groupby: 'band' 66 | rasterize: True 67 | width: 400 68 | 69 | landsat_8: 70 | description: Images contain Landsat 8 Surface Reflectance Level-2 Science Product. 71 | driver: rasterio 72 | cache: 73 | - argkey: urlpath 74 | regex: 'earth-data/landsat' 75 | type: file 76 | args: 77 | urlpath: 's3://earth-data/landsat/LC08_L1TP_042033_20171022_20171107_01_T1_sr_band{band:d}.tif' 78 | chunks: 79 | band: 1 80 | x: 256 81 | y: 256 82 | concat_dim: band 83 | storage_options: {'anon': True} 84 | 85 | google_landsat_band: 86 | description: Landsat bands from Google Cloud Storage 87 | driver: rasterio 88 | parameters: 89 | path: 90 | description: landsat path 91 | type: int 92 | row: 93 | description: landsat row 94 | type: int 95 | product_id: 96 | description: landsat file id 97 | type: str 98 | band: 99 | description: band 100 | type: int 101 | args: 102 | urlpath: https://storage.googleapis.com/gcp-public-data-landsat/LC08/01/{{ '%03d' % path }}/{{ '%03d' % row }}/{{ product_id }}/{{ product_id }}_B{{ band }}.TIF 103 | chunks: 104 | band: 1 105 | x: 256 106 | y: 256 107 | 108 | amazon_landsat_band: 109 | description: Landsat bands from Amazon Web Services S3 110 | driver: rasterio 111 | parameters: 112 | path: 113 | description: landsat path 114 | type: int 115 | row: 116 | description: landsat row 117 | type: int 118 | product_id: 119 | description: landsat file id 120 | type: str 121 | band: 122 | description: band 123 | type: int 124 | cache: 125 | - argkey: urlpath 126 | regex: 'landsat-pds' 127 | type: file 128 | args: 129 | urlpath: s3://landsat-pds/c1/L8/{{ '%03d' % path }}/{{ '%03d' % row }}/{{ product_id }}/{{ product_id }}_B{{ band }}.TIF 130 | chunks: 131 | band: 1 132 | x: 256 133 | y: 256 134 | storage_options: {'anon': True} 135 | 136 | fluxnet_daily: 137 | driver: csv 138 | parameters: 139 | s3_path: 140 | description: Filename to load 141 | type: str 142 | default: earth-data/carbon_flux/nee_data_fusion/FLX_AR-SLu_FLUXNET2015_FULLSET_DD_2009-2011_1-3.csv 143 | cache: 144 | - argkey: urlpath 145 | regex: 'earth-data' 146 | type: file 147 | args: 148 | urlpath: "s3://{{ s3_path }}" 149 | path_as_pattern: 'FLX_{site}_FLUXNET2015_FULLSET_DD_{}.csv' 150 | csv_kwargs: 151 | assume_missing: true 152 | na_values: [-9999] 153 | parse_dates: ['TIMESTAMP'] 154 | storage_options: {'anon': True} 155 | 156 | fluxnet_metadata: 157 | driver: csv 158 | cache: 159 | - argkey: urlpath 160 | regex: 'earth-data' 161 | type: file 162 | args: 163 | urlpath: "s3://earth-data/carbon_flux/nee_data_fusion/allflux_metadata.txt" 164 | csv_kwargs: 165 | header: null 166 | names: ['site', 'lat', 'lon', 'igbp', 'network'] 167 | usecols: ['site', 'lat', 'lon', 'igbp'] 
168 | storage_options: {'anon': True} 169 | 170 | seattle_lidar: 171 | driver: csv 172 | cache: 173 | - argkey: urlpath 174 | regex: 'https://s3.amazonaws.com/earth-data' 175 | type: compressed 176 | decomp: infer 177 | args: 178 | urlpath: "https://s3.amazonaws.com/earth-data/seattle-lidar.zip" 179 | storage_options: {'anon': True} 180 | metadata: 181 | crs: State Plane Coordinate System Washington North FIPS 4601 182 | -------------------------------------------------------------------------------- /notebooks/data/landsat5_bands.csv: -------------------------------------------------------------------------------- 1 | Band, Description, Range (nm) 2 | 1, Blue, 450-520 3 | 2, Green, 520-600 4 | 3, Red, 630-690 5 | 4, Near-Infrared, 760-900 6 | 5, Short Wavelength Infrared 1, 1550-1750 7 | 6, Thermal Infrared, 10400-12500 8 | 7, Short Wavelength Infrared 2, 2080-2350 9 | -------------------------------------------------------------------------------- /notebooks/data/landsat5_crop.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/data/landsat5_crop.nc -------------------------------------------------------------------------------- /notebooks/data/landsat8_bands.csv: -------------------------------------------------------------------------------- 1 | Band, Description, Range (nm) 2 | 1, Coastal Aerosol, 435-451 3 | 2, Blue, 452-512 4 | 3, Green, 533-590 5 | 4, Red, 636-673 6 | 5, Near-Infrared, 851-879 7 | 6, Short Wavelength Infrared 1, 1566-1651 8 | 7, Short Wavelength Infrared 2, 2107-2294 -------------------------------------------------------------------------------- /notebooks/images/L-Next-SpectralBands-stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/L-Next-SpectralBands-stack.png -------------------------------------------------------------------------------- /notebooks/images/ProjectPythia_Logo_Final-01-Blue.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/ProjectPythia_Logo_Final-01-Blue.svg -------------------------------------------------------------------------------- /notebooks/images/icons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/icons/favicon.ico -------------------------------------------------------------------------------- /notebooks/images/intake_landsat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/intake_landsat.png -------------------------------------------------------------------------------- /notebooks/images/landsat_8_rend-sm1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/landsat_8_rend-sm1.png -------------------------------------------------------------------------------- /notebooks/images/landsat_timeline.png: --------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/notebooks/images/L-Next-SpectralBands-stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/L-Next-SpectralBands-stack.png
--------------------------------------------------------------------------------
/notebooks/images/ProjectPythia_Logo_Final-01-Blue.svg:
--------------------------------------------------------------------------------
[SVG markup omitted]
--------------------------------------------------------------------------------
/notebooks/images/icons/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/icons/favicon.ico
--------------------------------------------------------------------------------
/notebooks/images/intake_landsat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/intake_landsat.png
--------------------------------------------------------------------------------
/notebooks/images/landsat_8_rend-sm1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/landsat_8_rend-sm1.png
--------------------------------------------------------------------------------
/notebooks/images/landsat_timeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/landsat_timeline.png
--------------------------------------------------------------------------------
/notebooks/images/logos/Unidata_logo_horizontal_1200x300.svg:
--------------------------------------------------------------------------------
[SVG markup omitted]
--------------------------------------------------------------------------------
/notebooks/images/logos/pythia_logo-white-notext.svg:
--------------------------------------------------------------------------------
[SVG markup omitted]
--------------------------------------------------------------------------------
/notebooks/images/logos/pythia_logo-white-rtext.svg:
--------------------------------------------------------------------------------
[SVG markup omitted]
--------------------------------------------------------------------------------
/notebooks/images/nasa_bands.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/nasa_bands.png
--------------------------------------------------------------------------------
/notebooks/images/nasa_landsat8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/nasa_landsat8.jpg
--------------------------------------------------------------------------------
/notebooks/images/planetary_computer_header_800w.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/planetary_computer_header_800w.png
--------------------------------------------------------------------------------
/notebooks/images/pystac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/pystac.png
--------------------------------------------------------------------------------
/notebooks/images/spectral_clustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/spectral_clustering.png
--------------------------------------------------------------------------------
/notebooks/images/spectral_clustering_lake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/notebooks/images/spectral_clustering_lake.png
--------------------------------------------------------------------------------
/thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPythia/landsat-ml-cookbook/0854cfe3bcc5b27ee99e64363c216b8d9711593b/thumbnail.png
--------------------------------------------------------------------------------