├── .github └── workflows │ └── build.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierrc.toml ├── LICENSE ├── README.md ├── binder └── environment.yml ├── images ├── Dask Array (Light).png ├── Dask Cluster Manager (Light)(1).png ├── Dask Dataframe (Light).png ├── Dask Overview (Light).png ├── Distributed Overview (Light).png ├── Xarray-data-structures.png ├── dask-task-stream.gif ├── dask_horizontal.svg ├── dataset-diagram-logo.png ├── should-i-use-dask.png ├── xarray-data-structures.svg └── xarray-split-apply-combine.png ├── notebooks ├── 00-download-data.ipynb ├── 01-xarray-fundamentals.ipynb ├── 02-indexing-and-selecting-data.ipynb ├── 03-data-visualization.ipynb ├── 04-computation.ipynb ├── 05-masking.ipynb ├── 06-end-to-end-example.ipynb ├── 07-dask-intro.ipynb ├── 08-dask-delayed.ipynb ├── 09-dask-array.ipynb ├── 10-dask-and-xarray.ipynb ├── 11-dask-distributed.ipynb ├── blank-01-xarray-fundamentals.ipynb ├── blank-02-indexing-and-selecting-data.ipynb ├── blank-03-data-visualization.ipynb ├── blank-04-computation.ipynb ├── blank-05-masking.ipynb └── template.ipynb ├── pyproject.toml └── setup.cfg /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: [push] 3 | 4 | jobs: 5 | binder: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Build and cache on mybinder.org 9 | uses: jupyterhub/repo2docker-action@master 10 | with: 11 | NO_PUSH: true 12 | MYBINDERORG_TAG: ${{ github.event.ref }} 13 | 14 | conda-solve: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [windows-latest, ubuntu-latest, macos-latest] 19 | 20 | steps: 21 | - name: Checkout source 22 | uses: actions/checkout@v2 23 | 24 | - name: Setup Conda Environment 25 | uses: conda-incubator/setup-miniconda@v2 26 | with: 27 | environment-file: binder/environment.yml 28 | activate-environment: xarray-tutorial 29 | auto-activate-base: false 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | data/ 130 | dask-worker-space/ 131 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: schema/generic_schema.yaml 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.2.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: end-of-file-fixer 8 | - id: check-docstring-first 9 | - id: check-json 10 | - id: check-yaml 11 | - id: double-quote-string-fixer 12 | 13 | - repo: https://github.com/ambv/black 14 | rev: 22.3.0 15 | hooks: 16 | - id: black 17 | 18 | - repo: https://github.com/keewis/blackdoc 19 | rev: v0.3.4 20 | hooks: 21 | - id: blackdoc 22 | 23 | - repo: https://gitlab.com/pycqa/flake8 24 | rev: 3.9.2 25 | hooks: 26 | - id: flake8 27 | 28 | - repo: https://github.com/asottile/seed-isort-config 29 | rev: v2.2.0 30 | hooks: 31 | - id: seed-isort-config 32 | - repo: https://github.com/pre-commit/mirrors-isort 33 | rev: v5.10.1 34 | hooks: 35 | - id: isort 36 | 37 | - repo: https://github.com/pre-commit/mirrors-prettier 38 | rev: v2.6.2 39 | hooks: 40 | - id: prettier 41 | 42 | - repo: https://github.com/nbQA-dev/nbQA 43 | rev: 1.3.1 44 | hooks: 45 | - id: nbqa-black 46 | additional_dependencies: [black==21.5b1] 47 | - id: nbqa-pyupgrade 48 | additional_dependencies: [pyupgrade==2.7.3] 49 | - id: nbqa-isort 50 | additional_dependencies: [isort==5.8.0] 51 | -------------------------------------------------------------------------------- /.prettierrc.toml: -------------------------------------------------------------------------------- 1 | tabWidth = 2 2 | semi = false 3 | singleQuote = true 4 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. 
other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. 
Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xarray-tutorial 2 | 3 | [![Build](https://github.com/andersy005/xarray-tutorial/actions/workflows/build.yml/badge.svg)](https://github.com/andersy005/xarray-tutorial/actions/workflows/build.yml) 4 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/andersy005/xarray-tutorial/main?urlpath=lab) 5 | 6 | This repository contains materials for the xarray tutorial. 7 | 8 | ## Running the tutorial 9 | 10 | There are two different ways in which you can set up and go through the tutorial materials. Both of which are outlined in the table below. 11 | 12 | | Method | Setup | Description | 13 | | :-----------: | :----------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 14 | | Binder | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/andersy005/xarray-tutorial/main?urlpath=lab) | Run the tutorial notebooks on mybinder.org without installing anything locally. | 15 | | Local install | [Instructions](#Local-installation-instructions) | Download the tutorial notebooks and install the necessary packages (via `conda`) locally. Setting things up locally can take a few minutes, so we recommend going through the installation steps prior to the tutorial. | 16 | 17 | ## Local installation instructions 18 | 19 | ### 1. Clone the repository 20 | 21 | First clone this repository to your local machine via: 22 | 23 | ``` 24 | git clone https://github.com/andersy005/xarray-tutorial 25 | ``` 26 | 27 | ### 2. Download conda (if you haven't already) 28 | 29 | If you do not already have the conda package manager installed, please follow the instructions [here](https://github.com/conda-forge/miniforge#install). 30 | 31 | ### 3. Create a conda environment 32 | 33 | Navigate to the `xarray-tutorial/` directory and create a new conda environment with the required 34 | packages via: 35 | 36 | ```terminal 37 | cd xarray-tutorial 38 | conda env update --file binder/environment.yml 39 | ``` 40 | 41 | This will create a new conda environment named "xarray-tutorial". 42 | 43 | ### 4. 
Activate the environment 44 | 45 | Next, activate the environment: 46 | 47 | ``` 48 | conda activate xarray-tutorial 49 | ``` 50 | 51 | ### 5. Download sample datasets 52 | 53 | To download sample datasets, run the `00-download-data.ipynb` notebook: 54 | 55 | ```bash 56 | cd notebooks/ 57 | nbterm --run 00-download-data.ipynb 58 | ``` 59 | 60 | ### 6. Launch JupyterLab 61 | 62 | Finally, launch JupyterLab with: 63 | 64 | ``` 65 | jupyter lab 66 | ``` 67 | -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: xarray-tutorial 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - cartopy 7 | - cfgrib 8 | - cftime < 1.5 9 | - dask 10 | - dask-labextension 11 | - distributed 12 | - h5netcdf 13 | - hvplot 14 | - ipywidgets 15 | - jupyterlab-system-monitor 16 | - jupyterlab>=3 17 | - matplotlib 18 | - nbterm 19 | - nc-time-axis 20 | - netcdf4 21 | - nodejs 22 | - pip 23 | - pre-commit 24 | - pydap 25 | - python-graphviz 26 | - python=3.9 27 | - scipy 28 | - watermark 29 | - xarray>=2022.3.0 30 | -------------------------------------------------------------------------------- /images/Dask Array (Light).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Array (Light).png -------------------------------------------------------------------------------- /images/Dask Cluster Manager (Light)(1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Cluster Manager (Light)(1).png -------------------------------------------------------------------------------- /images/Dask Dataframe (Light).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Dataframe (Light).png -------------------------------------------------------------------------------- /images/Dask Overview (Light).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Overview (Light).png -------------------------------------------------------------------------------- /images/Distributed Overview (Light).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Distributed Overview (Light).png -------------------------------------------------------------------------------- /images/Xarray-data-structures.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Xarray-data-structures.png -------------------------------------------------------------------------------- /images/dask-task-stream.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/dask-task-stream.gif 
-------------------------------------------------------------------------------- /images/dask_horizontal.svg: -------------------------------------------------------------------------------- 1 | dask -------------------------------------------------------------------------------- /images/dataset-diagram-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/dataset-diagram-logo.png -------------------------------------------------------------------------------- /images/should-i-use-dask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/should-i-use-dask.png -------------------------------------------------------------------------------- /images/xarray-split-apply-combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/xarray-split-apply-combine.png -------------------------------------------------------------------------------- /notebooks/00-download-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5d7665ed-2124-4773-b8db-5f88cd88ada9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import gzip\n", 11 | "import pathlib\n", 12 | "import shutil\n", 13 | "\n", 14 | "import requests\n", 15 | "\n", 16 | "\n", 17 | "def download_data(url, data_dir=\"data\", unarchive=False, clobber=False):\n", 18 | " data_dir = pathlib.Path(data_dir)\n", 19 | " data_dir.mkdir(parents=True, exist_ok=True)\n", 20 | " local_filename = data_dir / url.split('/')[-1]\n", 21 | " if (local_filename.exists() and clobber) or not local_filename.exists():\n", 22 | " with requests.get(url, stream=True) as rstream:\n", 23 | " with local_filename.open(\"wb\") as f:\n", 24 | " shutil.copyfileobj(rstream.raw, f)\n", 25 | "\n", 26 | " if unarchive:\n", 27 | " local_filename_unarchived = data_dir / local_filename.stem\n", 28 | " if (\n", 29 | " local_filename_unarchived.exists() and clobber\n", 30 | " ) or not local_filename_unarchived.exists():  # unarchive if missing or clobbering, mirroring the download logic above\n", 31 | " with gzip.open(local_filename, \"rb\") as fin:\n", 32 | " with local_filename_unarchived.open(\"wb\") as fout:\n", 33 | " shutil.copyfileobj(fin, fout)\n", 34 | " return str(local_filename_unarchived)\n", 35 | " return local_filename" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "d8c529ad-183e-4c23-ac2f-ea41bbc6950c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "%%time\n", 46 | "\n", 47 | "urls = [\n", 48 | " (\"http://download.ecmwf.int/test-data/cfgrib/era5-levels-members.grib\", False),\n", 49 | " (\"https://psl.noaa.gov/thredds/fileServer/Datasets/noaa.oisst.v2/sst.mnmean.nc\", False),\n", 50 | " (\n", 51 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Amon/tas/gn/v20190514/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\",\n", 52 | " False,\n", 53 | " ),\n", 54 | " (\n", 55 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Amon/ta/gn/v20190514/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\",\n", 56 |
False,\n", 57 | " ),\n", 58 | " (\n", 59 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Ofx/areacello/gr/v20190514/areacello_Ofx_CESM2_historical_r11i1p1f1_gr.nc\",\n", 60 | " False,\n", 61 | " ),\n", 62 | " (\n", 63 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Omon/tos/gr/v20190514/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\",\n", 64 | " False,\n", 65 | " ),\n", 66 | " (\n", 67 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r9i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r9i1p1f1_gr_200001-201412.nc\",\n", 68 | " False,\n", 69 | " ),\n", 70 | " (\n", 71 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r7i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r7i1p1f1_gr_200001-201412.nc\",\n", 72 | " False,\n", 73 | " ),\n", 74 | " (\n", 75 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r8i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r8i1p1f1_gr_200001-201412.nc\",\n", 76 | " False,\n", 77 | " ),\n", 78 | "]\n", 79 | "for url, unarchive in urls:\n", 80 | " download_data(url, unarchive=unarchive)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "56d6f41f-7f98-4c2a-828c-4ae420cabe83", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "%load_ext watermark\n", 91 | "%watermark --time --python --updated --iversion" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3 (ipykernel)", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.9.6" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 5 116 | } 117 | -------------------------------------------------------------------------------- /notebooks/01-xarray-fundamentals.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Xarray Fundamentals" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives\n", 22 | "\n", 23 | "- Provide an overview of xarray\n", 24 | "- Describe the core xarray data structures, the `DataArray` and the `Dataset`, and the components that make them up\n", 25 | "- Load xarray dataset from a netCDF file \n", 26 | "- Load xarray dataset from a GRIB file\n", 27 | "- Load xarray dataset from a remote dataset from a THREDDS server\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prerequisites\n", 35 | "\n", 36 | "\n", 37 | "| Concepts | Importance | Notes |\n", 38 | "| --- | --- | --- |\n", 39 | "| Basic familiarity with NumPy | Necessary | |\n", 40 | "| Basic familiarity with Pandas | Helpful | |\n", 41 | "| [Understanding of NetCDF Data Model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html) | Helpful | Familiarity with metadata structure |\n", 42 | "\n", 43 | "\n", 44 | "- **Time 
to learn**: *15-20 minutes*\n", 45 | "\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "---" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Imports\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import xarray as xr # \"canonical\" namespace short-hand" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## What is Xarray?\n", 76 | "\n", 77 | "Xarray is a Python library for working with **labeled**, **multi dimensional** arrays. \n", 78 | "\n", 79 | "- Built on top of numpy and pandas \n", 80 | "- Brings the power of pandas to multidimensional arrays \n", 81 | "- Supports data of any dimensionality " 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Core Data Structures\n", 89 | "\n", 90 | "- Xarray has **two** main data structures:\n", 91 | " - `xarray.DataArray`: a fancy, labelled version of `numpy.ndarray` with associated coordinates. \n", 92 | " - `xarray.Dataset`: a collection of multiple `xarray.DataArray` that share the same coordinates and/or dimensions.\n", 93 | "\n", 94 | "---\n", 95 | "\n", 96 | "" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Dataset" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Xarray's interface is heavily inspired by the [netCDF data model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html). Xarray's Dataset is designed as an in-memory representation of a netCDF dataset. \n", 111 | "\n", 112 | "\n", 113 | "#### Loading data from a netCDF file\n", 114 | "\n", 115 | "First, let's open a local netCDF file using the `xarray.open_dataset()` function:" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%time\n", 125 | "ds = xr.open_dataset(\n", 126 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 127 | ")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "By default, `xarray.open_dataset()` function uses **lazy loading** i.e. it just loads in the coordinate and attribute metadata and **not** the data that correspond to data variables themselves. The data variables are loaded only on actual values access (e.g. when performing some calculation, slicing, ...) or with `.load()` method. \n", 135 | "\n", 136 | "Let's look at the HTML representation of the loaded dataset:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "ds" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "\n", 153 | "
\n", 154 | "

Text based representation

\n", 155 | " If you prefer a text based representation, you can set the display_style='text' by uncommenting the line below\n", 156 | "
\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# xr.set_options(display_style=\"text\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "To look at the corresponding netCDF representation, we can use the `.info()` method:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "ds.info()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Datasets have the following key properties:\n", 189 | "- `data_vars`: an dictionary of `DataArrays` corresponding to data variables \n", 190 | "- `dims`: a dictionary mapping from dimenion names to the fixed length of each dimension (e.g. `{'time': 1815, 'nv': 2, 'latitude': 180, 'longitude': 360}` )\n", 191 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n", 192 | "- `attrs`: a dictionary holding arbitrary metadata pertaining to the dataset" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# variables that are in our dataset\n", 202 | "ds.data_vars" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# dataset dimensions\n", 212 | "ds.dims" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# dataset coordinates\n", 222 | "ds.coords" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "# dataset global attributes\n", 232 | "ds.attrs" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### DataArray\n", 240 | "\n", 241 | "The DataArray is xarray's implementation of a labeled, multi-dimensional array. It has several key properties:\n", 242 | "\n", 243 | "- `data`: a Duck array (`numpy.ndarray` or [`dask.array`](https://docs.dask.org/en/latest/array.html) or [`sparse`](https://sparse.pydata.org/en/stable/) or [`cupy.array`](https://docs.cupy.dev/en/stable/index.html) holding the array's values). \n", 244 | "- `dims`: dimension names for each axis e.g. `(lat, lon, time)`\n", 245 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n", 246 | "- `attrs`: a dictionary that holds arbitrary attributes/metadata (such as units). \n", 247 | "- `name`: an arbitrary name of the array" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Extract the tas variable (dataarray)\n", 257 | "ds[\"tas\"]" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# ds[\"tas\"] is equivalent to ds.tas\n", 267 | "ds.tas" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "\n", 275 | "
\n", 276 | "

Warning: dot notation vs bracket notation

\n", 277 | "\n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | "\n", 285 | "
" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# The actual array data\n", 295 | "ds[\"tas\"].data" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# datarray coordinates\n", 305 | "ds[\"tas\"].coords" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# dataarray attributes\n", 315 | "ds[\"tas\"].attrs" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Dimensions vs Coordinates\n", 323 | "\n", 324 | "- A dimension is just a name of an axis, like \"longitude\" or \"time\"\n", 325 | "- Labeled coordinates are tick labels along an axis, e.g. \"2021-06-08\"\n", 326 | "\n", 327 | "\n", 328 | "#### `repr` & HTML representation of dimensions with or without coordinates \n", 329 | "\n", 330 | "| Dimension | HTML repr | Text based repr |\n", 331 | "| --- | --- | --- |\n", 332 | "| with coordinates | **bold** | `*` symbol in `.coords` |\n", 333 | "| without coordinates | normal | listed explicitly |\n", 334 | "\n" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "ds" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "with xr.set_options(display_style=\"text\"):\n", 353 | " print(ds)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "\n", 361 | "\n", 362 | "### Loading data in other file formats \n", 363 | "\n", 364 | "\n", 365 | "#### Loading data from a grib file \n", 366 | "\n", 367 | "To load a grib file in an xarray Dataset, we use the `xarray.open_dataset()` and we need to specify `engine=\"cfgrib\"`. This requires the presence of `cfgrib` package in our Python environment:" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "ds = xr.open_dataset(\"./data/era5-levels-members.grib\", engine=\"cfgrib\")\n", 377 | "ds" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "slideshow": { 384 | "slide_type": "subslide" 385 | } 386 | }, 387 | "source": [ 388 | "#### Loading data from a remote OPENDAP server \n", 389 | "\n", 390 | "\n", 391 | "If you happen to have access to netCDF datasets that are hosted remotely on a THREDDS server, you can point xarray to a url and it will load/stream the data over the network without needing to download it locally. 
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "url = \"http://crd-esgf-drc.ec.gc.ca/thredds/dodsC/esgD_dataroot/AR6/CMIP6/ScenarioMIP/CCCma/CanESM5/ssp126/r12i1p2f1/Amon/wap/gn/v20190429/wap_Amon_CanESM5_ssp126_r12i1p2f1_gn_201501-210012.nc\"" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "xr.open_dataset(url, engine=\"netcdf4\", chunks={})" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "---" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "%load_ext watermark\n", 426 | "%watermark --time --python --updated --iversion" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "## Summary \n", 434 | "\n", 435 | "\n", 436 | "- Xarray has two main data structures: DataArray and Dataset\n", 437 | "- DataArrays store the multi-dimensional arrays\n", 438 | "- Xarray is built on top of Numpy and Pandas and its architecture is heavily inspired by the netCDF data model" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "## Resources and References\n", 446 | "\n", 447 | "- [Xarray Documentation on Data Structures](http://xarray.pydata.org/en/latest/data-structures.html)\n", 448 | "- [Xarray Documentation on reading files and writing files](https://xarray.pydata.org/en/stable/io.html)\n", 449 | "- [cfgrib Documentation](https://github.com/ecmwf/cfgrib)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "
\n", 457 | "

Next: Indexing and selecting data

\n", 458 | "
" 459 | ] 460 | } 461 | ], 462 | "metadata": { 463 | "kernelspec": { 464 | "display_name": "Python 3 (ipykernel)", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.9.12" 479 | }, 480 | "toc-autonumbering": false 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 4 484 | } 485 | -------------------------------------------------------------------------------- /notebooks/02-indexing-and-selecting-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Indexing and Selecting data\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- Select data by position using `.isel()` with values or slices\n", 25 | "- Select data by coordinate label/value using `.sel()` with values or slices\n", 26 | "- Use nearest-neighbor lookups with `.sel()`\n", 27 | "- Use `interp()` to interpolate by coordinate labels" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prerequisites\n", 35 | "\n", 36 | "\n", 37 | "| Concepts | Importance | Notes |\n", 38 | "| --- | --- | --- |\n", 39 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 40 | "| [Basic familiarity with NumPy indexing](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | |\n", 41 | "| [Basic familiarity with Pandas indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html) | Helpful | |\n", 42 | "\n", 43 | "- **Time to learn**: *15-20 minutes*\n", 44 | "\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "---" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Imports\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import xarray as xr" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ds = xr.open_dataset(\n", 77 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 78 | ")\n", 79 | "ds" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## NumPy Positional Indexing\n", 87 | "\n", 88 | "When working with numpy, indexing is done by position (slices/ranges/scalars)." 
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "temp = ds[\"tas\"].data # retrieve numpy array\n", 98 | "temp" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "temp.shape, temp.ndim" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Let's extract a time series for a single spatial location \n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "temp[:, 20, 40]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "
\n", 131 | "

\n", 132 | " but wait, what labels go with 20 and 40? Was that lat/lon or lon/lat? Where are the timestamps that go along with this time-series?\n", 133 | "
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Different choices for indexing \n", 141 | "\n", 142 | "\n", 143 | "Xarray supports two kinds of indexing \n", 144 | "\n", 145 | "- Positional indexing via `.isel()`: provides primarily integer position based index (from `0` to `length-1` of the axis/dimension\n", 146 | "- Label indexing via `.sel()`: provides primarily label based index\n", 147 | "\n", 148 | "Xarray's indexing methods preserves the coordinate labels and associated metadata.\n", 149 | "\n", 150 | "\n", 151 | "\n", 152 | "### Selection by position\n", 153 | "\n", 154 | "The `.isel()` method is the primary access method for **purely integer based indexing**. The following are valid inputs:\n", 155 | "- An integer e.g. `lat=10`\n", 156 | "- A list or array of integers `lon=[10, 20, 39]`\n", 157 | "- A slice object with integers e.g. `time=slice(2, 20)`" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "ds.tas.isel() # the original object i.e. no selection" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "ds.tas.isel(lat=100)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "ds.tas.isel(lat=100, time=[-2, -1])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "ds.tas.isel(lon=100, time=slice(10, 20))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Selection by label \n", 201 | "\n", 202 | "\n", 203 | "The `.sel()` method is the primary access method for **purely coordinate label based indexing.**. The following are valid inputs:\n", 204 | "\n", 205 | "- A single coordinate label e.g. `time=\"2021-03-01\"`\n", 206 | "- A list or array of coordinate labels `lon=[=\"2021-01-01\", =\"2021-03-10\", =\"2021-03-12\"]`\n", 207 | "- A slice object with coordinate labels e.g. `time=slice(\"2021-01-01\", \"2021-03-01\")`. (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index!)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "ds.tas.sel(time=\"2013\")" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "It is also possible to use slice for the time dimension:" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "ds.tas.sel(time=slice(\"2013-01-01\", \"2014-12-31\"))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "\n", 240 | "
\n", 241 | "

Warning: Be careful when working with floating coordinate labels

\n", 242 | " \n", 243 | " When we have integer, string, datetime-like values for coordinate labels, \"sel()\" works flawlessly. When we try to work with floating coordinate labels, things get a little tricky:\n", 244 | " \n", 245 | "
\n", 246 | "\n", 247 | "\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "ds.tas.sel(lat=39.5, lon=105.7)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Nearest-neighbor lookups\n", 264 | "\n", 265 | "As shown above, when our coordinate labels are not integers or strings or datetime-like but floating point numbers, `.sel()` may throw a `KeyError`:" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "`ds.tas.sel(lat=39.5, lon=105.7)` fails because we are trying to use a conditional for an approximate value i.e floating numbers are represented approximately inside the computer, and xarray is unable to locate this exact value. To address this issue, xarray supports `method` and `tolerance` keyword argument. The `method` parameter allows for enabling nearest neighbor (inexact) lookups by use of the methods `'pad', 'backfill' or 'nearest'`: " 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "ds.tas.sel(lat=39.5, lon=105.7, method='nearest')" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "So the closest location in the data was at `lat=39.11`, `lon=106.2`." 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/generated/xarray.DataArray.sel.html) for more on usage of `method` and `tolerance` parameters in `.sel()`. " 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "
\n", 303 | "

Tip

\n", 304 | "Another way to use the nearest neighbor lookup is via slice objects. For e.g.:\n", 305 | "
" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "ds.tas.sel(lat=slice(39, 39.5), lon=slice(106.1, 106.3))" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Operators can be chained, so multiple operations can be peformed sequentially. For example, to select an area of interest and the first time index" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "ds.tas.isel(time=0).sel(lon=slice(20, 160), lat=slice(-80, 25))" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "### Interpolation\n", 338 | "\n", 339 | "If we want to interpolate along coordinates rather than looking up the nearest neighbos, we can use the `.interp()` method. To use `interp()` requires the presence of `scipy` library. \n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "ds.tas.interp(lat=[10, 10.1, 10.2], method='nearest')" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "---" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "%load_ext watermark\n", 365 | "%watermark --time --python --updated --iversion" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Summary \n", 373 | "\n", 374 | "- Xarray’s named dimensions and labeled coordinates free the user from having to track positional ordering of dimensions when accessing data\n", 375 | "- Xarray provides a variety of methods for subsetting data via `.sel()`, `.isel()`, `.interp()` methods\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "## Resources and References\n", 383 | "\n", 384 | "- [Xarray Documentation - Indexing and Selecting Data](https://xarray.pydata.org/en/stable/indexing.html)\n", 385 | "- [Xarray Documentation - Interpolation](https://xarray.pydata.org/en/stable/user-guide/interpolation.html)\n" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "
\n", 393 | "

Previous: Xarray Fundamentals

\n", 394 | "

Next: Data Visualization

\n", 395 | "
" 396 | ] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "Python 3", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.9.4" 416 | }, 417 | "toc-autonumbering": false, 418 | "toc-showcode": false, 419 | "toc-showmarkdowntxt": false, 420 | "toc-showtags": false 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 4 424 | } 425 | -------------------------------------------------------------------------------- /notebooks/03-data-visualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- How to use xarray's builtin, matplotlib-backed plotting interface to visualize datasets.\n", 25 | "- How to use `hvplot` to produce interactive plots " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Prerequisites\n", 33 | "\n", 34 | "\n", 35 | "| Concepts | Importance | Notes |\n", 36 | "| --- | --- | --- |\n", 37 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 38 | "| [Familiarity with xarray indexing and subsetting](./02-indexing-and-subsetting.ipynb) | Necessary | |\n", 39 | "| [Basic familiarity with Matplotlib](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | |\n", 40 | "\n", 41 | "\n", 42 | "- **Time to learn**: *15-20 minutes*\n", 43 | "\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "---" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Imports\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import xarray as xr" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Let's open the same dataset as before" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "ds = xr.open_dataset(\n", 83 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 84 | ")\n", 85 | "ds" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Basic plotting with via `.plot()`\n", 93 | "\n", 94 | "Xarray provides a `.plot()` method on `DataArray` and `Dataset`. This method is a wrapper around Matplotlib's `matplotlib.pyplot.plot()`. xaarray will automatically guess the type of plot based on the dimensionality of the data. 
By default `.plot()` creates:\n", 95 | "\n", 96 | "- a **line** plot for `1-D arrays` using `matplotlib.pyplot.plot()`\n", 97 | "- a **pcolormesh** plot for 2-D arrays using `matplotlib.pyplot.pcolormesh()`\n", 98 | "- a **histogram** for everything else (more than 2 dimensions) using `matplotlib.pyplot.hist()`\n", 99 | " " 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### 1D line plots\n", 107 | "\n", 108 | "Let's select one spatial location and plot a time series of the near-surface temperature" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "ds.tas.sel(lon=100, lat=10, method='nearest').plot(marker=\"o\", size=6);" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "
\n", 125 | " We are selecting a single point, so `.sel()` requires either an exact location that exists in the data, or to specify method argument to tell it how to choose a location from the data. \n", 126 | "
" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "\n", 134 | "Lets say we want to compare plots of temperature at three different latitudes. We can use the `hue` keyword argument to do this." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "ds.tas.sel(lat=[-40, 0, 40], time=\"2013-03\", method=\"nearest\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "ds.tas.sel(lat=[-40, 0, 40], time=\"2013-03\", method=\"nearest\").plot(\n", 153 | " x=\"lon\", hue=\"lat\", figsize=(8, 6)\n", 154 | ");" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### 2D plots" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Operator chaining means it is possible to have multiple selection operators and add `.plot()` to the end to visualise the result" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "ds.tas.isel(time=-10).sel(lon=slice(20, 160), lat=slice(-80, 25)).plot(robust=True, figsize=(8, 6));" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "The x- and y-axes are labeled with full names — \"Latitude\", \"Longitude\" — along with units. The colorbar has a nice label, again with units. And the title tells us the timestamp of the data presented." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# define keyword arguments that are passed to matptolib.pyplot.colorbar\n", 194 | "colorbar_kwargs = {\n", 195 | " \"orientation\": \"horizontal\",\n", 196 | " \"label\": \"my clustom label\",\n", 197 | " \"pad\": 0.2,\n", 198 | "}\n", 199 | "\n", 200 | "ds.tas.isel(lon=1).plot(\n", 201 | " x=\"time\", # coordinate to plot on the x-axis of the plot\n", 202 | " robust=True, # set colorbar limits to 2nd and 98th percentile of data\n", 203 | " cbar_kwargs=colorbar_kwargs,\n", 204 | ");" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "\n", 212 | "### Faceting\n", 213 | "\n", 214 | "Faceting is an effective way of visualizing variations of 3D data where 2D slices are visualized in a panel (subplot) and the third dimensions is varied between panels (subplots)." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "ds.tas.sel(time=slice(\"2010\", \"2011\")).plot(col=\"time\", col_wrap=6, robust=True);" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/user-guide/plotting.html) for more on \"faceted\" plots or subplots." 
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "### Histograms"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "ds.tas.plot();"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "### Bonus Plot \n",
254 | "\n",
255 | "Let's look at the air temperature data, but for **all pressure levels**. We are going to select the last time index and the longitude corresponding to the Himalayas, and plot a vertical profile of the atmosphere from pole to pole:"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "ds_air_all_pressure_levels = xr.open_dataset(\n",
265 | "    \"data/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n",
266 | ")\n",
267 | "ds_air_all_pressure_levels"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "data = ds_air_all_pressure_levels.ta.isel(time=-1).sel(lon=86.93, method='nearest')\n",
277 | "fig = data.plot(size=6, yincrease=False)\n",
278 | "fig.axes.set_title(\n",
279 | "    f'Vertical profile of Temperature from pole to pole \\nat longitude = {data.lon.data} and time = {data.time.data}',\n",
280 | "    size=15,\n",
281 | ");"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "
\n", 289 | " \n", 293 | "
" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "---" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## Interactive visualization using `hvplot`\n", 308 | "\n", 309 | "Let's switch gears and look at how we can produce interactive plots via [holoviews](https://holoviews.org/). The holoviews plotting ecosystem provides the [hvplot](https://hvplot.holoviz.org/) package to allow easy visualization of xarray (and other) objects. These plots build on [Bokeh](https://docs.bokeh.org/en/latest/index.html) -- a Python library for creating interactive visualziatons for web browsers.\n", 310 | "\n", 311 | "\n", 312 | "To enable the `.hvplot` interface on xarray object, let's import the `hvplot.xarray` module:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "import hvplot.xarray" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "To use `hvplot` instead of `matplotlib`, we use the `.hvplot()` method:" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "ds.tas.hvplot()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "As you can see, calling `.hvplot()` behaves the same as `.plot()` i.e. it uses the same heuristics as `.plot()`. In this case, it produces a histogram for data with more than 3 dimensions. To plot a `pcolormesh`, let's reduce the dimensionality of our data to 2D and call `.hvplot()` again:" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "ds.tas.isel(time=1).hvplot(cmap=\"fire\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "ds.tas.isel(time=-1, lon=100).hvplot()" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "ds.tas.sel(lat=28.5, lon=83.9, method='nearest').hvplot()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "So far we have had to subset our data in order to produce plots. `hvplot` provides convenient functionality for producing plots on-demand via interactive widgets. 
Let's create a series of 2D plots, one for each time slice. We will use the `groupby` parameter to let hvplot know that we want to create a widget (a slider) for the time dimension:"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "ds.tas.hvplot(groupby=\"time\", clim=(ds.tas.min(), ds.tas.max()), cmap='turbo')"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "Let's add more customizations to our time widget:"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "ds.tas.hvplot(\n",
404 | "    groupby=\"time\",\n",
405 | "    clim=(ds.tas.min(), ds.tas.max()),\n",
406 | "    cmap=\"turbo\",\n",
407 | "    widget_type=\"scrubber\",\n",
408 | "    widget_location=\"bottom\",\n",
409 | ")"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "---"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "%load_ext watermark\n",
426 | "%watermark --time --python --updated --iversion"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "## Summary \n",
434 | "\n",
435 | "- Xarray has plotting functionality that is a thin wrapper around the Matplotlib library\n",
436 | "- Xarray uses syntax and function names from Matplotlib whenever possible\n",
437 | "- Hvplot provides a neat interface to xarray for creating interactive plots"
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {},
443 | "source": [
444 | "## Resources and References\n",
445 | "\n",
446 | "- [Hvplot Documentation](https://hvplot.holoviz.org/index.html)\n",
447 | "- [Xarray Documentation - Plotting](https://xarray.pydata.org/en/stable/user-guide/plotting.html)\n",
448 | "- [Matplotlib Documentation](https://matplotlib.org/stable/contents.html)\n",
449 | "\n",
450 | "
\n", 451 | "

Geocat-examples Gallery

\n", 452 | " For geo-science specific visualization examples, please see the geocat-examples gallery which resides here.\n", 453 | "
\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "
\n", 461 | "

Previous: Xarray Fundamentals

\n", 462 | "

Next: Computation

\n", 463 | "
" 464 | ] 465 | } 466 | ], 467 | "metadata": { 468 | "interpreter": { 469 | "hash": "affdb75a3ef70a25a87eb00ec52822d75cad558e88f93d5bb3da0d72a04ea7e1" 470 | }, 471 | "kernelspec": { 472 | "display_name": "Python 3", 473 | "language": "python", 474 | "name": "python3" 475 | }, 476 | "language_info": { 477 | "codemirror_mode": { 478 | "name": "ipython", 479 | "version": 3 480 | }, 481 | "file_extension": ".py", 482 | "mimetype": "text/x-python", 483 | "name": "python", 484 | "nbconvert_exporter": "python", 485 | "pygments_lexer": "ipython3", 486 | "version": "3.9.4" 487 | }, 488 | "toc-autonumbering": false 489 | }, 490 | "nbformat": 4, 491 | "nbformat_minor": 4 492 | } 493 | -------------------------------------------------------------------------------- /notebooks/05-masking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Masking Data\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- Provide an overview of masking data in xarray\n", 25 | "- Masking data using `.where()` method" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Prerequisites\n", 33 | "\n", 34 | "\n", 35 | "| Concepts | Importance | Notes |\n", 36 | "| --- | --- | --- |\n", 37 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 38 | "| [Familiarity with NumPy ](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | |\n", 39 | "\n", 40 | "- **Time to learn**: *10 minutes*\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "---" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Overview\n", 56 | "\n", 57 | "Using `xr.where()` or `.where()` method, elements of an Xarray Dataset or xarray DataArray that satisfy a given condition or multiple conditions can be replaced/masked.To demonstrate this, we are going to use the `.where()` method on the `tos` dataarray. 
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Imports\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import matplotlib.pyplot as plt\n", 74 | "import xarray as xr" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Open the sea surface temperature dataset:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "ds = xr.open_dataset(\n", 91 | " \"./data/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\", engine=\"netcdf4\"\n", 92 | ")\n", 93 | "ds" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Using `where` with one condition" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "sample = ds.tos.isel(time=-1)\n", 110 | "sample" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Before applying `.where()`, let's look at the documentation" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "sample.where?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "- As the documention points out, the conditional expression in `.where` can be: \n", 134 | "\n", 135 | " - a DataArray\n", 136 | " - a Dataset\n", 137 | " - a function\n", 138 | "\n", 139 | "- Unlike `.isel()` and `sel()` that change the shape of the returned results, `.where()` preserves the shape of the original data. It does accomplishes this by returning values from the original DataArray or Dataset if the `condition` is `True`, and fills in missing values wherever the `condition` is `False`. \n", 140 | "\n", 141 | "\n", 142 | "For demonstration purposes, let's use where to mask locations with temperature values greater than `0`:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "masked_sample = sample.where(sample < 0.0)\n", 152 | "masked_sample" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "Let's plot both our original sample, and the masked sample:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "fig, axes = plt.subplots(ncols=2, figsize=(19, 6))\n", 169 | "sample.plot(ax=axes[0], robust=True)\n", 170 | "masked_sample.plot(ax=axes[1], robust=True);" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## Using `where` with multiple conditions" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "`.where()` allows providing multiple conditions. To do this, we need to make sure each conditional expression is enclosed in `()`. To combine conditions, we use the `bit-wise and` (`&`) operator and/or the `bit-wise or` (`|`). 
Let's use `where` to mask locations with temperature values less than 25 or greater than 30:"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "sample.where((sample > 25) & (sample < 30)).plot(size=6, robust=True);"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "We can use coordinates to apply a mask as well. Below, we use the `latitude` and `longitude` coordinates to mask the [Niño 3.4 region](https://www.ncdc.noaa.gov/teleconnections/enso/indicators/sst/):\n",
201 | "\n",
202 | "![](https://www.ncdc.noaa.gov/monitoring-content/teleconnections/nino-regions.gif)\n",
203 | "\n"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "sample.where((sample.lat < 5) & (sample.lat > -5) & (sample.lon > 190) & (sample.lon < 240)).plot(\n",
213 | "    size=6, robust=True\n",
214 | ");"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "## Using `where` with a custom fill value"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "`.where()` can take a second argument, which, if supplied, is used as the fill value for the masked regions. Below, we fill masked regions with a constant `0`:"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "sample.where((sample > 25) & (sample < 30), 0).plot(size=6, robust=True);"
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "metadata": {},
243 | "source": [
244 | "---"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "%load_ext watermark\n",
254 | "%watermark --time --python --updated --iversion"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "## Resources and References\n",
262 | "\n",
263 | "- [Xarray Documentation - Masking with `where()`](https://xarray.pydata.org/en/stable/user-guide/indexing.html#masking-with-where)"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "
\n", 271 | "

Previous: Computation

\n", 272 | "

Next: End-to-End example: Computing Niño 3.4 Index

\n", 273 | "
" 274 | ] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Python 3", 280 | "language": "python", 281 | "name": "python3" 282 | }, 283 | "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.9.4" 294 | }, 295 | "toc-autonumbering": false, 296 | "toc-showcode": false, 297 | "toc-showmarkdowntxt": false, 298 | "toc-showtags": false 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 4 302 | } 303 | -------------------------------------------------------------------------------- /notebooks/06-end-to-end-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# End-to-End example: Computing Niño 3.4 Index\n", 8 | "\n", 9 | "In this notebook, we are going to combine all concepts/topics we've covered so far to compute [Niño 3.4 Index](https://climatedataguide.ucar.edu/climate-data/nino-sst-indices-nino-12-3-34-4-oni-and-tni) for the CESM2 submission for [CMIP6 project](https://esgf-node.llnl.gov/projects/cmip6/). \n", 10 | "\n", 11 | "\n", 12 | "> Niño 3.4 (5N-5S, 170W-120W): The Niño 3.4 anomalies may be thought of as representing the average equatorial SSTs across the Pacific from about the dateline to the South American coast. The Niño 3.4 index typically uses a 5-month running mean, and El Niño or La Niña events are defined when the Niño 3.4 SSTs exceed +/- 0.4C for a period of six months or more.\n", 13 | "\n", 14 | "> Nino X Index computation: (a) Compute area averaged total SST from Niño X region; (b) Compute monthly climatology (e.g., 1950-1979) for area averaged total SST from Niño X region, and subtract climatology from area averaged total SST time series to obtain anomalies; (c) Smooth the anomalies with a 5-month running mean; (d) Normalize the smoothed values by its standard deviation over the climatological period.\n", 15 | "\n", 16 | "\n", 17 | "![](https://www.ncdc.noaa.gov/monitoring-content/teleconnections/nino-regions.gif)\n", 18 | "\n", 19 | "\n", 20 | "At the end of this notebook, you should be able to produce a plot that looks similar [to this one](https://climatedataguide.ucar.edu/sites/default/files/styles/node_lightbox_display/public/key_figures/climate_data_set/indices_oni_2_2_lg.png?itok=61jS7Jz7)\n", 21 | "\n", 22 | "![](https://climatedataguide.ucar.edu/sites/default/files/styles/node_lightbox_display/public/key_figures/climate_data_set/indices_oni_2_2_lg.png?itok=61jS7Jz7)\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "---" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Learning Objectives \n", 37 | "\n", 38 | "\n", 39 | "- Load data\n", 40 | "- Masking data using `.where()` method\n", 41 | "- Compute climatologies and anomalies using `.groupby()`\n", 42 | "- Use `.rolling()` to compute moving average\n", 43 | "- Normalize computed Niño 3.4 Index \n", 44 | "- Visualize the computed Niño 3.4 Index " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Prerequisites\n", 52 | "\n", 53 | "\n", 54 | "| Concepts | Importance | Notes |\n", 55 | "| --- | --- | --- |\n", 56 | "| [Understanding of xarray core data 
structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n",
57 | "| [Familiarity with xarray indexing and subsetting](./02-indexing-and-selecting-data.ipynb) | Necessary | |\n",
58 | "| [Familiarity with xarray's plotting functionality](./03-data-visualization.ipynb) | Necessary | |\n",
59 | "| [Familiarity with xarray's computation routines](./04-computation.ipynb) | Necessary | |\n",
60 | "\n",
61 | "\n",
62 | "\n",
63 | "- **Time to learn**: *20 minutes*\n",
64 | "\n"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "---"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## Task 1: Import packages \n"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "import cartopy.crs as ccrs\n",
88 | "import matplotlib.pyplot as plt\n",
89 | "import xarray as xr"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## Task 2: Open the sea surface temperature dataset and the areacello"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "data = xr.open_dataset(\n",
106 | "    \"./data/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\", engine=\"netcdf4\"\n",
107 | ")\n",
108 | "areacello = xr.open_dataset(\n",
109 | "    \"./data/areacello_Ofx_CESM2_historical_r11i1p1f1_gr.nc\", engine=\"netcdf4\"\n",
110 | ")\n",
111 | "\n",
112 | "# Merge the two datasets into a single dataset\n",
113 | "ds = xr.merge([data, areacello])\n",
114 | "ds"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "## Task 3: Visualize the first time slice to make sure the data looks okay"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "fig = plt.figure(figsize=(12, 6))\n",
131 | "ax = plt.axes(projection=ccrs.Robinson(central_longitude=180))\n",
132 | "ax.coastlines()\n",
133 | "ax.gridlines()\n",
134 | "ds.tos.isel(time=0).plot(\n",
135 | "    robust=True, ax=ax, transform=ccrs.PlateCarree(), cbar_kwargs={'shrink': 0.5}\n",
136 | ")\n",
137 | "ax.set_global()"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "## Task 4: Select the Niño 3.4 region \n",
145 | "\n",
146 | "There are a few ways to select the Niño 3.4 region:\n",
147 | "\n",
148 | "1. Use `sel()` or `isel()`\n",
149 | "2. 
Use `where()` and select all values within the bounds of interest"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "tos_nino34 = ds.sel(lat=slice(-5, 5), lon=slice(190, 240))\n",
159 | "tos_nino34"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "Another option for selecting our region of interest is to use:"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# tos_nino34 = ds.where((ds.lat<5) & (ds.lat>-5) & (ds.lon>190) & (ds.lon<240), drop=True)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "Let's plot the selected region to make sure we are doing the right thing:"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "fig = plt.figure(figsize=(12, 6))\n",
192 | "ax = plt.axes(projection=ccrs.Robinson(central_longitude=180))\n",
193 | "ax.coastlines()\n",
194 | "ax.gridlines()\n",
195 | "tos_nino34.tos.isel(time=0).plot(ax=ax, transform=ccrs.PlateCarree(), cbar_kwargs={'shrink': 0.5})\n",
196 | "ax.set_extent((120, 300, 10, -10))"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "## Task 5: Compute the monthly climatology for area-averaged total SST from the Niño 3.4 region, and subtract the climatology from the area-averaged total SST time series to obtain anomalies\n",
204 | "\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "gb = tos_nino34.tos.groupby('time.month')\n",
214 | "tos_nino34_anom = gb - gb.mean(dim='time')\n",
215 | "index_nino34 = tos_nino34_anom.weighted(tos_nino34.areacello).mean(dim=['lat', 'lon'])"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "## Task 6: Smooth the anomalies with a 5-month running mean"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "index_nino34_rolling_mean = index_nino34.rolling(time=5).mean()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "index_nino34.plot(size=8)\n",
241 | "index_nino34_rolling_mean.plot()\n",
242 | "plt.legend(['anomaly', '5-month running mean anomaly']);"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "## Task 7: Normalize the smoothed values by the standard deviation over the climatological period"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "std_dev = tos_nino34.tos.std()\n",
259 | "std_dev"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "normalized_index_nino34_rolling_mean = index_nino34_rolling_mean / std_dev"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "## Task 8: Visualize the computed Niño 3.4 Index\n"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {},
282 | 
"outputs": [], 283 | "source": [ 284 | "fig = plt.figure(figsize=(12, 6))\n", 285 | "\n", 286 | "# Add -0.4/+0.4 lines which are the El Niño 3.4 thresholds\n", 287 | "plt.fill_between(\n", 288 | " normalized_index_nino34_rolling_mean.time.data,\n", 289 | " normalized_index_nino34_rolling_mean.where(normalized_index_nino34_rolling_mean >= 0.4).data,\n", 290 | " 0.4,\n", 291 | " color='red',\n", 292 | " alpha=0.9,\n", 293 | ")\n", 294 | "plt.fill_between(\n", 295 | " normalized_index_nino34_rolling_mean.time.data,\n", 296 | " normalized_index_nino34_rolling_mean.where(normalized_index_nino34_rolling_mean <= -0.4).data,\n", 297 | " -0.4,\n", 298 | " color='blue',\n", 299 | " alpha=0.9,\n", 300 | ")\n", 301 | "\n", 302 | "normalized_index_nino34_rolling_mean.plot(color='black')\n", 303 | "plt.axhline(0, color='black', lw=0.5)\n", 304 | "plt.axhline(0.4, color='black', linewidth=0.5, linestyle='dotted')\n", 305 | "plt.axhline(-0.4, color='black', linewidth=0.5, linestyle='dotted');" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "---" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "%load_ext watermark\n", 322 | "%watermark --time --python --updated --iversion" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Resources and References\n", 330 | "\n", 331 | "- [Project Pythia Resource Gallery](https://projectpythia.org/gallery.html)\n", 332 | "- [Pangeo Gallery](http://gallery.pangeo.io/)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "
\n", 340 | "

Previous: Masking Data

\n", 341 | " \n", 342 | "
" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.9.4" 363 | }, 364 | "toc-autonumbering": false, 365 | "toc-showcode": false, 366 | "toc-showmarkdowntxt": false, 367 | "toc-showtags": false 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 4 371 | } 372 | -------------------------------------------------------------------------------- /notebooks/07-dask-intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7df3a8f0-0ef1-4e50-9d41-afb98c875426", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "0d4b7e4f-b66d-49c5-8ddd-7c460742878c", 14 | "metadata": {}, 15 | "source": [ 16 | "# Introducing Dask\n", 17 | "\n", 18 | "**First,...**\n", 19 | "\n", 20 | "\n", 21 | "\n", 22 | "Dask is a parallel computing library that scales the existing Python libraries. This tutorial will introduce Dask and parallel data analysis more generally.\n", 23 | "\n", 24 | "\n", 25 | "## Learning Objectives \n", 26 | "\n", 27 | "- Describe components that make up Dask\n", 28 | "\n", 29 | "\n", 30 | "## Prerequisites\n", 31 | "\n", 32 | "\n", 33 | "| Concepts | Importance | Notes |\n", 34 | "| --- | --- | --- |\n", 35 | "| Familiarity with Python | Helpful | |\n", 36 | "\n", 37 | "\n", 38 | "- **Time to learn**: *5 minutes*\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "87cd1263-b2fe-47b5-9ed1-12c4b3de2240", 44 | "metadata": {}, 45 | "source": [ 46 | "\n", 47 | "## Dask Components \n", 48 | "\n", 49 | "Dask is composed of two main parts:\n", 50 | "\n", 51 | "- **Dask Collections**\n", 52 | "- **Dynamic Task Scheduling**\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "1. High-level collection APIs:\n", 57 | " - **Dask Array**: Parallel NumPy Arrays\n", 58 | " - **Dask DataFrame**: Parallel Pandas DataFrames\n", 59 | " - **Dask Bag**: Parallel lists\n", 60 | " - **Dask ML**: Parallel Scikit-learn\n", 61 | "\n", 62 | "\n", 63 | "2. Low-level collection APIs:\n", 64 | " - **Dask Delayed**: Lazy parallel objects\n", 65 | " - **Dask Futures**: Eager parallel objects\n", 66 | "\n", 67 | "\n", 68 | "3. Task Scheduling\n", 69 | " - **Scheduler**: \n", 70 | " - creates and manages directed acyclic graphs (DAG)s\n", 71 | " - distributes tasks to workers\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | "
\n", 76 | "

Lazy evaluation vs eager evaluation

\n", 77 | " \n", 83 | "
\n", 84 | " \n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "9587dd62-6873-4056-b606-5806f64e11e7", 90 | "metadata": {}, 91 | "source": [ 92 | "## Advantages of using Dask\n", 93 | "\n", 94 | "- **Familiarity**: Dask collections such as Dask Array, Dask DataFrames provide decent NumPy and Pandas compatible APIs.\n", 95 | "- **Responsive**: Dask is designed with interactive computing in mind. \n", 96 | " - It provides rapid feedback and diagnostics to aid humans\n", 97 | "- **Scale up and scale down**: It scales well from single machine (laptop) to clusters (100s of machines)\n", 98 | " - This ease of transition between single machine to moderate clusters makes it easy for users to prototype their workflows on their local machines and seamlessy transition to a cluster when needed. \n", 99 | " - This also gives users a lot of flexibility when choosing the best to deploy and run their workflows. \n", 100 | "- **Flexibility**: Dask supports interfacing with popular cluster resource managers such as PBS/SLURM/Kubernetes, etc.. with a minimal amount of effort\n", 101 | "\n", 102 | "" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "ea2c600b-8fb5-4dfb-aede-d85870fbb9bf", 108 | "metadata": {}, 109 | "source": [ 110 | "## Task Graphs\n", 111 | "\n", 112 | "Dask represents distributed/parallel computations with task graphs, more specifically [directed acyclic graphs](https://en.wikipedia.org/wiki/Directed_acyclic_graph).\n", 113 | "\n", 114 | "- A task is a function that you want to call and its corresponding inputs\n", 115 | "- A task graph is a collection of (1) the functions we want to call + their inputs (2) their dependencies. \n", 116 | "\n", 117 | "\n", 118 | "Directed acyclic graphs are made up of nodes and have a clearly defined start and end, a single traversal path, and no looping \n", 119 | "\n", 120 | "" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "45775347-0f7e-4882-a5a2-4bc0ce60791a", 126 | "metadata": {}, 127 | "source": [ 128 | "---" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "238a4c70-964a-4667-b399-26e308ea4841", 134 | "metadata": {}, 135 | "source": [ 136 | "## Resources and references\n", 137 | "\n", 138 | "* Reference\n", 139 | " * [Docs](https://dask.org/)\n", 140 | " * [Examples](https://examples.dask.org/)\n", 141 | " * [Code](https://github.com/dask/dask/)\n", 142 | " * [Blog](https://blog.dask.org/)\n", 143 | "* Ask for help\n", 144 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n", 145 | " * [github discussions](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n", 146 | " * [github issues](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n", 147 | " \n", 148 | " \n", 149 | "
\n", 150 | "

Next: Parallelizing code with dask.delayed

\n", 151 | " \n", 152 | "
" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.9.6" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 5 177 | } 178 | -------------------------------------------------------------------------------- /notebooks/09-dask-array.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d59e6a58-b50e-4015-bbd8-b48608d44b26", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "013dde55-1cea-4fd8-b980-0fa06bdd5568", 14 | "metadata": {}, 15 | "source": [ 16 | "# Dask Array\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "Dask array provides a parallel, larger-than-memory, n-dimensional array using blocked algorithms. Simply put: distributed Numpy.\n", 21 | "\n", 22 | "* **Parallel**: Uses all of the cores on your computer\n", 23 | "* **Larger-than-memory**: Lets you work on datasets that are larger than your available memory by breaking up your array into many small pieces, operating on those pieces in an order that minimizes the memory footprint of your computation, and effectively streaming data from disk.\n", 24 | "* **Blocked Algorithms**: Perform large computations by performing many smaller computations\n", 25 | "\n", 26 | "\n", 27 | "## Learning Objectives\n", 28 | "\n", 29 | "- Understand key features of dask arrays\n", 30 | "- Work with Dask Array's in much the same way you would work with a NumPy array\n", 31 | "\n", 32 | "## Prerequisites\n", 33 | "\n", 34 | "\n", 35 | "| Concepts | Importance | Notes |\n", 36 | "| --- | --- | --- |\n", 37 | "| Familiarity with NumPy | Necessary | |\n", 38 | "\n", 39 | "\n", 40 | "- **Time to learn**: *20-25 minutes*\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "44f46770-7c59-4b1a-9608-9af25f286501", 46 | "metadata": {}, 47 | "source": [ 48 | "## Setup\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "0c92dbf2-16bb-4ec9-9dbc-35929e292617", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from dask.distributed import Client, LocalCluster\n", 59 | "\n", 60 | "cluster = LocalCluster()\n", 61 | "client = Client(cluster)\n", 62 | "client" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "259bdc60-6e96-4258-8d71-e733ce2d9aca", 68 | "metadata": {}, 69 | "source": [ 70 | "## Blocked Algorithms\n", 71 | "\n", 72 | "A *blocked algorithm* executes on a large dataset by breaking it up into many small blocks.\n", 73 | "\n", 74 | "For example, consider taking the sum of a billion numbers. 
We might instead break up the array into 1,000 chunks, each of size 1,000,000, take the sum of each chunk, and then take the sum of the intermediate sums.\n",
75 | "\n",
76 | "We achieve the intended result (one sum on one billion numbers) by computing many smaller results (one thousand sums on one million numbers each, followed by another sum of a thousand numbers.)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "53b69958-355f-4121-a644-227adc1b14ef",
82 | "metadata": {},
83 | "source": [
84 | "## `dask.array` contains these algorithms\n",
85 | "\n",
86 | "`dask.array` implements a subset of the NumPy ndarray interface using blocked algorithms, cutting up the large array into many small arrays. This lets us compute on arrays larger than memory using multiple cores. We coordinate these blocked algorithms using Dask graphs. Dask arrays are also lazy, meaning that they do not evaluate until you explicitly ask for a result using the compute method."
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "1aef9368-240b-423b-a3c7-0ab7baa8fa13",
92 | "metadata": {},
93 | "source": [
94 | "### Create a `dask.array` object\n",
95 | "\n",
96 | "If we want to create a 3D NumPy array of random values, we do it like this:"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "id": "da589b6b-7268-4a02-93c7-efc2741fee96",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "import dask\n",
107 | "import dask.array as da\n",
108 | "import numpy as np\n",
109 | "from distributed.utils import format_bytes"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "id": "690bc749-976e-4b78-a801-8d01ee363ad7",
116 | "metadata": {
117 | "tags": []
118 | },
119 | "outputs": [],
120 | "source": [
121 | "shape = (600, 200, 200)\n",
122 | "arr = np.random.random(shape)\n",
123 | "arr"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "c36032d1-0fb6-43d2-a188-87e8e00bd5a4",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "format_bytes(arr.nbytes)"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "8e7e54dc-342c-4d31-902a-ece54a813e7e",
139 | "metadata": {},
140 | "source": [
141 | "This array contains `~183 MB` of data"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "id": "905dffb1-0879-4842-8b76-2538592f6156",
147 | "metadata": {},
148 | "source": [
149 | "Now let's create the same array using Dask's array interface."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "be84728a-7953-4561-aa7d-326f4a45e3aa",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "darr = da.random.random(shape, chunks=(300, 100, 200))"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "id": "632e9f08-3142-46d7-b457-e9bde0d1dce9",
165 | "metadata": {},
166 | "source": [
167 | "We passed a chunk size of `(300, 100, 200)` to tell Dask how to block up our array. "
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "ebbafe88-bb79-436c-aa3b-a9c5f31ff1ec",
173 | "metadata": {},
174 | "source": [
175 | "
\n", 176 | "

Specifying Chunks

\n", 177 | " There are several ways to specify chunks. In this tutorial, we will use a block shape.\n", 178 | "\n", 179 | "\n", 180 | "
\n", 181 | "\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "42c13417-a5a2-4fd2-8610-a3cd70f4b93a", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "darr" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "75988622-8ec1-45b0-a069-29ca74f53836", 197 | "metadata": {}, 198 | "source": [ 199 | "Notice that we just see a symbolic representation of the array, including its `shape`, `dtype`, and `chunksize`. No data has been generated yet. Let's visualize the constructed task graph. " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "44a7a6e4-bcfc-40c1-a095-8fa315bfef4e", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "darr.visualize()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "37f4038c-f2b2-40c3-91cd-af2713dd23df", 215 | "metadata": {}, 216 | "source": [ 217 | "Our array has four chunks. To generate it, Dask calls `np.random.random` four times and then concatenates this together into one array." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "c5b29134-62af-4a0b-8e44-9f99b5072b45", 223 | "metadata": {}, 224 | "source": [ 225 | "### Manipulate `dask.array` object as you would a numpy array\n", 226 | "\n", 227 | "\n", 228 | "Now that we have an `Array` we perform standard numpy-style computations like arithmetic, mathematics, slicing, reductions, etc..\n", 229 | "\n", 230 | "The interface is familiar, but the actual work is different. `dask_array.sum()` does not do the same thing as `numpy_array.sum()`.\n", 231 | "\n", 232 | "#### What's the difference?\n", 233 | "\n", 234 | "`dask_array.sum()` builds an expression of the computation. It does not do the computation yet. `numpy_array.sum()` computes the sum immediately.\n", 235 | "\n", 236 | "#### Why the difference?\n", 237 | "\n", 238 | "Dask arrays are split into chunks. Each chunk must have computations run on that chunk explicitly. If the desired answer comes from a small slice of the entire dataset, running the computation over all data would be wasteful of CPU and memory." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "cf14f2f0-a66e-4578-8a8d-f0c7701beb9c", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "total = darr.sum()\n", 249 | "total" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "id": "36724be4-63ec-4b1c-9a9e-1a605a12a5a5", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "total.visualize()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "09186a60-51fc-49c1-91ac-3614af0202cc", 265 | "metadata": {}, 266 | "source": [ 267 | "#### Compute result\n", 268 | "\n", 269 | "Dask.array objects are lazily evaluated. Operations like `.sum` build up a graph of blocked tasks to execute. \n", 270 | "\n", 271 | "We ask for the final result with a call to `.compute()`. This triggers the actual computation." 
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "id": "36aff31f-2f06-4703-a2a6-cf692ab5eed3",
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "%%time\n",
282 | "total.compute()"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "id": "af09c610-0e66-4ce8-a53a-fdd50471271d",
288 | "metadata": {},
289 | "source": [
290 | "### Exercise: Modify the chunk size (or shape) in the random dask array, call `.sum()` on the new array, and visualize how the task graph changes."
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "id": "11922e84-f13e-4db6-a8a0-cf75a5727cfb",
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "da.random.random(shape, chunks=(50, 200, 400)).sum().visualize()"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "id": "b275b5cc-51a6-48ff-a0a4-62bdc43e6530",
306 | "metadata": {},
307 | "source": [
308 | "Here we see Dask's strategy for finding the sum. This simple example illustrates the beauty of Dask: it automatically designs an algorithm appropriate for custom operations with big data."
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "id": "7a7dcaaa-6a6e-4f58-aa80-2890136158fd",
314 | "metadata": {},
315 | "source": [
316 | "If we make our operation more complex, the graph gets more complex:"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "id": "e74a6817-8d06-4dd1-afec-98c53a8ae52a",
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "z = darr.dot(darr.T).mean(axis=0)[::2, :].std(axis=1)\n",
327 | "z"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "id": "a4ccffad-5bda-4108-a6c8-6628510f8363",
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "z.visualize()"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "id": "35c59c2f-3e5c-443b-908f-4f14535d2802",
343 | "metadata": {},
344 | "source": [
345 | "### A Bigger Calculation\n",
346 | "\n",
347 | "The examples above were toy examples; the data (180 MB) is probably not big enough to warrant the use of Dask.\n",
348 | "\n",
349 | "We can make it a lot bigger! Let's create a new, big array:"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "id": "482fb0fe-87d4-46fa-bb9d-ed38cc71d834",
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "darr = da.random.random((8000, 100, 8000), chunks=(1000, 100, 500)).astype('float32')\n",
360 | "darr"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "6c0f8b73-41c1-49fb-9fa4-97cb10ae6f4d",
366 | "metadata": {},
367 | "source": [
368 | "This dataset is `~23 GB`, rather than 180 MB! This is probably close to or greater than the amount of RAM available in your computer. Nevertheless, Dask has no problem working on it."
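This is also where lazy, chunked evaluation pays off most. As a small sketch (not one of the original cells, assuming `darr` as defined just above): if the answer only needs a small slice, Dask only ever materializes the chunks that slice touches, so even the ~23 GB array is cheap to query.

```python
# only the chunks overlapping this selection are generated and reduced
darr[:1000, 0, :500].mean().compute()
```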
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "51e9addb-cc13-46f5-b542-827f8bdd94b5", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "z = (darr + darr.T)[::2, :].mean(axis=2)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "c44ed57c-2a31-4df1-897b-02b614279755", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "z.visualize()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "f5bf25c1-7384-4953-bbb8-be0c3c4e02e9", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "%%time\n", 399 | "z.compute()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "id": "017e6ced-7c09-431c-b608-83351617513c", 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "cluster.close()\n", 410 | "client.close()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "id": "65dd2f1b-b5a3-4063-9678-7ac66fe3edcc", 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "%load_ext watermark\n", 421 | "%watermark --time --python --updated --iversion" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "id": "7cf5002f-ed03-4318-935a-b5ce6e57434e", 427 | "metadata": {}, 428 | "source": [ 429 | "---" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "id": "f58e6da5-1492-4778-8821-aa03721e3db4", 435 | "metadata": {}, 436 | "source": [ 437 | "## Summary\n", 438 | "\n", 439 | "Dask Array does not implement the entire numpy interface. Users expecting this\n", 440 | "will be disappointed. Notably Dask Array has the following failings:\n", 441 | "\n", 442 | "1. Dask does not implement all of ``np.linalg``. This has been done by a\n", 443 | " number of excellent BLAS/LAPACK implementations and is the focus of\n", 444 | " numerous ongoing academic research projects.\n", 445 | "2. Dask Array does not support some operations where the resulting shape\n", 446 | " depends on the values of the array. For those that it does support\n", 447 | " (for example, masking one Dask Array with another boolean mask),\n", 448 | " the chunk sizes will be unknown, which may cause issues with other\n", 449 | " operations that need to know the chunk sizes.\n", 450 | "3. Dask Array does not attempt operations like ``sort`` which are notoriously\n", 451 | " difficult to do in parallel and are of somewhat diminished value on very\n", 452 | " large data (you rarely actually need a full sort).\n", 453 | " Often we include parallel-friendly alternatives like ``topk``.\n", 454 | "4. Dask development is driven by immediate need, and so many lesser used\n", 455 | " functions, like ``np.sometrue`` have not been implemented purely out of\n", 456 | " laziness. These would make excellent community contributions.\n", 457 | "\n", 458 | "## Learn More\n", 459 | "\n", 460 | "Visit the [Array documentation](https://docs.dask.org/en/latest/array.html). 
In particular, this [array screencast](https://youtu.be/9h_61hXCDuI) will reinforce the concepts you learned here.\n", 461 | "\n" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "id": "21fbe02b-6bee-447b-bbc8-2ba8a0b96c87", 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "from IPython.display import YouTubeVideo\n", 472 | "\n", 473 | "YouTubeVideo(id=\"9h_61hXCDuI\", width=600, height=300)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "id": "c282d878-a11f-41a2-9737-caee406ad5c3", 479 | "metadata": {}, 480 | "source": [ 481 | "## Resources and references\n", 482 | "\n", 483 | "* Reference\n", 484 | " * [Dask Docs](https://dask.org/)\n", 485 | " * [Dask Examples](https://examples.dask.org/)\n", 486 | " * [Dask Code](https://github.com/dask/dask/)\n", 487 | " * [Dask Blog](https://blog.dask.org/)\n", 488 | " \n", 489 | " * [Xarray Docs](https://xarray.pydata.org/)\n", 490 | " \n", 491 | "* Ask for help\n", 492 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n", 493 | " * [github discussions: dask](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n", 494 | " * [github issues: dask](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n", 495 | " * [github discussions: xarray](https://github.com/pydata/xarray/discussions) for general, non-bug, discussion, and usage questions\n", 496 | " * [github issues: xarray](https://github.com/pydata/xarray/issues/new) for bug reports and feature requests\n", 497 | " \n", 498 | "* Pieces of this notebook are adapted from the following sources\n", 499 | " * https://github.com/dask/dask-tutorial/blob/main/03_array.ipynb\n", 500 | " * https://github.com/xarray-contrib/xarray-tutorial/blob/master/scipy-tutorial/06_xarray_and_dask.ipynb\n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | "
\n", 505 | "

Previous: Dask Delayed

\n", 506 | "

Next: Dask and Xarray

\n", 507 | " \n", 508 | "
" 509 | ] 510 | } 511 | ], 512 | "metadata": { 513 | "kernelspec": { 514 | "display_name": "Python 3", 515 | "language": "python", 516 | "name": "python3" 517 | }, 518 | "language_info": { 519 | "codemirror_mode": { 520 | "name": "ipython", 521 | "version": 3 522 | }, 523 | "file_extension": ".py", 524 | "mimetype": "text/x-python", 525 | "name": "python", 526 | "nbconvert_exporter": "python", 527 | "pygments_lexer": "ipython3", 528 | "version": "3.9.6" 529 | } 530 | }, 531 | "nbformat": 4, 532 | "nbformat_minor": 5 533 | } 534 | -------------------------------------------------------------------------------- /notebooks/10-dask-and-xarray.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6a4abb99-d3c8-492c-a748-19be43d56fd1", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "21d3aa3c-d623-4dc1-8361-e623b4ba0014", 15 | "metadata": {}, 16 | "source": [ 17 | "# Dask and Xarray\n", 18 | "\n", 19 | "\n", 20 | "This notebook demonstrates one of xarray's most powerful features: the ability to wrap dask arrays and allow users to seamlessly execute analysis code in parallel.\n", 21 | "\n", 22 | "\n", 23 | "## Learning Objectives\n", 24 | "\n", 25 | "- Learn that xarray DataArrays and Datasets are \"dask collections\" i.e. you can execute top-level dask functions such as dask.visualize(xarray_object)\n", 26 | "- Learn that all xarray built-in operations can transparently use dask\n", 27 | "- Learn that xarray provides tools to easily parallelize custom functions across blocks of dask-backed xarray objects.\n", 28 | "\n", 29 | "## Prerequisites\n", 30 | "\n", 31 | "\n", 32 | "| Concepts | Importance | Notes |\n", 33 | "| --- | --- | --- |\n", 34 | "| Familiarity with Dask Array | Necessary | |\n", 35 | "| Familiarity with xarray | Necessary | |\n", 36 | "\n", 37 | "- **Time to learn**: *15-20 minutes*\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "27c1bc78-6d5e-4407-a1ef-a7e662e3e389", 43 | "metadata": {}, 44 | "source": [ 45 | "## Setup\n", 46 | "\n", 47 | "First let's set up a `LocalCluster` using `dask.distributed`. 
\n", 48 | "\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "8cbb8d31-b3ce-4634-a771-569b31b77c1b", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import dask\n", 59 | "import dask.array as da\n", 60 | "import xarray as xr\n", 61 | "from dask.distributed import Client, LocalCluster" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "9cc56bf9-3122-484c-9cd9-f6b858c3c1ef", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "cluster = LocalCluster()\n", 72 | "client = Client(cluster)\n", 73 | "client" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "fba4b6f6-f224-425e-b9fc-2390bf5a0232", 79 | "metadata": {}, 80 | "source": [ 81 | "## Reading data with Dask and Xarray\n", 82 | "\n", 83 | "Recall that a dask's array consists of many chunked arrays:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "04e1b222-737c-4181-8031-b02317c6e4ec", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "darr = da.ones((2000, 300), chunks=(200, 50))\n", 94 | "darr" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "371f8e08-84ec-4af3-8ada-f68051149ef5", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "darr.compute()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "8ef570f8-959a-4249-8c9a-28abd7e48e1a", 110 | "metadata": {}, 111 | "source": [ 112 | "To read data as dask arrays with xarray, we need to specify the `chunks` argument to `open_dataset()` function. " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "fd72d69d-32ec-4a76-bf06-58bc3b5a52d8", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "ds = xr.open_dataset(\n", 123 | " \"data/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\", engine=\"netcdf4\", chunks={}\n", 124 | ")\n", 125 | "ds" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "d28bea74-8593-4053-ba51-7e2c6b34bfbc", 131 | "metadata": {}, 132 | "source": [ 133 | "Passing `chunks={}` to `open_dataset()` works, but since we didn't tell dask how to split up (or chunk) the array, Dask will create a single chunk for our array. 
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "2398993f-424e-4e2d-ade4-d790091db331", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "ds = xr.open_dataset(\n", 144 | " \"data/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\",\n", 145 | " engine=\"netcdf4\",\n", 146 | " chunks={\"time\": 90, \"lat\": 180, \"lon\": 360},\n", 147 | ")\n", 148 | "ds" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "c4a58b22-6af7-423b-ae7c-e951c2a5f105", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "ds.tos" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "91daa092-05b5-4a96-a0d5-e6bfef3a2f9d", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "ds.tos.chunks" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "0bc2d903-5cae-436e-b1bb-886d48d15ae0", 174 | "metadata": {}, 175 | "source": [ 176 | "## Xarray data structures are first-class dask collections\n", 177 | "\n", 178 | "This means you can call the following functions \n", 179 | "\n", 180 | "- `dask.visualize(...)`\n", 181 | "- `dask.compute(...)`\n", 182 | "- `dask.persist(...)`\n", 183 | "\n", 184 | "on both xarray DataArrays and Datasets backed by dask-arrays. " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "b22ca274-17a7-41d6-a123-f86838851a78", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "dask.visualize(ds)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "cc861a0e-aa87-4ac3-be04-d9e3be3beed9", 200 | "metadata": {}, 201 | "source": [ 202 | "## Parallel and Lazy computation using `dask.array` with xarray\n", 203 | "\n", 204 | "\n", 205 | "Xarray seamlessly wraps dask so all computation is deferred until explicitly requested. " 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "8d0c1351-b666-4b59-b672-345a0ec90404", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "z = ds.tos.mean(['lat', 'lon']).dot(ds.tos.T)\n", 216 | "z" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "2444bb09-cbab-4e6a-a374-ad19a936c831", 222 | "metadata": {}, 223 | "source": [ 224 | "As you can see, `z` contains a dask array. This is true for all xarray built-in operations including subsetting" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "213eabbc-9d1d-4a35-9ac5-7ced8f88ca07", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "z.isel(lat=0)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "5feada15-6f9a-4091-9ab7-8d2ad3b237a1", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "dask.visualize(z)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "id": "6077847d-09cb-481c-85d4-2bd023f9bfd9", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "%%time\n", 255 | "z.compute()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "5996ff04-3ff3-482a-8a01-027d0ba6e35d", 261 | "metadata": {}, 262 | "source": [ 263 | "## Reading multiple datasets with `open_mfdataset`\n", 264 | "\n", 265 | "Xarray provides a built-in function `xr.open_mfdataset()` for opening multiple files as a single dataset. This makes it easy to work with data from multiple files as one logical dataset. 
\n", 266 | "\n", 267 | "For demonstration purposes, let's revisit our example in [Dask Delayed Notebook](./08-dask-delayed.ipynb). In this example, we loop over a list of files (for four ensemble members), and we compute the anomaly for each ensemble member as follows:" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "f839be00-17fa-41ce-a397-3c9300c7f60a", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import pathlib\n", 278 | "\n", 279 | "data_dir = pathlib.Path(\"data/\")\n", 280 | "files = sorted(data_dir.glob(\"tos_Omon_CESM2*\"))\n", 281 | "\n", 282 | "results = {}\n", 283 | "for file in files:\n", 284 | "\n", 285 | " # Read in file\n", 286 | " ds = dask.delayed(xr.open_dataset)(file, engine='netcdf4')\n", 287 | "\n", 288 | " # Compute anomaly\n", 289 | " gb = ds.tos.groupby('time.month')\n", 290 | " tos_anom = gb - gb.mean(dim='time')\n", 291 | "\n", 292 | " # Save the computed anomaly and record the name of the ensemble member\n", 293 | " results[file.stem.split('_')[-3]] = tos_anom\n", 294 | "\n", 295 | "\n", 296 | "# Compute the results\n", 297 | "# dask.compute() returns a tuple here with a single item. So, ensure to grab this one item by using the 0 index\n", 298 | "computed_results = dask.compute(results)[0]\n", 299 | "# Combine the results in our dataarray by concatenating the results across a new dimension `ensemble_member`\n", 300 | "dset_anom = xr.concat(list(computed_results.values()), dim='ensemble_member')\n", 301 | "dset_anom['ensemble_member'] = list(computed_results.keys())\n", 302 | "dset_anom" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "49bba85d-dd79-4ee0-9915-5b7f222da9d3", 308 | "metadata": {}, 309 | "source": [ 310 | "Instead of explicitly looping over the list of files to construct xarray datasets, we can pass the list of files to [`xr.open_mfdataset()`](https://xarray.pydata.org/en/stable/generated/xarray.open_mfdataset.html#xarray.open_mfdataset) and xarray will construct a single dataset for us:" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "4d174e32-60b2-44ba-a63a-ed7ff04141df", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "dset = xr.open_mfdataset(\n", 321 | " sorted(files),\n", 322 | " concat_dim='ensemble_member',\n", 323 | " combine=\"nested\",\n", 324 | " parallel=True,\n", 325 | " data_vars=['tos'],\n", 326 | " engine=\"netcdf4\",\n", 327 | " chunks={'time': 90},\n", 328 | ")\n", 329 | "# Add coordinate labels for the newly created `ensemble_member` dimension\n", 330 | "dset[\"ensemble_member\"] = ['r11i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1']\n", 331 | "dset" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "id": "7dc1e7ee-5d02-4fe6-b794-15253f2cfb8e", 337 | "metadata": {}, 338 | "source": [ 339 | "
\n", 340 | "

\n", 341 | " \n", 346 | "
" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "6f1d3b7e-ba3c-45dd-ba70-1cc8f15af811", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# Compute anomaly\n", 357 | "gb = dset.tos.groupby('time.month')\n", 358 | "tos_anom = gb - gb.mean(dim='time')\n", 359 | "tos_anom" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "25d1f67c-a7d6-410b-b645-ff32ad4d6458", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "tos_anom.sel(lon=310, lat=50, method='nearest').plot(col='ensemble_member', col_wrap=2, size=4);" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "408fecf6-864e-4b0e-9699-edbabdb0adc2", 375 | "metadata": {}, 376 | "source": [ 377 | "
\n", 378 | "

\n", 379 | " Note that using plotting functionality will automatically trigger computation of required results.\n", 380 | "
" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "id": "70e7c78f-1a01-406f-ad96-6d979802266e", 386 | "metadata": {}, 387 | "source": [ 388 | "So, with xarray's `open_mfdataset()`, the following code\n", 389 | "\n", 390 | "```python\n", 391 | "results = {}\n", 392 | "for file in files:\n", 393 | "\n", 394 | " # Read in file\n", 395 | " ds = dask.delayed(xr.open_dataset)(file, engine='netcdf4')\n", 396 | "\n", 397 | " # Compute anomaly\n", 398 | " gb = ds.tos.groupby('time.month')\n", 399 | " tos_anom = gb - gb.mean(dim='time')\n", 400 | "\n", 401 | " # Save the computed anomaly and record the name of the ensemble member\n", 402 | " results[file.stem.split('_')[-3]] = tos_anom\n", 403 | "\n", 404 | "\n", 405 | "# Compute the results\n", 406 | "# dask.compute() returns a tuple here with a single item. So, ensure to grab this one item by using the 0 index\n", 407 | "computed_results = dask.compute(results)[0]\n", 408 | "# Combine the results in our dataarray by concatenating the results across a new dimension `ensemble_member`\n", 409 | "dset_anom = xr.concat(list(computed_results.values()), dim='ensemble_member')\n", 410 | "dset_anom['ensemble_member'] = list(computed_results.keys())\n", 411 | "```\n", 412 | "\n", 413 | "becomes \n", 414 | "\n", 415 | "\n", 416 | "```python\n", 417 | "dset = xr.open_mfdataset(sorted(files), concat_dim='ensemble_member', \n", 418 | " combine=\"nested\", parallel=True, data_vars=['tos'],\n", 419 | " engine=\"netcdf4\", chunks={'time': 90})\n", 420 | "# Add coordinate labels for the newly created `ensemble_member` dimension\n", 421 | "dset[\"ensemble_member\"] = ['r11i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1'] \n", 422 | "# Compute anomaly\n", 423 | "gb = dset.tos.groupby('time.month')\n", 424 | "tos_anom = gb - gb.mean(dim='time')\n", 425 | "```\n", 426 | "\n", 427 | "This latter version is cleaner and easier to maintain than the version with loops. \n", 428 | "\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "id": "48f3f0f5-969c-4013-be74-f8a87ae6b491", 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "cluster.close()\n", 439 | "client.close()" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "id": "eaa208cf-a2da-4bc8-b8d0-90ad2f2309fa", 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "%load_ext watermark\n", 450 | "%watermark --time --python --updated --iversion" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "id": "f9bc93c6-d1d1-4bc1-9fec-197fade60206", 456 | "metadata": {}, 457 | "source": [ 458 | "---" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "id": "3ac98098-0ea2-4b85-b04f-0ae58a9c9f4b", 464 | "metadata": {}, 465 | "source": [ 466 | "## Learn More\n", 467 | "\n", 468 | "Visit the [Parallel computing with Dask documentation](https://xarray.pydata.org/en/stable/user-guide/dask.html), and the [dask array best practices](https://docs.dask.org/en/latest/array-best-practices.html) which provides advice on using `dask.array` well." 
469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "8cda9549-2a22-4b72-b40d-5fb899482e3c", 474 | "metadata": {}, 475 | "source": [ 476 | "## Resources and references\n", 477 | "\n", 478 | "* Reference\n", 479 | " * [Dask Docs](https://dask.org/)\n", 480 | " * [Dask Blog](https://blog.dask.org/)\n", 481 | " * [Xarray Docs](https://xarray.pydata.org/)\n", 482 | " \n", 483 | "* Ask for help\n", 484 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n", 485 | " * [github discussions (dask):](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n", 486 | " * [github issues (dask): ](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n", 487 | " * [github discussions (xarray): ](https://github.com/pydata/xarray/discussions) for general, non-bug, discussion, and usage questions\n", 488 | " * [github issues (xarray): ](https://github.com/pydata/xarray/issues/new) for bug reports and feature requests\n", 489 | " \n", 490 | "* Pieces of this notebook are adapted from the following sources\n", 491 | " * https://github.com/xarray-contrib/xarray-tutorial/blob/master/scipy-tutorial/06_xarray_and_dask.ipynb\n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | "
\n", 496 | "

Previous: Dask Delayed

\n", 497 | "

Next: Dask and Xarray

\n", 498 | " \n", 499 | "
" 500 | ] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 3", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.9.6" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 5 524 | } 525 | -------------------------------------------------------------------------------- /notebooks/11-dask-distributed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0c247eff-7187-4de9-93a9-1c6a60db569b", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "55accf7f-c02b-4aae-b1a4-a7a613df03ba", 14 | "metadata": {}, 15 | "source": [ 16 | "# Distributed" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "6821b7b0-4c44-4004-bce4-8d704ed26545", 22 | "metadata": {}, 23 | "source": [ 24 | "## Learning Objectives \n", 25 | "\n", 26 | "- Use single machine Dask schedulers\n", 27 | "- Deploy a local Dask Distributed Cluster and access the diagnostics dashboard\n", 28 | "\n", 29 | "\n", 30 | "## Prerequisites\n", 31 | "\n", 32 | "\n", 33 | "| Concepts | Importance | Notes |\n", 34 | "| --- | --- | --- |\n", 35 | "| Familiarity with Python | Necessary | |\n", 36 | "| Familiarity with Dask Fundamentals | Necessary | |\n", 37 | "\n", 38 | "\n", 39 | "- **Time to learn**: *25-35 minutes*\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "a43f990d-100d-43c9-8fb1-e8876b34d3b4", 45 | "metadata": {}, 46 | "source": [ 47 | "## Dask Schedulers\n", 48 | "\n", 49 | "As we have seen so far, Dask allows you to simply construct graphs of tasks with dependencies, as well as have graphs created automatically for you using functional, Numpy or Xarray syntax on data collections. None of this would be very useful, if there weren't also a way to execute these graphs, in a parallel and memory-aware way. So far we have been calling `thing.compute()` or `dask.compute(thing)` without worrying what this entails. 
Now we will discuss the options available for that execution, and in particular, the distributed scheduler, which comes with additional functionality.\n", 50 | "\n", 51 | "Dask comes with four available schedulers:\n", 52 | "\n", 53 | "- \"threaded\" (aka \"threading\"): a scheduler backed by a thread pool\n", 54 | "- \"processes\": a scheduler backed by a process pool\n", 55 | "- \"single-threaded\" (aka \"sync\"): a synchronous scheduler, good for debugging\n", 56 | "- distributed: a distributed scheduler for executing graphs on multiple machines, see below.\n", 57 | "\n", 58 | "To select one of these for computation, you can specify at the time of asking for a result, e.g.,\n", 59 | "```python\n", 60 | "myvalue.compute(scheduler=\"single-threaded\") # for debugging\n", 61 | "```\n", 62 | "\n", 63 | "You can also set a default scheduler either temporarily\n", 64 | "```python\n", 65 | "with dask.config.set(scheduler='processes'):\n", 66 | " # set temporarily for this block only\n", 67 | " # all compute calls within this block will use the specified scheduler\n", 68 | " myvalue.compute()\n", 69 | " anothervalue.compute()\n", 70 | "```\n", 71 | "\n", 72 | "Or globally\n", 73 | "```python\n", 74 | "# set until further notice\n", 75 | "dask.config.set(scheduler='processes')\n", 76 | "```\n", 77 | "\n", 78 | "Let's try out a few schedulers on the Sea Surface Temperature data." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "cfb03e79-dd85-44cb-affb-8864712cbcd5", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import pathlib\n", 89 | "\n", 90 | "import dask\n", 91 | "import xarray as xr" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "b7b94f32-edbc-4a89-96ce-d92d4de3fdb3", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "data_dir = pathlib.Path(\"data/\")\n", 102 | "files = sorted(data_dir.glob(\"tos_Omon_CESM2*\"))\n", 103 | "files" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "ef83e9e5-9c43-4b0f-95d5-e759c653b5a6", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "dset = xr.open_mfdataset(\n", 114 | " sorted(files),\n", 115 | " concat_dim='ensemble_member',\n", 116 | " combine=\"nested\",\n", 117 | " parallel=True,\n", 118 | " data_vars=['tos'],\n", 119 | " engine=\"netcdf4\",\n", 120 | " chunks={'time': 90},\n", 121 | ")\n", 122 | "# Add coordinate labels for the newly created `ensemble_member` dimension\n", 123 | "dset[\"ensemble_member\"] = ['r11i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1']\n", 124 | "dset" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "a414c9f2-98f4-4873-b9c9-badf018fde43", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Compute anomaly\n", 135 | "gb = dset.tos.groupby('time.month')\n", 136 | "tos_anom = gb - gb.mean(dim='time')\n", 137 | "tos_anom" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "c386bad9-4d1f-4b24-be6f-58d76832093a", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# each of the following gives the same results (you can check!)\n", 148 | "# any surprises?\n", 149 | "import time\n", 150 | "\n", 151 | "for sch in ['threading', 'processes', 'sync']:\n", 152 | " t0 = time.time()\n", 153 | " r = tos_anom.compute(scheduler=sch)\n", 154 | " t1 = time.time()\n", 155 | " print(f\"{sch:>10}, {t1 - t0:0.4f} s; {r.min().data, r.max().data, 
r.mean().data}\")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "8fd1f05d-73b1-4b3d-a91f-e18bd8848ff6", 162 | "metadata": { 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "dask.visualize(tos_anom)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "519d9129-8bd8-40d7-b3b9-08c30616096a", 173 | "metadata": {}, 174 | "source": [ 175 | "### Some Questions to Consider:\n", 176 | "\n", 177 | "- How much speedup is possible for this task (hint, look at the graph).\n", 178 | "- Given how many cores are on this machine, how much faster could the parallel schedulers be than the single-threaded scheduler.\n", 179 | "- How much faster was using threads over a single thread? Why does this differ from the optimal speedup?\n", 180 | "- Why is the multiprocessing scheduler so much slower here?\n", 181 | "\n", 182 | "The `threaded` scheduler is a fine choice for working with large datasets out-of-core on a single machine, as long as the functions being used release the [Python Global Interpreter Lock (GIL)](https://wiki.python.org/moin/GlobalInterpreterLock) most of the time. NumPy and pandas release the GIL in most places, so the `threaded` scheduler is the default for `dask.array` and `dask.dataframe`. The distributed scheduler, perhaps with `processes=False`, will also work well for these workloads on a single machine.\n", 183 | "\n", 184 | "For workloads that do hold the GIL, as is common with `dask.bag` and custom code wrapped with `dask.delayed`, we recommend using the distributed scheduler, even on a single machine. Generally speaking, it's more intelligent and provides better diagnostics than the `processes` scheduler.\n", 185 | "\n", 186 | "
\n", 187 | "

What Is the Python Global Interpreter Lock (GIL)?

\n", 188 | " The Python Global Interpreter Lock or GIL, in simple words, is a mutex (or a lock) that allows only one thread to hold the control of the Python interpreter.\n", 189 | "
\n", 190 | " See this blog post for more details on Python GIL.\n", 191 | "
\n", 192 | "\n", 193 | "\n", 194 | "\n", 195 | "https://docs.dask.org/en/latest/scheduling.html provides some additional details on choosing a scheduler.\n", 196 | "\n", 197 | "For scaling out work across a cluster, the distributed scheduler is required." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "0038ec01-d8d9-49b6-bca3-2745c84bb9fa", 203 | "metadata": {}, 204 | "source": [ 205 | "## Making a cluster" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "717d438d-dbe1-4c76-9905-3c46af2183b4", 211 | "metadata": {}, 212 | "source": [ 213 | "### Simple method\n", 214 | "\n", 215 | "The `dask.distributed` system is composed of a single centralized scheduler and one or more worker processes. [Deploying](https://docs.dask.org/en/latest/setup.html) a remote Dask cluster involves some additional effort. But doing things locally is just involves creating a `LocalCluster` object and connecting this object to a `Client` object, which lets you interact with the \"cluster\" (local threads or processes on your machine). For more information see [here](https://docs.dask.org/en/latest/setup/single-distributed.html). \n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "Note that `LocalCluster()` takes a lot of optional [arguments](https://distributed.dask.org/en/latest/local-cluster.html#api), to configure the number of processes/threads, memory limits and other " 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "280d8409-4aaf-444b-9b20-e67e9999c806", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from dask.distributed import Client, LocalCluster\n", 230 | "\n", 231 | "# Setup a local cluster.\n", 232 | "# By default this sets up 1 worker per CPU core\n", 233 | "\n", 234 | "cluster = LocalCluster()\n", 235 | "client = Client(cluster)\n", 236 | "client" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "id": "9531d51f-3b21-4aa7-9beb-a8b0a550061b", 242 | "metadata": {}, 243 | "source": [ 244 | "**Note:**\n", 245 | "\n", 246 | "This code\n", 247 | "\n", 248 | "```python\n", 249 | "cluster = LocalCluster()\n", 250 | "client = Client(cluster)\n", 251 | "```\n", 252 | "\n", 253 | "is equivalent to \n", 254 | "\n", 255 | "```python\n", 256 | "client = Client()\n", 257 | "```" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "13c70a4a-437b-4126-ace8-90bfffa2b8e5", 263 | "metadata": {}, 264 | "source": [ 265 | "If you aren't in jupyterlab and using the `dask-labextension`, be sure to click the `Dashboard` link to open up the diagnostics dashboard.\n", 266 | "\n" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "b1471997-4a70-40a3-bb52-13e2a0d7f498", 272 | "metadata": {}, 273 | "source": [ 274 | "## Distributed Dask clusters for HPC and Cloud environments\n", 275 | "\n", 276 | "Dask can be deployed on distributed infrastructure, such as a an HPC system or a cloud computing system. 
There is a growing ecosystem of Dask deployment projects that faciliate easy deployment and scaling of Dask clusters on a wide variety of computing systems.\n", 277 | "\n", 278 | "### HPC\n", 279 | "\n", 280 | "#### Dask Jobqueue (https://jobqueue.dask.org/)\n", 281 | "\n", 282 | "- `dask_jobqueue.PBSCluster`\n", 283 | "- `dask_jobqueue.SlurmCluster`\n", 284 | "- `dask_jobqueue.LSFCluster`\n", 285 | "- etc.\n", 286 | "\n", 287 | "#### Dask MPI (https://mpi.dask.org/)\n", 288 | "\n", 289 | "- `dask_mpi.initialize`\n", 290 | "\n", 291 | "### Cloud\n", 292 | "\n", 293 | "#### Dask Kubernetes (https://kubernetes.dask.org/)\n", 294 | "\n", 295 | "- `dask_kubernetes.KubeCluster`\n", 296 | "\n", 297 | "#### Dask Cloud Provider (https://cloudprovider.dask.org)\n", 298 | "\n", 299 | "- `dask_cloudprovider.FargateCluster`\n", 300 | "- `dask_cloudprovider.ECSCluster`\n", 301 | "- `dask_cloudprovider.ECSCluster`\n", 302 | "\n", 303 | "#### Dask Gateway (https://gateway.dask.org/)\n", 304 | "\n", 305 | "- `dask_gateway.GatewayCluster`\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "a2e3f65e-6836-4fba-9df7-58fe9f48aea4", 311 | "metadata": {}, 312 | "source": [ 313 | "## Executing with the distributed client\n", 314 | "\n", 315 | "Consider some calculation, such as we've used before, where we computed anomaly per ensemble member" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "ff6a4027-e21a-4b47-b6a3-349d0c8155be", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "tos_anom" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "9a07be93-4b44-4def-85fb-e510306a8f52", 331 | "metadata": {}, 332 | "source": [ 333 | "By default, creating a `Client` makes it the default scheduler. Any calls to `.compute` will use the cluster your `client` is attached to, unless you specify otherwise, as above.\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "007d949a-25a8-40a0-bac2-a43ac1341056", 339 | "metadata": {}, 340 | "source": [ 341 | "The tasks will appear in the web UI as they are processed by the cluster and, eventually, a result will be printed as output of the cell above. Note that the kernel is blocked while waiting for the result.\n", 342 | "\n", 343 | "You can also see a simplified version of the graph being executed on Graph pane of the dashboard, so long as the calculation is in-flight.\n", 344 | "\n", 345 | "\n", 346 | "Let's return to the anomaly computation from before, and see what happens on the dashboard (you may wish to have both the notebook and dashboard side-by-side). How does this perform compared to before?" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "738ada56-dff4-4fe0-9b38-0dc7342adf9e", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "%time tos_anom.compute()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "id": "3b7fd256-fa96-4c7d-b981-90d31f043e03", 362 | "metadata": {}, 363 | "source": [ 364 | "In this particular case, this should be as fast or faster than the best case, threading, above. Why do you suppose this is? You should start your reading [here](https://distributed.dask.org/en/latest/index.html#architecture), and in particular note that the distributed scheduler was a complete rewrite with more intelligence around sharing of intermediate results and which tasks run on which worker. 
This will result in better performance in *some* cases, but still larger latency and overhead compared to the threaded scheduler, so there will be rare cases where it performs worse. Fortunately, the dashboard now gives us a lot more [diagnostic information](https://distributed.dask.org/en/latest/diagnosing-performance.html). Look at the Profile page of the dashboard to find out what takes the biggest fraction of CPU time for the computation we just performed?" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "14a4082d-b8a9-4843-8f93-8ff1f035deb6", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "cluster.close()\n", 375 | "client.close()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "ca42fc1e-6c86-4568-8a95-d03e00cdf8fb", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "%load_ext watermark\n", 386 | "%watermark --time --python --updated --iversion" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "id": "721d405f-ed11-4e00-8eab-f1382b2848d3", 392 | "metadata": {}, 393 | "source": [ 394 | "---" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "1cf640f8-673e-4f13-a6e2-9b5ff6aa315e", 400 | "metadata": {}, 401 | "source": [ 402 | "## Learn More\n", 403 | "\n", 404 | "If all you want to do is execute computations created using delayed, or run calculations based on the higher-level data collections, then that is about all you need to know to scale your work up to cluster scale. However, there is more detail to know about the distributed scheduler that will help with efficient usage. See this tutorial on advanced features of Distributed: https://tutorial.dask.org/06_distributed_advanced.html." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "9d7ebf3a-edab-4cdd-9f85-f8bba94fb000", 410 | "metadata": {}, 411 | "source": [ 412 | "## Resources and references\n", 413 | "\n", 414 | "* Reference\n", 415 | " * [Dask Docs](https://dask.org/)\n", 416 | " * [Dask Blog](https://blog.dask.org/)\n", 417 | " * [Xarray Docs](https://xarray.pydata.org/)\n", 418 | " \n", 419 | "* Ask for help\n", 420 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n", 421 | " * [github discussions (dask):](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n", 422 | " * [github issues (dask): ](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n", 423 | " * [github discussions (xarray): ](https://github.com/pydata/xarray/discussions) for general, non-bug, discussion, and usage questions\n", 424 | " * [github issues (xarray): ](https://github.com/pydata/xarray/issues/new) for bug reports and feature requests\n", 425 | " \n", 426 | "* Pieces of this notebook are adapted from the following sources\n", 427 | " * https://github.com/dask/dask-tutorial/blob/main/05_distributed.ipynb\n", 428 | " * https://github.com/xarray-contrib/xarray-tutorial/blob/master/scipy-tutorial/05_intro_to_dask.ipynb\n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
\n", 433 | "

Previous: Dask and Xarray

\n", 434 | " \n", 435 | "
" 436 | ] 437 | } 438 | ], 439 | "metadata": { 440 | "kernelspec": { 441 | "display_name": "Python 3", 442 | "language": "python", 443 | "name": "python3" 444 | }, 445 | "language_info": { 446 | "codemirror_mode": { 447 | "name": "ipython", 448 | "version": 3 449 | }, 450 | "file_extension": ".py", 451 | "mimetype": "text/x-python", 452 | "name": "python", 453 | "nbconvert_exporter": "python", 454 | "pygments_lexer": "ipython3", 455 | "version": "3.9.6" 456 | } 457 | }, 458 | "nbformat": 4, 459 | "nbformat_minor": 5 460 | } 461 | -------------------------------------------------------------------------------- /notebooks/blank-01-xarray-fundamentals.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Xarray Fundamentals" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives\n", 22 | "\n", 23 | "- Provide an overview of xarray\n", 24 | "- Describe the core xarray data structures, the `DataArray` and the `Dataset`, and the components that make them up\n", 25 | "- Load xarray dataset from a netCDF file \n", 26 | "- Load xarray dataset from a GRIB file\n", 27 | "- Load xarray dataset from a remote dataset from a THREDDS server\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prerequisites\n", 35 | "\n", 36 | "\n", 37 | "| Concepts | Importance | Notes |\n", 38 | "| --- | --- | --- |\n", 39 | "| Basic familiarity with NumPy | Necessary | |\n", 40 | "| Basic familiarity with Pandas | Helpful | |\n", 41 | "| [Understanding of NetCDF Data Model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html) | Helpful | Familiarity with metadata structure |\n", 42 | "\n", 43 | "\n", 44 | "- **Time to learn**: *15-20 minutes*\n", 45 | "\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "---" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Imports\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import xarray as xr # \"canonical\" namespace short-hand" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## What is Xarray?\n", 76 | "\n", 77 | "Xarray is a Python library for working with **labeled**, **multi dimensional** arrays. \n", 78 | "\n", 79 | "- Built on top of numpy and pandas \n", 80 | "- Brings the power of pandas to multidimensional arrays \n", 81 | "- Supports data of any dimensionality " 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Core Data Structures\n", 89 | "\n", 90 | "- Xarray has **two** main data structures:\n", 91 | " - `xarray.DataArray`: a fancy, labelled version of `numpy.ndarray` with associated coordinates. 
\n", 92 | " - `xarray.Dataset`: a collection of multiple `xarray.DataArray` that share the same coordinates and/or dimensions.\n", 93 | "\n", 94 | "---\n", 95 | "\n", 96 | "" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Dataset" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Xarray's interface is heavily inspired by the [netCDF data model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html). Xarray's Dataset is designed as an in-memory representation of a netCDF dataset. \n", 111 | "\n", 112 | "\n", 113 | "#### Loading data from a netCDF file\n", 114 | "\n", 115 | "First, let's open a local netCDF file using the `xarray.open_dataset()` function:" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%time\n", 125 | "ds = xr.open_dataset(\n", 126 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 127 | ")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "By default, `xarray.open_dataset()` function uses **lazy loading** i.e. it just loads in the coordinate and attribute metadata and **not** the data that correspond to data variables themselves. The data variables are loaded only on actual values access (e.g. when performing some calculation, slicing, ...) or with `.load()` method. \n", 135 | "\n", 136 | "Let's look at the HTML representation of the loaded dataset:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "ds" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "\n", 153 | "
\n", 154 | "

Text based representation

\n", 155 | " If you prefer a text based representation, you can set the display_style='text' by uncommenting the line below\n", 156 | "
\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "To look at the corresponding netCDF representation, we can use the `.info()` method:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Datasets have the following key properties:\n", 185 | "- `data_vars`: an dictionary of `DataArrays` corresponding to data variables \n", 186 | "- `dims`: a dictionary mapping from dimenion names to the fixed length of each dimension (e.g. `{'time': 1815, 'nv': 2, 'latitude': 180, 'longitude': 360}` )\n", 187 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n", 188 | "- `attrs`: a dictionary holding arbitrary metadata pertaining to the dataset" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# variables that are in our dataset" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# dataset dimensions" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# dataset coordinates" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# dataset global attributes" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### DataArray\n", 232 | "\n", 233 | "The DataArray is xarray's implementation of a labeled, multi-dimensional array. It has several key properties:\n", 234 | "\n", 235 | "- `data`: a Duck array (`numpy.ndarray` or [`dask.array`](https://docs.dask.org/en/latest/array.html) or [`sparse`](https://sparse.pydata.org/en/stable/) or [`cupy.array`](https://docs.cupy.dev/en/stable/index.html) holding the array's values). \n", 236 | "- `dims`: dimension names for each axis e.g. `(lat, lon, time)`\n", 237 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n", 238 | "- `attrs`: a dictionary that holds arbitrary attributes/metadata (such as units). \n", 239 | "- `name`: an arbitrary name of the array" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# Extract the tas variable (dataarray)\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# ds[\"tas\"] is equivalent to ds.tas\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "\n", 265 | "
\n", 266 | "

Warning: dot notation vs bracket notation

\n", 267 | "\n", 268 | "\n", 269 | "\n", 270 | "\n", 271 | "\n", 275 | "
" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# The actual array data\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# datarray coordinates\n" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# dataarray attributes" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### Dimensions vs Coordinates\n", 310 | "\n", 311 | "- A dimension is just a name of an axis, like \"longitude\" or \"time\"\n", 312 | "- Labeled coordinates are tick labels along an axis, e.g. \"2021-06-08\"\n", 313 | "\n", 314 | "\n", 315 | "#### `repr` & HTML representation of dimensions with or without coordinates \n", 316 | "\n", 317 | "| Dimension | HTML repr | Text based repr |\n", 318 | "| --- | --- | --- |\n", 319 | "| with coordinates | **bold** | `*` symbol in `.coords` |\n", 320 | "| without coordinates | normal | listed explicitly |\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "\n", 343 | "\n", 344 | "### Loading data in other file formats \n", 345 | "\n", 346 | "\n", 347 | "#### Loading data from a grib file \n", 348 | "\n", 349 | "To load a grib file in an xarray Dataset, we use the `xarray.open_dataset()` and we need to specify `engine=\"cfgrib\"`. This requires the presence of `cfgrib` package in our Python environment:" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "#### Loading data from a remote OPENDAP server \n", 368 | "\n", 369 | "\n", 370 | "If you happen to have access to netCDF datasets that are hosted remotely on a THREDDS server, you can point xarray to a url and it will load/stream the data over the network without needing to download it locally. 
" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "url = \"http://crd-esgf-drc.ec.gc.ca/thredds/dodsC/esgD_dataroot/AR6/CMIP6/ScenarioMIP/CCCma/CanESM5/ssp126/r12i1p2f1/Amon/wap/gn/v20190429/wap_Amon_CanESM5_ssp126_r12i1p2f1_gn_201501-210012.nc\"" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "---" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "%load_ext watermark\n", 403 | "%watermark --time --python --updated --iversion" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Summary \n", 411 | "\n", 412 | "\n", 413 | "- Xarray has two main data structures: DataArray and Dataset\n", 414 | "- DataArrays store the multi-dimensional arrays\n", 415 | "- Xarray is built on top of Numpy and Pandas and its architecture is heavily inspired by the netCDF data model" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "## Resources and References\n", 423 | "\n", 424 | "- [Xarray Documentation on Data Structures](http://xarray.pydata.org/en/latest/data-structures.html)\n", 425 | "- [Xarray Documentation on reading files and writing files](https://xarray.pydata.org/en/stable/io.html)\n", 426 | "- [cfgrib Documentation](https://github.com/ecmwf/cfgrib)" 427 | ] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "Python 3 (ipykernel)", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.9.12" 447 | }, 448 | "toc-autonumbering": false 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 4 452 | } 453 | -------------------------------------------------------------------------------- /notebooks/blank-02-indexing-and-selecting-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Indexing and Selecting data\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- Select data by position using `.isel()` with values or slices\n", 25 | "- Select data by coordinate label/value using `.sel()` with values or slices\n", 26 | "- Use nearest-neighbor lookups with `.sel()`\n", 27 | "- Use `interp()` to interpolate by coordinate labels" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prerequisites\n", 35 | "\n", 36 | "\n", 37 | "| Concepts | Importance | Notes |\n", 38 | "| --- | --- | --- |\n", 39 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 40 | "| [Basic familiarity with NumPy indexing](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | 
|\n", 41 | "| [Basic familiarity with Pandas indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html) | Helpful | |\n", 42 | "\n", 43 | "- **Time to learn**: *15-20 minutes*\n", 44 | "\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "---" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Imports\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import xarray as xr" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ds = xr.open_dataset(\n", 77 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 78 | ")\n", 79 | "ds" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## NumPy Positional Indexing\n", 87 | "\n", 88 | "When working with numpy, indexing is done by position (slices/ranges/scalars)." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# retrieve numpy array\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Let's extract a timeseries for a single spatical location \n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "
\n", 126 | "

\n", 127 | " but wait, what labels go with 20 and 40? Was that lat/lon or lon/lat? Where are the timestamps that go along with this time-series?\n", 128 | "
" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Different choices for indexing \n", 136 | "\n", 137 | "\n", 138 | "Xarray supports two kinds of indexing \n", 139 | "\n", 140 | "- Positional indexing via `.isel()`: provides primarily integer position based index (from `0` to `length-1` of the axis/dimension\n", 141 | "- Label indexing via `.sel()`: provides primarily label based index\n", 142 | "\n", 143 | "Xarray's indexing methods preserves the coordinate labels and associated metadata.\n", 144 | "\n", 145 | "\n", 146 | "\n", 147 | "### Selection by position\n", 148 | "\n", 149 | "The `.isel()` method is the primary access method for **purely integer based indexing**. The following are valid inputs:\n", 150 | "- An integer e.g. `lat=10`\n", 151 | "- A list or array of integers `lon=[10, 20, 39]`\n", 152 | "- A slice object with integers e.g. `time=slice(2, 20)`" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# the original object i.e. no selection" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Selection by label \n", 190 | "\n", 191 | "\n", 192 | "The `.sel()` method is the primary access method for **purely coordinate label based indexing.**. The following are valid inputs:\n", 193 | "\n", 194 | "- A single coordinate label e.g. `time=\"2021-03-01\"`\n", 195 | "- A list or array of coordinate labels `lon=[=\"2021-01-01\", =\"2021-03-10\", =\"2021-03-12\"]`\n", 196 | "- A slice object with coordinate labels e.g. `time=slice(\"2021-01-01\", \"2021-03-01\")`. (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index!)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "It is also possible to use slice for the time dimension:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "\n", 225 | "
\n", 226 | "

Warning: Be careful when working with floating coordinate labels

\n", 227 | " \n", 228 | " When we have integer, string, datetime-like values for coordinate labels, \"sel()\" works flawlessly. When we try to work with floating coordinate labels, things get a little tricky:\n", 229 | " \n", 230 | "
\n", 231 | "\n", 232 | "\n" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "### Nearest-neighbor lookups\n", 247 | "\n", 248 | "As shown above, when our coordinate labels are not integers or strings or datetime-like but floating point numbers, `.sel()` may throw a `KeyError`:" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "`ds.tas.sel(lat=39.5, lon=105.7)` fails because we are trying to use a conditional for an approximate value i.e floating numbers are represented approximately inside the computer, and xarray is unable to locate this exact value. To address this issue, xarray supports `method` and `tolerance` keyword argument. The `method` parameter allows for enabling nearest neighbor (inexact) lookups by use of the methods `'pad', 'backfill' or 'nearest'`: " 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "So the closest location in the data was at `lat=39.11`, `lon=106.2`." 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/generated/xarray.DataArray.sel.html) for more on usage of `method` and `tolerance` parameters in `.sel()`. " 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "
\n", 284 | "

Tip

\n", 285 | "Another way to use the nearest neighbor lookup is via slice objects. For e.g.:\n", 286 | "
" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Operators can be chained, so multiple operations can be peformed sequentially. For example, to select an area of interest and the first time index" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "### Interpolation\n", 315 | "\n", 316 | "If we want to interpolate along coordinates rather than looking up the nearest neighbos, we can use the `.interp()` method. To use `interp()` requires the presence of `scipy` library. \n" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "---" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "%load_ext watermark\n", 340 | "%watermark --time --python --updated --iversion" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "## Summary \n", 348 | "\n", 349 | "- Xarray’s named dimensions and labeled coordinates free the user from having to track positional ordering of dimensions when accessing data\n", 350 | "- Xarray provides a variety of methods for subsetting data via `.sel()`, `.isel()`, `.interp()` methods\n" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Resources and References\n", 358 | "\n", 359 | "- [Xarray Documentation - Indexing and Selecting Data](https://xarray.pydata.org/en/stable/indexing.html)\n", 360 | "- [Xarray Documentation - Interpolation](https://xarray.pydata.org/en/stable/user-guide/interpolation.html)\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "
\n", 368 | "

Previous: Xarray Fundamentals

\n", 369 | "

Next: Data Visualization

\n", 370 | "
" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.9.4" 391 | }, 392 | "toc-autonumbering": false, 393 | "toc-showcode": false, 394 | "toc-showmarkdowntxt": false, 395 | "toc-showtags": false 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 4 399 | } 400 | -------------------------------------------------------------------------------- /notebooks/blank-03-data-visualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- How to use xarray's builtin, matplotlib-backed plotting interface to visualize datasets.\n", 25 | "- How to use `hvplot` to produce interactive plots " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Prerequisites\n", 33 | "\n", 34 | "\n", 35 | "| Concepts | Importance | Notes |\n", 36 | "| --- | --- | --- |\n", 37 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 38 | "| [Familiarity with xarray indexing and subsetting](./02-indexing-and-subsetting.ipynb) | Necessary | |\n", 39 | "| [Basic familiarity with Matplotlib](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | |\n", 40 | "\n", 41 | "\n", 42 | "- **Time to learn**: *15-20 minutes*\n", 43 | "\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "---" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Imports\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import xarray as xr" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Let's open the same dataset as before" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "ds = xr.open_dataset(\n", 83 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 84 | ")\n", 85 | "ds" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Basic plotting with via `.plot()`\n", 93 | "\n", 94 | "Xarray provides a `.plot()` method on `DataArray` and `Dataset`. This method is a wrapper around Matplotlib's `matplotlib.pyplot.plot()`. xaarray will automatically guess the type of plot based on the dimensionality of the data. 
By default `.plot()` creates:\n", 95 | "\n", 96 | "- a **line** plot for 1-D arrays using `matplotlib.pyplot.plot()`\n", 97 | "- a **pcolormesh** plot for 2-D arrays using `matplotlib.pyplot.pcolormesh()`\n", 98 | "- a **histogram** for everything else (more than 2 dimensions) using `matplotlib.pyplot.hist()`\n", 99 | " " 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### 1D line plots\n", 107 | "\n", 108 | "Let's select one spatial location and plot a time series of the near-surface temperature:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "
\n", 123 | " We are selecting a single point, so `.sel()` requires either an exact location that exists in the data, or to specify method argument to tell it how to choose a location from the data. \n", 124 | "
" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "\n", 132 | "Lets say we want to compare plots of temperature at three different latitudes. We can use the `hue` keyword argument to do this." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### 2D plots" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Operator chaining means it is possible to have multiple selection operators and add `.plot()` to the end to visualise the result" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "The x- and y-axes are labeled with full names — \"Latitude\", \"Longitude\" — along with units. The colorbar has a nice label, again with units. And the title tells us the timestamp of the data presented." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# define keyword arguments that are passed to matptolib.pyplot.colorbar\n", 184 | "colorbar_kwargs = {\n", 185 | " \"orientation\": \"horizontal\",\n", 186 | " \"label\": \"my clustom label\",\n", 187 | " \"pad\": 0.2,\n", 188 | "}\n", 189 | "\n", 190 | "ds.tas.isel(lon=1).plot(\n", 191 | " x=\"time\", # coordinate to plot on the x-axis of the plot\n", 192 | " robust=True, # set colorbar limits to 2nd and 98th percentile of data\n", 193 | " cbar_kwargs=colorbar_kwargs,\n", 194 | ");" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "\n", 202 | "### Faceting\n", 203 | "\n", 204 | "Faceting is an effective way of visualizing variations of 3D data where 2D slices are visualized in a panel (subplot) and the third dimensions is varied between panels (subplots)." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/user-guide/plotting.html) for more on \"faceted\" plots or subplots." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Histograms" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Bonus Plot \n", 240 | "\n", 241 | "Let's look at the air temperature data but at for **all pressure levels**. 
We are going to select out the first time index and the longitude corresponding to the Himalayas and plot a vertical profile of the atmosphere from pole to pole:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "ds_air_all_pressure_levels = xr.open_dataset(\n", 251 | " \"data/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n", 252 | ")\n", 253 | "ds_air_all_pressure_levels" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "
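One hedged way to fill the empty cell above: it assumes the air temperature variable in this file is named `ta` (following the filename) with a `plev` vertical coordinate, and the longitude value is an illustrative pick near the Himalayas.

```python
# Vertical profile from pole to pole at the first time step, along a
# single longitude (~87E runs through the Himalayas; assumed value).
profile = ds_air_all_pressure_levels.ta.isel(time=0).sel(lon=87, method="nearest")

# Pressure decreases with height, so flip the y-axis to put the surface
# at the bottom of the figure.
profile.plot(yincrease=False)
```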
\n", 268 | " \n", 272 | "
" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "---" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Interactive visualization using `hvplot`\n", 287 | "\n", 288 | "Let's switch gears and look at how we can produce interactive plots via [holoviews](https://holoviews.org/). The holoviews plotting ecosystem provides the [hvplot](https://hvplot.holoviz.org/) package to allow easy visualization of xarray (and other) objects. These plots build on [Bokeh](https://docs.bokeh.org/en/latest/index.html) -- a Python library for creating interactive visualziatons for web browsers.\n", 289 | "\n", 290 | "\n", 291 | "To enable the `.hvplot` interface on xarray object, let's import the `hvplot.xarray` module:" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "import hvplot.xarray" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "To use `hvplot` instead of `matplotlib`, we use the `.hvplot()` method:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "As you can see, calling `.hvplot()` behaves the same as `.plot()` i.e. it uses the same heuristics as `.plot()`. In this case, it produces a histogram for data with more than 3 dimensions. To plot a `pcolormesh`, let's reduce the dimensionality of our data to 2D and call `.hvplot()` again:" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "So far we have had to subset our data in order to produce plots. `hvplot` provides convenient functionality for producing plots on-demand via interactive widgets. 
Let's create a series of 2D plots, one for each time slice. We will use the `groupby` parameter to let hvplot know that we want to create a widget (a slider) for the time dimension:" 350 |    ] 351 |   }, 352 |   { 353 |    "cell_type": "code", 354 |    "execution_count": null, 355 |    "metadata": {}, 356 |    "outputs": [], 357 |    "source": [] 358 |   }, 359 |   { 360 |    "cell_type": "markdown", 361 |    "metadata": {}, 362 |    "source": [ 363 |     "Let's add more customizations to our time widget (a hedged sketch appears after the resource list below):" 364 |    ] 365 |   }, 366 |   { 367 |    "cell_type": "code", 368 |    "execution_count": null, 369 |    "metadata": {}, 370 |    "outputs": [], 371 |    "source": [] 372 |   }, 373 |   { 374 |    "cell_type": "markdown", 375 |    "metadata": {}, 376 |    "source": [ 377 |     "---" 378 |    ] 379 |   }, 380 |   { 381 |    "cell_type": "code", 382 |    "execution_count": null, 383 |    "metadata": {}, 384 |    "outputs": [], 385 |    "source": [ 386 |     "%load_ext watermark\n", 387 |     "%watermark --time --python --updated --iversion" 388 |    ] 389 |   }, 390 |   { 391 |    "cell_type": "markdown", 392 |    "metadata": {}, 393 |    "source": [ 394 |     "## Summary \n", 395 |     "\n", 396 |     "- Xarray has plotting functionality that is a thin wrapper around the Matplotlib library\n", 397 |     "- Xarray uses syntax and function names from Matplotlib whenever possible\n", 398 |     "- Hvplot provides a neat interface to xarray for creating interactive plots" 399 |    ] 400 |   }, 401 |   { 402 |    "cell_type": "markdown", 403 |    "metadata": {}, 404 |    "source": [ 405 |     "## Resources and References\n", 406 |     "\n", 407 |     "- [Hvplot Documentation](https://hvplot.holoviz.org/index.html)\n", 408 |     "- [Xarray Documentation - Plotting](https://xarray.pydata.org/en/stable/user-guide/plotting.html)\n", 409 |     "- [Matplotlib Documentation](https://matplotlib.org/stable/contents.html)\n", 410 |     "\n", 411 |     "
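For reference, here is a hedged sketch of the `groupby` slider and the widget customization discussed above; `widget_type` and `widget_location` are hvplot options in recent releases, and the values shown are illustrative.

```python
# One 2-D plot per time slice, with a slider widget over the time dimension.
ds.tas.hvplot(groupby="time")

# Customize the widget: a scrubber (play/pause) control docked at the bottom.
ds.tas.hvplot(groupby="time", widget_type="scrubber", widget_location="bottom")
```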
\n", 412 | "

Geocat-examples Gallery

\n", 413 | " For geo-science specific visualization examples, please see the geocat-examples gallery which resides here.\n", 414 | "
\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "
\n", 422 | "

Previous: Indexing and Selecting Data

\n", 423 | "

Next: Computation

\n", 424 | "
" 425 | ] 426 | } 427 | ], 428 | "metadata": { 429 | "interpreter": { 430 | "hash": "affdb75a3ef70a25a87eb00ec52822d75cad558e88f93d5bb3da0d72a04ea7e1" 431 | }, 432 | "kernelspec": { 433 | "display_name": "Python 3", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.9.4" 448 | }, 449 | "toc-autonumbering": false 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 4 453 | } 454 | -------------------------------------------------------------------------------- /notebooks/blank-05-masking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Masking Data\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Learning Objectives \n", 22 | "\n", 23 | "\n", 24 | "- Provide an overview of masking data in xarray\n", 25 | "- Masking data using `.where()` method" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Prerequisites\n", 33 | "\n", 34 | "\n", 35 | "| Concepts | Importance | Notes |\n", 36 | "| --- | --- | --- |\n", 37 | "| [Understanding of xarray core data structures](./01-xarray-fundamentals.ipynb) | Necessary | |\n", 38 | "| [Familiarity with NumPy ](https://numpy.org/doc/stable/reference/arrays.indexing.html) | Helpful | |\n", 39 | "\n", 40 | "- **Time to learn**: *10 minutes*\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "---" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Overview\n", 56 | "\n", 57 | "Using `xr.where()` or `.where()` method, elements of an Xarray Dataset or xarray DataArray that satisfy a given condition or multiple conditions can be replaced/masked.To demonstrate this, we are going to use the `.where()` method on the `tos` dataarray. 
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Imports\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import matplotlib.pyplot as plt\n", 74 | "import xarray as xr" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Open the sea surface temperature dataset:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "ds = xr.open_dataset(\n", 91 | " \"./data/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\", engine=\"netcdf4\"\n", 92 | ")\n", 93 | "ds" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Using `where` with one condition" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "sample = ...\n", 110 | "sample" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Before applying `.where()`, let's look at the documentation" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "sample.where?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "- As the documention points out, the conditional expression in `.where` can be: \n", 134 | "\n", 135 | " - a DataArray\n", 136 | " - a Dataset\n", 137 | " - a function\n", 138 | "\n", 139 | "- Unlike `.isel()` and `sel()` that change the shape of the returned results, `.where()` preserves the shape of the original data. It does accomplishes this by returning values from the original DataArray or Dataset if the `condition` is `True`, and fills in missing values wherever the `condition` is `False`. \n", 140 | "\n", 141 | "\n", 142 | "For demonstration purposes, let's use where to mask locations with temperature values greater than `0`:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "masked_sample = sample...\n", 152 | "masked_sample" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "Let's plot both our original sample, and the masked sample:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "fig, axes = plt.subplots(ncols=2, figsize=(19, 6))\n", 169 | "sample.plot(ax=axes[0], robust=True)\n", 170 | "masked_sample.plot(ax=axes[1], robust=True);" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## Using `where` with multiple conditions" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "`.where()` allows providing multiple conditions. To do this, we need to make sure each conditional expression is enclosed in `()`. To combine conditions, we use the `bit-wise and` (`&`) operator and/or the `bit-wise or` (`|`). 
Let's use `where` to mask locations with temperature values less than 25 or greater than 30:" 185 |    ] 186 |   }, 187 |   { 188 |    "cell_type": "code", 189 |    "execution_count": null, 190 |    "metadata": {}, 191 |    "outputs": [], 192 |    "source": [] 193 |   }, 194 |   { 195 |    "cell_type": "markdown", 196 |    "metadata": {}, 197 |    "source": [ 198 |     "We can use coordinates to apply a mask as well. Below, we use the `latitude` and `longitude` coordinates to mask the [Niño 3.4 region](https://www.ncdc.noaa.gov/teleconnections/enso/indicators/sst/):\n", 199 |     "\n", 200 |     "![](https://www.ncdc.noaa.gov/monitoring-content/teleconnections/nino-regions.gif)\n", 201 |     "\n" 202 |    ] 203 |   }, 204 |   { 205 |    "cell_type": "code", 206 |    "execution_count": null, 207 |    "metadata": {}, 208 |    "outputs": [], 209 |    "source": [] 210 |   }, 211 |   { 212 |    "cell_type": "markdown", 213 |    "metadata": {}, 214 |    "source": [ 215 |     "## Using `where` with a custom fill value" 216 |    ] 217 |   }, 218 |   { 219 |    "cell_type": "markdown", 220 |    "metadata": {}, 221 |    "source": [ 222 |     "`.where()` can take a second argument, which, if supplied, is used as the fill value for the masked region. Below we fill masked regions with a constant `0`:" 223 |    ] 224 |   }, 225 |   { 226 |    "cell_type": "code", 227 |    "execution_count": null, 228 |    "metadata": {}, 229 |    "outputs": [], 230 |    "source": [] 231 |   }, 232 |   { 233 |    "cell_type": "markdown", 234 |    "metadata": {}, 235 |    "source": [ 236 |     "---" 237 |    ] 238 |   }, 239 |   { 240 |    "cell_type": "code", 241 |    "execution_count": null, 242 |    "metadata": {}, 243 |    "outputs": [], 244 |    "source": [ 245 |     "%load_ext watermark\n", 246 |     "%watermark --time --python --updated --iversion" 247 |    ] 248 |   }, 249 |   { 250 |    "cell_type": "markdown", 251 |    "metadata": {}, 252 |    "source": [ 253 |     "## Resources and References\n", 254 |     "\n", 255 |     "- [Xarray Documentation - Masking with `where()`](https://xarray.pydata.org/en/stable/user-guide/indexing.html#masking-with-where)" 256 |    ] 257 |   }, 258 |   { 259 |    "cell_type": "markdown", 260 |    "metadata": {}, 261 |    "source": [ 262 |       "
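To wrap up, a consolidated, hedged sketch of the multiple-condition, coordinate-based, and fill-value patterns covered above. The coordinate names `lat`/`lon` and the numeric bounds are assumptions (the Niño 3.4 box is 5S-5N, 170W-120W, i.e. 190-240 in 0-360 longitude).

```python
# Multiple conditions: wrap each comparison in parentheses and combine with
# the bitwise & operator; this keeps values between 25 and 30 and masks the rest.
sample.where((sample > 25) & (sample < 30))

# Coordinate-based mask: keep only the Nino 3.4 box (coordinate names assumed).
sample.where(
    (sample.lat > -5) & (sample.lat < 5) & (sample.lon > 190) & (sample.lon < 240)
)

# Custom fill value: the second argument replaces masked points (0.0 instead of NaN).
sample.where(sample > 0.0, 0.0)
```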
\n", 263 | "

Previous: Computation

\n", 264 | "

Next: End-to-End example: Computing Niño 3.4 Index

\n", 265 | "
" 266 | ] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.9.4" 286 | }, 287 | "toc-autonumbering": false, 288 | "toc-showcode": false, 289 | "toc-showmarkdowntxt": false, 290 | "toc-showtags": false 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 4 294 | } 295 | -------------------------------------------------------------------------------- /notebooks/template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's start here! If you can directly link to an image relevant to your notebook, such as [canonical logos](https://github.com/numpy/numpy/blob/main/doc/source/_static/numpylogo.svg), do so here at the top of your notebook. You can do this with Markdown syntax,\n", 8 | "\n", 9 | "> `![](http://link.com/to/image.png \"image alt text\")`\n", 10 | "\n", 11 | "or edit this cell to see raw HTML `img` demonstration. This is preferred if you need to shrink your embedded image. **Either way be sure to include `alt` text for any embedded images to make your content more accessible.**\n", 12 | "\n", 13 | "\"Project" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Project Pythia Notebook Template\n", 21 | "\n", 22 | "Next, title your notebook appropriately with a top-level Markdown header, `#`. Do not use this level header anywhere else in the notebook. Our book build process will use this title in the navbar, table of contents, etc. Keep it short, keep it descriptive. Follow this with a `---` cell to visually distinguish the transition to the prerequisites section." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "---" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Overview\n", 37 | "If you have an introductory paragraph, lead with it here! Keep it short and tied to your material, then be sure to continue into the required list of topics below,\n", 38 | "\n", 39 | "1. This is a numbered list of the specific topics\n", 40 | "1. These should map approximately to your main sections of content\n", 41 | "1. Or each second-level, `##`, header in your notebook\n", 42 | "1. Keep the size and scope of your notebook in check\n", 43 | "1. And be sure to let the reader know up front the important concepts they'll be leaving with" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Prerequisites\n", 51 | "This section was inspired by [this template](https://github.com/alan-turing-institute/the-turing-way/blob/master/book/templates/chapter-template/chapter-landing-page.md) of the wonderful [The Turing Way](https://the-turing-way.netlify.app/welcome.html) Jupyter Book.\n", 52 | "\n", 53 | "Following your overview, tell your reader what concepts, packages, or other background information they'll **need** before learning your material. Tie this explicitly with links to other pages here in Foundations or to relevant external resources. 
Remove this body text, then populate the Markdown table, denoted in this cell with `|` vertical bars, below, and fill in the information that follows. In this table, lay out prerequisite concepts by explicitly linking to other Foundations material or external resources, or describe generally helpful concepts.\n", 54 |     "\n", 55 |     "Label the importance of each concept explicitly as **helpful/necessary**.\n", 56 |     "\n", 57 |     "| Concepts | Importance | Notes |\n", 58 |     "| --- | --- | --- |\n", 59 |     "| [Intro to Cartopy](../core/cartopy/cartopy) | Necessary | |\n", 60 |     "| [Understanding of NetCDF](some-link-to-external-resource) | Helpful | Familiarity with metadata structure |\n", 61 |     "| Project management | Helpful | |\n", 62 |     "\n", 63 |     "- **Experience level**: with relevant packages or general self-assessed experience as **beginner/intermediate/advanced**\n", 64 |     "- **Time to learn**: estimate in minutes or qualitatively as **long/medium/short**\n", 65 |     "- **System requirements**:\n", 66 |     "    - Populate with any system, version, or non-Python software requirements if necessary\n", 67 |     "    - Otherwise use the concepts table above and the Imports section below to describe required packages as necessary\n", 68 |     "    - If no extra requirements, remove the **System requirements** point altogether" 69 |    ] 70 |   }, 71 |   { 72 |    "cell_type": "markdown", 73 |    "metadata": {}, 74 |    "source": [ 75 |     "---" 76 |    ] 77 |   }, 78 |   { 79 |    "cell_type": "markdown", 80 |    "metadata": {}, 81 |    "source": [ 82 |     "## Imports\n", 83 |     "Begin your body of content with another `---` divider before continuing into this section, then remove this body text and populate the following code cell with all necessary Python imports **up-front**:" 84 |    ] 85 |   }, 86 |   { 87 |    "cell_type": "code", 88 |    "execution_count": null, 89 |    "metadata": {}, 90 |    "outputs": [], 91 |    "source": [ 92 |     "import sys" 93 |    ] 94 |   }, 95 |   { 96 |    "cell_type": "markdown", 97 |    "metadata": {}, 98 |    "source": [ 99 |     "## Your first content section" 100 |    ] 101 |   }, 102 |   { 103 |    "cell_type": "markdown", 104 |    "metadata": {}, 105 |    "source": [ 106 |     "This is where you begin your first section of material, loosely tied to your objectives stated up front. Tie together your notebook as a narrative, with interspersed Markdown text, images, and more as necessary," 107 |    ] 108 |   }, 109 |   { 110 |    "cell_type": "code", 111 |    "execution_count": null, 112 |    "metadata": {}, 113 |    "outputs": [], 114 |    "source": [ 115 |     "# as well as any and all of your code cells\n", 116 |     "print(\"Hello world!\")" 117 |    ] 118 |   }, 119 |   { 120 |    "cell_type": "markdown", 121 |    "metadata": {}, 122 |    "source": [ 123 |     "### A content subsection\n", 124 |     "Divide and conquer your objectives with Markdown subsections, which will populate the helpful navbar in Jupyter Lab and here on the Jupyter Book!" 125 |    ] 126 |   }, 127 |   { 128 |    "cell_type": "code", 129 |    "execution_count": null, 130 |    "metadata": {}, 131 |    "outputs": [], 132 |    "source": [ 133 |     "# some subsection code\n", 134 |     "new = \"helpful information\"" 135 |    ] 136 |   }, 137 |   { 138 |    "cell_type": "markdown", 139 |    "metadata": {}, 140 |    "source": [ 141 |     "### Another content subsection\n", 142 |     "Keep up the good work! A note: *try to avoid using code comments as narrative*; instead, let them exist only as brief clarifications where necessary." 
143 |    ] 144 |   }, 145 |   { 146 |    "cell_type": "markdown", 147 |    "metadata": {}, 148 |    "source": [ 149 |     "## Your second content section\n", 150 |     "Here we can move on to our second objective, and we can demonstrate" 151 |    ] 152 |   }, 153 |   { 154 |    "cell_type": "markdown", 155 |    "metadata": {}, 156 |    "source": [ 157 |     "### Subsection to the second section\n", 158 |     "\n", 159 |     "#### a quick demonstration\n", 160 |     "\n", 161 |     "##### of further and further\n", 162 |     "\n", 163 |     "###### header levels" 164 |    ] 165 |   }, 166 |   { 167 |    "cell_type": "markdown", 168 |    "metadata": {}, 169 |    "source": [ 170 |     "as well as $m = a * t / h$ text! Similarly, you have access to other $\\LaTeX$ equation [**functionality**](https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Typesetting%20Equations.html) via MathJax (demo below from link),\n", 171 |     "\n", 172 |     "\\begin{align}\n", 173 |     "\\dot{x} & = \\sigma(y-x) \\\\\n", 174 |     "\\dot{y} & = \\rho x - y - xz \\\\\n", 175 |     "\\dot{z} & = -\\beta z + xy\n", 176 |     "\\end{align}" 177 |    ] 178 |   }, 179 |   { 180 |    "cell_type": "markdown", 181 |    "metadata": {}, 182 |    "source": [ 183 |     "Check out [**any number of helpful Markdown resources**](https://www.markdownguide.org/basic-syntax/) for further customizing your notebooks and the [**Jupyter docs**](https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html) for Jupyter-specific formatting information. Don't hesitate to ask questions if you have problems getting it to look *just right*." 184 |    ] 185 |   }, 186 |   { 187 |    "cell_type": "markdown", 188 |    "metadata": {}, 189 |    "source": [ 190 |     "## Last Section\n", 191 |     "\n", 192 |     "If you're comfortable, and as we briefly used for our embedded logo up top, you can embed raw HTML into Jupyter Markdown cells (edit to see):" 193 |    ] 194 |   }, 195 |   { 196 |    "cell_type": "markdown", 197 |    "metadata": {}, 198 |    "source": [ 199 |     "
\n", 200 | "

Info

\n", 201 | " Your relevant information here!\n", 202 | "
" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Feel free to copy this around and edit or play around with yourself. Some other `admonitions` you can put in:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "
\n", 217 | "

Success

\n", 218 | " We got this done after all!\n", 219 | "
" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "
\n", 227 | "

Warning

\n", 228 | " Be careful!\n", 229 | "
" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "
\n", 237 | "

Danger

\n", 238 | " Scary stuff be here.\n", 239 | "
" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "We also suggest checking out Jupyter Book's [brief demonstration](https://jupyterbook.org/content/metadata.html#jupyter-cell-tags) on adding cell tags to your cells in Jupyter Notebook, Lab, or manually. Using these cell tags can allow you to [customize](https://jupyterbook.org/interactive/hiding.html) how your code content is displayed and even [demonstrate errors](https://jupyterbook.org/content/execute.html#dealing-with-code-that-raises-errors) without altogether crashing our loyal army of machines!" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "---" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Summary\n", 261 | "Add one final `---` marking the end of your body of content, and then conclude with a brief single paragraph summarizing at a high level the key pieces that were learned and how they tied to your objectives. Look to reiterate what the most important takeaways were.\n", 262 | "\n", 263 | "### What's next?\n", 264 | "Let Jupyter book tie this to the next (sequential) piece of content that people could move on to down below and in the sidebar. However, if this page uniquely enables your reader to tackle other nonsequential concepts throughout this book, or even external content, link to it here!" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Resources and references\n", 272 | "Finally, be rigorous in your citations and references as necessary. Give credit where credit is due. Also, feel free to link to relevant external material, further reading, documentation, etc. Then you're done! Give yourself a quick review, a high five, and send us a pull request. A few final notes:\n", 273 | " - `Kernel > Restart Kernel and Run All Cells...` to confirm that your notebook will cleanly run from start to finish\n", 274 | " - `Kernel > Restart Kernel and Clear All Outputs...` before committing your notebook, our machines will do the heavy lifting\n", 275 | " - Take credit! Provide author contact information if you'd like; if so, consider adding information here at the bottom of your notebook\n", 276 | " - Give credit! Attribute appropriate authorship for referenced code, information, images, etc.\n", 277 | " - Only include what you're legally allowed: **no copyright infringement or plagiarism**\n", 278 | " \n", 279 | "Thank you for your contribution!" 
280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.9.6" 300 | }, 301 | "nbdime-conflicts": { 302 | "local_diff": [ 303 | { 304 | "diff": [ 305 | { 306 | "diff": [ 307 | { 308 | "key": 0, 309 | "op": "addrange", 310 | "valuelist": [ 311 | "Python 3" 312 | ] 313 | }, 314 | { 315 | "key": 0, 316 | "length": 1, 317 | "op": "removerange" 318 | } 319 | ], 320 | "key": "display_name", 321 | "op": "patch" 322 | } 323 | ], 324 | "key": "kernelspec", 325 | "op": "patch" 326 | } 327 | ], 328 | "remote_diff": [ 329 | { 330 | "diff": [ 331 | { 332 | "diff": [ 333 | { 334 | "key": 0, 335 | "op": "addrange", 336 | "valuelist": [ 337 | "Python3" 338 | ] 339 | }, 340 | { 341 | "key": 0, 342 | "length": 1, 343 | "op": "removerange" 344 | } 345 | ], 346 | "key": "display_name", 347 | "op": "patch" 348 | } 349 | ], 350 | "key": "kernelspec", 351 | "op": "patch" 352 | } 353 | ] 354 | }, 355 | "toc-autonumbering": false 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 4 359 | } 360 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | target-version = ['py38'] 4 | skip-string-normalization = true 5 | 6 | [tool.nbqa.mutate] 7 | isort = 1 8 | black = 1 9 | pyupgrade = 1 10 | 11 | [tool.nbqa.addopts] 12 | pyupgrade = ["--py36-plus"] 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | ignore = E203,E266,E501,W503,E722,E402,C901 4 | max-line-length = 100 5 | max-complexity = 18 6 | select = B,C,E,F,W,T4,B9 7 | 8 | [isort] 9 | known_first_party= 10 | known_third_party= 11 | multi_line_output=3 12 | include_trailing_comma=True 13 | force_grid_wrap=0 14 | combine_as_imports=True 15 | line_length=100 16 | skip= 17 | --------------------------------------------------------------------------------