├── .github
│   └── workflows
│       └── build.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierrc.toml
├── LICENSE
├── README.md
├── binder
│   └── environment.yml
├── images
│   ├── Dask Array (Light).png
│   ├── Dask Cluster Manager (Light)(1).png
│   ├── Dask Dataframe (Light).png
│   ├── Dask Overview (Light).png
│   ├── Distributed Overview (Light).png
│   ├── Xarray-data-structures.png
│   ├── dask-task-stream.gif
│   ├── dask_horizontal.svg
│   ├── dataset-diagram-logo.png
│   ├── should-i-use-dask.png
│   ├── xarray-data-structures.svg
│   └── xarray-split-apply-combine.png
├── notebooks
│   ├── 00-download-data.ipynb
│   ├── 01-xarray-fundamentals.ipynb
│   ├── 02-indexing-and-selecting-data.ipynb
│   ├── 03-data-visualization.ipynb
│   ├── 04-computation.ipynb
│   ├── 05-masking.ipynb
│   ├── 06-end-to-end-example.ipynb
│   ├── 07-dask-intro.ipynb
│   ├── 08-dask-delayed.ipynb
│   ├── 09-dask-array.ipynb
│   ├── 10-dask-and-xarray.ipynb
│   ├── 11-dask-distributed.ipynb
│   ├── blank-01-xarray-fundamentals.ipynb
│   ├── blank-02-indexing-and-selecting-data.ipynb
│   ├── blank-03-data-visualization.ipynb
│   ├── blank-04-computation.ipynb
│   ├── blank-05-masking.ipynb
│   └── template.ipynb
├── pyproject.toml
└── setup.cfg
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on: [push]
3 | 
4 | jobs:
5 |   binder:
6 |     runs-on: ubuntu-latest
7 |     steps:
8 |       - name: Build and cache on mybinder.org
9 |         uses: jupyterhub/repo2docker-action@master
10 |         with:
11 |           NO_PUSH: true
12 |           MYBINDERORG_TAG: ${{ github.event.ref }}
13 | 
14 |   conda-solve:
15 |     runs-on: ${{ matrix.os }}
16 |     strategy:
17 |       matrix:
18 |         os: [windows-latest, ubuntu-latest, macos-latest]
19 | 
20 |     steps:
21 |       - name: Checkout source
22 |         uses: actions/checkout@v2
23 | 
24 |       - name: Setup Conda Environment
25 |         uses: conda-incubator/setup-miniconda@v2
26 |         with:
27 |           environment-file: binder/environment.yml
28 |           activate-environment: xarray-tutorial
29 |           auto-activate-base: false
30 | 
--------------------------------------------------------------------------------
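A quick local sanity check for this workflow (an editor's sketch, not part of the repository; assumes PyYAML is installed) is to parse the file before pushing:

```python
# Parse the workflow file to catch YAML syntax errors early.
# Note: PyYAML (YAML 1.1) parses the top-level `on:` key as the boolean True.
import yaml

with open(".github/workflows/build.yml") as f:
    workflow = yaml.safe_load(f)

print(workflow["name"], "->", list(workflow["jobs"]))  # Build -> ['binder', 'conda-solve']
```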
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Tutorial data and Dask worker scratch space
129 | data/
130 | dask-worker-space/
131 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: schema/generic_schema.yaml
2 | repos:
3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
4 |     rev: v4.2.0
5 |     hooks:
6 |       - id: trailing-whitespace
7 |       - id: end-of-file-fixer
8 |       - id: check-docstring-first
9 |       - id: check-json
10 |       - id: check-yaml
11 |       - id: double-quote-string-fixer
12 | 
13 |   - repo: https://github.com/ambv/black
14 |     rev: 22.3.0
15 |     hooks:
16 |       - id: black
17 | 
18 |   - repo: https://github.com/keewis/blackdoc
19 |     rev: v0.3.4
20 |     hooks:
21 |       - id: blackdoc
22 | 
23 |   - repo: https://gitlab.com/pycqa/flake8
24 |     rev: 3.9.2
25 |     hooks:
26 |       - id: flake8
27 | 
28 |   - repo: https://github.com/asottile/seed-isort-config
29 |     rev: v2.2.0
30 |     hooks:
31 |       - id: seed-isort-config
32 |   - repo: https://github.com/pre-commit/mirrors-isort
33 |     rev: v5.10.1
34 |     hooks:
35 |       - id: isort
36 | 
37 |   - repo: https://github.com/pre-commit/mirrors-prettier
38 |     rev: v2.6.2
39 |     hooks:
40 |       - id: prettier
41 | 
42 |   - repo: https://github.com/nbQA-dev/nbQA
43 |     rev: 1.3.1
44 |     hooks:
45 |       - id: nbqa-black
46 |         additional_dependencies: [black==21.5b1]
47 |       - id: nbqa-pyupgrade
48 |         additional_dependencies: [pyupgrade==2.7.3]
49 |       - id: nbqa-isort
50 |         additional_dependencies: [isort==5.8.0]
51 | 
--------------------------------------------------------------------------------
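Typical usage of this configuration (standard pre-commit CLI, not shown in the repository): run `pre-commit install` once to register the git hook, then `pre-commit run --all-files` to apply every hook above to the entire repository.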
/.prettierrc.toml:
--------------------------------------------------------------------------------
1 | tabWidth = 2
2 | semi = false
3 | singleQuote = true
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xarray-tutorial
2 |
3 | [](https://github.com/andersy005/xarray-tutorial/actions/workflows/build.yml)
4 | [](https://mybinder.org/v2/gh/andersy005/xarray-tutorial/main?urlpath=lab)
5 |
6 | This repository contains materials for the xarray tutorial.
7 |
8 | ## Running the tutorial
9 |
10 | There are two different ways in which you can set up and go through the tutorial materials, both of which are outlined in the table below.
11 |
12 | | Method | Setup | Description |
13 | | :-----------: | :----------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
14 | | Binder | [](https://mybinder.org/v2/gh/andersy005/xarray-tutorial/main?urlpath=lab) | Run the tutorial notebooks on mybinder.org without installing anything locally. |
15 | | Local install | [Instructions](#local-installation-instructions)                                                                          | Download the tutorial notebooks and install the necessary packages (via `conda`) locally. Setting things up locally can take a few minutes, so we recommend going through the installation steps prior to the tutorial.  |
16 |
17 | ## Local installation instructions
18 |
19 | ### 1. Clone the repository
20 |
21 | First clone this repository to your local machine via:
22 |
23 | ```
24 | git clone https://github.com/andersy005/xarray-tutorial
25 | ```
26 |
27 | ### 2. Download conda (if you haven't already)
28 |
29 | If you do not already have the conda package manager installed, please follow the instructions [here](https://github.com/conda-forge/miniforge#install).
30 |
31 | ### 3. Create a conda environment
32 |
33 | Navigate to the `xarray-tutorial/` directory and create a new conda environment with the required
34 | packages via:
35 |
36 | ```bash
37 | cd xarray-tutorial
38 | conda env update --file binder/environment.yml
39 | ```
40 |
41 | This will create a new conda environment named "xarray-tutorial".
42 |
43 | ### 4. Activate the environment
44 |
45 | Next, activate the environment:
46 |
47 | ```
48 | conda activate xarray-tutorial
49 | ```
50 |
51 | ### 5. Download sample datasets
52 |
53 | To download sample datasets, run the `00-download-data.ipynb` notebook:
54 |
55 | ```bash
56 | cd notebooks/
57 | nbterm --run 00-download-data.ipynb
58 | ```
59 |
60 | ### 6. Launch JupyterLab
61 |
62 | Finally, launch JupyterLab with:
63 |
64 | ```
65 | jupyter lab
66 | ```
67 |
--------------------------------------------------------------------------------
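After step 4, a quick sanity check (an editor's sketch, not part of the repository) is to confirm the core packages import and report their versions:

```python
# Verify the activated xarray-tutorial environment is usable.
import dask
import xarray as xr

print("xarray:", xr.__version__)
print("dask:", dask.__version__)
```

If `nbterm` is unavailable, running `jupyter nbconvert --to notebook --execute --inplace 00-download-data.ipynb` should execute the download notebook headlessly as well.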
/binder/environment.yml:
--------------------------------------------------------------------------------
1 | name: xarray-tutorial
2 | channels:
3 |   - conda-forge
4 |   - nodefaults
5 | dependencies:
6 |   - cartopy
7 |   - cfgrib
8 |   - cftime < 1.5
9 |   - dask
10 |   - dask-labextension
11 |   - distributed
12 |   - h5netcdf
13 |   - hvplot
14 |   - ipywidgets
15 |   - jupyterlab-system-monitor
16 |   - jupyterlab>=3
17 |   - matplotlib
18 |   - nbterm
19 |   - nc-time-axis
20 |   - netcdf4
21 |   - nodejs
22 |   - pip
23 |   - pre-commit
24 |   - pydap
25 |   - python-graphviz
26 |   - python=3.9
27 |   - scipy
28 |   - watermark
29 |   - xarray>=2022.3.0
30 | 
--------------------------------------------------------------------------------
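The two version pins above can be checked at runtime; a hedged sketch, assuming the `packaging` library is available in the environment:

```python
# Assert the pins from binder/environment.yml hold in the active environment.
from packaging.version import Version

import cftime
import xarray as xr

assert Version(cftime.__version__) < Version("1.5")
assert Version(xr.__version__) >= Version("2022.3.0")
```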
/images/Dask Array (Light).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Array (Light).png
--------------------------------------------------------------------------------
/images/Dask Cluster Manager (Light)(1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Cluster Manager (Light)(1).png
--------------------------------------------------------------------------------
/images/Dask Dataframe (Light).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Dataframe (Light).png
--------------------------------------------------------------------------------
/images/Dask Overview (Light).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Dask Overview (Light).png
--------------------------------------------------------------------------------
/images/Distributed Overview (Light).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Distributed Overview (Light).png
--------------------------------------------------------------------------------
/images/Xarray-data-structures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/Xarray-data-structures.png
--------------------------------------------------------------------------------
/images/dask-task-stream.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/dask-task-stream.gif
--------------------------------------------------------------------------------
/images/dask_horizontal.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/dataset-diagram-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/dataset-diagram-logo.png
--------------------------------------------------------------------------------
/images/should-i-use-dask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/should-i-use-dask.png
--------------------------------------------------------------------------------
/images/xarray-split-apply-combine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/xarray-tutorial/ffb302785027b19447a75ea58990089e667f8ee7/images/xarray-split-apply-combine.png
--------------------------------------------------------------------------------
/notebooks/00-download-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "5d7665ed-2124-4773-b8db-5f88cd88ada9",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import gzip\n",
11 | "import pathlib\n",
12 | "import shutil\n",
13 | "\n",
14 | "import requests\n",
15 | "\n",
16 | "\n",
17 | "def download_data(url, data_dir=\"data\", unarchive=False, clobber=False):\n",
18 | " data_dir = pathlib.Path(data_dir)\n",
19 | " data_dir.mkdir(parents=True, exist_ok=True)\n",
20 | " local_filename = data_dir / url.split('/')[-1]\n",
21 | " if (local_filename.exists() and clobber) or not local_filename.exists():\n",
22 | " with requests.get(url, stream=True) as rstream:\n",
23 | " with local_filename.open(\"wb\") as f:\n",
24 | " shutil.copyfileobj(rstream.raw, f)\n",
25 | "\n",
26 | " if unarchive:\n",
27 | " local_filename_unarchived = data_dir / local_filename.stem\n",
28 | " if (\n",
29 | " local_filename_unarchived.exists() and clobber\n",
30 | " ) or not local_filename_unarchived.exists():\n",
31 | " with gzip.open(local_filename, \"rb\") as fin:\n",
32 | " with local_filename_unarchived.open(\"wb\") as fout:\n",
33 | " shutil.copyfileobj(fin, fout)\n",
34 | " return str(local_filename_unarchived)\n",
35 | " return local_filename"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "d8c529ad-183e-4c23-ac2f-ea41bbc6950c",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "%%time\n",
46 | "\n",
47 | "urls = [\n",
48 | " (\"http://download.ecmwf.int/test-data/cfgrib/era5-levels-members.grib\", False),\n",
49 | " (\"https://psl.noaa.gov/thredds/fileServer/Datasets/noaa.oisst.v2/sst.mnmean.nc\", False),\n",
50 | " (\n",
51 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Amon/tas/gn/v20190514/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\",\n",
52 | " False,\n",
53 | " ),\n",
54 | " (\n",
55 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Amon/ta/gn/v20190514/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\",\n",
56 | " False,\n",
57 | " ),\n",
58 | " (\n",
59 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Ofx/areacello/gr/v20190514/areacello_Ofx_CESM2_historical_r11i1p1f1_gr.nc\",\n",
60 | " False,\n",
61 | " ),\n",
62 | " (\n",
63 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Omon/tos/gr/v20190514/tos_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc\",\n",
64 | " False,\n",
65 | " ),\n",
66 | " (\n",
67 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r9i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r9i1p1f1_gr_200001-201412.nc\",\n",
68 | " False,\n",
69 | " ),\n",
70 | " (\n",
71 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r7i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r7i1p1f1_gr_200001-201412.nc\",\n",
72 | " False,\n",
73 | " ),\n",
74 | " (\n",
75 | " \"http://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r8i1p1f1/Omon/tos/gr/v20190311/tos_Omon_CESM2_historical_r8i1p1f1_gr_200001-201412.nc\",\n",
76 | " False,\n",
77 | " ),\n",
78 | "]\n",
79 | "for url, unarchive in urls:\n",
80 | " download_data(url, unarchive=unarchive)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "56d6f41f-7f98-4c2a-828c-4ae420cabe83",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "%load_ext watermark\n",
91 | "%watermark --time --python --updated --iversion"
92 | ]
93 | }
94 | ],
95 | "metadata": {
96 | "kernelspec": {
97 | "display_name": "Python 3 (ipykernel)",
98 | "language": "python",
99 | "name": "python3"
100 | },
101 | "language_info": {
102 | "codemirror_mode": {
103 | "name": "ipython",
104 | "version": 3
105 | },
106 | "file_extension": ".py",
107 | "mimetype": "text/x-python",
108 | "name": "python",
109 | "nbconvert_exporter": "python",
110 | "pygments_lexer": "ipython3",
111 | "version": "3.9.6"
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 5
116 | }
117 |
--------------------------------------------------------------------------------
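A hedged usage sketch (not in the notebook): with `download_data` defined as above, fetching a single file and opening it with xarray would look like this:

```python
# Download one sample dataset into ./data and confirm it opens.
import xarray as xr

local_file = download_data(
    "https://psl.noaa.gov/thredds/fileServer/Datasets/noaa.oisst.v2/sst.mnmean.nc"
)
ds = xr.open_dataset(local_file)
print(ds)
```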
/notebooks/01-xarray-fundamentals.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Xarray Fundamentals"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "---"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Learning Objectives\n",
22 | "\n",
23 | "- Provide an overview of xarray\n",
24 | "- Describe the core xarray data structures, the `DataArray` and the `Dataset`, and the components that make them up\n",
25 | "- Load xarray dataset from a netCDF file \n",
26 | "- Load xarray dataset from a GRIB file\n",
27 | "- Load xarray dataset from a remote dataset from a THREDDS server\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Prerequisites\n",
35 | "\n",
36 | "\n",
37 | "| Concepts | Importance | Notes |\n",
38 | "| --- | --- | --- |\n",
39 | "| Basic familiarity with NumPy | Necessary | |\n",
40 | "| Basic familiarity with Pandas | Helpful | |\n",
41 | "| [Understanding of NetCDF Data Model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html) | Helpful | Familiarity with metadata structure |\n",
42 | "\n",
43 | "\n",
44 | "- **Time to learn**: *15-20 minutes*\n",
45 | "\n"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "---"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "## Imports\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "import xarray as xr # \"canonical\" namespace short-hand"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## What is Xarray?\n",
76 | "\n",
77 | "Xarray is a Python library for working with **labeled**, **multi-dimensional** arrays. \n",
78 | "\n",
79 | "- Built on top of numpy and pandas \n",
80 | "- Brings the power of pandas to multidimensional arrays \n",
81 | "- Supports data of any dimensionality "
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Core Data Structures\n",
89 | "\n",
90 | "- Xarray has **two** main data structures:\n",
91 | " - `xarray.DataArray`: a fancy, labelled version of `numpy.ndarray` with associated coordinates. \n",
92 | " - `xarray.Dataset`: a collection of multiple `xarray.DataArray` that share the same coordinates and/or dimensions.\n",
93 | "\n",
94 | "---\n",
95 | "\n",
96 | ""
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Dataset"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "Xarray's interface is heavily inspired by the [netCDF data model](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html). Xarray's Dataset is designed as an in-memory representation of a netCDF dataset. \n",
111 | "\n",
112 | "\n",
113 | "#### Loading data from a netCDF file\n",
114 | "\n",
115 | "First, let's open a local netCDF file using the `xarray.open_dataset()` function:"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "%%time\n",
125 | "ds = xr.open_dataset(\n",
126 | " \"./data/tas_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n",
127 | ")"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "By default, `xarray.open_dataset()` uses **lazy loading**, i.e. it loads only the coordinate and attribute metadata, **not** the values of the data variables themselves. Data variable values are loaded only when they are actually accessed (e.g. when performing a calculation or slicing) or explicitly via the `.load()` method. \n",
135 | "\n",
136 | "Let's look at the HTML representation of the loaded dataset:"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "ds"
146 | ]
147 | },
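{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's aside (a hedged sketch, not in the original notebook):* to force eager loading of the lazily-opened data, call `.load()` on the whole dataset or on a single variable:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Uncomment to eagerly load values into memory:\n",
"# ds.load() # all data variables\n",
"# ds[\"tas\"].load() # a single variable"
]
},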
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "\n",
153 | "
\n",
154 | "
Text based representation
\n",
155 | " If you prefer a text based representation, you can set the display_style='text' by uncommenting the line below\n",
156 | "
\n"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "# xr.set_options(display_style=\"text\")"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "To look at the corresponding netCDF representation, we can use the `.info()` method:"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "ds.info()"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "Datasets have the following key properties:\n",
189 | "- `data_vars`: a dictionary of `DataArrays` corresponding to data variables \n",
190 | "- `dims`: a dictionary mapping from dimension names to the fixed length of each dimension (e.g. `{'time': 1815, 'nv': 2, 'latitude': 180, 'longitude': 360}`)\n",
191 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n",
192 | "- `attrs`: a dictionary holding arbitrary metadata pertaining to the dataset"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# variables that are in our dataset\n",
202 | "ds.data_vars"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "# dataset dimensions\n",
212 | "ds.dims"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "# dataset coordinates\n",
222 | "ds.coords"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "# dataset global attributes\n",
232 | "ds.attrs"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### DataArray\n",
240 | "\n",
241 | "The DataArray is xarray's implementation of a labeled, multi-dimensional array. It has several key properties:\n",
242 | "\n",
243 | "- `data`: a duck array (`numpy.ndarray`, [`dask.array`](https://docs.dask.org/en/latest/array.html), [`sparse`](https://sparse.pydata.org/en/stable/), or [`cupy.array`](https://docs.cupy.dev/en/stable/index.html)) holding the array's values. \n",
244 | "- `dims`: dimension names for each axis e.g. `(lat, lon, time)`\n",
245 | "- `coords`: a dictionary-like container of arrays (coordinates) that label each point (tick label) along our dimensions\n",
246 | "- `attrs`: a dictionary that holds arbitrary attributes/metadata (such as units). \n",
247 | "- `name`: an arbitrary name of the array"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Extract the tas variable (dataarray)\n",
257 | "ds[\"tas\"]"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "# ds[\"tas\"] is equivalent to ds.tas\n",
267 | "ds.tas"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "\n",
275 | "
You can use this dot-notation access only if the variable/DataArray name is a valid Python identifier, e.g. \"mydataset.1\" is not allowed. See here for an explanation of valid identifiers.</li>
\n",
283 | "
Some unexpected behavior may occur if the variable/DataArray name conflicts with an existing method name, e.g. using \"ds.min\" to refer to a variable called \"min\" collides with the \"min\" (minimum) xarray method, but \"ds['min']\" works fine.</li>
\n",
284 | "
\n",
285 | "
"
286 | ]
287 | },
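{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Editor's illustration of the caveat above (\"min\" is a hypothetical variable name):\n",
"# ds.min would resolve to the Dataset.min() aggregation method, not a variable,\n",
"# whereas ds[\"min\"] always refers to the variable.\n",
"ds.min # the bound aggregation method, since this dataset has no variable named \"min\""
]
},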
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "# The actual array data\n",
295 | "ds[\"tas\"].data"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "# dataarray coordinates\n",
305 | "ds[\"tas\"].coords"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "# dataarray attributes\n",
315 | "ds[\"tas\"].attrs"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "### Dimensions vs Coordinates\n",
323 | "\n",
324 | "- A dimension is just a name of an axis, like \"longitude\" or \"time\"\n",
325 | "- Labeled coordinates are tick labels along an axis, e.g. \"2021-06-08\"\n",
326 | "\n",
327 | "\n",
328 | "#### `repr` & HTML representation of dimensions with or without coordinates \n",
329 | "\n",
330 | "| Dimension | HTML repr | Text based repr |\n",
331 | "| --- | --- | --- |\n",
332 | "| with coordinates | **bold** | `*` symbol in `.coords` |\n",
333 | "| without coordinates | normal | listed explicitly |\n",
334 | "\n"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": null,
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "ds"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "with xr.set_options(display_style=\"text\"):\n",
353 | " print(ds)"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "\n",
361 | "\n",
362 | "### Loading data in other file formats \n",
363 | "\n",
364 | "\n",
365 | "#### Loading data from a GRIB file \n",
366 | "\n",
367 | "To load a GRIB file into an xarray Dataset, we use `xarray.open_dataset()` and specify `engine=\"cfgrib\"`. This requires the `cfgrib` package to be present in our Python environment:"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "ds = xr.open_dataset(\"./data/era5-levels-members.grib\", engine=\"cfgrib\")\n",
377 | "ds"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "slideshow": {
384 | "slide_type": "subslide"
385 | }
386 | },
387 | "source": [
388 | "#### Loading data from a remote OPeNDAP server \n",
389 | "\n",
390 | "\n",
391 | "If you happen to have access to netCDF datasets that are hosted remotely on a THREDDS server, you can point xarray at a URL and it will load/stream the data over the network without needing to download it locally. "
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {},
398 | "outputs": [],
399 | "source": [
400 | "url = \"http://crd-esgf-drc.ec.gc.ca/thredds/dodsC/esgD_dataroot/AR6/CMIP6/ScenarioMIP/CCCma/CanESM5/ssp126/r12i1p2f1/Amon/wap/gn/v20190429/wap_Amon_CanESM5_ssp126_r12i1p2f1_gn_201501-210012.nc\""
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "xr.open_dataset(url, engine=\"netcdf4\", chunks={})"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "---"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "%load_ext watermark\n",
426 | "%watermark --time --python --updated --iversion"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "## Summary \n",
434 | "\n",
435 | "\n",
436 | "- Xarray has two main data structures: DataArray and Dataset\n",
437 | "- DataArrays store the multi-dimensional arrays\n",
438 | "- Xarray is built on top of Numpy and Pandas and its architecture is heavily inspired by the netCDF data model"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "## Resources and References\n",
446 | "\n",
447 | "- [Xarray Documentation on Data Structures](http://xarray.pydata.org/en/latest/data-structures.html)\n",
448 | "- [Xarray Documentation on reading files and writing files](https://xarray.pydata.org/en/stable/io.html)\n",
449 | "- [cfgrib Documentation](https://github.com/ecmwf/cfgrib)"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "
\n",
131 | " \n",
132 | " but wait, what labels go with 20 and 40? Was that lat/lon or lon/lat? Where are the timestamps that go along with this time-series?\n",
133 | "
"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "## Different choices for indexing \n",
141 | "\n",
142 | "\n",
143 | "Xarray supports two kinds of indexing \n",
144 | "\n",
145 | "- Positional indexing via `.isel()`: provides primarily integer position based indexing (from `0` to `length-1` of the axis/dimension)\n",
146 | "- Label indexing via `.sel()`: provides primarily label based index\n",
147 | "\n",
148 | "Xarray's indexing methods preserve the coordinate labels and associated metadata.\n",
149 | "\n",
150 | "\n",
151 | "\n",
152 | "### Selection by position\n",
153 | "\n",
154 | "The `.isel()` method is the primary access method for **purely integer based indexing**. The following are valid inputs:\n",
155 | "- An integer e.g. `lat=10`\n",
156 | "- A list or array of integers `lon=[10, 20, 39]`\n",
157 | "- A slice object with integers e.g. `time=slice(2, 20)`"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "ds.tas.isel() # the original object i.e. no selection"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "ds.tas.isel(lat=100)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "ds.tas.isel(lat=100, time=[-2, -1])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "ds.tas.isel(lon=100, time=slice(10, 20))"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### Selection by label \n",
201 | "\n",
202 | "\n",
203 | "The `.sel()` method is the primary access method for **purely coordinate-label-based indexing**. The following are valid inputs:\n",
204 | "\n",
205 | "- A single coordinate label e.g. `time=\"2021-03-01\"`\n",
206 | "- A list or array of coordinate labels e.g. `time=[\"2021-01-01\", \"2021-03-10\", \"2021-03-12\"]`\n",
207 | "- A slice object with coordinate labels e.g. `time=slice(\"2021-01-01\", \"2021-03-01\")`. (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index!)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "ds.tas.sel(time=\"2013\")"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "It is also possible to use slice for the time dimension:"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "ds.tas.sel(time=slice(\"2013-01-01\", \"2014-12-31\"))"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "\n",
240 | "
\n",
241 | "
Warning: Be careful when working with floating coordinate labels
\n",
242 | " \n",
243 | " When we have integer, string, datetime-like values for coordinate labels, \"sel()\" works flawlessly. When we try to work with floating coordinate labels, things get a little tricky:\n",
244 | " \n",
245 | "
\n",
246 | "\n",
247 | "\n"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "ds.tas.sel(lat=39.5, lon=105.7)"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "### Nearest-neighbor lookups\n",
264 | "\n",
265 | "As shown above, when our coordinate labels are not integers or strings or datetime-like but floating point numbers, `.sel()` may throw a `KeyError`:"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "`ds.tas.sel(lat=39.5, lon=105.7)` fails because we are asking for an exact match on an approximate value: floating point numbers are represented approximately inside the computer, and xarray is unable to locate this exact value. To address this issue, xarray supports the `method` and `tolerance` keyword arguments. The `method` parameter enables nearest-neighbor (inexact) lookups via the methods `'pad', 'backfill' or 'nearest'`: "
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "ds.tas.sel(lat=39.5, lon=105.7, method='nearest')"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "So the closest location in the data was at `lat=39.11`, `lon=106.2`."
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/generated/xarray.DataArray.sel.html) for more on usage of `method` and `tolerance` parameters in `.sel()`. "
296 | ]
297 | },
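{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's sketch (not in the original notebook):* the `tolerance` keyword bounds how far a nearest lookup may go; if no coordinate label lies within the tolerance, a `KeyError` is raised:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Raises a KeyError if no grid point lies within 0.1 degrees of the target:\n",
"# ds.tas.sel(lat=39.5, lon=105.7, method=\"nearest\", tolerance=0.1)"
]
},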
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "
\n",
303 | "
Tip
\n",
304 | "Another way to use the nearest neighbor lookup is via slice objects. For example:\n",
305 | "
"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "ds.tas.sel(lat=slice(39, 39.5), lon=slice(106.1, 106.3))"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "Operators can be chained, so multiple operations can be performed sequentially. For example, to select an area of interest and the first time index:"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "ds.tas.isel(time=0).sel(lon=slice(20, 160), lat=slice(-80, 25))"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "### Interpolation\n",
338 | "\n",
339 | "If we want to interpolate along coordinates rather than looking up the nearest neighbors, we can use the `.interp()` method. Using `interp()` requires the `scipy` library. \n"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "ds.tas.interp(lat=[10, 10.1, 10.2], method='nearest')"
349 | ]
350 | },
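{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's sketch (not in the original notebook):* the default `method=\"linear\"` interpolates between grid points instead of snapping to them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds.tas.interp(lat=[10, 10.1, 10.2], method=\"linear\")"
]
},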
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "---"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "%load_ext watermark\n",
365 | "%watermark --time --python --updated --iversion"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "## Summary \n",
373 | "\n",
374 | "- Xarray’s named dimensions and labeled coordinates free the user from having to track positional ordering of dimensions when accessing data\n",
375 | "- Xarray provides a variety of methods for subsetting data via `.sel()`, `.isel()`, `.interp()` methods\n"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "## Resources and References\n",
383 | "\n",
384 | "- [Xarray Documentation - Indexing and Selecting Data](https://xarray.pydata.org/en/stable/indexing.html)\n",
385 | "- [Xarray Documentation - Interpolation](https://xarray.pydata.org/en/stable/user-guide/interpolation.html)\n"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "
\n",
125 | " We are selecting a single point, so `.sel()` requires either an exact location that exists in the data, or to specify method argument to tell it how to choose a location from the data. \n",
126 | "
"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "\n",
134 | "Let's say we want to compare plots of temperature at three different latitudes. We can use the `hue` keyword argument to do this."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "ds.tas.sel(lat=[-40, 0, 40], time=\"2013-03\", method=\"nearest\")"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "ds.tas.sel(lat=[-40, 0, 40], time=\"2013-03\", method=\"nearest\").plot(\n",
153 | " x=\"lon\", hue=\"lat\", figsize=(8, 6)\n",
154 | ");"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### 2D plots"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "Operator chaining means it is possible to have multiple selection operators and add `.plot()` to the end to visualize the result."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "ds.tas.isel(time=-10).sel(lon=slice(20, 160), lat=slice(-80, 25)).plot(robust=True, figsize=(8, 6));"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "The x- and y-axes are labeled with full names — \"Latitude\", \"Longitude\" — along with units. The colorbar has a nice label, again with units. And the title tells us the timestamp of the data presented."
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "# define keyword arguments that are passed to matplotlib.pyplot.colorbar\n",
194 | "colorbar_kwargs = {\n",
195 | " \"orientation\": \"horizontal\",\n",
196 | " \"label\": \"my custom label\",\n",
197 | " \"pad\": 0.2,\n",
198 | "}\n",
199 | "\n",
200 | "ds.tas.isel(lon=1).plot(\n",
201 | " x=\"time\", # coordinate to plot on the x-axis of the plot\n",
202 | " robust=True, # set colorbar limits to 2nd and 98th percentile of data\n",
203 | " cbar_kwargs=colorbar_kwargs,\n",
204 | ");"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "\n",
212 | "### Faceting\n",
213 | "\n",
214 | "Faceting is an effective way of visualizing variations of 3D data where 2D slices are visualized in a panel (subplot) and the third dimension is varied between panels (subplots)."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "ds.tas.sel(time=slice(\"2010\", \"2011\")).plot(col=\"time\", col_wrap=6, robust=True);"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/user-guide/plotting.html) for more on \"faceted\" plots or subplots."
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "### Histograms"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "ds.tas.plot();"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "### Bonus Plot \n",
254 | "\n",
255 | "Let's look at the air temperature data, but for **all pressure levels**. We are going to select the last time index and the longitude corresponding to the Himalayas and plot a vertical profile of the atmosphere from pole to pole:"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "ds_air_all_pressure_levels = xr.open_dataset(\n",
265 | " \"data/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n",
266 | ")\n",
267 | "ds_air_all_pressure_levels"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "data = ds_air_all_pressure_levels.ta.isel(time=-1).sel(lon=86.93, method='nearest')\n",
277 | "fig = data.plot(size=6, yincrease=False)\n",
278 | "fig.axes.set_title(\n",
279 | " f'Vertical profile of Temperature from pole to pole \\nat longitude = {data.lon.data} and time = {data.time.data}',\n",
280 | " size=15,\n",
281 | ");"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "
\n",
289 | "
\n",
290 | "
The yincrease=False option was used for the plot to invert the y-axis as pressure decreases with height
\n",
291 | "
We can make more complicated figures and/or make customizations to our plots by saving the returned object from .plot and accessing the .axes attribute of the returned object
\n",
292 | "
\n",
293 | "
"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "---"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "## Interactive visualization using `hvplot`\n",
308 | "\n",
309 | "Let's switch gears and look at how we can produce interactive plots via [holoviews](https://holoviews.org/). The holoviews plotting ecosystem provides the [hvplot](https://hvplot.holoviz.org/) package to allow easy visualization of xarray (and other) objects. These plots build on [Bokeh](https://docs.bokeh.org/en/latest/index.html) -- a Python library for creating interactive visualizations for web browsers.\n",
310 | "\n",
311 | "\n",
312 | "To enable the `.hvplot` interface on xarray object, let's import the `hvplot.xarray` module:"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "import hvplot.xarray"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "To use `hvplot` instead of `matplotlib`, we use the `.hvplot()` method:"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "ds.tas.hvplot()"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "As you can see, calling `.hvplot()` behaves the same as `.plot()` i.e. it uses the same heuristics as `.plot()`. In this case, it produces a histogram for data with more than two dimensions. To plot a `pcolormesh`, let's reduce the dimensionality of our data to 2D and call `.hvplot()` again:"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "ds.tas.isel(time=1).hvplot(cmap=\"fire\")"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "ds.tas.isel(time=-1, lon=100).hvplot()"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": null,
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "ds.tas.sel(lat=28.5, lon=83.9, method='nearest').hvplot()"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "So far we have had to subset our data in order to produce plots. `hvplot` provides convenient functionality for producing plots on-demand via interactive widgets. Let's create a series of 2D plots, one for each time slice. We will use the `groupby` parameter to let hvplot know that we want to create a widget (a slider) for the time dimension:"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "ds.tas.hvplot(groupby=\"time\", clim=(ds.tas.min(), ds.tas.max()), cmap='turbo')"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "Let's add more customizations to our time widget:"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "ds.tas.hvplot(\n",
404 | " groupby=\"time\",\n",
405 | " clim=(ds.tas.min(), ds.tas.max()),\n",
406 | " cmap=\"turbo\",\n",
407 | " widget_type=\"scrubber\",\n",
408 | " widget_location=\"bottom\",\n",
409 | ")"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "---"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "%load_ext watermark\n",
426 | "%watermark --time --python --updated --iversion"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "## Summary \n",
434 | "\n",
435 | "- Xarray has plotting functionality that is a thin wrapper around the Matplotlib library\n",
436 | "- Xarray uses syntax and function names from Matplotlib whenever possible\n",
437 | "- Hvplot provides a neat interface to xarray for creating interactive plots"
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {},
443 | "source": [
444 | "## Resources and References\n",
445 | "\n",
446 | "- [Hvplot Documentation](https://hvplot.holoviz.org/index.html)\n",
447 | "- [Xarray Documentation - Plotting](https://xarray.pydata.org/en/stable/user-guide/plotting.html)\n",
448 | "- [Matplolib Documentation](https://matplotlib.org/stable/contents.html)\n",
449 | "\n",
450 | "
\n",
451 | "
Geocat-examples Gallery
\n",
452 | " For geo-science specific visualization examples, please see the geocat-examples gallery which resides here.\n",
453 | "
Lazy evaluation: objects are evaluated just in time when the results are needed
\n",
80 | " \n",
81 | "
Eager evaluation: objects are evaluated in real time regardless if the results are needed immediately or not
\n",
82 | "
\n",
83 | "
\n",
84 | " \n"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "id": "9587dd62-6873-4056-b606-5806f64e11e7",
90 | "metadata": {},
91 | "source": [
92 | "## Advantages of using Dask\n",
93 | "\n",
94 | "- **Familiarity**: Dask collections such as Dask Array, Dask DataFrames provide decent NumPy and Pandas compatible APIs.\n",
95 | "- **Responsive**: Dask is designed with interactive computing in mind. \n",
96 | " - It provides rapid feedback and diagnostics to aid humans\n",
97 | "- **Scale up and scale down**: It scales well from single machine (laptop) to clusters (100s of machines)\n",
98 | " - This ease of transition between single machine to moderate clusters makes it easy for users to prototype their workflows on their local machines and seamlessy transition to a cluster when needed. \n",
99 | " - This also gives users a lot of flexibility when choosing the best to deploy and run their workflows. \n",
100 | "- **Flexibility**: Dask supports interfacing with popular cluster resource managers such as PBS/SLURM/Kubernetes, etc.. with a minimal amount of effort\n",
101 | "\n",
102 | ""
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "ea2c600b-8fb5-4dfb-aede-d85870fbb9bf",
108 | "metadata": {},
109 | "source": [
110 | "## Task Graphs\n",
111 | "\n",
112 | "Dask represents distributed/parallel computations with task graphs, more specifically [directed acyclic graphs](https://en.wikipedia.org/wiki/Directed_acyclic_graph).\n",
113 | "\n",
114 | "- A task is a function that you want to call and its corresponding inputs\n",
115 | "- A task graph is a collection of (1) the functions we want to call + their inputs (2) their dependencies. \n",
116 | "\n",
117 | "\n",
118 | "Directed acyclic graphs are made up of nodes and have a clearly defined start and end, a single traversal path, and no looping \n",
119 | "\n",
120 | ""
121 | ]
122 | },
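123 | {
124 | "cell_type": "markdown",
125 | "id": "task-graph-delayed-sketch",
126 | "metadata": {},
127 | "source": [
128 | "As a minimal sketch of how a task graph gets built in practice (assuming only that `dask` is installed; the functions here are toy examples), we can record a tiny graph with `dask.delayed` and inspect it before running it:\n",
129 | "\n",
130 | "```python\n",
131 | "import dask\n",
132 | "\n",
133 | "\n",
134 | "@dask.delayed\n",
135 | "def inc(x):\n",
136 | "    return x + 1\n",
137 | "\n",
138 | "\n",
139 | "@dask.delayed\n",
140 | "def add(x, y):\n",
141 | "    return x + y\n",
142 | "\n",
143 | "\n",
144 | "a = inc(1)  # no work happens yet; we only record a task\n",
145 | "b = inc(2)\n",
146 | "c = add(a, b)  # c depends on a and b -- a small DAG\n",
147 | "\n",
148 | "c.visualize()  # draw the task graph (requires graphviz)\n",
149 | "c.compute()  # walk the graph and return 5\n",
150 | "```"
151 | ]
152 | },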
123 | {
124 | "cell_type": "markdown",
125 | "id": "45775347-0f7e-4882-a5a2-4bc0ce60791a",
126 | "metadata": {},
127 | "source": [
128 | "---"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "id": "238a4c70-964a-4667-b399-26e308ea4841",
134 | "metadata": {},
135 | "source": [
136 | "## Resources and references\n",
137 | "\n",
138 | "* Reference\n",
139 | " * [Docs](https://dask.org/)\n",
140 | " * [Examples](https://examples.dask.org/)\n",
141 | " * [Code](https://github.com/dask/dask/)\n",
142 | " * [Blog](https://blog.dask.org/)\n",
143 | "* Ask for help\n",
144 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n",
145 | " * [github discussions](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n",
146 | " * [github issues](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n",
147 | " \n",
148 | " \n",
149 | "
By default, open_mfdataset() will chunk each netCDF file into a single Dask array; supply the chunks argument to control the size of the resulting Dask arrays.
\n",
343 | "
In more complex cases, you can open each file individually using open_dataset(..., chunks={...}) and merge the results into a single dataset.
\n",
344 | "
Passing the keyword argument parallel=True to open_mfdataset() will speed up the reading of large multi-file datasets by executing those read tasks in parallel using dask.delayed.
"
500 | ]
501 | }
502 | ],
503 | "metadata": {
504 | "kernelspec": {
505 | "display_name": "Python 3",
506 | "language": "python",
507 | "name": "python3"
508 | },
509 | "language_info": {
510 | "codemirror_mode": {
511 | "name": "ipython",
512 | "version": 3
513 | },
514 | "file_extension": ".py",
515 | "mimetype": "text/x-python",
516 | "name": "python",
517 | "nbconvert_exporter": "python",
518 | "pygments_lexer": "ipython3",
519 | "version": "3.9.6"
520 | }
521 | },
522 | "nbformat": 4,
523 | "nbformat_minor": 5
524 | }
525 |
--------------------------------------------------------------------------------
/notebooks/11-dask-distributed.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0c247eff-7187-4de9-93a9-1c6a60db569b",
6 | "metadata": {},
7 | "source": [
8 | ""
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "55accf7f-c02b-4aae-b1a4-a7a613df03ba",
14 | "metadata": {},
15 | "source": [
16 | "# Distributed"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "6821b7b0-4c44-4004-bce4-8d704ed26545",
22 | "metadata": {},
23 | "source": [
24 | "## Learning Objectives \n",
25 | "\n",
26 | "- Use single machine Dask schedulers\n",
27 | "- Deploy a local Dask Distributed Cluster and access the diagnostics dashboard\n",
28 | "\n",
29 | "\n",
30 | "## Prerequisites\n",
31 | "\n",
32 | "\n",
33 | "| Concepts | Importance | Notes |\n",
34 | "| --- | --- | --- |\n",
35 | "| Familiarity with Python | Necessary | |\n",
36 | "| Familiarity with Dask Fundamentals | Necessary | |\n",
37 | "\n",
38 | "\n",
39 | "- **Time to learn**: *25-35 minutes*\n"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "id": "a43f990d-100d-43c9-8fb1-e8876b34d3b4",
45 | "metadata": {},
46 | "source": [
47 | "## Dask Schedulers\n",
48 | "\n",
49 | "As we have seen so far, Dask allows you to simply construct graphs of tasks with dependencies, as well as have graphs created automatically for you using functional, Numpy or Xarray syntax on data collections. None of this would be very useful, if there weren't also a way to execute these graphs, in a parallel and memory-aware way. So far we have been calling `thing.compute()` or `dask.compute(thing)` without worrying what this entails. Now we will discuss the options available for that execution, and in particular, the distributed scheduler, which comes with additional functionality.\n",
50 | "\n",
51 | "Dask comes with four available schedulers:\n",
52 | "\n",
53 | "- \"threaded\" (aka \"threading\"): a scheduler backed by a thread pool\n",
54 | "- \"processes\": a scheduler backed by a process pool\n",
55 | "- \"single-threaded\" (aka \"sync\"): a synchronous scheduler, good for debugging\n",
56 | "- distributed: a distributed scheduler for executing graphs on multiple machines, see below.\n",
57 | "\n",
58 | "To select one of these for computation, you can specify at the time of asking for a result, e.g.,\n",
59 | "```python\n",
60 | "myvalue.compute(scheduler=\"single-threaded\") # for debugging\n",
61 | "```\n",
62 | "\n",
63 | "You can also set a default scheduler either temporarily\n",
64 | "```python\n",
65 | "with dask.config.set(scheduler='processes'):\n",
66 | " # set temporarily for this block only\n",
67 | " # all compute calls within this block will use the specified scheduler\n",
68 | " myvalue.compute()\n",
69 | " anothervalue.compute()\n",
70 | "```\n",
71 | "\n",
72 | "Or globally\n",
73 | "```python\n",
74 | "# set until further notice\n",
75 | "dask.config.set(scheduler='processes')\n",
76 | "```\n",
77 | "\n",
78 | "Let's try out a few schedulers on the Sea Surface Temperature data."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "id": "cfb03e79-dd85-44cb-affb-8864712cbcd5",
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "import pathlib\n",
89 | "\n",
90 | "import dask\n",
91 | "import xarray as xr"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "id": "b7b94f32-edbc-4a89-96ce-d92d4de3fdb3",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "data_dir = pathlib.Path(\"data/\")\n",
102 | "files = sorted(data_dir.glob(\"tos_Omon_CESM2*\"))\n",
103 | "files"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "ef83e9e5-9c43-4b0f-95d5-e759c653b5a6",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "dset = xr.open_mfdataset(\n",
114 | " sorted(files),\n",
115 | " concat_dim='ensemble_member',\n",
116 | " combine=\"nested\",\n",
117 | " parallel=True,\n",
118 | " data_vars=['tos'],\n",
119 | " engine=\"netcdf4\",\n",
120 | " chunks={'time': 90},\n",
121 | ")\n",
122 | "# Add coordinate labels for the newly created `ensemble_member` dimension\n",
123 | "dset[\"ensemble_member\"] = ['r11i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1']\n",
124 | "dset"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "id": "a414c9f2-98f4-4873-b9c9-badf018fde43",
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "# Compute anomaly\n",
135 | "gb = dset.tos.groupby('time.month')\n",
136 | "tos_anom = gb - gb.mean(dim='time')\n",
137 | "tos_anom"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "id": "c386bad9-4d1f-4b24-be6f-58d76832093a",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "# each of the following gives the same results (you can check!)\n",
148 | "# any surprises?\n",
149 | "import time\n",
150 | "\n",
151 | "for sch in ['threading', 'processes', 'sync']:\n",
152 | " t0 = time.time()\n",
153 | " r = tos_anom.compute(scheduler=sch)\n",
154 | " t1 = time.time()\n",
155 | " print(f\"{sch:>10}, {t1 - t0:0.4f} s; {r.min().data, r.max().data, r.mean().data}\")"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "id": "8fd1f05d-73b1-4b3d-a91f-e18bd8848ff6",
162 | "metadata": {
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "dask.visualize(tos_anom)"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "519d9129-8bd8-40d7-b3b9-08c30616096a",
173 | "metadata": {},
174 | "source": [
175 | "### Some Questions to Consider:\n",
176 | "\n",
177 | "- How much speedup is possible for this task (hint, look at the graph).\n",
178 | "- Given how many cores are on this machine, how much faster could the parallel schedulers be than the single-threaded scheduler.\n",
179 | "- How much faster was using threads over a single thread? Why does this differ from the optimal speedup?\n",
180 | "- Why is the multiprocessing scheduler so much slower here?\n",
181 | "\n",
182 | "The `threaded` scheduler is a fine choice for working with large datasets out-of-core on a single machine, as long as the functions being used release the [Python Global Interpreter Lock (GIL)](https://wiki.python.org/moin/GlobalInterpreterLock) most of the time. NumPy and pandas release the GIL in most places, so the `threaded` scheduler is the default for `dask.array` and `dask.dataframe`. The distributed scheduler, perhaps with `processes=False`, will also work well for these workloads on a single machine.\n",
183 | "\n",
184 | "For workloads that do hold the GIL, as is common with `dask.bag` and custom code wrapped with `dask.delayed`, we recommend using the distributed scheduler, even on a single machine. Generally speaking, it's more intelligent and provides better diagnostics than the `processes` scheduler.\n",
185 | "\n",
186 | "
\n",
187 | "
What Is the Python Global Interpreter Lock (GIL)?
\n",
188 | " The Python Global Interpreter Lock or GIL, in simple words, is a mutex (or a lock) that allows only one thread to hold the control of the Python interpreter.\n",
189 | " \n",
190 | " See this blog post for more details on Python GIL.\n",
191 | "
\n",
192 | "\n",
193 | "\n",
194 | "\n",
195 | "https://docs.dask.org/en/latest/scheduling.html provides some additional details on choosing a scheduler.\n",
196 | "\n",
197 | "For scaling out work across a cluster, the distributed scheduler is required."
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "id": "0038ec01-d8d9-49b6-bca3-2745c84bb9fa",
203 | "metadata": {},
204 | "source": [
205 | "## Making a cluster"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "id": "717d438d-dbe1-4c76-9905-3c46af2183b4",
211 | "metadata": {},
212 | "source": [
213 | "### Simple method\n",
214 | "\n",
215 | "The `dask.distributed` system is composed of a single centralized scheduler and one or more worker processes. [Deploying](https://docs.dask.org/en/latest/setup.html) a remote Dask cluster involves some additional effort. But doing things locally is just involves creating a `LocalCluster` object and connecting this object to a `Client` object, which lets you interact with the \"cluster\" (local threads or processes on your machine). For more information see [here](https://docs.dask.org/en/latest/setup/single-distributed.html). \n",
216 | "\n",
217 | "\n",
218 | "\n",
219 | "Note that `LocalCluster()` takes a lot of optional [arguments](https://distributed.dask.org/en/latest/local-cluster.html#api), to configure the number of processes/threads, memory limits and other "
220 | ]
221 | },
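222 | {
223 | "cell_type": "markdown",
224 | "id": "localcluster-config-sketch",
225 | "metadata": {},
226 | "source": [
227 | "As a minimal sketch, a customized local cluster might look like this (the specific numbers are illustrative, not recommendations):\n",
228 | "\n",
229 | "```python\n",
230 | "from dask.distributed import Client, LocalCluster\n",
231 | "\n",
232 | "cluster = LocalCluster(\n",
233 | "    n_workers=2,  # number of worker processes\n",
234 | "    threads_per_worker=2,  # threads within each worker\n",
235 | "    memory_limit=\"2GB\",  # memory cap per worker\n",
236 | ")\n",
237 | "client = Client(cluster)\n",
238 | "```"
239 | ]
240 | },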
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "280d8409-4aaf-444b-9b20-e67e9999c806",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "from dask.distributed import Client, LocalCluster\n",
230 | "\n",
231 | "# Setup a local cluster.\n",
232 | "# By default this sets up 1 worker per CPU core\n",
233 | "\n",
234 | "cluster = LocalCluster()\n",
235 | "client = Client(cluster)\n",
236 | "client"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "id": "9531d51f-3b21-4aa7-9beb-a8b0a550061b",
242 | "metadata": {},
243 | "source": [
244 | "**Note:**\n",
245 | "\n",
246 | "This code\n",
247 | "\n",
248 | "```python\n",
249 | "cluster = LocalCluster()\n",
250 | "client = Client(cluster)\n",
251 | "```\n",
252 | "\n",
253 | "is equivalent to \n",
254 | "\n",
255 | "```python\n",
256 | "client = Client()\n",
257 | "```"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "13c70a4a-437b-4126-ace8-90bfffa2b8e5",
263 | "metadata": {},
264 | "source": [
265 | "If you aren't in jupyterlab and using the `dask-labextension`, be sure to click the `Dashboard` link to open up the diagnostics dashboard.\n",
266 | "\n"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "id": "b1471997-4a70-40a3-bb52-13e2a0d7f498",
272 | "metadata": {},
273 | "source": [
274 | "## Distributed Dask clusters for HPC and Cloud environments\n",
275 | "\n",
276 | "Dask can be deployed on distributed infrastructure, such as a an HPC system or a cloud computing system. There is a growing ecosystem of Dask deployment projects that faciliate easy deployment and scaling of Dask clusters on a wide variety of computing systems.\n",
277 | "\n",
278 | "### HPC\n",
279 | "\n",
280 | "#### Dask Jobqueue (https://jobqueue.dask.org/)\n",
281 | "\n",
282 | "- `dask_jobqueue.PBSCluster`\n",
283 | "- `dask_jobqueue.SlurmCluster`\n",
284 | "- `dask_jobqueue.LSFCluster`\n",
285 | "- etc.\n",
286 | "\n",
287 | "#### Dask MPI (https://mpi.dask.org/)\n",
288 | "\n",
289 | "- `dask_mpi.initialize`\n",
290 | "\n",
291 | "### Cloud\n",
292 | "\n",
293 | "#### Dask Kubernetes (https://kubernetes.dask.org/)\n",
294 | "\n",
295 | "- `dask_kubernetes.KubeCluster`\n",
296 | "\n",
297 | "#### Dask Cloud Provider (https://cloudprovider.dask.org)\n",
298 | "\n",
299 | "- `dask_cloudprovider.FargateCluster`\n",
300 | "- `dask_cloudprovider.ECSCluster`\n",
301 | "- `dask_cloudprovider.ECSCluster`\n",
302 | "\n",
303 | "#### Dask Gateway (https://gateway.dask.org/)\n",
304 | "\n",
305 | "- `dask_gateway.GatewayCluster`\n"
306 | ]
307 | },
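308 | {
309 | "cell_type": "markdown",
310 | "id": "jobqueue-usage-sketch",
311 | "metadata": {},
312 | "source": [
313 | "As a brief sketch of how these deployment projects are used (the resource requests below are purely illustrative and must be adapted to your system), a `dask_jobqueue.PBSCluster` can be created and scaled like this:\n",
314 | "\n",
315 | "```python\n",
316 | "from dask.distributed import Client\n",
317 | "from dask_jobqueue import PBSCluster\n",
318 | "\n",
319 | "cluster = PBSCluster(\n",
320 | "    cores=36,  # cores per batch job\n",
321 | "    memory=\"100GB\",  # memory per batch job\n",
322 | "    queue=\"regular\",  # queue to submit worker jobs to\n",
323 | "    walltime=\"01:00:00\",\n",
324 | ")\n",
325 | "cluster.scale(jobs=2)  # submit two worker jobs to the queue\n",
326 | "client = Client(cluster)\n",
327 | "```"
328 | ]
329 | },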
308 | {
309 | "cell_type": "markdown",
310 | "id": "a2e3f65e-6836-4fba-9df7-58fe9f48aea4",
311 | "metadata": {},
312 | "source": [
313 | "## Executing with the distributed client\n",
314 | "\n",
315 | "Consider some calculation, such as we've used before, where we computed anomaly per ensemble member"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "id": "ff6a4027-e21a-4b47-b6a3-349d0c8155be",
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "tos_anom"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "id": "9a07be93-4b44-4def-85fb-e510306a8f52",
331 | "metadata": {},
332 | "source": [
333 | "By default, creating a `Client` makes it the default scheduler. Any calls to `.compute` will use the cluster your `client` is attached to, unless you specify otherwise, as above.\n"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "007d949a-25a8-40a0-bac2-a43ac1341056",
339 | "metadata": {},
340 | "source": [
341 | "The tasks will appear in the web UI as they are processed by the cluster and, eventually, a result will be printed as output of the cell above. Note that the kernel is blocked while waiting for the result.\n",
342 | "\n",
343 | "You can also see a simplified version of the graph being executed on Graph pane of the dashboard, so long as the calculation is in-flight.\n",
344 | "\n",
345 | "\n",
346 | "Let's return to the anomaly computation from before, and see what happens on the dashboard (you may wish to have both the notebook and dashboard side-by-side). How does this perform compared to before?"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "id": "738ada56-dff4-4fe0-9b38-0dc7342adf9e",
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "%time tos_anom.compute()"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "id": "3b7fd256-fa96-4c7d-b981-90d31f043e03",
362 | "metadata": {},
363 | "source": [
364 | "In this particular case, this should be as fast or faster than the best case, threading, above. Why do you suppose this is? You should start your reading [here](https://distributed.dask.org/en/latest/index.html#architecture), and in particular note that the distributed scheduler was a complete rewrite with more intelligence around sharing of intermediate results and which tasks run on which worker. This will result in better performance in *some* cases, but still larger latency and overhead compared to the threaded scheduler, so there will be rare cases where it performs worse. Fortunately, the dashboard now gives us a lot more [diagnostic information](https://distributed.dask.org/en/latest/diagnosing-performance.html). Look at the Profile page of the dashboard to find out what takes the biggest fraction of CPU time for the computation we just performed?"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "id": "14a4082d-b8a9-4843-8f93-8ff1f035deb6",
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "cluster.close()\n",
375 | "client.close()"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "id": "ca42fc1e-6c86-4568-8a95-d03e00cdf8fb",
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "%load_ext watermark\n",
386 | "%watermark --time --python --updated --iversion"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "id": "721d405f-ed11-4e00-8eab-f1382b2848d3",
392 | "metadata": {},
393 | "source": [
394 | "---"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "id": "1cf640f8-673e-4f13-a6e2-9b5ff6aa315e",
400 | "metadata": {},
401 | "source": [
402 | "## Learn More\n",
403 | "\n",
404 | "If all you want to do is execute computations created using delayed, or run calculations based on the higher-level data collections, then that is about all you need to know to scale your work up to cluster scale. However, there is more detail to know about the distributed scheduler that will help with efficient usage. See this tutorial on advanced features of Distributed: https://tutorial.dask.org/06_distributed_advanced.html."
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "id": "9d7ebf3a-edab-4cdd-9f85-f8bba94fb000",
410 | "metadata": {},
411 | "source": [
412 | "## Resources and references\n",
413 | "\n",
414 | "* Reference\n",
415 | " * [Dask Docs](https://dask.org/)\n",
416 | " * [Dask Blog](https://blog.dask.org/)\n",
417 | " * [Xarray Docs](https://xarray.pydata.org/)\n",
418 | " \n",
419 | "* Ask for help\n",
420 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n",
421 | " * [github discussions (dask):](https://github.com/dask/dask/discussions) for general, non-bug, discussion, and usage questions\n",
422 | " * [github issues (dask): ](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n",
423 | " * [github discussions (xarray): ](https://github.com/pydata/xarray/discussions) for general, non-bug, discussion, and usage questions\n",
424 | " * [github issues (xarray): ](https://github.com/pydata/xarray/issues/new) for bug reports and feature requests\n",
425 | " \n",
426 | "* Pieces of this notebook are adapted from the following sources\n",
427 | " * https://github.com/dask/dask-tutorial/blob/main/05_distributed.ipynb\n",
428 | " * https://github.com/xarray-contrib/xarray-tutorial/blob/master/scipy-tutorial/05_intro_to_dask.ipynb\n",
429 | " \n",
430 | " \n",
431 | " \n",
432 | "
You can use this dot notation access only if the variable/datarray name is a valid Python identifier, e.g. \"mydataset.1\" is not allowed. See here for an explanation of valid identifiers.
\n",
273 | "
Some unexpected behavior may occur if the variable/datarray name conflicts with an existing method name, e.g. Using \"ds.min\" to refer to a variable called \"min\" collides with the \"min\" (minimum) xarray method, but \"ds['min']\" works fine.
\n",
126 | " \n",
127 | " but wait, what labels go with 20 and 40? Was that lat/lon or lon/lat? Where are the timestamps that go along with this time-series?\n",
128 | "
"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## Different choices for indexing \n",
136 | "\n",
137 | "\n",
138 | "Xarray supports two kinds of indexing \n",
139 | "\n",
140 | "- Positional indexing via `.isel()`: provides primarily integer position based index (from `0` to `length-1` of the axis/dimension\n",
141 | "- Label indexing via `.sel()`: provides primarily label based index\n",
142 | "\n",
143 | "Xarray's indexing methods preserves the coordinate labels and associated metadata.\n",
144 | "\n",
145 | "\n",
146 | "\n",
147 | "### Selection by position\n",
148 | "\n",
149 | "The `.isel()` method is the primary access method for **purely integer based indexing**. The following are valid inputs:\n",
150 | "- An integer e.g. `lat=10`\n",
151 | "- A list or array of integers `lon=[10, 20, 39]`\n",
152 | "- A slice object with integers e.g. `time=slice(2, 20)`"
153 | ]
154 | },
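155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "For instance (assuming the tutorial's `ds` dataset with its `tas` variable is loaded), the following are all valid integer-based selections; try variations of them in the empty cells below:\n",
160 | "\n",
161 | "```python\n",
162 | "ds.tas.isel(time=0)  # a single integer\n",
163 | "ds.tas.isel(lon=[10, 20, 39])  # a list of integers\n",
164 | "ds.tas.isel(time=slice(2, 20))  # an integer slice\n",
165 | "```"
166 | ]
167 | },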
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "# the original object i.e. no selection"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": []
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "### Selection by label \n",
190 | "\n",
191 | "\n",
192 | "The `.sel()` method is the primary access method for **purely coordinate label based indexing.**. The following are valid inputs:\n",
193 | "\n",
194 | "- A single coordinate label e.g. `time=\"2021-03-01\"`\n",
195 | "- A list or array of coordinate labels `lon=[=\"2021-01-01\", =\"2021-03-10\", =\"2021-03-12\"]`\n",
196 | "- A slice object with coordinate labels e.g. `time=slice(\"2021-01-01\", \"2021-03-01\")`. (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index!)"
197 | ]
198 | },
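199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "For instance (the dates below are placeholders; substitute labels that actually exist in your data's time index):\n",
204 | "\n",
205 | "```python\n",
206 | "ds.tas.sel(time=\"2014-01-15\")  # a single label\n",
207 | "ds.tas.sel(time=slice(\"2014-01-01\", \"2014-03-01\"))  # a label slice; both ends are included\n",
208 | "```"
209 | ]
210 | },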
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": []
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "It is also possible to use slice for the time dimension:"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": []
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "\n",
225 | "
\n",
226 | "
Warning: Be careful when working with floating coordinate labels
\n",
227 | " \n",
228 | " When we have integer, string, datetime-like values for coordinate labels, \"sel()\" works flawlessly. When we try to work with floating coordinate labels, things get a little tricky:\n",
229 | " \n",
230 | "
\n",
231 | "\n",
232 | "\n"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": []
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "### Nearest-neighbor lookups\n",
247 | "\n",
248 | "As shown above, when our coordinate labels are not integers or strings or datetime-like but floating point numbers, `.sel()` may throw a `KeyError`:"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "`ds.tas.sel(lat=39.5, lon=105.7)` fails because we are trying to use a conditional for an approximate value i.e floating numbers are represented approximately inside the computer, and xarray is unable to locate this exact value. To address this issue, xarray supports `method` and `tolerance` keyword argument. The `method` parameter allows for enabling nearest neighbor (inexact) lookups by use of the methods `'pad', 'backfill' or 'nearest'`: "
256 | ]
257 | },
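258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "A sketch of what this looks like for the point above:\n",
263 | "\n",
264 | "```python\n",
265 | "ds.tas.sel(lat=39.5, lon=105.7, method=\"nearest\")  # closest grid point\n",
266 | "ds.tas.sel(lat=39.5, lon=105.7, method=\"nearest\", tolerance=2)  # error if nothing within 2 degrees\n",
267 | "```"
268 | ]
269 | },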
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": []
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "So the closest location in the data was at `lat=39.11`, `lon=106.2`."
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/generated/xarray.DataArray.sel.html) for more on usage of `method` and `tolerance` parameters in `.sel()`. "
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "
\n",
284 | "
Tip
\n",
285 | "Another way to use the nearest neighbor lookup is via slice objects. For e.g.:\n",
286 | "
"
287 | ]
288 | },
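289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "One possible version (the bounds here are illustrative):\n",
294 | "\n",
295 | "```python\n",
296 | "ds.tas.sel(lat=slice(39, 40), lon=slice(105, 107))  # everything that falls within these ranges\n",
297 | "```"
298 | ]
299 | },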
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": []
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "Operators can be chained, so multiple operations can be peformed sequentially. For example, to select an area of interest and the first time index"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": []
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "### Interpolation\n",
315 | "\n",
316 | "If we want to interpolate along coordinates rather than looking up the nearest neighbos, we can use the `.interp()` method. To use `interp()` requires the presence of `scipy` library. \n"
317 | ]
318 | },
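319 | {
320 | "cell_type": "markdown",
321 | "metadata": {},
322 | "source": [
323 | "For example, to estimate values between grid points (the coordinates here are illustrative):\n",
324 | "\n",
325 | "```python\n",
326 | "ds.tas.interp(lat=39.5, lon=105.7)  # linear interpolation to an off-grid point\n",
327 | "```"
328 | ]
329 | },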
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": []
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {},
329 | "source": [
330 | "---"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "%load_ext watermark\n",
340 | "%watermark --time --python --updated --iversion"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "## Summary \n",
348 | "\n",
349 | "- Xarray’s named dimensions and labeled coordinates free the user from having to track positional ordering of dimensions when accessing data\n",
350 | "- Xarray provides a variety of methods for subsetting data via `.sel()`, `.isel()`, `.interp()` methods\n"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "## Resources and References\n",
358 | "\n",
359 | "- [Xarray Documentation - Indexing and Selecting Data](https://xarray.pydata.org/en/stable/indexing.html)\n",
360 | "- [Xarray Documentation - Interpolation](https://xarray.pydata.org/en/stable/user-guide/interpolation.html)\n"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "
\n",
123 | " We are selecting a single point, so `.sel()` requires either an exact location that exists in the data, or to specify method argument to tell it how to choose a location from the data. \n",
124 | "
"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "\n",
132 | "Lets say we want to compare plots of temperature at three different latitudes. We can use the `hue` keyword argument to do this."
133 | ]
134 | },
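135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "One way to do this (the latitudes are chosen arbitrarily for illustration):\n",
140 | "\n",
141 | "```python\n",
142 | "ds.tas.sel(lon=100, lat=[20, 40, 60], method=\"nearest\").plot(x=\"time\", hue=\"lat\");\n",
143 | "```"
144 | ]
145 | },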
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": []
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": []
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "### 2D plots"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "Operator chaining means it is possible to have multiple selection operators and add `.plot()` to the end to visualise the result"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": []
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "The x- and y-axes are labeled with full names — \"Latitude\", \"Longitude\" — along with units. The colorbar has a nice label, again with units. And the title tells us the timestamp of the data presented."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "# define keyword arguments that are passed to matptolib.pyplot.colorbar\n",
184 | "colorbar_kwargs = {\n",
185 | " \"orientation\": \"horizontal\",\n",
186 | " \"label\": \"my clustom label\",\n",
187 | " \"pad\": 0.2,\n",
188 | "}\n",
189 | "\n",
190 | "ds.tas.isel(lon=1).plot(\n",
191 | " x=\"time\", # coordinate to plot on the x-axis of the plot\n",
192 | " robust=True, # set colorbar limits to 2nd and 98th percentile of data\n",
193 | " cbar_kwargs=colorbar_kwargs,\n",
194 | ");"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "\n",
202 | "### Faceting\n",
203 | "\n",
204 | "Faceting is an effective way of visualizing variations of 3D data where 2D slices are visualized in a panel (subplot) and the third dimensions is varied between panels (subplots)."
205 | ]
206 | },
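207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "As a quick sketch, the first four time slices can be faceted into a 2x2 grid of panels:\n",
212 | "\n",
213 | "```python\n",
214 | "ds.tas.isel(time=slice(0, 4)).plot(col=\"time\", col_wrap=2, robust=True);\n",
215 | "```"
216 | ]
217 | },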
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": []
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "See the [xarray documentation](https://xarray.pydata.org/en/stable/user-guide/plotting.html) for more on \"faceted\" plots or subplots."
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "### Histograms"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": []
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Bonus Plot \n",
240 | "\n",
241 | "Let's look at the air temperature data but at for **all pressure levels**. We are going to select out the first time index and the longitude corresponding to the Himalayas and plot a vertical profile of the atmosphere from pole to pole:"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "ds_air_all_pressure_levels = xr.open_dataset(\n",
251 | " \"data/ta_Amon_CESM2_historical_r11i1p1f1_gn_200001-201412.nc\", engine=\"netcdf4\"\n",
252 | ")\n",
253 | "ds_air_all_pressure_levels"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": []
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "
\n",
268 | "
\n",
269 | "
The yincrease=False option was used for the plot to invert the y-axis as pressure decreases with height
\n",
270 | "
We can make more complicated figures and/or make customizations to our plots by saving the returned object from .plot and accessing the .axes attribute of the returned object
\n",
271 | "
\n",
272 | "
"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "---"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "## Interactive visualization using `hvplot`\n",
287 | "\n",
288 | "Let's switch gears and look at how we can produce interactive plots via [holoviews](https://holoviews.org/). The holoviews plotting ecosystem provides the [hvplot](https://hvplot.holoviz.org/) package to allow easy visualization of xarray (and other) objects. These plots build on [Bokeh](https://docs.bokeh.org/en/latest/index.html) -- a Python library for creating interactive visualziatons for web browsers.\n",
289 | "\n",
290 | "\n",
291 | "To enable the `.hvplot` interface on xarray object, let's import the `hvplot.xarray` module:"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "import hvplot.xarray"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "To use `hvplot` instead of `matplotlib`, we use the `.hvplot()` method:"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": []
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "As you can see, calling `.hvplot()` behaves the same as `.plot()` i.e. it uses the same heuristics as `.plot()`. In this case, it produces a histogram for data with more than 3 dimensions. To plot a `pcolormesh`, let's reduce the dimensionality of our data to 2D and call `.hvplot()` again:"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": []
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": []
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": []
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "So far we have had to subset our data in order to produce plots. `hvplot` provides convenient functionality for producing plots on-demand via interactive widgets. Let's create a series of 2D for each time slice, We will use the `groupby` parameter to let hvplot know that we want to create a widget (a slider) for the time dimension:"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": []
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "Let's add more customizations to our time widget:"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": []
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "---"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "%load_ext watermark\n",
387 | "%watermark --time --python --updated --iversion"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "## Summary \n",
395 | "\n",
396 | "- Xarray has plotting functionality that is a thin wrapper around the Matplotlib library\n",
397 | "- Xarray uses syntax and function names from Matplotlib whenever possible\n",
398 | "- Hvplot provides a neat interface to xarray for creating interactive plots"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "## Resources and References\n",
406 | "\n",
407 | "- [Hvplot Documentation](https://hvplot.holoviz.org/index.html)\n",
408 | "- [Xarray Documentation - Plotting](https://xarray.pydata.org/en/stable/user-guide/plotting.html)\n",
409 | "- [Matplolib Documentation](https://matplotlib.org/stable/contents.html)\n",
410 | "\n",
411 | "
\n",
412 | "
Geocat-examples Gallery
\n",
413 | " For geo-science specific visualization examples, please see the geocat-examples gallery which resides here.\n",
414 | "
"
266 | ]
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "Python 3",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.9.4"
286 | },
287 | "toc-autonumbering": false,
288 | "toc-showcode": false,
289 | "toc-showmarkdowntxt": false,
290 | "toc-showtags": false
291 | },
292 | "nbformat": 4,
293 | "nbformat_minor": 4
294 | }
295 |
--------------------------------------------------------------------------------
/notebooks/template.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Let's start here! If you can directly link to an image relevant to your notebook, such as [canonical logos](https://github.com/numpy/numpy/blob/main/doc/source/_static/numpylogo.svg), do so here at the top of your notebook. You can do this with Markdown syntax,\n",
8 | "\n",
9 | "> ``\n",
10 | "\n",
11 | "or edit this cell to see raw HTML `img` demonstration. This is preferred if you need to shrink your embedded image. **Either way be sure to include `alt` text for any embedded images to make your content more accessible.**\n",
12 | "\n",
13 | ""
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Project Pythia Notebook Template\n",
21 | "\n",
22 | "Next, title your notebook appropriately with a top-level Markdown header, `#`. Do not use this level header anywhere else in the notebook. Our book build process will use this title in the navbar, table of contents, etc. Keep it short, keep it descriptive. Follow this with a `---` cell to visually distinguish the transition to the prerequisites section."
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "---"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Overview\n",
37 | "If you have an introductory paragraph, lead with it here! Keep it short and tied to your material, then be sure to continue into the required list of topics below,\n",
38 | "\n",
39 | "1. This is a numbered list of the specific topics\n",
40 | "1. These should map approximately to your main sections of content\n",
41 | "1. Or each second-level, `##`, header in your notebook\n",
42 | "1. Keep the size and scope of your notebook in check\n",
43 | "1. And be sure to let the reader know up front the important concepts they'll be leaving with"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Prerequisites\n",
51 | "This section was inspired by [this template](https://github.com/alan-turing-institute/the-turing-way/blob/master/book/templates/chapter-template/chapter-landing-page.md) of the wonderful [The Turing Way](https://the-turing-way.netlify.app/welcome.html) Jupyter Book.\n",
52 | "\n",
53 | "Following your overview, tell your reader what concepts, packages, or other background information they'll **need** before learning your material. Tie this explicitly with links to other pages here in Foundations or to relevant external resources. Remove this body text, then populate the Markdown table, denoted in this cell with `|` vertical brackets, below, and fill out the information following. In this table, lay out prerequisite concepts by explicitly linking to other Foundations material or external resources, or describe generally helpful concepts.\n",
54 | "\n",
55 | "Label the importance of each concept explicitly as **helpful/necessary**.\n",
56 | "\n",
57 | "| Concepts | Importance | Notes |\n",
58 | "| --- | --- | --- |\n",
59 | "| [Intro to Cartopy](../core/cartopy/cartopy) | Necessary | |\n",
60 | "| [Understanding of NetCDF](some-link-to-external-resource) | Helpful | Familiarity with metadata structure |\n",
61 | "| Project management | Helpful | |\n",
62 | "\n",
63 | "- **Experience level**: with relevant packages or general self-assessed experience as **beginner/intermediate/advanced**\n",
64 | "- **Time to learn**: estimate in minutes or qualitatively as **long/medium/short**\n",
65 | "- **System requirements**:\n",
66 | " - Populate with any system, version, or non-Python software requirements if necessary\n",
67 | " - Otherwise use the concepts table above and the Imports section below to describe required packages as necessary\n",
68 | " - If no extra requirements, remove the **System requirements** point altogether"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "---"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Imports\n",
83 | "Begin your body of content with another `---` divider before continuing into this section, then remove this body text and populate the following code cell with all necessary Python imports **up-front**:"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "import sys"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Your first content section"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "This is where you begin your first section of material, loosely tied to your objectives stated up front. Tie together your notebook as a narrative, with interspersed Markdown text, images, and more as necessary,"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "# as well as any and all of your code cells\n",
116 | "print(\"Hello world!\")"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "### A content subsection\n",
124 | "Divide and conquer your objectives with Markdown subsections, which will populate the helpful navbar in Jupyter Lab and here on the Jupyter Book!"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# some subsection code\n",
134 | "new = \"helpful information\""
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "### Another content subsection\n",
142 | "Keep up the good work! A note, *try to avoid using code comments as narrative*, and instead let them only exist as brief clarifications where necessary."
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "## Your second content section\n",
150 | "Here we can move on to our second objective, and we can demonstrate"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "### Subsection to the second section\n",
158 | "\n",
159 | "#### a quick demonstration\n",
160 | "\n",
161 | "##### of further and further\n",
162 | "\n",
163 | "###### header levels"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "as well $m = a * t / h$ text! Similarly, you have access to other $\\LaTeX$ equation [**functionality**](https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Typesetting%20Equations.html) via MathJax (demo below from link),\n",
171 | "\n",
172 | "\\begin{align}\n",
173 | "\\dot{x} & = \\sigma(y-x) \\\\\n",
174 | "\\dot{y} & = \\rho x - y - xz \\\\\n",
175 | "\\dot{z} & = -\\beta z + xy\n",
176 | "\\end{align}"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "Check out [**any number of helpful Markdown resources**](https://www.markdownguide.org/basic-syntax/) for further customizing your notebooks and the [**Jupyter docs**](https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html) for Jupyter-specific formatting information. Don't hesitate to ask questions if you have problems getting it to look *just right*."
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "## Last Section\n",
191 | "\n",
192 | "If you're comfortable, and as we briefly used for our embedded logo up top, you can embed raw html into Jupyter Markdown cells (edit to see):"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "
\n",
200 | "
Info
\n",
201 | " Your relevant information here!\n",
202 | "
"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "Feel free to copy this around and edit or play around with yourself. Some other `admonitions` you can put in:"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "
\n",
217 | "
Success
\n",
218 | " We got this done after all!\n",
219 | "