├── .github └── workflows │ └── gh-page_builder.yml ├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── _toc.yml ├── data └── qhist-dask.out ├── environment.yml ├── images ├── NCAR-contemp-logo-blue.png ├── NCAR_CISL_NSF_banner.jpeg ├── NCAR_CISL_logo.png ├── dask-array.svg ├── dask_dataframe.png ├── dask_dist.png ├── dask_horizontal.svg ├── dask_twitter.png ├── denver.png ├── denver2.png ├── distributed-overview.png └── high_vs_low_level_coll_analogy.png └── notebooks ├── 00-dask-overview.ipynb ├── 01-dask-array.ipynb ├── 02-dask-dataframe.ipynb ├── 03-dask-xarray.ipynb ├── 04-dask-cluster.ipynb ├── 05-dask-hpc.ipynb ├── 06-dask-chunking.ipynb └── get_data.sh /.github/workflows/gh-page_builder.yml: -------------------------------------------------------------------------------- 1 | name: JupyterBook 2 | #name: Build and Publish JupyterBook to GitHub Pages 3 | 4 | on: 5 | [push] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.7 17 | - name: Install Python dependencies 18 | run: | 19 | sudo apt-get install python3-pip 20 | #pip install git+git://github.com/executablebookproject/cli.git#egg=master 21 | pip install ghp-import 22 | #pip install -r book/requirements.txt 23 | pip install jupyter-book 24 | PATH="${PATH}:${HOME}/.local/bin" 25 | # - name: Build book TOC file 26 | # run: | 27 | # #jupyter-book toc . 28 | - name: Build book HTML 29 | run: | 30 | jupyter-book build . 31 | - name: Push _build/html to gh-pages 32 | run: | 33 | sudo chown -R $(whoami):$(whoami) . 
34 | git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com" 35 | git config --global user.name "$GITHUB_ACTOR" 36 | git remote set-url origin "https://$GITHUB_ACTOR:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY" 37 | ls -lrt 38 | ghp-import _build/html -f -p -n 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints* 2 | dask-worker-logs 3 | notebooks/*.html 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. 
rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. 
Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. 
Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![NCAR CISL NSF Logo](images/NCAR_CISL_NSF_banner.jpeg) 2 | # NCAR Dask Tutorial 3 | 4 | [![Jupyter Build](https://shields.api-test.nl/github/workflow/status/NCAR/dask-tutorial/JupyterBook?label=JupyterBook&logo=GitHub&style=flat-square)](https://ncar.github.io/dask-tutorial/README.html) 5 | [![Made withJupyter](https://img.shields.io/badge/Made%20with-Jupyter-green?style=flat-square&logo=Jupyter&color=green)](https://jupyter.org/try) 6 | [![Commits](https://img.shields.io/github/last-commit/NCAR/dask-tutorial?label=Last%20commit&style=flat-square&color=green)](https://github.com/NCAR/dask-tutorial/commits/main) 7 | 8 | **Welcome to NCAR Dask Tutorial!** 9 | 10 | **Organized by: Brian Vanderwende, Negin Sobhani, Deepak Cherian, and Ben Kirk** 11 | 12 | The materials and notebooks in this tutorial is published as a Jupyter book here. [![Jupyter Book Badge](https://jupyterbook.org/badge.svg)](https://ncar.github.io/dask-tutorial/README.html) 13 | 14 | Here you will find the tutorial materials from the CISL/CSG Dask Tutorial. 15 | The 4-hour tutorial will be split into two sections, with early topics focused on beginner Dask users and later topics focused on intermediate usage on HPC and associated best practices. 16 | 17 | This tutorial is open to non-UCAR staff. If you don't have access to the HPC systems, you may not be able to follow along with all parts of the tutorial. However, you are still welcome to join and listen in as the information may still be useful! 18 | 19 | Video Recoding: Will be available after the event 20 | 21 | ## Course Outline 22 | 23 | 0. [Dask Overview](https://ncar.github.io/dask-tutorial/notebooks/00-dask-overview.html) 24 | 1. [Dask Data Arrays](https://ncar.github.io/dask-tutorial/notebooks/01-dask-array.html) 25 | 2. 
[Dask DataFrames](https://ncar.github.io/dask-tutorial/notebooks/02-dask-dataframe.html) 26 | 3. [Dask + Xarray](https://ncar.github.io/dask-tutorial/notebooks/03-dask-xarray.html) 27 | 4. [Dask Schedulers](https://ncar.github.io/dask-tutorial/notebooks/04-dask-cluster.html) 28 | 5. [Dask on HPC Systems](https://ncar.github.io/dask-tutorial/notebooks/05-dask-hpc.html) 29 | 6. [Dask Best Practices](https://ncar.github.io/dask-tutorial/notebooks/06-dask-chunking.html) 30 | 31 | ## Prerequisites 32 | Before beginning any of the tutorials, it is highly recommended that you have a basic understanding of Python programming and Python libraries such as NumPy, pandas, and Xarray. 33 | 34 | 35 | ## ⌨️ Getting set up 36 | 37 | This tutorial is open to non-UCAR staff. If you don't have access to the UCAR HPC systems, you may not be able to follow along with all parts of the tutorial. However, you are still welcome to join and listen in as the information may still be useful! 38 | 39 | ### [NCAR JupyterHub](https://github.com/NCAR/dask-tutorial) 40 | This is the preferred way to interact with this tutorial. Users with access to Casper can run the notebooks interactively, and will be able to save their work and pull in new updates. 41 | To connect to NCAR JupyterHub, please open this link in a web browser: https://jupyterhub.hpc.ucar.edu/ 42 | 43 | Next, clone the repository to your local directory: 44 | ``` 45 | git clone https://github.com/NCAR/dask-tutorial 46 | ``` 47 | Finally, open the notebooks and interact with them. Make sure to choose the "NPL 2023a" kernel. 48 | 49 | ### Local installation instructions 50 | Users without access to the NCAR/UCAR Casper cluster can only run through the first few notebooks. 
51 | To run the notebooks locally: 52 | 53 | First clone this repository to your local machine via: 54 | ``` 55 | git clone https://github.com/NCAR/dask-tutorial 56 | ``` 57 | 58 | Next, download conda (if you haven't already) 59 | 60 | If you do not already have the conda package manager installed, please follow the instructions [here](https://github.com/conda-forge/miniforge#install). 61 | 62 | Now, create a conda environment: 63 | 64 | Navigate to the `dask-tutorial/` directory and create a new conda environment with the required 65 | packages via: 66 | 67 | ```terminal 68 | cd dask-tutorial 69 | conda env update --file environment.yml 70 | ``` 71 | 72 | This will create a new conda environment named "dask-tutorial". 73 | 74 | Next, activate the environment: 75 | 76 | ``` 77 | conda activate dask-tutorial 78 | ``` 79 | 80 | Finally, launch JupyterLab with: 81 | 82 | ``` 83 | jupyter lab 84 | ``` 85 | 86 | ## Contributing 87 | We welcome contributions from the community! If you have a tutorial you would like to add or if you would like to improve an existing tutorial, please follow these steps: 88 | 89 | Fork the repository. 90 | 91 | Clone the repository to your local machine: 92 | ``` 93 | git clone https://github.com/your-username/dask-tutorial-repository.git 94 | ``` 95 | Create a new branch for your changes: 96 | ``` 97 | git checkout -b my-new-tutorial 98 | ``` 99 | Make your changes and commit them: 100 | ``` 101 | git add . 102 | git commit -m "Add my new tutorial" 103 | ``` 104 | Push your changes to your fork: 105 | ``` 106 | git push origin my-new-tutorial 107 | ``` 108 | Submit a pull request to the original repository. 109 | 110 | 111 | 112 | ## Support 113 | If you have any questions or need help with the tutorials, please [open a GitHub issue](https://github.com/NCAR/dask-tutorial/issues/new?title=Issue%20on%20page%20%2FREADME.html&body=Your%20issue%20content%20here.) in the repository. 
114 | 115 | ## 👍 Acknowledgments 116 | 117 | * NCAR CISL/CSG Team 118 | * ESDS Initiative 119 | 120 | ## License 121 | The tutorials in this repository are released under the MIT License. 122 | 123 | 124 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Dask Tutorial 5 | author: Negin Sobhani, Brian Vanderwende, Deepak Cherian, Ben Kirk 6 | copyright: "2023" # Copyright year to be placed in the footer 7 | 8 | logo: images/NCAR_CISL_logo.png 9 | 10 | # Force re-execution of notebooks on each build. 11 | # See https://jupyterbook.org/content/execute.html 12 | #execute: 13 | # execute_notebooks: force 14 | 15 | execute: 16 | execute_notebooks: 'off' 17 | 18 | 19 | # Define the name of the latex output file for PDF builds 20 | 21 | latex: 22 | latex_engine : pdflatex # one of 'pdflatex', 'xelatex' (recommended for unicode), 'luatex', 'platex', 'uplatex' 23 | use_jupyterbook_latex : true # use jupyterbook-latex for pdf builds as default 24 | latex_documents: 25 | targetname: tutorial_book.tex 26 | 27 | # Add a bibtex file so that we can create citations 28 | bibtex_bibfiles: 29 | - references.bib 30 | 31 | # Information about where the book exists on the web 32 | repository: 33 | url: https://github.com/NCAR/dask-tutorial # Online location of your book 34 | #path_to_book: # Optional path to your book, relative to the repository root 35 | branch: main # Which branch of the repository should be used when creating links (optional) 36 | 37 | # Add GitHub buttons to your book 38 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 39 | html: 40 | use_repository_button: true 41 | use_edit_page_button : true 42 | use_issues_button : true 43 | home_page_in_navbar : true 44 | 45 | 46 | # Launch button settings 47 | 
launch_buttons: 48 | binder : true 49 | binderhub : true 50 | binderhub_url: https://mybinder.org 51 | thebe : false 52 | notebook_interface : jupyterlab # The interface interactive links will activate ["classic", "jupyterlab"] 53 | jupyterhub : false 54 | jupyterhub_url : "https://jupyterhub.hpc.ucar.edu/stable/" # The URL of the JupyterHub (e.g., https://datahub.berkeley.edu) 55 | colab : false 56 | # colab_url : "" # The URL of Google Colab (https://colab.research.google.com) 57 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | # Learn more at https://jupyterbook.org/customize/toc.html 3 | 4 | format: jb-book 5 | root: README 6 | 7 | parts: 8 | - caption: Introductions to Dask 9 | chapters: 10 | - file: notebooks/00-dask-overview.ipynb 11 | - file: notebooks/01-dask-array.ipynb 12 | - file: notebooks/02-dask-dataframe.ipynb 13 | 14 | - caption: Dask and Xarray 15 | chapters: 16 | - file: notebooks/03-dask-xarray.ipynb 17 | 18 | - caption: Dask on HPC 19 | chapters: 20 | - file: notebooks/04-dask-cluster.ipynb 21 | - file: notebooks/05-dask-hpc.ipynb 22 | 23 | - caption: Dask Best Practices 24 | chapters: 25 | - file: notebooks/06-dask-chunking.ipynb 26 | 27 | -------------------------------------------------------------------------------- /data/qhist-dask.out: -------------------------------------------------------------------------------- 1 | Job ID,NCPUs,Elapsed (h) 2 | 5526022.casper-pbs,1,0.05 3 | 5526023.casper-pbs,1,0.05 4 | 5526025.casper-pbs,1,0.05 5 | 5526024.casper-pbs,1,0.05 6 | 5525958.casper-pbs,1,0.25 7 | 5525957.casper-pbs,1,0.25 8 | 5525962.casper-pbs,1,0.21 9 | 5525963.casper-pbs,1,0.21 10 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: dask-tutorial 2 | 
channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - cfgrib 7 | - cftime < 1.5 8 | - dask 9 | - dask-labextension 10 | - distributed 11 | - h5netcdf 12 | - hvplot 13 | - ipywidgets 14 | - jupyterlab-system-monitor 15 | - jupyterlab>=3 16 | - matplotlib 17 | - nbterm 18 | - nc-time-axis 19 | - netcdf4 20 | - nodejs 21 | - pandas 22 | - pip 23 | - pre-commit 24 | - pydap 25 | - python-graphviz 26 | - python=3.9 27 | - scipy 28 | - xarray>=2022.3.0 29 | -------------------------------------------------------------------------------- /images/NCAR-contemp-logo-blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/NCAR-contemp-logo-blue.png -------------------------------------------------------------------------------- /images/NCAR_CISL_NSF_banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/NCAR_CISL_NSF_banner.jpeg -------------------------------------------------------------------------------- /images/NCAR_CISL_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/NCAR_CISL_logo.png -------------------------------------------------------------------------------- /images/dask-array.svg: -------------------------------------------------------------------------------- 1 | 2 | 15 | 17 | 39 | 42 | 52 | 62 | 72 | 82 | 92 | 102 | 112 | 122 | 132 | 142 | 152 | 162 | 172 | 182 | 192 | 202 | 212 | 222 | 232 | 242 | 250 | 258 | 266 | 274 | 282 | 290 | 298 | 306 | 314 | 318 | 322 | 327 | 335 | 340 | 348 | 349 | 350 | -------------------------------------------------------------------------------- /images/dask_dataframe.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/dask_dataframe.png -------------------------------------------------------------------------------- /images/dask_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/dask_dist.png -------------------------------------------------------------------------------- /images/dask_horizontal.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /images/dask_twitter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/dask_twitter.png -------------------------------------------------------------------------------- /images/denver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/denver.png -------------------------------------------------------------------------------- /images/denver2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/denver2.png -------------------------------------------------------------------------------- /images/distributed-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/distributed-overview.png 
-------------------------------------------------------------------------------- /images/high_vs_low_level_coll_analogy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/dask-tutorial/f08459154557990a73c7eae1fd1376b9f74892af/images/high_vs_low_level_coll_analogy.png -------------------------------------------------------------------------------- /notebooks/00-dask-overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4db5d14d-ee57-4791-9ab7-58cb2ff2cc3b", 6 | "metadata": {}, 7 | "source": [ 8 | "\"NCAR\n", 12 | "\n", 13 | "\n", 14 | "# Dask Overview\n", 15 | "\n", 16 | "**ESDS Dask Tutorial | 06 February, 2023** \n", 17 | "\n", 18 | "Negin Sobhani, Brian Vanderwende, Deepak Cherian, Ben Kirk \n", 19 | "Computational & Information Systems Lab (CISL) \n", 20 | "[negins@ucar.edu](mailto:negins@ucar.edu), [vanderwb@ucar.edu](mailto:vanderwb@ucar.edu)\n", 21 | "\n", 22 | "\n", 23 | "---------" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "5b6211a0-3762-41a2-8a45-6b19ce32f658", 29 | "metadata": {}, 30 | "source": [ 31 | "**In this tutorial, you learn:**\n", 32 | "\n", 33 | "* What is Dask?\n", 34 | "* Why Dask in Geosciences?\n", 35 | "* Dask Data Structures and Schedulers\n", 36 | "* When to use Dask?\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "a26e012c-213f-43ad-997c-1f895241826e", 42 | "metadata": { 43 | "tags": [] 44 | }, 45 | "source": [ 46 | "## Introduction\n", 47 | "\n", 48 | "Complex data structures enable data science in Python. For example:\n", 49 | "* [NumPy arrays](https://numpy.org/doc/stable/)\n", 50 | "* [Pandas series and dataframes](https://pandas.pydata.org/)\n", 51 | "* [XArray datasets](https://docs.xarray.dev/)\n", 52 | "\n", 53 | "*But datasets are getting larger all of the time! 
What if my dataset is too big to fit into memory, or it takes too long to complete an analysis?*\n", 54 | "\n", 55 | "## What is Dask?\n", 56 | "\n", 57 | "\"NCAR\n", 61 | "\n", 62 | "* Dask is an open-source Python library for parallel and distributed computing that scales the existing Python ecosystem.\n", 63 | "\n", 64 | "* Dask was developed to scale Python packages such as Numpy, Pandas, and Xarray to multi-core machines and distributed clusters when datasets exceed memory.\n", 65 | "\n", 66 | "\n", 67 | "## Why Dask?\n", 68 | "\n", 69 | "### Familiar Interface \n", 70 | "\n", 71 | "Dask provides interfaces which mimics significant portions of the NumPy and Pandas APIs. \n", 72 | "\n", 73 | "This means Dask provides ways to parallelize Pandas, Xarray, and Numpy workflows with minimal code rewriting (no massive code-restructure or writing a script in another language).\n", 74 | "\n", 75 | "### Scalability\n", 76 | "Dask is designed to scale well from single machine (laptop) to thousand-node HPC clusters, and on the cloud.\n", 77 | "\n", 78 | "This allows users to use their existing hardware, or add more machines as needed, to handle increasingly large and complex datasets.\n", 79 | "\n", 80 | "### Flexibility\n", 81 | "Dask provides several tools that help with data analysis on large datasets. For example, you can easily wrap your function in `dask.delayed` decorator to make it run in parallel. \n", 82 | "\n", 83 | "Dask provides seamless integration with well-known HPC resource managers and job scheduling systems, including PBS, SLURM, and SGE.\n", 84 | "\n", 85 | "### Built-in Diagnostic Tools\n", 86 | "Dask provides responsive feedback via the client as well as a real-time interactive diagnostic dashboard to keep users informed on how the computation is progressing. \n", 87 | "\n", 88 | "This helps users identify and resolve potential issues without waiting for the work to be completed. 
\n", 89 | "\n", 90 | "\n", 91 | "## First Rule of Dask\n", 92 | "\n", 93 | "While Dask is a powerful tool for parallel and distributed computing, it is not always the best solution for every problem. \n", 94 | "In some cases, using Dask may introduce additional complexity and overhead, without providing any substantial benefits in terms of performance or scalability.\n", 95 | "\n", 96 | "* Keep in mind the time spent parallelizing and optimizing your workflow when using Dask vs. the time saved because of that parallelization.\n", 97 | "\n", 98 | "* Consider how many times you plan to run your code - if only once, is it worth it?\n", 99 | "\n", 100 | "
\n", 101 | "\n", 102 | "NOTE: **Dask should only be used when necessary.** \n", 103 | "\n", 104 | "Avoid Dask if you can easily:\n", 105 | "\n", 111 | "And keep in mind - all of the above steps improve your code whether you end up using Dask or not!

\n", 112 | "\n", 114 | "\n", 115 | "
\n", 116 | "\n", 117 | "\n", 118 | "## When to use Dask?\n", 119 | "Here are some general guidelines for when to use Dask and when to avoid it:\n", 120 | "\n", 121 | "#### Use Dask:\n", 122 | "* When you have large datasets that don't fit into memory on a single machine.\n", 123 | "* When you need to perform parallel computations, such as big data analysis. \n", 124 | "\n", 125 | "#### Avoid Dask:\n", 126 | "\n", 127 | "* When you have small datasets that can be processed efficiently on a single machine.\n", 128 | "* When you don't need parallel processing, as the overhead of managing a distributed computing environment may not be worth the benefits.\n", 129 | "* When you need to debug or troubleshoot problems, as distributed computing environments can be challenging for debugging. If the problem is complex, using Dask may make debugging more difficult." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "id": "77b446ce-24a6-48f6-a197-a5d1b97fb0db", 135 | "metadata": { 136 | "tags": [] 137 | }, 138 | "source": [ 139 | "## Dask Components\n", 140 | "\n", 141 | "Dask is composed of two main parts:\n", 142 | "\n", 143 | "### 1. 
Dask Collections\n", 144 | "\n", 145 | "Dask *Collections* are the user interfaces we use for parallel and distributed computing with Dask.\n", 146 | "\n", 147 | "Dask features different levels of collection types:\n", 148 | "\n", 149 | "#### High-level collections \n", 150 | "Dask provides high-level collections Dask Arrays, Dask DataFrames, and Dask Bags that mimic NumPy, pandas, and lists but can operate in parallel on datasets that don’t fit into memory.\n", 151 | "\n", 152 | "Most of the time, you will probably use one of the following *high-level* (big) data structures (or an even higher-level derivative type like Xarrays):\n", 153 | "\n", 154 | "| Collection | Serial | Dask |\n", 155 | "|-|-|-|\n", 156 | "| Arrays | numpy.array | dask.array.from_array |\n", 157 | "| Dataframes | pandas.read_csv | dask.dataframe.read_csv |\n", 158 | "| Unstructured | [1,2,3] | dask.bag.from_sequence([1,2,3]) |\n", 159 | "\n", 160 | "\n", 161 | "#### Low-level collections\n", 162 | "Dask also features two *low-level* collection types - `delayed` and `futures`. These collections give users finer control to build custom parallel and distributed computations.\n", 163 | "\n", 164 | "* **delayed** - run any arbitrary Python function using Dask task parallelism (think looped function calls)\n", 165 | "* **futures** - similar to delayed but allows for concurrent commands in the client script (think backgrounded processes)\n", 166 | "\n", 167 | "These are very powerful tools, but it is easy to write something using a delayed function that could be executed faster and more simply using a high-level collection \n", 168 | "\n", 169 | "\"Dask\n", 172 | " \n", 173 | "*Image credit: Anaconda, Inc. and contributors*\n", 174 | "\n", 175 | "\n", 176 | "### 2. 
Dynamic Task Scheduling\n", 177 | "**We can basically think of the Dask scheduler as our task orchestrator.**\n", 178 | "\n", 179 | "When a computation is submitted, work is segmented into discrete tasks which are assigned to workers by the Dask scheduler. \n", 180 | "\n", 181 | "To perform work, a scheduler must be assigned resources in the form of a Dask cluster. The cluster consists of the following components: \n", 182 | "\n", 183 | "* **scheduler** : A scheduler creates and manages task graphs and distributes tasks to workers.\n", 184 | "\n", 185 | "* **workers** : A worker is typically a separate Python process on either the local host or a remote machine. A Dask cluster usually consists of many workers. Basically, a worker is a Python interpreter which will perform work on a subset of our dataset.\n", 186 | "\n", 187 | "* **client** - A high-level interface that points to the scheduler (often local but not always). A client serves as the entry point for interacting with a Dask scheduler.\n", 188 | "\n", 189 | "\"Dask\n", 192 | " \n", 193 | "*Image credit: Anaconda, Inc. and contributors*\n", 194 | "\n", 195 | "\n", 196 | "We will learn more about Dask Collections and Dynamic Task Scheduling in the next tutorials." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "9cbea0a3-6f90-41a1-9827-8f60771e67af", 202 | "metadata": {}, 203 | "source": [ 204 | "## Useful Resources\n", 205 | "\n", 206 | "* Reference\n", 207 | " * [Docs](https://dask.org/)\n", 208 | " * [Examples](https://examples.dask.org/)\n", 209 | " * [Code](https://github.com/dask/dask/)\n", 210 | " * [Blog](https://blog.dask.org/)\n", 211 | "* Ask for help\n", 212 | " * [`dask`](http://stackoverflow.com/questions/tagged/dask) tag on Stack Overflow, for usage questions\n", 213 | " * [github issues](https://github.com/dask/dask/issues/new) for bug reports and feature requests\n", 214 | " * [discourse forum](https://dask.discourse.group/) for general, non-bug, questions and discussion" 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3 (ipykernel)", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.9.12" 235 | }, 236 | "widgets": { 237 | "application/vnd.jupyter.widget-state+json": { 238 | "state": {}, 239 | "version_major": 2, 240 | "version_minor": 0 241 | } 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 5 246 | } 247 | -------------------------------------------------------------------------------- /notebooks/04-dask-cluster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "990cfa4c-2117-4435-9806-ff9048890398", 6 | "metadata": {}, 7 | "source": [ 8 | "\"NCAR\n", 12 | "\n", 13 | "# Dask Schedulers\n", 14 | "\n", 15 | "**ESDS dask tutorial | 06 February, 2023** \n", 16 | "\n", 17 | "Negin Sobhani, Brian Vanderwende, Deepak Cherian, Ben Kirk 
\n", 18 | "Computational & Information Systems Lab (CISL) \n", 19 | "[negins@ucar.edu](mailto:negins@ucar.edu), [vanderwb@ucar.edu](mailto:vanderwb@ucar.edu)\n", 20 | "\n", 21 | "------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "36f1fc96-8189-48b4-a61c-c8f2d2fc1a58", 27 | "metadata": {}, 28 | "source": [ 29 | "### In this tutorial, you learn:\n", 30 | "\n", 31 | "* Components of Dask Schedulers\n", 32 | "* Types of Dask Schedulers\n", 33 | "* Single Machine Schedulers\n", 34 | "\n", 35 | "\n", 36 | "**Related Documentation**\n", 37 | "\n", 38 | "* [Dask Scheduling](https://docs.dask.org/en/latest/scheduling.html) \n", 39 | "* [Dask Local Cluster](https://docs.dask.org/en/stable/deploying-python.html) \n", 40 | "* [Dask Cluster manager](https://docs.dask.org/en/latest/deploying-python.html) " 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "214569a5-8969-4e1f-8b24-d8d3183219a9", 46 | "metadata": { 47 | "tags": [] 48 | }, 49 | "source": [ 50 | "## Introduction\n", 51 | "As we mentioned in our Dask overview, Dask is composed of two main parts:\n", 52 | "\n", 53 | "1. Dask Collections (APIs)\n", 54 | "2. Dynamic Task Scheduling\n", 55 | "\n", 56 | "So far, we have talked about different Dask collections, but in this tutorial we are going to talk more about the second part. \n", 57 | "\n", 58 | "\n", 59 | "## The Dask scheduler - our task orchestrator\n", 60 | "\n", 61 | "The `Dask.distributed` task *scheduler* is a centralized, dynamic system that coordinates the efforts of various dask *worker* processes spread accross different machines.\n", 62 | "\n", 63 | "When a computational task is submitted, the Dask distributed *scheduler* sends it off to a Dask *cluster* - simply a collection of Dask *workers*. A worker is typically a separate Python process on either the local host or a remote machine. \n", 64 | "\n", 65 | "\n", 66 | "
\n", 67 | "\n", 68 | "* **worker** - a Python interpretor which will perform work on a subset of our dataset.\n", 69 | "* **cluster** - an object containing instructions for starting and talking to workers.\n", 70 | "* **scheduler** - sends tasks from our task graph to workers.\n", 71 | "* **client** - a local object that points to the scheduler (*often local but not always*).
\n", 72 | "\n", 73 | "" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "263fd4a3-0475-41a4-abae-417e8731100a", 79 | "metadata": {}, 80 | "source": [ 81 | "## Schedulers\n", 82 | "Dask essentially offers two types of schedulers:\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "### 1. Single machine scheduler \n", 88 | "* The Single-machine Scheduler schedules tasks and manages the execution of those tasks on the same machine where the scheduler is running. \n", 89 | "* It is designed to be used in situations where the amount of data or the computational requirements are too large for a single process to handle, but not large enough to warrant the use of a cluster of machines.\n", 90 | "* It is relatively simple and cheap to use but it does not scale as it only runs on a single machine. \n", 91 | "\n", 92 | "**Single machine scheduler is the default choice used by Dask.**\n", 93 | "\n", 94 | "In Dask, there are several types of single machine schedulers that can be used to schedule computations on a single machine:\n", 95 | "#### 1.1. Single-threaded scheduler\n", 96 | "This scheduler runs all tasks **serially** on a single thread. \n", 97 | "This is only useful for debugging and profiling, but does not have any parallelization. \n", 98 | "\n", 99 | "#### 1.2. Threaded scheduler\n", 100 | "The threaded scheduler uses a pool of **local** threads to execute tasks concurrently. \n", 101 | "This is the default scheduler for Dask, and is suitable for most use cases on a single machine. Multithreading works well for Dask Array and Dask DataFrame. \n", 102 | "\n", 103 | "To select one of the above scheduler for your computation, you can specify it when doing `.compute()`:\n", 104 | "\n", 105 | "For example: \n", 106 | "```python\n", 107 | "this.compute(scheduler=\"single-threaded\") # for debugging and profiling only\n", 108 | "```\n", 109 | "\n", 110 | "\n", 111 | "As mentioned above the *threaded scheduler* is the default scheduler in Dask. 
But you can set the default scheduler to Single-threaded or multi-processing by: \n", 112 | "\n", 113 | "```python\n", 114 | "import dask\n", 115 | "dask.config.set(scheduler='synchronous') # overwrite default with single-threaded scheduler\n", 116 | "```\n", 117 | "\n", 118 | "Multi-processing works well for pure Python code - *delayed* functions and operations on Dask Bags.\n", 119 | "\n", 120 | "Let's compare the performance of each of these single-machine schedulers:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "022a8b81-6474-47b0-8892-d0f5d65f0ffa", 126 | "metadata": {}, 127 | "source": [ 128 | "### Distributed Clusters" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 1, 134 | "id": "5ed871a9-45d4-4b0e-9f39-4dbb0b871c64", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import numpy as np\n", 139 | "import dask.array as da" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 14, 145 | "id": "f61fb1fe-bbd7-4207-8ac6-8eb784d6710f", 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "CPU times: user 14.9 s, sys: 1.32 s, total: 16.2 s\n", 153 | "Wall time: 16.1 s\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "array([ 9.99987393, 9.99942047, 10.00069322, ..., 9.99997333,\n", 160 | " 9.99945909, 10.00094973])" 161 | ] 162 | }, 163 | "execution_count": 14, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "%%time\n", 170 | "## - numpy performance\n", 171 | "xn = np.random.normal(10, 0.1, size=(20_000, 20_000))\n", 172 | "yn = xn.mean(axis=0)\n", 173 | "yn" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 15, 179 | "id": "af609dc9-9482-41e0-8a93-5cbcf096e376", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "CPU times: 
user 14.8 s, sys: 112 ms, total: 14.9 s\n", 187 | "Wall time: 3.83 s\n" 188 | ] 189 | }, 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "array([ 9.99928454, 9.99968075, 10.00027327, ..., 10.00030439,\n", 194 | " 9.9999113 , 9.99947802])" 195 | ] 196 | }, 197 | "execution_count": 15, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "%%time\n", 204 | "# -- dask array using the default\n", 205 | "xd = da.random.normal(10, 0.1, size=(20_000, 20_000), chunks=(2000, 2000))\n", 206 | "yd = xd.mean(axis=0)\n", 207 | "yd.compute()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 16, 213 | "id": "141053bf-942c-46be-b169-2709ca569451", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | " threading : 3.7886 s\n", 221 | " processes : 5.2656 s\n", 222 | " sync : 14.7481 s\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "import time\n", 228 | "# -- dask testing different schedulers:\n", 229 | "for sch in ['threading', 'processes', 'sync']:\n", 230 | " t0 = time.time()\n", 231 | " r = yd.compute(scheduler=sch)\n", 232 | " t1 = time.time()\n", 233 | " print(f\"{sch:>10} : {t1 - t0:0.4f} s\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 5, 239 | "id": "4c33d184-7c25-4052-aece-50dd9382e32e", 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "
\n", 247 | "
\n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
\n", 259 | "
\n", 260 | "

HighLevelGraph

\n", 261 | "

\n", 262 | " HighLevelGraph with 4 layers and 240 keys from all layers.\n", 263 | "

\n", 264 | " \n", 265 | "
\n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "\n", 272 | "
\n", 273 | " \n", 274 | "

Layer1: normal

\n", 275 | "
\n", 276 | "

\n", 277 | " normal-6ad96170c4c61710dbc18b74e58c3cb2\n", 278 | "

\n", 279 | "\n", 280 | " \n", 281 | " \n", 282 | " \n", 328 | " \n", 365 | " \n", 366 | "
\n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | "
layer_typeMaterializedLayer
is_materializedTrue
number of outputs100
shape(30000, 30000)
dtypefloat64
chunksize(3000, 3000)
typedask.array.core.Array
chunk_typenumpy.ndarray
\n", 327 | "
\n", 329 | " \n", 330 | "\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | "\n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "\n", 357 | " \n", 358 | " \n", 359 | "\n", 360 | " \n", 361 | " 30000\n", 362 | " 30000\n", 363 | "\n", 364 | "
\n", 367 | "\n", 368 | "
\n", 369 | "
\n", 370 | " \n", 371 | "
\n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "\n", 378 | "
\n", 379 | " \n", 380 | "

Layer2: mean_chunk

\n", 381 | "
\n", 382 | "

\n", 383 | " mean_chunk-1ccb39e699989873e56e0c577ab50469\n", 384 | "

\n", 385 | "\n", 386 | " \n", 387 | " \n", 388 | " \n", 441 | " \n", 478 | " \n", 479 | "
\n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | "
layer_typeBlockwise
is_materializedTrue
number of outputs100
shape(30000, 30000)
dtypefloat64
chunksize(3000, 3000)
typedask.array.core.Array
chunk_typenumpy.ndarray
depends on normal-6ad96170c4c61710dbc18b74e58c3cb2
\n", 440 | "
\n", 442 | " \n", 443 | "\n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | "\n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | "\n", 470 | " \n", 471 | " \n", 472 | "\n", 473 | " \n", 474 | " 30000\n", 475 | " 30000\n", 476 | "\n", 477 | "
\n", 480 | "\n", 481 | "
\n", 482 | "
\n", 483 | " \n", 484 | "
\n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | "\n", 491 | "
\n", 492 | " \n", 493 | "

Layer3: mean_combine-partial

\n", 494 | "
\n", 495 | "

\n", 496 | " mean_combine-partial-9d7408b2918b12ded83c71dd3ff57f3e\n", 497 | "

\n", 498 | "\n", 499 | " \n", 500 | " \n", 501 | " \n", 554 | " \n", 584 | " \n", 585 | "
\n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | "
layer_typeMaterializedLayer
is_materializedTrue
number of outputs30
shape(3, 30000)
dtypefloat64
chunksize(1, 3000)
typedask.array.core.Array
chunk_typenumpy.ndarray
depends on mean_chunk-1ccb39e699989873e56e0c577ab50469
\n", 553 | "
\n", 555 | " \n", 556 | "\n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | "\n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | "\n", 576 | " \n", 577 | " \n", 578 | "\n", 579 | " \n", 580 | " 30000\n", 581 | " 3\n", 582 | "\n", 583 | "
\n", 586 | "\n", 587 | "
\n", 588 | "
\n", 589 | " \n", 590 | "
\n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | "\n", 597 | "
\n", 598 | " \n", 599 | "

Layer4: mean_agg-aggregate

\n", 600 | "
\n", 601 | "

\n", 602 | " mean_agg-aggregate-8bc993aec3b4d7a1fc433f965f460473\n", 603 | "

\n", 604 | "\n", 605 | " \n", 606 | " \n", 607 | " \n", 660 | " \n", 688 | " \n", 689 | "
\n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
layer_typeMaterializedLayer
is_materializedTrue
number of outputs10
shape(30000,)
dtypefloat64
chunksize(3000,)
typedask.array.core.Array
chunk_typenumpy.ndarray
depends on mean_combine-partial-9d7408b2918b12ded83c71dd3ff57f3e
\n", 659 | "
\n", 661 | " \n", 662 | "\n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | "\n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | "\n", 680 | " \n", 681 | " \n", 682 | "\n", 683 | " \n", 684 | " 30000\n", 685 | " 1\n", 686 | "\n", 687 | "
\n", 690 | "\n", 691 | "
\n", 692 | "
\n", 693 | " \n", 694 | "
\n", 695 | "
\n", 696 | "
" 697 | ], 698 | "text/plain": [ 699 | "HighLevelGraph with 4 layers.\n", 700 | "\n", 701 | " 0. normal-6ad96170c4c61710dbc18b74e58c3cb2\n", 702 | " 1. mean_chunk-1ccb39e699989873e56e0c577ab50469\n", 703 | " 2. mean_combine-partial-9d7408b2918b12ded83c71dd3ff57f3e\n", 704 | " 3. mean_agg-aggregate-8bc993aec3b4d7a1fc433f965f460473" 705 | ] 706 | }, 707 | "execution_count": 5, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "yd.dask" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 6, 719 | "id": "5641646c-3175-422e-95aa-f0d638b0687b", 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "data": { 724 | "text/html": [ 725 | "\n", 726 | " \n", 727 | " \n", 762 | " \n", 790 | " \n", 791 | "
\n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | "
Array Chunk
Bytes 234.38 kiB 23.44 kiB
Shape (30000,) (3000,)
Count 4 Graph Layers 10 Chunks
Type float64 numpy.ndarray
\n", 761 | "
\n", 763 | " \n", 764 | "\n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | "\n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | "\n", 782 | " \n", 783 | " \n", 784 | "\n", 785 | " \n", 786 | " 30000\n", 787 | " 1\n", 788 | "\n", 789 | "
" 792 | ], 793 | "text/plain": [ 794 | "dask.array" 795 | ] 796 | }, 797 | "execution_count": 6, 798 | "metadata": {}, 799 | "output_type": "execute_result" 800 | } 801 | ], 802 | "source": [ 803 | "yd" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "id": "fe2398e2-9ac6-4e85-89fe-8c794571dfeb", 809 | "metadata": {}, 810 | "source": [ 811 | "* Notice how `sync` scheduler takes almost the same time as pure NumPy code. \n", 812 | "* Why is the multiprocessing scheduler so much slower?\n", 813 | "\n", 814 | "If you use the multiprocessing backend, all communication between processes still needs to pass through the main process because processes are isolated from other processes. This introduces a large overhead. \n", 815 | "\n", 816 | "**The Dask developers recommend using the Dask Distributed Scheduler which we will cover now.**\n" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "id": "8292ca19-7d31-4702-ae86-ecb75c16a47d", 822 | "metadata": {}, 823 | "source": [ 824 | "\n", 825 | "### 2. Distributed scheduler\n", 826 | "* The Distributed scheduler or `dask.distributed` schedules tasks and manages the execution of those tasks on workers from a single or multiple machines. \n", 827 | "* This scheduler is more sophisticated and offers more features including a live diagnostic dashboard which provides live insight on performance and progress of the calculations.\n", 828 | "\n", 829 | "\n", 830 | "In most cases, `dask.distributed` is preferred since it is very scalable, and provides and informative interactive dashboard and access to more complex Dask collections such as `futures`. \n" 831 | ] 832 | }, 833 | { 834 | "cell_type": "markdown", 835 | "id": "e2d506bb-1c20-46f7-a0ea-2d9b10ffce6f", 836 | "metadata": {}, 837 | "source": [ 838 | "#### 2.1. Local Cluster\n", 839 | "\n", 840 | "A Dask Local Cluster refers to a group of worker processes that run on a single machine and are managed by a single Dask scheduler. 
\n", 841 | "\n", 842 | "This is useful for situations where the computational requirements are not large enough to warrant the use of a full cluster of separate machines. It provides an easy way to run parallel computations on a single machine, without the need for complex cluster management or other infrastructure." 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "id": "bc1a0d22-e14f-46cc-9e54-157d098f5380", 848 | "metadata": {}, 849 | "source": [ 850 | "##### Let's start by creating a Local Cluster\n", 851 | "\n", 852 | "For this we need to set up a `LocalCluster` using `dask.distributed` and connect a `client` to it. " 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": 7, 858 | "id": "bcf3e27b-7ac6-4250-8e01-3836fd7e0c18", 859 | "metadata": {}, 860 | "outputs": [ 861 | { 862 | "data": { 863 | "text/html": [ 864 | "
\n", 865 | "
\n", 866 | "
\n", 867 | "

Client

\n", 868 | "

Client-528e046a-a5a4-11ed-928c-3cecef1b11fa

\n", 869 | " \n", 870 | "\n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | "\n", 878 | " \n", 879 | " \n", 880 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | "\n", 887 | "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", 881 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/8787/status\n", 882 | "
\n", 888 | "\n", 889 | " \n", 890 | "
\n", 891 | "

Cluster Info

\n", 892 | "
\n", 893 | "
\n", 894 | "
\n", 895 | "
\n", 896 | "

LocalCluster

\n", 897 | "

ecdf1399

\n", 898 | " \n", 899 | " \n", 900 | " \n", 903 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 911 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | "\n", 920 | "\n", 921 | " \n", 922 | "
\n", 901 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/8787/status\n", 902 | " \n", 904 | " Workers: 4\n", 905 | "
\n", 909 | " Total threads: 4\n", 910 | " \n", 912 | " Total memory: 16.00 GiB\n", 913 | "
Status: runningUsing processes: True
\n", 923 | "\n", 924 | "
\n", 925 | " \n", 926 | "

Scheduler Info

\n", 927 | "
\n", 928 | "\n", 929 | "
\n", 930 | "
\n", 931 | "
\n", 932 | "
\n", 933 | "

Scheduler

\n", 934 | "

Scheduler-680550a6-4f52-4d2d-99d6-6adbb921c7c9

\n", 935 | " \n", 936 | " \n", 937 | " \n", 940 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 948 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 956 | " \n", 959 | " \n", 960 | "
\n", 938 | " Comm: tcp://127.0.0.1:46436\n", 939 | " \n", 941 | " Workers: 4\n", 942 | "
\n", 946 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/8787/status\n", 947 | " \n", 949 | " Total threads: 4\n", 950 | "
\n", 954 | " Started: Just now\n", 955 | " \n", 957 | " Total memory: 16.00 GiB\n", 958 | "
\n", 961 | "
\n", 962 | "
\n", 963 | "\n", 964 | "
\n", 965 | " \n", 966 | "

Workers

\n", 967 | "
\n", 968 | "\n", 969 | " \n", 970 | "
\n", 971 | "
\n", 972 | "
\n", 973 | "
\n", 974 | " \n", 975 | "

Worker: 0

\n", 976 | "
\n", 977 | " \n", 978 | " \n", 979 | " \n", 982 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 990 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1004 | " \n", 1005 | "\n", 1006 | " \n", 1007 | "\n", 1008 | " \n", 1009 | "\n", 1010 | "
\n", 980 | " Comm: tcp://127.0.0.1:45124\n", 981 | " \n", 983 | " Total threads: 1\n", 984 | "
\n", 988 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/39912/status\n", 989 | " \n", 991 | " Memory: 4.00 GiB\n", 992 | "
\n", 996 | " Nanny: tcp://127.0.0.1:42313\n", 997 | "
\n", 1002 | " Local directory: /glade/scratch/negins/dask-worker-space/worker-qsda33zu\n", 1003 | "
\n", 1011 | "
\n", 1012 | "
\n", 1013 | "
\n", 1014 | " \n", 1015 | "
\n", 1016 | "
\n", 1017 | "
\n", 1018 | "
\n", 1019 | " \n", 1020 | "

Worker: 1

\n", 1021 | "
\n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1027 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1035 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1049 | " \n", 1050 | "\n", 1051 | " \n", 1052 | "\n", 1053 | " \n", 1054 | "\n", 1055 | "
\n", 1025 | " Comm: tcp://127.0.0.1:38609\n", 1026 | " \n", 1028 | " Total threads: 1\n", 1029 | "
\n", 1033 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/42328/status\n", 1034 | " \n", 1036 | " Memory: 4.00 GiB\n", 1037 | "
\n", 1041 | " Nanny: tcp://127.0.0.1:41562\n", 1042 | "
\n", 1047 | " Local directory: /glade/scratch/negins/dask-worker-space/worker-cpl_hk52\n", 1048 | "
\n", 1056 | "
\n", 1057 | "
\n", 1058 | "
\n", 1059 | " \n", 1060 | "
\n", 1061 | "
\n", 1062 | "
\n", 1063 | "
\n", 1064 | " \n", 1065 | "

Worker: 2

\n", 1066 | "
\n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1072 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1080 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1094 | " \n", 1095 | "\n", 1096 | " \n", 1097 | "\n", 1098 | " \n", 1099 | "\n", 1100 | "
\n", 1070 | " Comm: tcp://127.0.0.1:43616\n", 1071 | " \n", 1073 | " Total threads: 1\n", 1074 | "
\n", 1078 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/44797/status\n", 1079 | " \n", 1081 | " Memory: 4.00 GiB\n", 1082 | "
\n", 1086 | " Nanny: tcp://127.0.0.1:40001\n", 1087 | "
\n", 1092 | " Local directory: /glade/scratch/negins/dask-worker-space/worker-qnoend2w\n", 1093 | "
\n", 1101 | "
\n", 1102 | "
\n", 1103 | "
\n", 1104 | " \n", 1105 | "
\n", 1106 | "
\n", 1107 | "
\n", 1108 | "
\n", 1109 | " \n", 1110 | "

Worker: 3

\n", 1111 | "
\n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1117 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1125 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1139 | " \n", 1140 | "\n", 1141 | " \n", 1142 | "\n", 1143 | " \n", 1144 | "\n", 1145 | "
\n", 1115 | " Comm: tcp://127.0.0.1:40873\n", 1116 | " \n", 1118 | " Total threads: 1\n", 1119 | "
\n", 1123 | " Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/40512/status\n", 1124 | " \n", 1126 | " Memory: 4.00 GiB\n", 1127 | "
\n", 1131 | " Nanny: tcp://127.0.0.1:36033\n", 1132 | "
\n", 1137 | " Local directory: /glade/scratch/negins/dask-worker-space/worker-0sqdrcdn\n", 1138 | "
\n", 1146 | "
\n", 1147 | "
\n", 1148 | "
\n", 1149 | " \n", 1150 | "\n", 1151 | "
\n", 1152 | "
\n", 1153 | "\n", 1154 | "
\n", 1155 | "
\n", 1156 | "
\n", 1157 | "
\n", 1158 | " \n", 1159 | "\n", 1160 | "
\n", 1161 | "
" 1162 | ], 1163 | "text/plain": [ 1164 | "" 1165 | ] 1166 | }, 1167 | "execution_count": 7, 1168 | "metadata": {}, 1169 | "output_type": "execute_result" 1170 | } 1171 | ], 1172 | "source": [ 1173 | "from dask.distributed import LocalCluster, Client\n", 1174 | "\n", 1175 | "cluster = LocalCluster()\n", 1176 | "client = Client(cluster)\n", 1177 | "client" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "markdown", 1182 | "id": "2e3384ef-9a9a-4425-bd77-6ca811ff0577", 1183 | "metadata": {}, 1184 | "source": [ 1185 | "☝️ Click the Dashboard link above.\n", 1186 | "\n", 1187 | "👈 Or click the “Search” 🔍 button in the dask-labextension dashboard.\n", 1188 | "\n" 1189 | ] 1190 | }, 1191 | { 1192 | "cell_type": "markdown", 1193 | "id": "d556c2ff-333f-48bf-9183-689ff589f011", 1194 | "metadata": {}, 1195 | "source": [ 1196 | "If no arguments are specified in `LocalCluster` it will automatically detect the number of CPU cores your system has and the amount of memory and create workers to appropriately fill that.\n", 1197 | "\n", 1198 | "A `LocalCluster` will use the full resources of the current JupyterLab session. For example, if you used NCAR JupyterHub, it will use the number of CPUs selected. 
\n", 1199 | "\n", 1200 | "\n", 1201 | "Note that `LocalCluster()` takes a lot of optional arguments, allowing you to configure the number of processes/threads, memory limits and other settings.\n" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "markdown", 1206 | "id": "0b0f0121-2307-4ea6-b3f7-9e7925c53ca8", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "You can also find your cluster dashboard link using : " 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "code", 1214 | "execution_count": 8, 1215 | "id": "c7a9a540-2592-44fb-b77e-4474d8521125", 1216 | "metadata": {}, 1217 | "outputs": [ 1218 | { 1219 | "data": { 1220 | "text/plain": [ 1221 | "'https://jupyterhub.hpc.ucar.edu/stable/user/negins/casper_16_4/proxy/8787/status'" 1222 | ] 1223 | }, 1224 | "execution_count": 8, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "cluster.dashboard_link" 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": 9, 1236 | "id": "a1f41158-4165-4da8-a2d1-5afb68edf513", 1237 | "metadata": {}, 1238 | "outputs": [ 1239 | { 1240 | "name": "stdout", 1241 | "output_type": "stream", 1242 | "text": [ 1243 | "CPU times: user 499 ms, sys: 142 ms, total: 641 ms\n", 1244 | "Wall time: 10.1 s\n" 1245 | ] 1246 | }, 1247 | { 1248 | "data": { 1249 | "text/plain": [ 1250 | "array([10.00024901, 10.00025024, 10.00001342, ..., 10.00006029,\n", 1251 | " 9.99957823, 10.00021491])" 1252 | ] 1253 | }, 1254 | "execution_count": 9, 1255 | "metadata": {}, 1256 | "output_type": "execute_result" 1257 | } 1258 | ], 1259 | "source": [ 1260 | "%%time\n", 1261 | "# -- dask array using the default\n", 1262 | "xd = da.random.normal(10, 0.1, size=(30_000, 30_000), chunks=(3000, 3000))\n", 1263 | "yd = xd.mean(axis=0)\n", 1264 | "yd.compute()" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "markdown", 1269 | "id": "1bf2d2b0-7cd9-4e25-98f4-46a801d29964", 1270 | "metadata": {}, 1271 | "source": [ 1272 | "Always remember to close 
your local Dask cluster:" 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": 10, 1278 | "id": "0a292434-3f33-480e-b332-be809f8c8571", 1279 | "metadata": {}, 1280 | "outputs": [], 1281 | "source": [ 1282 | "client.shutdown()" 1283 | ] 1284 | }, 1285 | { 1286 | "cell_type": "markdown", 1287 | "id": "8c031818-16fd-44ef-b94c-f0f1843567f7", 1288 | "metadata": {}, 1289 | "source": [ 1290 | "### Dask Distributed (Cluster)\n", 1291 | "\n", 1292 | "So far we have talked about running a job on a local machine.\n", 1293 | "\n", 1294 | "Dask can be deployed on distributed infrastructure, such as a an HPC system or a cloud computing system.\n", 1295 | "\n", 1296 | "\n", 1297 | " \n", 1298 | "Dask Clusters have different names corresponding to different computing environments. Some examples are `dask-jobqueue` for your HPC systems (including `PBSCluster`) or Kubernetes Cluster for machines on the Cloud. \n", 1299 | "\n", 1300 | "In section 5, we will talk more about Dask on HPC Systems. 
" 1301 | ] 1302 | } 1303 | ], 1304 | "metadata": { 1305 | "kernelspec": { 1306 | "display_name": "NPL 2023a", 1307 | "language": "python", 1308 | "name": "npl-2023a" 1309 | }, 1310 | "language_info": { 1311 | "codemirror_mode": { 1312 | "name": "ipython", 1313 | "version": 3 1314 | }, 1315 | "file_extension": ".py", 1316 | "mimetype": "text/x-python", 1317 | "name": "python", 1318 | "nbconvert_exporter": "python", 1319 | "pygments_lexer": "ipython3", 1320 | "version": "3.9.15" 1321 | }, 1322 | "widgets": { 1323 | "application/vnd.jupyter.widget-state+json": { 1324 | "state": {}, 1325 | "version_major": 2, 1326 | "version_minor": 0 1327 | } 1328 | } 1329 | }, 1330 | "nbformat": 4, 1331 | "nbformat_minor": 5 1332 | } 1333 | -------------------------------------------------------------------------------- /notebooks/05-dask-hpc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "990cfa4c-2117-4435-9806-ff9048890398", 6 | "metadata": {}, 7 | "source": [ 8 | "\"NCAR\n", 12 | "\n", 13 | "# Dask on HPC - Starting Clusters, Monitoring, and Debugging\n", 14 | "\n", 15 | "**ESDS Dask tutorial | 06 February, 2023** \n", 16 | "\n", 17 | "Negin Sobhani, Brian Vanderwende, Deepak Cherian, Ben Kirk \n", 18 | "Computational & Information Systems Lab (CISL) \n", 19 | "[negins@ucar.edu](mailto:negins@ucar.edu), [vanderwb@ucar.edu](mailto:vanderwb@ucar.edu)\n", 20 | "\n", 21 | "------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "36f1fc96-8189-48b4-a61c-c8f2d2fc1a58", 27 | "metadata": {}, 28 | "source": [ 29 | "### In this tutorial, you will learn:\n", 30 | "\n", 31 | "* How to configure and initialize an HPC Dask cluster via `dask-jobqueue`\n", 32 | "* How to manage and monitor the resource usage of your Dask workers\n", 33 | "* Understanding Dask worker logs\n", 34 | "* Controlling how and where data spills from memory to disk\n", 35 | "* Analyzing the impact of 
your Dask workflow on your allocation\n", 36 | "\n", 37 | "**Related Documentation**\n", 38 | "\n", 39 | "* [dask-jobqueue documentation](https://jobqueue.dask.org/en/latest/)\n", 40 | "* [Diagnosing Distributed Dask Performance](https://distributed.dask.org/en/stable/diagnosing-performance.html)\n", 41 | "* [Dask HPC Configuration Examples](https://jobqueue.dask.org/en/latest/configurations.html)\n", 42 | "* [Managing HPC allocations at NCAR](https://arc.ucar.edu/knowledge_base/70549817)\n", 43 | "\n", 44 | "---" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "c6b54bf9-f29f-4aea-bc32-706e82472165", 50 | "metadata": {}, 51 | "source": [ 52 | "## Starting HPC Dask clusters with `dask-jobqueue`\n", 53 | "\n", 54 | "A defining feature of most HPC systems is the batch scheduler - *Slurm, PBS, LSF, etc...* These schedulers allow us to access the significant resources of the system and scale far beyond what a personal workstation is capable of.\n", 55 | "\n", 56 | "Using Dask on an HPC system is no different - we need to interact with the scheduler to provide Dask with ample compute resources. We *could* first start a job with multiple cores and a large amount of memory, and then use the **LocalCluster** to spawn workers. However, this approach only scales to a single node.\n", 57 | "\n", 58 | "The typical approach is to let Dask request resources directly from the job scheduler via a scheduler-specific cluster type. Such clusters are provided by the add-on `dask-jobqueue` package." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "fe8d2cd0-2836-4abb-ac00-79cfc2081cd6", 64 | "metadata": {}, 65 | "source": [ 66 | "### Creating a scheduled-cluster\n", 67 | "\n", 68 | "Since we use the PBS Pro scheduler at NCAR, we will use the **PBSCluster** Dask scheduler from `dask-jobqueue`. Initialization is similar to a **LocalCluster**, but with unique parameters specific to creating batch jobs."
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "d317a0ce-f1b8-47c5-8790-a9fd266ca248", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import dask\n", 79 | "from dask_jobqueue import PBSCluster\n", 80 | "from dask.distributed import Client" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "36531de7-4dbe-442b-9d22-65c0370412d4", 86 | "metadata": {}, 87 | "source": [ 88 | "The parameters of the `PBSCluster` provide a basic template for the resources that will be assigned to each job..." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "4b0ecda0-2777-4fcd-a1a8-f3cd462f4fe6", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Create a PBS cluster object\n", 99 | "cluster = PBSCluster(\n", 100 | " job_name = 'dask-wk23-hpc',\n", 101 | " cores = 1,\n", 102 | " memory = '4GiB',\n", 103 | " processes = 1,\n", 104 | " local_directory = '/local_scratch/pbs.$PBS_JOBID/dask/spill',\n", 105 | " resource_spec = 'select=1:ncpus=1:mem=4GB',\n", 106 | " queue = 'casper',\n", 107 | " walltime = '30:00',\n", 108 | " interface = 'ext'\n", 109 | ")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "ebfcb342-71f3-4def-b7f7-212e0c8ee395", 115 | "metadata": {}, 116 | "source": [ 117 | "*Since we are working on a shared system, you may get a port-in-use warning. This is no cause for alarm, but make sure you are not starting a duplicate cluster unintentionally.*\n", 118 | "\n", 119 | "We should pause and consider some of these settings...\n", 120 | "\n", 121 | "* The `cores` and `memory` parameters are used by Dask to define workers, while the `resource_spec` is used by PBS to define jobs. In this single-worker config, they should match!\n", 122 | "* PBS uses *GB* to mean 1024-based storage units. 
`dask-jobqueue` accurately calls these `GiB`.\n", 123 | "* We use `interface='ext'` to instruct Dask to use TCP over the high-speed ethernet instead of other, slower, ethernet devices.\n", 124 | "\n", 125 | "Note also that we are using one worker per PBS job. This is a reasonable default on Casper, but it is possible to group workers together on one or more PBS jobs as well by increasing the `cores` and `ncpus`. Here are some considerations:\n", 126 | "\n", 127 | "**Using fewer workers per job will:**\n", 128 | "* Increase job throughput on most systems (easier to backfill smaller jobs)\n", 129 | "* Will always avoid interpreter lock issues\n", 130 | "* Is conceptually easy to understand\n", 131 | "* May be more robust if system is unstable\n", 132 | "* Can speed up file-reads in some situations\n", 133 | "\n", 134 | "**Using more workers per job will:**\n", 135 | "* Less overhead in thread-friendly workflows\n", 136 | "* May allow for slightly higher memory thresholds since they will share a pool" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "308fbaa0-e295-4d31-a9d0-13e8795a7c7d", 142 | "metadata": {}, 143 | "source": [ 144 | "#### It is good practice to validate your cluster before initiating any workers, by outputting the job script Dask will create" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "10475615-7181-4b46-b807-7e210d488c73", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "print(cluster.job_script())" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "06fb88ef-1c5c-4041-905b-31bd4fe09fed", 160 | "metadata": {}, 161 | "source": [ 162 | "*Note how some settings are showing up despite me not setting them... where does my account come from, for example?*\n", 163 | "\n", 164 | "Let's take a detour for a moment..."
165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "6098b8a8-5007-4618-a6a3-2b2dbf0ad4a6", 170 | "metadata": {}, 171 | "source": [ 172 | "### Dask configuration files\n", 173 | "\n", 174 | "We can customize the behavior of Dask using YAML-based configuration files. These have some advantages:\n", 175 | "\n", 176 | "* Eliminate user-specific configuration from your notebooks/scripts\n", 177 | "* Avoid repetition in defining clusters and other Dask objects\n", 178 | "* Potentially reduce errors from forgetting important settings\n", 179 | "\n", 180 | "And also some downsides:\n", 181 | "\n", 182 | "* Obfuscates settings from others (including your future self!)\n", 183 | "* Reduces portability and ease of debugging\n", 184 | "\n", 185 | "User configuration files are stored in `~/.config/dask` by default. System administrators may also provide default Dask configuration in `/etc/dask` or via the `DASK_ROOT_CONFIG` environment variable." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "8a955a20-eeac-4ea9-8220-e7951fe18e05", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "!ls ~/.config/dask" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "7599a5e1-ffab-4a8f-9998-92e7dfc933ef", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Programmatically view configuration file(s) within Python\n", 206 | "from dask import config\n", 207 | "config.refresh()\n", 208 | "config.get('jobqueue.pbs')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "eae2134d-ef6d-4d43-8a89-2e508b0ef281", 214 | "metadata": {}, 215 | "source": [ 216 | "### Live Performance Monitoring\n", 217 | "\n", 218 | "Using `dask.distributed` provides us with a powerful diagnostic tool you have already seen: the *Dashboard*. 
The Dashboard can be integrated into your Jupyter environment in two ways - either with a separate website accessible from the Client widget, or as tabs in your JupyterLab interface via the `dask-labextension` add-on.\n", 219 | "\n", 220 | "**Tip:** JupyterLab Dashboard tabs can be saved as a \"workspace\" and loaded back in future sessions.\n", 221 | "\n", 222 | "Let's see how both can be used to monitor Dask workers." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "d394abe8-d31b-454d-b3eb-dbf8ae16ddaa", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "# Create the client to load the Dashboard\n", 233 | "client = Client(cluster)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "44a153c2-129c-49fd-ba77-7d13fa5048c1", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# Display the client repr\n", 244 | "client" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "882958c7-fae6-45c5-a6bf-a25b80a4b98c", 250 | "metadata": {}, 251 | "source": [ 252 | "The Dashboard is immediately accessible above when using NCAR's JupyterHub. This URL can also be entered into the Dashboard extension (click the Dask logo on the left toolbar), which allows you to add useful screens like `Task Stream` and `Workers Memory` to your Lab interface." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "bc7cc604-dfe2-4d7e-b305-3d71ee64428e", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "# Scale the cluster to 2 workers (which will use 2 jobs here)\n", 263 | "cluster.scale(2)\n", 264 | "\n", 265 | "# Block progress until workers have spawned (typically only in demos and benchmarks!)\n", 266 | "client.wait_for_workers(2)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "0a45c3b5-aad4-45b1-978b-a2374e83f260", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# See the workers from the cluster object\n", 277 | "cluster.workers" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "ad5a6709-858a-4434-9403-56ffe7a04f2e", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "# See the workers in the job scheduler\n", 288 | "!qstat -u $USER" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "id": "fba8a3f9-98e8-4260-8b7e-4b78e3f5a378", 294 | "metadata": {}, 295 | "source": [ 296 | "*As soon as we scale the cluster up, the clock is ticking on these PBS jobs. Be mindful of idle workers when using a batch scheduler!*" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "2b84d057-b1b4-48d4-b253-62097086fcc2", 302 | "metadata": {}, 303 | "source": [ 304 | "#### Dashboard demo: multi-file Xarray data analysis\n", 305 | "\n", 306 | "To demonstrate how the Dashboard can be useful, let's do some simple analysis of data files using Xarray. Here we load 19 days of GOES5 data, and compute the mean near-surface temperature across the western US." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "id": "a62fe240-c8bc-4857-9e03-7af100baa2cb", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "import xarray as xr" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "6d8edc52-70ad-4a20-ad41-3596838f621b", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "# Use a multi-file import and load data in parallel\n", 327 | "ds = xr.open_mfdataset(\"/glade/collections/rda/data/ds313.0/orig_res/2022/GEOS5_orig_res_202201[0-1]*.nc\", parallel = True)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "dcf1b6f7-3290-4ffb-a982-c5e209e482b0", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# Show the total size of the variable (this has not been read in yet!)\n", 338 | "print(\"Size of Variable = {:5.2f} GiB\".format(ds.T.nbytes / 1024 ** 3))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "id": "d73db672-0e3e-4ad4-b09a-a762b1977901", 344 | "metadata": {}, 345 | "source": [ 346 | "This data is much too big for our worker template, but as we have seen the chunks will be smaller in size. We can see if they will fit in RAM or cause spill by querying the data array" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "77a4042b-a3a2-4731-9ebb-33011337fecb", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# The graphical repr of one DaskArray - T\n", 357 | "ds.T" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "37816544-50c9-42b2-835c-ce24a650f3c5", 363 | "metadata": {}, 364 | "source": [ 365 | "It looks like our data chunks will fit into RAM, but we can verify using the Dashboard. Let's construct our computation. Here we do the following:\n", 366 | "1. Subset the \"western US\" from the data via lat/lon slices\n", 367 | "2. 
Take the mean of temperature values across our western US box\n", 368 | "3. Select the near-surface level (0)\n", 369 | "\n", 370 | "Remember, we are just creating the task graph here. No work will occur yet in our cluster." 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "0d37e389-3f74-4798-ae85-0a94f9b5c19f", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# Create our task graph\n", 381 | "sfc_mean_graph = ds.T.sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lat\",\"lon\"]).isel(lev = 0)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "id": "ee71e697-92c0-4832-9395-0864f3c7fac7", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Tip - double-click the figure to see the actual size\n", 392 | "dask.visualize(sfc_mean_graph)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "93316148-6748-42b0-a1c1-c3f0d9e2966b", 398 | "metadata": {}, 399 | "source": [ 400 | "Now, we can use `.compute()` to start the computation on our cluster. Keep an eye on the dashboard plots to follow progress." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "id": "4b746a91-75cc-4173-9cc1-9a6d1afd6c38", 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "%%time\n", 411 | "result = sfc_mean_graph.compute()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "ab77998c-2fd0-4672-be8c-8fedd11660a4", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "result.plot()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "id": "a55e3e88-d0b7-4d36-a2f0-766382c6d819", 427 | "metadata": {}, 428 | "source": [ 429 | "Now, let's see what speedup we can get by manually scaling up our computation by 2x. This is not possible (*beyond a certain hardware limit*) on a `LocalCluster`, but is easy to do using `dask-jobqueue`!" 
430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "6c28c82e-e191-42ca-85ba-4a911a4ca50c", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "# Scale the cluster to 4 workers\n", 440 | "cluster.scale(4)\n", 441 | "client.wait_for_workers(4)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "0221d503-9bcd-4cd7-9999-8449bcf556b4", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "# How does this look in PBS?\n", 452 | "!qstat -u vanderwb" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "id": "02f3f86d-1a98-400f-bb9f-16739b7ad666", 458 | "metadata": {}, 459 | "source": [ 460 | "One downside of scaling is that you end up with a worker pool that has different amounts of wallclock time remaining. The flow of your script across time is something to consider - if you need to spin up more workers after some collection of them has used significant walltime, it may make sense to first scale down your cluster to zero (or run `client.restart()`) and then instantiate the new workers.\n", 461 | "\n", 462 | "Here, we will also demonstrate another type of performance monitoring provided by `dask.distributed` - the *performance report*. Using a context manager, we can profile the computational components in the task stream and store it as an HTML file for future analysis. It provides a hard copy analysis of the computation, though unfortunately it does not record worker memory usage." 
463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "id": "36abd4e9-4294-4eb9-a5e4-2f3d85a2f7c6", 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "# Let's try generating a \"performance report\" this time\n", 473 | "from dask.distributed import performance_report" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "id": "2e4d4d5a-a3f8-4f8d-a3af-fd033c51f337", 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "%%time\n", 484 | "# Since metrics are captured live anyway, the overhead from the report is small\n", 485 | "with performance_report(filename=\"dask-report.html\"):\n", 486 | " result = sfc_mean_graph.compute()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "id": "5793937e-5444-4c6c-af82-0c758076979c", 492 | "metadata": {}, 493 | "source": [ 494 | "Hopefully, we see a significant improvement in time-to-solution using 2x the workers.\n", 495 | "\n", 496 | "Another improvement we can make to our computation is to reduce the size of the problem down as much as possible before doing meaningful work. 
Let's try rearranging our graph:" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "id": "c9a900c8-be09-4039-a95a-2617bcd463c3", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Create our improved task graph\n", 507 | "sfc_mean_graph = ds.T.sel(lon = slice(235, 255), lat = slice(30,50)).isel(lev = 0).mean([\"lat\",\"lon\"])" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "id": "22bf9403-0186-46bd-8cd5-c0d613be6162", 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "%%time\n", 518 | "result = sfc_mean_graph.compute()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "id": "2412a835-f759-4af2-a34a-b488295e4909", 524 | "metadata": {}, 525 | "source": [ 526 | "Since flattening the level dimension will reduce data compared to the lat-lon box, let's do that first to make subsequent operations cheaper.\n", 527 | "\n", 528 | "|Indexing|Dimensions|Pts Eliminated|\n", 529 | "|-|-|-|\n", 530 | "|isel|721 x 1152 x 71|58,972,032|\n", 531 | "|sel|(721 - 81) x (1152 - 81) x 72|49,351,680|" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "id": "0a2fd78d-e116-4e5b-95ad-a75e5e24b6d3", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "# Can we do better?\n", 542 | "sfc_mean_graph = ds.T.isel(lev = 0).sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lat\",\"lon\"])" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "id": "7224a727-6873-4988-afcf-b4bf40182015", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "%%time\n", 553 | "result = sfc_mean_graph.compute()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "id": "bc4e065c-c2b1-43cb-9c06-65b3098ee881", 559 | "metadata": {}, 560 | "source": [ 561 | "In this case, optimizations to the base operation can yield better speed improvements than doubling the dask worker 
count. **Optimize your workflow first, if possible - then parallelize with Dask if still necessary.**\n", 562 | "\n", 563 | "Let's plot our results again to inspect for differences." 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "3d727024-d802-46b0-8138-a08e6f118433", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "result.plot()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "id": "f24fb2f4-b5fb-46c3-9ac4-6df61e4baf1f", 579 | "metadata": {}, 580 | "source": [ 581 | "#### Aside: Clusters can also adaptively scale\n", 582 | "\n", 583 | "For interactive, exploratory work, *[adaptive scaling](https://docs.dask.org/en/stable/how-to/adaptive.html)* can be useful (also very useful on cloud platforms). This allows the cluster to dynamically scale up and down based on the (Dask) scheduler's estimation of resource needs. This capability is highly customizable, but one basic method would be to set bounds on the number of worker jobs that can be used:\n", 584 | "\n", 585 | "```\n", 586 | "cluster.adapt(minimum=0, maximum=12)\n", 587 | "```\n", 588 | "\n", 589 | "Another benefit of adaptive scaling is that you can use the worker `--lifetime` argument to tell Dask to cleanly end work on a worker and restart the PBS job. If you stagger the start of your workers, Dask will be able to shuffle tasks appropriately to produce a so-called *[infinite workload](https://jobqueue.dask.org/en/latest/advanced-tips-and-tricks.html#how-to-handle-job-queueing-system-walltime-killing-workers)*.\n", 590 | "\n", 591 | "On busy systems, adaptive scaling can slow down bursty computations because of queue waits between scale-down and scale-up cycles." 
592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "id": "2e73976d-d2e5-45fe-a29b-06346b9373b7", 597 | "metadata": {}, 598 | "source": [ 599 | "#### Optimization: Persisting data in worker memory\n", 600 | "\n", 601 | "Sometimes you will need to compute multiple parameters on data from Dask objects. Using `.persist()` to store intermediate data in worker memory can save computational time if used appropriately. The raw data can be persisted too, of course, but watch out for exhausting worker memory.\n", 602 | "\n", 603 | "Here we compare the time it takes - with and without persisting intermediate results - to compute our level-0 mean, a level-10 mean, and a mean across all model levels.\n", 604 | "\n", 605 | "We will also introduce another diagnostic tool here, the `MemorySampler` context manager." 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "id": "767453d0-148b-4281-b767-5831569868f2", 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "from distributed.diagnostics import MemorySampler\n", 616 | "ms = MemorySampler()" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "228524e0-3ffa-4740-a4a1-eaf9fc101c2c", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "%%time\n", 627 | "# Without persistance\n", 628 | "with ms.sample(\"Original\"):\n", 629 | " r1 = ds.T.isel(lev = 0).sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lat\",\"lon\"]).compute()\n", 630 | " r2 = ds.T.isel(lev = 10).sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lat\",\"lon\"]).compute()\n", 631 | " ra = ds.T.sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lev\",\"lat\",\"lon\"]).compute()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "id": "39882b19-e745-4b44-a2c4-d19ffa662aa1", 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "%%time\n", 642 | "# With persistance\n", 643 | "with 
ms.sample(\"Persist\"):\n", 644 | " T_means = ds.T.sel(lon = slice(235, 255), lat = slice(30,50)).mean([\"lat\",\"lon\"]).persist()\n", 645 | " r1 = T_means.isel(lev = 0).compute()\n", 646 | " r2 = T_means.isel(lev = 10).compute()\n", 647 | " ra = T_means.mean(\"lev\").compute()" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "id": "9f0a952c-02c1-4db3-8703-e66b39886299", 653 | "metadata": {}, 654 | "source": [ 655 | "Without persisting the intermediate results, Dask will only store r1 and r2 in worker memory, and so the indexing operations must be done from scratch each time.\n", 656 | "\n", 657 | "Let's look at the memory usage..." 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": null, 663 | "id": "4966cc57-42f6-4f81-9386-dc5c5dbe3043", 664 | "metadata": {}, 665 | "outputs": [], 666 | "source": [ 667 | "ms.plot(align=True)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "id": "0b47939d-f4a8-4c3f-9e8c-4d4e47e01673", 673 | "metadata": {}, 674 | "source": [ 675 | "Because the intermediate results are required for the second calculation in the original case, we do not even use more memory. A clear win!\n", 676 | "\n", 677 | "Of course, when persisting data it is extra important to clean up. Running `del` on your persisted client variable will clear those data from worker memory (*as long as they are not referenced by other variables*). The **progress** dashboard is a useful reminder that we have data persisted." 
678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "id": "47eb05e6-f35e-473b-bed8-55e625d8b6f7", 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "del T_means" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "id": "aa1b4fb0-ed7e-48be-a8f4-550e5edbc415", 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "# Close current workers\n", 698 | "cluster.scale(0)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "id": "f503bce5-6ec0-4c4d-9ace-a75e92624768", 704 | "metadata": {}, 705 | "source": [ 706 | "### Debugging workers case study: memory and spill-to-disk" 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "id": "24a441ec-03b5-4f38-92a4-3b5028c96f65", 712 | "metadata": {}, 713 | "source": [ 714 | "In this section, we will demonstrate two common considerations when using Dask on HPC:\n", 715 | "* Dask data spilling to disk\n", 716 | "* Interacting with `dask.distributed` worker logs\n", 717 | "\n", 718 | "For this case study, we will generate progressively larger Dask arrays that eventually trigger memory conditions. Dask workers handle data in different ways in the following memory regimes:\n", 719 | "\n", 720 | "|Suggested Threshold|Case Study Value|Worker Behavior|\n", 721 | "|-|-|-|\n", 722 | "|0.6 (managed mem)|2.4 GB|Data is allocated on disk (spill)|\n", 723 | "|0.7 (process mem)|2.8 GB|Data is allocated on disk (spill)|\n", 724 | "|0.8|3.2 GB|New data allocation is paused|\n", 725 | "|0.95|3.8 GB|Worker is killed to avoid OOM|\n", 726 | "\n", 727 | "These thresholds can be set at cluster creation time or overridden by your Dask Distributed configuration file." 
728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "id": "1c2a719a-6e10-4aa5-a267-9a320616643a", 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "import dask.array as da\n", 738 | "from distributed.worker import logger" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "id": "8e07b29d-2d9f-4a2a-93f9-6422df47b459", 744 | "metadata": {}, 745 | "source": [ 746 | "It is possible to write directly to worker logs (PBS job logs in our case) using the worker `logger` from Dask Distributed. Here, we define a function to call the logger on each worker, which we will run eagerly via `client.run`.\n", 747 | "\n", 748 | "Keep an eye on the worker memory Dashboard panel as our for loop proceeds..." 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "id": "55a817f6-6a9e-440d-b118-39ff99385cb6", 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "def log_message(chunk_size):\n", 759 | " logger.info(\"Current chunk size = {} MiB\".format(chunk_size))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "id": "a2eb9129-8105-4954-b00c-69c822f92bfc", 766 | "metadata": {}, 767 | "outputs": [], 768 | "source": [ 769 | "# Start up 4 new workers\n", 770 | "cluster.scale(4)\n", 771 | "client.wait_for_workers(4)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "id": "46f1ea0c-63a1-4e6a-9c20-819592047235", 778 | "metadata": {}, 779 | "outputs": [], 780 | "source": [ 781 | "for chunk_mib in [1600, 2400, 3200, 3900]:\n", 782 | " client.run(log_message, chunk_mib)\n", 783 | " chunk_size = chunk_mib / 8 * 1024 * 1024\n", 784 | " print(\"Mean of {} MiB random array = {:0.2f}\".format(chunk_mib, da.random.random((chunk_size * 4), chunks=(chunk_size)).mean().compute()))" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "id": 
"456149fc-3fe5-482b-83bb-8d0ab5f11673", 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "# List the most recent 4 worker logs - these should be our logs\n", 795 | "!ls -lrt dask-worker-logs | tail -n 4" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "id": "2090673f-d867-4b82-94d7-e43c527f43c5", 801 | "metadata": {}, 802 | "source": [ 803 | "We can open the log file in the Lab interface or a terminal and investigate the reason for the `KilledWorker` exception." 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "id": "ee6e1e52-9078-4bf5-8c03-edabbfbb7629", 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [ 813 | "# Let's look at the worker state in PBS after the failure\n", 814 | "!qstat -u vanderwb" 815 | ] 816 | }, 817 | { 818 | "cell_type": "markdown", 819 | "id": "35336aa6-3e52-48e1-957f-7e9e2fdfd0e2", 820 | "metadata": {}, 821 | "source": [ 822 | "Notice how the workers retried the final computation a few times before giving up. This behavior occurs when using the `nanny`, which attempts to restore workers when they are killed for various exceptions. (*the nanny is an additional lightweight process that monitors the health of the worker and enables functionality like full cluster restart*)\n", 823 | "\n", 824 | "Recall the aforementined memory thresholds - workers get killed by Dask before exceeding the absolute limit of a job (which if hit on certain systems, could kill the PBS job too). Because of this safety mechanism, our PBS jobs are still intact.\n", 825 | "\n", 826 | "If you are running a long computation that, *if restarted*, could exhaust the worker job's walltime, you can disable the nanny functionality and make exceptions fatal." 
827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "id": "ec0635f9-8b2c-44b4-86f7-0d35b6765242", 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "# Shut down our client (thus terminating workers)\n", 837 | "client.shutdown()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "id": "c7114f8e-bdd0-4d15-9802-f723981bf72d", 843 | "metadata": {}, 844 | "source": [ 845 | "## Analyzing your allocation" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "id": "e7a322b0-c521-4424-9d0c-440d35f1fb45", 851 | "metadata": {}, 852 | "source": [ 853 | "Dask does not provide integrated tools for analyzing the impact to your allocation (though you could back out values with care). This job is best suited for the scheduler itself, assuming that you've carefully instantiated your workers.\n", 854 | "\n", 855 | "Consider this value from our cluster config:" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "id": "e848ef4b-a28d-472a-98a0-27f75254e8ab", 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "cluster.job_name" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "id": "0d903623-91fc-4e45-8a02-dd629f5028dc", 871 | "metadata": {}, 872 | "source": [ 873 | "This means that every Dask worker I start via this workflow's `PBSCluster` will have the job name **dask-wk23-hpc**. We can leverage this along with our `qhist` utility to query the logs:\n", 874 | "\n", 875 | "```bash\n", 876 | "qhist -u $USER -N dask-wk23-hpc -f numcpus,elapsed -c > qhist-dask.out\n", 877 | "```\n", 878 | "\n", 879 | "This command will query all dask-workers in today's scheduler logs and output to CSV, which we redirect to a file. The `name` field is a powerful tool. If instead you use a worker name specific to your script, you can easily query only the jobs from that script. 
*Confusingly, if you wish to set the job name when creating your cluster object, use the `job_name` parameter, not `name`!*\n", 880 | "\n", 881 | "We can then read in the CSV using pandas (or even using a Dask Dataframe!)." 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "id": "0e2b5bd8-96d7-4180-b1b7-f491cab24804", 888 | "metadata": {}, 889 | "outputs": [], 890 | "source": [ 891 | "import pandas as pd" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "id": "730c5d64-0567-464e-87af-63840f878ed6", 898 | "metadata": {}, 899 | "outputs": [], 900 | "source": [ 901 | "dj = pd.read_csv(\"../data/qhist-dask.out\")\n", 902 | "dj.head()" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "id": "d766238b-6d3a-4aa3-85c5-71802d86a0d7", 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "print(\"Core-hours used by this notebook: {:.2f}\".format(sum(dj['NCPUs'] * dj['Elapsed (h)'])))" 913 | ] 914 | }, 915 | { 916 | "cell_type": "markdown", 917 | "id": "12fbea5d-a01d-46b4-bcba-9e07b91b6a2d", 918 | "metadata": {}, 919 | "source": [ 920 | "Records from `qhist` span the time that PBS was used on each system, so with a bit of forward-thinking prep work (picking descriptive worker names), you can easily trace back usage." 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "id": "b767a0a5-1622-419f-9088-2eeeb18d3f11", 926 | "metadata": {}, 927 | "source": [ 928 | "## Additional Considerations\n", 929 | "\n", 930 | "As we've shown, Dask is flexible and highly configurable. While we will not cover the following topics in depth, we encourage you to explore further on your own (and let us know in the *survey* if you would like learn more!).\n", 931 | "\n", 932 | "#### Dask Worker Count\n", 933 | "\n", 934 | "You may be wondering how to choose the number of workers. 
This question is tricky and can often depend on the state of the machine at any time. Here are some absolutes:\n", 935 | "\n", 936 | "1. Use more than a single worker unless debugging or profiling\n", 937 | "2. Do not use more workers than you have chunks - they will be idle\n", 938 | "\n", 939 | "And here are some guidelines:\n", 940 | "\n", 941 | "1. If you have to choose between more workers vs. more memory per worker, let the chunk size be your guide (more on this in the next notebook)\n", 942 | "2. In general, requesting fewer workers with more memory will take longer to get through the queue than more workers with less memory; *typically memory is more constrained than CPU cores on analysis machines*\n", 943 | "3. Using [**adaptive scaling**](https://docs.dask.org/en/stable/how-to/adaptive.html) will make your workflow throughput less sensitive to the state of the HPC jobs queue\n", 944 | "\n", 945 | "#### Using Dask on GPUs\n", 946 | "\n", 947 | "Much like Xarray can use NumPy arrays or Dask arrays, Dask itself can use NumPy arrays or [CuPy](https://cupy.dev/) arrays - the latter of which are GPU enabled on both NVIDIA and AMD hardware. For NVIDIA users, the [RAPIDS](https://rapids.ai/) suite offers cuDF - a drop in replacement for pandas DataFrames which can also be used with Dask. And efforts are underway to effectively use GPUs with Xarray and Dask.\n", 948 | "\n", 949 | "For a starting point, check out the [Dask with GPUs](https://www2.cisl.ucar.edu/events/gpu-series-multiple-gpus-python-dask) tutorial CISL offered in Summer 2022." 
950 | ] 951 | } 952 | ], 953 | "metadata": { 954 | "kernelspec": { 955 | "display_name": "NPL 2023a", 956 | "language": "python", 957 | "name": "npl-2023a" 958 | }, 959 | "language_info": { 960 | "codemirror_mode": { 961 | "name": "ipython", 962 | "version": 3 963 | }, 964 | "file_extension": ".py", 965 | "mimetype": "text/x-python", 966 | "name": "python", 967 | "nbconvert_exporter": "python", 968 | "pygments_lexer": "ipython3", 969 | "version": "3.9.15" 970 | }, 971 | "widgets": { 972 | "application/vnd.jupyter.widget-state+json": { 973 | "state": {}, 974 | "version_major": 2, 975 | "version_minor": 0 976 | } 977 | } 978 | }, 979 | "nbformat": 4, 980 | "nbformat_minor": 5 981 | } 982 | -------------------------------------------------------------------------------- /notebooks/06-dask-chunking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "990cfa4c-2117-4435-9806-ff9048890398", 6 | "metadata": {}, 7 | "source": [ 8 | "\"NCAR\n", 12 | "\n", 13 | "# Dask Chunking - Best Practices\n", 14 | "\n", 15 | "**ESDS Dask tutorial | 06 February, 2023** \n", 16 | "\n", 17 | "Negin Sobhani, Brian Vanderwende, Deepak Cherian, Ben Kirk \n", 18 | "Computational & Information Systems Lab (CISL) \n", 19 | "[negins@ucar.edu](mailto:negins@ucar.edu), [vanderwb@ucar.edu](mailto:vanderwb@ucar.edu)\n", 20 | "\n", 21 | "------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "36f1fc96-8189-48b4-a61c-c8f2d2fc1a58", 27 | "metadata": {}, 28 | "source": [ 29 | "### In this tutorial, you will learn:\n", 30 | "\n", 31 | "* Basic rules of thumb for chunking\n", 32 | "* The importance of conforming to file chunks\n", 33 | "* The impact of rechunking in the computational pipeline\n", 34 | "\n", 35 | "**Related Documentation**\n", 36 | "\n", 37 | "* [Dask Chunking Documentation](https://docs.dask.org/en/stable/array-chunks.html)\n", 38 | "* [Choosing Chunk Sizes Blog 
Post](https://blog.dask.org/2021/11/02/choosing-dask-chunk-sizes)\n", 39 | "* [Xarray Chunking Documentation](https://docs.xarray.dev/en/stable/user-guide/dask.html#chunking-and-performance)\n", 40 | "\n", 41 | "---" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "c6b54bf9-f29f-4aea-bc32-706e82472165", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "source": [ 51 | "## Chunking Considerations\n", 52 | "\n", 53 | "Determining the best approach for sizing your Dask chunks can be tricky and often requires intuition about both Dask and your particular dataset. There are various considerations you may need to account for depending on your workflow:\n", 54 | "\n", 55 | "* The size (in bytes) of your chunks vs your number of workers\n", 56 | "* The chunk layout of data read from disk (formats like HDF5, Zarr)\n", 57 | "* The access patterns of your computational pipeline\n", 58 | "\n", 59 | "**Dask Array with NumPy array chunks...**\n", 60 | "\n", 61 | "\"Dask\n", 62 | "\n", 63 | "\n", 64 | "----" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "6bad3462-c839-4d9c-916b-2268276a2440", 70 | "metadata": { 71 | "tags": [] 72 | }, 73 | "source": [ 74 | "### Starting up our PBS Cluster\n", 75 | "\n", 76 | "To demonstrate the effects of different chunking strategies, let's instantiate a `PBSCluster` with 4 workers" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "d317a0ce-f1b8-47c5-8790-a9fd266ca248", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import dask\n", 87 | "from dask_jobqueue import PBSCluster\n", 88 | "from dask.distributed import Client" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "4b0ecda0-2777-4fcd-a1a8-f3cd462f4fe6", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Create a PBS cluster object\n", 99 | "cluster = PBSCluster(\n", 100 | " job_name = 'dask-wk23-chunking',\n", 101 | " cores = 1,\n", 102 | " memory = 
'10GiB',\n", 103 | " processes = 1,\n", 104 | " local_directory = '/glade/scratch/vanderwb/temp/dask/spill/pbs.$PBS_JOBID',\n", 105 | " resource_spec = 'select=1:ncpus=1:mem=10GB',\n", 106 | " queue = 'casper',\n", 107 | " walltime = '30:00',\n", 108 | " interface = 'ext'\n", 109 | ")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "10475615-7181-4b46-b807-7e210d488c73", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Sanity-check our setup\n", 120 | "print(cluster.job_script())" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "107629fa-34a3-426c-828e-bafe6a8d7ca2", 127 | "metadata": { 128 | "tags": [] 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "client = Client(cluster)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "c82f23a4-a8f3-412e-b511-7b2f2c96c552", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "client" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "53a25dc4-9e04-4074-aa9e-21397b14c75b", 148 | "metadata": { 149 | "tags": [] 150 | }, 151 | "source": [ 152 | "----\n", 153 | "\n", 154 | "## Chunk size - Load balancing vs. Overhead\n", 155 | "\n", 156 | "There is always an optimal chunk size given your hardware setup and computation problem that is neither too big nor too small. 
Finding this chunk size often requires some trial and error, but it is helpful to know what you are looking to avoid:\n", 157 | "\n", 158 | "* **Too small** - if your chunks are too small, you will end up spending a significant and wasteful amount of time waiting for Dask to perform overhead (scheduling tasks, data communication) relative to the time spent computing\n", 159 | "* **Too large** - you run the risk of spilling to disk or memory failures and the scheduler also has a more difficult time load balancing\n", 160 | "\n", 161 | "The following rules of thumb are known, but it will vary according to your workflow:\n", 162 | "\n", 163 | "|Too Small|Possibly Too Small|Optimal|Too Large|\n", 164 | "|-|-|-|-|\n", 165 | "|< 1 MB|1-100 MB|100 MB - 1 GB|> Spill threshold|\n", 166 | "\n", 167 | "In practice, using chunks close to 0.1-0.5 GB in size works well.\n", 168 | "\n", 169 | "#### Let's test these rules of thumb..." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "dbb9aa86-182b-4ca6-bf62-d8a7884b0306", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# Spin up workers on our PBS cluster\n", 180 | "cluster.scale(4)\n", 181 | "client.wait_for_workers(4)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "6e389dd1-d6f7-46f4-8dc1-fae0543f9450", 187 | "metadata": {}, 188 | "source": [ 189 | "For this exercise, we will simply generate a random number **Dask Array** of sufficient size that it would not fit in our login session memory. Let's try different chunking strategies." 
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "7efdf2dc-2d99-43ad-a0a6-a3aa71049e1e", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "import dask.array as da" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "d9d2aa49-94d0-4cd1-bc28-db3aef3815d5", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "t = da.random.random((60000, 72000), chunks = (30000,36000))\n", 210 | "t" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "3c14713d-4374-463a-b0fe-394e43db5574", 216 | "metadata": {}, 217 | "source": [ 218 | "These chunks are too large. They will exceed our spill threshold (0.6-0.7) and even briefly exceed our pause limit (0.8). The only thing working in our favor in this configuration is that non-aggregation tasks should be well-balanced among the 4 workers with 4 chunks, and we have a short task graph." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "47da0213-4c7e-483e-99a3-73393559c817", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "task = t.mean()\n", 229 | "task.dask" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "d74ce6b4-88d3-495e-8b54-66d69b68a614", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "%%time\n", 240 | "result = task.compute()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "id": "eca6de5f-d0d3-4f28-914c-6f104859a6a1", 246 | "metadata": {}, 247 | "source": [ 248 | "In this next configuration, we end up specifying a configuration with very small chunks relative to the problem size. We will not come close to the memory limits, but we will incur significant overhead relative to our computational task." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "id": "552aa90c-908d-42ca-bcad-bfad2e8022ef", 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "t = da.random.random((60000, 72000), chunks = (1000,1000))\n", 259 | "t" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "57a139e9-ecf6-45d8-a3de-cf7d2561b8a6", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "task = t.mean()\n", 270 | "task.dask" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "3c72dedb-2ea7-4d8a-8c90-479a9492694c", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "%%time\n", 281 | "result = task.compute()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "id": "5001d3e2-468d-42ff-a71e-ecfdbe94385b", 287 | "metadata": {}, 288 | "source": [ 289 | "Next, we will choose chunk sizes that fall in our expected \"optimal\" range of `100 MiB - 1 GiB`. We should be allowing Dask to distribute work efficiently but not imposing a high overhead..." 
290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "id": "4bbedf51-7548-477d-bdda-b451b7be27c9", 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "t = da.random.random((60000, 72000), chunks = (10000,6000))\n", 300 | "t" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "1882a526-d8fd-4406-8bc4-4acc9b666739", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%%time\n", 311 | "result = t.mean().compute()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "id": "787282db-254c-4007-b015-63ac8e196b86", 317 | "metadata": {}, 318 | "source": [ 319 | "----\n", 320 | "\n", 321 | "## Matching chunking in a netCDF4 file\n", 322 | "\n", 323 | "If you are using a chunked data format, it is best to specify Dask chunks which equal to or (better-yet) multiples of the chunk shape on disk. If chunk sizes aren't multiples of disk chunks, you risk unnecessary additional reads of data as multiple disk chunks will need to be read to populate each Dask chunk. This can be very inefficient!\n", 324 | "\n", 325 | "#### Inspecting file chunking\n", 326 | "\n", 327 | "The exact process for checking file chunking depends on the format. 
Using the netCDF4 Python module, we can query the chunking parameters of any variable in a netCDF4 file.\n", 328 | "\n", 329 | "*Classic netCDF files do not support chunking!*" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "d0eedce9-d0df-4235-bcff-6152cfde17da", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "import netCDF4 as nc" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "id": "4e3daf33-2579-4861-85d5-b20360d4bbe5", 345 | "metadata": {}, 346 | "source": [ 347 | "We will use a data file from a model forecast dataset over the Arctic:" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "id": "16bdef3e-6c2c-4007-b061-04aa85e052e6", 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "my_file = '/glade/collections/rda/data/ds631.1/asr15.fcst3.3D/asr15km.fct.3D.20120916.nc'" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "42027834-fa87-432b-bd5c-a10b88aa8eb9", 363 | "metadata": {}, 364 | "source": [ 365 | "Once we open the *dataset* (nc4 file), we can reference a variable of interest using a dictionary key and then get the dimensions of that variable using `get_dims()`." 
366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "442f91b0-a174-467c-9095-0bf0e67639fe", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "nc_data = nc.Dataset(my_file)\n", 376 | "nc_data['CLDFRA'].get_dims()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "id": "897923a9-9ab3-4e7b-8362-9f84e0510832", 382 | "metadata": {}, 383 | "source": [ 384 | "We can then use the `chunking()` method to get our chunk size for each dimension:" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "id": "82965dea-3b73-492a-b013-1f1e363cb9d8", 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "nc_data['CLDFRA'].chunking()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "8bbed586-688c-4526-869e-f0267b6edf0b", 400 | "metadata": {}, 401 | "source": [ 402 | "### Specifying chunks using Xarray\n", 403 | "\n", 404 | "Now that we understand our file chunks, we can specify a preferred chunk size to `open_dataset`. Note that if we use the `chunks` parameter, any dimension we don't mention will be spanned in its entirety for chunks." 
405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "d67ca1a2-3102-4f85-b44c-3a001a93e5d5", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "import xarray as xr" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "bf1c6563-00f9-47ed-8e6c-1dcc5f7b0a85", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "# Open dataset using chunking along Time dimension\n", 425 | "ds = xr.open_dataset(my_file, chunks = {'Time' : 1})" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "id": "668891dd-5b6c-48bb-942a-90aee04c30a5", 431 | "metadata": {}, 432 | "source": [ 433 | "Since we are only specifying a chunk size for Time, this should be equivalent to the following chunk shape:\n", 434 | "```python\n", 435 | "chunks = {'Time' : 1,\n", 436 | " 'num_metgrid_levels' : -1,\n", 437 | " 'south_north' : -1,\n", 438 | " 'west_east' : -1 }\n", 439 | "```\n", 440 | "We can confirm that our chunks look as intended using the DataArray *repr*:" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "94a41e94-fafb-4d0e-9988-2569a1f9ef6f", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "ds.CLDFRA" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "id": "ea593bbf-aa22-41b0-97e3-e5ab68afe39f", 456 | "metadata": {}, 457 | "source": [ 458 | "**Note:** You can also retrieve the file chunk size from Xarray itself, but it is not shown in the above repr. 
Use the following DataArray (variable) attribute instead:" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "id": "f541f89c-b2f8-49c7-aa93-70ef0bf8df9f", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "ds.CLDFRA.encoding[\"chunksizes\"]" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "1fa65dda-efa9-4611-9e12-df5662d51ad8", 474 | "metadata": {}, 475 | "source": [ 476 | "Now let's benchmark various chunk configurations. Our initial guess achieves the recommended ratio of >= 2 chunks per worker, but does use multiples of the file chunk size except in the time dimension.\n", 477 | "\n", 478 | "For this benchmark, we will find the maximum cloud fraction across vertical levels at all locations and times." 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "48125db1-00ac-4600-a786-7e6e6cb098b7", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "%%time\n", 489 | "result = ds.CLDFRA.max(dim = \"num_metgrid_levels\").compute()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "id": "45b68e87-834c-42c0-ad63-0c1f199f8fc9", 495 | "metadata": {}, 496 | "source": [ 497 | "Notice above that this file has chunking that does not divide evenly into the dimension sizes. We can specify that our chunks match the file chunks directly, but this will leave \"remainder\" chunks and will slightly increase overhead." 
498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "id": "e975b1ff-6c30-4270-ae2b-b945d96c02a0", 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "ds = xr.open_dataset(my_file, chunks = {'Time' : 1, \"num_metgrid_levels\" : 16,\n", 508 | " \"south_north\" : 355, \"west_east\" : 355})\n", 509 | "ds.CLDFRA" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "id": "b9257791-d82e-4564-bfe4-909e9253af3e", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "%%time\n", 520 | "result = ds.CLDFRA.max(dim = \"num_metgrid_levels\").compute()" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "id": "f26a6004-aea9-4f99-b094-b0d8affa18fa", 526 | "metadata": {}, 527 | "source": [ 528 | "The most problematic case occurs when we have chunk sizes that are smaller than the file chunks in one or more dimensions. Let's evaluate the impact by using progressively smaller vertical level ranks:" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "id": "3ab203f6-20a9-4183-b95c-2e119b40f217", 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "# Using half the file chunk size in the vertical (same number of chunks)\n", 539 | "ds = xr.open_dataset(my_file, chunks = {\"Time\" : 4, \"num_metgrid_levels\" : 8})" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "id": "3af94486-17c3-43a8-8bbf-e82cd99cd8c0", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "%%time\n", 550 | "result = ds.CLDFRA.max(dim = \"num_metgrid_levels\").compute()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "id": "ea3d0289-a14d-490d-a05e-5b43b3e80091", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# Use 1/4 the chunk size in the vertical\n", 561 | "ds = xr.open_dataset(my_file, chunks = {\"Time\" : 8, \"num_metgrid_levels\" : 
4})" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "id": "01f33087-5468-48a0-a3c0-9dfbced66f49", 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "%%time\n", 572 | "result = ds.CLDFRA.max(dim = \"num_metgrid_levels\").compute()" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "id": "0b4f9ecb-0608-4c79-b598-4d83b9509f0b", 578 | "metadata": {}, 579 | "source": [ 580 | "It is also possible to use \"auto\" chunking, whereby the DataArray chunks are calculated for you. Are these optimal?" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "id": "82a204b9-95db-4516-a16e-56f30f3d98a6", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# Open dataset using auto-chunking\n", 591 | "ds = xr.open_dataset(my_file, chunks = 'auto')" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "id": "0857d634-18cc-4f1f-98fe-b18007dd9394", 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "ds.CLDFRA" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "id": "04ad72d7-9b25-4516-8cbb-9c5068a881cc", 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "%%time\n", 612 | "result = ds.CLDFRA.max(dim = \"num_metgrid_levels\").compute()" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "id": "88e15589-c483-440d-ad96-f7fa98920515", 618 | "metadata": {}, 619 | "source": [ 620 | "**No! 
Avoid using auto chunking for files written in chunks!**" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "id": "a23624a5-2174-4d73-aace-024646918580", 626 | "metadata": {}, 627 | "source": [ 628 | "## Rechunking is expensive" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "id": "02fafba9-b289-436d-aa72-3a66553e9539", 634 | "metadata": {}, 635 | "source": [ 636 | "There are various reasons Dask might need to rechunk data, but in any case, it can be an expensive operation with a large amount of communication required between workers.\n", 637 | "\n", 638 | "**Scenario:** We wish to get the mean difference between two versions of a model for the same case study. Unfortunately, while the grids match for each version, the file chunk size used was different.\n", 639 | "\n", 640 | "Here, we will emulate the scenario with Dask Arrays..." 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "id": "c6f62d6e-2c35-4298-a546-e570a6736f28", 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "old_run = da.random.random((800,600,60,20), chunks = (400,300,30,1))" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "id": "ee642dae-b1c1-4aa2-873c-ab207be5fb5c", 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "old_run" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "id": "682d897a-4158-49ae-8960-7eac3b67c685", 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "new_run = da.random.random((800,600,60,20), chunks = (800,600,10,1))" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "id": "2e045450-a2a8-49b3-9ada-e9af425ebae2", 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "new_run" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "id": "00776a1b-0bf2-49fd-bfac-6b0e2d7a33a3", 686 | "metadata": {}, 687 | "source": [ 688 | 
"Let's set up and analyse (via a high-level task graph), the operations we will need to do to retrieve a mean-squared difference/error between our two datasets." 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "id": "7f773a71-37db-4023-ac13-81e7d19dba5e", 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "# Calculate the mean squared difference\n", 699 | "mse_graph = ((old_run - new_run) ** 2).sum() / old_run.size" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "b005b366-dc3b-431b-b381-7dc26c79fee6", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "mse_graph.dask" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "id": "9d799359-f2cb-40eb-ba06-d66eaef6b722", 715 | "metadata": {}, 716 | "source": [ 717 | "Note the two rechunking operations near the beginning of our task graph. Because our data arrays are chunked differently, Dask must rechunk first to avoid slowing down operations with large data transfers between workers. It is good that Dask does this, but rechunking is still expensive..." 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "id": "71844b87-19f7-4b9a-8ae3-54c8fafbea8d", 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "%%time\n", 728 | "mse_graph.compute()" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "id": "0e3e618a-6772-4f7f-9498-a630dbbe6f5d", 734 | "metadata": {}, 735 | "source": [ 736 | "In most circumstances, we will want to rechunk this data ourselves manually, and then save state (probably by creating a new rechunked data file). 
This one-time cost means we will not need to rechunk again in the future.\n", 737 | "\n", 738 | "In our scenario, we would likely rechunk the old run data, since we expect all future runs will have the new chunking.\n", 739 | "\n", 740 | "```python\n", 741 | "old_run_rechunked = old_run.rechunk((800,600,10,1))\n", 742 | "```\n", 743 | "\n", 744 | "Once this is done in a conversion workflow, we could load the rechunked data in our current workflow." 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "id": "868a4114-6c60-4078-b6ea-45345702805d", 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "old_run = da.random.random((800,600,60,20), chunks = (800,600,10,1))" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "id": "3dc6ff9e-0199-4bdf-90f9-90e3de1ea516", 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "# Calculate the mean squared difference\n", 765 | "mse_graph = ((old_run - new_run) ** 2).sum() / old_run.size" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "id": "fc2cbe54-fa8b-4850-893d-be2874adf7ae", 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "mse_graph.dask" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "id": "a86b1e64-3140-44f4-ade1-cd636216640e", 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "%%time\n", 786 | "mse_graph.compute()" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "id": "be018325-ef95-424a-9bf2-24d33a26c640", 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "client.shutdown()" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "id": "024686ad-6bfa-48ba-bd85-d005f9477902", 802 | "metadata": {}, 803 | "source": [ 804 | "## Takeaway Message\n", 805 | "\n", 806 | "Chunking is fundamental to Dask and its blocked-algorithm approach, so don't 
ignore intelligently sizing your data chunks. Finding the perfect chunk size is not the goal, but neglecting simple rules of thumb can lead to massive performance penalties when aggregated over a complex multipart analysis." 807 | ] 808 | } 809 | ], 810 | "metadata": { 811 | "kernelspec": { 812 | "display_name": "NPL 2023a", 813 | "language": "python", 814 | "name": "npl-2023a" 815 | }, 816 | "language_info": { 817 | "codemirror_mode": { 818 | "name": "ipython", 819 | "version": 3 820 | }, 821 | "file_extension": ".py", 822 | "mimetype": "text/x-python", 823 | "name": "python", 824 | "nbconvert_exporter": "python", 825 | "pygments_lexer": "ipython3", 826 | "version": "3.9.15" 827 | }, 828 | "widgets": { 829 | "application/vnd.jupyter.widget-state+json": { 830 | "state": {}, 831 | "version_major": 2, 832 | "version_minor": 0 833 | } 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 5 838 | } 839 | -------------------------------------------------------------------------------- /notebooks/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | urls=("https://docs.google.com/uc?export=download&id=14doSRn8hT14QYtjZz28GKv14JgdIsbFF" "https://docs.google.com/uc?export=download&id=15rCwQUxxpH6angDhpXzlvbe1nGetYHrf") 4 | 5 | for url in "${urls[@]}"; do 6 | filename="$(basename $url)" 7 | 8 | # Download the tar file 9 | wget $url -O ../data.tar.gz 10 | 11 | # Unzip the tar file 12 | tar -xvzf ../data.tar.gz -C ../data 13 | 14 | # Clean up by removing the tar file 15 | rm ../data.tar.gz 16 | done --------------------------------------------------------------------------------