├── jupyterlab ├── dask-extension.png ├── run.sh ├── environment.yaml ├── create-notebook.py ├── workspace.json └── jupyterlab.ipynb ├── dask-sql ├── run.sh ├── cluster-env.yaml ├── notebook-env.yaml ├── create-notebook.py ├── workspace.json └── dask-sql.ipynb ├── hyperband ├── run.sh ├── environment.yaml ├── torch_model.py ├── create-notebook.py ├── workspace.json └── hyperband-optimization.ipynb ├── quickstart ├── run.sh ├── environment.yaml ├── create-notebook.py ├── workspace.json └── quickstart.ipynb ├── optuna-xgboost ├── run.sh ├── environment.yaml ├── create-notebook.py ├── workspace.json └── optuna-xgboost.ipynb ├── scaling-xgboost ├── run.sh ├── environment.yaml ├── create-notebook.py ├── workspace.json └── scaling-xgboost.ipynb ├── README.md ├── .github └── workflows │ ├── ci-build.yml │ └── ci-bump-version.yml ├── LICENSE └── .gitignore /jupyterlab/dask-extension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coiled/notebooks/main/jupyterlab/dask-extension.png -------------------------------------------------------------------------------- /dask-sql/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /hyperband/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /jupyterlab/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /quickstart/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /optuna-xgboost/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /scaling-xgboost/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /dask-sql/cluster-env.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - dask=2021.10.0 5 | - s3fs>=2021.8.0 6 | - pandas>=1.3.0 7 | -------------------------------------------------------------------------------- /quickstart/environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - coiled=0.0.54 5 | - dask=2021.10.0 6 | - pandas>=1.3.0 7 | -------------------------------------------------------------------------------- /jupyterlab/environment.yaml: 
-------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - traitlets=5.0.4 5 | - coiled=0.0.54 6 | - dask=2021.10.0 7 | - pandas>=1.3.0 8 | -------------------------------------------------------------------------------- /dask-sql/notebook-env.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - coiled=0.0.54 5 | - dask=2021.10.0 6 | - dask-sql>=0.3.4 7 | - matplotlib 8 | - s3fs>=2021.8.0 9 | - pandas>=1.3.0 -------------------------------------------------------------------------------- /optuna-xgboost/environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - dask=2021.10.0 5 | - coiled=0.0.54 6 | - optuna=2.3.0 7 | - numpy 8 | - scikit-learn 9 | - xgboost 10 | - joblib 11 | - pandas>=1.3.0 12 | - pip: 13 | - dask-optuna 14 | -------------------------------------------------------------------------------- /hyperband/environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - dask=2021.10.0 7 | - coiled=0.0.54 8 | - numpy 9 | - pandas>=1.3.0 10 | - skorch 11 | - scipy 12 | - matplotlib 13 | - pytorch=1.8.1 14 | - s3fs>=2021.8.0 15 | - dask-ml -------------------------------------------------------------------------------- /scaling-xgboost/environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - dask=2021.10.0 5 | - coiled=0.0.54 6 | - pandas>=1.3.0 7 | - python=3.9 8 | - python-snappy 9 | - xgboost 10 | - dask-ml 11 | - dask-xgboost 12 | - scikit-learn 13 | - s3fs>=2021.8.0 14 | - matplotlib -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coiled Notebooks 2 | 3 | [![Build notebooks](https://github.com/coiled/notebooks/workflows/Build%20notebooks/badge.svg)](https://github.com/coiled/notebooks/actions?query=workflow%3A%22Build+notebooks%22+branch%3Amain) 4 | 5 | Example notebooks maintained by Coiled. You can launch these notebooks at https://cloud.coiled.io/examples/notebooks. 
6 | -------------------------------------------------------------------------------- /hyperband/torch_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class HiddenLayerNet(nn.Module): 7 | def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation="relu"): 8 | super().__init__() 9 | self.fc1 = nn.Linear(n_features, n_hidden) 10 | self.fc2 = nn.Linear(n_hidden, n_outputs) 11 | self.activation = getattr(F, activation) 12 | 13 | def forward(self, x, **kwargs): 14 | return self.fc2(self.activation(self.fc1(x))) -------------------------------------------------------------------------------- /quickstart/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | software_name = "examples/quickstart-notebook" 4 | coiled.create_software_environment( 5 | name=software_name, 6 | container="coiled/notebook:latest", 7 | conda="environment.yaml" 8 | ) 9 | 10 | coiled.create_job_configuration( 11 | name="examples/quickstart", 12 | software=software_name, 13 | command=[ 14 | "/bin/bash", 15 | "run.sh", 16 | ], 17 | files=["quickstart.ipynb", "workspace.json", "run.sh"], 18 | ports=[8888], 19 | description="Quickly launch a Dask cluster on the cloud with Coiled", 20 | ) 21 | -------------------------------------------------------------------------------- /jupyterlab/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | software_name = "examples/jupyterlab-notebook" 4 | coiled.create_software_environment( 5 | name=software_name, 6 | container="coiled/notebook:latest", 7 | conda="environment.yaml", 8 | ) 9 | 10 | coiled.create_job_configuration( 11 | name="examples/jupyterlab", 12 | software=software_name, 13 | command=[ 14 | "conda", 15 | "run", 16 | "-n", 17 | "coiled", 18 | "--no-capture-output", 19 | "/bin/bash", 20 | "run.sh", 21 | ], 22 | files=["jupyterlab.ipynb", "workspace.json", "run.sh", "dask-extension.png"], 23 | ports=[8888], 24 | description="See how Coiled integrates with JupyterLab", 25 | ) 26 | -------------------------------------------------------------------------------- /optuna-xgboost/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | # Create cluster software environment 4 | software_name = "examples/optuna-xgboost" 5 | coiled.create_software_environment( 6 | name=software_name, 7 | conda="environment.yaml", 8 | ) 9 | 10 | # Create notebook job software environment 11 | software_notebook_name = software_name + "-notebook" 12 | coiled.create_software_environment( 13 | name=software_notebook_name, 14 | container="coiled/notebook:latest", 15 | conda="environment.yaml", 16 | ) 17 | 18 | coiled.create_job_configuration( 19 | name="examples/optuna", 20 | software=software_notebook_name, 21 | command=[ 22 | "/bin/bash", 23 | "run.sh", 24 | ], 25 | files=["optuna-xgboost.ipynb", "workspace.json", "run.sh"], 26 | ports=[8888], 27 | description="Hyperparameter optimization with Optuna", 28 | ) 29 | -------------------------------------------------------------------------------- /scaling-xgboost/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | # Create cluster software environment 4 | software_name = "examples/scaling-xgboost" 5 | 
coiled.create_software_environment( 6 | name=software_name, 7 | conda="environment.yaml", 8 | ) 9 | 10 | # Create notebook job software environment 11 | software_notebook_name = software_name + "-notebook" 12 | coiled.create_software_environment( 13 | name=software_notebook_name, 14 | container="coiled/notebook:latest", 15 | conda="environment.yaml", 16 | ) 17 | 18 | coiled.create_job_configuration( 19 | name="examples/scaling-xgboost", 20 | software=software_notebook_name, 21 | command=[ 22 | "/bin/bash", 23 | "run.sh", 24 | ], 25 | files=["scaling-xgboost.ipynb", "workspace.json", "run.sh"], 26 | ports=[8888], 27 | description="Perform distributed training of an XGBoost classifier", 28 | ) 29 | -------------------------------------------------------------------------------- /dask-sql/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | # Create cluster software environment 4 | software_name = "examples/dask-sql" 5 | coiled.create_software_environment( 6 | name=software_name, 7 | conda="cluster-env.yaml", 8 | ) 9 | 10 | # Create notebook job software environment 11 | software_notebook_name = software_name + "-notebook" 12 | # Add Dask-SQL and matplotlib to notebook software environment 13 | coiled.create_software_environment( 14 | name=software_notebook_name, 15 | container="coiled/notebook:latest", 16 | conda="notebook-env.yaml", 17 | ) 18 | 19 | coiled.create_job_configuration( 20 | name="examples/dask-sql", 21 | software=software_notebook_name, 22 | command=[ 23 | "/bin/bash", 24 | "run.sh", 25 | ], 26 | files=["dask-sql.ipynb", "workspace.json", "run.sh"], 27 | ports=[8888], 28 | description="Query and transform Dask DataFrames using SQL", 29 | ) 30 | -------------------------------------------------------------------------------- /hyperband/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | # Create cluster software environment 4 | software_name = "examples/hyperband-optimization" 5 | coiled.create_software_environment( 6 | name=software_name, 7 | conda="environment.yaml", 8 | ) 9 | 10 | # Create notebook job software environment 11 | software_notebook_name = software_name + "-notebook" 12 | coiled.create_software_environment( 13 | name=software_notebook_name, 14 | container="coiled/notebook:latest", 15 | conda="environment.yaml", 16 | ) 17 | 18 | coiled.create_job_configuration( 19 | name="examples/hyperband-optimization", 20 | software=software_notebook_name, 21 | command=[ 22 | "/bin/bash", 23 | "run.sh", 24 | ], 25 | files=[ 26 | "hyperband-optimization.ipynb", 27 | "torch_model.py", 28 | "workspace.json", 29 | "run.sh", 30 | ], 31 | ports=[8888], 32 | description="Tune a PyTorch model with Hyperband cross-validation", 33 | ) 34 | -------------------------------------------------------------------------------- /dask-sql/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:dask-sql.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:dask-sql.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | 
"extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:dask-sql.ipynb": { 35 | "data": { 36 | "path": "dask-sql.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /quickstart/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:quickstart.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:quickstart.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:quickstart.ipynb": { 35 | "data": { 36 | "path": "quickstart.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /optuna-xgboost/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:optuna-xgboost.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:optuna-xgboost.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:optuna-xgboost.ipynb": { 35 | "data": { 36 | "path": "optuna-xgboost.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /scaling-xgboost/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:scaling-xgboost.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:scaling-xgboost.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:scaling-xgboost.ipynb": { 35 | "data": { 36 | "path": "scaling-xgboost.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /hyperband/workspace.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:hyperband-optimization.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:hyperband-optimization.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:hyperband-optimization.ipynb": { 35 | "data": { 36 | "path": "hyperband-optimization.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /.github/workflows/ci-build.yml: -------------------------------------------------------------------------------- 1 | name: Build notebooks 2 | 3 | on: 4 | push: 5 | branches: main 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | max-parallel: 4 13 | matrix: 14 | example: 15 | [ 16 | "quickstart", 17 | "scaling-xgboost", 18 | "optuna-xgboost", 19 | "jupyterlab", 20 | "hyperband", 21 | "dask-sql", 22 | ] 23 | server: ["https://cloud.coiled.io", "https://staging.coiledhq.com", "https://development.coiledhq.com"] 24 | include: 25 | - server: "https://cloud.coiled.io" 26 | token-name: CLOUD_TOKEN 27 | 28 | - server: "https://staging.coiledhq.com" 29 | token-name: SANDBOX_TOKEN 30 | 31 | - server: "https://development.coiledhq.com" 32 | token-name: DEV_TOKEN 33 | env: 34 | DASK_COILED__SERVER: ${{ matrix.server }} 35 | DASK_COILED__TOKEN: ${{ secrets[matrix.token-name] }} 36 | 37 | steps: 38 | - name: Checkout source 39 | uses: actions/checkout@v2 40 | 41 | - name: Set up Python 42 | uses: actions/setup-python@v1 43 | 44 | - name: Install Coiled 45 | run: python -m pip install coiled 46 | 47 | - name: Build ${{ matrix.example }} notebook 48 | run: python create-notebook.py 49 | working-directory: ${{ matrix.example }} 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Coiled 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dask-worker-space/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /.github/workflows/ci-bump-version.yml: -------------------------------------------------------------------------------- 1 | name: Check for new Coiled and dask versions on conda-forge 2 | 3 | on: 4 | schedule: 5 | - cron: "36 * * * *" 6 | 7 | jobs: 8 | check-version: 9 | runs-on: ubuntu-latest 10 | if: github.repository == 'coiled/notebooks' 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Get latest Coiled version 16 | id: latest_version_coiled 17 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3 18 | with: 19 | org: "conda-forge" 20 | package: "coiled" 21 | 22 | - name: Find and replace Coiled version 23 | id: find_and_replace_coiled 24 | uses: jacobtomlinson/gha-find-replace@0.1.1 25 | with: 26 | find: "coiled=[.0-9]+" 27 | replace: "coiled=${{ steps.latest_version_coiled.outputs.version }}" 28 | include: .+\.ya?ml|.+\.ipynb 29 | exclude: ^\.git.* 30 | 31 | - name: Get latest dask version 32 | id: latest_version_dask 33 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.1 34 | with: 35 | org: "conda-forge" 36 | package: "dask" 37 | 38 | - name: Find and replace dask version 39 | id: find_and_replace_dask 40 | uses: jacobtomlinson/gha-find-replace@0.1.1 41 | with: 42 | find: "dask=[.0-9]+" 43 | replace: "dask=${{ steps.latest_version_dask.outputs.version }}" 44 | include: .+\.ya?ml|.+\.ipynb 45 | exclude: ^\.git.* 46 | 47 | 48 | - name: Output changed files 49 | run: | 50 | echo ${{ steps.find_and_replace_coiled.outputs.modifiedFiles }} || echo ${{ steps.find_and_replace_dask.outputs.modifiedFiles }} 51 | 52 | - name: Create Pull Request 53 | uses: peter-evans/create-pull-request@v3 54 | with: 55 | author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 56 | token: ${{ secrets.GITHUB_TOKEN }} 57 | commit-message: "Update Coiled version to ${{ steps.latest_version_coiled.outputs.version }}, update dask version to ${{ steps.latest_version_dask.outputs.version }}" 58 | title: "Update Coiled version to ${{ steps.latest_version_coiled.outputs.version }}, update dask version to ${{ steps.latest_version_dask.outputs.version }}" 59 | reviewers: "jrbourbeau, ian-r-rose" 60 | branch: "upgrade-package-versions" 61 | body: | 62 | A new version of coiled or dask has been detected. 63 | 64 | Coiled is now `${{ steps.latest_version_coiled.outputs.version }}`. 65 | Dask is now `${{ steps.latest_version_dask.outputs.version }}`. 
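Aside (not part of the repository): the two find-and-replace steps above amount to a regex substitution over the YAML and notebook files in the repo. A rough local equivalent is sketched below — the target versions in `new_versions` are placeholders, not real releases.

```python
# Sketch of what the workflow's find-and-replace steps do, run locally.
# The versions in `new_versions` are placeholders, not actual releases.
import re
from pathlib import Path

new_versions = {"coiled": "0.0.55", "dask": "2021.11.1"}

for path in Path(".").rglob("*"):
    # Mirror the workflow's include/exclude rules: YAML and notebook files, skip .git
    if not path.is_file() or path.suffix not in {".yaml", ".yml", ".ipynb"}:
        continue
    if ".git" in path.parts:
        continue
    text = path.read_text()
    for package, version in new_versions.items():
        text = re.sub(rf"{package}=[.0-9]+", f"{package}={version}", text)
    path.write_text(text)
```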
66 | -------------------------------------------------------------------------------- /jupyterlab/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "split-area", 10 | "orientation": "horizontal", 11 | "sizes": [ 12 | 0.5, 13 | 0.5 14 | ], 15 | "children": [ 16 | { 17 | "type": "tab-area", 18 | "currentIndex": 0, 19 | "widgets": [ 20 | "notebook:jupyterlab.ipynb" 21 | ] 22 | }, 23 | { 24 | "type": "split-area", 25 | "orientation": "vertical", 26 | "sizes": [ 27 | 0.3135581893384421, 28 | 0.33692004590199504, 29 | 0.34952176475956287 30 | ], 31 | "children": [ 32 | { 33 | "type": "tab-area", 34 | "currentIndex": 0, 35 | "widgets": [ 36 | "dask-dashboard-launcher:individual-task-stream" 37 | ] 38 | }, 39 | { 40 | "type": "tab-area", 41 | "currentIndex": 0, 42 | "widgets": [ 43 | "dask-dashboard-launcher:individual-progress" 44 | ] 45 | }, 46 | { 47 | "type": "tab-area", 48 | "currentIndex": 0, 49 | "widgets": [ 50 | "dask-dashboard-launcher:individual-workers" 51 | ] 52 | } 53 | ] 54 | } 55 | ] 56 | }, 57 | "mode": "multiple-document", 58 | "current": "notebook:jupyterlab.ipynb" 59 | }, 60 | "left": { 61 | "collapsed": true, 62 | "widgets": [ 63 | "filebrowser", 64 | "running-sessions", 65 | "dask-dashboard-launcher", 66 | "command-palette", 67 | "jp-property-inspector", 68 | "tab-manager", 69 | "extensionmanager.main-view" 70 | ] 71 | }, 72 | "right": { 73 | "collapsed": true, 74 | "widgets": [] 75 | } 76 | }, 77 | "notebook:jupyterlab.ipynb": { 78 | "data": { 79 | "path": "jupyterlab.ipynb", 80 | "factory": "Notebook" 81 | } 82 | }, 83 | "dask-dashboard-launcher:individual-task-stream": { 84 | "data": { 85 | "route": "individual-task-stream", 86 | "label": "Task Stream" 87 | } 88 | }, 89 | "dask-dashboard-launcher:individual-progress": { 90 | "data": { 91 | "route": "individual-progress", 92 | "label": "Progress" 93 | } 94 | }, 95 | "dask-dashboard-launcher:individual-workers": { 96 | "data": { 97 | "route": "individual-workers", 98 | "label": "Workers" 99 | } 100 | } 101 | }, 102 | "metadata": { 103 | "id": "/lab" 104 | } 105 | } -------------------------------------------------------------------------------- /jupyterlab/jupyterlab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Coiled + JupyterLab\n", 8 | "\n", 9 | "Coiled integrates well with [JupyterLab](https://jupyterlab.readthedocs.io/en/latest/) and its extension ecosystem. 
In particular, this notebook highlights the [Dask](https://github.com/dask/dask-labextension) and [Ipywidgets](https://ipywidgets.readthedocs.io/en/latest/index.html) JupyterLab extensions.\n", 10 | "\n", 11 | "## Cluster widget\n", 12 | "\n", 13 | "``coiled.Cluster`` uses Ipywidget's JupyterLab extension to display an interactive widget when outputted in a JupyterLab cell " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import coiled\n", 23 | "\n", 24 | "cluster = coiled.Cluster(n_workers=10)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "cluster" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "You can use the cluster widget by, for example, clicking the \"Manual Scaling\" dropdown and adjusting the number of workers in your Coiled cluster.\n", 41 | "\n", 42 | "## Dask Jupyterlab extension\n", 43 | "\n", 44 | "Dask's JupyterLab extension allows you to embed [Dask dashboard plots](https://docs.dask.org/en/latest/diagnostics-distributed.html) directly into a JupyterLab session. This let's you view diagnostic plots right alongside the code you're running.\n", 45 | "\n", 46 | "To use Dask's JupyterLab extension, we need to first connect Dask to our cluster by creating a ``distributed.Client``:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from dask.distributed import Client\n", 56 | "\n", 57 | "client = Client(cluster)\n", 58 | "client.wait_for_workers(10)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Open the Dask JupyterLab extension by clicking the Dask logo in the JupyterLab left sidebar and then click the magnifying glass icon in the upper right-hand corner to connect the extension to your cluster.\n", 66 | "\n", 67 | "
<center>\n", 68 | "  <img src=\"dask-extension.png\" alt=\"Dask JupyterLab extension\">\n", 69 | "</center>
" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "That's it! Now when we perform computations we get interactive, realtime views of what's happening on the cluster:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import dask.dataframe as dd\n", 86 | "\n", 87 | "df = dd.read_parquet(\n", 88 | " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n", 89 | " columns=[\"passenger_count\", \"tip_amount\"],\n", 90 | " storage_options={\"anon\": True},\n", 91 | ").persist()\n", 92 | "\n", 93 | "df.groupby(\"passenger_count\").tip_amount.mean().compute()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Each organge button like \"Task stream\" and \"Progress\" correspond to a different diagnostic plot to view. Try clicking one of the buttons and then arranging the plot wherever you'd like in this JupyterLab session." 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.8.5" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /optuna-xgboost/optuna-xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hyperparameter optimization with Optuna and Dask\n", 8 | "\n", 9 | "[Optuna](https://optuna.org/) is a popular Python library for hyperparameter optimization. This example walks through a workload using Optuna to optimize an [XGBoost](https://xgboost.readthedocs.io/en/latest/) classification model. and then how to scale the same workload using Dask and Coiled.\n", 10 | "\n", 11 | "## Optuna in a nutshell\n", 12 | "\n", 13 | "Optuna has three primary concepts:\n", 14 | "\n", 15 | "- Objective function: This is some function that depends on the hyperparameters in your model that you would like to optimize. For example, it’s common to maximum a classification model’s prediction accuracy (i.e. the objective function would be the accuracy score).\n", 16 | "\n", 17 | "- Optimization trial: A trial is a single evaluation of the objective function with a given set of hyperparameters.\n", 18 | "\n", 19 | "- Optimization study: A study is a collection of optimization trials where each trial uses hyperparameters sampled from a set of allowed values.\n", 20 | "\n", 21 | "The set of hyperparameters for the trial which gives the optimal value for the objective function are chosen as the best set of hyperparameters.\n", 22 | "\n", 23 | "\n", 24 | "## Scaling Optuna with Dask\n", 25 | "\n", 26 | "Below is a snippet which uses Optuna to optimize several hyperparameters for an XGBoost classifier trained on the [breast cancer dataset](https://scikit-learn.org/stable/datasets/index.html#breast-cancer-wisconsin-diagnostic-dataset). 
We also use [Dask-Optuna](https://jrbourbeau.github.io/dask-optuna/) and [Joblib](https://joblib.readthedocs.io/en/latest/) to run Optuna trials in parallel on a Coiled cluster." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import numpy as np\n", 36 | "import sklearn.datasets\n", 37 | "import sklearn.metrics\n", 38 | "from sklearn.model_selection import train_test_split\n", 39 | "import xgboost as xgb\n", 40 | "\n", 41 | "def objective(trial):\n", 42 | " # Load our dataset\n", 43 | " X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", 44 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n", 45 | " dtrain = xgb.DMatrix(X_train, label=y_train)\n", 46 | " dtest = xgb.DMatrix(X_test, label=y_test)\n", 47 | "\n", 48 | " # Get set of hyperparameters\n", 49 | " param = {\n", 50 | " \"silent\": 1,\n", 51 | " \"objective\": \"binary:logistic\",\n", 52 | " \"booster\": trial.suggest_categorical(\"booster\", [\"gbtree\", \"dart\"]),\n", 53 | " \"lambda\": trial.suggest_float(\"lambda\", 1e-8, 1.0, log=True),\n", 54 | " \"alpha\": trial.suggest_float(\"alpha\", 1e-8, 1.0, log=True),\n", 55 | " \"max_depth\": trial.suggest_int(\"max_depth\", 1, 9),\n", 56 | " \"eta\": trial.suggest_float(\"eta\", 1e-8, 1.0, log=True),\n", 57 | " \"gamma\": trial.suggest_float(\"gamma\", 1e-8, 1.0, log=True),\n", 58 | " \"grow_policy\": trial.suggest_categorical(\"grow_policy\", [\"depthwise\", \"lossguide\"]),\n", 59 | " }\n", 60 | "\n", 61 | " # Train XGBoost model\n", 62 | " bst = xgb.train(param, dtrain)\n", 63 | " preds = bst.predict(dtest)\n", 64 | "\n", 65 | " # Compute and return model accuracy\n", 66 | " pred_labels = np.rint(preds)\n", 67 | " accuracy = sklearn.metrics.accuracy_score(y_test, pred_labels)\n", 68 | " return accuracy" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Connect to Coiled" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import coiled\n", 85 | "from dask.distributed import Client\n", 86 | "\n", 87 | "cluster = coiled.Cluster(\n", 88 | " n_workers=10, \n", 89 | " software=\"examples/optuna-xgboost\"\n", 90 | ")\n", 91 | "client = Client(cluster)\n", 92 | "client.wait_for_workers(10)\n", 93 | "\n", 94 | "client" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Train with Optuna" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import optuna\n", 111 | "import dask_optuna\n", 112 | "import joblib\n", 113 | "\n", 114 | "# Create Dask-compatible Optuna storage class\n", 115 | "storage = dask_optuna.DaskStorage()\n", 116 | "\n", 117 | "# Run 500 optimizations trial on our cluster\n", 118 | "study = optuna.create_study(direction=\"maximize\", storage=storage)\n", 119 | "with joblib.parallel_backend(\"dask\"):\n", 120 | " study.optimize(objective, n_trials=500, n_jobs=-1)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "study.best_params" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "And with that, you’re able to run distributed hyperparameter optimizations using Optuna, Dask, and Coiled!" 
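Aside (not part of the notebook source above): once the study finishes, a natural follow-up is to retrain a single model with the best hyperparameters and confirm its accuracy on a fresh split. A minimal sketch, assuming the `study` object from the cells above:

```python
# Retrain one XGBoost model with the study's best hyperparameters (sketch).
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Combine the fixed objective with the tuned hyperparameters
params = {"objective": "binary:logistic", **study.best_params}
bst = xgb.train(params, xgb.DMatrix(X_train, label=y_train))
pred_labels = np.rint(bst.predict(xgb.DMatrix(X_test, label=y_test)))
print(sklearn.metrics.accuracy_score(y_test, pred_labels))
```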
137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.8.5" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /dask-sql/dask-sql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Analyzing data with Dask, SQL, and Coiled\n", 8 | "\n", 9 | "In this notebook, we look at using [Dask-SQL](https://dask-sql.readthedocs.io/en/latest/), an exciting new open-source library which adds a SQL query layer on top of Dask. This allows you to query and transform Dask DataFrames using common SQL operations.\n", 10 | "\n", 11 | "## Launch a cluster\n", 12 | "\n", 13 | "Let's first start by creating a Coiled cluster which uses the `examples/dask-sql` software environment, which has `dask`, `pandas`, `s3fs`, and a few other libraries installed." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import coiled\n", 23 | "\n", 24 | "cluster = coiled.Cluster(\n", 25 | " n_workers=10,\n", 26 | " worker_cpu=4,\n", 27 | " worker_memory=\"30GiB\",\n", 28 | " software=\"examples/dask-sql\",\n", 29 | ")\n", 30 | "cluster" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "and then connect Dask to our remote Coiled cluster" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from dask.distributed import Client\n", 47 | "\n", 48 | "client = Client(cluster)\n", 49 | "client.wait_for_workers(10)\n", 50 | "client" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Getting started with Dask-SQL\n", 58 | "\n", 59 | "Internally, Dask-SQL uses a well-established Java library, Apache Calcite, to parse SQL and perform some initial work on your query. To help Dask-SQL locate JVM shared libraries, we set the `JAVA_HOME` environment variable. " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import os\n", 69 | "\n", 70 | "os.environ[\"JAVA_HOME\"] = os.environ[\"CONDA_DIR\"]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The main interface for interacting with Dask-SQL is the `dask_sql.Context` object. It allows your to register Dask DataFrames as data sources and can convert SQL queries to Dask DataFrame operations." 
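Aside (not part of the notebook source): the same `Context` workflow applies to any Dask DataFrame, not just the taxi data used below. A minimal sketch with a toy DataFrame — the `ctx` and `toy` names are illustrative only:

```python
# Toy example (sketch): register a small Dask DataFrame and query it with SQL.
import pandas as pd
import dask.dataframe as dd
from dask_sql import Context

ctx = Context()
toy = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3], "y": [10.0, 20.0, 30.0]}), npartitions=1)
ctx.register_dask_table(toy, "toy")   # same call used for the taxi table below
print(ctx.sql("SELECT sum(y) FROM toy").compute())
```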
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from dask_sql import Context\n", 87 | "\n", 88 | "c = Context()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "For this notebook, we'll use the NYC taxi dataset, which is publically accessible on AWS S3, as our data source" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import dask.dataframe as dd\n", 105 | "from distributed import wait\n", 106 | "\n", 107 | "df = dd.read_parquet(\n", 108 | " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n", 109 | " dtype={\n", 110 | " \"payment_type\": \"UInt8\",\n", 111 | " \"VendorID\": \"UInt8\",\n", 112 | " \"passenger_count\": \"UInt8\",\n", 113 | " \"RatecodeID\": \"UInt8\",\n", 114 | " },\n", 115 | " storage_options={\"anon\": True}\n", 116 | ")\n", 117 | "\n", 118 | "# Load datasest into the cluster's distributed memory.\n", 119 | "# This isn't strictly necessary, but does allow us to\n", 120 | "# avoid repeated running the same I/O operations. \n", 121 | "df = df.persist()\n", 122 | "wait(df);" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "We can then use our `dask_sql.Context` to assign a table name to this DataFrame, and then use that table name within SQL queries" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Registers our Dask DataFrame df as a table with the name \"taxi\"\n", 139 | "c.register_dask_table(df, \"taxi\")\n", 140 | "\n", 141 | "# Perform a SQL operation on the \"taxi\" table\n", 142 | "result = c.sql(\"SELECT count(1) FROM taxi\")\n", 143 | "result" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "Note that this returned another Dask DataFrame and no computation has been run yet. This is similar to other Dask DataFrame operations, which are lazily evaluated. We can call `.compute()` to run the computation on our cluster." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "result.compute()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "Hooray, we've run our first SQL query with Dask-SQL! Let's try out some more complex queries.\n", 167 | "\n", 168 | "## More complex SQL examples\n", 169 | "\n", 170 | "With Dask-SQL we can run more complex SQL statements like, for example, a groupby-aggregation:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "c.sql('SELECT avg(tip_amount) FROM taxi GROUP BY passenger_count').compute()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "NOTE: that the equivalent operatation using the Dask DataFrame API would be:\n", 187 | "\n", 188 | "```python\n", 189 | "df.groupby(\"passenger_count\").tip_amount.mean().compute()\n", 190 | "```" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "We can even make plots of our SQL query results for near-real-time interactive data exploration and visualization." 
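Aside (not in the original notebook): for comparison, the fare-vs-distance query in the next cell can also be written with the Dask DataFrame API. This sketch assumes the `df` taxi DataFrame defined above:

```python
# Dask DataFrame equivalent of the fare-vs-distance SQL query (sketch).
subset = df[(df.trip_distance >= 0) & (df.trip_distance < 50)]
fare_by_dist = (
    subset.assign(dist=subset.trip_distance // 1)   # floor(trip_distance)
    .groupby("dist")
    .fare_amount.mean()
    .compute()
)
fare_by_dist.plot()
```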
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "c.sql(\"\"\"\n", 207 | " SELECT floor(trip_distance) AS dist, avg(fare_amount) as fare\n", 208 | " FROM taxi \n", 209 | " WHERE trip_distance < 50 AND trip_distance >= 0 \n", 210 | " GROUP BY floor(trip_distance)\n", 211 | "\"\"\").compute().plot(x=\"dist\", y=\"fare\");" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "If you would like to learn more about Dask-SQL check out the [Dask-SQL docs](https://dask-sql.readthedocs.io/) or [source code](https://github.com/nils-braun/dask-sql) on GitHub." 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.8.6" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 4 243 | } 244 | -------------------------------------------------------------------------------- /quickstart/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started with Coiled\n", 8 | "\n", 9 | "Welcome to the getting started guide for Coiled! This notebook covers installing and setting up Coiled as well as running your first computation using Coiled." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Launch a cluster\n", 17 | "\n", 18 | "The first step is to spin up a Dask Cluster. In Coiled, this is done by creating a `coiled.Cluster` instance, there are [several keyword arguments](https://docs.coiled.io/user_guide/api.html#coiled.Cluster) you can use to specify the details of your cluster further. Please read the [cluster creation documentation](https://docs.coiled.io/user_guide/cluster_creation.html) to know more.\n", 19 | "\n", 20 | "Note that we will give a name to this cluster, if you don't specify this keyword argument, clusters will be given a unique randomly generated name." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import coiled\n", 30 | "\n", 31 | "cluster = coiled.Cluster(name=\"quickstart-example\", n_workers=10)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Once a cluster has been created (you can see the status on your [Coiled dashboard](https://cloud.coiled.io/)), you can connect Dask to the cluster by creating a `distributed.Client` instance." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from dask.distributed import Client\n", 48 | "\n", 49 | "client = Client(cluster)\n", 50 | "client" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Analyze data in the cloud\n", 58 | "\n", 59 | "Now that we have our cluster running and Dask connected to it, let's run a computation. This example will run the computation on about 84 million rows." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import dask.dataframe as dd\n", 69 | "\n", 70 | "df = dd.read_parquet(\n", 71 | " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n", 72 | " columns=[\"passenger_count\", \"tip_amount\"],\n", 73 | " storage_options={\"anon\": True},\n", 74 | ").persist()\n", 75 | "\n", 76 | "df.groupby(\"passenger_count\").tip_amount.mean().compute()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Stop a cluster\n", 84 | "\n", 85 | "By default, clusters will shutdown after 20 minutes of inactivity. You can stop a cluster by pressing the stop button on the [Coiled dashboard](https://cloud.coiled.io/). Alternatively, we can get a list of all running clusters and use the cluster name to stop it." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "coiled.list_clusters()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "The command `list_clusters` returns a dictionary with the cluster name used as the key. We can grab that and then call the command `coiled.delete_cluster()` to stop the running cluster, and `client.close()` to close the client." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "coiled.delete_cluster(name=\"quickstart-example\")\n", 111 | "client.close()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "You can now go back to the [Coiled dashboard](https://cloud.coiled.io/) and you will see that the cluster is now stopping/stopped" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# Software Environments\n", 126 | "\n", 127 | "Software Environments are Docker images that contain all your dependencies and files that you might need to run your computations. If you don't specify a software environment to the `coiled.Cluster` constructor, we will use Coiled's default software environment. You can learn more about software environments in our [documentation](https://docs.coiled.io/user_guide/software_environment.html)." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Create a software environment\n", 135 | "\n", 136 | "When creating software environments, there are [several keyword arguments](https://docs.coiled.io/user_guide/api.html#coiled.create_software_environment) that you can use to create a custom environment for your work." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "coiled.create_software_environment(\n", 146 | " name=\"quickstart\", \n", 147 | " conda={\n", 148 | " \"channels\": [\"conda-forge\"], \n", 149 | " \"dependencies\": [\"coiled=0.0.54\", \"dask=2021.10.0\"]\n", 150 | " }\n", 151 | ")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "We can now follow our previous workflow of creating a cluster - this time, we will use our newly created software environment - connect the cluster to Dask and then running the same example." 
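Aside (not part of the notebook source): before launching the cluster, you can optionally confirm that the "quickstart" software environment finished building. A quick check, assuming your account has access to it:

```python
# Optional check (sketch): list the software environments on your account and
# confirm that "quickstart" appears before starting a cluster that uses it.
import coiled

coiled.list_software_environments()
```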
159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "cluster = coiled.Cluster(n_workers=10, software=\"quickstart\")\n", 168 | "client = Client(cluster)\n", 169 | "client" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "If you go to the [Coiled dashboard](https://cloud.coiled.io/), under the **Software Environment** column, you can see that we are using the quickstart software environment we have just created. Note also that this time, the cluster will have a randomly generated name.\n", 177 | "\n", 178 | "Let's now run the same computation as before, but using the cluster that is running with the software environment that we have recently created." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "import dask.dataframe as dd\n", 188 | "\n", 189 | "df = dd.read_parquet(\n", 190 | " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n", 191 | " columns=[\"passenger_count\", \"tip_amount\"],\n", 192 | " storage_options={\"anon\": True},\n", 193 | ").persist()\n", 194 | "\n", 195 | "df.groupby(\"passenger_count\").tip_amount.mean().compute()" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.8.8" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 4 220 | } 221 | -------------------------------------------------------------------------------- /hyperband/hyperband-optimization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hyperparameter Optimization with Dask and Coiled\n", 8 | "\n", 9 | "This example will walk through the following:\n", 10 | "\n", 11 | "* **Getting and processing the data.**\n", 12 | "* **Defining a model and parameters.**\n", 13 | "* **Finding the best parameters,** and some details on why we're using the chosen search algorithm.\n", 14 | "* **Scoring** and deploying.\n", 15 | "\n", 16 | "All of these tasks will be performed on the New York City Taxi Cab dataset." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Setup cluster" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Create cluster with Coiled\n", 33 | "import coiled\n", 34 | "\n", 35 | "cluster = coiled.Cluster(\n", 36 | " n_workers=20,\n", 37 | " software=\"examples/hyperband-optimization\",\n", 38 | ")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Connect Dask to the cluster\n", 48 | "import dask.distributed\n", 49 | "\n", 50 | "client = dask.distributed.Client(cluster)\n", 51 | "client" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "#### ☝️ Don’t forget to click the \"Dashboard\" link above to view the cluster dashboard!" 
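Aside (not in the original notebook): if the "Dashboard" link in the client repr is easy to miss, the same URL is also available programmatically:

```python
# The dashboard URL shown in the client repr is also exposed as an attribute.
print(client.dashboard_link)
```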
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Get and pre-process data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", 73 | "\n", 74 | "These data have records on 84 million taxi rides.\n", 75 | "\n", 76 | "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import dask.dataframe as dd\n", 86 | "\n", 87 | "features = [\"passenger_count\", \"trip_distance\", \"fare_amount\"]\n", 88 | "categorical_features = [\"RatecodeID\", \"payment_type\"]\n", 89 | "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", 90 | "\n", 91 | "df = dd.read_parquet(\n", 92 | " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n", 93 | " parse_dates=output,\n", 94 | " usecols=features + categorical_features + output,\n", 95 | " dtype={\n", 96 | " \"passenger_count\": \"UInt8\",\n", 97 | " \"RatecodeID\": \"category\",\n", 98 | " \"payment_type\": \"category\",\n", 99 | " },\n", 100 | ")\n", 101 | "\n", 102 | "df = df.repartition(partition_size=\"10 MiB\").persist()\n", 103 | "\n", 104 | "# one hot encode the categorical columns\n", 105 | "df = df.categorize(categorical_features)\n", 106 | "df = dd.get_dummies(df, columns=categorical_features)\n", 107 | "\n", 108 | "# persist so only download once\n", 109 | "df = df.persist()\n", 110 | "\n", 111 | "data = df[[c for c in df.columns if c not in output]]\n", 112 | "data = data.fillna(0)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "durations = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.total_seconds() / 60 # minutes" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "from dask_ml.model_selection import train_test_split\n", 131 | "import dask\n", 132 | "\n", 133 | "X = data.to_dask_array(lengths=True).astype(\"float32\")\n", 134 | "y = durations.to_dask_array(lengths=True).astype(\"float32\")\n", 135 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, shuffle=True)\n", 136 | "\n", 137 | "# persist the data so it's not re-computed\n", 138 | "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Define model and hyperparameters" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Let's use a simple neural network from [PyTorch] using [Skorch], a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", 153 | "\n", 154 | "This network is only small for demonstration. 
If desired, we could use much larger networks on GPUs.\n", 155 | "\n", 156 | "[PyTorch]:https://pytorch.org/\n", 157 | "[skorch]:https://skorch.readthedocs.io/en/stable/" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Import our HiddenLayerNet PyTorch model from a local torch_model.py module\n", 167 | "from torch_model import HiddenLayerNet\n", 168 | "# Send module with HiddenLayerNet to workers on cluster\n", 169 | "client.upload_file(\"torch_model.py\")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Print contents of torch_model.py module\n", 179 | "!cat torch_model.py" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "import torch\n", 189 | "import torch.optim as optim\n", 190 | "import torch.nn as nn\n", 191 | "from skorch import NeuralNetRegressor\n", 192 | "\n", 193 | "niceties = {\n", 194 | " \"callbacks\": False,\n", 195 | " \"warm_start\": True,\n", 196 | " \"train_split\": None,\n", 197 | " \"max_epochs\": 1,\n", 198 | "}\n", 199 | "\n", 200 | "class NonNanLossRegressor(NeuralNetRegressor):\n", 201 | " def get_loss(self, y_pred, y_true, X=None, training=False):\n", 202 | " if torch.abs(y_true - y_pred).mean() > 1e6: # guard against exploding losses\n", 203 | " return torch.tensor([0.0], requires_grad=True)\n", 204 | " return super().get_loss(y_pred, y_true, X=X, training=training)\n", 205 | "\n", 206 | "model = NonNanLossRegressor(\n", 207 | " module=HiddenLayerNet,\n", 208 | " module__n_features=X_train.shape[1],\n", 209 | " optimizer=optim.SGD,\n", 210 | " criterion=nn.MSELoss,\n", 211 | " lr=0.0001,\n", 212 | " **niceties,\n", 213 | ")" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from scipy.stats import loguniform, uniform\n", 223 | "\n", 224 | "params = {\n", 225 | " \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n", 226 | " \"batch_size\": [32, 64, 128, 256],\n", 227 | " \"optimizer__lr\": loguniform(1e-4, 1e-3),\n", 228 | " \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n", 229 | " \"optimizer__momentum\": uniform(0, 1),\n", 230 | " \"optimizer__nesterov\": [True],\n", 231 | "}" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Only `module__activation` controls the model architecture; the rest are basic optimization parameters, such as `batch_size` and `optimizer__lr`, which control how we find the best model of a particular architecture." 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "## Find the best hyperparameters" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a fairly complicated search space (in reality, this small example has neither). 
It is also \"memory-constrained\" because the dataset doesn't fit in memory.\n", 253 | "\n", 254 | "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`.\n", 255 | "\n", 256 | "[2]:https://ml.dask.org/hyper-parameter-search.html" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "from dask_ml.model_selection import HyperbandSearchCV\n", 266 | "search = HyperbandSearchCV(model, params, random_state=2, verbose=True, max_iter=9)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "By default, `HyperbandSearchCV` calls `partial_fit` on each chunk of the Dask Array. Its rule of thumb is to set `max_iter` to roughly the number of hyperparameter combinations to sample, and to size the chunks so that `max_iter` chunks together hold about as many examples as the best model needs to train on. Note that we reshape `y_train` into a 2-D column vector below because skorch's `NeuralNetRegressor` expects 2-D targets." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "y_train2 = y_train.reshape(-1, 1).persist()\n", 283 | "search.fit(X_train, y_train2)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Score" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", 298 | "\n", 299 | "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "search.best_score_" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "search.best_params_" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "search.best_estimator_" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "This means we can deploy the best model and score on the testing dataset:" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from dask_ml.wrappers import ParallelPostFit\n", 343 | "deployed_model = ParallelPostFit(search.best_estimator_)\n", 344 | "deployed_model.score(X_test, y_test)" 345 | ] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "Python 3", 351 | "language": "python", 352 | "name": "python3" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.8.8" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 4 369 | } 370 | -------------------------------------------------------------------------------- /scaling-xgboost/scaling-xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Scaling XGBoost with Dask and Coiled\n", 8 | "\n", 9 | "This notebook walks through 
training an [XGBoost](https://xgboost.readthedocs.io/en/latest/) model locally on a small dataset and then using [Dask](https://dask.org/) and [Coiled](https://coiled.io/) to scale out to the cloud and run XGBoost on a larger-than-memory dataset." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Local XGBoost\n", 17 | "\n", 18 | "[XGBoost](https://xgboost.readthedocs.io/en/latest/) is a popular library for training gradient boosted supervised machine learning models. \n", 19 | "\n", 20 | "## Load our dataset\n", 21 | "\n", 22 | "The first step towards training our model is to load our dataset. We'll use the [Higgs dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS), which is available on Amazon S3. The dataset is composed of 11 million simulated particle collisions, each of which is described by 28 real-valued features and a binary label indicating which class the sample belongs to (i.e. whether the sample represents a signal or background event). To start, we'll load only a sample of the dataset (just over 175 thousand samples) and process the full dataset in the next section." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "\n", 33 | "# Load a single CSV file from the public S3 bucket\n", 34 | "df = pd.read_csv(\"s3://coiled-data/higgs/higgs-00.csv\", storage_options={\"anon\": True})\n", 35 | "df" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Next, we can separate our classification label and training features and then use Scikit-learn's `sklearn.model_selection.train_test_split` function to partition the dataset into training and testing samples." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X, y = df.iloc[:, 1:], df[\"labels\"]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from sklearn.model_selection import train_test_split\n", 61 | "\n", 62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "To use XGBoost, we'll need to construct `xgboost.DMatrix` objects for both our training and testing datasets -- these are the internal data structures XGBoost uses to manage dataset features and targets. However, since XGBoost plays well with libraries like NumPy and Pandas, we can simply pass our training and testing datasets directly to `xgboost.DMatrix(...)`." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import xgboost\n", 79 | "\n", 80 | "dtrain = xgboost.DMatrix(X_train, y_train)\n", 81 | "dtest = xgboost.DMatrix(X_test, y_test)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Next we'll define the set of hyperparameters we want to use for our XGBoost model and train the model!"
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "params = {\n", 98 | " 'objective': 'binary:logistic',\n", 99 | " 'max_depth': 3,\n", 100 | " 'min_child_weight': 0.5,\n", 101 | "}\n", 102 | "\n", 103 | "bst = xgboost.train(params, dtrain, num_boost_round=3)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Now that our model has been trained, we can use it to make predictions on the testing dataset, which was _not_ used to train the model." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "y_pred = bst.predict(dtest)\n", 120 | "\n", 121 | "y_pred" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "To get a sense for the quality of these predictions we can compute and plot a [receiver operating characteristic (ROC) curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) of our model's predictions, which compares the predicted output from our model with the known labels to calculate the true positive rate vs. false positive rate." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from sklearn.metrics import roc_curve\n", 138 | "\n", 139 | "fpr, tpr, _ = roc_curve(y_test, y_pred)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.metrics import auc\n", 149 | "import matplotlib.pyplot as plt\n", 150 | "%matplotlib inline\n", 151 | "\n", 152 | "fig, ax = plt.subplots(figsize=(8, 8))\n", 153 | "ax.plot(fpr, tpr, lw=3,\n", 154 | " label='ROC Curve (area = {:.2f})'.format(auc(fpr, tpr)))\n", 155 | "ax.plot([0, 1], [0, 1], \"k--\", lw=2)\n", 156 | "ax.set(\n", 157 | " xlim=(0, 1),\n", 158 | " ylim=(0, 1),\n", 159 | " title=\"ROC Curve\",\n", 160 | " xlabel=\"False Positive Rate\",\n", 161 | " ylabel=\"True Positive Rate\",\n", 162 | ")\n", 163 | "ax.legend()\n", 164 | "plt.show()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "# Scaling with Dask and Coiled\n", 172 | "\n", 173 | "In the previous section, we trained a model with a subset of the full Higgs dataset. In this section, we will use the full dataset with 11 million samples! With this increased number of samples, the dataset may not fit comfortably into memory on a personal laptop. So we'll use Dask and Coiled to expand our compute resources to the cloud to enable us to work with this larger dataset." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Create a Dask cluster on AWS with Coiled\n", 181 | "\n", 182 | "Let's create a Coiled cluster using the `examples/scaling-xgboost` software environment, which has Dask, XGBoost, Scikit-learn, and other relevant packages installed, and then connect a `dask.distributed.Client` to it so we can begin submitting tasks to the cluster."
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import coiled\n", 192 | "\n", 193 | "cluster = coiled.Cluster(\n", 194 | " n_workers=10,\n", 195 | " software=\"examples/scaling-xgboost\",\n", 196 | ")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "import dask.distributed\n", 206 | "\n", 207 | "client = dask.distributed.Client(cluster)\n", 208 | "client" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### ☝️ Don’t forget to click the \\\"Dashboard\\\" link above to view the cluster dashboard!" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### Load full dataset\n", 223 | "\n", 224 | "Dask's `read_csv` function makes it easy to read in all the CSV files in the dataset." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "import dask.dataframe as dd\n", 234 | "\n", 235 | "# Load the entire dataset using Dask\n", 236 | "ddf = dd.read_csv(\"s3://coiled-data/higgs/higgs-*.csv\", storage_options={\"anon\": True})\n", 237 | "ddf" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Dask's machine learning library, [Dask-ML](https://ml.dask.org/), mimics Scikit-learn's API, providing scalable versions of functions such as `sklearn.datasets.make_classification` and `sklearn.model_selection.train_test_split` that are designed to work with Dask Arrays and DataFrames in larger-than-memory settings.\n", 245 | "\n", 246 | "Let's use Dask-ML's `train_test_split` to partition the full 11-million-sample dataset into training and testing sets, just as we did before." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "from dask_ml.model_selection import train_test_split\n", 256 | "\n", 257 | "X, y = ddf.iloc[:, 1:], ddf[\"labels\"]\n", 258 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=2)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "Next we'll [persist our training and testing datasets](https://distributed.dask.org/en/latest/memory.html#persisting-collections) into distributed memory to avoid any unnecessary re-computations." 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "import dask\n", 275 | "\n", 276 | "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)\n", 277 | "\n", 278 | "X_train" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "To do distributed training of an XGBoost model, we'll use the [Dask-XGBoost](https://github.com/dask/dask-xgboost) package, which mirrors XGBoost's interface but works with Dask Arrays and DataFrames."
286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "import dask_xgboost\n", 295 | "\n", 296 | "bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=3)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Finally, we can again compute and plot the ROC curve for this model's predictions." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "y_pred = dask_xgboost.predict(client, bst, X_test)\n", 313 | "\n", 314 | "y_test, y_pred = dask.compute(y_test, y_pred)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "fpr, tpr, _ = roc_curve(y_test, y_pred)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "fig, ax = plt.subplots(figsize=(8, 8))\n", 333 | "ax.plot(fpr, tpr, lw=3,\n", 334 | " label='ROC Curve (area = {:.2f})'.format(auc(fpr, tpr)))\n", 335 | "ax.plot([0, 1], [0, 1], \"k--\", lw=2)\n", 336 | "ax.set(\n", 337 | " xlim=(0, 1),\n", 338 | " ylim=(0, 1),\n", 339 | " title=\"ROC Curve\",\n", 340 | " xlabel=\"False Positive Rate\",\n", 341 | " ylabel=\"True Positive Rate\",\n", 342 | ")\n", 343 | "ax.legend()\n", 344 | "plt.show()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "Voilà! Congratulations on training a boosted decision tree in the cloud." 352 | ] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 3", 358 | "language": "python", 359 | "name": "python3" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 3 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython3", 371 | "version": "3.8.5" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 4 376 | } 377 | --------------------------------------------------------------------------------
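As an aside, recent XGBoost releases also bundle a native Dask integration, `xgboost.dask`, which can be used in place of the `dask_xgboost` package shown in the notebook above. A minimal sketch of the equivalent training step, assuming `xgboost>=1.3` is installed on both the notebook session and the cluster workers, and reusing the `client`, `params`, `X_train`, `y_train`, and `X_test` objects defined above:

```python
import xgboost as xgb

# Wrap the distributed training data without pulling it back to the client
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)

# train() returns a dict holding the fitted booster and the per-round history
result = xgb.dask.train(client, params, dtrain, num_boost_round=3)
booster = result["booster"]

# Predictions come back as a lazy Dask collection; compute() materializes them
y_pred = xgb.dask.predict(client, booster, X_test).compute()
```

The rest of the evaluation (ROC curve and AUC) works unchanged on these predictions.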
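Each of the notebooks above leaves a Coiled cluster running in the cloud. When you are done experimenting, it is good practice to shut the cluster down explicitly so its cloud instances are released. A minimal sketch, reusing the `client` and `cluster` objects created earlier in the notebooks:

```python
# Disconnect the client, then shut down the Coiled cluster's scheduler and workers
client.close()
cluster.close()
```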