├── energy-landscape ├── run.sh ├── create-notebook.py └── workspace.json ├── xgboost-mortgage ├── run.sh ├── create-notebook.py ├── workspace.json └── xgboost-mortgage.ipynb ├── hyperparameter-tuning ├── run.sh ├── create-notebook.py ├── workspace.json └── hyperparameter-tuning.ipynb ├── .github └── workflows │ └── ci-build.yml ├── LICENSE ├── README.md └── .gitignore /energy-landscape/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /xgboost-mortgage/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /hyperparameter-tuning/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start.sh jupyter lab workspaces import workspace.json 4 | start.sh jupyter lab -------------------------------------------------------------------------------- /.github/workflows/ci-build.yml: -------------------------------------------------------------------------------- 1 | 2 | name: Build notebooks 3 | 4 | on: 5 | push: 6 | branches: master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | example: ["xgboost-mortgage", "hyperparameter-tuning", "energy-landscape"] 15 | env: 16 | DASK_COILED__SERVER: https://cloud.coiled.io 17 | DASK_COILED__TOKEN: ${{ secrets.DASK_COILED__TOKEN }} 18 | 19 | steps: 20 | - name: Checkout source 21 | uses: actions/checkout@v2 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v1 25 | 26 | - name: Install Coiled 27 | run: python -m pip install coiled 28 | 29 | - name: Build ${{ matrix.example 
}} notebook 30 | run: python create-notebook.py 31 | working-directory: ${{ matrix.example }} -------------------------------------------------------------------------------- /energy-landscape/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | software_name = "blog-notebooks/energy-landscape" 4 | coiled.create_software_environment( 5 | name=software_name, 6 | container="coiled/notebook:latest", 7 | conda={ 8 | "channels": ["conda-forge"], 9 | "dependencies": [ 10 | "coiled", 11 | "dask-ml", 12 | "dask>=2.23.0", 13 | "fastparquet", 14 | "matplotlib", 15 | "pandas>=1.1.0", 16 | "python-snappy", 17 | "seaborn", 18 | "s3fs", 19 | "scikit-learn", 20 | "xgboost>=1.3.0", 21 | "optuna<2.4.0", 22 | ], 23 | }, 24 | pip=["dask-optuna", "fastdtw"], 25 | ) 26 | 27 | coiled.create_job_configuration( 28 | name="blog-notebooks/energy-landscape", 29 | software=software_name, 30 | command=[ 31 | "/bin/bash", 32 | "run.sh", 33 | ], 34 | files=["energy-landscape.ipynb", "workspace.json", "run.sh"], 35 | ports=[8888], 36 | description="Explore residential electricity usage and Dynamic Time Warping", 37 | ) 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Coiled 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://mk0coiled27knr0w73eb.kinstacdn.com/wp-content/uploads/horizontal-black.svg) 2 | 3 | # [Coiled Blog](https://coiled.io/blog) accompanying notebooks 4 | 5 | Example notebooks letting you reproduce results from the Coiled Blog. You can launch these notebooks at https://cloud.coiled.io/blog-notebooks/notebooks. 
6 | 7 | * **[Blog](https://coiled.io/blog/xgboost-frictionless-training/) | [Notebook](xgboost-mortgage/xgboost-mortgage.ipynb) | [Launch](https://cloud.coiled.io/blog-notebooks/jobs/xgboost-on-coiled)** - XGBoost – frictionless training on datasets too big for the memory 8 | * **[Blog](https://coiled.io/blog/changing-energy-landscape-distributed-python/) | [Notebook](energy-landscape/energy-landscape.ipynb) | [Launch](https://cloud.coiled.io/blog-notebooks/jobs/energy-landscape)** - A Changing Energy Landscape – Analyzing 8 Million Homes with Distributed Python 9 | * **[Blog](https://coiled.io/blog/faster-hyperparameter-tuning-cloud/) | [Notebook](hyperparameter-tuning/hyperparameter-tuning.ipynb) | [Launch](https://cloud.coiled.io/blog-notebooks/jobs/optuna-xgboost)** - No More Coffee Breaks - Faster Hyperparameter Tuning in the Cloud -------------------------------------------------------------------------------- /xgboost-mortgage/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | software_name = "blog-notebooks/xgboost-on-coiled" 4 | coiled.create_software_environment( 5 | name=software_name, 6 | container="coiled/notebook:latest", 7 | conda={ 8 | "channels": ["conda-forge"], 9 | "dependencies": [ 10 | "coiled", 11 | "dask", 12 | "dask-ml", 13 | "dask>=2.23.0", 14 | "fastparquet", 15 | "matplotlib", 16 | "pandas>=1.1.0", 17 | "python-snappy", 18 | "s3fs", 19 | "scikit-learn", 20 | "xgboost>=1.3.0", 21 | "optuna<2.4.0", 22 | "numpy", 23 | "xgboost", 24 | "joblib", 25 | ], 26 | }, 27 | pip=["dask-optuna"], 28 | ) 29 | 30 | coiled.create_job_configuration( 31 | name="blog-notebooks/xgboost-on-coiled", 32 | software=software_name, 33 | command=[ 34 | "/bin/bash", 35 | "run.sh", 36 | ], 37 | files=["xgboost-mortgage.ipynb", "workspace.json", "run.sh"], 38 | ports=[8888], 39 | description="Train XGBoost on a large dataset with Dask on Coiled", 40 | ) 41 | 
-------------------------------------------------------------------------------- /hyperparameter-tuning/create-notebook.py: -------------------------------------------------------------------------------- 1 | import coiled 2 | 3 | software_name = "blog-notebooks/xgboost-on-coiled" 4 | coiled.create_software_environment( 5 | name=software_name, 6 | container="coiled/notebook:latest", 7 | conda={ 8 | "channels": ["conda-forge"], 9 | "dependencies": [ 10 | "coiled", 11 | "dask", 12 | "dask-ml", 13 | "dask>=2.23.0", 14 | "fastparquet", 15 | "matplotlib", 16 | "pandas>=1.1.0", 17 | "python-snappy", 18 | "s3fs", 19 | "scikit-learn", 20 | "xgboost>=1.3.0", 21 | "optuna<2.4.0", 22 | "numpy", 23 | "xgboost", 24 | "joblib", 25 | ], 26 | }, 27 | pip=["dask-optuna"], 28 | ) 29 | 30 | coiled.create_job_configuration( 31 | name="blog-notebooks/optuna-xgboost", 32 | software=software_name, 33 | command=[ 34 | "/bin/bash", 35 | "run.sh", 36 | ], 37 | files=["hyperparameter-tuning.ipynb", "workspace.json", "run.sh"], 38 | ports=[8888], 39 | description="XGBoost hyperparameter tuning with Optuna and Dask on Coiled", 40 | ) 41 | -------------------------------------------------------------------------------- /energy-landscape/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:energy-landscape.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:energy-landscape.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | 
"notebook:quickstart.ipynb": { 35 | "data": { 36 | "path": "energy-landscape.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /xgboost-mortgage/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:xgboost-mortgage.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:xgboost-mortgage.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:quickstart.ipynb": { 35 | "data": { 36 | "path": "xgboost-mortgage.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /hyperparameter-tuning/workspace.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "file-browser-filebrowser:cwd": { 4 | "path": "" 5 | }, 6 | "layout-restorer:data": { 7 | "main": { 8 | "dock": { 9 | "type": "tab-area", 10 | "currentIndex": 0, 11 | "widgets": [ 12 | "notebook:hyperparameter-tuning.ipynb" 13 | ] 14 | }, 15 | "mode": "multiple-document", 16 | "current": "notebook:hyperparameter-tuning.ipynb" 17 | }, 18 | "left": { 19 | "collapsed": true, 20 | "widgets": [ 21 | "filebrowser", 22 | "running-sessions", 23 | "command-palette", 24 | "jp-property-inspector", 25 | "tab-manager", 26 | "extensionmanager.main-view" 27 | ] 
28 | }, 29 | "right": { 30 | "collapsed": true, 31 | "widgets": [] 32 | } 33 | }, 34 | "notebook:quickstart.ipynb": { 35 | "data": { 36 | "path": "hyperparameter-tuning.ipynb", 37 | "factory": "Notebook" 38 | } 39 | } 40 | }, 41 | "metadata": { 42 | "id": "/lab" 43 | } 44 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /xgboost-mortgage/xgboost-mortgage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Distributed XGBoost on Dask in the cloud\n", 8 | "\n", 9 | "This is the accompanying notebook to the blog post [XGBoost – frictionless training on datasets too big for the memory](https://coiled.io/blog/xgboost-frictionless-training/).\n", 10 | "\n", 11 | "Swap in your dataset, spin up a cluster in 2 minutes and train at any scale!" 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Cluster setup " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from dask import dataframe as dd\n", 28 | "import coiled" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Order the cluster and look at it coming up in your [Coiled dashboard](https://cloud.coiled.io/):" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "%%time\n", 45 | "cluster = coiled.Cluster(n_workers=12, software=\"xgboost-on-coiled\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Connect to the cluster:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from dask.distributed import Client, progress\n", 62 | "client = Client(cluster)\n", 63 | "client" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Load the dataset sample" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "mortgage_data = dd.read_parquet(\n", 80 | " \"s3://coiled-data/mortgage-2000.parq/*\", \n", 81 | " compression=\"gzip\", \n", 82 | " columns=columns, \n", 83 | " storage_options={\"anon\":True}\n", 84 | ")\n", 85 | "\n", 86 | "mortgage_data" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Pin the downloaded dataset to memory:\n", 94 | "\n", 95 | "_This step reduces waiting times in subsequent steps that trigger computation._" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | 
"mortgage_data = mortgage_data.persist()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Data preprocessing" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "The dataset needs a little work - we need to prepare categorical columns to a format that is supported by XGBoost.\n", 119 | "\n", 120 | "The columns we'll be working with:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "columns = [\n", 130 | " \"delinquency_12\",\n", 131 | " \"interest_rate\",\n", 132 | " \"loan_age\",\n", 133 | " \"adj_remaining_months_to_maturity\",\n", 134 | " \"longest_ever_deliquent\",\n", 135 | " \"orig_channel\",\n", 136 | " \"num_borrowers\",\n", 137 | " \"borrower_credit_score\",\n", 138 | " \"first_home_buyer\",\n", 139 | " \"loan_purpose\",\n", 140 | " \"property_type\",\n", 141 | " \"num_units\",\n", 142 | " \"occupancy_status\",\n", 143 | " \"property_state\",\n", 144 | " \"zip\",\n", 145 | " \"mortgage_insurance_percent\",\n", 146 | " \"coborrow_credit_score\",\n", 147 | " \"relocation_mortgage_indicator\",\n", 148 | "]\n", 149 | "categorical = [\n", 150 | " \"orig_channel\",\n", 151 | " \"occupancy_status\",\n", 152 | " \"property_state\",\n", 153 | " \"first_home_buyer\",\n", 154 | " \"loan_purpose\",\n", 155 | " \"property_type\",\n", 156 | " \"zip\",\n", 157 | " \"relocation_mortgage_indicator\",\n", 158 | " \"delinquency_12\",\n", 159 | "]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "Create a column categorizer:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "from dask_ml.preprocessing import Categorizer\n", 176 | "\n", 177 | "ce = Categorizer(columns=categorical)" 178 | ] 179 | }, 180 | { 181 | 
"cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Apply column categorizer:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "mortgage_data = ce.fit_transform(mortgage_data)\n", 194 | "\n", 195 | "mortgage_data.dtypes" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# https://github.com/dmlc/xgboost/blame/9a0399e8981b2279d921fe2312f7ab1b880fd3c3/python-package/xgboost/dask.py#L227\n", 205 | "# Dask categorical columns are not yet available\n", 206 | "\n", 207 | "# the commit is already in master, can be expected in release 1.4.0\n", 208 | "\n", 209 | "# Because this is not possible yet, I will cast to ints\n", 210 | "for col in categorical:\n", 211 | " mortgage_data[col] = mortgage_data[col].cat.codes" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Split the dataset before training" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "dependent_vars = mortgage_data.columns.difference([\"delinquency_12\"])\n", 228 | "X, y = mortgage_data.iloc[:, dependent_vars], mortgage_data[\"delinquency_12\"]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from dask_ml.model_selection import train_test_split\n", 238 | "\n", 239 | "X_train, X_test, y_train, y_test = train_test_split(\n", 240 | " X, y, test_size=0.2, shuffle=True, random_state=2\n", 241 | ")" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Train XGBoost" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 
256 | "source": [ 257 | "import xgboost as xgb" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Prepare distributed DMatrix structures: " 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train) \n", 274 | "dtest = xgb.dask.DaskDMatrix(client, X_test, y_test) " 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Training params:" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "params = {\n", 291 | " \"max_depth\": 8,\n", 292 | " \"max_leaves\": 2 ** 8,\n", 293 | " \"alpha\": 0.9,\n", 294 | " \"eta\": 0.1,\n", 295 | " \"gamma\": 0.1,\n", 296 | " \"learning_rate\": 0.1,\n", 297 | " \"subsample\": 1,\n", 298 | " \"reg_lambda\": 1,\n", 299 | " \"scale_pos_weight\": 2,\n", 300 | " \"min_child_weight\": 30,\n", 301 | " \"objective\": \"binary:logistic\",\n", 302 | " \"grow_policy\": \"lossguide\",\n", 303 | "}" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Run training" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "%%time\n", 320 | "output = xgb.dask.train(\n", 321 | " client,\n", 322 | " params,\n", 323 | " dtrain,\n", 324 | " num_boost_round=20,\n", 325 | " evals=[\n", 326 | " (dtrain, 'train'), \n", 327 | " (dtest, 'test')\n", 328 | " ]\n", 329 | ")" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Access results" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "booster = output['booster'] # booster is 
the trained model\n", 346 | "history = output['history'] # A dictionary containing evaluation results" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "booster" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "history" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "### Close session and shut down the cluster" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "cluster.close()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### Join our Slack community and share your success!\n", 388 | "\n", 389 | "Follow this link to join: \n", 390 | "https://join.slack.com/t/coiled-users/shared_invite/zt-hx1fnr7k-In~Q8ui3XkQfvQon0yN5WQ" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "### Next steps\n", 398 | "\n", 399 | "* Train on your own dataset\n", 400 | "* Scale up the cluster to use more resources with `cluster.scale(24)`\n", 401 | "* [GPU-accelerated XGBoost on Dask (NVidia RAPIDS team)](https://github.com/rapidsai-community/notebooks-contrib/blob/branch-0.14/intermediate_notebooks/E2E/mortgage/mortgage_e2e.ipynb)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [] 410 | } 411 | ], 412 | "metadata": { 413 | "kernelspec": { 414 | "display_name": "Python 3", 415 | "language": "python", 416 | "name": "python3" 417 | }, 418 | "language_info": { 419 | "codemirror_mode": { 420 | "name": "ipython", 421 | "version": 3 422 | }, 423 | "file_extension": ".py", 424 | "mimetype": "text/x-python", 425 | "name": "python", 426 | 
"nbconvert_exporter": "python", 427 | "pygments_lexer": "ipython3", 428 | "version": "3.8.5" 429 | } 430 | }, 431 | "nbformat": 4, 432 | "nbformat_minor": 4 433 | } 434 | -------------------------------------------------------------------------------- /hyperparameter-tuning/hyperparameter-tuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Distributed Hyperparameter Optimization with Dask\n", 8 | "\n", 9 | "This is the accompanying notebook to the blog post [No More Coffee Breaks - Faster Hyperparameter Tuning in the Cloud](https://coiled.io/blog/faster-hyperparameter-tuning-cloud)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### Load data " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "wine = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\", delimiter=\";\")\n", 29 | "\n", 30 | "X, y = wine.iloc[:, :-1], wine[\"quality\"]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "wine.shape" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### 5-fold cross-validated XGBoost model" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from xgboost.sklearn import XGBRegressor\n", 56 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 57 | "\n", 58 | "estimator = XGBRegressor(objective=\"reg:squarederror\")\n", 59 | "-cross_val_score(estimator, X, y, cv=5, scoring=\"neg_mean_absolute_error\").mean()" 60 | ] 61 | }, 62 
| { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Local GridSearchCV (client machine)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "params = {\n", 76 | " \"max_depth\": [3, 16],\n", 77 | " \"min_child_weight\": [10, 30],\n", 78 | " \"eta\": [0.1, 0.05],\n", 79 | " \"grow_policy\": [\"depthwise\", \"lossguide\"],\n", 80 | "}" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "estimator = XGBRegressor(objective=\"reg:squarederror\")\n", 90 | "\n", 91 | "grid_search = GridSearchCV(\n", 92 | " estimator=estimator,\n", 93 | " param_grid=params,\n", 94 | " scoring=\"neg_mean_absolute_error\",\n", 95 | " n_jobs=-1,\n", 96 | " cv=5,\n", 97 | " verbose=True,\n", 98 | ")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "%%time\n", 108 | "grid_search.fit(X, y)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "grid_search.best_score_" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "grid_search.best_params_" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### GridSearchCV on a Dask `LocalCluster` (client machine)\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "This is equivalent to running local `GridSearchCV` with `sklearn`, but it can be useful for prototyping the flow before running it on another cluster with more resources.\n", 141 | "\n", 142 | "Feel free to skip this step and scale out to a bigger cluster!" 
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# from dask_ml.model_selection import GridSearchCV\n", 152 | "\n", 153 | "# params = {\n", 154 | "# \"max_depth\": [3, 6, 8, 16],\n", 155 | "# \"min_child_weight\": [3, 5, 10, 20, 30],\n", 156 | "# \"eta\": [0.3, 0.2, 0.1, 0.05, 0.01],\n", 157 | "# \"colsample_bytree\": np.arange(0.7, 1.0, 0.1),\n", 158 | "# \"sampling_method \": [\"uniform\", \"gradient_based\"],\n", 159 | "# \"booster\": [\"gbtree\", \"dart\"],\n", 160 | "# \"grow_policy\": [\"depthwise\", \"lossguide\"],\n", 161 | "# }\n", 162 | "\n", 163 | "# estimator = XGBRegressor(objective=\"reg:squarederror\")\n", 164 | "\n", 165 | "# grid_search = GridSearchCV(\n", 166 | "# estimator=estimator,\n", 167 | "# param_grid=params,\n", 168 | "# scoring=\"neg_mean_absolute_error\",\n", 169 | "# n_jobs=-1,\n", 170 | "# cv=5,\n", 171 | "# )" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# %%time\n", 181 | "# grid_search.fit(X, y)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# grid_search.best_score_" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# grid_search.best_params_" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### GridSearchCV on a Dask cluster in the cloud - Coiled\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 2, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "import coiled\n", 216 | "import joblib\n", 217 | "from dask.distributed import Client" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": 
[ 224 | "Set up a cluster of any size in 2 minutes. \n", 225 | "\n", 226 | "Go ahead and explore the configuration arguments for `coiled.Cluster()`!" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "coiled.Cluster??" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "Get a cluster in the cloud" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "%%time\n", 252 | "cluster = coiled.Cluster(n_workers=24, software=\"optuna-xgboost\")\n", 253 | "client = Client(cluster)\n", 254 | "\n", 255 | "client" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "params = {\n", 265 | " \"max_depth\": [3, 6, 8, 16],\n", 266 | " \"min_child_weight\": [3, 5, 10, 20, 30],\n", 267 | " \"eta\": [0.3, 0.2, 0.1, 0.05, 0.01],\n", 268 | " \"colsample_bytree\": np.arange(0.7, 1.0, 0.1),\n", 269 | " \"sampling_method \": [\"uniform\", \"gradient_based\"],\n", 270 | " \"booster\": [\"gbtree\", \"dart\"],\n", 271 | " \"grow_policy\": [\"depthwise\", \"lossguide\"],\n", 272 | "}" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "#### 5 folds for each of 3200 candidates, totalling 16000 fits" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "estimator = XGBRegressor(objective=\"reg:squarederror\")\n", 289 | "\n", 290 | "grid_search = GridSearchCV(\n", 291 | " estimator=estimator,\n", 292 | " param_grid=params,\n", 293 | " scoring=\"neg_mean_absolute_error\",\n", 294 | " n_jobs=-1,\n", 295 | " cv=5,\n", 296 | " verbose=2,\n", 297 | ")" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | 
"metadata": {}, 303 | "source": [ 304 | "`joblib`, the scikit-learn parallelization backend, will use our open cluster client connected to Coiled." 305 | ] 306 | }, 307 | { 308 | "source": [ 309 | "Make sure to launch the Dask Cluster dashboard, to see the tasks being worked on in real time!" 310 | ], 311 | "cell_type": "markdown", 312 | "metadata": {} 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "cluster.dashboard_link" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "%%time\n", 330 | "with joblib.parallel_backend(\"dask\", scatter=[X, y]):\n", 331 | " grid_search.fit(X, y)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "grid_search.best_score_" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "grid_search.best_estimator_" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "### Scale the cluster down\n", 357 | "\n", 358 | "For next steps, the required resources will be much lower.\n", 359 | "\n", 360 | "We can cut costs by scaling the cluster down right from this notebook:" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "cluster.scale(4) # target = 4 workers" 370 | ] 371 | }, 372 | { 373 | "source": [ 374 | "We can view the Cluster Dashboard to see workers disappearing and hourly cost coming down." 
375 | ], 376 | "cell_type": "markdown", 377 | "metadata": {} 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "cluster.dashboard_link" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## Optuna\n", 393 | "\n", 394 | "Bayesian optimization - accelerated search.\n", 395 | "\n", 396 | "Let's configure the objective function to suggest parameter values from the ranges we want to explore." 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "def objective(trial):\n", 406 | " params = {\n", 407 | " \"max_depth\": trial.suggest_int(\"max_depth\", 3, 16),\n", 408 | " \"min_child_weight\": trial.suggest_int(\"min_child_weight\", 3, 30),\n", 409 | " \"eta\": trial.suggest_float(\"eta\", 1e-8, 1.0, log=True),\n", 410 | " \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.7, 1.0),\n", 411 | " \"sampling_method \": trial.suggest_categorical(\n", 412 | " \"sampling_method\", [\"uniform\", \"gradient_based\"]\n", 413 | " ),\n", 414 | " \"booster\": trial.suggest_categorical(\"booster\", [\"gbtree\", \"dart\"]),\n", 415 | " \"grow_policy\": trial.suggest_categorical(\n", 416 | " \"grow_policy\", [\"depthwise\", \"lossguide\"]\n", 417 | " ),\n", 418 | " }\n", 419 | "\n", 420 | " wine = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\", delimiter=\";\")\n", 421 | " X, y = wine.iloc[:, :-1], wine[\"quality\"]\n", 422 | "\n", 423 | " estimator = XGBRegressor(objective=\"reg:squarederror\", **params)\n", 424 | " \n", 425 | " score = cross_val_score(\n", 426 | " estimator, X, y, cv=5, scoring=\"neg_mean_absolute_error\"\n", 427 | " ).mean()\n", 428 | " \n", 429 | " return score" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | 
"metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "%%time\n", 439 | "\n", 440 | "import dask_optuna\n", 441 | "import joblib\n", 442 | "import optuna\n", 443 | "\n", 444 | "storage = dask_optuna.DaskStorage()\n", 445 | "\n", 446 | "study = optuna.create_study(direction=\"maximize\", storage=storage)\n", 447 | "with joblib.parallel_backend(\"dask\"):\n", 448 | " study.optimize(objective, n_trials=100, n_jobs=-1)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "study.best_value" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "study.best_trial" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "study.best_params" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "results = study.trials_dataframe()" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "### Plot trial outcomes" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "import matplotlib.pyplot as plt" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 42, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | "Text(0.5, 1.0, 'Mean Absolute Error over 100 Optimization Trials')" 512 | ] 513 | }, 514 | "execution_count": 42, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | }, 518 | { 519 | "data": { 520 | "image/png": "\n", 521 | "text/plain": [ 522 | "
" 523 | ] 524 | }, 525 | "metadata": { 526 | "needs_background": "light" 527 | }, 528 | "output_type": "display_data" 529 | } 530 | ], 531 | "source": [ 532 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 533 | "results.value.clip(lower=-1).plot.line()\n", 534 | "best_trial = results.value.idxmax()\n", 535 | "plt.scatter(best_trial, results.loc[best_trial].value, color='r', s=100)\n", 536 | "plt.ylabel('5-fold CV Mean Absolute Error')\n", 537 | "plt.xlabel('Optuna Trials')\n", 538 | "plt.title('Mean Absolute Error over 100 Optimization Trials')" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "### Set down the cluster" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "cluster.close()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "### Dataset citation:\n", 562 | "\n", 563 | "> P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. \n", 564 | "> Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009." 565 | ] 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "display_name": "Python 3", 571 | "language": "python", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.8.5" 585 | } 586 | }, 587 | "nbformat": 4, 588 | "nbformat_minor": 4 589 | } --------------------------------------------------------------------------------