├── requirements.txt ├── .dockerignore ├── 01_hello_sky ├── example.yaml └── 01_hello_sky.ipynb ├── Dockerfile ├── LICENSE ├── 03_spot_instances ├── bert.yaml ├── terminator.py └── 03_spot_instances.ipynb ├── 02_using_accelerators ├── bert.yaml └── 02_using_accelerators.ipynb ├── README.md ├── .gitignore └── 00_installation └── 00_installation.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | jupyterlab 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/.ipynb_checkpoints 2 | **/.git 3 | build_docker.sh 4 | Dockerfile 5 | local/ 6 | README.md 7 | LICENSE 8 | -------------------------------------------------------------------------------- /01_hello_sky/example.yaml: -------------------------------------------------------------------------------- 1 | # example.yaml 2 | name: example 3 | 4 | setup: | 5 | echo "Run any setup commands here" 6 | pip install cowsay 7 | 8 | run: | 9 | echo "Hello Stranger!" 10 | cowsay "Moo!" 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.12.0 2 | 3 | WORKDIR /skypilot-tutorial 4 | 5 | ADD ./requirements.txt /skypilot-tutorial/requirements.txt 6 | 7 | # Install tutorial dependencies 8 | RUN pip install -r requirements.txt 9 | 10 | # Install SkyPilot + dependencies 11 | RUN conda install -c conda-forge google-cloud-sdk && \ 12 | apt update -y && \ 13 | apt install rsync nano -y && \ 14 | pip install skypilot[aws,gcp] && \ 15 | rm -rf /var/lib/apt/lists/* 16 | 17 | # Copy credentials. 18 | # UPDATE - no longer required. Instead mount the .aws and .config dirs to /credentials and it will be copied over. 
19 | # COPY src/.aws /root/.aws 20 | # COPY src/.config/gcloud /root/.config/gcloud 21 | 22 | # Exclude usage logging message 23 | RUN mkdir -p /root/.sky && touch /root/.sky/privacy_policy 24 | 25 | # Add files which may change frequently 26 | COPY . /skypilot-tutorial 27 | 28 | # Set bash as default shell 29 | ENV SHELL /bin/bash 30 | 31 | CMD ["/bin/bash", "-c", "cp -a /credentials/. /root/;sky show-gpus;jupyter lab --no-browser --ip '*' --allow-root --notebook-dir=/skypilot-tutorial --NotebookApp.token='SkyCamp2022'"] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Romil Bhardwaj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /03_spot_instances/bert.yaml: -------------------------------------------------------------------------------- 1 | name: bert 2 | 3 | resources: 4 | accelerators: T4:1 # Use T4 GPUs for quota reasons. 5 | cloud: aws 6 | region: us-west-2 7 | 8 | # file_mounts specifies any data that must be made available to the task 9 | file_mounts: 10 | /dataset/: # This specifies the destination where the object bucket will be mounted 11 | source: s3://sky-bert-dataset/ # The bucket URL to be mounted 12 | 13 | 14 | # Setup repository. 15 | setup: | 16 | git clone https://github.com/huggingface/transformers.git 17 | cd transformers && git checkout v4.18.0 18 | pip install -e . 19 | cd examples/pytorch/question-answering/ 20 | pip install -r requirements.txt 21 | 22 | # Run command. Note that the --train_file argument reads from the object store mounted at /dataset 23 | run: | 24 | cd transformers/examples/pytorch/question-answering/ 25 | python run_qa.py \ 26 | --train_file /dataset/train-v2.0.json \ 27 | --model_name_or_path bert-base-uncased \ 28 | --dataset_name squad \ 29 | --do_train \ 30 | --do_eval \ 31 | --per_device_train_batch_size 12 \ 32 | --learning_rate 3e-5 \ 33 | --num_train_epochs 50 \ 34 | --max_seq_length 384 \ 35 | --doc_stride 128 \ 36 | --report_to none \ 37 | --output_dir /tmp/checkpoints/. \ 38 | --save_total_limit 10 \ 39 | --save_steps 1000 40 | -------------------------------------------------------------------------------- /02_using_accelerators/bert.yaml: -------------------------------------------------------------------------------- 1 | name: bert 2 | 3 | resources: 4 | accelerators: # [DIY] - Add K80:1 here! 5 | 6 | # For this task, we specify cloud and region because our tutorial account has quota only in the us-west-2 region. 7 | # If these are not specified, SkyPilot will try the cheapest region first, and failover if quota is exceeded. 
8 | cloud: aws 9 | region: us-west-2 10 | 11 | # file_mounts specifies the any data that must be made available to the task 12 | file_mounts: 13 | /dataset/: # This specifies the destination where the object bucket will be mounted 14 | source: s3://sky-bert-dataset/ # The bucket URL to be mounted 15 | 16 | # Setup repository. 17 | setup: | 18 | git clone https://github.com/huggingface/transformers.git 19 | cd transformers && git checkout v4.18.0 20 | pip install -e . 21 | cd examples/pytorch/question-answering/ 22 | pip install -r requirements.txt 23 | 24 | # Run command. Note that the --train_file argument reads from the object store mounted at /dataset 25 | run: | 26 | cd transformers/examples/pytorch/question-answering/ 27 | python run_qa.py \ 28 | --train_file /dataset/train-v2.0.json \ 29 | --model_name_or_path bert-base-uncased \ 30 | --dataset_name squad \ 31 | --do_train \ 32 | --do_eval \ 33 | --per_device_train_batch_size 12 \ 34 | --learning_rate 3e-5 \ 35 | --num_train_epochs 50 \ 36 | --max_seq_length 384 \ 37 | --doc_stride 128 \ 38 | --report_to none \ 39 | --output_dir /tmp/checkpoints/. \ 40 | --save_total_limit 10 \ 41 | --save_steps 1000 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | SkyPilot 5 | 6 |

7 | 8 | # SkyPilot Tutorial 👩‍🏫 9 | Welcome to the [SkyPilot](https://github.com/skypilot-org/skypilot) tutorial! This tutorial is a collection of Jupyter notebooks that will walk you through the basics of using SkyPilot. 10 | 11 | # Installation 12 | To set up the dependencies for this tutorial, run the following command: 13 | ```console 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | ## Running the tutorial on your laptop 18 | It is recommended to run the tutorial with jupyter lab. In a terminal in the root of the repository, run: 19 | ```console 20 | jupyter lab 21 | ``` 22 | Jupyter lab should now be running. Follow the URL generated by jupyter lab to open the tutorial in your browser. 23 | 24 | If you do not have SkyPilot installed, please start with `00_installation` or refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). Otherwise, you can start with `01_hello_sky`. 25 | 26 | 27 | ## Running the tutorial in a Docker container 28 | If you prefer to run the tutorial in a Docker container, you can do so by running the following command: 29 | ```console 30 | docker run --rm -p 8888:8888 -it public.ecr.aws/a9w6z7w5/skypilot-tutorial:latest 31 | ``` 32 | Note that you may need to set up credentials inside the container. 33 | 34 | If you would like to skip credential setup in the container and use your local credentials, copy your `~/.aws/` and `~/.config/` directories to a new directory `/tmp/credentials` such that it contains `/tmp/credentials/.aws/` and `/tmp/credentials/.config/gcloud/` directories with the relevant files. Then, run the following command: 35 | 36 | ```console 37 | docker run --rm -v /tmp/credentials:/credentials:ro -p 8888:8888 -it public.ecr.aws/a9w6z7w5/skypilot-tutorial:latest 38 | ``` 39 | 40 | This will automatically install the AWS and GCP credentials inside the container. 
41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /03_spot_instances/terminator.py: -------------------------------------------------------------------------------- 1 | # Helper functions to terminate AWS instances 2 | import sky 3 | import subprocess 4 | import time 5 | import sys 6 | 7 | 8 | def sleep_timer(time_duration): 9 | for remaining in range(time_duration, 0, -1): 10 | sys.stdout.write("\r") 11 | sys.stdout.write("{:3d} seconds remaining.".format(remaining)) 12 | sys.stdout.flush() 13 | time.sleep(1) 14 | 15 | 16 | def terminate(job_name=None, job_region=None): 17 | if job_name is None: 18 | print("Finding spot job to terminate...") 19 | jobs = sky.core.spot_status(refresh=False) 20 | latest_job = max(jobs, key=lambda k: k['job_id']) 21 | job_name = latest_job['job_name'] 22 | job_region = latest_job['region'] 23 | assert str(latest_job[ 24 | 'status']) == 'SpotStatus.RUNNING', f'Job {job_name} is not running, please check sky spot status' 25 | assert job_name is not None, "No job name provided" 26 | assert job_region is not None, "No job region provided" 27 | print(f"Terminating latest spot job {job_name}...") 28 | run_aws_terminate(job_name, job_region) 29 | print(f"\n====== Successfully terminated spot VM. 
Hasta la vista, {job_name} ======") 30 | 31 | 32 | def run_aws_terminate(job_name, job_region): 33 | # Get instance id 34 | print("Getting instance id...") 35 | instance_id_cmd = f'aws ec2 describe-instances --region {job_region} --filters Name=tag:ray-cluster-name,Values={job_name}* --query Reservations[].Instances[].InstanceId --output text' 36 | result = subprocess.run(instance_id_cmd, shell=True, stdout=subprocess.PIPE) 37 | if result.returncode != 0: 38 | raise RuntimeError( 39 | f'Unable to get instance_id for job {job_name}. Command: {instance_id_cmd}. Retcode: {result.returncode}. Stdout: {result.stdout}. Stderr: {result.stderr}') 40 | instance_id = result.stdout.decode('utf-8') 41 | print(f"Terminating instance_id {instance_id}") 42 | 43 | # Terminate instances 44 | terminate_cmd = f'aws ec2 terminate-instances --region {job_region} --instance-ids {instance_id}' 45 | print(f"Running command: {terminate_cmd}") 46 | result = subprocess.run(terminate_cmd, stdout=subprocess.PIPE, 47 | stderr=subprocess.PIPE, universal_newlines=True, 48 | shell=True) 49 | if result.returncode != 0: 50 | raise RuntimeError( 51 | f'Unable to terminate job {job_name}. Command: {terminate_cmd}. Retcode: {result.returncode}. Stdout: {result.stdout}. Stderr: {result.stderr}') 52 | -------------------------------------------------------------------------------- /00_installation/00_installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "78d09a6e", 6 | "metadata": {}, 7 | "source": [ 8 | "

\n", 9 | " \n", 10 | "

" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "2db67081", 16 | "metadata": {}, 17 | "source": [ 18 | "# Installation" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "2f8fb045", 24 | "metadata": {}, 25 | "source": [ 26 | "This notebook will guide you through installing SkyPilot on your machine." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "fcd0eafb", 32 | "metadata": {}, 33 | "source": [ 34 | "## Installing SkyPilot" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "b0846a1d", 40 | "metadata": {}, 41 | "source": [ 42 | "By default, `pip install skyPilot` installs SkyPilot with AWS support. To install support for GCP and Azure, please edit the install command before running the following cell:" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "7d878461", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# Edit this command based on the clouds you have access to! This might take some time depending on chosen dependencies.\n", 53 | "!pip install \"skypilot[aws,azure,gcp]\" -q" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "5e5e5de9", 59 | "metadata": {}, 60 | "source": [ 61 | "Let's verify skypilot is installed:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "6e8e3c34", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import sky\n", 72 | "print(sky.__version__)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "e70b24f9", 78 | "metadata": {}, 79 | "source": [ 80 | "## Checking credential setup" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "abc55d1f", 86 | "metadata": {}, 87 | "source": [ 88 | "Once SkyPilot is installed, it checks for access to clouds by using credentials used by cloud CLI tools.\n", 89 | "\n", 90 | "Let's run `sky check` to make sure your credentials are correctly setup.\n", 91 | "\n", 92 | "After running the below cell, you 
should have one or more clouds marked as `enabled`." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "0da65bb5", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "! sky check" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "e679abce", 108 | "metadata": {}, 109 | "source": [ 110 | "This will produce a summary like:\n", 111 | "\n", 112 | "---------------------\n", 113 | "```\n", 114 | "Checking credentials to enable clouds for SkyPilot.\n", 115 | " AWS: enabled\n", 116 | " GCP: enabled\n", 117 | " Azure: enabled\n", 118 | "\n", 119 | "SkyPilot will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check.\n", 120 | "```\n", 121 | "---------------------" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "67c8fedb", 127 | "metadata": {}, 128 | "source": [ 129 | "* **If your desired clouds are marked as enabled ✅** - Congratulations! You have successfully installed SkyPilot! Please proceed to the next notebook.\n", 130 | "* **If your desired clouds are not marked as enabled ❌** - No worries, let's set them up now!" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "4e865bd0", 136 | "metadata": {}, 137 | "source": [ 138 | "## Setting up cloud accounts" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "2387fd4f", 144 | "metadata": {}, 145 | "source": [ 146 | "If `sky check` failed above, we will now setup your cloud credentials to work SkyPilot. \n", 147 | "\n", 148 | "For the following parts of the tutorial, you may need to open a terminal to run some commands. 
These points will be highlighted with a **💻** icon.\n", 149 | "\n", 150 | "> **💡 Hint** - If you're using jupyter lab, you can create a terminal in your browser by going to `File -> New -> Terminal`\n", 151 | "\n", 152 | "\n", 153 | "**Note - After running the below instructions, make sure to run `sky check` again to verify they are properly configured!**" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "1c54d6aa", 159 | "metadata": {}, 160 | "source": [ 161 | "### AWS\n", 162 | "\n", 163 | "💻 Open a terminal and run:\n", 164 | "\n", 165 | "----------------------------\n", 166 | "```console\n", 167 | "pip install boto3\n", 168 | "aws configure\n", 169 | "```\n", 170 | "----------------------------\n", 171 | "\n", 172 | "\n", 173 | "To get the **AWS access key** required by `aws configure`, please go to the [AWS IAM Management Console](https://us-east-1.console.aws.amazon.com/iam/home?region=us-east-1#/security_credentials) and click on the “Access keys” dropdown (detailed instructions [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html#Using_CreateAccessKey)). The Default region name [None]: and Default output format [None]: fields are optional and can be left blank to choose defaults." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "6a6b91c9", 179 | "metadata": {}, 180 | "source": [ 181 | "### Google Cloud Platform\n", 182 | "💻 In an interactive terminal, run:\n", 183 | "\n", 184 | "----------------------------\n", 185 | "```console\n", 186 | "pip install google-api-python-client\n", 187 | "conda install -c conda-forge google-cloud-sdk\n", 188 | "\n", 189 | "gcloud init\n", 190 | "\n", 191 | "# Run this if you don't have a credentials file.\n", 192 | "# This will generate ~/.config/gcloud/application_default_credentials.json.\n", 193 | "gcloud auth application-default login\n", 194 | "```\n", 195 | "----------------------------\n", 196 | "\n", 197 | "If running `conda install -c conda-forge google-cloud-sdk` produces the error _“RemoveError: ‘requests’ is a dependency of conda and cannot be removed from conda’s operating environment”_, try `conda update --force conda` first and rerun the command.\n", 198 | "\n", 199 | "Note: if you encounter _Authorization Error (Error 400: invalid_request)_ with the url generated by `gcloud auth login`, try installing the latest version of the [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) (e.g., with `conda install -c conda-forge google-cloud-sdk`) and rerun the command." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "0052e1db", 205 | "metadata": {}, 206 | "source": [ 207 | "### Microsoft Azure\n", 208 | "\n", 209 | "💻 In an interactive terminal, run:\n", 210 | "\n", 211 | "----------------------------\n", 212 | "```console\n", 213 | "# Login\n", 214 | "az login\n", 215 | "# Set the subscription to use\n", 216 | "az account set -s \n", 217 | "```\n", 218 | "----------------------------\n", 219 | "\n", 220 | "You can run `az account subscription list` to get a list of subscription IDs under your account.\n" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.7.4" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 5 245 | } -------------------------------------------------------------------------------- /02_using_accelerators/02_using_accelerators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

\n", 8 | " \n", 9 | "

" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Using accelerators and object stores to train ML Models 💨\n", 17 | "\n", 18 | "Tasks in SkyPilot can request special resources for their execution. For instance, an ML training task can request Nvidia GPUs or Google TPUs for accelerated training, or a larger disk size. SkyPilot handles provisioning and allocation of these specialized resources to tasks.\n", 19 | "\n", 20 | "Additionally, SkyPilot also allows tasks to access cloud object stores. It provides an easy to use interface for object stores which mounts the contents as files at a local path. Your datasets and dependencies stored in object stores can be directly accessed by SkyPilot tasks as if they were local files." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Learning outcomes 🎯\n", 28 | "\n", 29 | "After completing this notebook, you will be able to:\n", 30 | "\n", 31 | "1. List the GPUs and Accelerators supported by SkyPilot. \n", 32 | "2. Specify different resource types (GPUs, TPUs) for your tasks.\n", 33 | "3. Access data on object stores directly from your tasks." 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# [DIY] Listing supported accelerators with `sky show-gpus`\n", 41 | "\n", 42 | "To see the list of accelerators supported by SkyPilot , you can use the `sky show-gpus` command. \n", 43 | "\n", 44 | "**Run `sky show-gpus` by running the cell below:**" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "! 
sky show-gpus" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Expected output\n", 61 | "-------------------------\n", 62 | "```console\n", 63 | "$ sky show-gpus\n", 64 | "NVIDIA_GPU AVAILABLE_QUANTITIES \n", 65 | "V100 1, 2, 4, 8 \n", 66 | "V100-32GB 8 \n", 67 | "A100 1, 2, 4, 8, 16 \n", 68 | "A100-80GB 1, 2, 4, 8 \n", 69 | "P100 1, 2, 4 \n", 70 | "K80 1, 2, 4, 8, 16 \n", 71 | "T4 1, 2, 4, 8 \n", 72 | "M60 1, 2, 4 \n", 73 | "\n", 74 | "GOOGLE_TPU AVAILABLE_QUANTITIES \n", 75 | "tpu-v2-8 1 \n", 76 | "tpu-v2-32 1 \n", 77 | "tpu-v2-128 1 \n", 78 | "tpu-v2-256 1 \n", 79 | "tpu-v2-512 1 \n", 80 | "tpu-v3-8 1 \n", 81 | "tpu-v3-32 1 \n", 82 | "tpu-v3-64 1 \n", 83 | "tpu-v3-128 1 \n", 84 | "tpu-v3-256 1 \n", 85 | "tpu-v3-512 1 \n", 86 | "tpu-v3-1024 1 \n", 87 | "tpu-v3-2048 1 \n", 88 | "```\n", 89 | "-------------------------" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "> **💡 Hint -** For a more extensive list of the GPUs supported by each cloud and their pricing information, run `sky show-gpus -a` in an interactive terminal." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# Specifying resource requirements of tasks\n", 104 | "\n", 105 | "Special resource requirements are specified through the `resources` field in the SkyPilot task YAML. For example, to request 1 K80 GPU for your task, simply add it to the YAML like so:\n", 106 | "\n", 107 | "```yaml\n", 108 | "resources:\n", 109 | " accelerators: K80:1\n", 110 | "\n", 111 | "setup: ....\n", 112 | "\n", 113 | "run: .....\n", 114 | "```\n", 115 | "\n", 116 | "> **💡 Hint -** In addition to `accelerators`, you can specify many more requirements, such as `disk_size`, a specific `cloud`, `region` or `zone`, `instance_type` and more! You can find more details in the [YAML configuration docs](https://skypilot.readthedocs.io/en/latest/reference/yaml-spec.html)." 
117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## [DIY] 📝 Edit `bert.yaml` to use a K80 GPU! \n", 124 | "\n", 125 | "We have provided an example YAML (`bert.yaml`) which fine-tunes a BERT model on the SQuAD dataset. However, it does not specify any GPU resources for training.\n", 126 | "\n", 127 | "**Edit `bert.yaml` to add the resources field to it!**\n", 128 | "\n", 129 | "Your final YAML should have a `resources` field like this:\n", 130 | "\n", 131 | "---------------------\n", 132 | "```yaml\n", 133 | "...\n", 134 | "resources:\n", 135 | " accelerators: K80:1\n", 136 | "...\n", 137 | "```\n", 138 | "---------------------" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Accessing data from object stores " 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "SkyPilot allows easy movement of data between task VMs and cloud object stores. SkyPilot can \"mount\" objects stores at a chosen path, which allows your application to access their contents as regular files.\n", 153 | "\n", 154 | "These mount paths can be specified using the `file_mounts` field. For example, you may have noticed this in `bert.yaml`:\n", 155 | "\n", 156 | "-------------------\n", 157 | "```yaml\n", 158 | "file_mounts:\n", 159 | " /dataset/:\n", 160 | " source: s3://sky-bert-dataset/\n", 161 | "```\n", 162 | "-------------------\n", 163 | "\n", 164 | "This statement directs SkyPilot to mount the contents of `s3://sky-bert-dataset/` at `/dataset/`. When the task accesses contents of `/dataset/`, they are streamed from the `sky-bert-dataset` s3 bucket. 
As a result, **the application is able to use files and datasets stored in cloud object stores without any changes to its code**, simply reading the dataset as if it were a local file at /dataset/.\n", 165 | "\n", 166 | "> **💡 Hint** - In addition to object stores, SkyPilot can also copy files from your local machine to the remote VM! Refer to [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/examples/syncing-code-artifacts.html) for more information." 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## [DIY] 💻 Launch your BERT training task!\n", 174 | "\n", 175 | "**After you have edited `bert.yaml` to use K80 GPUs, open a terminal and use `sky launch` to create a GPU cluster:**\n", 176 | "\n", 177 | "-------------------------\n", 178 | "```console\n", 179 | "sky launch 02_using_accelerators/bert.yaml\n", 180 | "```\n", 181 | "-------------------------\n", 182 | "\n", 183 | "This will take about two minutes.\n", 184 | "\n", 185 | "### Expected output\n", 186 | "\n", 187 | "After the usual SkyPilot output, you should see your task run:\n", 188 | "\n", 189 | "-------------------------\n", 190 | "```console\n", 191 | "$ sky launch bert.yaml \n", 192 | "Task from YAML spec: bert.yaml\n", 193 | "...\n", 194 | "(bert_qa pid=81384) Running tokenizer on validation dataset: 91%|█████████ | 10/11 [00:05<00:00, 1.68ba/s]\n", 195 | "(bert_qa pid=81384) [INFO|trainer.py:1290] 2022-10-16 17:48:10,010 >> ***** Running training *****\n", 196 | "(bert_qa pid=81384) [INFO|trainer.py:1291] 2022-10-16 17:48:10,011 >> Num examples = 88524\n", 197 | "(bert_qa pid=81384) [INFO|trainer.py:1292] 2022-10-16 17:48:10,011 >> Num Epochs = 50\n", 198 | "(bert_qa pid=81384) [INFO|trainer.py:1293] 2022-10-16 17:48:10,011 >> Instantaneous batch size per device = 12\n", 199 | "(bert_qa pid=81384) [INFO|trainer.py:1294] 2022-10-16 17:48:10,011 >> Total train batch size (w. 
parallel, distributed & accumulation) = 12\n", 200 | "(bert_qa pid=81384) [INFO|trainer.py:1295] 2022-10-16 17:48:10,011 >> Gradient Accumulation steps = 1\n", 201 | "(bert_qa pid=81384) [INFO|trainer.py:1296] 2022-10-16 17:48:10,011 >> Total optimization steps = 368850\n", 202 | "```\n", 203 | "-------------------------\n", 204 | "\n", 205 | "**After you see the task training output, hit `ctrl+c` to exit.**\n", 206 | "\n", 207 | "> **💡 Hint** - For long running tasks, you can safely Ctrl+C to exit once the task has started. It will continue running in the background. For more on how to access logs after detaching, queue more tasks and cancel tasks, please refer to [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/reference/job-queue.html)." 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## [DIY] 💻 Remember to terminate your cluster once you're done!\n", 215 | "\n", 216 | "**Run `sky status` to get the cluster name and then use `sky down` to terminate it.**\n", 217 | "\n", 218 | "-------------------------\n", 219 | "```console\n", 220 | "$ sky status\n", 221 | "...\n", 222 | "$ sky down \n", 223 | "```\n", 224 | "-------------------------" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "# Transparently training BERT on a different cloud\n", 232 | "Moving this complex BERT training job to a different cloud is easy with SkyPilot. 
\n", 233 | "\n", 234 | "**Even though this task requires access to accelerators and object stores, SkyPilot can seamlessly run this job on a different cloud with just one line change - adding the `--cloud` flag to `sky launch`.**\n", 235 | "\n", 236 | "Just like in the previous notebook, you can simply use the same YAML:\n", 237 | "\n", 238 | "-----------------\n", 239 | "```\n", 240 | "sky launch 02_using_accelerators/bert.yaml --cloud gcp\n", 241 | "```\n", 242 | "-----------------\n", 243 | "\n", 244 | "(In the interest of time, we don't run this command in this notebook but feel free to try it later!)\n", 245 | "\n", 246 | "SkyPilot will find instance types on GCP that support the required GPU, and it will also mount the object store when the task runs." 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "#### 🎉 Congratulations! You have learnt how to use accelerators and cloud object stores in SkyPilot! Please proceed to the next notebook.\n" 254 | ] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.7.4" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 4 278 | } 279 | -------------------------------------------------------------------------------- /03_spot_instances/03_spot_instances.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "jupyter": { 7 | "outputs_hidden": true 8 | } 9 | }, 10 | "source": [ 11 | "

\n", 12 | " \n", 13 | "

" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Saving costs with managed spot jobs 💸\n", 21 | "As an intercloud broker, SkyPilot also supports spot instances, low-priced VMs that can be preempted at any time by the cloud provider.\n", 22 | "\n", 23 | "More importantly, SkyPilot offers a fully managed experience for running jobs on spot instances **that can automatically recover from preemptions**. This feature **saves significant cost (e.g., up to 70% for GPU VMs)** by making preemptible spot instances practical for long-running jobs.\n", 24 | "\n", 25 | "To maximize availability, SkyPilot automatically finds available spot resources across regions and clouds. Here is an example of BERT training job running in different regions across AWS and GCP, switching over to a different region whenever preempted.\n", 26 | "\n", 27 | "

\n", 28 | " \n", 29 | "

" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Learning outcomes 🎯\n", 37 | "\n", 38 | "In this notebook, you will:\n", 39 | "\n", 40 | "1. Run a managed spot job in SkyPilot\n", 41 | "2. Forcefully preempt a running job and observe SkyPilot's recovery mechanism" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Running managed spot jobs with `sky spot launch`\n", 49 | "Any SkyPilot task can be launched on spot instances by simply using `sky spot launch task.yaml` instead of `sky launch task.yaml`. The `sky spot` CLI offers the following commands:\n", 50 | "\n", 51 | "1. **`sky spot launch `** - Launches a managed spot job.\n", 52 | "2. **`sky spot status`** - Shows the status of managed spot jobs.\n", 53 | "3. **`sky spot logs `** - Fetches the logs of a spot job.\n", 54 | "4. **`sky spot cancel `** - Cancels a spot job.\n", 55 | "\n", 56 | "To manage the lifecycle of spot jobs, SkyPilot uses a controller that handles job launching and failure recovery. On running `sky spot launch`, SkyPilot first launches a controller (if it does not exist) and then runs the job. The controller is shared across all spot jobs launched by you." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## [DIY] 💻 Train BERT on spot instances with `sky spot launch`!\n", 64 | "\n", 65 | "**Training BERT on spot instances with SkyPilot requires no changes to the YAML!**\n", 66 | "\n", 67 | "**Simply replace `sky launch` with `sky spot launch` to run the task on spot instances.**\n", 68 | "\n", 69 | "------------------\n", 70 | "```console\n", 71 | "$ sky spot launch 03_spot_instances/bert.yaml\n", 72 | "```\n", 73 | "------------------\n", 74 | "\n", 75 | "This command will take a few minutes.\n", 76 | "\n", 77 | "SkyPilot will launch and start monitoring the spot job. 
When a preemption happens, SkyPilot will automatically search for resources across regions and clouds to re-launch the job.\n", 78 | "\n", 79 | "```\n", 80 | "Task from YAML spec: bert.yaml\n", 81 | "Launching a new spot task 'sky-5ce7-romilb'. Proceed? [Y/n]: Y\n", 82 | "...\n", 83 | "I 10-16 21:29:06 cloud_vm_ray_backend.py:2067] Job submitted with Job ID: 1\n", 84 | "I 10-17 04:29:06 spot_utils.py:205] Waiting for the spot controller process to be RUNNING (status: PENDING).\n", 85 | "I 10-17 04:29:11 spot_utils.py:233] INFO: The log is not ready yet, as the spot job is STARTING. Waiting for 20 seconds.\n", 86 | "...\n", 87 | "I 10-17 04:34:33 log_lib.py:385] Start streaming logs for spot job 1.\n", 88 | "...\n", 89 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1290] 2022-10-17 04:35:52,604 >> ***** Running training *****\n", 90 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1291] 2022-10-17 04:35:52,604 >> Num examples = 88524\n", 91 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1292] 2022-10-17 04:35:52,604 >> Num Epochs = 50\n", 92 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1293] 2022-10-17 04:35:52,604 >> Instantaneous batch size per device = 12\n", 93 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1294] 2022-10-17 04:35:52,604 >> Total train batch size (w. 
parallel, distributed & accumulation) = 12\n", 94 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1295] 2022-10-17 04:35:52,604 >> Gradient Accumulation steps = 1\n", 95 | "(sky-5ce7-romilb pid=23855) [INFO|trainer.py:1296] 2022-10-17 04:35:52,604 >> Total optimization steps = 368850\n", 96 | "\n", 97 | "```\n", 98 | "\n", 99 | "## [DIY] 💻 Check the status of your spot job with `sky spot status` \n", 100 | "\n", 101 | "**Go ahead and run `sky spot status` to fetch the status of your job.**\n", 102 | "\n", 103 | "------------------\n", 104 | "```\n", 105 | "$ sky spot status\n", 106 | "Fetching managed spot job status...\n", 107 | "Managed spot jobs:\n", 108 | "In progress jobs: 1 RUNNING\n", 109 | "\n", 110 | "ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS\n", 111 | "1 sky-5ce7-romilb 1x [T4:1] 13 mins ago 13m 3s 7m 48s 0 RUNNING\n", 112 | "```\n", 113 | "------------------" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# SkyPilot spot job recovery in action ⛑" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Let's observe how SkyPilot can automatically recover from spot instance preemptions.\n", 128 | "\n", 129 | "In this section, we will:\n", 130 | "\n", 131 | "1. **Forcefully terminate the spot instance using the AWS CLI**. We have provided a helper function `terminator.terminate()` to do this.\n", 132 | "2. Observe that the controller detects the spot job failure.\n", 133 | "3. Run `sky spot status` to see the **status change from `RUNNING` to `RECOVERING`**.\n", 134 | "4. Wait for the job to recover.\n", 135 | "5. Run `sky spot status` to see the **status change back to `RUNNING`** and `#RECOVERIES` increment by 1.\n", 136 | "\n", 137 | "All of these steps are coded in the following cell. 
\n", 138 | "\n", 139 | "**Run the cell below and observe the outputs.**" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "import terminator\n", 149 | "import time\n", 150 | "import subprocess\n", 151 | "from terminator import sleep_timer\n", 152 | "\n", 153 | "# Kill the spot instance\n", 154 | "terminator.terminate()\n", 155 | "\n", 156 | "# Wait for the spot job status to be updated in the controller\n", 157 | "print(\"\\nWaiting for 45 seconds to let the controller detect spot failure before running sky spot status\")\n", 158 | "sleep_timer(45)\n", 159 | "\n", 160 | "# Run sky spot status.\n", 161 | "print(\"\\n\\nRunning sky spot status. Note that the job status will have changed to RECOVERING.\")\n", 162 | "print(subprocess.check_output('sky spot status', shell=True, encoding='utf-8'))\n", 163 | "\n", 164 | "\n", 165 | "# Wait for the spot job status to be updated in the controller\n", 166 | "print(\"Waiting for 300 seconds to let the spot instance recover before running sky spot status again.\")\n", 167 | "sleep_timer(300)\n", 168 | "print(\"\\n\\nRunning sky spot status. Note that the job status will have changed to RUNNING.\")\n", 169 | "print(subprocess.check_output('sky spot status', shell=True, encoding='utf-8'))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### Expected cell output:\n", 177 | "-------------------------\n", 178 | "```\n", 179 | "Finding spot job to terminate...\n", 180 | "Terminating latest spot job sky-5ce7-romilb...\n", 181 | "Getting instance id...\n", 182 | "Terminating instance_id i-0b7b1bc0c0d3a03c9\n", 183 | "Running command: aws ec2 terminate-instances --region us-west-2 --instance-ids i-08cb990c15dcf86a3\ti-0b252881a10ec7c88\ti-0d91caeda85b05e40\ti-0185b54f5cb2efad7\ti-0b7b1bc0c0d3a03c9\n", 184 | "\n", 185 | "====== Successfully terminated spot VM. 
Hasta la vista, sky-5ce7-romilb ======\n", 186 | "\n", 187 | "Waiting for 45 seconds to let the controller detect spot failure before running sky spot status\n", 188 | " 1 seconds remaining.\n", 189 | "\n", 190 | "Running sky spot status. Note that the job status will have changed to RECOVERING.\n", 191 | "Fetching managed spot job statuses...\n", 192 | "Managed spot jobs:\n", 193 | "In progress jobs: 1 RECOVERING\n", 194 | "\n", 195 | "ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS \n", 196 | "1 sky-5ce7-romilb 1x [T4:1] 1 hr ago 1h 31m 45s 1h 7m 30s 0 RECOVERING \n", 197 | "\n", 198 | "Waiting for 300 seconds to let the spot instance recover before running sky spot status again.\n", 199 | " 1 seconds remaining.\n", 200 | " \n", 201 | "Running sky spot status. Note that the job status will have changed to RUNNING.\n", 202 | "Fetching managed spot job statuses...\n", 203 | "Managed spot jobs:\n", 204 | "In progress jobs: 1 RUNNING\n", 205 | "\n", 206 | "ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS \n", 207 | "1 sky-5ce7-romilb 1x [T4:1] 1 hr ago 1h 36m 58s 1h 7m 51s 1 RUNNING\n", 208 | "```\n", 209 | "-------------------------" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## 💻 Clean up with `sky spot cancel`\n", 217 | "We're at the end of this tutorial! \n", 218 | "\n", 219 | "**Run the following commands to stop all your jobs and kill any VMs.**\n", 220 | "\n", 221 | "---------------\n", 222 | "```\n", 223 | "# Cancel spot jobs\n", 224 | "$ sky spot cancel -ay\n", 225 | "\n", 226 | "# Stop any running VMs\n", 227 | "$ sky down -ay\n", 228 | "```\n", 229 | "---------------" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "### 🎉 Congratulations! 
You have completed the SkyPilot tutorial!\n", 237 | "\n", 238 | "### We want your feedback!\n", 239 | "**Please take a few minutes to fill out this short survey: [https://forms.gle/pjm7yPCxK7219vwm8](https://forms.gle/pjm7yPCxK7219vwm8).** We would love to hear what you thought about SkyPilot and this tutorial!\n", 240 | "\n", 241 | "\n", 242 | "### Liked SkyPilot?\n", 243 | "* **Give us a star on [github](https://github.com/skypilot-org/skypilot)!**\n", 244 | "* **Join us on the [SkyPilot slack](https://join.slack.com/t/skypilot-org/shared_invite/zt-1i4pa7lyc-g6Lo4_rqqCFWOSXdvwTs3Q)!** [![SkyPilotSlack](https://img.shields.io/badge/SkyPilot-Join%20Slack-blue?logo=slack)](https://join.slack.com/t/skypilot-org/shared_invite/zt-1i4pa7lyc-g6Lo4_rqqCFWOSXdvwTs3Q)\n", 245 | "* **Check out the [docs](https://skypilot.readthedocs.io/) to learn about more exciting SkyPilot features, such as automatic benchmarking, automatic instance stopping, TPUs, on-premise support and much more!**" 246 | ] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.7.4" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 4 270 | } 271 | -------------------------------------------------------------------------------- /01_hello_sky/01_hello_sky.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

\n", 8 | " \n", 9 | "

" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "pycharm": { 16 | "name": "#%% md\n" 17 | } 18 | }, 19 | "source": [ 20 | "# Welcome to SkyPilot! 👋" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "pycharm": { 27 | "name": "#%% md\n" 28 | } 29 | }, 30 | "source": [ 31 | "SkyPilot is a framework for easily running machine learning workloads on any cloud. \n", 32 | "\n", 33 | "Use the clouds **easily** and **cost effectively**, without needing cloud infra expertise.\n", 34 | "\n", 35 | "_Ease of use_\n", 36 | "* **Run existing projects on the cloud** with zero code changes\n", 37 | "* Use a **unified interface** to run on any cloud, without vendor lock-in (currently AWS, Azure, GCP)\n", 38 | "* **Queue jobs** on one or multiple clusters\n", 39 | "* **Automatic failover** to find scarce resources (GPUs) across regions and clouds\n", 40 | "* **Use datasets on the cloud** like you would on a local file system \n", 41 | "\n", 42 | "_Cost saving_\n", 43 | "* Run jobs on **spot instances** with **automatic recovery** from preemptions\n", 44 | "* Hands-free cluster management: **automatically stopping idle clusters**\n", 45 | "* One-click use of **TPUs**, for high-performance, cost-effective training\n", 46 | "* Automatically benchmark and find the cheapest hardware for your job" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Learning outcomes 🎯\n", 54 | "\n", 55 | "After completing this notebook, you will be able to:\n", 56 | "\n", 57 | "1. Understand the basic SkyPilot YAML interface (`setup`, `run`).\n", 58 | "2. Run a hello world task on a cloud of your choice.\n", 59 | "3. SSH into your cluster for debugging and development.\n", 60 | "4. Terminate the cluster and understand the cluster lifecycle.\n", 61 | "5. Run your task seamlessly across different clouds." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# How to use this Tutorial" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "These notebooks serve as an **interactive** introduction to SkyPilot.\n", 76 | "\n", 77 | "There are points in these notebooks where you may need to edit files outside the notebook and open a terminal to run some commands. These points will be highlighted with **two icons**:\n", 78 | "\n", 79 | "### [DIY] 📝 - Edit an external file\n", 80 | "### [DIY] 💻 - Run commands in an interactive terminal window\n", 81 | "\n", 82 | "Use these icons as a hint to know when to switch away from the current notebook and edit a file or open a terminal.\n", 83 | "\n", 84 | "> **💡 Hint** - If you're using jupyter lab, you can create a terminal in your browser by going to `File -> New -> Terminal`" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "# Preflight checks - verifying cloud credential setup" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Before we start this tutorial, let's run `sky check` to make sure your credentials are correctly setup.\n", 99 | "\n", 100 | "After running the below cell, you should have AWS and GCP clouds marked as `enabled`. \n", 101 | "\n", 102 | "> **💡 Hint** - If you don't see any clouds enabled, please refer to the [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup) on how to setup your cloud accounts.\n", 103 | "\n", 104 | "> **💡 Hint** - SkyPilot also supports Azure! Though it is not used in this tutorial, please check out our [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup) on how to setup Azure support." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Run this cell to check if your cloud accounts are setup to work with SkyPilot\n", 114 | "! sky check" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "# Writing your first SkyPilot Task" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "A **task** in SkyPilot specifies the command that must be run on the cloud, along with the resources required (e.g. GPUs, TPUs, number of nodes) and any dependencies (e.g., files, packages and libraries).\n", 129 | "\n", 130 | "Tasks in SkyPilot are defined as YAML files. Here is an example:\n", 131 | "\n", 132 | "-----------------------------------\n", 133 | "```yaml\n", 134 | "# example.yaml\n", 135 | "name: example\n", 136 | "\n", 137 | "setup: |\n", 138 | " echo \"Run any setup commands here\"\n", 139 | " pip install cowsay\n", 140 | "\n", 141 | "run: |\n", 142 | " echo \"Hello Stranger!\"\n", 143 | " cowsay \"Moo!\"\n", 144 | "```\n", 145 | "----------------------------------- \n", 146 | "\n", 147 | "This defines a task with the following components:\n", 148 | "\n", 149 | "* **setup**: commands that must be run before the task is executed. Here we install any dependencies for the task.\n", 150 | "\n", 151 | "* **run**: commands that run the actual task." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## [DIY] 📝 Edit `example.yaml` to echo \"Hello SkyPilot\" \n", 159 | "**Go ahead and open example.yaml and edit the run field to echo \"Hello SkyPilot\".**" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "# Launching your first SkyPilot Task with `sky launch`" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Once your task YAML is ready, you can run it on the cloud with `sky launch`." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## [DIY] 💻 Launch your Sky Task!\n", 181 | "\n", 182 | "**In a terminal window, run:**\n", 183 | "\n", 184 | "-------------------------\n", 185 | "```console\n", 186 | "sky launch 01_hello_sky/example.yaml\n", 187 | "```\n", 188 | "-------------------------\n", 189 | "\n", 190 | "This will take about a minute to run.\n", 191 | "\n", 192 | "> **💡 Hint** - If you're using jupyter lab, you can create a terminal in your browser by going to `File -> New -> Terminal`\n", 193 | "\n", 194 | "You'll notice that SkyPilot will perform multiple actions for you:\n", 195 | "#### **1. 
Find the lowest priced VM instance type across different clouds**\n", 196 | "\n", 197 | "SkyPilot will run its optimizer and present you with the cheapest VM type that fits your resource demand.\n", 198 | "\n", 199 | "```console\n", 200 | "$ sky launch example.yaml\n", 201 | "(base) romilb@romilbx1yoga:skypilot-tutorial/01_hello_sky$ sky launch example.yaml \n", 202 | "Task from YAML spec: example.yaml\n", 203 | "I 09-07 16:24:59 optimizer.py:605] == Optimizer ==\n", 204 | "I 09-07 16:24:59 optimizer.py:617] Target: minimizing cost\n", 205 | "I 09-07 16:24:59 optimizer.py:628] Estimated cost: $0.4 / hour\n", 206 | "I 09-07 16:24:59 optimizer.py:628] \n", 207 | "I 09-07 16:24:59 optimizer.py:685] Considered resources (1 node):\n", 208 | "I 09-07 16:24:59 optimizer.py:713] ---------------------------------------------------------------------\n", 209 | "I 09-07 16:24:59 optimizer.py:713] CLOUD INSTANCE vCPUs ACCELERATORS COST ($) CHOSEN \n", 210 | "I 09-07 16:24:59 optimizer.py:713] ---------------------------------------------------------------------\n", 211 | "I 09-07 16:24:59 optimizer.py:713] AWS m6i.2xlarge 8 - 0.38 ✔ \n", 212 | "I 09-07 16:24:59 optimizer.py:713] Azure Standard_D8_v4 8 - 0.38 \n", 213 | "I 09-07 16:24:59 optimizer.py:713] GCP n1-highmem-8 8 - 0.47 \n", 214 | "I 09-07 16:24:59 optimizer.py:713] ---------------------------------------------------------------------\n", 215 | "I 09-07 16:24:59 optimizer.py:713] \n", 216 | "Launching a new cluster 'sky-82ce-romilb'. Proceed? [Y/n]: Y\n", 217 | "```\n", 218 | "\n", 219 | "#### **2. Provision the cluster**\n", 220 | "\n", 221 | "SkyPilot will setup a cluster with the requested resources and setup a SSH profile for it.\n", 222 | "\n", 223 | "\n", 224 | "#### **3. Run the task's `setup` commands to prepare the cluster for running the task**\n", 225 | "\n", 226 | "SkyPilot will run any commands specified in the `setup` field in the YAML on the VMs in the cluster. 
In this case, it will install the `cowsay` package.\n", 227 | "\n", 228 | "\n", 229 | "#### **4. Run the task's `run` commands**\n", 230 | "\n", 231 | "Finally, SkyPilot will run the commands specified in the `run` field. These commands can use any dependencies installed in the `setup` phase.\n", 232 | "\n", 233 | "> ```console\n", 234 | "(example pid=23346) Hello SkyPilot!\n", 235 | "(example pid=23346) ______\n", 236 | "(example pid=23346) < Moo! >\n", 237 | "(example pid=23346) ------\n", 238 | "(example pid=23346) \\ ^__^\n", 239 | "(example pid=23346) \\ (oo)\\_______\n", 240 | "(example pid=23346) (__)\\ )\\/\\\n", 241 | "(example pid=23346) ||----w |\n", 242 | "(example pid=23346) || ||\n", 243 | "INFO: Job finished (status: SUCCEEDED).\n", 244 | "```" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "# Tasks and Clusters in SkyPilot" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "**Tasks** in SkyPilot are executed on **clusters**. A **cluster** is a collection of nodes on a cloud.\n", 259 | "\n", 260 | "When you run a task with `sky launch`, SkyPilot creates a new cluster with a random name if an existing cluster is not specified.\n", 261 | "\n", 262 | "> **💡 Hint** - When running `sky launch`, you can give the cluster a name with the `-c` flag. E.g. `sky launch -c mycluster example.yaml` would launch a cluster with the name `mycluster`. If the cluster name already exists, then SkyPilot will try to reuse the cluster by re-running the `setup` commands on the cluster.\n", 263 | "\n", 264 | "You can see a table of your clusters with the command `sky status`." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## [DIY] 💻 Checking your cluster status with `sky status`\n", 272 | "\n", 273 | "**In a terminal window, run:**\n", 274 | "\n", 275 | "\n", 276 | "-------------------------\n", 277 | "```console\n", 278 | "sky status\n", 279 | "```\n", 280 | "-------------------------" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "### Expected output\n", 288 | "-------------------------\n", 289 | "```console\n", 290 | "(base) romilb@romilbx1yoga:skypilot-tutorial/01_hello_sky$ sky status\n", 291 | "\n", 292 | "NAME LAUNCHED RESOURCES STATUS AUTOSTOP COMMAND \n", 293 | "sky-82ce-romilb 19 mins ago 1x AWS(m6i.2xlarge) UP - sky launch example.yaml \n", 294 | "```\n", 295 | "-------------------------" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "We can see that the `sky launch` in the previous cells created a cluster with the name `sky-82ce-romilb`." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## [DIY] 💻 SSH into the cluster!" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "For debugging and development, you can easily SSH into a SkyPilot cluster with the `ssh` utility. 
\n", 317 | "\n", 318 | "**In a terminal window, run:**\n", 319 | "\n", 320 | "-------------------------\n", 321 | "```console\n", 322 | "ssh \n", 323 | "```\n", 324 | "-------------------------" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Expected output\n", 332 | "\n", 333 | "This will drop you into an interactive terminal inside your cluster:\n", 334 | "\n", 335 | "-------------------------\n", 336 | "```console\n", 337 | "(base) romilb@romilbx1yoga:skypilot-tutorial/01_hello_sky$ ssh sky-82ce-romilb \n", 338 | "Warning: Permanently added '18.234.228.139' (ECDSA) to the list of known hosts.\n", 339 | "=============================================================================\n", 340 | " __| __|_ )\n", 341 | " _| ( / Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04)\n", 342 | " ___|\\___|___|\n", 343 | "=============================================================================\n", 344 | "\n", 345 | "Welcome to Ubuntu 20.04.4 LTS (GNU/Linux 5.13.0-1014-aws x86_64v)\n", 346 | "\n", 347 | "Last login: Wed Sep 7 23:27:50 2022 from 24.23.130.196\n", 348 | "ubuntu@ip-172-31-33-58:~$ echo $HOSTNAME\n", 349 | "ip-172-31-33-58\n", 350 | "```\n", 351 | "-------------------------\n", 352 | "\n", 353 | "You can use `ctrl+d` to exit from the SSH session.\n", 354 | "\n", 355 | "> **💡 Hint** - To enable the SSH functionality, SkyPilot adds the remote cluster to your `~/.ssh/config`. This means you can use the cluster name alias with other ssh tools, such as `scp`, `rsync`, VSCode and more!" 
356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "# Cluster lifecycle management" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "SkyPilot clusters can exist in four states, each of which has different billing and storage implications:\n", 370 | "\n", 371 | "* **`INIT`** - Cluster is initializing.\n", 372 | "* **`UP`** - Cluster is up and running, you will be billed for the instance and the attached storages.\n", 373 | "* **`STOPPED`** - Cluster nodes are shut down and their disks are suspended. Your data and node state is safe and the cluster can be restored to running state when required. You will be billed only for the storage.\n", 374 | "* **`TERMINATED`** - Cluster is terminated and all nodes and their attached disks are deleted. These clusters cannot be restarted and will not be shown in `sky status`.\n", 375 | "\n", 376 | "To manage these states, SkyPilot offers three useful commands:\n", 377 | "\n", 378 | "1. **`sky stop`** - stops a `UP` cluster.\n", 379 | "2. **`sky start`** - starts a `STOPPED` cluster.\n", 380 | "2. **`sky down`** - terminates a `UP` or `STOPPED` cluster.\n", 381 | "\n", 382 | "> **💡 Hint** - `sky stop` and `sky start` are useful when you want to suspend your experiments for a while but want to quickly resume later. `sky down` is useful to delete a cluster and restart a job from scratch." 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "## [DIY] 💻 Terminate your cluster!\n", 390 | "Now that we are done using the cluster, let's terminate it to stop being billed for it. 
You can use `sky down` to terminate a cluster.\n", 391 | "\n", 392 | "**First, get the cluster name with `sky status`.**\n", 393 | "\n", 394 | "-------------------------\n", 395 | "```console\n", 396 | "$ sky status\n", 397 | "```\n", 398 | "-------------------------\n", 399 | "\n", 400 | "**and then run `sky down` to terminate the cluster**\n", 401 | "\n", 402 | "-------------------------\n", 403 | "```console\n", 404 | "$ sky down \n", 405 | "```\n", 406 | "-------------------------" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "### Expected output\n", 414 | "\n", 415 | "-------------------------\n", 416 | "```console\n", 417 | "(base) romilb@romilbx1yoga:skypilot-tutorial/01_hello_sky$ sky down sky-82ce-romilb\n", 418 | "Terminating 1 cluster: sky-82ce-romilb. Proceed? [Y/n]: Y\n", 419 | "Terminating cluster sky-82ce-romilb...done.\n", 420 | "Terminating 1 cluster ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00\n", 421 | "```\n", 422 | "-------------------------" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "# Switching clouds with just one line change" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "One of the key benefits of using SkyPilot is the ability to seamlessly switch between different clouds for running your tasks.\n", 437 | "\n", 438 | "You may have noticed the previous task was launched on AWS because it was cheaper than GCP. 
However, if we wish to use a specific cloud, we can override the optimizer by using the `--cloud` flag.\n", 439 | "\n", 440 | "**Let's launch the same task on Google Cloud (GCP).**\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## [DIY] 💻 Launch example.yaml on google cloud with with the `--cloud` flag" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "To override the SkyPilot optimizer and manually pick a cloud, use the `--cloud ` flag for `sky launch`.\n", 455 | "\n", 456 | "**Go ahead and run the task on GCP using `--cloud gcp` flag.**\n", 457 | "\n", 458 | "-------------------------\n", 459 | "```console\n", 460 | "sky launch 01_hello_sky/example.yaml --cloud gcp\n", 461 | "```\n", 462 | "-------------------------\n", 463 | "\n", 464 | "This will take about a minute." 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "### Expected output\n", 472 | "\n", 473 | "You'll note that SkyPilot only considers GCP as a possible resource now. This is because the `--cloud` sets a hard constraint on the optimizer to use only GCP. 
\n", 474 | "\n", 475 | "\n", 476 | "--------------------------\n", 477 | "```console\n", 478 | "(base) romilb@romilbx1yoga:skypilot-tutorial/01_hello_sky$ sky launch example.yaml --cloud gcp\n", 479 | "Task from YAML spec: example.yaml\n", 480 | "I 10-16 08:41:14 optimizer.py:605] == Optimizer ==\n", 481 | "I 10-16 08:41:14 optimizer.py:628] Estimated cost: $0.5 / hour\n", 482 | "I 10-16 08:41:14 optimizer.py:628] \n", 483 | "I 10-16 08:41:14 optimizer.py:685] Considered resources (1 node):\n", 484 | "I 10-16 08:41:14 optimizer.py:713] -------------------------------------------------------------------\n", 485 | "I 10-16 08:41:14 optimizer.py:713] CLOUD INSTANCE vCPUs ACCELERATORS COST ($) CHOSEN \n", 486 | "I 10-16 08:41:14 optimizer.py:713] -------------------------------------------------------------------\n", 487 | "I 10-16 08:41:14 optimizer.py:713] GCP n1-highmem-8 8 - 0.47 ✔ \n", 488 | "I 10-16 08:41:14 optimizer.py:713] -------------------------------------------------------------------\n", 489 | "I 10-16 08:41:14 optimizer.py:713] \n", 490 | "Launching a new cluster 'sky-e2fc-romilb'. Proceed? [Y/n]: \n", 491 | "```\n", 492 | "--------------------------" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "## [DIY] 💻 Terminate your GCP cluster!\n", 500 | "We're at the end of this notebook and we don't want to let your GCP cluster keep running and rack up a big bill! 
Let's terminate the cluster with `sky down`.\n", 501 | "\n", 502 | "**First, get the cluster name with `sky status`.**\n", 503 | "\n", 504 | "-------------------------\n", 505 | "```console\n", 506 | "sky status\n", 507 | "```\n", 508 | "-------------------------\n", 509 | "\n", 510 | "**and then run `sky down` to terminate the cluster**\n", 511 | "\n", 512 | "-------------------------\n", 513 | "```console\n", 514 | "sky down \n", 515 | "```\n", 516 | "-------------------------" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "#### 🎉 Congratulations! You have used SkyPilot to seamlessly run tasks on two clouds! Please proceed to the next notebook to learn how to use accelerators and object stores in SkyPilot.\n" 524 | ] 525 | } 526 | ], 527 | "metadata": { 528 | "kernelspec": { 529 | "display_name": "Python 3", 530 | "language": "python", 531 | "name": "python3" 532 | }, 533 | "language_info": { 534 | "codemirror_mode": { 535 | "name": "ipython", 536 | "version": 3 537 | }, 538 | "file_extension": ".py", 539 | "mimetype": "text/x-python", 540 | "name": "python", 541 | "nbconvert_exporter": "python", 542 | "pygments_lexer": "ipython3", 543 | "version": "3.7.4" 544 | } 545 | }, 546 | "nbformat": 4, 547 | "nbformat_minor": 4 548 | } --------------------------------------------------------------------------------