├── pytest.ini
├── .gitattributes
├── dask_cloudprovider
│   ├── gcp
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   ├── test_utils.py
│   │   │   └── test_gcp.py
│   │   └── utils.py
│   ├── hetzner
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_vserver.py
│   │   └── vserver.py
│   ├── nebius
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_nebius.py
│   │   └── instances.py
│   ├── digitalocean
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_droplet.py
│   │   └── droplet.py
│   ├── ibm
│   │   ├── __init__.py
│   │   └── tests
│   │       └── test_code_engine.py
│   ├── openstack
│   │   ├── __init__.py
│   │   └── tests
│   │       └── test_instances.py
│   ├── aws
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   ├── test_helper.py
│   │   │   ├── test_ecs.py
│   │   │   └── test_ec2.py
│   │   └── helper.py
│   ├── exceptions.py
│   ├── azure
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_azurevm.py
│   │   └── utils.py
│   ├── utils
│   │   ├── socket.py
│   │   ├── logs.py
│   │   ├── config_helper.py
│   │   ├── tests
│   │   │   └── test_config_helper.py
│   │   └── timeout.py
│   ├── conftest.py
│   ├── tests
│   │   └── test_imports.py
│   ├── config.py
│   ├── generic
│   │   ├── tests
│   │   │   └── test_vmcluster.py
│   │   ├── cloud-init.yaml.j2
│   │   └── vmcluster.py
│   ├── __init__.py
│   ├── cli
│   │   └── ecs.py
│   └── cloudprovider.yaml
├── requirements_test.txt
├── doc
│   ├── requirements-docs.txt
│   ├── source
│   │   ├── releasing.rst
│   │   ├── installation.rst
│   │   ├── hetzner.rst
│   │   ├── digitalocean.rst
│   │   ├── aws.rst
│   │   ├── nebius.rst
│   │   ├── testing.rst
│   │   ├── ibm.rst
│   │   ├── gpus.rst
│   │   ├── gcp.rst
│   │   ├── config.rst
│   │   ├── alternatives.rst
│   │   ├── security.rst
│   │   ├── openstack.rst
│   │   ├── index.rst
│   │   ├── troubleshooting.rst
│   │   ├── azure.rst
│   │   ├── conf.py
│   │   └── packer.rst
│   ├── Makefile
│   └── make.bat
├── requirements.txt
├── CONTRIBUTING.md
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── MANIFEST.in
├── ci
│   ├── scripts
│   │   └── test_imports.sh
│   ├── environment-3.10.yml
│   ├── environment-3.11.yml
│   └── environment-3.12.yml
├── .github
│   └── workflows
│       ├── release.yml
│       └── ci.yml
├── README.rst
├── setup.py
├── LICENSE
├── setup.cfg
└── .gitignore
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | asyncio_mode = auto
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | dask_cloudprovider/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import GCPCluster
2 |
--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-asyncio
3 | pytest-timeout
4 |
--------------------------------------------------------------------------------
/dask_cloudprovider/hetzner/__init__.py:
--------------------------------------------------------------------------------
1 | from .vserver import HetznerCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/nebius/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import NebiusCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/digitalocean/__init__.py:
--------------------------------------------------------------------------------
1 | from .droplet import DropletCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/ibm/__init__.py:
--------------------------------------------------------------------------------
1 | from .code_engine import IBMCodeEngineCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/openstack/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import OpenStackCluster
2 |
--------------------------------------------------------------------------------
/doc/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | numpydoc
2 | docutils
3 | sphinx>=8
4 | dask-sphinx-theme>=4.0.0
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp>=3.7.3
2 | dask>=2021.01.1
3 | distributed>=2021.01.1
4 | jinja2
5 | tornado>=5
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/__init__.py:
--------------------------------------------------------------------------------
1 | from .ec2 import EC2Cluster
2 | from .ecs import ECSCluster, FargateCluster
3 |
--------------------------------------------------------------------------------
/dask_cloudprovider/exceptions.py:
--------------------------------------------------------------------------------
1 | class ConfigError(Exception):
2 | """Raised when required config is missing"""
3 |
--------------------------------------------------------------------------------
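A minimal sketch of how this exception is intended to be used by the cluster managers; the token check below is illustrative rather than taken from any specific provider module.

from dask_cloudprovider.exceptions import ConfigError


def require_token(config):
    # Raise a descriptive error when required configuration is missing
    token = config.get("token")
    if token is None:
        raise ConfigError(
            "No API token found. Set cloudprovider.<provider>.token in your Dask config."
        )
    return token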
/dask_cloudprovider/azure/__init__.py:
--------------------------------------------------------------------------------
1 | from .azurevm import AzureVMCluster
2 | from .utils import AzurePreemptibleWorkerPlugin
3 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more.
2 |
3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html.
4 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/socket.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 |
4 | def is_socket_open(ip, port):
5 | connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
6 | try:
7 | connection.connect((ip, int(port)))
8 | connection.shutdown(2)
9 | return True
10 | except Exception:
11 | return False
12 |
--------------------------------------------------------------------------------
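A small usage sketch, assuming you want to poll a freshly booted scheduler VM until its Dask port (8786 by default) accepts connections; the address below is illustrative.

import time

from dask_cloudprovider.utils.socket import is_socket_open

scheduler_ip = "203.0.113.10"  # example address
while not is_socket_open(scheduler_ip, 8786):
    time.sleep(1)  # wait for the scheduler to come up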
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 23.10.1
4 | hooks:
5 | - id: black
6 | language_version: python3
7 | exclude: versioneer.py
8 | - repo: https://github.com/pycqa/flake8
9 | rev: 6.1.0
10 | hooks:
11 | - id: flake8
12 | language_version: python3
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sphinx:
4 | configuration: doc/source/conf.py
5 |
6 | formats: all
7 |
8 | python:
9 | install:
10 | - method: pip
11 | path: .
12 | extra_requirements:
13 | - all
14 | - requirements: doc/requirements-docs.txt
15 |
16 | submodules:
17 | include: all
18 |
19 | build:
20 | os: ubuntu-22.04
21 | tools:
22 | python: "3.12"
23 |
--------------------------------------------------------------------------------
/doc/source/releasing.rst:
--------------------------------------------------------------------------------
1 | Releasing
2 | =========
3 |
4 | Releases are published automatically when a tag is pushed to GitHub.
5 |
6 | .. code-block:: bash
7 |
8 | # Set next version number
9 | export RELEASE=x.x.x
10 |
11 | # Create tags
12 | git commit --allow-empty -m "Release $RELEASE"
13 | git tag -a $RELEASE -m "Version $RELEASE"
14 |
15 | # Push
16 | git push upstream --tags
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include dask_cloudprovider *.py
2 | recursive-include dask_cloudprovider *.yaml
3 | recursive-include dask_cloudprovider *.j2
4 |
5 | include setup.py
6 | include setup.cfg
7 | include LICENSE
8 | include README.rst
9 | include requirements.txt
10 | include MANIFEST.in
11 | include versioneer.py
12 |
13 | recursive-exclude * __pycache__
14 | recursive-exclude * *.py[co]
15 | include dask_cloudprovider/_version.py
16 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dask_cloudprovider.gcp.utils import build_request, is_inside_gce
4 |
5 |
6 | def test_build_request():
7 | assert build_request()(None, lambda x: x, "https://example.com")
8 |
9 |
10 | @pytest.mark.xfail(
11 | is_inside_gce(), reason="Fails if you run this test on GCE environment"
12 | )
13 | def test_is_gce_env():
14 | # Note: this test isn't super valuable, but at least we run the code
15 | assert is_inside_gce() is False
16 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?= -a
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/logs.py:
--------------------------------------------------------------------------------
1 | class Log(str):
2 | """A container for logs."""
3 |
4 | def _widget(self):
5 | from ipywidgets import HTML
6 |
7 | return HTML(value="<pre><code>{logs}</code></pre>".format(logs=self))
8 |
9 | def _ipython_display_(self, **kwargs):
10 | return self._widget()._ipython_display_(**kwargs)
11 |
12 |
13 | class Logs(dict):
14 | """A container for multiple logs."""
15 |
16 | def _widget(self):
17 | from ipywidgets import Accordion
18 |
19 | accordion = Accordion(children=[log._widget() for log in self.values()])
20 | [accordion.set_title(i, title) for i, title in enumerate(self.keys())]
21 | return accordion
22 |
23 | def _ipython_display_(self, **kwargs):
24 | return self._widget()._ipython_display_(**kwargs)
25 |
--------------------------------------------------------------------------------
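A brief sketch of how these containers behave (the log text is illustrative); in a Jupyter notebook the objects render as ipywidgets, otherwise they act like a plain ``str`` and ``dict``.

from dask_cloudprovider.utils.logs import Log, Logs

logs = Logs(
    {
        "scheduler": Log("distributed.scheduler - INFO - Scheduler at tcp://..."),
        "worker-0": Log("distributed.worker - INFO - Start worker at tcp://..."),
    }
)
logs["scheduler"]  # behaves like a string; displays as an HTML widget in notebooks
logs               # displays as an Accordion widget, one section per log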
/doc/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | Pip
5 | ---
6 |
7 | .. code-block:: console
8 |
9 | $ pip install dask-cloudprovider[all]
10 |
11 | You can also restrict your install to just a specific cloud provider by giving their name instead of ``all``.
12 |
13 | .. code-block:: console
14 |
15 | $ pip install dask-cloudprovider[aws] # or
16 | $ pip install dask-cloudprovider[azure] # or
17 | $ pip install dask-cloudprovider[azureml] # or
18 | $ pip install dask-cloudprovider[digitalocean] # or
19 | $ pip install dask-cloudprovider[gcp] # or
20 | $ pip install dask-cloudprovider[ibm] # or
21 | $ pip install dask-cloudprovider[openstack] # or
22 | $ pip install dask-cloudprovider[nebius]
23 |
24 | Conda
25 | -----
26 |
27 | .. code-block:: console
28 |
29 | $ conda install -c conda-forge dask-cloudprovider
--------------------------------------------------------------------------------
/dask_cloudprovider/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def pytest_addoption(parser):
5 | parser.addoption(
6 | "--create-external-resources",
7 | action="store_true",
8 | default=False,
9 | help="Run tests that create external resources.",
10 | )
11 |
12 |
13 | def pytest_configure(config):
14 | config.addinivalue_line(
15 | "markers", "external: mark test as creates external resources"
16 | )
17 |
18 |
19 | def pytest_collection_modifyitems(config, items):
20 | if config.getoption("--create-external-resources"):
21 | # --create-external-resources given on the CLI: do not skip external tests
22 | return
23 | skip_slow = pytest.mark.skip(
24 | reason="need --create-external-resources option to run"
25 | )
26 | for item in items:
27 | if "external" in item.keywords:
28 | item.add_marker(skip_slow)
29 |
--------------------------------------------------------------------------------
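The ``external`` marker registered above is how provider tests opt in to this skip logic; a minimal sketch of a marked test follows (the body is hypothetical).

import pytest


@pytest.mark.external  # skipped unless pytest is given --create-external-resources
async def test_creates_cloud_resources():
    ...  # provision real resources on a cloud provider and assert against them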
/ci/scripts/test_imports.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -o errexit
3 |
4 |
5 | test_import () {
6 | echo "Create environment: python=3.12 $1"
7 | # Create an empty environment
8 | conda create -q -y -n test-imports -c conda-forge python=3.12
9 | conda activate test-imports
10 | pip install -e .[$1]
11 | echo "python -c '$2'"
12 | python -c "$2"
13 | echo "Success [$1] 🚀"
14 | conda deactivate
15 | conda env remove -n test-imports
16 | }
17 |
18 | test_import "aws" "import dask_cloudprovider.aws"
19 | test_import "azure" "import dask_cloudprovider.azure"
20 | test_import "digitalocean" "import dask_cloudprovider.digitalocean"
21 | test_import "gcp" "import dask_cloudprovider.gcp"
22 | test_import "ibm" "import dask_cloudprovider.ibm"
23 | test_import "openstack" "import dask_cloudprovider.openstack"
24 |
--------------------------------------------------------------------------------
/ci/environment-3.10.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.10
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/ci/environment-3.11.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.11
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/ci/environment-3.12.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.12
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/doc/source/hetzner.rst:
--------------------------------------------------------------------------------
1 | Hetzner
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.hetzner
5 |
6 | .. autosummary::
7 | HetznerCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with Hetzner you must first generate a
16 | `personal access token `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.hetzner.token``. This can be done by
19 | adding the token to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | hetzner:
27 | token: "yourtoken"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken"
32 |
33 |
34 | .. autoclass:: HetznerCluster
35 | :members:
36 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Build distribution
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: "ubuntu-latest"
8 |
9 | steps:
10 | - name: Checkout source
11 | uses: actions/checkout@v2
12 |
13 | - name: Set up Python 3.12
14 | uses: actions/setup-python@v1
15 | with:
16 | python-version: 3.12
17 |
18 | - name: Install pypa/build
19 | run: python -m pip install build wheel setuptools
20 |
21 | - name: Build distributions
22 | shell: bash -l {0}
23 | run: python setup.py sdist bdist_wheel
24 |
25 | - name: Publish package to PyPI
26 | if: github.repository == 'dask/dask-cloudprovider' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
27 | uses: pypa/gh-action-pypi-publish@master
28 | with:
29 | user: __token__
30 | password: ${{ secrets.pypi_password }}
31 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/doc/source/digitalocean.rst:
--------------------------------------------------------------------------------
1 | DigitalOcean
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.digitalocean
5 |
6 | .. autosummary::
7 | DropletCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with DigitalOcean you must first generate a
16 | `personal access token `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.digitalocean.token``. This can be done by
19 | adding the token to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | digitalocean:
27 | token: "yourtoken"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken"
32 |
33 | Droplet
34 | -------
35 |
36 | .. autoclass:: DropletCluster
37 | :members:
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/utils.py:
--------------------------------------------------------------------------------
1 | import httplib2
2 | import googleapiclient.http
3 | import google_auth_httplib2
4 |
5 |
6 | def build_request(credentials=None):
7 | def inner(http, *args, **kwargs):
8 | new_http = httplib2.Http()
9 | if credentials is not None:
10 | new_http = google_auth_httplib2.AuthorizedHttp(credentials, http=new_http)
11 |
12 | return googleapiclient.http.HttpRequest(new_http, *args, **kwargs)
13 |
14 | return inner
15 |
16 |
17 | def is_inside_gce() -> bool:
18 | """
19 | Returns True if the client is running in the GCE environment,
20 | False otherwise.
21 |
22 | Doc: https://cloud.google.com/compute/docs/storing-retrieving-metadata
23 | """
24 | h = httplib2.Http()
25 | try:
26 | resp_headers, _ = h.request(
27 | "http://metadata.google.internal/computeMetadata/v1/",
28 | headers={"metadata-flavor": "Google"},
29 | method="GET",
30 | )
31 | except (httplib2.HttpLib2Error, OSError):
32 | return False
33 | return True
34 |
--------------------------------------------------------------------------------
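For context, a hedged sketch of how ``build_request`` is typically handed to ``googleapiclient`` (the ``requestBuilder`` keyword is part of ``googleapiclient.discovery.build``; the service name and credential lookup here are illustrative).

import google.auth
from googleapiclient.discovery import build

from dask_cloudprovider.gcp.utils import build_request

# Using a fresh httplib2.Http per request keeps the client safe to use from
# multiple threads/coroutines, which is why build_request exists.
credentials, _project = google.auth.default()
compute = build(
    "compute",
    "v1",
    requestBuilder=build_request(credentials),
    cache_discovery=False,
)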
/dask_cloudprovider/utils/config_helper.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import dask.config
3 |
4 |
5 | def prune_defaults(cfg: dict, defaults: dict) -> dict:
6 | """
7 | Recursively remove any key in cfg whose value exactly equals
8 | the corresponding built-in default.
9 | """
10 | pruned = {}
11 | for key, val in cfg.items():
12 | if key not in defaults:
13 | pruned[key] = val
14 | else:
15 | default_val = defaults[key]
16 | if isinstance(val, dict) and isinstance(default_val, dict):
17 | nested = prune_defaults(val, default_val)
18 | if nested:
19 | pruned[key] = nested
20 | elif val != default_val:
21 | pruned[key] = val
22 | return pruned
23 |
24 |
25 | def serialize_custom_config() -> str:
26 | """
27 | Pull out only the user-overrides from global_config and serialize them.
28 | """
29 | user_cfg = copy.deepcopy(dask.config.global_config)
30 | defaults = dask.config.merge(*dask.config.defaults)
31 | pruned = prune_defaults(user_cfg, defaults)
32 | return dask.config.serialize(pruned)
33 |
--------------------------------------------------------------------------------
/dask_cloudprovider/tests/test_imports.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def test_imports():
5 | from dask_cloudprovider.aws import EC2Cluster # noqa
6 | from dask_cloudprovider.aws import ECSCluster # noqa
7 | from dask_cloudprovider.aws import FargateCluster # noqa
8 | from dask_cloudprovider.azure import AzureVMCluster # noqa
9 | from dask_cloudprovider.gcp import GCPCluster # noqa
10 | from dask_cloudprovider.digitalocean import DropletCluster # noqa
11 | from dask_cloudprovider.hetzner import HetznerCluster # noqa
12 |
13 |
14 | def test_import_exceptions():
15 | with pytest.raises(ImportError):
16 | from dask_cloudprovider import EC2Cluster # noqa
17 | with pytest.raises(ImportError):
18 | from dask_cloudprovider import ECSCluster # noqa
19 | with pytest.raises(ImportError):
20 | from dask_cloudprovider import FargateCluster # noqa
21 | with pytest.raises(ImportError):
22 | from dask_cloudprovider import AzureVMCluster # noqa
23 | with pytest.raises(ImportError):
24 | from dask_cloudprovider import GCPCluster # noqa
25 | with pytest.raises(ImportError):
26 | from dask_cloudprovider import DropletCluster # noqa
27 |
--------------------------------------------------------------------------------
/doc/source/aws.rst:
--------------------------------------------------------------------------------
1 | Amazon Web Services (AWS)
2 | =========================
3 |
4 | .. currentmodule:: dask_cloudprovider.aws
5 |
6 | .. autosummary::
7 | EC2Cluster
8 | ECSCluster
9 | FargateCluster
10 |
11 | Overview
12 | --------
13 |
14 | Authentication
15 | ^^^^^^^^^^^^^^
16 |
17 | In order to create clusters on AWS you need to set your access key, secret key
18 | and region. The simplest way is to use the aws command line tool.
19 |
20 | .. code-block:: console
21 |
22 | $ pip install awscli
23 | $ aws configure
24 |
25 |
26 | Credentials
27 | ^^^^^^^^^^^
28 |
29 | In order for your Dask workers to be able to connect to other AWS resources such as S3 they will need credentials.
30 |
31 | This can be done by attaching IAM roles to individual resources or by passing credentials as environment variables. See
32 | each cluster manager docstring for more information.
33 |
34 | Elastic Compute Cloud (EC2)
35 | ---------------------------
36 |
37 | .. autoclass:: EC2Cluster
38 | :members:
39 |
40 | Elastic Container Service (ECS)
41 | -------------------------------
42 |
43 | .. autoclass:: ECSCluster
44 | :members:
45 |
46 | Fargate
47 | -------
48 |
49 | .. autoclass:: FargateCluster
50 | :members:
51 |
--------------------------------------------------------------------------------
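The environment-variable route mentioned above can look roughly like this; a hedged sketch, assuming the cluster manager accepts an ``environment`` mapping (check the relevant docstring for the exact keyword).

import os

from dask_cloudprovider.aws import FargateCluster

# Forward local AWS credentials to the scheduler and worker tasks so they can
# reach S3. Attaching an IAM task role is generally the preferred alternative.
cluster = FargateCluster(
    environment={
        "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
        "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
    }
)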
/dask_cloudprovider/config.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division, absolute_import
2 |
3 | import os
4 |
5 | import dask
6 | import yaml
7 |
8 |
9 | class ClusterConfig(dict):
10 | """Simple config interface for dask-cloudprovider clusters, such as `AzureVMCluster`.
11 |
12 | Enables '.' notation for nested access, as per `dask.config.get`.
13 |
14 | Example
15 | -------
16 |
17 | >>> from dask_cloudprovider.config import ClusterConfig
18 | >>> class RandomCluster(VMCluster):
19 | ... def __init__(self, option=None):
20 | ... self.config = ClusterConfig(dask.config.get("cloudprovider.random", {}))
21 | ... self.option = self.config.get("option", override_with=option)
22 |
23 | """
24 |
25 | def __new__(cls, d):
26 | return super().__new__(cls, d)
27 |
28 | def get(self, key, default=None, override_with=None):
29 | return dask.config.get(
30 | key, default=default, config=self, override_with=override_with
31 | )
32 |
33 |
34 | fn = os.path.join(os.path.dirname(__file__), "cloudprovider.yaml")
35 | dask.config.ensure_file(source=fn)
36 |
37 | with open(fn) as f:
38 | defaults = yaml.safe_load(f)
39 |
40 | dask.config.update_defaults(defaults)
41 |
--------------------------------------------------------------------------------
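A small sketch of the dot-notation access this class provides (the keys and values below are illustrative).

from dask_cloudprovider.config import ClusterConfig

config = ClusterConfig({"azurevm": {"vm_size": "Standard_DS1_v2"}})

config.get("azurevm.vm_size")                        # -> "Standard_DS1_v2"
config.get("azurevm.location", default="westus2")    # -> falls back to the default
config.get("azurevm.vm_size", override_with="Standard_DS2_v2")  # -> keyword argument wins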
/README.rst:
--------------------------------------------------------------------------------
1 |
2 | Dask Cloud Provider
3 | ===================
4 |
5 |
6 | .. image:: https://github.com/dask/dask-cloudprovider/actions/workflows/ci.yml/badge.svg
7 | :target: https://github.com/dask/dask-cloudprovider/actions?query=workflow%3ACI
8 | :alt: Build Status
9 |
10 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&logo=read-the-docs&logoColor=white
11 | :target: https://cloudprovider.dask.org/
12 | :alt: Read the Docs
13 |
14 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&label=developer%20docs&logo=read-the-docs&logoColor=white
15 | :target: https://cloudprovider.dask.org/releasing.html
16 | :alt: Read the Docs Developer
17 |
18 | .. image:: https://img.shields.io/pypi/v/dask-cloudprovider
19 | :target: https://pypi.org/project/dask-cloudprovider/
20 | :alt: PyPI
21 |
22 | .. image:: https://img.shields.io/conda/vn/conda-forge/dask-cloudprovider
23 | :target: https://anaconda.org/conda-forge/dask-cloudprovider
24 | :alt: Conda Forge
25 |
26 |
27 | Native Cloud integration for Dask.
28 |
29 | This library provides tools to enable Dask clusters to more natively integrate with the cloud.
30 | It includes cluster managers to create Dask clusters on a given cloud provider using native resources,
31 | plugins to more closely integrate Dask components with the cloud platform they are running on, and documentation to empower all folks running Dask on the cloud.
32 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/tests/test_vmcluster.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import asyncio
4 | import time
5 |
6 | from dask_cloudprovider.generic.vmcluster import VMCluster, VMInterface
7 |
8 |
9 | class DummyWorker(VMInterface):
10 | """A dummy worker for testing."""
11 |
12 |
13 | class DummyScheduler(VMInterface):
14 | """A dummy scheduler for testing."""
15 |
16 |
17 | class DummyCluster(VMCluster):
18 | """A dummy cluster for testing."""
19 |
20 | scheduler_class = DummyScheduler
21 | worker_class = DummyWorker
22 |
23 |
24 | @pytest.mark.asyncio
25 | async def test_init():
26 | with pytest.raises(RuntimeError):
27 | _ = VMCluster(asynchronous=True)
28 |
29 |
30 | @pytest.mark.asyncio
31 | async def test_call_async():
32 | cluster = DummyCluster(asynchronous=True)
33 |
34 | def blocking(string):
35 | time.sleep(0.1)
36 | return string
37 |
38 | start = time.time()
39 |
40 | a, b, c, d = await asyncio.gather(
41 | cluster.call_async(blocking, "hello"),
42 | cluster.call_async(blocking, "world"),
43 | cluster.call_async(blocking, "foo"),
44 | cluster.call_async(blocking, "bar"),
45 | )
46 |
47 | assert a == "hello"
48 | assert b == "world"
49 | assert c == "foo"
50 | assert d == "bar"
51 |
52 | # Each call to ``blocking`` takes 0.1 seconds, but they should've been run concurrently.
53 | assert time.time() - start < 0.2
54 |
55 | await cluster.close()
56 |
--------------------------------------------------------------------------------
/doc/source/nebius.rst:
--------------------------------------------------------------------------------
1 | Nebius
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.nebius
5 |
6 | .. autosummary::
7 | NebiusCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 |
16 | Before creating clusters on Nebius, you must configure your authentication credentials. You can do this using the `nebius` `command line tool `_.
17 |
18 | After obtaining your credentials, add them to your Dask configuration under:
19 |
20 | * cloudprovider.nebius.token
21 | * cloudprovider.nebius.project_id
22 |
23 | You can specify these values by either:
24 |
25 | #. Adding the values of the ``NB_IAM_TOKEN`` and ``NB_PROJECT_ID`` environment variables to your YAML configuration.
26 |
27 | .. code-block:: yaml
28 |
29 | # ~/.config/dask/cloudprovider.yaml
30 |
31 | cloudprovider:
32 | nebius:
33 | token: "your_iam_token"
34 | project_id: "your_project_id"
35 |
36 | #. Exporting them as environment variables in your shell.
37 |
38 | .. code-block:: console
39 |
40 | $ export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token)
41 | $ export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID="your_project_id"
42 |
43 | Dask Configuration
44 | ^^^^^^^^^^^^^^^^^^
45 |
46 | You can change the configuration of ``server_platform``, ``server_preset`` and ``image_family``. A list of all available platforms and presets can be found in the `Nebius docs `_.
47 |
48 | .. autoclass:: NebiusCluster
49 | :members:
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from os.path import exists
4 | from setuptools import setup, find_packages
5 |
6 | import versioneer
7 |
8 | extras_require = {
9 | "aws": ["aiobotocore>=0.10.2"],
10 | "azure": [
11 | "azure-mgmt-compute>=18.0.0",
12 | "azure-mgmt-network>=16.0.0",
13 | "azure-identity",
14 | ],
15 | "digitalocean": ["python-digitalocean>=1.15.0"],
16 | "gcp": ["google-api-python-client>=1.12.5", "google-auth>=1.23.0"],
17 | "hetzner": ["hcloud>=1.10.0"],
18 | "ibm": ["ibm_code_engine_sdk>=3.1.0", "kubernetes>=25.3.0"],
19 | "openstack": ["openstacksdk>=3.3.0"],
20 | "nebius": ["nebius>=0.2.0"],
21 | }
22 | extras_require["all"] = set(pkg for pkgs in extras_require.values() for pkg in pkgs)
23 |
24 | setup(
25 | name="dask-cloudprovider",
26 | cmdclass=versioneer.get_cmdclass(),
27 | version=versioneer.get_version(),
28 | description="Native Cloud Provider integration for Dask",
29 | url="https://github.com/dask/dask-cloudprovider",
30 | keywords="dask,cloud,distributed",
31 | license="BSD",
32 | packages=find_packages(),
33 | include_package_data=True,
34 | long_description=(open("README.rst").read() if exists("README.rst") else ""),
35 | long_description_content_type="text/x-rst",
36 | zip_safe=False,
37 | install_requires=list(open("requirements.txt").read().strip().split("\n")),
38 | extras_require=extras_require,
39 | entry_points="""
40 | [console_scripts]
41 | dask-ecs=dask_cloudprovider.cli.ecs:go
42 | """,
43 | python_requires=">=3.10",
44 | )
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, NVIDIA Corporation
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/doc/source/testing.rst:
--------------------------------------------------------------------------------
1 | Testing
2 | =======
3 |
4 | Tests in ``dask-cloudprovider`` are written and run using ``pytest``.
5 |
6 | To set up your testing environment run:
7 |
8 | .. code-block:: bash
9 |
10 | pip install -r requirements_test.txt
11 |
12 | To run the tests, run ``pytest`` from the root directory:
13 |
14 | .. code-block:: bash
15 |
16 | pytest
17 |
18 | You may notice that many tests will be skipped. This is because those tests create external resources on cloud providers. You can set those tests to run with the
19 | ``--create-external-resources`` flag.
20 |
21 | .. warning::
22 |
23 | Running tests that create external resources is slow and will cost a small amount of credit on each cloud provider.
24 |
25 | .. code-block:: bash
26 |
27 | pytest -rs --create-external-resources
28 |
29 | It is also helpful to set the ``-rs`` flag here because tests may also be skipped if you do not have the appropriate credentials to create those external resources.
30 | If this is the case, the skip reason will contain instructions on how to set up those credentials. For example:
31 |
32 | .. code-block::
33 |
34 | SKIPPED [1] dask_cloudprovider/azure/tests/test_azurevm.py:49:
35 | You must configure your Azure resource group and vnet to run this test.
36 |
37 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION=""
38 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__RESOURCE_GROUP=""
39 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET=""
40 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP=""
41 |
42 |
--------------------------------------------------------------------------------
/doc/source/ibm.rst:
--------------------------------------------------------------------------------
1 | IBM Cloud
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.ibm
5 |
6 | .. autosummary::
7 | IBMCodeEngineCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with IBM Cloud you must first generate an
16 | `API key `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.ibm.api_key``. This can be done by
19 | adding the API key to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | ibm:
27 | api_key: "your_api_key"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key"
32 |
33 | Project ID
34 | ^^^^^^^^^^
35 |
36 | To use Dask Cloudprovider with IBM Cloud you must also configure your `Project ID `_.
37 | This can be found at the top of the IBM Cloud dashboard.
38 |
39 | Your Project ID must be added to your Dask config file.
40 |
41 | .. code-block:: yaml
42 |
43 | # ~/.config/dask/cloudprovider.yaml
44 | cloudprovider:
45 | ibm:
46 | project_id: "your_project_id"
47 |
48 | Or via an environment variable.
49 |
50 | .. code-block:: console
51 |
52 | $ export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id"
53 |
54 | Code Engine
55 | -----------
56 |
57 | .. autoclass:: IBMCodeEngineCluster
58 | :members:
--------------------------------------------------------------------------------
/doc/source/gpus.rst:
--------------------------------------------------------------------------------
1 | GPU clusters
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider
5 |
6 | Many cloud providers have GPU offerings and so it is possible to launch GPU enabled Dask clusters
7 | with Dask Cloudprovider.
8 |
9 | Each cluster manager handles this differently but generally you will need to configure the following settings:
10 |
11 | - Configure the hardware to include GPUs. This may be by changing the hardware type or adding accelerators.
12 | - Ensure the OS/Docker image has the NVIDIA drivers. For Docker images it is recommended to use the `RAPIDS images <https://hub.docker.com/r/rapidsai/rapidsai/>`_.
13 | - Set the ``worker_module`` config option to ``dask_cuda.cli.dask_cuda_worker`` or the ``worker_command`` option to ``dask-cuda-worker``.
14 |
15 | In the following AWS :class:`dask_cloudprovider.aws.EC2Cluster` example we set the ``ami`` to be a Deep Learning AMI with NVIDIA drivers, the ``docker_image`` to RAPIDS, the ``instance_type``
16 | to ``p3.2xlarge``, which has one NVIDIA Tesla V100, and the ``worker_module`` to ``dask_cuda.cli.dask_cuda_worker``.
17 |
18 | .. code-block:: python
19 |
20 | >>> cluster = EC2Cluster(ami="ami-0c7c7d78f752f8f17", # Example Deep Learning AMI (Ubuntu 18.04)
21 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04",
22 | instance_type="p3.2xlarge",
23 | worker_module="dask_cuda.cli.dask_cuda_worker",
24 | bootstrap=False,
25 | filesystem_size=120)
26 |
27 | See each cluster manager's example sections for info on starting a GPU cluster.
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # References:
3 | # https://flake8.readthedocs.io/en/latest/user/configuration.html
4 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html
5 |
6 | # Note: there cannot be spaces after commas here
7 | exclude = __init__.py,versioneer.py,dask_cloudprovider/_version.py
8 | ignore =
9 | # Extra space in brackets
10 | E20,
11 | # Multiple spaces around ","
12 | E231,E241,
13 | # Comments
14 | E26,
15 | # Import formatting
16 | E4,
17 | # Comparing types instead of isinstance
18 | E721,
19 | # Assigning lambda expression
20 | E731,
21 | # continuation line under-indented for hanging indent
22 | E121,
23 | # continuation line over-indented for hanging indent
24 | E126,
25 | # continuation line over-indented for visual indent
26 | E127,
27 | # E128 continuation line under-indented for visual indent
28 | E128,
29 | # multiple statements on one line (semicolon)
30 | E702,
31 | # line break before binary operator
32 | W503,
33 | # visually indented line with same indent as next logical line
34 | E129,
35 | # unexpected indentation
36 | E116,
37 | # redefinition of unused 'loop' from line 10
38 | F811,
39 | # local variable is assigned to but never used
40 | F841,
41 | # Ambiguous variable names
42 | E741
43 |
44 | max-line-length = 120
45 |
46 | [versioneer]
47 | VCS = git
48 | style = pep440
49 | versionfile_source = dask_cloudprovider/_version.py
50 | versionfile_build = dask_cloudprovider/_version.py
51 | tag_prefix =
52 | parentdir_prefix = dask-cloudprovider-
53 |
54 | [tool:pytest]
55 | timeout = 300
--------------------------------------------------------------------------------
/doc/source/gcp.rst:
--------------------------------------------------------------------------------
1 | Google Cloud Platform
2 | =====================
3 |
4 | .. currentmodule:: dask_cloudprovider.gcp
5 |
6 | .. autosummary::
7 | GCPCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | In order to create clusters on GCP you need to set your authentication credentials.
16 | You can do this via the ``gcloud`` `command line tool `_.
17 |
18 | .. code-block:: console
19 |
20 | $ gcloud auth login
21 |
22 | Alternatively you can use a `service account `_ which provides credentials in a JSON file.
23 | You must set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to the path to the JSON file.
24 |
25 | .. code-block:: console
26 |
27 | $ export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
28 |
29 | Project ID
30 | ^^^^^^^^^^
31 |
32 | To use Dask Cloudprovider with GCP you must also configure your `Project ID `_.
33 | Generally when creating a GCP account you will create a default project. This can be found at the top of the GCP dashboard.
34 |
35 | Your Project ID must be added to your Dask config file.
36 |
37 | .. code-block:: yaml
38 |
39 | # ~/.config/dask/cloudprovider.yaml
40 | cloudprovider:
41 | gcp:
42 | projectid: "YOUR PROJECT ID"
43 |
44 | Or via an environment variable.
45 |
46 | .. code-block:: console
47 |
48 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID"
49 |
50 | Google Cloud VMs
51 | ----------------
52 |
53 | .. autoclass:: GCPCluster
54 | :members:
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: ${{ matrix.os }}
8 | strategy:
9 | fail-fast: true
10 | matrix:
11 | os: ["ubuntu-latest"]
12 | python-version: ["3.10", "3.11", "3.12"]
13 |
14 | steps:
15 | - name: Checkout source
16 | uses: actions/checkout@v2
17 |
18 | - name: Setup Conda Environment
19 | uses: conda-incubator/setup-miniconda@v2
20 | with:
21 | miniconda-version: "latest"
22 | python-version: ${{ matrix.python-version }}
23 | environment-file: ci/environment-${{ matrix.python-version }}.yml
24 | activate-environment: dask-cloudprovider-test
25 | auto-activate-base: false
26 |
27 | - name: Install
28 | shell: bash -l {0}
29 | run: pip install -e .[all]
30 |
31 | - name: Run tests
32 | shell: bash -l {0}
33 | run: py.test dask_cloudprovider
34 |
35 | lint:
36 | name: "pre-commit hooks"
37 | runs-on: ubuntu-latest
38 | steps:
39 | - uses: actions/checkout@v2
40 | - uses: actions/setup-python@v5
41 | - uses: pre-commit/action@v3.0.1
42 |
43 | imports:
44 | runs-on: ubuntu-latest
45 | steps:
46 | - name: Checkout source
47 | uses: actions/checkout@v2
48 |
49 | - name: Setup Conda Environment
50 | uses: conda-incubator/setup-miniconda@v2
51 | with:
52 | miniconda-version: "latest"
53 | python-version: "3.12"
54 |
55 | - name: Run import tests
56 | shell: bash -l {0}
57 | run: source ci/scripts/test_imports.sh
58 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/tests/test_config_helper.py:
--------------------------------------------------------------------------------
1 | import dask.config
2 |
3 | from dask_cloudprovider.utils.config_helper import (
4 | prune_defaults,
5 | serialize_custom_config,
6 | )
7 |
8 |
9 | def test_prune_defaults_simple():
10 | # Keys matching defaults get dropped; new keys stay
11 | cfg = {"a": 1, "b": 2, "c": 3}
12 | defaults = {"a": 1, "b": 0}
13 | pruned = prune_defaults(cfg, defaults)
14 | assert pruned == {"b": 2, "c": 3}
15 |
16 |
17 | def test_prune_defaults_nested():
18 | # Nested dicts: only subkeys that differ survive
19 | cfg = {
20 | "outer": {"keep": 41, "drop": 0},
21 | "solo": 99,
22 | }
23 | defaults = {
24 | "outer": {"keep": 42, "drop": 0},
25 | "solo": 0,
26 | }
27 | pruned = prune_defaults(cfg, defaults)
28 | # 'outer.drop' matches default, 'outer.keep' differs; 'solo' differs
29 | assert pruned == {"outer": {"keep": 41}, "solo": 99}
30 |
31 |
32 | def test_serialize_custom_config(monkeypatch):
33 | # Arrange a fake global_config and defaults
34 | fake_global = {"x": 10, "y": {"a": 1, "b": 0}}
35 | fake_defaults = {"x": 0, "y": {"a": 1, "b": 0}}
36 |
37 | # Monkey-patch dask.config
38 | monkeypatch.setattr(dask.config, "global_config", fake_global)
39 | # defaults should be a sequence of dict(s)
40 | monkeypatch.setattr(dask.config, "defaults", (fake_defaults,))
41 |
42 | # Serialize the custom config
43 | serialized = serialize_custom_config()
44 | assert isinstance(serialized, str)
45 |
46 | # Assert it's valid JSON and only contains overrides (x and nothing under y)
47 | pruned = dask.config.deserialize(serialized)
48 | assert pruned == {"x": 10}
49 |
--------------------------------------------------------------------------------
/dask_cloudprovider/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 |
3 | from ._version import get_versions
4 |
5 | __version__ = get_versions()["version"]
6 |
7 | del get_versions
8 |
9 |
10 | def __getattr__(name):
11 | """As of dask_cloudprovider v0.5.0 all cluster managers are in cloud provider specific submodules.
12 |
13 | This allows us to more easily separate out optional dependencies. However we maintain some helpful
14 | errors at the top level.
15 |
16 | This is both to help migrate users of any cluster managers that existed before this was changed
17 | and also to help anyone who incorrectly tries to import a cluster manager from the top level,
18 | perhaps because they saw it used in some documentation but didn't see the import.
19 |
20 | """
21 |
22 | if name in ["EC2Cluster", "ECSCluster", "FargateCluster"]:
23 | raise ImportError(
24 | "AWS cluster managers must be imported from the aws subpackage. "
25 | f"Please import dask_cloudprovider.aws.{name}"
26 | )
27 |
28 | if name in ["AzureVMCluster"]:
29 | raise ImportError(
30 | "Azure cluster managers must be imported from the the azure subpackage. "
31 | f"Please import dask_cloudprovider.azure.{name}"
32 | )
33 |
34 | if name in ["GCPCluster"]:
35 | raise ImportError(
36 | "Google Cloud cluster managers must be imported from the the gcp subpackage. "
37 | f"Please import dask_cloudprovider.gcp.{name}"
38 | )
39 |
40 | if name in ["DropletCluster"]:
41 | raise ImportError(
42 | "DigitalOcean cluster managers must be imported from the digitalocean subpackage. "
43 | f"Please import dask_cloudprovider.digitalocean.{name}"
44 | )
45 |
--------------------------------------------------------------------------------
/doc/source/config.rst:
--------------------------------------------------------------------------------
1 | Configuration
2 | =============
3 |
4 | Each cluster manager in Dask Cloudprovider will require some configuration specific to the cloud
5 | services you wish to use. Many config options will have sensible defaults and often you can create
6 | a cluster with just your authentication credentials configured.
7 |
8 | Authentication
9 | --------------
10 |
11 | All cluster managers assume you have already configured your credentials for the cloud you are using.
12 |
13 | For AWS this would mean storing your access key and secret key in ``~/.aws/credentials``. The AWS CLI
14 | can create this for you by running the command ``aws configure``.
15 |
16 | See each cluster manager for specific details.
17 |
18 | .. warning::
19 | Most cluster managers also allow passing credentials as keyword arguments, although this would result in
20 | credentials being stored in code and is not advised.
21 |
22 | Cluster config
23 | --------------
24 |
25 | Configuration can be passed to a cluster manager via keyword arguments, YAML config or environment variables.
26 |
27 | For example the ``FargateCluster`` manager for AWS ECS takes a ``scheduler_mem`` configuration option to set how much memory
28 | to give the scheduler in megabytes. This can be configured in the following ways.
29 |
30 | .. code-block:: python
31 |
32 | from dask_cloudprovider.aws import FargateCluster
33 |
34 | cluster = FargateCluster(
35 | scheduler_mem=8192
36 | )
37 |
38 | .. code-block:: yaml
39 |
40 | # ~/.config/dask/cloudprovider.yaml
41 |
42 | cloudprovider:
43 | ecs:
44 | scheduler_mem: 8192
45 |
46 | .. code-block:: console
47 |
48 | $ export DASK_CLOUDPROVIDER__ECS__SCHEDULER_MEM=8192
49 |
50 | See each cluster manager and the `Dask configuration docs `_ for more information.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.a
8 | *.dll
9 | *.exe
10 | *.o
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 | doc/_build/
72 | doc/source/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # dotenv
90 | .env
91 |
92 | # virtualenv
93 | .venv
94 | venv/
95 | ENV/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
110 | # IDE
111 | .vscode/
112 | .idea
113 |
114 | # MAC
115 | .DS_Store
116 |
117 | # any untitled Jupyter notebooks
118 | Untitled*.ipynb
119 |
120 | # key material
121 | *.pem
122 | *.pub
123 | *_rsa
124 |
--------------------------------------------------------------------------------
/dask_cloudprovider/hetzner/tests/test_vserver.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | hetzner = pytest.importorskip("hcloud")
6 |
7 | from dask_cloudprovider.hetzner.vserver import HetznerCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Hetzner API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | hetzner:
23 | token: "yourtoken"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken"
28 |
29 | """
30 | )
31 |
32 |
33 | @pytest.fixture
34 | async def config():
35 | return dask.config.get("cloudprovider.hetzner", {})
36 |
37 |
38 | @pytest.fixture
39 | @pytest.mark.external
40 | async def cluster(config):
41 | await skip_without_credentials(config)
42 | async with HetznerCluster(asynchronous=True) as cluster:
43 | yield cluster
44 |
45 |
46 | @pytest.mark.asyncio
47 | async def test_init():
48 | cluster = HetznerCluster(asynchronous=True)
49 | assert cluster.status == Status.created
50 |
51 |
52 | @pytest.mark.asyncio
53 | @pytest.mark.timeout(600)
54 | async def test_create_cluster(cluster):
55 | assert cluster.status == Status.running
56 |
57 | cluster.scale(1)
58 | await cluster
59 | assert len(cluster.workers) == 1
60 |
61 | async with Client(cluster, asynchronous=True) as client:
62 |
63 | def inc(x):
64 | return x + 1
65 |
66 | assert await client.submit(inc, 10).result() == 11
67 |
68 |
69 | @pytest.mark.asyncio
70 | async def test_get_cloud_init():
71 | cloud_init = HetznerCluster.get_cloud_init(
72 | docker_args="--privileged",
73 | )
74 | assert " --privileged " in cloud_init
75 |
--------------------------------------------------------------------------------
/dask_cloudprovider/digitalocean/tests/test_droplet.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | digitalocean = pytest.importorskip("digitalocean")
6 |
7 | from dask_cloudprovider.digitalocean.droplet import DropletCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Digital Ocean API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | digitalocean:
23 | token: "yourtoken"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken"
28 |
29 | """
30 | )
31 |
32 |
33 | @pytest.fixture
34 | async def config():
35 | return dask.config.get("cloudprovider.digitalocean", {})
36 |
37 |
38 | @pytest.fixture
39 | @pytest.mark.external
40 | async def cluster(config):
41 | await skip_without_credentials(config)
42 | async with DropletCluster(asynchronous=True) as cluster:
43 | yield cluster
44 |
45 |
46 | @pytest.mark.asyncio
47 | @pytest.mark.external
48 | async def test_init():
49 | cluster = DropletCluster(asynchronous=True)
50 | assert cluster.status == Status.created
51 |
52 |
53 | @pytest.mark.asyncio
54 | @pytest.mark.timeout(600)
55 | @pytest.mark.external
56 | async def test_create_cluster(cluster):
57 | assert cluster.status == Status.running
58 |
59 | cluster.scale(1)
60 | await cluster
61 | assert len(cluster.workers) == 1
62 |
63 | async with Client(cluster, asynchronous=True) as client:
64 |
65 | def inc(x):
66 | return x + 1
67 |
68 | assert await client.submit(inc, 10).result() == 11
69 |
70 |
71 | @pytest.mark.asyncio
72 | async def test_get_cloud_init():
73 | cloud_init = DropletCluster.get_cloud_init(
74 | docker_args="--privileged",
75 | )
76 | assert " --privileged " in cloud_init
77 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/cloud-init.yaml.j2:
--------------------------------------------------------------------------------
1 | #cloud-config
2 |
3 | {% if bootstrap %}
4 | # Bootstrap
5 | packages:
6 | - apt-transport-https
7 | - ca-certificates
8 | - curl
9 | - gnupg-agent
10 | - software-properties-common
11 | - ubuntu-drivers-common
12 |
13 | # Enable ipv4 forwarding, required on CIS hardened machines
14 | write_files:
15 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
16 | content: |
17 | net.ipv4.conf.all.forwarding=1
18 |
19 | # create the docker group
20 | groups:
21 | - docker
22 |
23 | # Add default auto created user to docker group
24 | system_info:
25 | default_user:
26 | groups: [docker]
27 | {% endif %}
28 |
29 | runcmd:
30 | {% if bootstrap %}
31 | # Install Docker
32 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
33 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
34 | - apt-get update -y
35 | - apt-get install -y docker-ce docker-ce-cli containerd.io
36 | - systemctl start docker
37 | - systemctl enable docker
38 | {% endif %}
39 |
40 | {% if bootstrap and gpu_instance %}
41 | # Install NVIDIA driver
42 | - DEBIAN_FRONTEND=noninteractive ubuntu-drivers install
43 |
44 | # Install NVIDIA docker
45 | - curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
46 | - curl -s -L https://nvidia.github.io/nvidia-docker/$(. /etc/os-release;echo $ID$VERSION_ID)/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
47 | - apt-get update -y
48 | - apt-get install -y nvidia-docker2
49 | - systemctl restart docker
50 | {% endif %}
51 |
52 | {% if extra_bootstrap %}
53 | {% for command in extra_bootstrap %}
54 | - {{ command }}
55 | {% endfor %}
56 | {% endif %}
57 |
58 | # Run container
59 | - 'docker run --net=host {%+ if gpu_instance %}--gpus=all{% endif %} {% for key in env_vars %} -e {{key}}="{{env_vars[key]}}" {% endfor %}{%+ if docker_args %}{{docker_args}}{% endif %} {{image}} {{ command }}'
60 |
61 | {% if auto_shutdown %}
62 | # Shutdown when command is done
63 | - shutdown -h now
64 | {% endif %}
65 |
--------------------------------------------------------------------------------
/dask_cloudprovider/openstack/tests/test_instances.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import dask
3 | from dask_cloudprovider.openstack.instances import OpenStackCluster
4 | from dask.distributed import Client
5 | from distributed.core import Status
6 |
7 | # Optional: Skips tests if OpenStack credentials are not set
8 |
9 |
10 | async def skip_without_credentials(config):
11 | if (
12 | config.get("auth_url") is None
13 | or config.get("application_credential_secret") is None
14 | ):
15 | pytest.skip(
16 | """
17 | You must configure OpenStack credentials to run this test.
18 |
19 | Set this in your config file or environment variables:
20 |
21 | # cloudprovider.yaml
22 | cloudprovider:
23 | openstack:
24 | auth_url: "your_auth_url"
25 | application_credential_id: "your_app_cred_id"
26 | application_credential_secret: "your_app_cred_secret"
27 | """
28 | )
29 |
30 |
31 | @pytest.fixture
32 | async def config():
33 | return dask.config.get("cloudprovider.openstack", {})
34 |
35 |
36 | @pytest.fixture
37 | @pytest.mark.external
38 | async def cluster(config):
39 | await skip_without_credentials(config)
40 |
41 | async with OpenStackCluster(asynchronous=True) as cluster:
42 | yield cluster
43 |
44 |
45 | @pytest.mark.asyncio
46 | async def test_init():
47 | cluster = OpenStackCluster(asynchronous=True)
48 | assert cluster.status == Status.created
49 |
50 |
51 | @pytest.mark.asyncio
52 | @pytest.mark.timeout(600)
53 | async def test_create_cluster(cluster):
54 | assert cluster.status == Status.running
55 | cluster.scale(1)
56 | await cluster
57 | assert len(cluster.workers) == 1
58 |
59 | async with Client(cluster, asynchronous=True) as client:
60 |
61 | def inc(x):
62 | return x + 1
63 |
64 | assert await client.submit(inc, 10).result() == 11
65 |
66 |
67 | @pytest.mark.asyncio
68 | async def test_get_cloud_init():
69 | cloud_init = OpenStackCluster.get_cloud_init(
70 | docker_args="--privileged",
71 | )
72 | assert " --privileged " in cloud_init
73 |
--------------------------------------------------------------------------------
/dask_cloudprovider/nebius/tests/test_nebius.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | nebius = pytest.importorskip("nebius")
6 |
7 | from dask_cloudprovider.nebius.instances import NebiusCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None or config.get("project_id") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Nebius AI Cloud API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | nebius:
23 | token: "yourtoken"
24 | project_id: "yourprojectid"
25 |
26 | Or by setting it as an environment variable
27 |
28 | export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token)
29 | export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID=project_id
30 |
31 | """
32 | )
33 |
34 |
35 | @pytest.fixture
36 | async def config():
37 | return dask.config.get("cloudprovider.nebius", {})
38 |
39 |
40 | @pytest.fixture
41 | @pytest.mark.external
42 | async def cluster(config):
43 | await skip_without_credentials(config)
44 | async with NebiusCluster(asynchronous=True, debug=True) as cluster:
45 | yield cluster
46 |
47 |
48 | @pytest.mark.asyncio
49 | @pytest.mark.external
50 | async def test_init():
51 | cluster = NebiusCluster(asynchronous=True, debug=True)
52 | assert cluster.status == Status.created
53 |
54 |
55 | @pytest.mark.asyncio
56 | @pytest.mark.external
57 | async def test_create_cluster(cluster):
58 | assert cluster.status == Status.running
59 |
60 | cluster.scale(1)
61 | await cluster
62 | assert len(cluster.workers) == 1
63 |
64 | async with Client(cluster, asynchronous=True) as client:
65 |
66 | def inc(x):
67 | return x + 1
68 |
69 | assert await client.submit(inc, 10).result() == 11
70 |
71 |
72 | @pytest.mark.asyncio
73 | async def test_get_cloud_init():
74 | cloud_init = NebiusCluster.get_cloud_init(
75 | docker_args="--privileged",
76 | )
77 | assert " --privileged " in cloud_init
78 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/tests/test_helper.py:
--------------------------------------------------------------------------------
1 | def test_aws_to_dict_and_back():
2 | from dask_cloudprovider.aws.helper import aws_to_dict, dict_to_aws
3 |
4 | aws_dict = [{"key": "hello", "value": "world"}]
5 | aws_upper_dict = [{"Key": "hello", "Value": "world"}]
6 | py_dict = {"hello": "world"}
7 |
8 | assert dict_to_aws(py_dict) == aws_dict
9 | assert dict_to_aws(py_dict, upper=True) == aws_upper_dict
10 | assert aws_to_dict(aws_dict) == py_dict
11 |
12 | assert aws_to_dict(dict_to_aws(py_dict, upper=True)) == py_dict
13 | assert aws_to_dict(dict_to_aws(py_dict)) == py_dict
14 | assert dict_to_aws(aws_to_dict(aws_dict)) == aws_dict
15 | assert dict_to_aws(aws_to_dict(aws_upper_dict), upper=True) == aws_upper_dict
16 |
17 |
18 | def test_get_sleep_duration_first_try():
19 | from dask_cloudprovider.aws.helper import get_sleep_duration
20 |
21 | duration = get_sleep_duration(
22 | current_try=0, min_sleep_millis=10, max_sleep_millis=5000
23 | )
24 | assert duration == 0.01
25 |
26 |
27 | def test_get_sleep_duration_max():
28 | from dask_cloudprovider.aws.helper import get_sleep_duration
29 |
30 | duration = get_sleep_duration(
31 | current_try=23, min_sleep_millis=10, max_sleep_millis=5000
32 | )
33 | assert duration == 5.0
34 |
35 |
36 | def test_get_sleep_duration_negative_try():
37 | from dask_cloudprovider.aws.helper import get_sleep_duration
38 |
39 | duration = get_sleep_duration(
40 | current_try=-1, min_sleep_millis=10, max_sleep_millis=5000
41 | )
42 | assert duration == 0.01
43 |
44 |
45 | def test_config_mixin():
46 | from dask_cloudprovider.aws.helper import ConfigMixin
47 |
48 | class MockCluster(ConfigMixin):
49 | config = None
50 | _attr1 = "foo"
51 | attr2 = None
52 |
53 | def __init__(self):
54 | self.config = {"attr2": "bar"}
55 |
56 | cluster_with_mixin = MockCluster()
57 |
58 | # Test that nothing happens if attr is already set
59 | attr1 = cluster_with_mixin._attr1
60 | cluster_with_mixin.update_attr_from_config(attr="attr1", private=True)
61 | assert cluster_with_mixin._attr1 == attr1
62 |
63 | # Test that attr is updated if existing value is None
64 | cluster_with_mixin.update_attr_from_config(attr="attr2", private=False)
65 | assert cluster_with_mixin.attr2 == "bar"
66 |
--------------------------------------------------------------------------------
/doc/source/alternatives.rst:
--------------------------------------------------------------------------------
1 | Alternatives
2 | ============
3 |
4 | Many tools and services exist today for deploying Dask clusters, many of which are commonly used on the cloud.
5 | This project aims to provide cloud-native plugins and tools for Dask which can often complement other approaches.
6 |
7 | Community tools
8 | ---------------
9 |
10 | Dask has a `vibrant ecosystem of community tooling for deploying Dask `_ on various platforms, many of which can be used on the public cloud.
11 |
12 | Kubernetes
13 | ^^^^^^^^^^
14 |
15 | `Kubernetes `_ is an extremely popular project for managing cloud workloads and is part of the broader `Cloud Native Computing Foundation (CNCF) `_ ecosystem.
16 |
17 | Dask has many options for `deploying clusters on Kubernetes `_.
18 |
19 | HPC on Cloud
20 | ^^^^^^^^^^^^
21 |
22 | Many popular HPC scheduling tools are used on the cloud and support features such as elastic scaling.
23 | If you are already leveraging HPC tools like `SLURM on the cloud `_ then `Dask has great integration with HPC schedulers `_.
24 |
25 | Hadoop/Spark/Yarn
26 | ^^^^^^^^^^^^^^^^^
27 |
28 | Many cloud platforms have popular managed services for running Apache Spark workloads.
29 |
30 | If you're already using a managed map-reduce service like `Amazon EMR `_ then check out `dask-yarn `_.
31 |
32 | Nebari
33 | ^^^^^^
34 |
35 | `Nebari `_ is an open source data science platform which can be run locally or on a cloud platform of your choice.
36 | It includes a managed Dask service built on `Dask Gateway `_ for managing Dask clusters.
37 |
38 | Managed Services
39 | ----------------
40 |
41 | Cloud vendors and third-party companies also offer managed Dask clusters as a service.
42 |
43 | Coiled
44 | ^^^^^^
45 |
46 | `Coiled `_ is a mature managed Dask service that spawns clusters in your cloud account and allows you to manage them via a central control plane.
47 |
48 | Saturn Cloud
49 | ^^^^^^^^^^^^
50 |
51 | `Saturn Cloud `_ is a managed data science platform with hosted Dask clusters or the option to deploy them in your own AWS account.
52 |
--------------------------------------------------------------------------------
/doc/source/security.rst:
--------------------------------------------------------------------------------
1 | Security
2 | ========
3 |
4 | Dask Cloudprovider aims to balance ease of use with security best practices. The two are not always compatible, so this document outlines the compromises and decisions made in this library.
5 |
6 | Public Schedulers
7 | -----------------
8 |
9 | For each cluster manager to work correctly it must be able to make a connection to the Dask scheduler on port ``8786``.
10 | In many cluster managers the default option is to expose the Dask scheduler and dashboard to the internet via a public IP address.
11 | This makes things quick and easy for new users to get up and running, but may pose a security risk long term.
12 |
13 | Many organisations have policies which do not allow users to assign public IP addresses or open ports. Our best-practice
14 | advice is to use Dask Cloudprovider from within the cloud platform itself, either from a VM or a managed environment, and then
15 | disable public networking. For example:
16 |
17 | .. code-block:: python
18 |
19 | >>> import dask.config, dask_cloudprovider
20 | >>> dask.config.set({"cloudprovider.gcp.public_ingress": False})
21 |
22 | See each cluster manager for configuration options.
23 |
24 | Authentication and encryption
25 | -----------------------------
26 |
27 | Cluster managers such as :class:`dask_cloudprovider.aws.EC2Cluster`, :class:`dask_cloudprovider.azure.AzureVMCluster`,
28 | :class:`dask_cloudprovider.gcp.GCPCluster` and :class:`dask_cloudprovider.digitalocean.DropletCluster` enable certificate based authentication
29 | and encryption by default.
30 |
31 | When a cluster is launched with any of these cluster managers a set of temporary keys will be generated and distributed to the cluster nodes
32 | via their startup script. All communication between the client, scheduler and workers will then be encrypted and only clients and workers with
33 | valid certificates will be able to connect to the scheduler.
34 |
35 | You can also specify your own certificates using the :class:`distributed.security.Security` object.
36 |
37 | .. code-block:: python
38 |
39 | >>> from dask_cloudprovider.gcp import GCPCluster
40 | >>> from dask.distributed import Client
41 | >>> from distributed.security import Security
42 | >>> sec = Security(tls_ca_file='cluster_ca.pem',
43 | ... tls_client_cert='cli_cert.pem',
44 | ... tls_client_key='cli_key.pem',
45 | ... require_encryption=True)
46 | >>> cluster = GCPCluster(n_workers=1, security=sec)
47 | >>> client = Client(cluster)
48 | >>> client
49 |
50 |
51 | You can disable secure connections by setting the ``security`` keyword argument to ``False``. This may be desirable when troubleshooting or
52 | when running on a trusted network (entirely inside a VPC for example).
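   |
   | For example, a minimal sketch with all other options left at their defaults (shown here with ``GCPCluster``, but the same keyword applies to the other VM-based cluster managers listed above):
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.gcp import GCPCluster
   |     >>> cluster = GCPCluster(n_workers=1, security=False)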
53 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/timeout.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import warnings
3 |
4 |
5 | class TimeoutException(RuntimeError):
6 | """Raised when a loop times out."""
7 |
8 |
9 | class Timeout:
10 | """A timeout object for use in ``while True`` loops instead of ``True``.
11 |
12 | Create an instance of this class before beginning an infinite loop and
13 | call ``run()`` instead of ``True``.
14 |
15 |
16 | Parameters
17 | ----------
18 | timeout: int
19 | Seconds before loop should timeout.
20 |
21 | error_message: str
22 | Error message to raise in an exception if timeout occurs.
23 |
24 | warn: bool
25 | Only raise a warning instead of a TimeoutException.
26 |
27 | Default ``False``.
   |
28 | Examples
29 | --------
30 | >>> timeout = Timeout(10, "Oh no! We timed out.")
31 | >>> while timeout.run():
32 | ... time.sleep(1) # Will timeout after 10 iterations
33 | TimeoutException: Oh no! We timed out.
34 |
35 | You can also pass an exception to raise if you are suppressing for a set
36 | amount of time.
37 |
38 | >>> timeout = Timeout(10, "Oh no! We timed out.")
39 | >>> while timeout.run():
40 | ... try:
41 | ... some_function_that_raises()
42 | ... break
43 | ... except Exception as e:
44 | ... timeout.set_exception(e)
45 | ... time.sleep(1) # Will timeout after 10 iterations
46 | Exception: The exception from ``some_function_that_raises``
47 |
48 |
49 | """
50 |
51 | def __init__(self, timeout, error_message, warn=False):
52 | self.start = None
53 | self.running = False
54 | self.timeout = timeout
55 | self.error_message = error_message
56 | self.warn = warn
57 | self.exception = TimeoutException(self.error_message)
58 |
59 | def run(self):
60 | """Run the timeout.
61 |
62 | When called repeatedly, this method will return ``True`` until the
63 | timeout has elapsed. It will then raise the exception, or warn and return ``False`` if ``warn`` is set.
64 | """
65 | if not self.running:
66 | self.start = datetime.now()
67 | self.running = True
68 |
69 | if self.start + timedelta(seconds=self.timeout) < datetime.now():
70 | if self.warn:
71 | warnings.warn(self.error_message)
72 | return False
73 | else:
74 | raise self.exception
75 | return True
76 |
77 | def set_exception(self, e):
78 | """Modify the default timeout exception.
79 |
80 | This would be useful if you are trying something repeatedly but if it
81 | never succeeds before the timeout you want to raise the exception from
82 | the thing you are trying rather than a TimeoutException.
83 | """
84 | self.exception = e
85 |
--------------------------------------------------------------------------------
/dask_cloudprovider/ibm/tests/test_code_engine.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | codeengine = pytest.importorskip("ibm_code_engine_sdk.code_engine_v2")
6 |
7 | from dask_cloudprovider.ibm.code_engine import IBMCodeEngineCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials():
13 | if dask.config.get("cloudprovider.ibm.api_key") is None:
14 | pytest.skip(
15 | """
16 | You must configure an IBM API key to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | ibm:
23 | api_key: "your_api_key"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key"
28 |
29 | """
30 | )
31 |
32 | if dask.config.get("cloudprovider.ibm.project_id") is None:
33 | pytest.skip(
34 | """
35 | You must configure an IBM project id to run this test.
36 |
37 | Either set this in your config
38 |
39 | # cloudprovider.yaml
40 | cloudprovider:
41 | ibm:
42 | project_id: "your_project_id"
43 |
44 | Or by setting it as an environment variable
45 |
46 | export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id"
47 |
48 | """
49 | )
50 |
51 | if dask.config.get("cloudprovider.ibm.region") is None:
52 | pytest.skip(
53 | """
54 | You must configure an IBM region to run this test.
55 |
56 | Either set this in your config
57 |
58 | # cloudprovider.yaml
59 | cloudprovider:
60 | ibm:
61 | region: "your_region"
62 |
63 | Or by setting it as an environment variable
64 |
65 | export DASK_CLOUDPROVIDER__IBM__REGION="your_region"
66 |
67 | """
68 | )
69 |
70 |
71 | @pytest.mark.asyncio
72 | async def test_init():
73 | await skip_without_credentials()
74 | cluster = IBMCodeEngineCluster(asynchronous=True)
75 | assert cluster.status == Status.created
76 |
77 |
78 | @pytest.mark.asyncio
79 | @pytest.mark.timeout(1200)
80 | @pytest.mark.external
81 | async def test_create_cluster():
82 | async with IBMCodeEngineCluster(asynchronous=True) as cluster:
83 | cluster.scale(2)
84 | await cluster
85 | assert len(cluster.workers) == 2
86 |
87 | async with Client(cluster, asynchronous=True) as client:
88 |
89 | def inc(x):
90 | return x + 1
91 |
92 | assert await client.submit(inc, 10).result() == 11
93 |
94 |
95 | @pytest.mark.asyncio
96 | @pytest.mark.timeout(1200)
97 | @pytest.mark.external
98 | async def test_create_cluster_sync():
99 | with IBMCodeEngineCluster() as cluster:
100 | with Client(cluster) as client:
101 | cluster.scale(1)
102 | client.wait_for_workers(1)
103 | assert len(cluster.workers) == 1
104 |
105 | def inc(x):
106 | return x + 1
107 |
108 | assert client.submit(inc, 10).result() == 11
109 |
--------------------------------------------------------------------------------
/doc/source/openstack.rst:
--------------------------------------------------------------------------------
1 | OpenStack
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.openstack
5 |
6 | .. autosummary::
7 | OpenStackCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with the OpenStack Identity service (Keystone):
16 |
17 | 1) Get your Authentication URL (auth_url) for OpenStack Identity service (Keystone) and put it in your Dask configuration at ``cloudprovider.openstack.auth_url``.
18 |
19 | 2) Get your `region `_ and put it in your Dask configuration at ``cloudprovider.openstack.region``.
   |
20 | .. code-block:: console
21 |
22 | $ openstack region list
23 | +-----------+---------------+-------------+
24 | | Region | Parent Region | Description |
25 | +-----------+---------------+-------------+
26 | | RegionOne | None | |
27 | +-----------+---------------+-------------+
28 |
29 | 3) Generate an `application credential `_.
30 |
31 | .. code-block:: console
32 |
33 | $ openstack application credential create dask --unrestricted
34 | +--------------+----------------------------------------------------------------------------------------+
35 | | Field | Value |
36 | +--------------+----------------------------------------------------------------------------------------+
37 | | description | None |
38 | | expires_at | None |
39 | | id | 0a0372dbedfb4e82ab66449c3316ef1e |
40 | | name | dask |
41 | | project_id | e99b6f4b9bf84a9da27e20c9cbfe887a |
42 | | roles | Member anotherrole |
43 | | secret | ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg |
44 | | unrestricted | True |
45 | +--------------+----------------------------------------------------------------------------------------+
46 |
47 | and put ``application_credential_id`` and ``application_credential_secret`` in your Dask configuration at ``cloudprovider.openstack.application_credential_id``
48 | and ``cloudprovider.openstack.application_credential_secret``.
49 |
50 | All of these variables can be gathered from either an `OpenStack RC file `_
51 | or a `clouds.yaml file `_.
52 |
53 | Example Config File
54 | ^^^^^^^^^^^^^^
55 | .. code-block:: yaml
56 |
57 | # ~/.config/dask/cloudprovider.yaml
58 |
59 | cloudprovider:
60 | openstack:
61 | region: "RegionOne"
62 | auth_url: "https://cloud.home.karatosun.xyz:5000"
63 | application_credential_id: "0a0372dbedfb4e82ab66449c3316ef1e"
64 | application_credential_secret: "ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg"
65 | auth_type: "v3applicationcredential"
66 |
67 | You can also export them as environment variables.
68 |
69 | .. code-block:: console
70 |
71 | $ export DASK_CLOUDPROVIDER__OPENSTACK__APPLICATION_CREDENTIAL_ID="0a0372dbedfb4e82ab66449c3316ef1e"
72 |
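   | With the configuration above in place, a minimal usage sketch looks like this (the worker count is arbitrary):
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.openstack import OpenStackCluster
   |     >>> from dask.distributed import Client
   |     >>> cluster = OpenStackCluster(n_workers=1)
   |     >>> client = Client(cluster)
   |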
73 |
74 | .. autoclass:: OpenStackCluster
75 | :members:
76 |
--------------------------------------------------------------------------------
/dask_cloudprovider/azure/tests/test_azurevm.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | azure_compute = pytest.importorskip("azure.mgmt.compute")
6 |
7 | from dask_cloudprovider.azure import AzureVMCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | def skip_without_credentials(func):
13 | rg = dask.config.get("cloudprovider.azure.resource_group", None)
14 | vnet = dask.config.get("cloudprovider.azure.azurevm.vnet", None)
15 | security_group = dask.config.get("cloudprovider.azure.azurevm.security_group", None)
16 | location = dask.config.get("cloudprovider.azure.location", None)
17 | if rg is None or vnet is None or security_group is None or location is None:
18 | return pytest.mark.skip(
19 | reason="""
20 | You must configure your Azure resource group and vnet to run this test.
21 |
22 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION=""
23 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP=""
24 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET=""
25 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP=""
26 |
27 | """
28 | )(func)
29 | return func
30 |
31 |
32 | async def get_config():
33 | return dask.config.get("cloudprovider.azure", {})
34 |
35 |
36 | @pytest.mark.asyncio
37 | @skip_without_credentials
38 | @pytest.mark.external
39 | async def test_init():
40 | cluster = AzureVMCluster(asynchronous=True)
41 | assert cluster.status == Status.created
42 |
43 |
44 | @pytest.mark.asyncio
45 | @pytest.mark.timeout(1200)
46 | @skip_without_credentials
47 | @pytest.mark.external
48 | async def test_create_cluster():
49 | async with AzureVMCluster(asynchronous=True) as cluster:
50 | assert cluster.status == Status.running
51 |
52 | cluster.scale(2)
53 | await cluster
54 | assert len(cluster.workers) == 2
55 |
56 | async with Client(cluster, asynchronous=True) as client:
57 |
58 | def inc(x):
59 | return x + 1
60 |
61 | assert await client.submit(inc, 10).result() == 11
62 |
63 |
64 | @pytest.mark.asyncio
65 | @pytest.mark.timeout(1200)
66 | @skip_without_credentials
67 | @pytest.mark.external
68 | async def test_create_cluster_sync():
69 | with AzureVMCluster() as cluster:
70 | with Client(cluster) as client:
71 | cluster.scale(1)
72 | client.wait_for_workers(1)
73 | assert len(cluster.workers) == 1
74 |
75 | def inc(x):
76 | return x + 1
77 |
78 | assert client.submit(inc, 10).result() == 11
79 |
80 |
81 | @pytest.mark.asyncio
82 | @pytest.mark.timeout(1200)
83 | @skip_without_credentials
84 | @pytest.mark.external
85 | async def test_create_rapids_cluster_sync():
86 | with AzureVMCluster(
87 | vm_size="Standard_NC12s_v3",
88 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
89 | worker_class="dask_cuda.CUDAWorker",
90 | worker_options={"rmm_pool_size": "15GB"},
91 | ) as cluster:
92 | with Client(cluster) as client:
93 | cluster.scale(1)
94 | client.wait_for_workers(1)
95 |
96 | def gpu_mem():
97 | from pynvml.smi import nvidia_smi
98 |
99 | nvsmi = nvidia_smi.getInstance()
100 | return nvsmi.DeviceQuery("memory.free, memory.total")
101 |
102 | results = client.run(gpu_mem)
103 | for w, res in results.items():
104 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
105 | print(res)
106 |
107 |
108 | @pytest.mark.asyncio
109 | @skip_without_credentials
110 | async def test_render_cloud_init():
111 | cloud_init = AzureVMCluster.get_cloud_init(docker_args="--privileged")
112 | assert " --privileged " in cloud_init
113 |
114 | cloud_init = AzureVMCluster.get_cloud_init(
115 | docker_image="foo/bar:baz",
116 | extra_bootstrap=["echo 'hello world'", "echo 'foo bar'"],
117 | )
118 | assert "foo/bar:baz" in cloud_init
119 | assert "- echo 'hello world'" in cloud_init
120 | assert "- echo 'foo bar'" in cloud_init
121 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | Dask Cloud Provider
2 | ===================
3 |
4 | *Native Cloud integration for Dask.*
5 |
6 | This package contains open source tools to help you deploy and operate Dask clusters on the cloud.
7 | It contains cluster managers which can help you launch clusters using native cloud resources like VMs or containers.
8 | It also has tools and plugins for use in *any* cluster running on the cloud, and it is a great source of documentation for Dask cloud deployments.
9 |
10 | It is by no means the "complete" or "only" way to run Dask on the cloud, check out the :doc:`alternatives` page for more tools.
11 |
12 | Cluster managers
13 | ----------------
14 |
15 | This package provides classes for constructing and managing ephemeral Dask clusters on various
16 | cloud platforms.
17 |
18 | Dask Cloud Provider is one of many options for deploying Dask clusters, see `Deploying Dask `_ in the Dask documentation for an overview of additional options.
19 |
20 | To use a cloud provider cluster manager you can import it and instantiate it. Instantiating the class
21 | will result in cloud resources being created for you.
22 |
23 | .. code-block:: python
24 |
25 | from dask_cloudprovider.aws import FargateCluster
26 | cluster = FargateCluster(
27 | # Cluster manager specific config kwargs
28 | )
29 |
30 | You can then construct a Dask client with that cluster object to use the cluster.
31 |
32 | .. code-block:: python
33 |
34 | from dask.distributed import Client
35 | client = Client(cluster)
36 |
37 | Once you are connected to the cluster you can go ahead and use Dask and all computation will take
38 | place on your cloud resource.
39 |
40 | Once you are finished be sure to close out your cluster to shut down any cloud resources you have and end any charges.
41 |
42 | .. code-block:: python
43 |
44 | cluster.close()
45 |
46 | .. warning::
47 |
48 | Cluster managers will attempt to automatically remove hanging cloud resources on garbage collection if the cluster
49 | object is destroyed without calling ``cluster.close()``; however, this is not guaranteed.
50 |
51 | To implicitly close your cluster when you are done with it you can optionally construct the cluster manager via a
52 | context manager. However, this will result in the creation and destruction of the whole cluster whenever you run
53 | this code.
54 |
55 | .. code-block:: python
56 |
57 | from dask_cloudprovider.aws import FargateCluster
58 | from dask.distributed import Client
59 |
60 | with FargateCluster(...) as cluster:
61 | with Client(cluster) as client:
62 | # Do some Dask things
63 |
64 | Plugins
65 | -------
66 |
67 | Dask components like Schedulers and Workers can benefit from being cloud-aware.
68 | This project has plugins and tools that extend these components.
69 |
70 | One example is having the workers check for termination warnings when running on ephemeral/spot instances and begin migrating data to other workers.
71 |
72 | For Azure VMs you could use the :class:`dask_cloudprovider.azure.AzurePreemptibleWorkerPlugin` to do this.
73 | It can be used on any cluster that has workers running on Azure VMs, not just ones created with :class:`dask_cloudprovider.azure.AzureVMCluster`.
74 |
75 | .. code-block:: python
76 |
77 | from distributed import Client
78 | client = Client("")
79 |
80 | from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin
81 | client.register_worker_plugin(AzurePreemptibleWorkerPlugin())
82 |
83 |
84 | .. toctree::
85 | :maxdepth: 2
86 | :hidden:
87 | :caption: Overview
88 |
89 | installation.rst
90 | config.rst
91 | alternatives.rst
92 |
93 | .. toctree::
94 | :maxdepth: 2
95 | :hidden:
96 | :caption: Providers
97 |
98 | aws.rst
99 | digitalocean.rst
100 | gcp.rst
101 | azure.rst
102 | hetzner.rst
103 | ibm.rst
104 | openstack.rst
105 | nebius.rst
106 |
107 | .. toctree::
108 | :maxdepth: 2
109 | :hidden:
110 | :caption: Advanced
111 |
112 | troubleshooting.rst
113 | security.rst
114 | gpus.rst
115 | packer.rst
116 |
117 | .. toctree::
118 | :maxdepth: 2
119 | :hidden:
120 | :caption: Developer
121 |
122 | testing.rst
123 | releasing.rst
124 |
--------------------------------------------------------------------------------
/doc/source/troubleshooting.rst:
--------------------------------------------------------------------------------
1 | Troubleshooting
2 | ===============
3 |
4 | This document covers frequently encountered problems and how to troubleshoot them.
5 |
6 | Unable to connect to scheduler
7 | ------------------------------
8 |
9 | The most common issue is not being able to connect to the cluster once it has been constructed.
10 |
11 | Each cluster manager will construct a Dask scheduler and by default expose it via a public IP address. You must be able
12 | to connect to that address on ports ``8786`` and ``8787`` from wherever your Python session is.
13 |
14 | If you are unable to connect to this address it is likely that there is something wrong with your network configuration,
15 | for example you may have corporate policies implementing additional firewall rules on your account.
16 |
17 | To reduce the chances of this happening it is often simplest to run Dask Cloudprovider from within the cloud you are trying
18 | to use and configure private networking only. See your specific cluster manager docs for info.
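   |
   | As a quick check, you can test whether the scheduler port is reachable from your Python session (the address below is a placeholder for your scheduler's public IP):
   |
   | .. code-block:: python
   |
   |     import socket
   |
   |     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
   |     sock.settimeout(5)
   |     # connect_ex returns 0 if a TCP connection could be opened
   |     print(sock.connect_ex(("<scheduler public ip>", 8786)))
   |     sock.close()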
19 |
20 | Invalid CPU or Memory
21 | ---------------------
22 |
23 | When working with ``FargateCluster`` or ``ECSCluster``, CPU and memory arguments can only take values from a fixed set of combinations.
24 |
25 | So, for example, code like this will result in an error
26 |
27 | .. code-block:: python
28 |
29 | from dask_cloudprovider.aws import FargateCluster
30 | cluster = FargateCluster(
31 | image="daskdev/dask:latest",
32 | worker_cpu=256,
33 | worker_mem=30720,
34 | n_workers=2,
35 | fargate_use_private_ip=False,
36 | scheduler_timeout="15 minutes"
37 | )
38 | client = Client(cluster)
39 | cluster
40 |
41 | # botocore.errorfactory.ClientException:
42 | # An error occurred (ClientException) when calling the RegisterTaskDefinition operation:
43 | # No Fargate configuration exists for given values.
44 |
45 |
46 | This is because ECS and Fargate task definitions with ``CPU=256`` cannot have as much memory as that code is requesting.
47 |
48 | The AWS-accepted set of combinations is documented at
49 | https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html.
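   |
   | For example, keeping ``worker_cpu=256`` but lowering the memory request to a value Fargate accepts for that CPU size (512, 1024 or 2048 at the time of writing) avoids the error above:
   |
   | .. code-block:: python
   |
   |     from dask_cloudprovider.aws import FargateCluster
   |     cluster = FargateCluster(
   |         image="daskdev/dask:latest",
   |         worker_cpu=256,
   |         worker_mem=512,  # a memory value valid for 256 CPU units
   |         n_workers=2,
   |     )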
50 |
51 | Requested CPU Configuration Above Limit
52 | ---------------------------------------
53 | When creating a ``FargateCluster`` or ``ECSCluster``, or adding additional workers, you may receive an error response with
54 | "The requested CPU configuration is above your limit". This means that the scheduler and workers requested and any other
55 | EC2 resources you have running in that region use up more than your current service quota
56 | `limit for vCPUs `_.
57 |
58 | You can adjust the scheduler and/or worker CPUs with the ``scheduler_cpu`` and ``worker_cpu``
59 | `arguments `_. See the "Invalid CPU or Memory"
60 | section in this document for more information.
61 |
62 | However, to get the desired cluster configuration you'll need to request a service limit quota increase.
63 |
64 | Go to ``https://.aws.amazon.com/servicequotas/home/services/ec2/quotas`` and
65 | `request an increase `_ for
66 | "Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances".
67 |
68 | Pulling private Docker images
69 | -----------------------------------
70 |
71 | For cluster managers like ``EC2Cluster``, ``AzureVMCluster`` and ``GCPCluster`` Docker images will be pulled onto VMs created on the cloud of your choice.
72 |
73 | If you need to pull a private Docker image which requires authentication, each VM will need to be configured with credentials. These cluster managers accept
74 | an ``extra_bootstrap`` argument where you can provide additional bash commands to be run during startup. This is a good place to log into your Docker registry.
75 |
76 | .. code-block:: python
77 |
78 | from dask_cloudprovider.azure import AzureVMCluster
79 | cluster = AzureVMCluster(...
80 | docker_image="my_private_image:latest",
81 | extra_bootstrap=["docker login -u 'username' -p 'password'"])
82 |
83 | If you need to access Artifact/Container Registry in GCP, one way of doing it would be to authenticate Docker with
84 | `gcloud credential helper `_ by adding extra bootstrap params similar to
85 | the ones below:
86 |
87 | .. code-block:: python
88 |
89 | from dask_cloudprovider.gcp import GCPCluster
90 | cluster = GCPCluster(...
91 | docker_image=f"{region}-docker.pkg.dev/{project}/{repo}/{image}:{tag}",
92 | extra_bootstrap=[f"gcloud auth configure-docker {region}-docker.pkg.dev"])
93 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/helper.py:
--------------------------------------------------------------------------------
1 | """Helper functions for working with AWS services."""
2 |
3 | from datetime import datetime
4 |
5 | DEFAULT_SECURITY_GROUP_NAME = "dask-default"
6 |
7 |
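   | # Example: dict_to_aws({"hello": "world"}) returns [{"key": "hello", "value": "world"}],
   | # or [{"Key": "hello", "Value": "world"}] when upper=True.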
8 | def dict_to_aws(py_dict, upper=False, key_string=None, value_string=None):
9 | key_string = key_string or ("Key" if upper else "key")
10 | value_string = value_string or ("Value" if upper else "value")
11 | return [{key_string: key, value_string: value} for key, value in py_dict.items()]
12 |
13 |
14 | def aws_to_dict(aws_dict):
15 | try:
16 | return {item["key"]: item["value"] for item in aws_dict}
17 | except KeyError:
18 | return {item["Key"]: item["Value"] for item in aws_dict}
19 |
20 |
21 | # https://aws.amazon.com/blogs/messaging-and-targeting/how-to-handle-a-throttling-maximum-sending-rate-exceeded-error/
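   | # Example: get_sleep_duration(3) returns 0.09 (10 ms * 3**2 = 90 ms, well under the 5000 ms cap).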
22 | def get_sleep_duration(current_try, min_sleep_millis=10, max_sleep_millis=5000):
23 | current_try = max(1, current_try)
24 | current_sleep_millis = min_sleep_millis * current_try**2
25 | return min(current_sleep_millis, max_sleep_millis) / 1000 # return in seconds
26 |
27 |
28 | class ConfigMixin:
29 | def update_attr_from_config(self, attr: str, private: bool):
30 | """Update class attribute of given cluster based on config, if not already set. If `private` is True, the class
31 | attribute will be prefixed with an underscore.
32 |
33 | This mixin can be applied to any class that has a config dict attribute.
34 | """
35 | prefix = "_" if private else ""
36 | if getattr(self, f"{prefix}{attr}") is None:
37 | setattr(self, f"{prefix}{attr}", self.config.get(attr))
38 |
39 |
40 | async def get_latest_ami_id(client, name_glob, owner):
41 | images = await client.describe_images(
42 | Filters=[
43 | {"Name": "name", "Values": [name_glob]},
44 | {"Name": "owner-id", "Values": [owner]},
45 | ]
46 | )
47 | creation_date = None
48 | image_id = None
49 |
50 | for image in images["Images"]:
51 | image_date = datetime.strptime(image["CreationDate"], "%Y-%m-%dT%H:%M:%S.%fZ")
52 | if creation_date is None or creation_date < image_date:
53 | image_id = image["ImageId"]
54 | creation_date = image_date
55 | return image_id
56 |
57 |
58 | async def get_default_vpc(client):
59 | vpcs = (await client.describe_vpcs())["Vpcs"]
60 | [vpc] = [vpc for vpc in vpcs if vpc["IsDefault"]]
61 | return vpc["VpcId"]
62 |
63 |
64 | async def get_vpc_subnets(client, vpc):
65 | vpcs = (await client.describe_vpcs())["Vpcs"]
66 | [vpc] = [x for x in vpcs if x["VpcId"] == vpc]
67 | subnets = (await client.describe_subnets())["Subnets"]
68 | return [subnet["SubnetId"] for subnet in subnets if subnet["VpcId"] == vpc["VpcId"]]
69 |
70 |
71 | async def get_security_group(client, vpc, create_default=True):
72 | try:
73 | response = await client.describe_security_groups(
74 | GroupNames=[DEFAULT_SECURITY_GROUP_NAME]
75 | )
76 | groups = response["SecurityGroups"]
77 | except Exception:
78 | groups = []
79 | if len(groups) > 0:
80 | return groups[0]["GroupId"]
81 | else:
82 | if create_default:
83 | try:
84 | return await create_default_security_group(
85 | client, DEFAULT_SECURITY_GROUP_NAME, vpc
86 | )
87 | except Exception as e:
88 | raise RuntimeError(
89 | "Unable to create default security group. Please specify manually."
90 | ) from e
91 | else:
92 | raise RuntimeError(
93 | "Unable to find suitable security group. Please specify manually."
94 | )
95 |
96 |
97 | async def create_default_security_group(client, group_name, vpc, tags=None):
98 | response = await client.create_security_group(
99 | Description="A default security group for Dask",
100 | GroupName=group_name,
101 | VpcId=vpc,
102 | TagSpecifications=[
103 | {
104 | "ResourceType": "security-group",
105 | "Tags": [
106 | {"Key": k, "Value": v}
107 | for k, v in (tags or {}).items()
108 | if k and v # Filter out empty tags
109 | ],
110 | }
111 | ],
112 | DryRun=False,
113 | )
114 |
115 | await client.authorize_security_group_ingress(
116 | GroupId=response["GroupId"],
117 | IpPermissions=[
118 | {
119 | "IpProtocol": "TCP",
120 | "FromPort": 8786,
121 | "ToPort": 8787,
122 | "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "Anywhere"}],
123 | "Ipv6Ranges": [{"CidrIpv6": "::/0", "Description": "Anywhere"}],
124 | },
125 | {
126 | "IpProtocol": "TCP",
127 | "FromPort": 0,
128 | "ToPort": 65535,
129 | "UserIdGroupPairs": [{"GroupId": response["GroupId"]}],
130 | },
131 | ],
132 | DryRun=False,
133 | )
134 |
135 | return response["GroupId"]
136 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/tests/test_gcp.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 | from dask_cloudprovider.gcp.instances import (
5 | GCPCluster,
6 | GCPCompute,
7 | GCPCredentialsError,
8 | )
9 | from dask.distributed import Client
10 | from distributed.core import Status
11 |
12 |
13 | def skip_without_credentials():
14 | try:
15 | _ = GCPCompute()
16 | except GCPCredentialsError:
17 | pytest.skip(
18 | """
19 | You must configure your GCP credentials to run this test.
20 |
21 | $ gcloud auth login
22 |
23 | or
24 |
25 | $ export GOOGLE_APPLICATION_CREDENTIALS=
26 |
27 | """
28 | )
29 |
30 | if not dask.config.get("cloudprovider.gcp.projectid"):
31 | pytest.skip(
32 | """
33 | You must configure your Google project ID to run this test.
34 |
35 | # ~/.config/dask/cloudprovider.yaml
36 | cloudprovider:
37 | gcp:
38 | projectid: "YOUR PROJECT ID"
39 |
40 | or
41 |
42 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID"
43 |
44 | """
45 | )
46 |
47 |
48 | @pytest.mark.asyncio
49 | async def test_init():
50 | skip_without_credentials()
51 |
52 | cluster = GCPCluster(asynchronous=True)
53 | assert cluster.status == Status.created
54 |
55 |
56 | @pytest.mark.asyncio
57 | async def test_get_cloud_init():
58 | skip_without_credentials()
59 | cloud_init = GCPCluster.get_cloud_init(
60 | security=True,
61 | docker_args="--privileged",
62 | extra_bootstrap=["gcloud auth print-access-token"],
63 | )
64 | assert "dask-scheduler" in cloud_init
65 | assert "# Bootstrap" in cloud_init
66 | assert " --privileged " in cloud_init
67 | assert "- gcloud auth print-access-token" in cloud_init
68 |
69 |
70 | @pytest.mark.asyncio
71 | @pytest.mark.timeout(1200)
72 | @pytest.mark.external
73 | async def test_create_cluster():
74 | skip_without_credentials()
75 |
76 | async with GCPCluster(
77 | asynchronous=True, env_vars={"FOO": "bar"}, security=True
78 | ) as cluster:
79 | assert cluster.status == Status.running
80 |
81 | cluster.scale(2)
82 | await cluster
83 | assert len(cluster.workers) == 2
84 |
85 | async with Client(cluster, asynchronous=True) as client:
86 |
87 | def inc(x):
88 | return x + 1
89 |
90 | def check_env():
91 | import os
92 |
93 | return os.environ["FOO"]
94 |
95 | assert await client.submit(inc, 10).result() == 11
96 | assert await client.submit(check_env).result() == "bar"
97 |
98 |
99 | @pytest.mark.asyncio
100 | @pytest.mark.timeout(1200)
101 | @pytest.mark.external
102 | async def test_create_cluster_sync():
103 | skip_without_credentials()
104 |
105 | cluster = GCPCluster(n_workers=1)
106 | client = Client(cluster)
107 |
108 | def inc(x):
109 | return x + 1
110 |
111 | assert client.submit(inc, 10).result() == 11
112 |
113 |
114 | @pytest.mark.asyncio
115 | @pytest.mark.timeout(1200)
116 | @pytest.mark.external
117 | async def test_create_rapids_cluster():
118 | skip_without_credentials()
119 |
120 | async with GCPCluster(
121 | source_image="projects/nv-ai-infra/global/images/ngc-docker-11-20200916",
122 | zone="us-east1-c",
123 | machine_type="n1-standard-1",
124 | filesystem_size=50,
125 | ngpus=2,
126 | gpu_type="nvidia-tesla-t4",
127 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
128 | worker_class="dask_cuda.CUDAWorker",
129 | worker_options={"rmm_pool_size": "15GB"},
130 | asynchronous=True,
131 | auto_shutdown=True,
132 | bootstrap=False,
133 | ) as cluster:
134 | assert cluster.status == Status.running
135 |
136 | cluster.scale(1)
137 |
138 | await cluster
139 |
140 | assert len(cluster.workers) == 1
141 |
142 | client = Client(cluster, asynchronous=True) # noqa
143 | await client
144 |
145 | def gpu_mem():
146 | from pynvml.smi import nvidia_smi
147 |
148 | nvsmi = nvidia_smi.getInstance()
149 | return nvsmi.DeviceQuery("memory.free, memory.total")
150 |
151 | results = await client.run(gpu_mem)
152 | for w, res in results.items():
153 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
154 | print(res)
155 |
156 |
157 | @pytest.mark.timeout(1200)
158 | @pytest.mark.external
159 | def test_create_rapids_cluster_sync():
160 | skip_without_credentials()
161 | cluster = GCPCluster(
162 | source_image="projects/nv-ai-infra/global/images/packer-1607527229",
163 | network="dask-gcp-network-test",
164 | zone="us-east1-c",
165 | machine_type="n1-standard-1",
166 | filesystem_size=50,
167 | ngpus=2,
168 | gpu_type="nvidia-tesla-t4",
169 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
170 | worker_class="dask_cuda.CUDAWorker",
171 | worker_options={"rmm_pool_size": "15GB"},
172 | asynchronous=False,
173 | bootstrap=False,
174 | )
175 |
176 | cluster.scale(1)
177 |
178 | client = Client(cluster) # noqa
179 | client.wait_for_workers(2)
180 |
181 | def gpu_mem():
182 | from pynvml.smi import nvidia_smi
183 |
184 | nvsmi = nvidia_smi.getInstance()
185 | return nvsmi.DeviceQuery("memory.free, memory.total")
186 |
187 | results = client.run(gpu_mem)
188 | for w, res in results.items():
189 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
190 | print(res)
191 | cluster.close()
192 |
--------------------------------------------------------------------------------
/doc/source/azure.rst:
--------------------------------------------------------------------------------
1 | Microsoft Azure
2 | ===============
3 |
4 | .. currentmodule:: dask_cloudprovider.azure
5 |
6 | .. autosummary::
7 | AzureVMCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | In order to create clusters on Azure you need to set your authentication credentials.
16 | You can do this via the ``az`` `command line tool `_.
17 |
18 | .. code-block:: console
19 |
20 | $ az login
21 |
22 | .. note::
23 |
24 | Setting the default output to ``table`` with ``az configure`` will make the ``az`` tool much easier to use.
25 |
26 | Resource Groups
27 | ^^^^^^^^^^^^^^^
28 |
29 | To create resources on Azure they must be placed in a resource group. Dask Cloudprovider will need a group to create
30 | Dask components in.
31 |
32 | You can list existing groups via the cli.
33 |
34 | .. code-block:: console
35 |
36 | $ az group list
37 |
38 | You can also create a new resource group if you do not have an existing one.
39 |
40 | .. code-block:: console
41 |
42 | $ az group create --location --name --subscription
43 |
44 | You can get a full list of locations with ``az account list-locations`` and subscriptions with ``az account list``.
45 |
46 | Take note of your resource group name for later.
47 |
48 | Virtual Networks
49 | ^^^^^^^^^^^^^^^^
50 |
51 | Compute resources on Azure must be placed in virtual networks (vnet). Dask Cloudprovider will require an existing vnet to connect
52 | compute resources to.
53 |
54 | You can list existing vnets via the cli.
55 |
56 | .. code-block:: console
57 |
58 | $ az network vnet list
59 |
60 | You can also create a new vnet via the cli.
61 |
62 | .. code-block:: console
63 |
64 | $ az network vnet create -g -n --address-prefix 10.0.0.0/16 \
65 | --subnet-name --subnet-prefix 10.0.0.0/24
66 |
67 | This command will create a new vnet in your resource group containing one subnet with the ``10.0.0.0/24`` prefix. For more than 255 compute resources you will need additional subnets.
68 |
69 | Take note of your vnet name for later.
70 |
71 | Security Groups
72 | ^^^^^^^^^^^^^^^
73 |
74 | To allow network traffic to reach your Dask cluster you will need to create a security group which allows traffic on ports 8786-8787 from wherever you are.
75 |
76 | You can list existing security groups via the cli.
77 |
78 | .. code-block:: console
79 |
80 | $ az network nsg list
81 |
82 | Or you can create a new security group.
83 |
84 | .. code-block:: console
85 |
86 | $ az network nsg create -g --name
87 | $ az network nsg rule create -g --nsg-name -n MyNsgRuleWithAsg \
88 | --priority 500 --source-address-prefixes Internet --destination-port-ranges 8786 8787 \
89 | --destination-address-prefixes '*' --access Allow --protocol Tcp --description "Allow Internet to Dask on ports 8786,8787."
90 |
91 | This example allows all traffic to 8786-8787 from the internet. It is recommended you make your rules more restrictive than this by limiting them to your corporate network
92 | or a specific IP.
93 |
94 | Again take note of this security group name for later.
95 |
96 | Extra options
97 | ^^^^^^^^^^^^^
98 |
99 | To further customize the VMs created, you can provide ``extra_vm_options`` to :class:`AzureVMCluster`. For example, to set the identity
100 | of the virtual machines to a (previously created) user assigned identity, create an ``azure.mgmt.compute.models.VirtualMachineIdentity``
101 |
102 | .. code-block:: python
103 |
104 | >>> import os
105 | >>> import azure.identity
106 | >>> import dask_cloudprovider.azure
107 | >>> import azure.mgmt.compute.models
108 |
109 | >>> subscription_id = os.environ["DASK_CLOUDPROVIDER__AZURE__SUBSCRIPTION_ID"]
110 | >>> rg_name = os.environ["DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP"]
111 | >>> identity_name = "dask-cloudprovider-identity"
112 | >>> v = azure.mgmt.compute.models.UserAssignedIdentitiesValue()
113 | >>> user_assigned_identities = {
114 | ... f"/subscriptions/{subscription_id}/resourcegroups/{rg_name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{identity_name}": v
115 | ... }
116 | >>> identity = azure.mgmt.compute.models.VirtualMachineIdentity(
117 | ... type="UserAssigned",
118 | ... user_assigned_identities=user_assigned_identities
119 | ... )
120 |
121 |
122 | And then provide that to :class:`AzureVMCluster`
123 |
124 | .. code-block:: python
125 |
126 | >>> cluster = dask_cloudprovider.azure.AzureVMCluster(extra_vm_options={"identity": identity.as_dict()})
127 | >>> cluster.scale(1)
128 |
129 | Dask Configuration
130 | ^^^^^^^^^^^^^^^^^^
131 |
132 | You'll provide the names or IDs of the Azure resources when you create a :class:`AzureVMCluster`. You can specify
133 | these values manually, or use Dask's `configuration system `_.
134 | For example, the ``resource_group`` value can be specified using an environment variable:
135 |
136 | .. code-block:: console
137 |
138 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP=""
139 | $ python
140 |
141 | Or you can set it in a YAML configuration file.
142 |
143 | .. code-block:: yaml
144 |
145 | cloudprovider:
146 | azure:
147 | resource_group: ""
148 | azurevm:
149 | vnet: ""
150 |
151 | Note that the options controlling the VMs are under the ``cloudprovider.azure.azurevm`` key.
152 |
153 | See :doc:`config` for more.
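   |
   | Alternatively, you can pass these values directly as keyword arguments when constructing the cluster. A minimal sketch with placeholder names:
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.azure import AzureVMCluster
   |     >>> cluster = AzureVMCluster(
   |     ...     location="<location>",
   |     ...     resource_group="<resource group>",
   |     ...     vnet="<vnet>",
   |     ...     security_group="<security group>",
   |     ... )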
154 |
155 | AzureVM
156 | -------
157 |
158 | .. autoclass:: AzureVMCluster
159 | :members:
160 |
161 | Azure Spot Instance Plugin
162 | --------------------------
163 |
164 | .. autoclass:: AzurePreemptibleWorkerPlugin
165 | :members:
166 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/tests/test_ecs.py:
--------------------------------------------------------------------------------
1 | from unittest import mock
2 | from unittest.mock import AsyncMock
3 |
4 | import pytest
5 |
6 | aiobotocore = pytest.importorskip("aiobotocore")
7 |
8 |
9 | def test_import():
10 | from dask_cloudprovider.aws import ECSCluster # noqa
11 | from dask_cloudprovider.aws import FargateCluster # noqa
12 |
13 |
14 | def test_reuse_ecs_cluster():
15 | from dask_cloudprovider.aws import ECSCluster # noqa
16 |
17 | fc1_name = "Spooky"
18 | fc2_name = "Weevil"
19 | vpc_name = "MyNetwork"
20 | vpc_subnets = ["MySubnet1", "MySubnet2"]
21 | cluster_arn = "CompletelyMadeUp"
22 | cluster_name = "Crunchy"
23 | log_group_name = "dask-ecs"
24 |
25 | expected_execution_role_name1 = f"dask-{fc1_name}-execution-role"
26 | expected_task_role_name1 = f"dask-{fc1_name}-task-role"
27 | expected_log_stream_prefix1 = f"{cluster_name}/{fc1_name}"
28 | expected_security_group_name1 = f"dask-{fc1_name}-security-group"
29 | expected_scheduler_task_name1 = f"dask-{fc1_name}-scheduler"
30 | expected_worker_task_name1 = f"dask-{fc1_name}-worker"
31 |
32 | expected_execution_role_name2 = f"dask-{fc2_name}-execution-role"
33 | expected_task_role_name2 = f"dask-{fc2_name}-task-role"
34 | expected_log_stream_prefix2 = f"{cluster_name}/{fc2_name}"
35 | expected_security_group_name2 = f"dask-{fc2_name}-security-group"
36 | expected_scheduler_task_name2 = f"dask-{fc2_name}-scheduler"
37 | expected_worker_task_name2 = f"dask-{fc2_name}-worker"
38 |
39 | mock_client = AsyncMock()
40 | mock_client.describe_clusters.return_value = {
41 | "clusters": [{"clusterName": cluster_name}]
42 | }
43 | mock_client.list_account_settings.return_value = {"settings": {"value": "enabled"}}
44 | mock_client.create_role.return_value = {"Role": {"Arn": "Random"}}
45 | mock_client.describe_log_groups.return_value = {"logGroups": []}
46 |
47 | class MockSession:
48 | class MockClient:
49 | async def __aenter__(self, *args, **kwargs):
50 | return mock_client
51 |
52 | async def __aexit__(self, *args, **kwargs):
53 | return
54 |
55 | def create_client(self, *args, **kwargs):
56 | return MockSession.MockClient()
57 |
58 | with (
59 | mock.patch(
60 | "dask_cloudprovider.aws.ecs.get_session", return_value=MockSession()
61 | ),
62 | mock.patch("distributed.deploy.spec.SpecCluster._start"),
63 | mock.patch("weakref.finalize"),
64 | ):
65 | # Make ourselves a test cluster
66 | fc1 = ECSCluster(
67 | name=fc1_name,
68 | cluster_arn=cluster_arn,
69 | vpc=vpc_name,
70 | subnets=vpc_subnets,
71 | skip_cleanup=True,
72 | )
73 | # Are we re-using the existing ECS cluster?
74 | assert fc1.cluster_name == cluster_name
75 | # Have we made completely unique AWS resources to run on that cluster?
76 | assert fc1._execution_role_name == expected_execution_role_name1
77 | assert fc1._task_role_name == expected_task_role_name1
78 | assert fc1._cloudwatch_logs_stream_prefix == expected_log_stream_prefix1
79 | assert (
80 | fc1.scheduler_spec["options"]["log_stream_prefix"]
81 | == expected_log_stream_prefix1
82 | )
83 | assert (
84 | fc1.new_spec["options"]["log_stream_prefix"] == expected_log_stream_prefix1
85 | )
86 | assert fc1.cloudwatch_logs_group == log_group_name
87 | assert fc1.scheduler_spec["options"]["log_group"] == log_group_name
88 | assert fc1.new_spec["options"]["log_group"] == log_group_name
89 | sg_calls = mock_client.create_security_group.call_args_list
90 | assert len(sg_calls) == 1
91 | assert sg_calls[0].kwargs["GroupName"] == expected_security_group_name1
92 | td_calls = mock_client.register_task_definition.call_args_list
93 | assert len(td_calls) == 2
94 | assert td_calls[0].kwargs["family"] == expected_scheduler_task_name1
95 | assert td_calls[1].kwargs["family"] == expected_worker_task_name1
96 |
97 | # Reset mocks ready for second cluster
98 | mock_client.create_security_group.reset_mock()
99 | mock_client.register_task_definition.reset_mock()
100 |
101 | # Make ourselves a second test cluster on the same ECS cluster
102 | fc2 = ECSCluster(
103 | name=fc2_name,
104 | cluster_arn=cluster_arn,
105 | vpc=vpc_name,
106 | subnets=vpc_subnets,
107 | skip_cleanup=True,
108 | )
109 | # Are we re-using the existing ECS cluster?
110 | assert fc2.cluster_name == cluster_name
111 | # Have we made completely unique AWS resources to run on that cluster?
112 | assert fc2._execution_role_name == expected_execution_role_name2
113 | assert fc2._task_role_name == expected_task_role_name2
114 | assert fc2._cloudwatch_logs_stream_prefix == expected_log_stream_prefix2
115 | assert (
116 | fc2.scheduler_spec["options"]["log_stream_prefix"]
117 | == expected_log_stream_prefix2
118 | )
119 | assert (
120 | fc2.new_spec["options"]["log_stream_prefix"] == expected_log_stream_prefix2
121 | )
122 | assert fc2.cloudwatch_logs_group == log_group_name
123 | assert fc2.scheduler_spec["options"]["log_group"] == log_group_name
124 | assert fc2.new_spec["options"]["log_group"] == log_group_name
125 | sg_calls = mock_client.create_security_group.call_args_list
126 | assert len(sg_calls) == 1
127 | assert sg_calls[0].kwargs["GroupName"] == expected_security_group_name2
128 | td_calls = mock_client.register_task_definition.call_args_list
129 | assert len(td_calls) == 2
130 | assert td_calls[0].kwargs["family"] == expected_scheduler_task_name2
131 | assert td_calls[1].kwargs["family"] == expected_worker_task_name2
132 |
133 | # Finish up
134 | fc1.close()
135 | fc2.close()
136 |
--------------------------------------------------------------------------------
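Editor's note: the test above exercises attaching more than one Dask cluster to a single pre-existing ECS cluster. As a rough usage sketch of the same pattern outside the test suite (the ARN, VPC and subnet IDs below are placeholders, and running this would create real AWS resources):

    from dask_cloudprovider.aws import ECSCluster

    shared = dict(
        cluster_arn="arn:aws:ecs:eu-west-2:123456789012:cluster/shared-cluster",  # placeholder ARN
        vpc="vpc-0123456789abcdef0",           # placeholder VPC ID
        subnets=["subnet-0123456789abcdef0"],  # placeholder subnet ID
        skip_cleanup=True,
    )

    # Each ECSCluster creates its own IAM roles, security group, log stream prefix
    # and task definitions, so both Dask clusters can coexist on the one ECS cluster.
    fc1 = ECSCluster(name="first-cluster", **shared)
    fc2 = ECSCluster(name="second-cluster", **shared)

    fc1.close()
    fc2.close()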
/dask_cloudprovider/hetzner/vserver.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import dask
3 |
4 | from dask_cloudprovider.generic.vmcluster import (
5 | VMCluster,
6 | VMInterface,
7 | SchedulerMixin,
8 | WorkerMixin,
9 | )
10 |
11 | try:
12 | import hcloud
13 | except ImportError as e:
14 | msg = (
15 | "Dask Cloud Provider Hetzner requirements are not installed.\n\n"
16 | "Please pip install as follows:\n\n"
17 | ' pip install "dask-cloudprovider[hcloud]" --upgrade # or python -m pip install'
18 | )
19 | raise ImportError(msg) from e
20 |
21 | from hcloud.images.domain import Image
22 | from hcloud.server_types.domain import ServerType
23 | from hcloud.actions.domain import Action
24 |
25 |
26 | class VServer(VMInterface):
27 | def __init__(
28 | self,
29 | cluster: str,
30 | config,
31 | env_vars: dict = None,
32 | bootstrap=None,
33 | extra_bootstrap=None,
34 | docker_image: str = None,
35 | image: str = None,
36 | location: str = None,
37 | server_type: str = None,
38 | *args,
39 | **kwargs,
40 | ):
41 | super().__init__(*args, **kwargs)
42 | self.cluster = cluster
43 | self.config = config
44 | self.location = location
45 | self.bootstrap = bootstrap
46 | self.extra_bootstrap = extra_bootstrap
47 | self.env_vars = env_vars
48 | self.client = hcloud.Client(self.config.get("token"))
49 | self.server_type = ServerType(server_type)
50 | self.image = Image(name=image)
51 | self.docker_image = docker_image
52 |
53 | async def create_vm(self):
54 | await self.call_async(
55 | self.client.servers.create,
56 | server_type=self.server_type,
57 | image=self.image,
58 | name=self.name,
59 | user_data=self.cluster.render_process_cloud_init(self),
60 | )
61 |
62 | self.server = self.client.servers.get_by_name(self.name)
63 | for action in self.server.get_actions():
64 | while action.status != Action.STATUS_SUCCESS:
65 | await self.call_async(action.reload)
66 | await asyncio.sleep(0.1)
67 | self.cluster._log(f"Created Hetzner vServer {self.name}")
68 |
69 | return self.server.public_net.ipv4.ip, None
70 |
71 | async def destroy_vm(self):
72 | await self.call_async(self.client.servers.delete, server=self.server)
73 | self.cluster._log(f"Terminated vServer {self.name}")
74 |
75 |
76 | class HetznerScheduler(SchedulerMixin, VServer):
77 | """Scheduler running on a Hetzner server."""
78 |
79 |
80 | class HetznerWorker(WorkerMixin, VServer):
81 | """Worker running on a Hetzner server."""
82 |
83 |
84 | class HetznerCluster(VMCluster):
85 | """Cluster running on Hetzner cloud vServers.
86 |
87 | VMs in Hetzner are referred to as vServers. This cluster manager constructs a Dask cluster
88 | running on VMs.
89 |
90 | When configuring your cluster you may find it useful to install the ``hcloud`` tool for querying the
91 | Hetzner API for available options.
92 |
93 | https://github.com/hetznercloud/cli
94 |
95 | Parameters
96 | ----------
97 | image: str
98 | The image to use for the host OS. This should be a Ubuntu variant.
99 | You can list available images with ``hcloud image list|grep Ubuntu``.
100 | location: str
101 |         The Hetzner location to launch your cluster in. A full list can be obtained with ``hcloud location list``.
102 |     server_type: str
103 |         The VM server type. You can get a full list with ``hcloud server-type list``.
104 |         The default is ``cx11`` which is a vServer with 2GB RAM and 1 vCPU.
105 | n_workers: int
106 | Number of workers to initialise the cluster with. Defaults to ``0``.
107 | worker_module: str
108 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
109 | worker_options: dict
110 | Params to be passed to the worker class.
111 | See :class:`distributed.worker.Worker` for default worker class.
112 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
113 | scheduler_options: dict
114 | Params to be passed to the scheduler class.
115 | See :class:`distributed.scheduler.Scheduler`.
116 | env_vars: dict
117 | Environment variables to be passed to the worker.
118 | extra_bootstrap: list[str] (optional)
119 | Extra commands to be run during the bootstrap phase.
120 |
121 | Example
122 | --------
123 |
124 | >>> from dask_cloudprovider.hetzner import HetznerCluster
125 | >>> cluster = HetznerCluster(n_workers=1)
126 |
127 | >>> from dask.distributed import Client
128 | >>> client = Client(cluster)
129 |
130 | >>> import dask.array as da
131 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
132 | >>> arr.mean().compute()
133 |
134 | >>> client.close()
135 | >>> cluster.close()
136 |
137 | """
138 |
139 | def __init__(
140 | self,
141 | bootstrap: str = None,
142 | image: str = None,
143 | location: str = None,
144 | server_type: str = None,
145 | docker_image: str = None,
146 | **kwargs,
147 | ):
148 | self.config = dask.config.get("cloudprovider.hetzner", {})
149 |
150 | self.scheduler_class = HetznerScheduler
151 | self.worker_class = HetznerWorker
152 |
153 | self.image = dask.config.get("cloudprovider.hetzner.image", override_with=image)
154 | self.docker_image = dask.config.get(
155 | "cloudprovider.hetzner.docker_image", override_with=docker_image
156 | )
157 | self.location = dask.config.get(
158 | "cloudprovider.hetzner.location", override_with=location
159 | )
160 | self.server_type = dask.config.get(
161 | "cloudprovider.hetzner.server_type", override_with=server_type
162 | )
163 | self.bootstrap = dask.config.get(
164 | "cloudprovider.hetzner.bootstrap", override_with=bootstrap
165 | )
166 |
167 | self.options = {
168 | "bootstrap": self.bootstrap,
169 | "cluster": self,
170 | "config": self.config,
171 | "docker_image": self.docker_image,
172 | "image": self.image,
173 | "location": self.location,
174 | "server_type": self.server_type,
175 | }
176 | self.scheduler_options = {**self.options}
177 | self.worker_options = {**self.options}
178 | super().__init__(**kwargs)
179 |
--------------------------------------------------------------------------------
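Editor's note: a minimal usage sketch for the cluster manager above, assuming the Hetzner API token is supplied via Dask config (the token value is a placeholder; the server type, image and location mirror the defaults documented in cloudprovider.yaml, and running this creates a real vServer):

    import dask
    from dask_cloudprovider.hetzner import HetznerCluster

    # Placeholder token; normally set via the cloudprovider.hetzner.token config key.
    dask.config.set({"cloudprovider.hetzner.token": "my-hetzner-api-token"})

    cluster = HetznerCluster(
        server_type="cx11",     # 1 vCPU, 2GB RAM
        image="ubuntu-20.04",
        location="fsn1",
        n_workers=2,
    )
    cluster.close()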
/dask_cloudprovider/aws/tests/test_ec2.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | aiobotocore = pytest.importorskip("aiobotocore")
4 |
5 | from dask_cloudprovider.aws.ec2 import EC2Cluster
6 | from dask_cloudprovider.aws.helper import get_latest_ami_id
7 | from dask.distributed import Client
8 | from distributed.core import Status
9 |
10 |
11 | async def skip_without_credentials():
12 | try:
13 | async with aiobotocore.get_session().create_client("sts") as client:
14 | await client.get_caller_identity()
15 | except Exception:
16 | pytest.skip(
17 | """
18 |             You must configure your AWS credentials to run this test.
19 |
20 | $ aws configure
21 |
22 | """
23 | )
24 |
25 |
26 | @pytest.fixture
27 | @pytest.mark.external
28 | async def cluster():
29 | await skip_without_credentials()
30 | async with EC2Cluster(asynchronous=True) as cluster:
31 | yield cluster
32 |
33 |
34 | @pytest.fixture
35 | @pytest.mark.external
36 | async def cluster_sync():
37 | await skip_without_credentials()
38 | cluster = EC2Cluster()
39 | yield cluster
40 |
41 |
42 | @pytest.fixture
43 | @pytest.mark.external
44 | async def cluster_rapids():
45 | await skip_without_credentials()
46 | async with EC2Cluster(
47 | asynchronous=True,
48 | # Deep Learning AMI (Ubuntu 18.04)
49 | ami="ami-0c7c7d78f752f8f17",
50 | # Python version must match local version and CUDA version must match AMI CUDA version
51 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
52 | instance_type="p3.2xlarge",
53 | bootstrap=False,
54 | filesystem_size=120,
55 | ) as cluster:
56 | yield cluster
57 |
58 |
59 | @pytest.fixture
60 | @pytest.mark.external
61 | async def cluster_rapids_packer():
62 | await skip_without_credentials()
63 | async with EC2Cluster(
64 | asynchronous=True,
65 | # Packer AMI
66 | ami="ami-04e5539cb82859e69",
67 | # Python version must match local version and CUDA version must match AMI CUDA version
68 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
69 | instance_type="p3.2xlarge",
70 | bootstrap=False,
71 | filesystem_size=120,
72 | ) as cluster:
73 | yield cluster
74 |
75 |
76 | @pytest.fixture
77 | @pytest.mark.external
78 | async def cluster_packer():
79 | await skip_without_credentials()
80 | async with EC2Cluster(
81 | asynchronous=True, ami="ami-0e6187593ace05a0c", bootstrap=False
82 | ) as cluster:
83 | yield cluster
84 |
85 |
86 | @pytest.fixture
87 | async def ec2_client():
88 | await skip_without_credentials()
89 | async with aiobotocore.get_session().create_client("ec2") as client:
90 | yield client
91 |
92 |
93 | @pytest.mark.asyncio
94 | @pytest.mark.external
95 | async def test_init():
96 | cluster = EC2Cluster(asynchronous=True)
97 | assert cluster.status == Status.created
98 |
99 |
100 | @pytest.mark.asyncio
101 | @pytest.mark.timeout(600)
102 | async def test_create_cluster(cluster):
103 | assert cluster.status == Status.running
104 |
105 | cluster.scale(2)
106 | await cluster
107 | assert len(cluster.workers) == 2
108 |
109 | async with Client(cluster, asynchronous=True) as client:
110 | inc = lambda x: x + 1
111 | assert await client.submit(inc, 10).result() == 11
112 |
113 |
114 | @pytest.mark.asyncio
115 | @pytest.mark.timeout(600)
116 | async def test_create_cluster_sync(cluster_sync):
117 | assert cluster_sync.status == Status.running
118 |
119 | cluster_sync.scale(2)
120 |
121 | with Client(cluster_sync) as client:
122 | inc = lambda x: x + 1
123 | assert client.submit(inc, 10).result() == 11
124 |
125 |
126 | @pytest.mark.asyncio
127 | @pytest.mark.timeout(600)
128 | async def test_create_cluster_with_packer(cluster_packer):
129 | assert cluster_packer.status == Status.running
130 |
131 | cluster_packer.scale(2)
132 | await cluster_packer
133 | assert len(cluster_packer.workers) == 2
134 |
135 | async with Client(cluster_packer, asynchronous=True) as client:
136 | inc = lambda x: x + 1
137 | assert await client.submit(inc, 10).result() == 11
138 |
139 |
140 | @pytest.mark.asyncio
141 | @pytest.mark.timeout(1200)
142 | async def test_create_rapids_cluster(cluster_rapids):
143 | assert cluster_rapids.status == Status.running
144 |
145 | cluster_rapids.scale(1)
146 | await cluster_rapids
147 | assert len(cluster_rapids.workers) == 1
148 |
149 | async with Client(cluster_rapids, asynchronous=True) as client:
150 |
151 | def f():
152 | import cupy
153 |
154 | return float(cupy.random.random(100).mean())
155 |
156 | assert await client.submit(f).result() < 1
157 |
158 |
159 | @pytest.mark.asyncio
160 | @pytest.mark.timeout(1200)
161 | async def test_create_rapids_cluster_with_packer(cluster_rapids_packer):
162 | assert cluster_rapids_packer.status == Status.running
163 |
164 | cluster_rapids_packer.scale(1)
165 | await cluster_rapids_packer
166 | assert len(cluster_rapids_packer.workers) == 1
167 |
168 | async with Client(cluster_rapids_packer, asynchronous=True) as client:
169 |
170 | def f():
171 | import cupy
172 |
173 | return float(cupy.random.random(100).mean())
174 |
175 | assert await client.submit(f).result() < 1
176 |
177 |
178 | @pytest.mark.asyncio
179 | async def test_get_ubuntu_image(ec2_client):
180 | image = await get_latest_ami_id(
181 | ec2_client,
182 | "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
183 | "099720109477", # Canonical
184 | )
185 | assert "ami-" in image
186 |
187 |
188 | @pytest.mark.asyncio
189 | async def test_get_cloud_init():
190 | cloud_init = EC2Cluster.get_cloud_init(
191 | env_vars={"EXTRA_PIP_PACKAGES": "s3fs"},
192 | docker_args="--privileged",
193 | )
194 | assert "systemctl start docker" in cloud_init
195 | assert ' -e EXTRA_PIP_PACKAGES="s3fs" ' in cloud_init
196 | assert " --privileged " in cloud_init
197 |
198 |
199 | @pytest.mark.asyncio
200 | async def test_get_cloud_init_rapids():
201 | cloud_init = EC2Cluster.get_cloud_init(
202 | # Deep Learning AMI (Ubuntu 18.04)
203 | ami="ami-0c7c7d78f752f8f17",
204 | # Python version must match local version and CUDA version must match AMI CUDA version
205 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
206 | instance_type="p3.2xlarge",
207 | bootstrap=False,
208 | filesystem_size=120,
209 | )
210 | assert "rapidsai" in cloud_init
211 |
--------------------------------------------------------------------------------
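Editor's note: the ``cluster_packer`` and ``cluster_rapids_packer`` fixtures above skip bootstrapping because their AMIs already ship with Docker. A hedged sketch of the same idea outside the test suite (the AMI ID is an illustrative placeholder, and this would launch real EC2 instances):

    from dask_cloudprovider.aws import EC2Cluster

    cluster = EC2Cluster(
        ami="ami-0123456789abcdef0",  # placeholder: an AMI pre-built (e.g. with Packer) that includes Docker
        bootstrap=False,              # skip installing Docker at boot since the AMI provides it
        n_workers=2,
    )
    cluster.close()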
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Dask Cloud Provider documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Feb 8 17:56:16 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 |
23 | from datetime import datetime
24 |
25 | sys.path.insert(0, os.path.abspath(".."))
26 |
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = [
38 | "sphinx.ext.autodoc",
39 | "sphinx.ext.todo",
40 | "sphinx.ext.ifconfig",
41 | "sphinx.ext.viewcode",
42 | "sphinx.ext.autosummary",
43 | "sphinx.ext.extlinks",
44 | "sphinx.ext.intersphinx",
45 | "numpydoc",
46 | ]
47 |
48 | # Add any paths that contain templates here, relative to this directory.
49 | templates_path = ["_templates"]
50 |
51 | # The suffix(es) of source filenames.
52 | # You can specify multiple suffix as a list of string:
53 | #
54 | # source_suffix = ['.rst', '.md']
55 | source_suffix = ".rst"
56 |
57 | # The master toctree document.
58 | master_doc = "index"
59 |
60 | # General information about the project.
61 | project = "Dask Cloud Provider"
62 | copyright = f"{datetime.now().year}, Dask Developers"
63 | author = "Dask Developers"
64 |
65 | # The version info for the project you're documenting, acts as replacement for
66 | # |version| and |release|, also used in various other places throughout the
67 | # built documents.
68 | #
69 | # The short X.Y version.
70 | from dask_cloudprovider import __version__
71 |
72 | version = __version__
73 | # The full version, including alpha/beta/rc tags.
74 | release = __version__
75 |
76 | # The language for content autogenerated by Sphinx. Refer to documentation
77 | # for a list of supported languages.
78 | #
79 | # This is also used if you do content translation via gettext catalogs.
80 | # Usually you set "language" from the command line for these cases.
81 | language = "en"
82 |
83 | # List of patterns, relative to source directory, that match files and
84 | # directories to ignore when looking for source files.
85 | # These patterns also affect html_static_path and html_extra_path
86 | exclude_patterns = []
87 |
88 | # The name of the Pygments (syntax highlighting) style to use.
89 | # Commenting this out for now, if we register dask pygments,
90 | # then eventually this line can be:
91 | # pygments_style = "dask"
92 |
93 | # If true, `todo` and `todoList` produce output, else they produce nothing.
94 | todo_include_todos = False
95 |
96 |
97 | # -- Options for HTML output ----------------------------------------------
98 |
99 | # The theme to use for HTML and HTML Help pages. See the documentation for
100 | # a list of builtin themes.
101 | #
102 | # html_theme = 'alabaster'
103 |
104 | html_theme = "dask_sphinx_theme"
105 |
106 | # Theme options are theme-specific and customize the look and feel of a theme
107 | # further. For a list of options available for each theme, see the
108 | # documentation.
109 | #
110 | # html_theme_options = {}
111 |
112 | # Add any paths that contain custom static files (such as style sheets) here,
113 | # relative to this directory. They are copied after the builtin static files,
114 | # so a file named "default.css" will overwrite the builtin "default.css".
115 | html_static_path = []
116 |
117 | # Custom sidebar templates, must be a dictionary that maps document names
118 | # to template names.
119 | #
120 | # This is required for the alabaster theme
121 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
122 | # html_sidebars = {
123 | # "**": [
124 | # "relations.html", # needs 'show_related': True theme option to display
125 | # "searchbox.html",
126 | # ]
127 | # }
128 |
129 |
130 | # -- Options for HTMLHelp output ------------------------------------------
131 |
132 | # Output file base name for HTML help builder.
133 | htmlhelp_basename = "dask-cloudprovider-doc"
134 |
135 |
136 | # -- Options for LaTeX output ---------------------------------------------
137 |
138 | latex_elements = {
139 | # The paper size ('letterpaper' or 'a4paper').
140 | #
141 | # 'papersize': 'letterpaper',
142 | # The font size ('10pt', '11pt' or '12pt').
143 | #
144 | # 'pointsize': '10pt',
145 | # Additional stuff for the LaTeX preamble.
146 | #
147 | # 'preamble': '',
148 | # Latex figure (float) alignment
149 | #
150 | # 'figure_align': 'htbp',
151 | }
152 |
153 | # Grouping the document tree into LaTeX files. List of tuples
154 | # (source start file, target name, title,
155 | # author, documentclass [howto, manual, or own class]).
156 | latex_documents = [
157 | (
158 | master_doc,
159 | "dask-cloudprovider.tex",
160 | "Dask Cloud Provider Documentation",
161 | "Dask Cloud Provider Developers",
162 | "manual",
163 | )
164 | ]
165 |
166 |
167 | # -- Options for manual page output ---------------------------------------
168 |
169 | # One entry per manual page. List of tuples
170 | # (source start file, name, description, authors, manual section).
171 | man_pages = [
172 | (master_doc, "dask-cloudprovider", "Dask Cloud Provider Documentation", [author], 1)
173 | ]
174 |
175 |
176 | # -- Options for Texinfo output -------------------------------------------
177 |
178 | # Grouping the document tree into Texinfo files. List of tuples
179 | # (source start file, target name, title, author,
180 | # dir menu entry, description, category)
181 | texinfo_documents = [
182 | (
183 | master_doc,
184 | "Dask Cloud Provider",
185 | "Dask Cloud Provider Documentation",
186 | author,
187 | "Dask-CloudProvider",
188 | "One line description of project.",
189 | "Miscellaneous",
190 | )
191 | ]
192 |
193 |
194 | intersphinx_mapping = {
195 | "python": ("https://docs.python.org/3", None),
196 | "dask": ("https://docs.dask.org/en/latest/", None),
197 | "distributed": ("https://distributed.dask.org/en/latest/", None),
198 | "dask_kubernetes": ("https://kubernetes.dask.org/en/latest/", None),
199 | }
200 |
--------------------------------------------------------------------------------
/dask_cloudprovider/cli/ecs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from asyncio import sleep
3 | import sys
4 |
5 | import click
6 | from distributed.cli.utils import install_signal_handlers
7 | from distributed.core import Status
8 | from tornado.ioloop import IOLoop, TimeoutError
9 |
10 | from dask_cloudprovider.aws import ECSCluster
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | @click.command()
17 | @click.option("--fargate", is_flag=True, help="Turn on fargate mode (default off)")
18 | @click.option(
19 | "--fargate-scheduler",
20 | is_flag=True,
21 | help="Turn on fargate mode for scheduler (default off)",
22 | )
23 | @click.option(
24 | "--fargate-workers",
25 | is_flag=True,
26 | help="Turn on fargate mode for workers (default off)",
27 | )
28 | @click.option(
29 | "--image",
30 | type=str,
31 | default=None,
32 | help="Docker image to use for scheduler and workers",
33 | )
34 | @click.option(
35 | "--scheduler-cpu",
36 | type=int,
37 | default=None,
38 | help="Scheduler CPU reservation in milli-CPU",
39 | )
40 | @click.option(
41 | "--scheduler-mem", type=int, default=None, help="Scheduler memory reservation in MB"
42 | )
43 | @click.option(
44 | "--scheduler-port",
45 | type=int,
46 | default=8786,
47 | help="The port on which the scheduler will be reachable to the workers and clients",
48 | )
49 | @click.option(
50 | "--scheduler-timeout",
51 | type=int,
52 | default=None,
53 |     help="Scheduler timeout (e.g. 5 minutes)",
54 | )
55 | @click.option(
56 | "--worker-cpu", type=int, default=None, help="Worker CPU reservation in milli-CPU"
57 | )
58 | @click.option(
59 | "--worker-mem", type=int, default=None, help="Worker memory reservation in MB"
60 | )
61 | @click.option(
62 | "--n-workers",
63 | type=int,
64 | default=None,
65 | help="Number of workers to start with the cluster",
66 | )
67 | @click.option(
68 | "--cluster-arn",
69 | type=str,
70 | default=None,
71 | help="The ARN of an existing ECS cluster to use",
72 | )
73 | @click.option(
74 | "--cluster-name-template",
75 | type=str,
76 | default=None,
77 | help="A template to use for the cluster name if `--cluster-arn` is not set",
78 | )
79 | @click.option(
80 | "--execution-role-arn",
81 | type=str,
82 | default=None,
83 | help="The ARN of an existing IAM role to use for ECS execution",
84 | )
85 | @click.option(
86 | "--task-role-arn",
87 | type=str,
88 | default=None,
89 | help="The ARN of an existing IAM role to give to the tasks",
90 | )
91 | @click.option(
92 | "--task-role-policy",
93 | type=str,
94 | default=None,
95 | multiple=True,
96 | help="Policy to attach to a task if --task-role-arn is not set (can be used multiple times)",
97 | )
98 | @click.option(
99 | "--cloudwatch-logs-group", type=str, default=None, help="The group to send logs to"
100 | )
101 | @click.option(
102 | "--cloudwatch-logs-stream-prefix",
103 | type=str,
104 | default=None,
105 | help="An optional prefix to use for log streams",
106 | )
107 | @click.option(
108 | "--cloudwatch-logs-default-retention",
109 | type=int,
110 | default=None,
111 |     help="Number of days to retain logs",
112 | )
113 | @click.option(
114 | "--vpc",
115 | type=str,
116 | default=None,
117 | help="The ID of an existing VPC (uses default if not specified)",
118 | )
119 | @click.option(
120 | "--subnet",
121 | type=str,
122 | default=None,
123 | multiple=True,
124 |     help="VPC subnet to use (can be used multiple times, will default to all if none specified)",
125 | )
126 | @click.option(
127 | "--security-group",
128 | type=str,
129 | default=None,
130 | multiple=True,
131 | help="Security group to use for task communication (can be used multiple times, will be created if not specified)",
132 | )
133 | @click.option(
134 | "--environment",
135 | type=str,
136 | default=None,
137 | multiple=True,
138 | help="Environment variable for the scheduler and workers in the form FOO=bar (can be used multiple times)",
139 | )
140 | @click.option(
141 | "--tag",
142 | type=str,
143 | default=None,
144 | multiple=True,
145 | help="Tag to apply to all resources created automatically in the form FOO=bar (can be used multiple times)",
146 | )
147 | @click.option("--skip_cleanup", is_flag=True, help="Skip cleanup of stale resources")
148 | @click.version_option()
149 | def main(
150 | fargate,
151 | fargate_scheduler,
152 | fargate_workers,
153 | image,
154 | scheduler_cpu,
155 | scheduler_mem,
156 | scheduler_port,
157 | scheduler_timeout,
158 | worker_cpu,
159 | worker_mem,
160 | n_workers,
161 | cluster_arn,
162 | cluster_name_template,
163 | execution_role_arn,
164 | task_role_arn,
165 | task_role_policy,
166 | cloudwatch_logs_group,
167 | cloudwatch_logs_stream_prefix,
168 | cloudwatch_logs_default_retention,
169 | vpc,
170 | subnet,
171 | security_group,
172 | environment,
173 | tag,
174 | skip_cleanup,
175 | ):
176 |     tag = {k: v for k, v in (t.split("=", 1) for t in tag)} if tag else None
177 |     environment = (
178 |         {k: v for k, v in (e.split("=", 1) for e in environment)} if environment else None
179 |     )
180 | subnet = subnet or None
181 | security_group = security_group or None
182 | task_role_policy = task_role_policy or None
183 | logger.info("Starting ECS cluster")
184 | try:
185 | cluster = ECSCluster(
186 | fargate_scheduler=fargate_scheduler or fargate,
187 | fargate_workers=fargate_workers or fargate,
188 | image=image,
189 | scheduler_cpu=scheduler_cpu,
190 | scheduler_mem=scheduler_mem,
191 | scheduler_port=scheduler_port,
192 | scheduler_timeout=scheduler_timeout,
193 | worker_cpu=worker_cpu,
194 | worker_mem=worker_mem,
195 | n_workers=n_workers,
196 | cluster_arn=cluster_arn,
197 | cluster_name_template=cluster_name_template,
198 | execution_role_arn=execution_role_arn,
199 | task_role_arn=task_role_arn,
200 | task_role_policies=task_role_policy,
201 | cloudwatch_logs_group=cloudwatch_logs_group,
202 | cloudwatch_logs_stream_prefix=cloudwatch_logs_stream_prefix,
203 | cloudwatch_logs_default_retention=cloudwatch_logs_default_retention,
204 | vpc=vpc,
205 | subnets=subnet,
206 | security_groups=security_group,
207 | environment=environment,
208 | tags=tag,
209 | skip_cleanup=skip_cleanup,
210 | )
211 | except Exception as e:
212 | ctx = click.get_current_context()
213 | logger.error(str(e) + "\n")
214 | click.echo(ctx.get_help())
215 | sys.exit(1)
216 |
217 | async def run():
218 | logger.info("Ready")
219 | while cluster.status != Status.closed:
220 | await sleep(0.2)
221 |
222 | def on_signal(signum):
223 | logger.info("Exiting on signal %d", signum)
224 | cluster.close(timeout=2)
225 |
226 | loop = IOLoop.current()
227 | install_signal_handlers(loop, cleanup=on_signal)
228 |
229 | try:
230 | loop.run_sync(run)
231 | except (KeyboardInterrupt, TimeoutError):
232 | logger.info("Shutting down")
233 | finally:
234 | logger.info("End dask-ecs")
235 |
236 |
237 | def go():
238 | main()
239 |
240 |
241 | if __name__ == "__main__":
242 | go()
243 |
--------------------------------------------------------------------------------
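Editor's note: a small sketch of the ``FOO=bar`` convention used by the repeated ``--tag`` and ``--environment`` options above (the key/value pairs are illustrative only); splitting on the first ``=`` keeps any later ``=`` characters in the value:

    # Mirrors the parsing in main(): repeated FOO=bar options become a dict.
    pairs = ("owner=data-team", "project=dask")
    tags = {key: value for key, value in (p.split("=", 1) for p in pairs)}
    assert tags == {"owner": "data-team", "project": "dask"}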
/dask_cloudprovider/azure/utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import datetime
3 | import json
4 | import subprocess
5 | import logging
6 |
7 | import aiohttp
8 | from distributed.diagnostics.plugin import WorkerPlugin
9 | from tornado.ioloop import IOLoop, PeriodicCallback
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 | AZURE_EVENTS_METADATA_URL = (
15 | "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01"
16 | )
17 |
18 |
19 | def _get_default_subscription() -> str:
20 | """
21 | Get the default Azure subscription ID, as configured by the Azure CLI.
22 | """
23 | out = subprocess.check_output(["az", "account", "list", "--query", "[?isDefault]"])
24 | accounts = json.loads(out)
25 | if accounts:
26 | subscription_id = accounts[0]["id"]
27 | return subscription_id
28 | raise ValueError(
29 | "Could not find a default subscription. "
30 | "Run 'az account set' to set a default subscription."
31 | )
32 |
33 |
34 | class AzurePreemptibleWorkerPlugin(WorkerPlugin):
35 |     """A worker plugin for Azure spot instances
36 |
37 |     This worker plugin will poll Azure's metadata service for preemption notifications.
38 |     When a node is preempted, the plugin will attempt to gracefully shut down all workers
39 |     on the node.
40 |
41 |     This plugin can be used on any worker running on Azure spot instances, not just the
42 |     ones created by ``dask-cloudprovider``.
43 |
44 |     For more details on Azure spot instances see:
45 | https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
46 |
47 | Parameters
48 | ----------
49 | poll_interval_s: int (optional)
50 | The rate at which the plugin will poll the metadata service in seconds.
51 |
52 | Defaults to ``1``
53 |
54 | metadata_url: str (optional)
55 | The url of the metadata service to poll.
56 |
57 | Defaults to "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01"
58 |
59 | termination_events: List[str] (optional)
60 |         The type of events that will trigger the graceful shutdown
61 |
62 | Defaults to ``['Preempt', 'Terminate']``
63 |
64 | termination_offset_minutes: int (optional)
65 |         Extra offset to apply to the preemption date. This may be negative, to start
66 |         the graceful shutdown before the ``NotBefore`` date. It can also be positive, to
67 | start the shutdown after the ``NotBefore`` date, but this is at your own risk.
68 |
69 | Defaults to ``0``
70 |
71 | Examples
72 | --------
73 |
74 |     Let's say you have a cluster and a client instance.
75 | For example using :class:`dask_kubernetes.KubeCluster`
76 |
77 | >>> from dask_kubernetes import KubeCluster
78 | >>> from distributed import Client
79 | >>> cluster = KubeCluster()
80 | >>> client = Client(cluster)
81 |
82 | You can add the worker plugin using the following:
83 |
84 | >>> from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin
85 | >>> client.register_worker_plugin(AzurePreemptibleWorkerPlugin())
86 | """
87 |
88 | def __init__(
89 | self,
90 | poll_interval_s=1,
91 | metadata_url=None,
92 | termination_events=None,
93 | termination_offset_minutes=0,
94 | ):
95 | self.callback = None
96 | self.loop = None
97 | self.worker = None
98 | self.poll_interval_s = poll_interval_s
99 | self.metadata_url = metadata_url or AZURE_EVENTS_METADATA_URL
100 | self.termination_events = termination_events or ["Preempt", "Terminate"]
101 | self.termination_offset = datetime.timedelta(minutes=termination_offset_minutes)
102 |
103 | self.terminating = False
104 | self.not_before = None
105 | self._session = None
106 | self._lock = None
107 |
108 | async def _is_terminating(self):
109 | preempt_started = False
110 | async with self._session.get(self.metadata_url) as response:
111 | try:
112 | data = await response.json()
113 |             # Sometimes Azure responds with a text/plain mime type
114 | except aiohttp.ContentTypeError:
115 | return
116 | # Sometimes the response doesn't contain the Events key
117 | events = data.get("Events", [])
118 | if events:
119 | logger.debug(
120 | "Worker {}, got metadata events {}".format(self.worker.name, events)
121 | )
122 | for evt in events:
123 | event_type = evt["EventType"]
124 | if event_type not in self.termination_events:
125 | continue
126 |
127 | event_status = evt.get("EventStatus")
128 | if event_status == "Started":
129 | logger.info(
130 | "Worker {}, node preemption started".format(self.worker.name)
131 | )
132 | preempt_started = True
133 | break
134 |
135 | not_before = evt.get("NotBefore")
136 | if not not_before:
137 | continue
138 |
139 | not_before = datetime.datetime.strptime(
140 | not_before, "%a, %d %b %Y %H:%M:%S GMT"
141 | )
142 | if self.not_before is None:
143 | logger.info(
144 | "Worker {}, node deletion scheduled not before {}".format(
145 |                             self.worker.name, not_before
146 | )
147 | )
148 | self.not_before = not_before
149 | break
150 | if self.not_before < not_before:
151 | logger.info(
152 | "Worker {}, node deletion re-scheduled not before {}".format(
153 | self.worker.name, not_before
154 | )
155 | )
156 | self.not_before = not_before
157 | break
158 |
159 | return preempt_started or (
160 | self.not_before
161 | and (self.not_before + self.termination_offset < datetime.datetime.utcnow())
162 | )
163 |
164 | async def poll_status(self):
165 | if self.terminating:
166 | return
167 | if self._session is None:
168 | self._session = aiohttp.ClientSession(headers={"Metadata": "true"})
169 | if self._lock is None:
170 | self._lock = asyncio.Lock()
171 |
172 | async with self._lock:
173 | is_terminating = await self._is_terminating()
174 | if not is_terminating:
175 | return
176 |
177 | logger.info(
178 | "Worker {}, node is being deleted, attempting graceful shutdown".format(
179 | self.worker.name
180 | )
181 | )
182 | self.terminating = True
183 | await self._session.close()
184 | await self.worker.close_gracefully()
185 |
186 | def setup(self, worker):
187 | self.worker = worker
188 | self.loop = IOLoop.current()
189 | self.callback = PeriodicCallback(
190 | self.poll_status, callback_time=self.poll_interval_s * 1_000
191 | )
192 | self.loop.add_callback(self.callback.start)
193 | logger.debug(
194 | "Worker {}, registering preemptible plugin".format(self.worker.name)
195 | )
196 |
197 | def teardown(self, worker):
198 | logger.debug("Worker {}, tearing down plugin".format(self.worker.name))
199 | if self.callback:
200 | self.callback.stop()
201 | self.callback = None
202 |
--------------------------------------------------------------------------------
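Editor's note: building on the docstring example above, a hedged sketch of registering the plugin with non-default settings (the scheduler address is a placeholder); a negative ``termination_offset_minutes`` starts the graceful shutdown before the scheduled ``NotBefore`` time:

    from distributed import Client
    from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin

    client = Client("tcp://scheduler-address:8786")  # placeholder scheduler address
    client.register_worker_plugin(
        AzurePreemptibleWorkerPlugin(
            poll_interval_s=5,              # poll the metadata service every 5 seconds
            termination_offset_minutes=-2,  # begin shutting down 2 minutes before NotBefore
        )
    )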
/dask_cloudprovider/digitalocean/droplet.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import dask
4 | from dask_cloudprovider.generic.vmcluster import (
5 | VMCluster,
6 | VMInterface,
7 | SchedulerMixin,
8 | WorkerMixin,
9 | )
10 |
11 | try:
12 | import digitalocean
13 | except ImportError as e:
14 | msg = (
15 | "Dask Cloud Provider Digital Ocean requirements are not installed.\n\n"
16 | "Please pip install as follows:\n\n"
17 | ' pip install "dask-cloudprovider[digitalocean]" --upgrade # or python -m pip install'
18 | )
19 | raise ImportError(msg) from e
20 |
21 |
22 | class Droplet(VMInterface):
23 | def __init__(
24 | self,
25 | cluster: str,
26 | config,
27 | *args,
28 | region: str = None,
29 | size: str = None,
30 | image: str = None,
31 | docker_image=None,
32 | env_vars=None,
33 | extra_bootstrap=None,
34 | **kwargs,
35 | ):
36 | super().__init__(*args, **kwargs)
37 | self.droplet = None
38 | self.cluster = cluster
39 | self.config = config
40 | self.region = region
41 | self.size = size
42 | self.image = image
43 | self.gpu_instance = False
44 | self.bootstrap = True
45 | self.extra_bootstrap = extra_bootstrap
46 | self.docker_image = docker_image
47 | self.env_vars = env_vars
48 |
49 | async def create_vm(self):
50 | self.droplet = digitalocean.Droplet(
51 | token=self.config.get("token"),
52 | name=self.name,
53 | region=self.region,
54 | image=self.image,
55 | size_slug=self.size,
56 | backups=False,
57 | user_data=self.cluster.render_process_cloud_init(self),
58 | )
59 | await self.call_async(self.droplet.create)
60 | for action in self.droplet.get_actions():
61 | while action.status != "completed":
62 | action.load()
63 | await asyncio.sleep(0.1)
64 | while self.droplet.ip_address is None:
65 | await self.call_async(self.droplet.load)
66 | await asyncio.sleep(0.1)
67 | self.cluster._log(f"Created droplet {self.name}")
68 |
69 | return self.droplet.ip_address, None
70 |
71 | async def destroy_vm(self):
72 | await self.call_async(self.droplet.destroy)
73 | self.cluster._log(f"Terminated droplet {self.name}")
74 |
75 |
76 | class DropletScheduler(SchedulerMixin, Droplet):
77 | """Scheduler running on a DigitalOcean Droplet."""
78 |
79 |
80 | class DropletWorker(WorkerMixin, Droplet):
81 | """Worker running on a DigitalOcean Droplet."""
82 |
83 |
84 | class DropletCluster(VMCluster):
85 | """Cluster running on Digital Ocean droplets.
86 |
87 | VMs in DigitalOcean (DO) are referred to as droplets. This cluster manager constructs a Dask cluster
88 | running on VMs.
89 |
90 | When configuring your cluster you may find it useful to install the ``doctl`` tool for querying the
91 | DO API for available options.
92 |
93 | https://www.digitalocean.com/docs/apis-clis/doctl/how-to/install/
94 |
95 | Parameters
96 | ----------
97 | region: str
98 |         The DO region to launch your cluster in. A full list can be obtained with ``doctl compute region list``.
99 |     size: str
100 |         The VM size slug. You can get a full list with ``doctl compute size list``.
101 |         The default is ``s-1vcpu-1gb`` which is 1GB RAM and 1 vCPU.
102 | image: str
103 | The image ID to use for the host OS. This should be a Ubuntu variant.
104 | You can list available images with ``doctl compute image list --public | grep ubuntu.*x64``.
107 |     n_workers: int
108 |         Number of workers to initialise the cluster with. Defaults to ``0``.
109 |     worker_module: str
110 |         The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
111 | worker_options: dict
112 | Params to be passed to the worker class.
113 | See :class:`distributed.worker.Worker` for default worker class.
114 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
115 | scheduler_options: dict
116 | Params to be passed to the scheduler class.
117 | See :class:`distributed.scheduler.Scheduler`.
118 | docker_image: string (optional)
119 | The Docker image to run on all instances.
120 |
121 | This image must have a valid Python environment and have ``dask`` installed in order for the
122 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended the Python
123 |         environment matches your local environment where ``DropletCluster`` is being created from.
124 |
125 |         For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed.
126 |
127 | By default the ``daskdev/dask:latest`` image will be used.
128 | docker_args: string (optional)
129 | Extra command line arguments to pass to Docker.
130 | extra_bootstrap: list[str] (optional)
131 | Extra commands to be run during the bootstrap phase.
132 | env_vars: dict (optional)
133 | Environment variables to be passed to the worker.
134 | silence_logs: bool
135 | Whether or not we should silence logging when setting up the cluster.
136 | asynchronous: bool
137 | If this is intended to be used directly within an event loop with
138 | async/await
139 | security : Security or bool, optional
140 | Configures communication security in this cluster. Can be a security
141 | object, or True. If True, temporary self-signed credentials will
142 | be created automatically. Default is ``True``.
143 | debug: bool, optional
144 | More information will be printed when constructing clusters to enable debugging.
145 |
146 | Examples
147 | --------
148 |
149 | Create the cluster.
150 |
151 | >>> from dask_cloudprovider.digitalocean import DropletCluster
152 | >>> cluster = DropletCluster(n_workers=1)
153 | Creating scheduler instance
154 | Created droplet dask-38b817c1-scheduler
155 | Waiting for scheduler to run
156 | Scheduler is running
157 | Creating worker instance
158 | Created droplet dask-38b817c1-worker-dc95260d
159 |
160 | Connect a client.
161 |
162 | >>> from dask.distributed import Client
163 | >>> client = Client(cluster)
164 |
165 | Do some work.
166 |
167 | >>> import dask.array as da
168 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
169 | >>> arr.mean().compute()
170 | 0.5001550986751964
171 |
172 | Close the cluster
173 |
174 | >>> client.close()
175 | >>> cluster.close()
176 | Terminated droplet dask-38b817c1-worker-dc95260d
177 | Terminated droplet dask-38b817c1-scheduler
178 |
179 | You can also do this all in one go with context managers to ensure the cluster is
180 | created and cleaned up.
181 |
182 | >>> with DropletCluster(n_workers=1) as cluster:
183 | ... with Client(cluster) as client:
184 | ... print(da.random.random((1000, 1000), chunks=(100, 100)).mean().compute())
185 | Creating scheduler instance
186 | Created droplet dask-48efe585-scheduler
187 | Waiting for scheduler to run
188 | Scheduler is running
189 | Creating worker instance
190 | Created droplet dask-48efe585-worker-5181aaf1
191 | 0.5000558682356162
192 | Terminated droplet dask-48efe585-worker-5181aaf1
193 | Terminated droplet dask-48efe585-scheduler
194 |
195 | """
196 |
197 | def __init__(
198 | self,
199 | region: str = None,
200 | size: str = None,
201 | image: str = None,
202 | debug: bool = False,
203 | **kwargs,
204 | ):
205 | self.config = dask.config.get("cloudprovider.digitalocean", {})
206 | self.scheduler_class = DropletScheduler
207 | self.worker_class = DropletWorker
208 | self.debug = debug
209 | self.options = {
210 | "cluster": self,
211 | "config": self.config,
212 | "region": region if region is not None else self.config.get("region"),
213 | "size": size if size is not None else self.config.get("size"),
214 | "image": image if image is not None else self.config.get("image"),
215 | }
216 | self.scheduler_options = {**self.options}
217 | self.worker_options = {**self.options}
218 | super().__init__(debug=debug, **kwargs)
219 |
--------------------------------------------------------------------------------
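Editor's note: a short sketch of driving the options above from Dask configuration rather than keyword arguments (the token is a placeholder; the slugs match the defaults in cloudprovider.yaml, and running this creates a real droplet):

    import dask
    from dask_cloudprovider.digitalocean import DropletCluster

    # Placeholder token; normally set via the cloudprovider.digitalocean.token config key.
    dask.config.set({"cloudprovider.digitalocean.token": "my-do-api-token"})

    cluster = DropletCluster(
        region="nyc3",
        size="s-1vcpu-1gb",
        image="ubuntu-20-04-x64",
        n_workers=1,
    )
    cluster.close()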
/dask_cloudprovider/nebius/instances.py:
--------------------------------------------------------------------------------
1 | import dask
2 |
3 | from dask_cloudprovider.generic.vmcluster import (
4 | VMCluster,
5 | VMInterface,
6 | SchedulerMixin,
7 | WorkerMixin,
8 | )
9 |
10 | try:
11 | from nebius.api.nebius.common.v1 import ResourceMetadata
12 | from nebius.api.nebius.vpc.v1 import SubnetServiceClient, ListSubnetsRequest
13 | from nebius.sdk import SDK
14 | from nebius.api.nebius.compute.v1 import (
15 | InstanceServiceClient,
16 | CreateInstanceRequest,
17 | DiskServiceClient,
18 | CreateDiskRequest,
19 | DiskSpec,
20 | SourceImageFamily,
21 | InstanceSpec,
22 | AttachedDiskSpec,
23 | ExistingDisk,
24 | ResourcesSpec,
25 | NetworkInterfaceSpec,
26 | IPAddress,
27 | PublicIPAddress,
28 | GetInstanceRequest,
29 | DeleteInstanceRequest,
30 | DeleteDiskRequest,
31 | )
32 | except ImportError as e:
33 | msg = (
34 | "Dask Cloud Provider Nebius requirements are not installed.\n\n"
35 | "Please pip install as follows:\n\n"
36 | ' pip install "dask-cloudprovider[nebius]" --upgrade # or python -m pip install'
37 | )
38 | raise ImportError(msg) from e
39 |
40 |
41 | class NebiusInstance(VMInterface):
42 | def __init__(
43 | self,
44 | cluster: str,
45 | config,
46 | env_vars: dict = None,
47 | bootstrap=None,
48 | extra_bootstrap=None,
49 | docker_image: str = None,
50 | image_family: str = None,
51 | project_id: str = None,
52 | server_platform: str = None,
53 | server_preset: str = None,
54 | disk_size: int = None,
55 | *args,
56 | **kwargs,
57 | ):
58 | super().__init__(*args, **kwargs)
59 | self.cluster = cluster
60 | self.config = config
61 | self.extra_bootstrap = extra_bootstrap
62 | self.env_vars = env_vars
63 | self.bootstrap = bootstrap
64 | self.image_family = image_family
65 | self.project_id = project_id
66 | self.docker_image = docker_image
67 | self.server_platform = server_platform
68 | self.server_preset = server_preset
69 | self.sdk = SDK(credentials=self.config.get("token"))
70 | self.disk_size = disk_size
71 | self.instance_id = None
72 | self.disk_id = None
73 |
74 | async def create_vm(self, user_data=None):
75 | service = DiskServiceClient(self.sdk)
76 | operation = await service.create(
77 | CreateDiskRequest(
78 | metadata=ResourceMetadata(
79 | parent_id=self.project_id,
80 | name=self.name + "-disk",
81 | ),
82 | spec=DiskSpec(
83 | source_image_family=SourceImageFamily(
84 | image_family=self.image_family
85 | ),
86 | size_gibibytes=self.disk_size,
87 | type=DiskSpec.DiskType.NETWORK_SSD,
88 | ),
89 | )
90 | )
91 | await operation.wait()
92 | self.disk_id = operation.resource_id
93 |
94 | service = SubnetServiceClient(self.sdk)
95 | sub_net = await service.list(ListSubnetsRequest(parent_id=self.project_id))
96 | subnet_id = sub_net.items[0].metadata.id
97 |
98 | service = InstanceServiceClient(self.sdk)
99 | operation = await service.create(
100 | CreateInstanceRequest(
101 | metadata=ResourceMetadata(
102 | parent_id=self.project_id,
103 | name=self.name,
104 | ),
105 | spec=InstanceSpec(
106 | boot_disk=AttachedDiskSpec(
107 | attach_mode=AttachedDiskSpec.AttachMode(2),
108 | existing_disk=ExistingDisk(id=self.disk_id),
109 | ),
110 | cloud_init_user_data=self.cluster.render_process_cloud_init(self),
111 | resources=ResourcesSpec(
112 | platform=self.server_platform, preset=self.server_preset
113 | ),
114 | network_interfaces=[
115 | NetworkInterfaceSpec(
116 | subnet_id=subnet_id,
117 | ip_address=IPAddress(),
118 | name="network-interface-0",
119 | public_ip_address=PublicIPAddress(),
120 | )
121 | ],
122 | ),
123 | )
124 | )
125 | self.instance_id = operation.resource_id
126 |
127 | self.cluster._log(f"Creating Nebius instance {self.name}")
128 | await operation.wait()
129 | service = InstanceServiceClient(self.sdk)
130 | operation = await service.get(
131 | GetInstanceRequest(
132 | id=self.instance_id,
133 | )
134 | )
135 | internal_ip = operation.status.network_interfaces[0].ip_address.address.split(
136 | "/"
137 | )[0]
138 | external_ip = operation.status.network_interfaces[
139 | 0
140 | ].public_ip_address.address.split("/")[0]
141 | self.cluster._log(
142 | f"Created Nebius instance {self.name} with internal IP {internal_ip} and external IP {external_ip}"
143 | )
144 | return internal_ip, external_ip
145 |
146 | async def destroy_vm(self):
147 | if self.instance_id:
148 | service = InstanceServiceClient(self.sdk)
149 | operation = await service.delete(
150 | DeleteInstanceRequest(
151 | id=self.instance_id,
152 | )
153 | )
154 | await operation.wait()
155 |
156 | if self.disk_id:
157 | service = DiskServiceClient(self.sdk)
158 | await service.delete(
159 | DeleteDiskRequest(
160 | id=self.disk_id,
161 | )
162 | )
163 | self.cluster._log(
164 | f"Terminated instance {self.name} ({self.instance_id}) and deleted disk {self.disk_id}"
165 | )
166 | self.instance_id = None
167 | self.disk_id = None
168 |
169 |
170 | class NebiusScheduler(SchedulerMixin, NebiusInstance):
171 | """Scheduler running on a Nebius server."""
172 |
173 |
174 | class NebiusWorker(WorkerMixin, NebiusInstance):
175 | """Worker running on a Nebius server."""
176 |
177 |
178 | class NebiusCluster(VMCluster):
179 | """Cluster running on Nebius AI Cloud instances.
180 |
181 | VMs in Nebius AI Cloud are referred to as instances. This cluster manager constructs a Dask cluster
182 | running on VMs.
183 |
184 | When configuring your cluster you may find it useful to install the ``nebius`` tool for querying the
185 | Nebius API for available options.
186 |
187 | https://docs.nebius.com/cli/quickstart
188 |
189 | Parameters
190 | ----------
191 | image_family: str
192 | The image to use for the host OS. This should be a Ubuntu variant.
193 |         You can find a list of available images at https://docs.nebius.com/compute/storage/manage#parameters-boot.
194 |     project_id: str
195 |         The Nebius AI Cloud project id. You can find it in the Nebius AI Cloud console.
196 |     server_platform: str
197 |         The server platform to use. A list of all platforms and presets is available at https://docs.nebius.com/compute/virtual-machines/types/.
198 |     server_preset: str
199 |         The server preset to use. A list of all platforms and presets is available at https://docs.nebius.com/compute/virtual-machines/types/.
200 | n_workers: int
201 | Number of workers to initialise the cluster with. Defaults to ``0``.
202 | worker_module: str
203 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
204 | worker_options: dict
205 | Params to be passed to the worker class.
206 | See :class:`distributed.worker.Worker` for default worker class.
207 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
208 | scheduler_options: dict
209 | Params to be passed to the scheduler class.
210 | See :class:`distributed.scheduler.Scheduler`.
211 | env_vars: dict
212 | Environment variables to be passed to the worker.
213 | extra_bootstrap: list[str] (optional)
214 | Extra commands to be run during the bootstrap phase.
215 |
216 | Example
217 | --------
218 |
219 | >>> from dask_cloudprovider.nebius import NebiusCluster
220 | >>> cluster = NebiusCluster(n_workers=1)
221 |
222 | >>> from dask.distributed import Client
223 | >>> client = Client(cluster)
224 |
225 | >>> import dask.array as da
226 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
227 | >>> arr.mean().compute()
228 |
229 | >>> client.close()
230 | >>> cluster.close()
231 |
232 | """
233 |
234 | def __init__(
235 | self,
236 | bootstrap: str = None,
237 | image_family: str = None,
238 | project_id: str = None,
239 | disk_size: int = None,
240 | server_platform: str = None,
241 | server_preset: str = None,
242 | docker_image: str = None,
243 | debug: bool = False,
244 | **kwargs,
245 | ):
246 | self.config = dask.config.get("cloudprovider.nebius", {})
247 |
248 | self.scheduler_class = NebiusScheduler
249 | self.worker_class = NebiusWorker
250 |
251 | self.image_family = dask.config.get(
252 | "cloudprovider.nebius.image_family", override_with=image_family
253 | )
254 | self.docker_image = dask.config.get(
255 | "cloudprovider.nebius.docker_image", override_with=docker_image
256 | )
257 | self.project_id = dask.config.get(
258 | "cloudprovider.nebius.project_id", override_with=project_id
259 | )
260 | self.server_platform = dask.config.get(
261 | "cloudprovider.nebius.server_platform", override_with=server_platform
262 | )
263 | self.server_preset = dask.config.get(
264 | "cloudprovider.nebius.server_preset", override_with=server_preset
265 | )
266 | self.bootstrap = dask.config.get(
267 | "cloudprovider.nebius.bootstrap", override_with=bootstrap
268 | )
269 | self.disk_size = dask.config.get(
270 | "cloudprovider.nebius.disk_size", override_with=disk_size
271 | )
272 | self.debug = debug
273 |
274 | self.options = {
275 | "bootstrap": self.bootstrap,
276 | "cluster": self,
277 | "config": self.config,
278 | "docker_image": self.docker_image,
279 | "image_family": self.image_family,
280 | "project_id": self.project_id,
281 | "server_platform": self.server_platform,
282 | "server_preset": self.server_preset,
283 | "disk_size": self.disk_size,
284 | }
285 | self.scheduler_options = {**self.options}
286 | self.worker_options = {**self.options}
287 | super().__init__(debug=debug, **kwargs)
288 |
--------------------------------------------------------------------------------
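Editor's note: a hedged usage sketch for the cluster manager above; the token, project id, platform and preset values below are placeholders and should be replaced with values from the Nebius console and the documentation linked in the docstring (running this creates real instances and disks):

    import dask
    from dask_cloudprovider.nebius import NebiusCluster

    # Placeholder credentials and identifiers.
    dask.config.set({"cloudprovider.nebius.token": "my-nebius-iam-token"})

    cluster = NebiusCluster(
        project_id="project-e00example",  # placeholder project id
        server_platform="cpu-d3",         # placeholder platform name
        server_preset="4vcpu-16gb",       # placeholder preset name
        disk_size=64,
        n_workers=1,
    )
    cluster.close()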
/dask_cloudprovider/cloudprovider.yaml:
--------------------------------------------------------------------------------
1 | cloudprovider:
2 | ecs:
3 | fargate_scheduler: false # Use fargate mode for the scheduler
4 | fargate_spot: false
5 | fargate_workers: false # Use fargate mode for the workers
6 | fargate_use_private_ip: false
7 | scheduler_cpu: 1024 # Millicpu (1024ths of a CPU core)
8 | scheduler_mem: 4096 # Memory in MB
9 | # scheduler_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key"
10 | worker_cpu: 4096 # Millicpu (1024ths of a CPU core)
11 | worker_mem: 16384 # Memory in MB
12 | worker_gpu: 0 # Number of GPUs for each worker
13 | # worker_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key"
14 | n_workers: 0 # Number of workers to start the cluster with
15 | scheduler_timeout: "5 minutes" # Length of inactivity to wait before closing the cluster
16 |
17 | image: "daskdev/dask:latest" # Docker image to use for non GPU tasks
18 | cpu_architecture: "X86_64" # Runtime platform CPU architecture
19 | gpu_image: "rapidsai/rapidsai:latest" # Docker image to use for GPU tasks
20 | cluster_name_template: "dask-{name}" # Template to use when creating a cluster
21 | cluster_arn: "" # ARN of existing ECS cluster to use (if not set one will be created)
22 |     execution_role_arn: "" # ARN of existing execution role to use (if not set one will be created)
23 |     task_role_arn: "" # ARN of existing task role to use (if not set one will be created)
24 | task_role_policies: [] # List of policy arns to attach to tasks (e.g S3 read only access)
25 | # platform_version: "LATEST" # Fargate platformVersion string like "1.4.0" or "LATEST"
26 |
27 | cloudwatch_logs_group: "" # Name of existing cloudwatch logs group to use (if not set one will be created)
28 | cloudwatch_logs_stream_prefix: "{cluster_name}/{name}" # Stream prefix template
29 | cloudwatch_logs_default_retention: 30 # Number of days to retain logs (only applied if not using existing group)
30 |
31 | vpc: "default" # VPC to use for tasks
32 | subnets: [] # VPC subnets to use (will use all available if not set)
33 | security_groups: [] # Security groups to use (if not set one will be created)
34 |
35 | tags: {} # Tags to apply to all AWS resources created by the cluster manager
36 | environment: {} # Environment variables that are set within a task container
37 | skip_cleanup: false # Skip cleaning up of stale resources
38 |
39 | ec2:
40 | region: null # AWS region to create cluster. Defaults to environment or account default region.
41 |     availability_zone: null # The availability zone to start your clusters. By default AWS will select the AZ with the most free capacity.
42 | bootstrap: true # It is assumed that the AMI does not have Docker and needs bootstrapping. Set this to false if using a custom AMI with Docker already installed.
43 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
44 | # worker_command: "dask-worker" # The command for workers to run. If the instance_type is a GPU instance dask-cuda-worker will be used.
45 | ami: null # AMI ID to use for all instances. Defaults to latest Ubuntu 20.04 image.
46 | instance_type: "t2.micro" # Instance type for the scheduler and all workers
47 | scheduler_instance_type: "t2.micro" # Instance type for the scheduler
48 | worker_instance_type: "t2.micro" # Instance type for all workers
49 | docker_image: "daskdev/dask:latest" # docker image to use
50 | vpc: null # VPC id for instances to join. Defaults to default VPC.
51 |     subnet_id: null # Subnet ID for instances to join. Defaults to all subnets in the default VPC.
52 | security_groups: [] # Security groups for instances. Will create a minimal Dask security group by default.
53 | filesystem_size: 40 # Default root filesystem size for scheduler and worker VMs in GB
54 | key_name: null # SSH Key name to assign to instances
55 |     iam_instance_profile: {} # IAM role to assign to instances
56 | # Arn: 'string'
57 | # Name: 'string'
58 | instance_tags:
59 | createdBy: dask-cloudprovider
60 | volume_tags:
61 | createdBy: dask-cloudprovider
62 | enable_detailed_monitoring: false
63 | use_private_ip: false
64 |
65 | azure:
66 | location: null # The Azure location to launch your cluster
67 | resource_group: null # The Azure resource group for the cluster
68 | subscription_id: null # The Azure subscription ID for the cluster
69 | azurevm:
70 | vnet: null # Azure Virtual Network to launch VMs in
71 | subnet: null # Azure Virtual Network subnet to launch VMs in
72 | security_group: null # Network security group to allow 8786 and 8787
73 | public_ingress: true # Assign a public IP address to the scheduler
74 | vm_size: "Standard_DS1_v2" # Azure VM size to use for scheduler and workers
75 | disk_size: 50 # Specifies the size of the VM host OS disk in gigabytes. This value cannot be larger than `1023`.
76 | scheduler_vm_size: null # Set a different VM size for the scheduler. Will use vm_size if not set
77 | docker_image: "daskdev/dask:latest" # docker image to use
78 | vm_image: # OS image to use for the virtual machines
79 | publisher: "Canonical"
80 | offer: "UbuntuServer"
81 | sku: "18.04-LTS"
82 | version: "latest"
83 | bootstrap: true # It is assumed that the VHD does not have Docker and needs bootstrapping. Set this to false if using a custom VHD with Docker already installed.
84 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
85 | marketplace_plan: null # This needs to be passed in if the user wants to use a Marketplace VM with a plan.
86 | # name: "ngc-base-version-21-02-2"
87 | # publisher: "nvidia"
88 | # product: "ngc_azure_17_11"
89 | extra_options: {} # Additional options to provide when creating the VMs.
90 |
91 | digitalocean:
92 | token: null # API token for interacting with the Digital Ocean API
93 | region: "nyc3" # Region to launch Droplets in
94 | size: "s-1vcpu-1gb" # Droplet size to launch, default is 1GB RAM, 1 vCPU
95 | image: "ubuntu-20-04-x64" # Operating System image to use
96 |
97 | gcp:
98 | source_image: "projects/ubuntu-os-cloud/global/images/ubuntu-minimal-1804-bionic-v20201014" # the gcp image to use for all instances
99 | zone: "us-east1-c" # the zone of where to launch the instances
100 | network: "default" # the network/subnetwork in GCP to use
101 | network_projectid: null # GCP project id where the network exists
102 | projectid: "" # name of the google cloud project
103 | on_host_maintenance: "TERMINATE"
104 | machine_type: "n1-standard-1" # size of the machine type to use for the scheduler and all workers
105 | scheduler_machine_type: "n1-standard-1" # size of the machine type to use for the scheduler
106 | worker_machine_type: "n1-standard-1" # size of the machine type to use for all workers
107 | filesystem_size: 50 # amount in GBs of hard drive space to allocate
108 | ngpus: "" # number of GPUs to use. If provided, will be used for both scheduler and worker
109 | gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. If provided, will be used for both scheduler and worker
110 | scheduler_ngpus: "" # number of GPUs to use on scheduler
111 | scheduler_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
112 | worker_ngpus: "" # number of GPUs to use on worker
113 | worker_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
114 | disk_type: "pd-standard" # type of disk to use: pd-standard, pd-ssd
115 | docker_image: "daskdev/dask:latest" # docker image to use
116 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
117 | public_ingress: true # configure the scheduler to be externally accessible. This assumes firewall rules for 8787 and 8786
118 | instance_labels:
119 | container_vm: "dask-cloudprovider"
120 | service_account: "default"
121 | instance_scopes: # OAuth2 scopes to assign to the service account on instances
122 | - "https://www.googleapis.com/auth/devstorage.read_write"
123 | - "https://www.googleapis.com/auth/logging.write"
124 | - "https://www.googleapis.com/auth/monitoring.write"
125 |
126 | hetzner:
127 | token: null # API token for interacting with the Hetzner cloud API
128 | location: "fsn1" # Location to launch vServer in
129 | server_type: "cx11" # vServer server type to launch, default is 2GB RAM, 1 vCPU
130 | image: "ubuntu-20.04" # Operating System image to use
131 | docker_image: "daskdev/dask:latest" # docker image to use
132 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed.
133 |
134 | ibm:
135 | api_key: null
136 | image: "ghcr.io/dask/dask:latest"
137 | region: us-east
138 | project_id: null
139 | scheduler_cpu: "1.0"
140 | scheduler_mem: 4G
141 | scheduler_disk: 400M
142 | scheduler_timeout: 600 # seconds
143 | scheduler_command: python -m distributed.cli.dask_scheduler --protocol ws
144 | worker_cpu: "2.0"
145 | worker_mem: 8G
146 | worker_disk: 400M
147 | worker_threads: 1
148 | worker_command: python -m distributed.cli.dask_spec
149 | docker_server: ""
150 | docker_username: ""
151 | docker_password: ""
152 |
153 | openstack:
154 | region: "RegionOne" # The name of the region where resources will be allocated in OpenStack. List available regions using: `openstack region list`.
155 | size: null # OpenStack flavors define the compute, memory, and storage capacity of computing instances. List available flavors using: `openstack flavor list`
156 | auth_url: null # The authentication URL for the OpenStack Identity service (Keystone). Example: https://cloud.example.com:5000
157 | application_credential_id: null # The application credential id created in OpenStack. Create application credentials using: openstack application credential create
158 | application_credential_secret: null # The secret associated with the application credential ID for authentication.
159 | auth_type: "v3applicationcredential" # The type of authentication used, typically "v3applicationcredential" for using OpenStack application credentials.
160 | network_id: null # The unique identifier for the internal/private network in OpenStack where the cluster VMs will be connected. List available networks using: `openstack network list`
161 | image: null # The OS image name or id to use for the VM. List available images using: `openstack image list`
162 | keypair_name: null # The name of the SSH keypair used for instance access. Ensure you have created a keypair or use an existing one. List available keypairs using: `openstack keypair list`
163 | security_group: null # The security group name that defines firewall rules for instances. List available security groups using: `openstack security group list`
164 | external_network_id: null # The ID of the external network used for assigning floating IPs. List available external networks using: `openstack network list --external`
165 | create_floating_ip: false # Specifies whether to assign a floating IP to each instance, enabling external access. Set to `True` if external connectivity is needed.
166 | docker_image: "daskdev/dask:latest" # docker image to use
167 | worker_threads: 2 # The number of threads to use on each worker.
168 | worker_command: null # str (optional) The command workers should run when starting. For example, ``dask-cuda-worker`` on GPU-enabled instances.
169 |
170 |
171 | nebius:
172 | token: null # IAM token for interacting with the Nebius AI Cloud
173 | project_id: null # You can find it in the Nebius AI Cloud console
174 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed.
175 | image_family: "ubuntu22.04-driverless" # it should be "ubuntu22.04-driverless" or "ubuntu22.04-cuda12" https://docs.nebius.com/compute/storage/manage#parameters-boot
176 | docker_image: "daskdev/dask:latest" # docker image to use
177 | server_platform: "cpu-d3" # all platforms https://docs.nebius.com/compute/virtual-machines/types
178 | server_preset: "4vcpu-16gb" # all presets https://docs.nebius.com/compute/virtual-machines/types
179 | disk_size: 64 # Specifies the size of the VM host OS disk in gigabytes.
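
# The values above are package defaults. Fields left as null are normally
# supplied as keyword arguments to the cluster manager or set via Dask's
# configuration system. An illustrative override from Python (the key path
# assumes this file is loaded under the top-level "cloudprovider" key):
#
#   import dask.config
#   dask.config.set({"cloudprovider.azure.location": "westus2"})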
--------------------------------------------------------------------------------
/doc/source/packer.rst:
--------------------------------------------------------------------------------
1 | Creating custom OS images with Packer
2 | =====================================
3 |
4 | Many of the cluster managers in Dask Cloudprovider create VMs and install dependencies on those VMs at boot time.
5 |
6 | This can slow down the creation and scaling of clusters, so this page discusses building custom images using `Packer <https://www.packer.io/>`_ to speed up cluster creation.
7 |
8 | Packer is a utility which boots up a VM on your desired cloud, runs any installation steps and then takes a snapshot of the VM for use as a template for creating
9 | new VMs later. This allows us to run through the installation steps once, and then reuse them when starting Dask components.
10 |
11 | Installing Packer
12 | -----------------
13 |
14 | See the `official install docs <https://www.packer.io/docs/install>`_.
15 |
16 | Packer Overview
17 | ---------------
18 |
19 | To create an image with Packer we need to write a JSON config file.
20 |
21 | A Packer config file is broken into a couple of sections, ``builders`` and ``provisioners``.
22 |
23 | A builder configures what type of image you are building (an AWS AMI, a GCP image, etc.). It describes the base
24 | image you are building on top of and connection information for Packer to connect to the build instance.
25 |
26 | When you run ``packer build /path/to/config.json`` a VM (or multiple VMs if you configure more than one) will be
27 | created automatically based on your ``builders`` config section.
28 |
29 | Once your build VM is up and running the ``provisioners`` will be run. These are steps to configure and provision your
30 | machine. In the examples below we are mostly using the ``shell`` provisioner which will run commands on the VM to set things
31 | up.
32 |
33 | Once your provisioning scripts have completed the VM will automatically stop, a snapshot will be taken and you will be provided
34 | with an ID which you can then use as a template in future runs of ``dask-cloudprovider``.
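
As a rough sketch of that two-section layout (every value below is an illustrative
placeholder rather than a working build), a skeleton config file could even be
generated from Python:

.. code-block:: python

    import json

    # Minimal two-section Packer config: one builder and one provisioner.
    skeleton = {
        "builders": [
            {
                "type": "amazon-ebs",         # which cloud plugin builds the image
                "region": "eu-west-2",        # where the temporary build VM runs
                "instance_type": "t2.micro",  # size of the temporary build VM
                "ssh_username": "ubuntu",     # user Packer connects as
                "ami_name": "my-image {{timestamp}}",  # name for the resulting snapshot
            }
        ],
        "provisioners": [
            {"type": "shell", "inline": ["echo 'install dependencies here'"]},
        ],
    }

    with open("config.json", "w") as f:
        json.dump(skeleton, f, indent=2)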
35 |
36 | Image Requirements
37 | ------------------
38 |
39 | Each cluster manager that uses VMs will have specific requirements for the VM image.
40 |
41 | The AWS ``ECSCluster``, for example, requires `ECS optimised AMIs <https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html>`_.
42 |
43 | The VM cluster managers such as ``EC2Cluster`` and ``DropletCluster`` just require `Docker <https://docs.docker.com/engine/install/>`_ to be installed (or `NVIDIA Docker <https://github.com/NVIDIA/nvidia-docker>`_ for GPU VM types).
44 |
45 | Examples
46 | --------
47 |
48 | ``EC2Cluster`` with cloud-init
49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50 |
51 | When any of the ``VMCluster`` based cluster managers, such as ``EC2Cluster``, launches a new VM with the default settings it uses the Ubuntu base image and installs all dependencies
52 | with `cloud-init <https://cloudinit.readthedocs.io/en/latest/>`_.
53 |
54 | Instead of doing this every time we could use Packer to do this once, and then reuse that image every time.
55 |
56 | Each ``VMCluster`` cluster manager has a class method called ``get_cloud_init`` which takes the same keyword arguments as creating the object itself, but instead
57 | returns the cloud-init file that would be generated.
58 |
59 | .. code-block:: python
60 |
61 | from dask_cloudprovider.aws import EC2Cluster
62 |
63 | cloud_init_config = EC2Cluster.get_cloud_init(
64 | # Pass any kwargs here you would normally pass to ``EC2Cluster``
65 | )
66 | print(cloud_init_config)
67 |
68 | We should see some output like this.
69 |
70 | .. code-block:: YAML
71 |
72 | #cloud-config
73 |
74 | packages:
75 | - apt-transport-https
76 | - ca-certificates
77 | - curl
78 | - gnupg-agent
79 | - software-properties-common
80 |
81 | # Enable ipv4 forwarding, required on CIS hardened machines
82 | write_files:
83 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
84 | content: |
85 | net.ipv4.conf.all.forwarding=1
86 |
87 | # create the docker group
88 | groups:
89 | - docker
90 |
91 | # Add default auto created user to docker group
92 | system_info:
93 | default_user:
94 | groups: [docker]
95 |
96 | runcmd:
97 |
98 | # Install Docker
99 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
100 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
101 | - apt-get update -y
102 | - apt-get install -y docker-ce docker-ce-cli containerd.io
103 | - systemctl start docker
104 | - systemctl enable docker
105 |
106 | # Run container
107 | - docker run --net=host daskdev/dask:latest dask-scheduler --version
108 |
109 | We should save this output somewhere for reference later. Let's refer to it as ``/path/to/cloud-init-config.yaml``.
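
For example, continuing from the snippet above, the rendered config can be written
straight to that path (the path itself is just a placeholder):

.. code-block:: python

    # Save the cloud-init config generated by ``get_cloud_init`` for Packer to use
    with open("/path/to/cloud-init-config.yaml", "w") as f:
        f.write(cloud_init_config)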
110 |
111 | Next we need a Packer config file to build our image; let's refer to it as ``/path/to/config.json``.
112 | We will use the official Ubuntu 20.04 image and specify our cloud-init config file in the ``user_data_file`` option.
113 |
114 | Packer will not necessarily wait for our cloud-init config to finish executing before taking a snapshot, so we need to add a provisioner
115 | that will block until cloud-init completes.
116 |
117 | .. code-block:: JSON
118 |
119 | {
120 | "builders": [
121 | {
122 | "type": "amazon-ebs",
123 | "region": "eu-west-2",
124 | "source_ami_filter": {
125 | "filters": {
126 | "virtualization-type": "hvm",
127 | "name": "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
128 | "root-device-type": "ebs"
129 | },
130 | "owners": [
131 | "099720109477"
132 | ],
133 | "most_recent": true
134 | },
135 | "instance_type": "t2.micro",
136 | "ssh_username": "ubuntu",
137 | "ami_name": "dask-cloudprovider {{timestamp}}",
138 | "user_data_file": "/path/to/cloud-init-config.yaml"
139 | }
140 | ],
141 | "provisioners": [
142 | {
143 | "type": "shell",
144 | "inline": [
145 | "echo 'Waiting for cloud-init'; while [ ! -f /var/lib/cloud/instance/boot-finished ]; do sleep 1; done; echo 'Done'"
146 | ]
147 | }
148 | ]
149 | }
150 |
151 | Then we can build our image with ``packer build /path/to/config.json``.
152 |
153 | .. code-block::
154 |
155 | $ packer build /path/to/config.json
156 | amazon-ebs: output will be in this color.
157 |
158 | ==> amazon-ebs: Prevalidating any provided VPC information
159 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider 1600875672
160 | amazon-ebs: Found Image ID: ami-062c2b6de9e9c54d3
161 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b6c99-46b5-6002-3126-8dcb1696f969
162 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b6c9a-bd7d-8bb3-58a8-d983f0e95a96
163 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups...
164 | ==> amazon-ebs: Launching a source AWS instance...
165 | ==> amazon-ebs: Adding tags to source instance
166 | amazon-ebs: Adding tag: "Name": "Packer Builder"
167 | amazon-ebs: Instance ID: i-0531483be973d60d8
168 | ==> amazon-ebs: Waiting for instance (i-0531483be973d60d8) to become ready...
169 | ==> amazon-ebs: Using ssh communicator to connect: 18.133.244.42
170 | ==> amazon-ebs: Waiting for SSH to become available...
171 | ==> amazon-ebs: Connected to SSH!
172 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell512450076
173 | amazon-ebs: Waiting for cloud-init
174 | amazon-ebs: Done
175 | ==> amazon-ebs: Stopping the source instance...
176 | amazon-ebs: Stopping instance
177 | ==> amazon-ebs: Waiting for the instance to stop...
178 | ==> amazon-ebs: Creating AMI dask-cloudprovider 1600875672 from instance i-0531483be973d60d8
179 | amazon-ebs: AMI: ami-064f8db7634d19647
180 | ==> amazon-ebs: Waiting for AMI to become ready...
181 | ==> amazon-ebs: Terminating the source AWS instance...
182 | ==> amazon-ebs: Cleaning up any extra volumes...
183 | ==> amazon-ebs: No volumes to clean up, skipping
184 | ==> amazon-ebs: Deleting temporary security group...
185 | ==> amazon-ebs: Deleting temporary keypair...
186 | Build 'amazon-ebs' finished after 4 minutes 5 seconds.
187 |
188 | ==> Wait completed after 4 minutes 5 seconds
189 |
190 | ==> Builds finished. The artifacts of successful builds are:
191 | --> amazon-ebs: AMIs were created:
192 | eu-west-2: ami-064f8db7634d19647
193 |
194 | Then to use our new image we can create an ``EC2Cluster`` specifying the AMI and disabling the automatic bootstrapping.
195 |
196 | .. code-block:: python
197 |
198 | from dask.distributed import Client
199 | from dask_cloudprovider.aws import EC2Cluster
200 |
201 | cluster = EC2Cluster(
202 | ami="ami-064f8db7634d19647", # AMI ID provided by Packer
203 | bootstrap=False
204 | )
205 | cluster.scale(2)
206 |
207 | client = Client(cluster)
208 | # Your cluster is ready to use
209 |
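When you have finished with the cluster, remember to shut it down so the EC2 instances
are terminated (the ``auto_shutdown`` option also covers the case where the scheduler or
worker services time out); a minimal sketch:

.. code-block:: python

    # Close the client and cluster, destroying the VMs that were created
    client.close()
    cluster.close()
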
210 | ``EC2Cluster`` with RAPIDS
211 | ^^^^^^^^^^^^^^^^^^^^^^^^^^
212 |
213 | To launch `RAPIDS <https://rapids.ai/>`_ on AWS EC2 we can select a GPU instance type, choose the official Deep Learning AMIs that Amazon provides and run the official RAPIDS Docker image.
214 |
215 | .. code-block:: python
216 |
217 | from dask_cloudprovider.aws import EC2Cluster
218 |
219 | cluster = EC2Cluster(
220 | ami="ami-0c7c7d78f752f8f17", # Deep Learning AMI (this ID varies by region so find yours in the AWS Console)
221 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
222 | instance_type="p3.2xlarge",
223 | bootstrap=False, # Docker is already installed on the Deep Learning AMI
224 | filesystem_size=120,
225 | )
226 | cluster.scale(2)
227 |
228 | However every time a VM is created by ``EC2Cluster`` the RAPIDS Docker image will need to be pulled from Docker Hub.
229 | The result is that the above snippet can take ~20 minutes to run, so let's create our own AMI which already has the RAPIDS image pulled.
230 |
231 | In our ``builders`` section we will specify that we want to build on top of the latest Deep Learning AMI by using the filter
232 | ``"Deep Learning AMI (Ubuntu 18.04) Version *"`` to match all versions and ``"most_recent": true`` to use the most recent.
233 |
234 | We also restrict the owners to ``898082745236`` which is the ID for the official image channel.
235 |
236 | The official image already has the NVIDIA drivers and the NVIDIA Docker runtime installed, so the only step we need to perform is to
237 | pull the RAPIDS Docker image. That way, when a scheduler or worker VM is created, the image will already be available on the machine.
238 |
239 | .. code-block:: JSON
240 |
241 | {
242 | "builders": [
243 | {
244 | "type": "amazon-ebs",
245 | "region": "eu-west-2",
246 | "source_ami_filter": {
247 | "filters": {
248 | "virtualization-type": "hvm",
249 | "name": "Deep Learning AMI (Ubuntu 18.04) Version *",
250 | "root-device-type": "ebs"
251 | },
252 | "owners": [
253 | "898082745236"
254 | ],
255 | "most_recent": true
256 | },
257 | "instance_type": "p3.2xlarge",
258 | "ssh_username": "ubuntu",
259 | "ami_name": "dask-cloudprovider-rapids {{timestamp}}"
260 | }
261 | ],
262 | "provisioners": [
263 | {
264 | "type": "shell",
265 | "inline": [
266 | "docker pull rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9"
267 | ]
268 | }
269 | ]
270 | }
271 |
272 | Then we can build our image with ``packer build /path/to/config.json``.
273 |
274 | .. code-block::
275 |
276 | $ packer build /path/to/config.json
277 | ==> amazon-ebs: Prevalidating any provided VPC information
278 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider-gpu 1600868638
279 | amazon-ebs: Found Image ID: ami-0c7c7d78f752f8f17
280 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b511e-d3a3-c607-559f-d466560cd23b
281 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b511f-8f62-cf98-ca54-5771f1423d2d
282 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups...
283 | ==> amazon-ebs: Launching a source AWS instance...
284 | ==> amazon-ebs: Adding tags to source instance
285 | amazon-ebs: Adding tag: "Name": "Packer Builder"
286 | amazon-ebs: Instance ID: i-077f54ed4ae6bcc66
287 | ==> amazon-ebs: Waiting for instance (i-077f54ed4ae6bcc66) to become ready...
288 | ==> amazon-ebs: Using ssh communicator to connect: 52.56.96.165
289 | ==> amazon-ebs: Waiting for SSH to become available...
290 | ==> amazon-ebs: Connected to SSH!
291 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell376445833
292 | amazon-ebs: Waiting for cloud-init
293 | amazon-ebs: Bootstrap complete
294 | ==> amazon-ebs: Stopping the source instance...
295 | amazon-ebs: Stopping instance
296 | ==> amazon-ebs: Waiting for the instance to stop...
297 | ==> amazon-ebs: Creating AMI dask-cloudprovider-gpu 1600868638 from instance i-077f54ed4ae6bcc66
298 | amazon-ebs: AMI: ami-04e5539cb82859e69
299 | ==> amazon-ebs: Waiting for AMI to become ready...
300 | ==> amazon-ebs: Terminating the source AWS instance...
301 | ==> amazon-ebs: Cleaning up any extra volumes...
302 | ==> amazon-ebs: No volumes to clean up, skipping
303 | ==> amazon-ebs: Deleting temporary security group...
304 | ==> amazon-ebs: Deleting temporary keypair...
305 | Build 'amazon-ebs' finished after 20 minutes 35 seconds.
306 |
307 | It took over 20 minutes to build this image, but now that we've done it once we can reuse the image in our RAPIDS powered Dask clusters.
308 |
309 | We can then run our code snippet again but this time it will take less than 5 minutes to get a running cluster.
310 |
311 | .. code-block:: python
312 |
313 | from dask.distributed import Client
314 | from dask_cloudprovider.aws import EC2Cluster
315 |
316 | cluster = EC2Cluster(
317 | ami="ami-04e5539cb82859e69", # AMI ID provided by Packer
318 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
319 | instance_type="p3.2xlarge",
320 | bootstrap=False,
321 | filesystem_size=120,
322 | )
323 | cluster.scale(2)
324 |
325 | client = Client(cluster)
326 | # Your cluster is ready to use
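
As an optional sanity check (this assumes ``cudf`` ships in the RAPIDS image used above),
you can confirm that every worker can see its GPU libraries:

.. code-block:: python

    def check_rapids():
        import cudf  # will fail if the RAPIDS libraries or GPU are unavailable
        return cudf.__version__

    # Run the check on every worker in the cluster
    client.run(check_rapids)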
327 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/vmcluster.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import os
4 | import uuid
5 |
6 | from jinja2 import Environment, FileSystemLoader
7 |
8 | import dask.config
9 | from distributed.core import Status
10 | from distributed.worker import Worker as _Worker
11 | from distributed.scheduler import Scheduler as _Scheduler
12 | from distributed.security import Security
13 | from distributed.deploy.spec import SpecCluster, ProcessInterface
14 | from distributed.utils import warn_on_duration, cli_keywords
15 |
16 | from dask_cloudprovider.utils.socket import is_socket_open
17 | from dask_cloudprovider.utils.config_helper import serialize_custom_config
18 |
19 |
20 | class VMInterface(ProcessInterface):
21 | """A superclass for VM Schedulers, Workers and Nannies."""
22 |
23 | def __init__(self, docker_args: str = "", extra_bootstrap: list = None, **kwargs):
24 | super().__init__()
25 | self.name = None
26 | self.command = None
27 | self.address = None
28 | self.cluster = None
29 | self.gpu_instance = None
30 | self.bootstrap = None
31 | self.docker_image = "daskdev/dask:latest"
32 | self.docker_args = docker_args
33 | self.extra_bootstrap = extra_bootstrap
34 | self.auto_shutdown = True
35 | self.set_env = f'env DASK_INTERNAL_INHERIT_CONFIG="{serialize_custom_config()}"'
36 | self.kwargs = kwargs
37 |
38 | async def create_vm(self):
39 | raise NotImplementedError("create_vm is a required method of the VMInterface")
40 |
41 | async def destroy_vm(self):
42 | raise NotImplementedError("destroy_vm is a required method of the VMInterface")
43 |
44 | async def wait_for_scheduler(self):
45 | if self.external_address:
46 | _, address = self.external_address.split("://")
47 | else:
48 | _, address = self.address.split("://")
49 | ip, port = address.split(":")
50 |
51 | self.cluster._log(f"Waiting for scheduler to run at {ip}:{port}")
52 | while not is_socket_open(ip, port):
53 | await asyncio.sleep(0.1)
54 | self.cluster._log("Scheduler is running")
55 |
56 | async def start(self):
57 | """Create a VM."""
58 | await super().start()
59 |
60 | async def close(self):
61 | """Destroy a VM."""
62 | await self.destroy_vm()
63 | await super().close()
64 |
65 | async def call_async(self, f, *args, **kwargs):
66 | """Run a blocking function in a thread as a coroutine."""
67 | return await self.cluster.call_async(f, *args, **kwargs)
68 |
69 |
70 | class SchedulerMixin(object):
71 | """A mixin for Schedulers."""
72 |
73 | def __init__(
74 | self,
75 | *args,
76 | scheduler_options: dict = {},
77 | **kwargs,
78 | ):
79 | super().__init__(*args, **kwargs)
80 | self.name = f"dask-{self.cluster.uuid}-scheduler"
81 | self.port = scheduler_options.get("port", 8786)
82 | self.command = " ".join(
83 | [
84 | self.set_env,
85 | "python",
86 | "-m",
87 | "distributed.cli.dask_scheduler",
88 | ]
89 | + cli_keywords(scheduler_options, cls=_Scheduler)
90 | )
91 |
92 | async def start(self):
93 | self.cluster._log("Creating scheduler instance")
94 |
95 | internal_ip, external_ip = await self.create_vm()
96 | self.address = f"{self.cluster.protocol}://{internal_ip}:{self.port}"
97 | if external_ip:
98 | self.external_address = (
99 | f"{self.cluster.protocol}://{external_ip}:{self.port}"
100 | )
101 |
102 | await self.wait_for_scheduler()
103 | await super().start()
104 |
105 |
106 | class WorkerMixin(object):
107 | """A Remote Dask Worker running on a VM."""
108 |
109 | def __init__(
110 | self,
111 | scheduler: str,
112 | *args,
113 | worker_module: str = None,
114 | worker_class: str = None,
115 | worker_options: dict = {},
116 | **kwargs,
117 | ):
118 | super().__init__(*args, **kwargs)
119 | self.scheduler = scheduler
120 | self.name = f"dask-{self.cluster.uuid}-worker-{str(uuid.uuid4())[:8]}"
121 | if worker_module is not None:
122 | self.worker_module = worker_module
123 |
124 | self.command = " ".join(
125 | [
126 | self.set_env,
127 | "python",
128 | "-m",
129 | self.worker_module,
130 | self.scheduler,
131 | "--name",
132 | str(self.name),
133 | ]
134 | + cli_keywords(worker_options, cls=_Worker, cmd=self.worker_module)
135 | )
136 | if worker_class is not None:
137 | self.worker_class = worker_class
138 | self.command = " ".join(
139 | [
140 | self.set_env,
141 | "python",
142 | "-m",
143 | "distributed.cli.dask_spec",
144 | self.scheduler,
145 | "--spec",
146 | "''%s''" # in yaml double single quotes escape the single quote
147 | % json.dumps(
148 | {
149 | "cls": self.worker_class,
150 | "opts": {
151 | **worker_options,
152 | "name": self.name,
153 | },
154 | }
155 | ),
156 | ]
157 | )
158 |
159 | async def start(self):
160 | self.cluster._log("Creating worker instance")
161 | self.address, _ = await self.create_vm()
162 | await super().start()
163 |
164 |
165 | class VMCluster(SpecCluster):
166 | """A base class for Virtual Machine based cluster managers.
167 |
168 | This class holds logic around starting a scheduler and workers as VMs. This class
169 | is not intended to be used directly but instead should be subclassed and the attributes
170 | ``scheduler_class`` and ``worker_class`` should be set.
171 |
172 | The scheduler class should be a subclass of ``VMInterface`` with the ``SchedulerMixin``.
173 | The worker class should be a subclass of ``VMInterface`` with the ``WorkerMixin``.
174 |
175 | See ``VMInterface`` docstring for required methods.
176 |
177 | For a reference implementation see :class:`DropletCluster`.
178 |
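A minimal structural sketch of such a subclass (the names below are placeholders, and
the provider-specific wiring, such as populating ``scheduler_options`` and
``worker_options`` in ``__init__``, is omitted) looks roughly like::

    class MyVM(VMInterface):
        async def create_vm(self):
            ...  # provision a VM, then return (internal_ip, external_ip)

        async def destroy_vm(self):
            ...  # terminate the VM

    class MyScheduler(SchedulerMixin, MyVM):
        pass

    class MyWorker(WorkerMixin, MyVM):
        pass

    class MyCluster(VMCluster):
        scheduler_class = MyScheduler
        worker_class = MyWorker
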
179 | The following parameters section should be copied to the subclass docstring and appended
180 | to the provider-specific parameters.
181 |
182 | Parameters
183 | ----------
184 | n_workers: int
185 | Number of workers to initialise the cluster with. Defaults to ``0``.
186 | worker_module: str
187 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
188 | worker_options: dict
189 | Params to be passed to the worker class.
190 | See :class:`distributed.worker.Worker` for default worker class.
191 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
192 | scheduler_options: dict
193 | Params to be passed to the scheduler class.
194 | See :class:`distributed.scheduler.Scheduler`.
195 | docker_image: string (optional)
196 | The Docker image to run on all instances.
197 |
198 | This image must have a valid Python environment and have ``dask`` installed in order for the
199 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended that the Python
200 | environment matches the local environment from which ``EC2Cluster`` is being created.
201 |
202 | For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed.
203 |
204 | By default the ``daskdev/dask:latest`` image will be used.
205 | docker_args: string (optional)
206 | Extra command line arguments to pass to Docker.
207 | extra_bootstrap: list[str] (optional)
208 | Extra commands to be run during the bootstrap phase.
209 | silence_logs: bool
210 | Whether or not we should silence logging when setting up the cluster.
211 | asynchronous: bool
212 | If this is intended to be used directly within an event loop with
213 | async/await
214 | security: Security or bool, optional
215 | Configures communication security in this cluster. Can be a security
216 | object, or True. If True, temporary self-signed credentials will
217 | be created automatically. Default is ``True``.
218 | debug: bool, optional
219 | More information will be printed when constructing clusters to enable debugging.
220 |
221 | """
222 |
223 | scheduler_class = None
224 | worker_class = None
225 | options = {}
226 | scheduler_options = {}
227 | worker_options = {}
228 | docker_image = None
229 | command = None
230 | gpu_instance = None
231 | bootstrap = None
232 | auto_shutdown = None
233 |
234 | def __init__(
235 | self,
236 | n_workers: int = 0,
237 | worker_class: str = "dask.distributed.Nanny",
238 | worker_options: dict = {},
239 | scheduler_options: dict = {},
240 | docker_image="daskdev/dask:latest",
241 | docker_args: str = "",
242 | extra_bootstrap: list = None,
243 | env_vars: dict = {},
244 | security: bool = True,
245 | protocol: str = None,
246 | debug: bool = False,
247 | **kwargs,
248 | ):
249 | if self.scheduler_class is None or self.worker_class is None:
250 | raise RuntimeError(
251 | "VMCluster is not intended to be used directly. See docstring for more info."
252 | )
253 | self._n_workers = n_workers
254 |
255 | if not security:
256 | self.security = None
257 | elif security is True:
258 | # True indicates self-signed temporary credentials should be used
259 | self.security = Security.temporary()
260 | elif not isinstance(security, Security):
261 | raise TypeError("security must be a Security object")
262 | else:
263 | self.security = security
264 |
265 | if protocol is None:
266 | if self.security and self.security.require_encryption:
267 | self.protocol = "tls"
268 | else:
269 | self.protocol = "tcp"
270 | else:
271 | self.protocol = protocol
272 |
273 | self.debug = debug
274 |
275 | if self.security and self.security.require_encryption:
276 | dask.config.set(
277 | {
278 | "distributed.comm.default-scheme": self.protocol,
279 | "distributed.comm.require-encryption": True,
280 | "distributed.comm.tls.ca-file": self.security.tls_ca_file,
281 | "distributed.comm.tls.scheduler.key": self.security.tls_scheduler_key,
282 | "distributed.comm.tls.scheduler.cert": self.security.tls_scheduler_cert,
283 | "distributed.comm.tls.worker.key": self.security.tls_worker_key,
284 | "distributed.comm.tls.worker.cert": self.security.tls_worker_cert,
285 | "distributed.comm.tls.client.key": self.security.tls_client_key,
286 | "distributed.comm.tls.client.cert": self.security.tls_client_cert,
287 | }
288 | )
289 |
290 | image = self.scheduler_options.get("docker_image", False) or docker_image
291 | self.options["docker_image"] = image
292 | self.scheduler_options["docker_image"] = image
293 | self.scheduler_options["env_vars"] = env_vars
294 | self.scheduler_options["protocol"] = protocol
295 | self.scheduler_options["scheduler_options"] = scheduler_options
296 | self.scheduler_options["extra_bootstrap"] = extra_bootstrap
297 | self.worker_options["env_vars"] = env_vars
298 | self.options["docker_args"] = docker_args
299 | self.options["extra_bootstrap"] = extra_bootstrap
300 | self.scheduler_options["docker_args"] = docker_args
301 | self.worker_options["docker_args"] = docker_args
302 | self.worker_options["docker_image"] = image
303 | self.worker_options["worker_class"] = worker_class
304 | self.worker_options["protocol"] = protocol
305 | self.worker_options["worker_options"] = worker_options
306 | self.worker_options["extra_bootstrap"] = extra_bootstrap
307 | self.uuid = str(uuid.uuid4())[:8]
308 |
309 | super().__init__(**kwargs, security=self.security)
310 |
311 | async def call_async(self, f, *args, **kwargs):
312 | """Run a blocking function in a thread as a coroutine.
313 |
314 | This can only be used to make IO-bound operations non-blocking due to the GIL.
315 |
316 | As of Python 3.9 this can be replaced with :func:`asyncio.to_thread`.
317 | Once 3.9 is our minimum supported version this can be removed/replaced.
318 |
319 | """
320 | [done], _ = await asyncio.wait(
321 | fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
322 | return_when=asyncio.ALL_COMPLETED,
323 | )
324 | return done.result()
325 |
326 | async def _start(
327 | self,
328 | ):
329 | while self.status == Status.starting:
330 | await asyncio.sleep(0.01)
331 | if self.status == Status.running:
332 | return
333 | if self.status == Status.closed:
334 | raise ValueError("Cluster is closed")
335 |
336 | self.scheduler_spec = {
337 | "cls": self.scheduler_class,
338 | "options": self.scheduler_options,
339 | }
340 | self.new_spec = {"cls": self.worker_class, "options": self.worker_options}
341 | self.worker_spec = {
342 | self._new_worker_name(i): self.new_spec for i in range(self._n_workers)
343 | }
344 |
345 | with warn_on_duration(
346 | "10s",
347 | "Creating your cluster is taking a surprisingly long time. "
348 | "This is likely due to pending resources. "
349 | "Hang tight! ",
350 | ):
351 | await super()._start()
352 |
353 | def render_process_cloud_init(self, process):
354 | return self.render_cloud_init(
355 | image=process.docker_image,
356 | command=process.command,
357 | docker_args=process.docker_args,
358 | extra_bootstrap=process.extra_bootstrap,
359 | gpu_instance=process.gpu_instance,
360 | bootstrap=process.bootstrap,
361 | auto_shutdown=process.auto_shutdown,
362 | env_vars=process.env_vars,
363 | )
364 |
365 | def render_cloud_init(self, *args, **kwargs):
366 | loader = FileSystemLoader([os.path.dirname(os.path.abspath(__file__))])
367 | environment = Environment(loader=loader)
368 | template = environment.get_template("cloud-init.yaml.j2")
369 | cloud_init = template.render(**kwargs)
370 | if self.debug:
371 | print("\nCloud init\n==========\n\n")
372 | print(cloud_init)
373 | return cloud_init
374 |
375 | @classmethod
376 | def get_cloud_init(cls, *args, **kwargs):
377 | cluster = cls(*args, asynchronous=True, **kwargs)
378 | cluster.auto_shutdown = False
379 | return cluster.render_cloud_init(
380 | image=cluster.options["docker_image"],
381 | command="dask-scheduler --version",
382 | docker_args=cluster.options["docker_args"],
383 | extra_bootstrap=cluster.options["extra_bootstrap"],
384 | gpu_instance=cluster.gpu_instance,
385 | bootstrap=cluster.bootstrap,
386 | auto_shutdown=cluster.auto_shutdown,
387 | env_vars=cluster.worker_options["env_vars"],
388 | )
389 |
390 | def get_tags(self):
391 | """Generate tags to be applied to all resources."""
392 | return {"creator": "dask-cloudprovider", "cluster-id": self.uuid}
393 |
--------------------------------------------------------------------------------