├── pytest.ini
├── .gitattributes
├── dask_cloudprovider
│   ├── gcp
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   ├── test_utils.py
│   │   │   └── test_gcp.py
│   │   └── utils.py
│   ├── hetzner
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_vserver.py
│   │   └── vserver.py
│   ├── nebius
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_nebius.py
│   │   └── instances.py
│   ├── digitalocean
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_droplet.py
│   │   └── droplet.py
│   ├── ibm
│   │   ├── __init__.py
│   │   └── tests
│   │       └── test_code_engine.py
│   ├── openstack
│   │   ├── __init__.py
│   │   └── tests
│   │       └── test_instances.py
│   ├── aws
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   ├── test_helper.py
│   │   │   ├── test_ecs.py
│   │   │   └── test_ec2.py
│   │   └── helper.py
│   ├── exceptions.py
│   ├── azure
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_azurevm.py
│   │   └── utils.py
│   ├── utils
│   │   ├── socket.py
│   │   ├── logs.py
│   │   ├── config_helper.py
│   │   ├── tests
│   │   │   └── test_config_helper.py
│   │   └── timeout.py
│   ├── conftest.py
│   ├── tests
│   │   └── test_imports.py
│   ├── config.py
│   ├── generic
│   │   ├── tests
│   │   │   └── test_vmcluster.py
│   │   ├── cloud-init.yaml.j2
│   │   └── vmcluster.py
│   ├── __init__.py
│   ├── cli
│   │   └── ecs.py
│   └── cloudprovider.yaml
├── requirements_test.txt
├── doc
│   ├── requirements-docs.txt
│   ├── source
│   │   ├── releasing.rst
│   │   ├── installation.rst
│   │   ├── hetzner.rst
│   │   ├── digitalocean.rst
│   │   ├── aws.rst
│   │   ├── nebius.rst
│   │   ├── testing.rst
│   │   ├── ibm.rst
│   │   ├── gpus.rst
│   │   ├── gcp.rst
│   │   ├── config.rst
│   │   ├── alternatives.rst
│   │   ├── security.rst
│   │   ├── openstack.rst
│   │   ├── index.rst
│   │   ├── troubleshooting.rst
│   │   ├── azure.rst
│   │   ├── conf.py
│   │   └── packer.rst
│   ├── Makefile
│   └── make.bat
├── requirements.txt
├── CONTRIBUTING.md
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── MANIFEST.in
├── ci
│   ├── scripts
│   │   └── test_imports.sh
│   ├── environment-3.10.yml
│   ├── environment-3.11.yml
│   └── environment-3.12.yml
├── .github
│   └── workflows
│       ├── release.yml
│       └── ci.yml
├── README.rst
├── setup.py
├── LICENSE
├── setup.cfg
└── .gitignore
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | asyncio_mode = auto
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | dask_cloudprovider/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import GCPCluster
2 |
--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-asyncio
3 | pytest-timeout
4 |
--------------------------------------------------------------------------------
/dask_cloudprovider/hetzner/__init__.py:
--------------------------------------------------------------------------------
1 | from .vserver import HetznerCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/nebius/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import NebiusCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/digitalocean/__init__.py:
--------------------------------------------------------------------------------
1 | from .droplet import DropletCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/ibm/__init__.py:
--------------------------------------------------------------------------------
1 | from .code_engine import IBMCodeEngineCluster
2 |
--------------------------------------------------------------------------------
/dask_cloudprovider/openstack/__init__.py:
--------------------------------------------------------------------------------
1 | from .instances import OpenStackCluster
2 |
--------------------------------------------------------------------------------
/doc/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | numpydoc
2 | docutils
3 | sphinx>=8
4 | dask-sphinx-theme>=4.0.0
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp>=3.7.3
2 | dask>=2021.01.1
3 | distributed>=2021.01.1
4 | jinja2
5 | tornado>=5
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/__init__.py:
--------------------------------------------------------------------------------
1 | from .ec2 import EC2Cluster
2 | from .ecs import ECSCluster, FargateCluster
3 |
--------------------------------------------------------------------------------
/dask_cloudprovider/exceptions.py:
--------------------------------------------------------------------------------
1 | class ConfigError(Exception):
2 | """Raised when required config is missing"""
3 |
--------------------------------------------------------------------------------
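A minimal sketch of how this exception is intended to be used by the cluster managers; the token check below is illustrative rather than taken from any specific provider module.

from dask_cloudprovider.exceptions import ConfigError


def require_token(config):
    # Raise a descriptive error when required configuration is missing
    token = config.get("token")
    if token is None:
        raise ConfigError(
            "No API token found. Set cloudprovider.<provider>.token in your Dask config."
        )
    return token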
/dask_cloudprovider/azure/__init__.py:
--------------------------------------------------------------------------------
1 | from .azurevm import AzureVMCluster
2 | from .utils import AzurePreemptibleWorkerPlugin
3 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more.
2 |
3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html.
4 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/socket.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 |
4 | def is_socket_open(ip, port):
5 | connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
6 | try:
7 | connection.connect((ip, int(port)))
8 | connection.shutdown(2)
9 | return True
10 | except Exception:
11 | return False
12 |
--------------------------------------------------------------------------------
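A small usage sketch, assuming you want to poll a freshly booted scheduler VM until its Dask port (8786 by default) accepts connections; the address below is illustrative.

import time

from dask_cloudprovider.utils.socket import is_socket_open

scheduler_ip = "203.0.113.10"  # example address
while not is_socket_open(scheduler_ip, 8786):
    time.sleep(1)  # wait for the scheduler to come up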
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 23.10.1
4 | hooks:
5 | - id: black
6 | language_version: python3
7 | exclude: versioneer.py
8 | - repo: https://github.com/pycqa/flake8
9 | rev: 6.1.0
10 | hooks:
11 | - id: flake8
12 | language_version: python3
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sphinx:
4 | configuration: doc/source/conf.py
5 |
6 | formats: all
7 |
8 | python:
9 | install:
10 | - method: pip
11 | path: .
12 | extra_requirements:
13 | - all
14 | - requirements: doc/requirements-docs.txt
15 |
16 | submodules:
17 | include: all
18 |
19 | build:
20 | os: ubuntu-22.04
21 | tools:
22 | python: "3.12"
23 |
--------------------------------------------------------------------------------
/doc/source/releasing.rst:
--------------------------------------------------------------------------------
1 | Releasing
2 | =========
3 |
4 | Releases are published automatically when a tag is pushed to GitHub.
5 |
6 | .. code-block:: bash
7 |
8 | # Set next version number
9 | export RELEASE=x.x.x
10 |
11 | # Create tags
12 | git commit --allow-empty -m "Release $RELEASE"
13 | git tag -a $RELEASE -m "Version $RELEASE"
14 |
15 | # Push
16 | git push upstream --tags
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include dask_cloudprovider *.py
2 | recursive-include dask_cloudprovider *.yaml
3 | recursive-include dask_cloudprovider *.j2
4 |
5 | include setup.py
6 | include setup.cfg
7 | include LICENSE
8 | include README.rst
9 | include requirements.txt
10 | include MANIFEST.in
11 | include versioneer.py
12 |
13 | recursive-exclude * __pycache__
14 | recursive-exclude * *.py[co]
15 | include dask_cloudprovider/_version.py
16 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dask_cloudprovider.gcp.utils import build_request, is_inside_gce
4 |
5 |
6 | def test_build_request():
7 | assert build_request()(None, lambda x: x, "https://example.com")
8 |
9 |
10 | @pytest.mark.xfail(
11 | is_inside_gce(), reason="Fails if you run this test on GCE environment"
12 | )
13 | def test_is_gce_env():
14 | # Note: this test isn't super valuable, but at least we run the code
15 | assert is_inside_gce() is False
16 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?= -a
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/logs.py:
--------------------------------------------------------------------------------
1 | class Log(str):
2 | """A container for logs."""
3 |
4 | def _widget(self):
5 | from ipywidgets import HTML
6 |
7 | return HTML(value="<pre><code>{logs}</code></pre>".format(logs=self))
8 |
9 | def _ipython_display_(self, **kwargs):
10 | return self._widget()._ipython_display_(**kwargs)
11 |
12 |
13 | class Logs(dict):
14 | """A container for multiple logs."""
15 |
16 | def _widget(self):
17 | from ipywidgets import Accordion
18 |
19 | accordion = Accordion(children=[log._widget() for log in self.values()])
20 | [accordion.set_title(i, title) for i, title in enumerate(self.keys())]
21 | return accordion
22 |
23 | def _ipython_display_(self, **kwargs):
24 | return self._widget()._ipython_display_(**kwargs)
25 |
--------------------------------------------------------------------------------
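A brief sketch of how these containers behave (the log text is illustrative); in a Jupyter notebook the objects render as ipywidgets, otherwise they act like a plain ``str`` and ``dict``.

from dask_cloudprovider.utils.logs import Log, Logs

logs = Logs(
    {
        "scheduler": Log("distributed.scheduler - INFO - Scheduler at tcp://..."),
        "worker-0": Log("distributed.worker - INFO - Start worker at tcp://..."),
    }
)
logs["scheduler"]  # behaves like a string; displays as an HTML widget in notebooks
logs               # displays as an Accordion widget, one section per log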
/doc/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | Pip
5 | ---
6 |
7 | .. code-block:: console
8 |
9 | $ pip install dask-cloudprovider[all]
10 |
11 | You can also restrict your install to just a specific cloud provider by giving their name instead of ``all``.
12 |
13 | .. code-block:: console
14 |
15 | $ pip install dask-cloudprovider[aws] # or
16 | $ pip install dask-cloudprovider[azure] # or
17 | $ pip install dask-cloudprovider[azureml] # or
18 | $ pip install dask-cloudprovider[digitalocean] # or
19 | $ pip install dask-cloudprovider[gcp] # or
20 | $ pip install dask-cloudprovider[ibm] # or
21 | $ pip install dask-cloudprovider[openstack] # or
22 | $ pip install dask-cloudprovider[nebius]
23 |
24 | Conda
25 | -----
26 |
27 | .. code-block:: console
28 |
29 | $ conda install -c conda-forge dask-cloudprovider
--------------------------------------------------------------------------------
/dask_cloudprovider/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def pytest_addoption(parser):
5 | parser.addoption(
6 | "--create-external-resources",
7 | action="store_true",
8 | default=False,
9 | help="Run tests that create external resources.",
10 | )
11 |
12 |
13 | def pytest_configure(config):
14 | config.addinivalue_line(
15 | "markers", "external: mark test as creates external resources"
16 | )
17 |
18 |
19 | def pytest_collection_modifyitems(config, items):
20 | if config.getoption("--create-external-resources"):
21 | # --create-external-resources given on the CLI: do not skip external tests
22 | return
23 | skip_slow = pytest.mark.skip(
24 | reason="need --create-external-resources option to run"
25 | )
26 | for item in items:
27 | if "external" in item.keywords:
28 | item.add_marker(skip_slow)
29 |
--------------------------------------------------------------------------------
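The ``external`` marker registered above is how provider tests opt in to this skip logic; a minimal sketch of a marked test follows (the body is hypothetical).

import pytest


@pytest.mark.external  # skipped unless pytest is given --create-external-resources
async def test_creates_cloud_resources():
    ...  # provision real resources on a cloud provider and assert against them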
/ci/scripts/test_imports.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -o errexit
3 |
4 |
5 | test_import () {
6 | echo "Create environment: python=3.12 $1"
7 | # Create an empty environment
8 | conda create -q -y -n test-imports -c conda-forge python=3.12
9 | conda activate test-imports
10 | pip install -e .[$1]
11 | echo "python -c '$2'"
12 | python -c "$2"
13 | echo "Success [$1] 🚀"
14 | conda deactivate
15 | conda env remove -n test-imports
16 | }
17 |
18 | test_import "aws" "import dask_cloudprovider.aws"
19 | test_import "azure" "import dask_cloudprovider.azure"
20 | test_import "digitalocean" "import dask_cloudprovider.digitalocean"
21 | test_import "gcp" "import dask_cloudprovider.gcp"
22 | test_import "ibm" "import dask_cloudprovider.ibm"
23 | test_import "openstack" "import dask_cloudprovider.openstack"
24 |
--------------------------------------------------------------------------------
/ci/environment-3.10.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.10
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/ci/environment-3.11.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.11
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/ci/environment-3.12.yml:
--------------------------------------------------------------------------------
1 | name: dask-cloudprovider-test
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.12
7 | - nomkl
8 | - pip
9 | # Dask
10 | - dask
11 | # testing / CI
12 | - flake8
13 | - ipywidgets
14 | - pytest
15 | - pytest-asyncio
16 | - black >=20.8b1
17 | - pyyaml
18 | # dask dependencies
19 | - cloudpickle
20 | - toolz
21 | - cytoolz
22 | - numpy
23 | - partd
24 | # distributed dependencies
25 | - click >=6.6
26 | - msgpack-python
27 | - psutil >=5.0
28 | - six
29 | - sortedcontainers !=2.0.0,!=2.0.1
30 | - tblib
31 | - tornado >=5
32 | - zict >=0.1.3
33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212
34 | - pytest-asyncio >=0.14.0
35 | - pytest-timeout
36 | - pip:
37 | - git+https://github.com/dask/dask.git@main
38 | - git+https://github.com/dask/distributed@main
39 |
--------------------------------------------------------------------------------
/doc/source/hetzner.rst:
--------------------------------------------------------------------------------
1 | Hetzner
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.hetzner
5 |
6 | .. autosummary::
7 | HetznerCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with Hetzner you must first generate a
16 | `personal access token `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.hetzner.token``. This can be done by
19 | adding the token to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | hetzner:
27 | token: "yourtoken"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken"
32 |
33 |
34 | .. autoclass:: HetznerCluster
35 | :members:
36 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Build distribution
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: "ubuntu-latest"
8 |
9 | steps:
10 | - name: Checkout source
11 | uses: actions/checkout@v2
12 |
13 | - name: Set up Python 3.12
14 | uses: actions/setup-python@v1
15 | with:
16 | python-version: 3.12
17 |
18 | - name: Install pypa/build
19 | run: python -m pip install build wheel setuptools
20 |
21 | - name: Build distributions
22 | shell: bash -l {0}
23 | run: python setup.py sdist bdist_wheel
24 |
25 | - name: Publish package to PyPI
26 | if: github.repository == 'dask/dask-cloudprovider' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
27 | uses: pypa/gh-action-pypi-publish@master
28 | with:
29 | user: __token__
30 | password: ${{ secrets.pypi_password }}
31 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/doc/source/digitalocean.rst:
--------------------------------------------------------------------------------
1 | DigitalOcean
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.digitalocean
5 |
6 | .. autosummary::
7 | DropletCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with DigitalOcean you must first generate a
16 | `personal access token `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.digitalocean.token``. This can be done by
19 | adding the token to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | digitalocean:
27 | token: "yourtoken"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken"
32 |
33 | Droplet
34 | -------
35 |
36 | .. autoclass:: DropletCluster
37 | :members:
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/utils.py:
--------------------------------------------------------------------------------
1 | import httplib2
2 | import googleapiclient.http
3 | import google_auth_httplib2
4 |
5 |
6 | def build_request(credentials=None):
7 | def inner(http, *args, **kwargs):
8 | new_http = httplib2.Http()
9 | if credentials is not None:
10 | new_http = google_auth_httplib2.AuthorizedHttp(credentials, http=new_http)
11 |
12 | return googleapiclient.http.HttpRequest(new_http, *args, **kwargs)
13 |
14 | return inner
15 |
16 |
17 | def is_inside_gce() -> bool:
18 | """
19 | Returns True if the client is running in the GCE environment,
20 | False otherwise.
21 |
22 | Doc: https://cloud.google.com/compute/docs/storing-retrieving-metadata
23 | """
24 | h = httplib2.Http()
25 | try:
26 | resp_headers, _ = h.request(
27 | "http://metadata.google.internal/computeMetadata/v1/",
28 | headers={"metadata-flavor": "Google"},
29 | method="GET",
30 | )
31 | except (httplib2.HttpLib2Error, OSError):
32 | return False
33 | return True
34 |
--------------------------------------------------------------------------------
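For context, a hedged sketch of how ``build_request`` is typically handed to ``googleapiclient`` (the ``requestBuilder`` keyword is part of ``googleapiclient.discovery.build``; the service name and credential lookup here are illustrative).

import google.auth
from googleapiclient.discovery import build

from dask_cloudprovider.gcp.utils import build_request

# Using a fresh httplib2.Http per request keeps the client safe to use from
# multiple threads/coroutines, which is why build_request exists.
credentials, _project = google.auth.default()
compute = build(
    "compute",
    "v1",
    requestBuilder=build_request(credentials),
    cache_discovery=False,
)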
/dask_cloudprovider/utils/config_helper.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import dask.config
3 |
4 |
5 | def prune_defaults(cfg: dict, defaults: dict) -> dict:
6 | """
7 | Recursively remove any key in cfg whose value exactly equals
8 | the corresponding built-in default.
9 | """
10 | pruned = {}
11 | for key, val in cfg.items():
12 | if key not in defaults:
13 | pruned[key] = val
14 | else:
15 | default_val = defaults[key]
16 | if isinstance(val, dict) and isinstance(default_val, dict):
17 | nested = prune_defaults(val, default_val)
18 | if nested:
19 | pruned[key] = nested
20 | elif val != default_val:
21 | pruned[key] = val
22 | return pruned
23 |
24 |
25 | def serialize_custom_config() -> str:
26 | """
27 | Pull out only the user-overrides from global_config and serialize them.
28 | """
29 | user_cfg = copy.deepcopy(dask.config.global_config)
30 | defaults = dask.config.merge(*dask.config.defaults)
31 | pruned = prune_defaults(user_cfg, defaults)
32 | return dask.config.serialize(pruned)
33 |
--------------------------------------------------------------------------------
/dask_cloudprovider/tests/test_imports.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def test_imports():
5 | from dask_cloudprovider.aws import EC2Cluster # noqa
6 | from dask_cloudprovider.aws import ECSCluster # noqa
7 | from dask_cloudprovider.aws import FargateCluster # noqa
8 | from dask_cloudprovider.azure import AzureVMCluster # noqa
9 | from dask_cloudprovider.gcp import GCPCluster # noqa
10 | from dask_cloudprovider.digitalocean import DropletCluster # noqa
11 | from dask_cloudprovider.hetzner import HetznerCluster # noqa
12 |
13 |
14 | def test_import_exceptions():
15 | with pytest.raises(ImportError):
16 | from dask_cloudprovider import EC2Cluster # noqa
17 | with pytest.raises(ImportError):
18 | from dask_cloudprovider import ECSCluster # noqa
19 | with pytest.raises(ImportError):
20 | from dask_cloudprovider import FargateCluster # noqa
21 | with pytest.raises(ImportError):
22 | from dask_cloudprovider import AzureVMCluster # noqa
23 | with pytest.raises(ImportError):
24 | from dask_cloudprovider import GCPCluster # noqa
25 | with pytest.raises(ImportError):
26 | from dask_cloudprovider import DropletCluster # noqa
27 |
--------------------------------------------------------------------------------
/doc/source/aws.rst:
--------------------------------------------------------------------------------
1 | Amazon Web Services (AWS)
2 | =========================
3 |
4 | .. currentmodule:: dask_cloudprovider.aws
5 |
6 | .. autosummary::
7 | EC2Cluster
8 | ECSCluster
9 | FargateCluster
10 |
11 | Overview
12 | --------
13 |
14 | Authentication
15 | ^^^^^^^^^^^^^^
16 |
17 | In order to create clusters on AWS you need to set your access key, secret key
18 | and region. The simplest way is to use the aws command line tool.
19 |
20 | .. code-block:: console
21 |
22 | $ pip install awscli
23 | $ aws configure
24 |
25 |
26 | Credentials
27 | ^^^^^^^^^^^
28 |
29 | In order for your Dask workers to be able to connect to other AWS resources such as S3 they will need credentials.
30 |
31 | This can be done by attaching IAM roles to individual resources or by passing credentials as environment variables. See
32 | each cluster manager docstring for more information.
33 |
34 | Elastic Compute Cloud (EC2)
35 | ---------------------------
36 |
37 | .. autoclass:: EC2Cluster
38 | :members:
39 |
40 | Elastic Container Service (ECS)
41 | -------------------------------
42 |
43 | .. autoclass:: ECSCluster
44 | :members:
45 |
46 | Fargate
47 | -------
48 |
49 | .. autoclass:: FargateCluster
50 | :members:
51 |
--------------------------------------------------------------------------------
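The environment-variable route mentioned above can look roughly like this; a hedged sketch, assuming the cluster manager accepts an ``environment`` mapping (check the relevant docstring for the exact keyword).

import os

from dask_cloudprovider.aws import FargateCluster

# Forward local AWS credentials to the scheduler and worker tasks so they can
# reach S3. Attaching an IAM task role is generally the preferred alternative.
cluster = FargateCluster(
    environment={
        "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
        "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
    }
)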
/dask_cloudprovider/config.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division, absolute_import
2 |
3 | import os
4 |
5 | import dask
6 | import yaml
7 |
8 |
9 | class ClusterConfig(dict):
10 | """Simple config interface for dask-cloudprovider clusters, such as `AzureVMCluster`.
11 |
12 | Enables '.' notation for nested access, as per `dask.config.get`.
13 |
14 | Example
15 | -------
16 |
17 | >>> from dask_cloudprovider.config import ClusterConfig
18 | >>> class RandomCluster(VMCluster):
19 | ... def __init__(self, option=None):
20 | ... self.config = ClusterConfig(dask.config.get("cloudprovider.random", {}))
21 | ... self.option = self.config.get("option", override_with=option)
22 |
23 | """
24 |
25 | def __new__(cls, d):
26 | return super().__new__(cls, d)
27 |
28 | def get(self, key, default=None, override_with=None):
29 | return dask.config.get(
30 | key, default=default, config=self, override_with=override_with
31 | )
32 |
33 |
34 | fn = os.path.join(os.path.dirname(__file__), "cloudprovider.yaml")
35 | dask.config.ensure_file(source=fn)
36 |
37 | with open(fn) as f:
38 | defaults = yaml.safe_load(f)
39 |
40 | dask.config.update_defaults(defaults)
41 |
--------------------------------------------------------------------------------
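A small sketch of the dot-notation access this class provides (the keys and values below are illustrative).

from dask_cloudprovider.config import ClusterConfig

config = ClusterConfig({"azurevm": {"vm_size": "Standard_DS1_v2"}})

config.get("azurevm.vm_size")                        # -> "Standard_DS1_v2"
config.get("azurevm.location", default="westus2")    # -> falls back to the default
config.get("azurevm.vm_size", override_with="Standard_DS2_v2")  # -> keyword argument wins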
/README.rst:
--------------------------------------------------------------------------------
1 |
2 | Dask Cloud Provider
3 | ===================
4 |
5 |
6 | .. image:: https://github.com/dask/dask-cloudprovider/actions/workflows/ci.yml/badge.svg
7 | :target: https://github.com/dask/dask-cloudprovider/actions?query=workflow%3ACI
8 | :alt: Build Status
9 |
10 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&logo=read-the-docs&logoColor=white
11 | :target: https://cloudprovider.dask.org/
12 | :alt: Read the Docs
13 |
14 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&label=developer%20docs&logo=read-the-docs&logoColor=white
15 | :target: https://cloudprovider.dask.org/releasing.html
16 | :alt: Read the Docs Developer
17 |
18 | .. image:: https://img.shields.io/pypi/v/dask-cloudprovider
19 | :target: https://pypi.org/project/dask-cloudprovider/
20 | :alt: PyPI
21 |
22 | .. image:: https://img.shields.io/conda/vn/conda-forge/dask-cloudprovider
23 | :target: https://anaconda.org/conda-forge/dask-cloudprovider
24 | :alt: Conda Forge
25 |
26 |
27 | Native Cloud integration for Dask.
28 |
29 | This library provides tools to enable Dask clusters to more natively integrate with the cloud.
30 | It includes cluster managers to create Dask clusters on a given cloud provider using native resources,
31 | plugins to more closely integrate Dask components with the cloud platform they are running on, and documentation to empower all folks running Dask on the cloud.
32 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/tests/test_vmcluster.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import asyncio
4 | import time
5 |
6 | from dask_cloudprovider.generic.vmcluster import VMCluster, VMInterface
7 |
8 |
9 | class DummyWorker(VMInterface):
10 | """A dummy worker for testing."""
11 |
12 |
13 | class DummyScheduler(VMInterface):
14 | """A dummy scheduler for testing."""
15 |
16 |
17 | class DummyCluster(VMCluster):
18 | """A dummy cluster for testing."""
19 |
20 | scheduler_class = DummyScheduler
21 | worker_class = DummyWorker
22 |
23 |
24 | @pytest.mark.asyncio
25 | async def test_init():
26 | with pytest.raises(RuntimeError):
27 | _ = VMCluster(asynchronous=True)
28 |
29 |
30 | @pytest.mark.asyncio
31 | async def test_call_async():
32 | cluster = DummyCluster(asynchronous=True)
33 |
34 | def blocking(string):
35 | time.sleep(0.1)
36 | return string
37 |
38 | start = time.time()
39 |
40 | a, b, c, d = await asyncio.gather(
41 | cluster.call_async(blocking, "hello"),
42 | cluster.call_async(blocking, "world"),
43 | cluster.call_async(blocking, "foo"),
44 | cluster.call_async(blocking, "bar"),
45 | )
46 |
47 | assert a == "hello"
48 | assert b == "world"
49 | assert c == "foo"
50 | assert d == "bar"
51 |
52 | # Each call to ``blocking`` takes 0.1 seconds, but they should've been run concurrently.
53 | assert time.time() - start < 0.2
54 |
55 | await cluster.close()
56 |
--------------------------------------------------------------------------------
/doc/source/nebius.rst:
--------------------------------------------------------------------------------
1 | Nebius
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.nebius
5 |
6 | .. autosummary::
7 | NebiusCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 |
16 | Before creating clusters on Nebius, you must configure your authentication credentials. You can do this using the `nebius` `command line tool `_.
17 |
18 | After obtaining your credentials, add them to your Dask configuration under:
19 |
20 | * cloudprovider.nebius.token
21 | * cloudprovider.nebius.project_id
22 |
23 | You can specify these values by either:
24 |
25 | #. Adding the values of the ``NB_IAM_TOKEN`` and ``NB_PROJECT_ID`` environment variables to your YAML configuration.
26 |
27 | .. code-block:: yaml
28 |
29 | # ~/.config/dask/cloudprovider.yaml
30 |
31 | cloudprovider:
32 | nebius:
33 | token: "your_iam_token"
34 | project_id: "your_project_id"
35 |
36 | #. Exporting them as environment variables in your shell.
37 |
38 | .. code-block:: console
39 |
40 | $ export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token)
41 | $ export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID="your_project_id"
42 |
43 | Dask Configuration
44 | ^^^^^^^^^^^^^^^^^^
45 |
46 | You can change the configuration of ``server_platform``, ``server_preset`` and ``image_family``. A list of all available platforms and presets can be found in the `Nebius docs `_.
47 |
48 | .. autoclass:: NebiusCluster
49 | :members:
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from os.path import exists
4 | from setuptools import setup, find_packages
5 |
6 | import versioneer
7 |
8 | extras_require = {
9 | "aws": ["aiobotocore>=0.10.2"],
10 | "azure": [
11 | "azure-mgmt-compute>=18.0.0",
12 | "azure-mgmt-network>=16.0.0",
13 | "azure-identity",
14 | ],
15 | "digitalocean": ["python-digitalocean>=1.15.0"],
16 | "gcp": ["google-api-python-client>=1.12.5", "google-auth>=1.23.0"],
17 | "hetzner": ["hcloud>=1.10.0"],
18 | "ibm": ["ibm_code_engine_sdk>=3.1.0", "kubernetes>=25.3.0"],
19 | "openstack": ["openstacksdk>=3.3.0"],
20 | "nebius": ["nebius>=0.2.0"],
21 | }
22 | extras_require["all"] = set(pkg for pkgs in extras_require.values() for pkg in pkgs)
23 |
24 | setup(
25 | name="dask-cloudprovider",
26 | cmdclass=versioneer.get_cmdclass(),
27 | version=versioneer.get_version(),
28 | description="Native Cloud Provider integration for Dask",
29 | url="https://github.com/dask/dask-cloudprovider",
30 | keywords="dask,cloud,distributed",
31 | license="BSD",
32 | packages=find_packages(),
33 | include_package_data=True,
34 | long_description=(open("README.rst").read() if exists("README.rst") else ""),
35 | long_description_content_type="text/x-rst",
36 | zip_safe=False,
37 | install_requires=list(open("requirements.txt").read().strip().split("\n")),
38 | extras_require=extras_require,
39 | entry_points="""
40 | [console_scripts]
41 | dask-ecs=dask_cloudprovider.cli.ecs:go
42 | """,
43 | python_requires=">=3.10",
44 | )
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, NVIDIA Corporation
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/doc/source/testing.rst:
--------------------------------------------------------------------------------
1 | Testing
2 | =======
3 |
4 | Tests in ``dask-cloudprovider`` are written and run using ``pytest``.
5 |
6 | To set up your testing environment run:
7 |
8 | .. code-block:: bash
9 |
10 | pip install -r requirements_test.txt
11 |
12 | To run the tests, run ``pytest`` from the root directory:
13 |
14 | .. code-block:: bash
15 |
16 | pytest
17 |
18 | You may notice that many tests will be skipped. This is because those tests create external resources on cloud providers. You can set those tests to run with the
19 | ``--create-external-resources`` flag.
20 |
21 | .. warning::
22 |
23 | Running tests that create external resources is slow and will cost a small amount of credit on each cloud provider.
24 |
25 | .. code-block:: bash
26 |
27 | pytest -rs --create-external-resources
28 |
29 | It is also helpful to set the ``-rs`` flag here because tests may also be skipped if you do not have the appropriate credentials to create those external resources.
30 | If this is the case, the skip reason will contain instructions on how to set up those credentials. For example:
31 |
32 | .. code-block::
33 |
34 | SKIPPED [1] dask_cloudprovider/azure/tests/test_azurevm.py:49:
35 | You must configure your Azure resource group and vnet to run this test.
36 |
37 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION=""
38 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__RESOURCE_GROUP=""
39 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET=""
40 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP=""
41 |
42 |
--------------------------------------------------------------------------------
/doc/source/ibm.rst:
--------------------------------------------------------------------------------
1 | IBM Cloud
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.ibm
5 |
6 | .. autosummary::
7 | IBMCodeEngineCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with IBM Cloud you must first generate an
16 | `API key `_.
17 |
18 | Then you must put this in your Dask configuration at ``cloudprovider.ibm.api_key``. This can be done by
19 | adding the API key to your YAML configuration or exporting an environment variable.
20 |
21 | .. code-block:: yaml
22 |
23 | # ~/.config/dask/cloudprovider.yaml
24 |
25 | cloudprovider:
26 | ibm:
27 | api_key: "your_api_key"
28 |
29 | .. code-block:: console
30 |
31 | $ export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key"
32 |
33 | Project ID
34 | ^^^^^^^^^^
35 |
36 | To use Dask Cloudprovider with IBM Cloud you must also configure your `Project ID `_.
37 | This can be found at the top of the IBM Cloud dashboard.
38 |
39 | Your Project ID must be added to your Dask config file.
40 |
41 | .. code-block:: yaml
42 |
43 | # ~/.config/dask/cloudprovider.yaml
44 | cloudprovider:
45 | ibm:
46 | project_id: "your_project_id"
47 |
48 | Or via an environment variable.
49 |
50 | .. code-block:: console
51 |
52 | $ export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id"
53 |
54 | Code Engine
55 | -----------
56 |
57 | .. autoclass:: IBMCodeEngineCluster
58 | :members:
--------------------------------------------------------------------------------
/doc/source/gpus.rst:
--------------------------------------------------------------------------------
1 | GPU clusters
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider
5 |
6 | Many cloud providers have GPU offerings and so it is possible to launch GPU enabled Dask clusters
7 | with Dask Cloudprovider.
8 |
9 | Each cluster manager handles this differently but generally you will need to configure the following settings:
10 |
11 | - Configure the hardware to include GPUs. This may be by changing the hardware type or adding accelerators.
12 | - Ensure the OS/Docker image has the NVIDIA drivers. For Docker images it is recommended to use the `RAPIDS images <https://hub.docker.com/r/rapidsai/rapidsai/>`_.
13 | - Set the ``worker_module`` config option to ``dask_cuda.cli.dask_cuda_worker`` or the ``worker_command`` option to ``dask-cuda-worker``.
14 |
15 | In the following AWS :class:`dask_cloudprovider.aws.EC2Cluster` example we set the ``ami`` to be a Deep Learning AMI with NVIDIA drivers, the ``docker_image`` to RAPIDS, the ``instance_type``
16 | to ``p3.2xlarge``, which has one NVIDIA Tesla V100, and the ``worker_module`` to ``dask_cuda.cli.dask_cuda_worker``.
17 |
18 | .. code-block:: python
19 |
20 | >>> cluster = EC2Cluster(ami="ami-0c7c7d78f752f8f17", # Example Deep Learning AMI (Ubuntu 18.04)
21 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04",
22 | instance_type="p3.2xlarge",
23 | worker_module="dask_cuda.cli.dask_cuda_worker",
24 | bootstrap=False,
25 | filesystem_size=120)
26 |
27 | See each cluster manager's example sections for info on starting a GPU cluster.
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # References:
3 | # https://flake8.readthedocs.io/en/latest/user/configuration.html
4 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html
5 |
6 | # Note: there cannot be spaces after commas here
7 | exclude = __init__.py,versioneer.py,dask_cloudprovider/_version.py
8 | ignore =
9 | # Extra space in brackets
10 | E20,
11 | # Multiple spaces around ","
12 | E231,E241,
13 | # Comments
14 | E26,
15 | # Import formatting
16 | E4,
17 | # Comparing types instead of isinstance
18 | E721,
19 | # Assigning lambda expression
20 | E731,
21 | # continuation line under-indented for hanging indent
22 | E121,
23 | # continuation line over-indented for hanging indent
24 | E126,
25 | # continuation line over-indented for visual indent
26 | E127,
27 | # E128 continuation line under-indented for visual indent
28 | E128,
29 | # multiple statements on one line (semicolon)
30 | E702,
31 | # line break before binary operator
32 | W503,
33 | # visually indented line with same indent as next logical line
34 | E129,
35 | # unexpected indentation
36 | E116,
37 | # redefinition of unused 'loop' from line 10
38 | F811,
39 | # local variable is assigned to but never used
40 | F841,
41 | # Ambiguous variable names
42 | E741
43 |
44 | max-line-length = 120
45 |
46 | [versioneer]
47 | VCS = git
48 | style = pep440
49 | versionfile_source = dask_cloudprovider/_version.py
50 | versionfile_build = dask_cloudprovider/_version.py
51 | tag_prefix =
52 | parentdir_prefix = dask-cloudprovider-
53 |
54 | [tool:pytest]
55 | timeout = 300
--------------------------------------------------------------------------------
/doc/source/gcp.rst:
--------------------------------------------------------------------------------
1 | Google Cloud Platform
2 | =====================
3 |
4 | .. currentmodule:: dask_cloudprovider.gcp
5 |
6 | .. autosummary::
7 | GCPCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | In order to create clusters on GCP you need to set your authentication credentials.
16 | You can do this via the ``gcloud`` `command line tool `_.
17 |
18 | .. code-block:: console
19 |
20 | $ gcloud auth login
21 |
22 | Alternatively you can use a `service account `_ which provides credentials in a JSON file.
23 | You must set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to the path to the JSON file.
24 |
25 | .. code-block:: console
26 |
27 | $ export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
28 |
29 | Project ID
30 | ^^^^^^^^^^
31 |
32 | To use Dask Cloudprovider with GCP you must also configure your `Project ID `_.
33 | Generally when creating a GCP account you will create a default project. This can be found at the top of the GCP dashboard.
34 |
35 | Your Project ID must be added to your Dask config file.
36 |
37 | .. code-block:: yaml
38 |
39 | # ~/.config/dask/cloudprovider.yaml
40 | cloudprovider:
41 | gcp:
42 | projectid: "YOUR PROJECT ID"
43 |
44 | Or via an environment variable.
45 |
46 | .. code-block:: console
47 |
48 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID"
49 |
50 | Google Cloud VMs
51 | ----------------
52 |
53 | .. autoclass:: GCPCluster
54 | :members:
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: ${{ matrix.os }}
8 | strategy:
9 | fail-fast: true
10 | matrix:
11 | os: ["ubuntu-latest"]
12 | python-version: ["3.10", "3.11", "3.12"]
13 |
14 | steps:
15 | - name: Checkout source
16 | uses: actions/checkout@v2
17 |
18 | - name: Setup Conda Environment
19 | uses: conda-incubator/setup-miniconda@v2
20 | with:
21 | miniconda-version: "latest"
22 | python-version: ${{ matrix.python-version }}
23 | environment-file: ci/environment-${{ matrix.python-version }}.yml
24 | activate-environment: dask-cloudprovider-test
25 | auto-activate-base: false
26 |
27 | - name: Install
28 | shell: bash -l {0}
29 | run: pip install -e .[all]
30 |
31 | - name: Run tests
32 | shell: bash -l {0}
33 | run: py.test dask_cloudprovider
34 |
35 | lint:
36 | name: "pre-commit hooks"
37 | runs-on: ubuntu-latest
38 | steps:
39 | - uses: actions/checkout@v2
40 | - uses: actions/setup-python@v5
41 | - uses: pre-commit/action@v3.0.1
42 |
43 | imports:
44 | runs-on: ubuntu-latest
45 | steps:
46 | - name: Checkout source
47 | uses: actions/checkout@v2
48 |
49 | - name: Setup Conda Environment
50 | uses: conda-incubator/setup-miniconda@v2
51 | with:
52 | miniconda-version: "latest"
53 | python-version: "3.12"
54 |
55 | - name: Run import tests
56 | shell: bash -l {0}
57 | run: source ci/scripts/test_imports.sh
58 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/tests/test_config_helper.py:
--------------------------------------------------------------------------------
1 | import dask.config
2 |
3 | from dask_cloudprovider.utils.config_helper import (
4 | prune_defaults,
5 | serialize_custom_config,
6 | )
7 |
8 |
9 | def test_prune_defaults_simple():
10 | # Keys matching defaults get dropped; new keys stay
11 | cfg = {"a": 1, "b": 2, "c": 3}
12 | defaults = {"a": 1, "b": 0}
13 | pruned = prune_defaults(cfg, defaults)
14 | assert pruned == {"b": 2, "c": 3}
15 |
16 |
17 | def test_prune_defaults_nested():
18 | # Nested dicts: only subkeys that differ survive
19 | cfg = {
20 | "outer": {"keep": 41, "drop": 0},
21 | "solo": 99,
22 | }
23 | defaults = {
24 | "outer": {"keep": 42, "drop": 0},
25 | "solo": 0,
26 | }
27 | pruned = prune_defaults(cfg, defaults)
28 | # 'outer.drop' matches default, 'outer.keep' differs; 'solo' differs
29 | assert pruned == {"outer": {"keep": 41}, "solo": 99}
30 |
31 |
32 | def test_serialize_custom_config(monkeypatch):
33 | # Arrange a fake global_config and defaults
34 | fake_global = {"x": 10, "y": {"a": 1, "b": 0}}
35 | fake_defaults = {"x": 0, "y": {"a": 1, "b": 0}}
36 |
37 | # Monkey-patch dask.config
38 | monkeypatch.setattr(dask.config, "global_config", fake_global)
39 | # defaults should be a sequence of dict(s)
40 | monkeypatch.setattr(dask.config, "defaults", (fake_defaults,))
41 |
42 | # Serialize the custom config
43 | serialized = serialize_custom_config()
44 | assert isinstance(serialized, str)
45 |
46 | # Assert it's valid JSON and only contains overrides (x and nothing under y)
47 | pruned = dask.config.deserialize(serialized)
48 | assert pruned == {"x": 10}
49 |
--------------------------------------------------------------------------------
/dask_cloudprovider/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 |
3 | from ._version import get_versions
4 |
5 | __version__ = get_versions()["version"]
6 |
7 | del get_versions
8 |
9 |
10 | def __getattr__(name):
11 | """As of dask_cloudprovider v0.5.0 all cluster managers are in cloud provider specific submodules.
12 |
13 | This allows us to more easily separate out optional dependencies. However we maintain some helpful
14 | errors at the top level.
15 |
16 | This is both to help migrate users of any cluster managers that existed before this was changed
17 | and also to help anyone who incorrectly tries to import a cluster manager from the top level,
18 | perhaps because they saw it used in some documentation but didn't see the import.
19 |
20 | """
21 |
22 | if name in ["EC2Cluster", "ECSCluster", "FargateCluster"]:
23 | raise ImportError(
24 | "AWS cluster managers must be imported from the aws subpackage. "
25 | f"Please import dask_cloudprovider.aws.{name}"
26 | )
27 |
28 | if name in ["AzureVMCluster"]:
29 | raise ImportError(
30 | "Azure cluster managers must be imported from the the azure subpackage. "
31 | f"Please import dask_cloudprovider.azure.{name}"
32 | )
33 |
34 | if name in ["GCPCluster"]:
35 | raise ImportError(
36 | "Google Cloud cluster managers must be imported from the the gcp subpackage. "
37 | f"Please import dask_cloudprovider.gcp.{name}"
38 | )
39 |
40 | if name in ["DropletCluster"]:
41 | raise ImportError(
42 | "DigitalOcean cluster managers must be imported from the digitalocean subpackage. "
43 | f"Please import dask_cloudprovider.digitalocean.{name}"
44 | )
45 |
--------------------------------------------------------------------------------
/doc/source/config.rst:
--------------------------------------------------------------------------------
1 | Configuration
2 | =============
3 |
4 | Each cluster manager in Dask Cloudprovider will require some configuration specific to the cloud
5 | services you wish to use. Many config options will have sensible defaults and often you can create
6 | a cluster with just your authentication credentials configured.
7 |
8 | Authentication
9 | --------------
10 |
11 | All cluster managers assume you have already configured your credentials for the cloud you are using.
12 |
13 | For AWS this would mean storing your access key and secret key in ``~/.aws/credentials``. The AWS CLI
14 | can create this for you by running the command ``aws configure``.
15 |
16 | See each cluster manager for specific details.
17 |
18 | .. warning::
19 | Most cluster managers also allow passing credentials as keyword arguments, although this would result in
20 | credentials being stored in code and is not advised.
21 |
22 | Cluster config
23 | --------------
24 |
25 | Configuration can be passed to a cluster manager via keyword arguments, YAML config or environment variables.
26 |
27 | For example the ``FargateCluster`` manager for AWS ECS takes a ``scheduler_mem`` configuration option to set how much memory
28 | to give the scheduler in megabytes. This can be configured in the following ways.
29 |
30 | .. code-block:: python
31 |
32 | from dask_cloudprovider.aws import FargateCluster
33 |
34 | cluster = FargateCluster(
35 | scheduler_mem=8192
36 | )
37 |
38 | .. code-block:: yaml
39 |
40 | # ~/.config/dask/cloudprovider.yaml
41 |
42 | cloudprovider:
43 | ecs:
44 | scheduler_mem: 8192
45 |
46 | .. code-block:: console
47 |
48 | $ export DASK_CLOUDPROVIDER__ECS__SCHEDULER_MEM=8192
49 |
50 | See each cluster manager and the `Dask configuration docs `_ for more information.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.a
8 | *.dll
9 | *.exe
10 | *.o
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 | doc/_build/
72 | doc/source/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # dotenv
90 | .env
91 |
92 | # virtualenv
93 | .venv
94 | venv/
95 | ENV/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
110 | # IDE
111 | .vscode/
112 | .idea
113 |
114 | # MAC
115 | .DS_Store
116 |
117 | # any untitled Jupyter notebooks
118 | Untitled*.ipynb
119 |
120 | # key material
121 | *.pem
122 | *.pub
123 | *_rsa
124 |
--------------------------------------------------------------------------------
/dask_cloudprovider/hetzner/tests/test_vserver.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | hetzner = pytest.importorskip("hcloud")
6 |
7 | from dask_cloudprovider.hetzner.vserver import HetznerCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Hetzner API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | hetzner:
23 | token: "yourtoken"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken"
28 |
29 | """
30 | )
31 |
32 |
33 | @pytest.fixture
34 | async def config():
35 | return dask.config.get("cloudprovider.hetzner", {})
36 |
37 |
38 | @pytest.fixture
39 | @pytest.mark.external
40 | async def cluster(config):
41 | await skip_without_credentials(config)
42 | async with HetznerCluster(asynchronous=True) as cluster:
43 | yield cluster
44 |
45 |
46 | @pytest.mark.asyncio
47 | async def test_init():
48 | cluster = HetznerCluster(asynchronous=True)
49 | assert cluster.status == Status.created
50 |
51 |
52 | @pytest.mark.asyncio
53 | @pytest.mark.timeout(600)
54 | async def test_create_cluster(cluster):
55 | assert cluster.status == Status.running
56 |
57 | cluster.scale(1)
58 | await cluster
59 | assert len(cluster.workers) == 1
60 |
61 | async with Client(cluster, asynchronous=True) as client:
62 |
63 | def inc(x):
64 | return x + 1
65 |
66 | assert await client.submit(inc, 10).result() == 11
67 |
68 |
69 | @pytest.mark.asyncio
70 | async def test_get_cloud_init():
71 | cloud_init = HetznerCluster.get_cloud_init(
72 | docker_args="--privileged",
73 | )
74 | assert " --privileged " in cloud_init
75 |
--------------------------------------------------------------------------------
/dask_cloudprovider/digitalocean/tests/test_droplet.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | digitalocean = pytest.importorskip("digitalocean")
6 |
7 | from dask_cloudprovider.digitalocean.droplet import DropletCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Digital Ocean API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | digitalocean:
23 | token: "yourtoken"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken"
28 |
29 | """
30 | )
31 |
32 |
33 | @pytest.fixture
34 | async def config():
35 | return dask.config.get("cloudprovider.digitalocean", {})
36 |
37 |
38 | @pytest.fixture
39 | @pytest.mark.external
40 | async def cluster(config):
41 | await skip_without_credentials(config)
42 | async with DropletCluster(asynchronous=True) as cluster:
43 | yield cluster
44 |
45 |
46 | @pytest.mark.asyncio
47 | @pytest.mark.external
48 | async def test_init():
49 | cluster = DropletCluster(asynchronous=True)
50 | assert cluster.status == Status.created
51 |
52 |
53 | @pytest.mark.asyncio
54 | @pytest.mark.timeout(600)
55 | @pytest.mark.external
56 | async def test_create_cluster(cluster):
57 | assert cluster.status == Status.running
58 |
59 | cluster.scale(1)
60 | await cluster
61 | assert len(cluster.workers) == 1
62 |
63 | async with Client(cluster, asynchronous=True) as client:
64 |
65 | def inc(x):
66 | return x + 1
67 |
68 | assert await client.submit(inc, 10).result() == 11
69 |
70 |
71 | @pytest.mark.asyncio
72 | async def test_get_cloud_init():
73 | cloud_init = DropletCluster.get_cloud_init(
74 | docker_args="--privileged",
75 | )
76 | assert " --privileged " in cloud_init
77 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/cloud-init.yaml.j2:
--------------------------------------------------------------------------------
1 | #cloud-config
2 |
3 | {% if bootstrap %}
4 | # Bootstrap
5 | packages:
6 | - apt-transport-https
7 | - ca-certificates
8 | - curl
9 | - gnupg-agent
10 | - software-properties-common
11 | - ubuntu-drivers-common
12 |
13 | # Enable ipv4 forwarding, required on CIS hardened machines
14 | write_files:
15 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
16 | content: |
17 | net.ipv4.conf.all.forwarding=1
18 |
19 | # create the docker group
20 | groups:
21 | - docker
22 |
23 | # Add default auto created user to docker group
24 | system_info:
25 | default_user:
26 | groups: [docker]
27 | {% endif %}
28 |
29 | runcmd:
30 | {% if bootstrap %}
31 | # Install Docker
32 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
33 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
34 | - apt-get update -y
35 | - apt-get install -y docker-ce docker-ce-cli containerd.io
36 | - systemctl start docker
37 | - systemctl enable docker
38 | {% endif %}
39 |
40 | {% if bootstrap and gpu_instance %}
41 | # Install NVIDIA driver
42 | - DEBIAN_FRONTEND=noninteractive ubuntu-drivers install
43 |
44 | # Install NVIDIA docker
45 | - curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
46 | - curl -s -L https://nvidia.github.io/nvidia-docker/$(. /etc/os-release;echo $ID$VERSION_ID)/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
47 | - apt-get update -y
48 | - apt-get install -y nvidia-docker2
49 | - systemctl restart docker
50 | {% endif %}
51 |
52 | {% if extra_bootstrap %}
53 | {% for command in extra_bootstrap %}
54 | - {{ command }}
55 | {% endfor %}
56 | {% endif %}
57 |
58 | # Run container
59 | - 'docker run --net=host {%+ if gpu_instance %}--gpus=all{% endif %} {% for key in env_vars %} -e {{key}}="{{env_vars[key]}}" {% endfor %}{%+ if docker_args %}{{docker_args}}{% endif %} {{image}} {{ command }}'
60 |
61 | {% if auto_shutdown %}
62 | # Shutdown when command is done
63 | - shutdown -h now
64 | {% endif %}
65 |
--------------------------------------------------------------------------------
/dask_cloudprovider/openstack/tests/test_instances.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import dask
3 | from dask_cloudprovider.openstack.instances import OpenStackCluster
4 | from dask.distributed import Client
5 | from distributed.core import Status
6 |
7 | # Optional: Skips tests if OpenStack credentials are not set
8 |
9 |
10 | async def skip_without_credentials(config):
11 | if (
12 | config.get("auth_url") is None
13 | or config.get("application_credential_secret") is None
14 | ):
15 | pytest.skip(
16 | """
17 | You must configure OpenStack credentials to run this test.
18 |
19 | Set this in your config file or environment variables:
20 |
21 | # cloudprovider.yaml
22 | cloudprovider:
23 | openstack:
24 | auth_url: "your_auth_url"
25 | application_credential_id: "your_app_cred_id"
26 | application_credential_secret: "your_app_cred_secret"
27 | """
28 | )
29 |
30 |
31 | @pytest.fixture
32 | async def config():
33 | return dask.config.get("cloudprovider.openstack", {})
34 |
35 |
36 | @pytest.fixture
37 | @pytest.mark.external
38 | async def cluster(config):
39 | await skip_without_credentials(config)
40 |
41 | async with OpenStackCluster(asynchronous=True) as cluster:
42 | yield cluster
43 |
44 |
45 | @pytest.mark.asyncio
46 | async def test_init():
47 | cluster = OpenStackCluster(asynchronous=True)
48 | assert cluster.status == Status.created
49 |
50 |
51 | @pytest.mark.asyncio
52 | @pytest.mark.timeout(600)
53 | async def test_create_cluster(cluster):
54 | assert cluster.status == Status.running
55 | cluster.scale(1)
56 | await cluster
57 | assert len(cluster.workers) == 1
58 |
59 | async with Client(cluster, asynchronous=True) as client:
60 |
61 | def inc(x):
62 | return x + 1
63 |
64 | assert await client.submit(inc, 10).result() == 11
65 |
66 |
67 | @pytest.mark.asyncio
68 | async def test_get_cloud_init():
69 | cloud_init = OpenStackCluster.get_cloud_init(
70 | docker_args="--privileged",
71 | )
72 | assert " --privileged " in cloud_init
73 |
--------------------------------------------------------------------------------
/dask_cloudprovider/nebius/tests/test_nebius.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | nebius = pytest.importorskip("nebius")
6 |
7 | from dask_cloudprovider.nebius.instances import NebiusCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials(config):
13 | if config.get("token") is None or config.get("project_id") is None:
14 | pytest.skip(
15 | """
16 | You must configure a Nebius AI Cloud API token to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | nebius:
23 | token: "yourtoken"
24 | project_id: "yourprojectid"
25 |
26 | Or by setting it as an environment variable
27 |
28 | export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token)
29 | export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID=project_id
30 |
31 | """
32 | )
33 |
34 |
35 | @pytest.fixture
36 | async def config():
37 | return dask.config.get("cloudprovider.nebius", {})
38 |
39 |
40 | @pytest.fixture
41 | @pytest.mark.external
42 | async def cluster(config):
43 | await skip_without_credentials(config)
44 | async with NebiusCluster(asynchronous=True, debug=True) as cluster:
45 | yield cluster
46 |
47 |
48 | @pytest.mark.asyncio
49 | @pytest.mark.external
50 | async def test_init():
51 | cluster = NebiusCluster(asynchronous=True, debug=True)
52 | assert cluster.status == Status.created
53 |
54 |
55 | @pytest.mark.asyncio
56 | @pytest.mark.external
57 | async def test_create_cluster(cluster):
58 | assert cluster.status == Status.running
59 |
60 | cluster.scale(1)
61 | await cluster
62 | assert len(cluster.workers) == 1
63 |
64 | async with Client(cluster, asynchronous=True) as client:
65 |
66 | def inc(x):
67 | return x + 1
68 |
69 | assert await client.submit(inc, 10).result() == 11
70 |
71 |
72 | @pytest.mark.asyncio
73 | async def test_get_cloud_init():
74 | cloud_init = NebiusCluster.get_cloud_init(
75 | docker_args="--privileged",
76 | )
77 | assert " --privileged " in cloud_init
78 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/tests/test_helper.py:
--------------------------------------------------------------------------------
1 | def test_aws_to_dict_and_back():
2 | from dask_cloudprovider.aws.helper import aws_to_dict, dict_to_aws
3 |
4 | aws_dict = [{"key": "hello", "value": "world"}]
5 | aws_upper_dict = [{"Key": "hello", "Value": "world"}]
6 | py_dict = {"hello": "world"}
7 |
8 | assert dict_to_aws(py_dict) == aws_dict
9 | assert dict_to_aws(py_dict, upper=True) == aws_upper_dict
10 | assert aws_to_dict(aws_dict) == py_dict
11 |
12 | assert aws_to_dict(dict_to_aws(py_dict, upper=True)) == py_dict
13 | assert aws_to_dict(dict_to_aws(py_dict)) == py_dict
14 | assert dict_to_aws(aws_to_dict(aws_dict)) == aws_dict
15 | assert dict_to_aws(aws_to_dict(aws_upper_dict), upper=True) == aws_upper_dict
16 |
17 |
18 | def test_get_sleep_duration_first_try():
19 | from dask_cloudprovider.aws.helper import get_sleep_duration
20 |
21 | duration = get_sleep_duration(
22 | current_try=0, min_sleep_millis=10, max_sleep_millis=5000
23 | )
24 | assert duration == 0.01
25 |
26 |
27 | def test_get_sleep_duration_max():
28 | from dask_cloudprovider.aws.helper import get_sleep_duration
29 |
30 | duration = get_sleep_duration(
31 | current_try=23, min_sleep_millis=10, max_sleep_millis=5000
32 | )
33 | assert duration == 5.0
34 |
35 |
36 | def test_get_sleep_duration_negative_try():
37 | from dask_cloudprovider.aws.helper import get_sleep_duration
38 |
39 | duration = get_sleep_duration(
40 | current_try=-1, min_sleep_millis=10, max_sleep_millis=5000
41 | )
42 | assert duration == 0.01
43 |
44 |
45 | def test_config_mixin():
46 | from dask_cloudprovider.aws.helper import ConfigMixin
47 |
48 | class MockCluster(ConfigMixin):
49 | config = None
50 | _attr1 = "foo"
51 | attr2 = None
52 |
53 | def __init__(self):
54 | self.config = {"attr2": "bar"}
55 |
56 | cluster_with_mixin = MockCluster()
57 |
58 | # Test that nothing happens if attr is already set
59 | attr1 = cluster_with_mixin._attr1
60 | cluster_with_mixin.update_attr_from_config(attr="attr1", private=True)
61 | assert cluster_with_mixin._attr1 == attr1
62 |
63 | # Test that attr is updated if existing value is None
64 | cluster_with_mixin.update_attr_from_config(attr="attr2", private=False)
65 | assert cluster_with_mixin.attr2 == "bar"
66 |
--------------------------------------------------------------------------------
/doc/source/alternatives.rst:
--------------------------------------------------------------------------------
1 | Alternatives
2 | ============
3 |
4 | Many tools and services exist today for deploying Dask clusters, many of which are commonly used on the cloud.
5 | This project aims to provide cloud-native plugins and tools for Dask which can often complement other approaches.
6 |
7 | Community tools
8 | ---------------
9 |
10 | Dask has a `vibrant ecosystem of community tooling for deploying Dask `_ on various platforms, many of which can be used on the public cloud.
11 |
12 | Kubernetes
13 | ^^^^^^^^^^
14 |
15 | `Kubernetes `_ is an extremely popular project for managing cloud workloads and is part of the broader `Cloud Native Computing Foundation (CNCF) `_ ecosystem.
16 |
17 | Dask has many options for `deploying clusters on Kubernetes `_.
18 |
19 | HPC on Cloud
20 | ^^^^^^^^^^^^
21 |
22 | Many popular HPC scheduling tools are used on the cloud and support features such as elastic scaling.
23 | If you are already leveraging HPC tools like `SLURM on the cloud `_ then `Dask has great integration with HPC schedulers `_.
24 |
25 | Hadoop/Spark/Yarn
26 | ^^^^^^^^^^^^^^^^^
27 |
28 | Many cloud platforms have popular managed services for running Apache Spark workloads.
29 |
30 | If you're already using a managed map-reduce service like `Amazon EMR `_ then check out `dask-yarn `_.
31 |
32 | Nebari
33 | ^^^^^^
34 |
35 | `Nebari `_ is an open source data science platform which can be run locally or on a cloud platform of your choice.
36 | It includes a managed Dask service built on `Dask Gateway `_ for managing Dask clusters.
37 |
38 | Managed Services
39 | ----------------
40 |
41 | Cloud vendors and third-party companies also offer managed Dask clusters as a service.
42 |
43 | Coiled
44 | ^^^^^^
45 |
46 | `Coiled `_ is a mature managed Dask service that spawns clusters in your cloud account and allows you to manage them via a central control plane.
47 |
48 | Saturn Cloud
49 | ^^^^^^^^^^^^
50 |
51 | `Saturn Cloud `_ is a managed data science platform with hosted Dask clusters or the option to deploy them in your own AWS account.
52 |
--------------------------------------------------------------------------------
/doc/source/security.rst:
--------------------------------------------------------------------------------
1 | Security
2 | ========
3 |
4 | Dask Cloudprovider aims to balance ease of use with security best practices. The two are not always compatible, so this document outlines the compromises and decisions made in this library.
5 |
6 | Public Schedulers
7 | -----------------
8 |
9 | For each cluster manager to work correctly it must be able to make a connection to the Dask scheduler on port ``8786``.
10 | In many cluster managers the default option is to expose the Dask scheduler and dashboard to the internet via a public IP address.
11 | This makes things quick and easy for new users to get up and running, but may pose a security risk long term.
12 |
13 | Many organisations have policies which do not allow users to assign public IP addresses or open ports. Our best-practice
14 | advice is to use Dask Cloudprovider from within the cloud platform itself, either from a VM or a managed environment, and then
15 | disable public networking. For example:
16 |
17 | .. code-block:: python
18 |
19 | >>> import dask.config, dask_cloudprovider
20 | >>> dask.config.set({"cloudprovider.gcp.public_ingress": False})
21 |
22 | See each cluster manager for configuration options.
23 |
24 | Authentication and encryption
25 | -----------------------------
26 |
27 | Cluster managers such as :class:`dask_cloudprovider.aws.EC2Cluster`, :class:`dask_cloudprovider.azure.AzureVMCluster`,
28 | :class:`dask_cloudprovider.gcp.GCPCluster` and :class:`dask_cloudprovider.digitalocean.DropletCluster` enable certificate based authentication
29 | and encryption by default.
30 |
31 | When a cluster is launched with any of these cluster managers a set of temporary keys will be generated and distributed to the cluster nodes
32 | via their startup script. All communication between the client, scheduler and workers will then be encrypted and only clients and workers with
33 | valid certificates will be able to connect to the scheduler.
34 |
35 | You can also specify your own certificates using the :class:`distributed.security.Security` object.
36 |
37 | .. code-block:: python
38 |
39 | >>> from dask_cloudprovider.gcp import GCPCluster
40 | >>> from dask.distributed import Client
41 | >>> from distributed.security import Security
42 | >>> sec = Security(tls_ca_file='cluster_ca.pem',
43 | ... tls_client_cert='cli_cert.pem',
44 | ... tls_client_key='cli_key.pem',
45 | ... require_encryption=True)
46 | >>> cluster = GCPCluster(n_workers=1, security=sec)
47 | >>> client = Client(cluster)
48 | >>> client
49 |
50 |
51 | You can disable secure connections by setting the ``security`` keyword argument to ``False``. This may be desirable when troubleshooting or
52 | when running on a trusted network (entirely inside a VPC for example).
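   |
   | For example, a minimal sketch with all other options left at their defaults (shown here with ``GCPCluster``, but the same keyword applies to the other VM-based cluster managers listed above):
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.gcp import GCPCluster
   |     >>> cluster = GCPCluster(n_workers=1, security=False)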
53 |
--------------------------------------------------------------------------------
/dask_cloudprovider/utils/timeout.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import warnings
3 |
4 |
5 | class TimeoutException(RuntimeError):
6 | """Raised when a loop times out."""
7 |
8 |
9 | class Timeout:
10 | """A timeout object for use in ``while True`` loops instead of ``True``.
11 |
12 | Create an instance of this class before beginning an infinite loop and
13 | call ``run()`` instead of ``True``.
14 |
15 |
16 | Parameters
17 | ----------
18 | timeout: int
19 | Seconds before loop should timeout.
20 |
21 | error_message: str
22 | Error message to raise in an exception if timeout occurs.
23 |
24 | warn: bool
25 | Only raise a warning instead of a TimeoutException.
26 |
27 | Default ``False``.
   |
28 | Examples
29 | --------
30 | >>> timeout = Timeout(10, "Oh no! We timed out.")
31 | >>> while timeout.run():
32 | ... time.sleep(1) # Will timeout after 10 iterations
33 | TimeoutException: Oh no! We timed out.
34 |
35 | You can also pass an exception to raise if you are suppressing for a set
36 | amount of time.
37 |
38 | >>> timeout = Timeout(10, "Oh no! We timed out.")
39 | >>> while timeout.run():
40 | ... try:
41 | ... some_function_that_raises()
42 | ... break
43 | ... except Exception as e:
44 | ... timeout.set_exception(e)
45 | ... time.sleep(1) # Will timeout after 10 iterations
46 | Exception: The exception from ``some_function_that_raises``
47 |
48 |
49 | """
50 |
51 | def __init__(self, timeout, error_message, warn=False):
52 | self.start = None
53 | self.running = False
54 | self.timeout = timeout
55 | self.error_message = error_message
56 | self.warn = warn
57 | self.exception = TimeoutException(self.error_message)
58 |
59 | def run(self):
60 | """Run the timeout.
61 |
62 | When called repeatedly, this method will return ``True`` until the
63 | timeout has elapsed. It will then raise the exception, or warn and return ``False`` if ``warn`` is set.
64 | """
65 | if not self.running:
66 | self.start = datetime.now()
67 | self.running = True
68 |
69 | if self.start + timedelta(seconds=self.timeout) < datetime.now():
70 | if self.warn:
71 | warnings.warn(self.error_message)
72 | return False
73 | else:
74 | raise self.exception
75 | return True
76 |
77 | def set_exception(self, e):
78 | """Modify the default timeout exception.
79 |
80 | This would be useful if you are trying something repeatedly but if it
81 | never succeeds before the timeout you want to raise the exception from
82 | the thing you are trying rather than a TimeoutException.
83 | """
84 | self.exception = e
85 |
--------------------------------------------------------------------------------
/dask_cloudprovider/ibm/tests/test_code_engine.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | codeengine = pytest.importorskip("ibm_code_engine_sdk.code_engine_v2")
6 |
7 | from dask_cloudprovider.ibm.code_engine import IBMCodeEngineCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | async def skip_without_credentials():
13 | if dask.config.get("cloudprovider.ibm.api_key") is None:
14 | pytest.skip(
15 | """
16 | You must configure an IBM API key to run this test.
17 |
18 | Either set this in your config
19 |
20 | # cloudprovider.yaml
21 | cloudprovider:
22 | ibm:
23 | api_key: "your_api_key"
24 |
25 | Or by setting it as an environment variable
26 |
27 | export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key"
28 |
29 | """
30 | )
31 |
32 | if dask.config.get("cloudprovider.ibm.project_id") is None:
33 | pytest.skip(
34 | """
35 | You must configure an IBM project id to run this test.
36 |
37 | Either set this in your config
38 |
39 | # cloudprovider.yaml
40 | cloudprovider:
41 | ibm:
42 | project_id: "your_project_id"
43 |
44 | Or by setting it as an environment variable
45 |
46 | export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id"
47 |
48 | """
49 | )
50 |
51 | if dask.config.get("cloudprovider.ibm.region") is None:
52 | pytest.skip(
53 | """
54 | You must configure an IBM region to run this test.
55 |
56 | Either set this in your config
57 |
58 | # cloudprovider.yaml
59 | cloudprovider:
60 | ibm:
61 | region: "your_region"
62 |
63 | Or by setting it as an environment variable
64 |
65 | export DASK_CLOUDPROVIDER__IBM__REGION="your_region"
66 |
67 | """
68 | )
69 |
70 |
71 | @pytest.mark.asyncio
72 | async def test_init():
73 | await skip_without_credentials()
74 | cluster = IBMCodeEngineCluster(asynchronous=True)
75 | assert cluster.status == Status.created
76 |
77 |
78 | @pytest.mark.asyncio
79 | @pytest.mark.timeout(1200)
80 | @pytest.mark.external
81 | async def test_create_cluster():
82 | async with IBMCodeEngineCluster(asynchronous=True) as cluster:
83 | cluster.scale(2)
84 | await cluster
85 | assert len(cluster.workers) == 2
86 |
87 | async with Client(cluster, asynchronous=True) as client:
88 |
89 | def inc(x):
90 | return x + 1
91 |
92 | assert await client.submit(inc, 10).result() == 11
93 |
94 |
95 | @pytest.mark.asyncio
96 | @pytest.mark.timeout(1200)
97 | @pytest.mark.external
98 | async def test_create_cluster_sync():
99 | with IBMCodeEngineCluster() as cluster:
100 | with Client(cluster) as client:
101 | cluster.scale(1)
102 | client.wait_for_workers(1)
103 | assert len(cluster.workers) == 1
104 |
105 | def inc(x):
106 | return x + 1
107 |
108 | assert client.submit(inc, 10).result() == 11
109 |
--------------------------------------------------------------------------------
/doc/source/openstack.rst:
--------------------------------------------------------------------------------
1 | OpenStack
2 | ============
3 |
4 | .. currentmodule:: dask_cloudprovider.openstack
5 |
6 | .. autosummary::
7 | OpenStackCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | To authenticate with the OpenStack Identity service (Keystone):
16 |
17 | 1) Get your Authentication URL (auth_url) for OpenStack Identity service (Keystone) and put it in your Dask configuration at ``cloudprovider.openstack.auth_url``.
18 |
19 | 2) Get your `region `_ and put it in your Dask configuration at ``cloudprovider.openstack.region``.
   |
20 | .. code-block:: console
21 |
22 | $ openstack region list
23 | +-----------+---------------+-------------+
24 | | Region | Parent Region | Description |
25 | +-----------+---------------+-------------+
26 | | RegionOne | None | |
27 | +-----------+---------------+-------------+
28 |
29 | 3) Generate an `application credential `_.
30 |
31 | .. code-block:: console
32 |
33 | $ openstack application credential create dask --unrestricted
34 | +--------------+----------------------------------------------------------------------------------------+
35 | | Field | Value |
36 | +--------------+----------------------------------------------------------------------------------------+
37 | | description | None |
38 | | expires_at | None |
39 | | id | 0a0372dbedfb4e82ab66449c3316ef1e |
40 | | name | dask |
41 | | project_id | e99b6f4b9bf84a9da27e20c9cbfe887a |
42 | | roles | Member anotherrole |
43 | | secret | ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg |
44 | | unrestricted | True |
45 | +--------------+----------------------------------------------------------------------------------------+
46 |
47 | and put ``application_credential_id`` and ``application_credential_secret`` in your Dask configuration at ``cloudprovider.openstack.application_credential_id``
48 | and ``cloudprovider.openstack.application_credential_secret``.
49 |
50 | All of these variables can be gathered from either an `OpenStack RC file `_
51 | or a `clouds.yaml file `_.
52 |
53 | Example Config File
54 | ^^^^^^^^^^^^^^
55 | .. code-block:: yaml
56 |
57 | # ~/.config/dask/cloudprovider.yaml
58 |
59 | cloudprovider:
60 | openstack:
61 | region: "RegionOne"
62 | auth_url: "https://cloud.home.karatosun.xyz:5000"
63 | application_credential_id: "0a0372dbedfb4e82ab66449c3316ef1e"
64 | application_credential_secret: "ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg"
65 | auth_type: "v3applicationcredential"
66 |
67 | You can also export them as environment variables.
68 |
69 | .. code-block:: console
70 |
71 | $ export DASK_CLOUDPROVIDER__OPENSTACK__APPLICATION_CREDENTIAL_ID="0a0372dbedfb4e82ab66449c3316ef1e"
72 |
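   | With the configuration above in place, a minimal usage sketch looks like this (the worker count is arbitrary):
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.openstack import OpenStackCluster
   |     >>> from dask.distributed import Client
   |     >>> cluster = OpenStackCluster(n_workers=1)
   |     >>> client = Client(cluster)
   |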
73 |
74 | .. autoclass:: OpenStackCluster
75 | :members:
76 |
--------------------------------------------------------------------------------
/dask_cloudprovider/azure/tests/test_azurevm.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 |
5 | azure_compute = pytest.importorskip("azure.mgmt.compute")
6 |
7 | from dask_cloudprovider.azure import AzureVMCluster
8 | from dask.distributed import Client
9 | from distributed.core import Status
10 |
11 |
12 | def skip_without_credentials(func):
13 | rg = dask.config.get("cloudprovider.azure.resource_group", None)
14 | vnet = dask.config.get("cloudprovider.azure.azurevm.vnet", None)
15 | security_group = dask.config.get("cloudprovider.azure.azurevm.security_group", None)
16 | location = dask.config.get("cloudprovider.azure.location", None)
17 | if rg is None or vnet is None or security_group is None or location is None:
18 | return pytest.mark.skip(
19 | reason="""
20 | You must configure your Azure resource group and vnet to run this test.
21 |
22 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION=""
23 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP=""
24 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET=""
25 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP=""
26 |
27 | """
28 | )(func)
29 | return func
30 |
31 |
32 | async def get_config():
33 | return dask.config.get("cloudprovider.azure", {})
34 |
35 |
36 | @pytest.mark.asyncio
37 | @skip_without_credentials
38 | @pytest.mark.external
39 | async def test_init():
40 | cluster = AzureVMCluster(asynchronous=True)
41 | assert cluster.status == Status.created
42 |
43 |
44 | @pytest.mark.asyncio
45 | @pytest.mark.timeout(1200)
46 | @skip_without_credentials
47 | @pytest.mark.external
48 | async def test_create_cluster():
49 | async with AzureVMCluster(asynchronous=True) as cluster:
50 | assert cluster.status == Status.running
51 |
52 | cluster.scale(2)
53 | await cluster
54 | assert len(cluster.workers) == 2
55 |
56 | async with Client(cluster, asynchronous=True) as client:
57 |
58 | def inc(x):
59 | return x + 1
60 |
61 | assert await client.submit(inc, 10).result() == 11
62 |
63 |
64 | @pytest.mark.asyncio
65 | @pytest.mark.timeout(1200)
66 | @skip_without_credentials
67 | @pytest.mark.external
68 | async def test_create_cluster_sync():
69 | with AzureVMCluster() as cluster:
70 | with Client(cluster) as client:
71 | cluster.scale(1)
72 | client.wait_for_workers(1)
73 | assert len(cluster.workers) == 1
74 |
75 | def inc(x):
76 | return x + 1
77 |
78 | assert client.submit(inc, 10).result() == 11
79 |
80 |
81 | @pytest.mark.asyncio
82 | @pytest.mark.timeout(1200)
83 | @skip_without_credentials
84 | @pytest.mark.external
85 | async def test_create_rapids_cluster_sync():
86 | with AzureVMCluster(
87 | vm_size="Standard_NC12s_v3",
88 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
89 | worker_class="dask_cuda.CUDAWorker",
90 | worker_options={"rmm_pool_size": "15GB"},
91 | ) as cluster:
92 | with Client(cluster) as client:
93 | cluster.scale(1)
94 | client.wait_for_workers(1)
95 |
96 | def gpu_mem():
97 | from pynvml.smi import nvidia_smi
98 |
99 | nvsmi = nvidia_smi.getInstance()
100 | return nvsmi.DeviceQuery("memory.free, memory.total")
101 |
102 | results = client.run(gpu_mem)
103 | for w, res in results.items():
104 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
105 | print(res)
106 |
107 |
108 | @pytest.mark.asyncio
109 | @skip_without_credentials
110 | async def test_render_cloud_init():
111 | cloud_init = AzureVMCluster.get_cloud_init(docker_args="--privileged")
112 | assert " --privileged " in cloud_init
113 |
114 | cloud_init = AzureVMCluster.get_cloud_init(
115 | docker_image="foo/bar:baz",
116 | extra_bootstrap=["echo 'hello world'", "echo 'foo bar'"],
117 | )
118 | assert "foo/bar:baz" in cloud_init
119 | assert "- echo 'hello world'" in cloud_init
120 | assert "- echo 'foo bar'" in cloud_init
121 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | Dask Cloud Provider
2 | ===================
3 |
4 | *Native Cloud integration for Dask.*
5 |
6 | This package contains open source tools to help you deploy and operate Dask clusters on the cloud.
7 | It contains cluster managers which can help you launch clusters using native cloud resources like VMs or containers.
8 | It also has tools and plugins for use in *any* cluster running on the cloud, and it is a great source of documentation for Dask cloud deployments.
9 |
10 | It is by no means the "complete" or "only" way to run Dask on the cloud, check out the :doc:`alternatives` page for more tools.
11 |
12 | Cluster managers
13 | ----------------
14 |
15 | This package provides classes for constructing and managing ephemeral Dask clusters on various
16 | cloud platforms.
17 |
18 | Dask Cloud Provider is one of many options for deploying Dask clusters, see `Deploying Dask `_ in the Dask documentation for an overview of additional options.
19 |
20 | To use a cloud provider cluster manager you can import it and instantiate it. Instantiating the class
21 | will result in cloud resources being created for you.
22 |
23 | .. code-block:: python
24 |
25 | from dask_cloudprovider.aws import FargateCluster
26 | cluster = FargateCluster(
27 | # Cluster manager specific config kwargs
28 | )
29 |
30 | You can then construct a Dask client with that cluster object to use the cluster.
31 |
32 | .. code-block:: python
33 |
34 | from dask.distributed import Client
35 | client = Client(cluster)
36 |
37 | Once you are connected to the cluster you can go ahead and use Dask and all computation will take
38 | place on your cloud resource.
39 |
40 | Once you are finished be sure to close out your cluster to shut down any cloud resources you have and end any charges.
41 |
42 | .. code-block:: python
43 |
44 | cluster.close()
45 |
46 | .. warning::
47 |
48 | Cluster managers will attempt to automatically remove hanging cloud resources on garbage collection if the cluster
49 | object is destroyed without calling ``cluster.close()``; however, this is not guaranteed.
50 |
51 | To implicitly close your cluster when you are done with it you can optionally construct the cluster manager via a
52 | context manager. However, this will result in the creation and destruction of the whole cluster whenever you run
53 | this code.
54 |
55 | .. code-block:: python
56 |
57 | from dask_cloudprovider.aws import FargateCluster
58 | from dask.distributed import Client
59 |
60 | with FargateCluster(...) as cluster:
61 | with Client(cluster) as client:
62 | # Do some Dask things
63 |
64 | Plugins
65 | -------
66 |
67 | Dask components like Schedulers and Workers can benefit from being cloud-aware.
68 | This project has plugins and tools that extend these components.
69 |
70 | One example is having the workers check for termination warnings when running on ephemeral/spot instances and begin migrating data to other workers.
71 |
72 | For Azure VMs you could use the :class:`dask_cloudprovider.azure.AzurePreemptibleWorkerPlugin` to do this.
73 | It can be used on any cluster that has workers running on Azure VMs, not just ones created with :class:`dask_cloudprovider.azure.AzureVMCluster`.
74 |
75 | .. code-block:: python
76 |
77 | from distributed import Client
78 | client = Client("")
79 |
80 | from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin
81 | client.register_worker_plugin(AzurePreemptibleWorkerPlugin())
82 |
83 |
84 | .. toctree::
85 | :maxdepth: 2
86 | :hidden:
87 | :caption: Overview
88 |
89 | installation.rst
90 | config.rst
91 | alternatives.rst
92 |
93 | .. toctree::
94 | :maxdepth: 2
95 | :hidden:
96 | :caption: Providers
97 |
98 | aws.rst
99 | digitalocean.rst
100 | gcp.rst
101 | azure.rst
102 | hetzner.rst
103 | ibm.rst
104 | openstack.rst
105 | nebius.rst
106 |
107 | .. toctree::
108 | :maxdepth: 2
109 | :hidden:
110 | :caption: Advanced
111 |
112 | troubleshooting.rst
113 | security.rst
114 | gpus.rst
115 | packer.rst
116 |
117 | .. toctree::
118 | :maxdepth: 2
119 | :hidden:
120 | :caption: Developer
121 |
122 | testing.rst
123 | releasing.rst
124 |
--------------------------------------------------------------------------------
/doc/source/troubleshooting.rst:
--------------------------------------------------------------------------------
1 | Troubleshooting
2 | ===============
3 |
4 | This document covers frequently encountered problems and how to troubleshoot them.
5 |
6 | Unable to connect to scheduler
7 | ------------------------------
8 |
9 | The most common issue is not being able to connect to the cluster once it has been constructed.
10 |
11 | Each cluster manager will construct a Dask scheduler and by default expose it via a public IP address. You must be able
12 | to connect to that address on ports ``8786`` and ``8787`` from wherever your Python session is.
13 |
14 | If you are unable to connect to this address it is likely that there is something wrong with your network configuration,
15 | for example you may have corporate policies implementing additional firewall rules on your account.
16 |
17 | To reduce the chances of this happening it is often simplest to run Dask Cloudprovider from within the cloud you are trying
18 | to use and configure private networking only. See your specific cluster manager docs for info.
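   |
   | As a quick check, you can test whether the scheduler port is reachable from your Python session (the address below is a placeholder for your scheduler's public IP):
   |
   | .. code-block:: python
   |
   |     import socket
   |
   |     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
   |     sock.settimeout(5)
   |     # connect_ex returns 0 if a TCP connection could be opened
   |     print(sock.connect_ex(("<scheduler public ip>", 8786)))
   |     sock.close()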
19 |
20 | Invalid CPU or Memory
21 | ---------------------
22 |
23 | When working with ``FargateCluster`` or ``ECSCluster``, CPU and memory arguments can only take values from a fixed set of combinations.
24 |
25 | So, for example, code like this will result in an error
26 |
27 | .. code-block:: python
28 |
29 | from dask_cloudprovider.aws import FargateCluster
30 | cluster = FargateCluster(
31 | image="daskdev/dask:latest",
32 | worker_cpu=256,
33 | worker_mem=30720,
34 | n_workers=2,
35 | fargate_use_private_ip=False,
36 | scheduler_timeout="15 minutes"
37 | )
38 | client = Client(cluster)
39 | cluster
40 |
41 | # botocore.errorfactory.ClientException:
42 | # An error occurred (ClientException) when calling the RegisterTaskDefinition operation:
43 | # No Fargate configuration exists for given values.
44 |
45 |
46 | This is because ECS and Fargate task definitions with ``CPU=256`` cannot have as much memory as that code is requesting.
47 |
48 | The AWS-accepted set of combinations is documented at
49 | https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html.
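   |
   | For example, keeping ``worker_cpu=256`` but lowering the memory request to a value Fargate accepts for that CPU size (512, 1024 or 2048 at the time of writing) avoids the error above:
   |
   | .. code-block:: python
   |
   |     from dask_cloudprovider.aws import FargateCluster
   |     cluster = FargateCluster(
   |         image="daskdev/dask:latest",
   |         worker_cpu=256,
   |         worker_mem=512,  # a memory value valid for 256 CPU units
   |         n_workers=2,
   |     )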
50 |
51 | Requested CPU Configuration Above Limit
52 | ---------------------------------------
53 | When creating a ``FargateCluster`` or ``ECSCluster``, or adding additional workers, you may receive an error response with
54 | "The requested CPU configuration is above your limit". This means that the scheduler and workers requested and any other
55 | EC2 resources you have running in that region use up more than your current service quota
56 | `limit for vCPUs `_.
57 |
58 | You can adjust the scheduler and/or worker CPUs with the ``scheduler_cpu`` and ``worker_cpu``
59 | `arguments `_. See the "Invalid CPU or Memory"
60 | section in this document for more information.
61 |
62 | However, to get the desired cluster configuration you'll need to request a service limit quota increase.
63 |
64 | Go to ``https://.aws.amazon.com/servicequotas/home/services/ec2/quotas`` and
65 | `request an increase `_ for
66 | "Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances".
67 |
68 | Pulling private Docker images
69 | -----------------------------------
70 |
71 | For cluster managers like ``EC2Cluster``, ``AzureVMCluster`` and ``GCPCluster`` Docker images will be pulled onto VMs created on the cloud of your choice.
72 |
73 | If you need to pull a private Docker image which requires authentication, each VM will need to be configured with credentials. These cluster managers accept
74 | an ``extra_bootstrap`` argument where you can provide additional bash commands to be run during startup. This is a good place to log into your Docker registry.
75 |
76 | .. code-block:: python
77 |
78 | from dask_cloudprovider.azure import AzureVMCluster
79 | cluster = AzureVMCluster(...
80 | docker_image="my_private_image:latest",
81 | extra_bootstrap=["docker login -u 'username' -p 'password'"])
82 |
83 | If you need to access Artifact/Container Registry in GCP, one way of doing it would be to authenticate Docker with
84 | `gcloud credential helper `_ by adding extra bootstrap params similar to
85 | the ones below:
86 |
87 | .. code-block:: python
88 |
89 | from dask_cloudprovider.gcp import GCPCluster
90 | cluster = GCPCluster(...
91 | docker_image=f"{region}-docker.pkg.dev/{project}/{repo}/{image}:{tag}",
92 | extra_bootstrap=[f"gcloud auth configure-docker {region}-docker.pkg.dev"])
93 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/helper.py:
--------------------------------------------------------------------------------
1 | """Helper functions for working with AWS services."""
2 |
3 | from datetime import datetime
4 |
5 | DEFAULT_SECURITY_GROUP_NAME = "dask-default"
6 |
7 |
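   | # Example: dict_to_aws({"hello": "world"}) returns [{"key": "hello", "value": "world"}],
   | # or [{"Key": "hello", "Value": "world"}] when upper=True.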
8 | def dict_to_aws(py_dict, upper=False, key_string=None, value_string=None):
9 | key_string = key_string or ("Key" if upper else "key")
10 | value_string = value_string or ("Value" if upper else "value")
11 | return [{key_string: key, value_string: value} for key, value in py_dict.items()]
12 |
13 |
14 | def aws_to_dict(aws_dict):
15 | try:
16 | return {item["key"]: item["value"] for item in aws_dict}
17 | except KeyError:
18 | return {item["Key"]: item["Value"] for item in aws_dict}
19 |
20 |
21 | # https://aws.amazon.com/blogs/messaging-and-targeting/how-to-handle-a-throttling-maximum-sending-rate-exceeded-error/
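   | # Example: get_sleep_duration(3) returns 0.09 (10 ms * 3**2 = 90 ms, well under the 5000 ms cap).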
22 | def get_sleep_duration(current_try, min_sleep_millis=10, max_sleep_millis=5000):
23 | current_try = max(1, current_try)
24 | current_sleep_millis = min_sleep_millis * current_try**2
25 | return min(current_sleep_millis, max_sleep_millis) / 1000 # return in seconds
26 |
27 |
28 | class ConfigMixin:
29 | def update_attr_from_config(self, attr: str, private: bool):
30 | """Update class attribute of given cluster based on config, if not already set. If `private` is True, the class
31 | attribute will be prefixed with an underscore.
32 |
33 | This mixin can be applied to any class that has a config dict attribute.
34 | """
35 | prefix = "_" if private else ""
36 | if getattr(self, f"{prefix}{attr}") is None:
37 | setattr(self, f"{prefix}{attr}", self.config.get(attr))
38 |
39 |
40 | async def get_latest_ami_id(client, name_glob, owner):
41 | images = await client.describe_images(
42 | Filters=[
43 | {"Name": "name", "Values": [name_glob]},
44 | {"Name": "owner-id", "Values": [owner]},
45 | ]
46 | )
47 | creation_date = None
48 | image_id = None
49 |
50 | for image in images["Images"]:
51 | image_date = datetime.strptime(image["CreationDate"], "%Y-%m-%dT%H:%M:%S.%fZ")
52 | if creation_date is None or creation_date < image_date:
53 | image_id = image["ImageId"]
54 | creation_date = image_date
55 | return image_id
56 |
57 |
58 | async def get_default_vpc(client):
59 | vpcs = (await client.describe_vpcs())["Vpcs"]
60 | [vpc] = [vpc for vpc in vpcs if vpc["IsDefault"]]
61 | return vpc["VpcId"]
62 |
63 |
64 | async def get_vpc_subnets(client, vpc):
65 | vpcs = (await client.describe_vpcs())["Vpcs"]
66 | [vpc] = [x for x in vpcs if x["VpcId"] == vpc]
67 | subnets = (await client.describe_subnets())["Subnets"]
68 | return [subnet["SubnetId"] for subnet in subnets if subnet["VpcId"] == vpc["VpcId"]]
69 |
70 |
71 | async def get_security_group(client, vpc, create_default=True):
72 | try:
73 | response = await client.describe_security_groups(
74 | GroupNames=[DEFAULT_SECURITY_GROUP_NAME]
75 | )
76 | groups = response["SecurityGroups"]
77 | except Exception:
78 | groups = []
79 | if len(groups) > 0:
80 | return groups[0]["GroupId"]
81 | else:
82 | if create_default:
83 | try:
84 | return await create_default_security_group(
85 | client, DEFAULT_SECURITY_GROUP_NAME, vpc
86 | )
87 | except Exception as e:
88 | raise RuntimeError(
89 | "Unable to create default security group. Please specify manually."
90 | ) from e
91 | else:
92 | raise RuntimeError(
93 | "Unable to find suitable security group. Please specify manually."
94 | )
95 |
96 |
97 | async def create_default_security_group(client, group_name, vpc, tags=None):
98 | response = await client.create_security_group(
99 | Description="A default security group for Dask",
100 | GroupName=group_name,
101 | VpcId=vpc,
102 | TagSpecifications=[
103 | {
104 | "ResourceType": "security-group",
105 | "Tags": [
106 | {"Key": k, "Value": v}
107 | for k, v in (tags or {}).items()
108 | if k and v # Filter out empty tags
109 | ],
110 | }
111 | ],
112 | DryRun=False,
113 | )
114 |
115 | await client.authorize_security_group_ingress(
116 | GroupId=response["GroupId"],
117 | IpPermissions=[
118 | {
119 | "IpProtocol": "TCP",
120 | "FromPort": 8786,
121 | "ToPort": 8787,
122 | "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "Anywhere"}],
123 | "Ipv6Ranges": [{"CidrIpv6": "::/0", "Description": "Anywhere"}],
124 | },
125 | {
126 | "IpProtocol": "TCP",
127 | "FromPort": 0,
128 | "ToPort": 65535,
129 | "UserIdGroupPairs": [{"GroupId": response["GroupId"]}],
130 | },
131 | ],
132 | DryRun=False,
133 | )
134 |
135 | return response["GroupId"]
136 |
--------------------------------------------------------------------------------
/dask_cloudprovider/gcp/tests/test_gcp.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import dask
4 | from dask_cloudprovider.gcp.instances import (
5 | GCPCluster,
6 | GCPCompute,
7 | GCPCredentialsError,
8 | )
9 | from dask.distributed import Client
10 | from distributed.core import Status
11 |
12 |
13 | def skip_without_credentials():
14 | try:
15 | _ = GCPCompute()
16 | except GCPCredentialsError:
17 | pytest.skip(
18 | """
19 | You must configure your GCP credentials to run this test.
20 |
21 | $ gcloud auth login
22 |
23 | or
24 |
25 | $ export GOOGLE_APPLICATION_CREDENTIALS=
26 |
27 | """
28 | )
29 |
30 | if not dask.config.get("cloudprovider.gcp.projectid"):
31 | pytest.skip(
32 | """
33 | You must configure your Google project ID to run this test.
34 |
35 | # ~/.config/dask/cloudprovider.yaml
36 | cloudprovider:
37 | gcp:
38 | projectid: "YOUR PROJECT ID"
39 |
40 | or
41 |
42 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID"
43 |
44 | """
45 | )
46 |
47 |
48 | @pytest.mark.asyncio
49 | async def test_init():
50 | skip_without_credentials()
51 |
52 | cluster = GCPCluster(asynchronous=True)
53 | assert cluster.status == Status.created
54 |
55 |
56 | @pytest.mark.asyncio
57 | async def test_get_cloud_init():
58 | skip_without_credentials()
59 | cloud_init = GCPCluster.get_cloud_init(
60 | security=True,
61 | docker_args="--privileged",
62 | extra_bootstrap=["gcloud auth print-access-token"],
63 | )
64 | assert "dask-scheduler" in cloud_init
65 | assert "# Bootstrap" in cloud_init
66 | assert " --privileged " in cloud_init
67 | assert "- gcloud auth print-access-token" in cloud_init
68 |
69 |
70 | @pytest.mark.asyncio
71 | @pytest.mark.timeout(1200)
72 | @pytest.mark.external
73 | async def test_create_cluster():
74 | skip_without_credentials()
75 |
76 | async with GCPCluster(
77 | asynchronous=True, env_vars={"FOO": "bar"}, security=True
78 | ) as cluster:
79 | assert cluster.status == Status.running
80 |
81 | cluster.scale(2)
82 | await cluster
83 | assert len(cluster.workers) == 2
84 |
85 | async with Client(cluster, asynchronous=True) as client:
86 |
87 | def inc(x):
88 | return x + 1
89 |
90 | def check_env():
91 | import os
92 |
93 | return os.environ["FOO"]
94 |
95 | assert await client.submit(inc, 10).result() == 11
96 | assert await client.submit(check_env).result() == "bar"
97 |
98 |
99 | @pytest.mark.asyncio
100 | @pytest.mark.timeout(1200)
101 | @pytest.mark.external
102 | async def test_create_cluster_sync():
103 | skip_without_credentials()
104 |
105 | cluster = GCPCluster(n_workers=1)
106 | client = Client(cluster)
107 |
108 | def inc(x):
109 | return x + 1
110 |
111 | assert client.submit(inc, 10).result() == 11
112 |
113 |
114 | @pytest.mark.asyncio
115 | @pytest.mark.timeout(1200)
116 | @pytest.mark.external
117 | async def test_create_rapids_cluster():
118 | skip_without_credentials()
119 |
120 | async with GCPCluster(
121 | source_image="projects/nv-ai-infra/global/images/ngc-docker-11-20200916",
122 | zone="us-east1-c",
123 | machine_type="n1-standard-1",
124 | filesystem_size=50,
125 | ngpus=2,
126 | gpu_type="nvidia-tesla-t4",
127 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
128 | worker_class="dask_cuda.CUDAWorker",
129 | worker_options={"rmm_pool_size": "15GB"},
130 | asynchronous=True,
131 | auto_shutdown=True,
132 | bootstrap=False,
133 | ) as cluster:
134 | assert cluster.status == Status.running
135 |
136 | cluster.scale(1)
137 |
138 | await cluster
139 |
140 | assert len(cluster.workers) == 1
141 |
142 | client = Client(cluster, asynchronous=True) # noqa
143 | await client
144 |
145 | def gpu_mem():
146 | from pynvml.smi import nvidia_smi
147 |
148 | nvsmi = nvidia_smi.getInstance()
149 | return nvsmi.DeviceQuery("memory.free, memory.total")
150 |
151 | results = await client.run(gpu_mem)
152 | for w, res in results.items():
153 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
154 | print(res)
155 |
156 |
157 | @pytest.mark.timeout(1200)
158 | @pytest.mark.external
159 | def test_create_rapids_cluster_sync():
160 | skip_without_credentials()
161 | cluster = GCPCluster(
162 | source_image="projects/nv-ai-infra/global/images/packer-1607527229",
163 | network="dask-gcp-network-test",
164 | zone="us-east1-c",
165 | machine_type="n1-standard-1",
166 | filesystem_size=50,
167 | ngpus=2,
168 | gpu_type="nvidia-tesla-t4",
169 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9",
170 | worker_class="dask_cuda.CUDAWorker",
171 | worker_options={"rmm_pool_size": "15GB"},
172 | asynchronous=False,
173 | bootstrap=False,
174 | )
175 |
176 | cluster.scale(1)
177 |
178 | client = Client(cluster) # noqa
179 | client.wait_for_workers(2)
180 |
181 | def gpu_mem():
182 | from pynvml.smi import nvidia_smi
183 |
184 | nvsmi = nvidia_smi.getInstance()
185 | return nvsmi.DeviceQuery("memory.free, memory.total")
186 |
187 | results = client.run(gpu_mem)
188 | for w, res in results.items():
189 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
190 | print(res)
191 | cluster.close()
192 |
--------------------------------------------------------------------------------
/doc/source/azure.rst:
--------------------------------------------------------------------------------
1 | Microsoft Azure
2 | ===============
3 |
4 | .. currentmodule:: dask_cloudprovider.azure
5 |
6 | .. autosummary::
7 | AzureVMCluster
8 |
9 | Overview
10 | --------
11 |
12 | Authentication
13 | ^^^^^^^^^^^^^^
14 |
15 | In order to create clusters on Azure you need to set your authentication credentials.
16 | You can do this via the ``az`` `command line tool `_.
17 |
18 | .. code-block:: console
19 |
20 | $ az login
21 |
22 | .. note::
23 |
24 | Setting the default output to ``table`` with ``az configure`` will make the ``az`` tool much easier to use.
25 |
26 | Resource Groups
27 | ^^^^^^^^^^^^^^^
28 |
29 | To create resources on Azure they must be placed in a resource group. Dask Cloudprovider will need a group to create
30 | Dask components in.
31 |
32 | You can list existing groups via the cli.
33 |
34 | .. code-block:: console
35 |
36 | $ az group list
37 |
38 | You can also create a new resource group if you do not have an existing one.
39 |
40 | .. code-block:: console
41 |
42 | $ az group create --location --name --subscription
43 |
44 | You can get a full list of locations with ``az account list-locations`` and subscriptions with ``az account list``.
45 |
46 | Take note of your resource group name for later.
47 |
48 | Virtual Networks
49 | ^^^^^^^^^^^^^^^^
50 |
51 | Compute resources on Azure must be placed in virtual networks (vnet). Dask Cloudprovider will require an existing vnet to connect
52 | compute resources to.
53 |
54 | You can list existing vnets via the cli.
55 |
56 | .. code-block:: console
57 |
58 | $ az network vnet list
59 |
60 | You can also create a new vnet via the cli.
61 |
62 | .. code-block:: console
63 |
64 | $ az network vnet create -g -n --address-prefix 10.0.0.0/16 \
65 | --subnet-name --subnet-prefix 10.0.0.0/24
66 |
67 | This command will create a new vnet in your resource group containing one subnet with the ``10.0.0.0/24`` prefix. For more than 255 compute resources you will need additional subnets.
68 |
69 | Take note of your vnet name for later.
70 |
71 | Security Groups
72 | ^^^^^^^^^^^^^^^
73 |
74 | To allow network traffic to reach your Dask cluster you will need to create a security group which allows traffic on ports 8786-8787 from wherever you are.
75 |
76 | You can list existing security groups via the cli.
77 |
78 | .. code-block:: console
79 |
80 | $ az network nsg list
81 |
82 | Or you can create a new security group.
83 |
84 | .. code-block:: console
85 |
86 | $ az network nsg create -g --name
87 | $ az network nsg rule create -g --nsg-name -n MyNsgRuleWithAsg \
88 | --priority 500 --source-address-prefixes Internet --destination-port-ranges 8786 8787 \
89 | --destination-address-prefixes '*' --access Allow --protocol Tcp --description "Allow Internet to Dask on ports 8786,8787."
90 |
91 | This example allows all traffic to 8786-8787 from the internet. It is recommended you make your rules more restrictive than this by limiting them to your corporate network
92 | or a specific IP.
93 |
94 | Again take note of this security group name for later.
95 |
96 | Extra options
97 | ^^^^^^^^^^^^^
98 |
99 | To further customize the VMs created, you can provide ``extra_vm_options`` to :class:`AzureVMCluster`. For example, to set the identity
100 | of the virtual machines to a (previously created) user assigned identity, create an ``azure.mgmt.compute.models.VirtualMachineIdentity``
101 |
102 | .. code-block:: python
103 |
104 | >>> import os
105 | >>> import azure.identity
106 | >>> import dask_cloudprovider.azure
107 | >>> import azure.mgmt.compute.models
108 |
109 | >>> subscription_id = os.environ["DASK_CLOUDPROVIDER__AZURE__SUBSCRIPTION_ID"]
110 | >>> rg_name = os.environ["DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP"]
111 | >>> identity_name = "dask-cloudprovider-identity"
112 | >>> v = azure.mgmt.compute.models.UserAssignedIdentitiesValue()
113 | >>> user_assigned_identities = {
114 | ... f"/subscriptions/{subscription_id}/resourcegroups/{rg_name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{identity_name}": v
115 | ... }
116 | >>> identity = azure.mgmt.compute.models.VirtualMachineIdentity(
117 | ... type="UserAssigned",
118 | ... user_assigned_identities=user_assigned_identities
119 | ... )
120 |
121 |
122 | And then provide that to :class:`AzureVMCluster`
123 |
124 | .. code-block:: python
125 |
126 | >>> cluster = dask_cloudprovider.azure.AzureVMCluster(extra_vm_options={"identity": identity.as_dict()})
127 | >>> cluster.scale(1)
128 |
129 | Dask Configuration
130 | ^^^^^^^^^^^^^^^^^^
131 |
132 | You'll provide the names or IDs of the Azure resources when you create a :class:`AzureVMCluster`. You can specify
133 | these values manually, or use Dask's `configuration system `_.
134 | For example, the ``resource_group`` value can be specified using an environment variable:
135 |
136 | .. code-block:: console
137 |
138 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP=""
139 | $ python
140 |
141 | Or you can set it in a YAML configuration file.
142 |
143 | .. code-block:: yaml
144 |
145 | cloudprovider:
146 | azure:
147 | resource_group: ""
148 | azurevm:
149 | vnet: ""
150 |
151 | Note that the options controlling the VMs are under the ``cloudprovider.azure.azurevm`` key.
152 |
153 | See :doc:`config` for more.
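   |
   | Alternatively, you can pass these values directly as keyword arguments when constructing the cluster. A minimal sketch with placeholder names:
   |
   | .. code-block:: python
   |
   |     >>> from dask_cloudprovider.azure import AzureVMCluster
   |     >>> cluster = AzureVMCluster(
   |     ...     location="<location>",
   |     ...     resource_group="<resource group>",
   |     ...     vnet="<vnet>",
   |     ...     security_group="<security group>",
   |     ... )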
154 |
155 | AzureVM
156 | -------
157 |
158 | .. autoclass:: AzureVMCluster
159 | :members:
160 |
161 | Azure Spot Instance Plugin
162 | --------------------------
163 |
164 | .. autoclass:: AzurePreemptibleWorkerPlugin
165 | :members:
166 |
--------------------------------------------------------------------------------
/dask_cloudprovider/aws/tests/test_ecs.py:
--------------------------------------------------------------------------------
1 | from unittest import mock
2 | from unittest.mock import AsyncMock
3 |
4 | import pytest
5 |
6 | aiobotocore = pytest.importorskip("aiobotocore")
7 |
8 |
9 | def test_import():
10 | from dask_cloudprovider.aws import ECSCluster # noqa
11 | from dask_cloudprovider.aws import FargateCluster # noqa
12 |
13 |
14 | def test_reuse_ecs_cluster():
15 | from dask_cloudprovider.aws import ECSCluster # noqa
16 |
17 | fc1_name = "Spooky"
18 | fc2_name = "Weevil"
19 | vpc_name = "MyNetwork"
20 | vpc_subnets = ["MySubnet1", "MySubnet2"]
21 | cluster_arn = "CompletelyMadeUp"
22 | cluster_name = "Crunchy"
23 | log_group_name = "dask-ecs"
24 |
25 | expected_execution_role_name1 = f"dask-{fc1_name}-execution-role"
26 | expected_task_role_name1 = f"dask-{fc1_name}-task-role"
27 | expected_log_stream_prefix1 = f"{cluster_name}/{fc1_name}"
28 | expected_security_group_name1 = f"dask-{fc1_name}-security-group"
29 | expected_scheduler_task_name1 = f"dask-{fc1_name}-scheduler"
30 | expected_worker_task_name1 = f"dask-{fc1_name}-worker"
31 |
32 | expected_execution_role_name2 = f"dask-{fc2_name}-execution-role"
33 | expected_task_role_name2 = f"dask-{fc2_name}-task-role"
34 | expected_log_stream_prefix2 = f"{cluster_name}/{fc2_name}"
35 | expected_security_group_name2 = f"dask-{fc2_name}-security-group"
36 | expected_scheduler_task_name2 = f"dask-{fc2_name}-scheduler"
37 | expected_worker_task_name2 = f"dask-{fc2_name}-worker"
38 |
39 | mock_client = AsyncMock()
40 | mock_client.describe_clusters.return_value = {
41 | "clusters": [{"clusterName": cluster_name}]
42 | }
43 | mock_client.list_account_settings.return_value = {"settings": {"value": "enabled"}}
44 | mock_client.create_role.return_value = {"Role": {"Arn": "Random"}}
45 | mock_client.describe_log_groups.return_value = {"logGroups": []}
46 |
47 | class MockSession:
48 | class MockClient:
49 | async def __aenter__(self, *args, **kwargs):
50 | return mock_client
51 |
52 | async def __aexit__(self, *args, **kwargs):
53 | return
54 |
55 | def create_client(self, *args, **kwargs):
56 | return MockSession.MockClient()
57 |
58 | with (
59 | mock.patch(
60 | "dask_cloudprovider.aws.ecs.get_session", return_value=MockSession()
61 | ),
62 | mock.patch("distributed.deploy.spec.SpecCluster._start"),
63 | mock.patch("weakref.finalize"),
64 | ):
65 | # Make ourselves a test cluster
66 | fc1 = ECSCluster(
67 | name=fc1_name,
68 | cluster_arn=cluster_arn,
69 | vpc=vpc_name,
70 | subnets=vpc_subnets,
71 | skip_cleanup=True,
72 | )
73 | # Are we re-using the existing ECS cluster?
74 | assert fc1.cluster_name == cluster_name
75 | # Have we made completely unique AWS resources to run on that cluster?
76 | assert fc1._execution_role_name == expected_execution_role_name1
77 | assert fc1._task_role_name == expected_task_role_name1
78 | assert fc1._cloudwatch_logs_stream_prefix == expected_log_stream_prefix1
79 | assert (
80 | fc1.scheduler_spec["options"]["log_stream_prefix"]
81 | == expected_log_stream_prefix1
82 | )
83 | assert (
84 | fc1.new_spec["options"]["log_stream_prefix"] == expected_log_stream_prefix1
85 | )
86 | assert fc1.cloudwatch_logs_group == log_group_name
87 | assert fc1.scheduler_spec["options"]["log_group"] == log_group_name
88 | assert fc1.new_spec["options"]["log_group"] == log_group_name
89 | sg_calls = mock_client.create_security_group.call_args_list
90 | assert len(sg_calls) == 1
91 | assert sg_calls[0].kwargs["GroupName"] == expected_security_group_name1
92 | td_calls = mock_client.register_task_definition.call_args_list
93 | assert len(td_calls) == 2
94 | assert td_calls[0].kwargs["family"] == expected_scheduler_task_name1
95 | assert td_calls[1].kwargs["family"] == expected_worker_task_name1
96 |
97 | # Reset mocks ready for second cluster
98 | mock_client.create_security_group.reset_mock()
99 | mock_client.register_task_definition.reset_mock()
100 |
101 | # Make ourselves a second test cluster on the same ECS cluster
102 | fc2 = ECSCluster(
103 | name=fc2_name,
104 | cluster_arn=cluster_arn,
105 | vpc=vpc_name,
106 | subnets=vpc_subnets,
107 | skip_cleanup=True,
108 | )
109 | # Are we re-using the existing ECS cluster?
110 | assert fc2.cluster_name == cluster_name
111 | # Have we made completely unique AWS resources to run on that cluster?
112 | assert fc2._execution_role_name == expected_execution_role_name2
113 | assert fc2._task_role_name == expected_task_role_name2
114 | assert fc2._cloudwatch_logs_stream_prefix == expected_log_stream_prefix2
115 | assert (
116 | fc2.scheduler_spec["options"]["log_stream_prefix"]
117 | == expected_log_stream_prefix2
118 | )
119 | assert (
120 | fc2.new_spec["options"]["log_stream_prefix"] == expected_log_stream_prefix2
121 | )
122 | assert fc2.cloudwatch_logs_group == log_group_name
123 | assert fc2.scheduler_spec["options"]["log_group"] == log_group_name
124 | assert fc2.new_spec["options"]["log_group"] == log_group_name
125 | sg_calls = mock_client.create_security_group.call_args_list
126 | assert len(sg_calls) == 1
127 | assert sg_calls[0].kwargs["GroupName"] == expected_security_group_name2
128 | td_calls = mock_client.register_task_definition.call_args_list
129 | assert len(td_calls) == 2
130 | assert td_calls[0].kwargs["family"] == expected_scheduler_task_name2
131 | assert td_calls[1].kwargs["family"] == expected_worker_task_name2
132 |
133 | # Finish up
134 | fc1.close()
135 | fc2.close()
136 |
--------------------------------------------------------------------------------
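Editor's note: the test above exercises attaching more than one Dask cluster to a single pre-existing ECS cluster. As a rough usage sketch of the same pattern outside the test suite (the ARN, VPC and subnet IDs below are placeholders, and running this would create real AWS resources):

    from dask_cloudprovider.aws import ECSCluster

    shared = dict(
        cluster_arn="arn:aws:ecs:eu-west-2:123456789012:cluster/shared-cluster",  # placeholder ARN
        vpc="vpc-0123456789abcdef0",           # placeholder VPC ID
        subnets=["subnet-0123456789abcdef0"],  # placeholder subnet ID
        skip_cleanup=True,
    )

    # Each ECSCluster creates its own IAM roles, security group, log stream prefix
    # and task definitions, so both Dask clusters can coexist on the one ECS cluster.
    fc1 = ECSCluster(name="first-cluster", **shared)
    fc2 = ECSCluster(name="second-cluster", **shared)

    fc1.close()
    fc2.close()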
/dask_cloudprovider/hetzner/vserver.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import dask
3 |
4 | from dask_cloudprovider.generic.vmcluster import (
5 | VMCluster,
6 | VMInterface,
7 | SchedulerMixin,
8 | WorkerMixin,
9 | )
10 |
11 | try:
12 | import hcloud
13 | except ImportError as e:
14 | msg = (
15 | "Dask Cloud Provider Hetzner requirements are not installed.\n\n"
16 | "Please pip install as follows:\n\n"
17 | ' pip install "dask-cloudprovider[hcloud]" --upgrade # or python -m pip install'
18 | )
19 | raise ImportError(msg) from e
20 |
21 | from hcloud.images.domain import Image
22 | from hcloud.server_types.domain import ServerType
23 | from hcloud.actions.domain import Action
24 |
25 |
26 | class VServer(VMInterface):
27 | def __init__(
28 | self,
29 | cluster: str,
30 | config,
31 | env_vars: dict = None,
32 | bootstrap=None,
33 | extra_bootstrap=None,
34 | docker_image: str = None,
35 | image: str = None,
36 | location: str = None,
37 | server_type: str = None,
38 | *args,
39 | **kwargs,
40 | ):
41 | super().__init__(*args, **kwargs)
42 | self.cluster = cluster
43 | self.config = config
44 | self.location = location
45 | self.bootstrap = bootstrap
46 | self.extra_bootstrap = extra_bootstrap
47 | self.env_vars = env_vars
48 | self.client = hcloud.Client(self.config.get("token"))
49 | self.server_type = ServerType(server_type)
50 | self.image = Image(name=image)
51 | self.docker_image = docker_image
52 |
53 | async def create_vm(self):
54 | await self.call_async(
55 | self.client.servers.create,
56 | server_type=self.server_type,
57 | image=self.image,
58 | name=self.name,
59 | user_data=self.cluster.render_process_cloud_init(self),
60 | )
61 |
62 | self.server = self.client.servers.get_by_name(self.name)
63 | for action in self.server.get_actions():
64 | while action.status != Action.STATUS_SUCCESS:
65 | await self.call_async(action.reload)
66 | await asyncio.sleep(0.1)
67 | self.cluster._log(f"Created Hetzner vServer {self.name}")
68 |
69 | return self.server.public_net.ipv4.ip, None
70 |
71 | async def destroy_vm(self):
72 | await self.call_async(self.client.servers.delete, server=self.server)
73 | self.cluster._log(f"Terminated vServer {self.name}")
74 |
75 |
76 | class HetznerScheduler(SchedulerMixin, VServer):
77 | """Scheduler running on a Hetzner server."""
78 |
79 |
80 | class HetznerWorker(WorkerMixin, VServer):
81 | """Worker running on a Hetzner server."""
82 |
83 |
84 | class HetznerCluster(VMCluster):
85 | """Cluster running on Hetzner cloud vServers.
86 |
87 | VMs in Hetzner are referred to as vServers. This cluster manager constructs a Dask cluster
88 | running on VMs.
89 |
90 | When configuring your cluster you may find it useful to install the ``hcloud`` tool for querying the
91 | Hetzner API for available options.
92 |
93 | https://github.com/hetznercloud/cli
94 |
95 | Parameters
96 | ----------
97 | image: str
98 | The image to use for the host OS. This should be a Ubuntu variant.
99 | You can list available images with ``hcloud image list|grep Ubuntu``.
100 | location: str
101 |         The Hetzner location to launch your cluster in. A full list can be obtained with ``hcloud location list``.
102 |     server_type: str
103 |         The VM server type. You can get a full list with ``hcloud server-type list``.
104 |         The default is ``cx11`` which is a vServer with 2GB RAM and 1 vCPU.
105 | n_workers: int
106 | Number of workers to initialise the cluster with. Defaults to ``0``.
107 | worker_module: str
108 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
109 | worker_options: dict
110 | Params to be passed to the worker class.
111 | See :class:`distributed.worker.Worker` for default worker class.
112 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
113 | scheduler_options: dict
114 | Params to be passed to the scheduler class.
115 | See :class:`distributed.scheduler.Scheduler`.
116 | env_vars: dict
117 | Environment variables to be passed to the worker.
118 | extra_bootstrap: list[str] (optional)
119 | Extra commands to be run during the bootstrap phase.
120 |
121 | Example
122 | --------
123 |
124 | >>> from dask_cloudprovider.hetzner import HetznerCluster
125 | >>> cluster = HetznerCluster(n_workers=1)
126 |
127 | >>> from dask.distributed import Client
128 | >>> client = Client(cluster)
129 |
130 | >>> import dask.array as da
131 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
132 | >>> arr.mean().compute()
133 |
134 | >>> client.close()
135 | >>> cluster.close()
136 |
137 | """
138 |
139 | def __init__(
140 | self,
141 | bootstrap: str = None,
142 | image: str = None,
143 | location: str = None,
144 | server_type: str = None,
145 | docker_image: str = None,
146 | **kwargs,
147 | ):
148 | self.config = dask.config.get("cloudprovider.hetzner", {})
149 |
150 | self.scheduler_class = HetznerScheduler
151 | self.worker_class = HetznerWorker
152 |
153 | self.image = dask.config.get("cloudprovider.hetzner.image", override_with=image)
154 | self.docker_image = dask.config.get(
155 | "cloudprovider.hetzner.docker_image", override_with=docker_image
156 | )
157 | self.location = dask.config.get(
158 | "cloudprovider.hetzner.location", override_with=location
159 | )
160 | self.server_type = dask.config.get(
161 | "cloudprovider.hetzner.server_type", override_with=server_type
162 | )
163 | self.bootstrap = dask.config.get(
164 | "cloudprovider.hetzner.bootstrap", override_with=bootstrap
165 | )
166 |
167 | self.options = {
168 | "bootstrap": self.bootstrap,
169 | "cluster": self,
170 | "config": self.config,
171 | "docker_image": self.docker_image,
172 | "image": self.image,
173 | "location": self.location,
174 | "server_type": self.server_type,
175 | }
176 | self.scheduler_options = {**self.options}
177 | self.worker_options = {**self.options}
178 | super().__init__(**kwargs)
179 |
--------------------------------------------------------------------------------
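Editor's note: a minimal usage sketch for the cluster manager above, assuming the Hetzner API token is supplied via Dask config (the token value is a placeholder; the server type, image and location mirror the defaults documented in cloudprovider.yaml, and running this creates a real vServer):

    import dask
    from dask_cloudprovider.hetzner import HetznerCluster

    # Placeholder token; normally set via the cloudprovider.hetzner.token config key.
    dask.config.set({"cloudprovider.hetzner.token": "my-hetzner-api-token"})

    cluster = HetznerCluster(
        server_type="cx11",     # 1 vCPU, 2GB RAM
        image="ubuntu-20.04",
        location="fsn1",
        n_workers=2,
    )
    cluster.close()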
/dask_cloudprovider/aws/tests/test_ec2.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | aiobotocore = pytest.importorskip("aiobotocore")
4 |
5 | from dask_cloudprovider.aws.ec2 import EC2Cluster
6 | from dask_cloudprovider.aws.helper import get_latest_ami_id
7 | from dask.distributed import Client
8 | from distributed.core import Status
9 |
10 |
11 | async def skip_without_credentials():
12 | try:
13 | async with aiobotocore.get_session().create_client("sts") as client:
14 | await client.get_caller_identity()
15 | except Exception:
16 | pytest.skip(
17 | """
18 |             You must configure your AWS credentials to run this test.
19 |
20 | $ aws configure
21 |
22 | """
23 | )
24 |
25 |
26 | @pytest.fixture
27 | @pytest.mark.external
28 | async def cluster():
29 | await skip_without_credentials()
30 | async with EC2Cluster(asynchronous=True) as cluster:
31 | yield cluster
32 |
33 |
34 | @pytest.fixture
35 | @pytest.mark.external
36 | async def cluster_sync():
37 | await skip_without_credentials()
38 | cluster = EC2Cluster()
39 | yield cluster
40 |
41 |
42 | @pytest.fixture
43 | @pytest.mark.external
44 | async def cluster_rapids():
45 | await skip_without_credentials()
46 | async with EC2Cluster(
47 | asynchronous=True,
48 | # Deep Learning AMI (Ubuntu 18.04)
49 | ami="ami-0c7c7d78f752f8f17",
50 | # Python version must match local version and CUDA version must match AMI CUDA version
51 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
52 | instance_type="p3.2xlarge",
53 | bootstrap=False,
54 | filesystem_size=120,
55 | ) as cluster:
56 | yield cluster
57 |
58 |
59 | @pytest.fixture
60 | @pytest.mark.external
61 | async def cluster_rapids_packer():
62 | await skip_without_credentials()
63 | async with EC2Cluster(
64 | asynchronous=True,
65 | # Packer AMI
66 | ami="ami-04e5539cb82859e69",
67 | # Python version must match local version and CUDA version must match AMI CUDA version
68 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
69 | instance_type="p3.2xlarge",
70 | bootstrap=False,
71 | filesystem_size=120,
72 | ) as cluster:
73 | yield cluster
74 |
75 |
76 | @pytest.fixture
77 | @pytest.mark.external
78 | async def cluster_packer():
79 | await skip_without_credentials()
80 | async with EC2Cluster(
81 | asynchronous=True, ami="ami-0e6187593ace05a0c", bootstrap=False
82 | ) as cluster:
83 | yield cluster
84 |
85 |
86 | @pytest.fixture
87 | async def ec2_client():
88 | await skip_without_credentials()
89 | async with aiobotocore.get_session().create_client("ec2") as client:
90 | yield client
91 |
92 |
93 | @pytest.mark.asyncio
94 | @pytest.mark.external
95 | async def test_init():
96 | cluster = EC2Cluster(asynchronous=True)
97 | assert cluster.status == Status.created
98 |
99 |
100 | @pytest.mark.asyncio
101 | @pytest.mark.timeout(600)
102 | async def test_create_cluster(cluster):
103 | assert cluster.status == Status.running
104 |
105 | cluster.scale(2)
106 | await cluster
107 | assert len(cluster.workers) == 2
108 |
109 | async with Client(cluster, asynchronous=True) as client:
110 | inc = lambda x: x + 1
111 | assert await client.submit(inc, 10).result() == 11
112 |
113 |
114 | @pytest.mark.asyncio
115 | @pytest.mark.timeout(600)
116 | async def test_create_cluster_sync(cluster_sync):
117 | assert cluster_sync.status == Status.running
118 |
119 | cluster_sync.scale(2)
120 |
121 | with Client(cluster_sync) as client:
122 | inc = lambda x: x + 1
123 | assert client.submit(inc, 10).result() == 11
124 |
125 |
126 | @pytest.mark.asyncio
127 | @pytest.mark.timeout(600)
128 | async def test_create_cluster_with_packer(cluster_packer):
129 | assert cluster_packer.status == Status.running
130 |
131 | cluster_packer.scale(2)
132 | await cluster_packer
133 | assert len(cluster_packer.workers) == 2
134 |
135 | async with Client(cluster_packer, asynchronous=True) as client:
136 | inc = lambda x: x + 1
137 | assert await client.submit(inc, 10).result() == 11
138 |
139 |
140 | @pytest.mark.asyncio
141 | @pytest.mark.timeout(1200)
142 | async def test_create_rapids_cluster(cluster_rapids):
143 | assert cluster_rapids.status == Status.running
144 |
145 | cluster_rapids.scale(1)
146 | await cluster_rapids
147 | assert len(cluster_rapids.workers) == 1
148 |
149 | async with Client(cluster_rapids, asynchronous=True) as client:
150 |
151 | def f():
152 | import cupy
153 |
154 | return float(cupy.random.random(100).mean())
155 |
156 | assert await client.submit(f).result() < 1
157 |
158 |
159 | @pytest.mark.asyncio
160 | @pytest.mark.timeout(1200)
161 | async def test_create_rapids_cluster_with_packer(cluster_rapids_packer):
162 | assert cluster_rapids_packer.status == Status.running
163 |
164 | cluster_rapids_packer.scale(1)
165 | await cluster_rapids_packer
166 | assert len(cluster_rapids_packer.workers) == 1
167 |
168 | async with Client(cluster_rapids_packer, asynchronous=True) as client:
169 |
170 | def f():
171 | import cupy
172 |
173 | return float(cupy.random.random(100).mean())
174 |
175 | assert await client.submit(f).result() < 1
176 |
177 |
178 | @pytest.mark.asyncio
179 | async def test_get_ubuntu_image(ec2_client):
180 | image = await get_latest_ami_id(
181 | ec2_client,
182 | "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
183 | "099720109477", # Canonical
184 | )
185 | assert "ami-" in image
186 |
187 |
188 | @pytest.mark.asyncio
189 | async def test_get_cloud_init():
190 | cloud_init = EC2Cluster.get_cloud_init(
191 | env_vars={"EXTRA_PIP_PACKAGES": "s3fs"},
192 | docker_args="--privileged",
193 | )
194 | assert "systemctl start docker" in cloud_init
195 | assert ' -e EXTRA_PIP_PACKAGES="s3fs" ' in cloud_init
196 | assert " --privileged " in cloud_init
197 |
198 |
199 | @pytest.mark.asyncio
200 | async def test_get_cloud_init_rapids():
201 | cloud_init = EC2Cluster.get_cloud_init(
202 | # Deep Learning AMI (Ubuntu 18.04)
203 | ami="ami-0c7c7d78f752f8f17",
204 | # Python version must match local version and CUDA version must match AMI CUDA version
205 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
206 | instance_type="p3.2xlarge",
207 | bootstrap=False,
208 | filesystem_size=120,
209 | )
210 | assert "rapidsai" in cloud_init
211 |
--------------------------------------------------------------------------------
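Editor's note: the ``cluster_packer`` and ``cluster_rapids_packer`` fixtures above skip bootstrapping because their AMIs already ship with Docker. A hedged sketch of the same idea outside the test suite (the AMI ID is an illustrative placeholder, and this would launch real EC2 instances):

    from dask_cloudprovider.aws import EC2Cluster

    cluster = EC2Cluster(
        ami="ami-0123456789abcdef0",  # placeholder: an AMI pre-built (e.g. with Packer) that includes Docker
        bootstrap=False,              # skip installing Docker at boot since the AMI provides it
        n_workers=2,
    )
    cluster.close()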
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Dask Cloud Provider documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Feb 8 17:56:16 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 |
23 | from datetime import datetime
24 |
25 | sys.path.insert(0, os.path.abspath(".."))
26 |
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = [
38 | "sphinx.ext.autodoc",
39 | "sphinx.ext.todo",
40 | "sphinx.ext.ifconfig",
41 | "sphinx.ext.viewcode",
42 | "sphinx.ext.autosummary",
43 | "sphinx.ext.extlinks",
44 | "sphinx.ext.intersphinx",
45 | "numpydoc",
46 | ]
47 |
48 | # Add any paths that contain templates here, relative to this directory.
49 | templates_path = ["_templates"]
50 |
51 | # The suffix(es) of source filenames.
52 | # You can specify multiple suffix as a list of string:
53 | #
54 | # source_suffix = ['.rst', '.md']
55 | source_suffix = ".rst"
56 |
57 | # The master toctree document.
58 | master_doc = "index"
59 |
60 | # General information about the project.
61 | project = "Dask Cloud Provider"
62 | copyright = f"{datetime.now().year}, Dask Developers"
63 | author = "Dask Developers"
64 |
65 | # The version info for the project you're documenting, acts as replacement for
66 | # |version| and |release|, also used in various other places throughout the
67 | # built documents.
68 | #
69 | # The short X.Y version.
70 | from dask_cloudprovider import __version__
71 |
72 | version = __version__
73 | # The full version, including alpha/beta/rc tags.
74 | release = __version__
75 |
76 | # The language for content autogenerated by Sphinx. Refer to documentation
77 | # for a list of supported languages.
78 | #
79 | # This is also used if you do content translation via gettext catalogs.
80 | # Usually you set "language" from the command line for these cases.
81 | language = "en"
82 |
83 | # List of patterns, relative to source directory, that match files and
84 | # directories to ignore when looking for source files.
85 | # These patterns also affect html_static_path and html_extra_path
86 | exclude_patterns = []
87 |
88 | # The name of the Pygments (syntax highlighting) style to use.
89 | # Commenting this out for now, if we register dask pygments,
90 | # then eventually this line can be:
91 | # pygments_style = "dask"
92 |
93 | # If true, `todo` and `todoList` produce output, else they produce nothing.
94 | todo_include_todos = False
95 |
96 |
97 | # -- Options for HTML output ----------------------------------------------
98 |
99 | # The theme to use for HTML and HTML Help pages. See the documentation for
100 | # a list of builtin themes.
101 | #
102 | # html_theme = 'alabaster'
103 |
104 | html_theme = "dask_sphinx_theme"
105 |
106 | # Theme options are theme-specific and customize the look and feel of a theme
107 | # further. For a list of options available for each theme, see the
108 | # documentation.
109 | #
110 | # html_theme_options = {}
111 |
112 | # Add any paths that contain custom static files (such as style sheets) here,
113 | # relative to this directory. They are copied after the builtin static files,
114 | # so a file named "default.css" will overwrite the builtin "default.css".
115 | html_static_path = []
116 |
117 | # Custom sidebar templates, must be a dictionary that maps document names
118 | # to template names.
119 | #
120 | # This is required for the alabaster theme
121 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
122 | # html_sidebars = {
123 | # "**": [
124 | # "relations.html", # needs 'show_related': True theme option to display
125 | # "searchbox.html",
126 | # ]
127 | # }
128 |
129 |
130 | # -- Options for HTMLHelp output ------------------------------------------
131 |
132 | # Output file base name for HTML help builder.
133 | htmlhelp_basename = "dask-cloudprovider-doc"
134 |
135 |
136 | # -- Options for LaTeX output ---------------------------------------------
137 |
138 | latex_elements = {
139 | # The paper size ('letterpaper' or 'a4paper').
140 | #
141 | # 'papersize': 'letterpaper',
142 | # The font size ('10pt', '11pt' or '12pt').
143 | #
144 | # 'pointsize': '10pt',
145 | # Additional stuff for the LaTeX preamble.
146 | #
147 | # 'preamble': '',
148 | # Latex figure (float) alignment
149 | #
150 | # 'figure_align': 'htbp',
151 | }
152 |
153 | # Grouping the document tree into LaTeX files. List of tuples
154 | # (source start file, target name, title,
155 | # author, documentclass [howto, manual, or own class]).
156 | latex_documents = [
157 | (
158 | master_doc,
159 | "dask-cloudprovider.tex",
160 | "Dask Cloud Provider Documentation",
161 | "Dask Cloud Provider Developers",
162 | "manual",
163 | )
164 | ]
165 |
166 |
167 | # -- Options for manual page output ---------------------------------------
168 |
169 | # One entry per manual page. List of tuples
170 | # (source start file, name, description, authors, manual section).
171 | man_pages = [
172 | (master_doc, "dask-cloudprovider", "Dask Cloud Provider Documentation", [author], 1)
173 | ]
174 |
175 |
176 | # -- Options for Texinfo output -------------------------------------------
177 |
178 | # Grouping the document tree into Texinfo files. List of tuples
179 | # (source start file, target name, title, author,
180 | # dir menu entry, description, category)
181 | texinfo_documents = [
182 | (
183 | master_doc,
184 | "Dask Cloud Provider",
185 | "Dask Cloud Provider Documentation",
186 | author,
187 | "Dask-CloudProvider",
188 | "One line description of project.",
189 | "Miscellaneous",
190 | )
191 | ]
192 |
193 |
194 | intersphinx_mapping = {
195 | "python": ("https://docs.python.org/3", None),
196 | "dask": ("https://docs.dask.org/en/latest/", None),
197 | "distributed": ("https://distributed.dask.org/en/latest/", None),
198 | "dask_kubernetes": ("https://kubernetes.dask.org/en/latest/", None),
199 | }
200 |
--------------------------------------------------------------------------------
/dask_cloudprovider/cli/ecs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from asyncio import sleep
3 | import sys
4 |
5 | import click
6 | from distributed.cli.utils import install_signal_handlers
7 | from distributed.core import Status
8 | from tornado.ioloop import IOLoop, TimeoutError
9 |
10 | from dask_cloudprovider.aws import ECSCluster
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | @click.command()
17 | @click.option("--fargate", is_flag=True, help="Turn on fargate mode (default off)")
18 | @click.option(
19 | "--fargate-scheduler",
20 | is_flag=True,
21 | help="Turn on fargate mode for scheduler (default off)",
22 | )
23 | @click.option(
24 | "--fargate-workers",
25 | is_flag=True,
26 | help="Turn on fargate mode for workers (default off)",
27 | )
28 | @click.option(
29 | "--image",
30 | type=str,
31 | default=None,
32 | help="Docker image to use for scheduler and workers",
33 | )
34 | @click.option(
35 | "--scheduler-cpu",
36 | type=int,
37 | default=None,
38 | help="Scheduler CPU reservation in milli-CPU",
39 | )
40 | @click.option(
41 | "--scheduler-mem", type=int, default=None, help="Scheduler memory reservation in MB"
42 | )
43 | @click.option(
44 | "--scheduler-port",
45 | type=int,
46 | default=8786,
47 | help="The port on which the scheduler will be reachable to the workers and clients",
48 | )
49 | @click.option(
50 | "--scheduler-timeout",
51 | type=int,
52 | default=None,
53 |     help="Scheduler timeout (e.g. 5 minutes)",
54 | )
55 | @click.option(
56 | "--worker-cpu", type=int, default=None, help="Worker CPU reservation in milli-CPU"
57 | )
58 | @click.option(
59 | "--worker-mem", type=int, default=None, help="Worker memory reservation in MB"
60 | )
61 | @click.option(
62 | "--n-workers",
63 | type=int,
64 | default=None,
65 | help="Number of workers to start with the cluster",
66 | )
67 | @click.option(
68 | "--cluster-arn",
69 | type=str,
70 | default=None,
71 | help="The ARN of an existing ECS cluster to use",
72 | )
73 | @click.option(
74 | "--cluster-name-template",
75 | type=str,
76 | default=None,
77 | help="A template to use for the cluster name if `--cluster-arn` is not set",
78 | )
79 | @click.option(
80 | "--execution-role-arn",
81 | type=str,
82 | default=None,
83 | help="The ARN of an existing IAM role to use for ECS execution",
84 | )
85 | @click.option(
86 | "--task-role-arn",
87 | type=str,
88 | default=None,
89 | help="The ARN of an existing IAM role to give to the tasks",
90 | )
91 | @click.option(
92 | "--task-role-policy",
93 | type=str,
94 | default=None,
95 | multiple=True,
96 | help="Policy to attach to a task if --task-role-arn is not set (can be used multiple times)",
97 | )
98 | @click.option(
99 | "--cloudwatch-logs-group", type=str, default=None, help="The group to send logs to"
100 | )
101 | @click.option(
102 | "--cloudwatch-logs-stream-prefix",
103 | type=str,
104 | default=None,
105 | help="An optional prefix to use for log streams",
106 | )
107 | @click.option(
108 | "--cloudwatch-logs-default-retention",
109 | type=int,
110 | default=None,
111 |     help="Number of days to retain logs",
112 | )
113 | @click.option(
114 | "--vpc",
115 | type=str,
116 | default=None,
117 | help="The ID of an existing VPC (uses default if not specified)",
118 | )
119 | @click.option(
120 | "--subnet",
121 | type=str,
122 | default=None,
123 | multiple=True,
124 |     help="VPC subnet to use (can be used multiple times, will default to all if none specified)",
125 | )
126 | @click.option(
127 | "--security-group",
128 | type=str,
129 | default=None,
130 | multiple=True,
131 | help="Security group to use for task communication (can be used multiple times, will be created if not specified)",
132 | )
133 | @click.option(
134 | "--environment",
135 | type=str,
136 | default=None,
137 | multiple=True,
138 | help="Environment variable for the scheduler and workers in the form FOO=bar (can be used multiple times)",
139 | )
140 | @click.option(
141 | "--tag",
142 | type=str,
143 | default=None,
144 | multiple=True,
145 | help="Tag to apply to all resources created automatically in the form FOO=bar (can be used multiple times)",
146 | )
147 | @click.option("--skip_cleanup", is_flag=True, help="Skip cleanup of stale resources")
148 | @click.version_option()
149 | def main(
150 | fargate,
151 | fargate_scheduler,
152 | fargate_workers,
153 | image,
154 | scheduler_cpu,
155 | scheduler_mem,
156 | scheduler_port,
157 | scheduler_timeout,
158 | worker_cpu,
159 | worker_mem,
160 | n_workers,
161 | cluster_arn,
162 | cluster_name_template,
163 | execution_role_arn,
164 | task_role_arn,
165 | task_role_policy,
166 | cloudwatch_logs_group,
167 | cloudwatch_logs_stream_prefix,
168 | cloudwatch_logs_default_retention,
169 | vpc,
170 | subnet,
171 | security_group,
172 | environment,
173 | tag,
174 | skip_cleanup,
175 | ):
176 |     tag = {k: v for k, v in (t.split("=", 1) for t in tag)} if tag else None
177 |     environment = (
178 |         {k: v for k, v in (e.split("=", 1) for e in environment)} if environment else None
179 |     )
180 | subnet = subnet or None
181 | security_group = security_group or None
182 | task_role_policy = task_role_policy or None
183 | logger.info("Starting ECS cluster")
184 | try:
185 | cluster = ECSCluster(
186 | fargate_scheduler=fargate_scheduler or fargate,
187 | fargate_workers=fargate_workers or fargate,
188 | image=image,
189 | scheduler_cpu=scheduler_cpu,
190 | scheduler_mem=scheduler_mem,
191 | scheduler_port=scheduler_port,
192 | scheduler_timeout=scheduler_timeout,
193 | worker_cpu=worker_cpu,
194 | worker_mem=worker_mem,
195 | n_workers=n_workers,
196 | cluster_arn=cluster_arn,
197 | cluster_name_template=cluster_name_template,
198 | execution_role_arn=execution_role_arn,
199 | task_role_arn=task_role_arn,
200 | task_role_policies=task_role_policy,
201 | cloudwatch_logs_group=cloudwatch_logs_group,
202 | cloudwatch_logs_stream_prefix=cloudwatch_logs_stream_prefix,
203 | cloudwatch_logs_default_retention=cloudwatch_logs_default_retention,
204 | vpc=vpc,
205 | subnets=subnet,
206 | security_groups=security_group,
207 | environment=environment,
208 | tags=tag,
209 | skip_cleanup=skip_cleanup,
210 | )
211 | except Exception as e:
212 | ctx = click.get_current_context()
213 | logger.error(str(e) + "\n")
214 | click.echo(ctx.get_help())
215 | sys.exit(1)
216 |
217 | async def run():
218 | logger.info("Ready")
219 | while cluster.status != Status.closed:
220 | await sleep(0.2)
221 |
222 | def on_signal(signum):
223 | logger.info("Exiting on signal %d", signum)
224 | cluster.close(timeout=2)
225 |
226 | loop = IOLoop.current()
227 | install_signal_handlers(loop, cleanup=on_signal)
228 |
229 | try:
230 | loop.run_sync(run)
231 | except (KeyboardInterrupt, TimeoutError):
232 | logger.info("Shutting down")
233 | finally:
234 | logger.info("End dask-ecs")
235 |
236 |
237 | def go():
238 | main()
239 |
240 |
241 | if __name__ == "__main__":
242 | go()
243 |
--------------------------------------------------------------------------------
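Editor's note: a small sketch of the ``FOO=bar`` convention used by the repeated ``--tag`` and ``--environment`` options above (the key/value pairs are illustrative only); splitting on the first ``=`` keeps any later ``=`` characters in the value:

    # Mirrors the parsing in main(): repeated FOO=bar options become a dict.
    pairs = ("owner=data-team", "project=dask")
    tags = {key: value for key, value in (p.split("=", 1) for p in pairs)}
    assert tags == {"owner": "data-team", "project": "dask"}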
/dask_cloudprovider/azure/utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import datetime
3 | import json
4 | import subprocess
5 | import logging
6 |
7 | import aiohttp
8 | from distributed.diagnostics.plugin import WorkerPlugin
9 | from tornado.ioloop import IOLoop, PeriodicCallback
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 | AZURE_EVENTS_METADATA_URL = (
15 | "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01"
16 | )
17 |
18 |
19 | def _get_default_subscription() -> str:
20 | """
21 | Get the default Azure subscription ID, as configured by the Azure CLI.
22 | """
23 | out = subprocess.check_output(["az", "account", "list", "--query", "[?isDefault]"])
24 | accounts = json.loads(out)
25 | if accounts:
26 | subscription_id = accounts[0]["id"]
27 | return subscription_id
28 | raise ValueError(
29 | "Could not find a default subscription. "
30 | "Run 'az account set' to set a default subscription."
31 | )
32 |
33 |
34 | class AzurePreemptibleWorkerPlugin(WorkerPlugin):
35 |     """A worker plugin for Azure spot instances
36 |
37 |     This worker plugin will poll Azure's metadata service for preemption notifications.
38 |     When a node is preempted, the plugin will attempt to gracefully shut down all workers
39 |     on the node.
40 |
41 |     This plugin can be used on any worker running on Azure spot instances, not just the
42 |     ones created by ``dask-cloudprovider``.
43 |
44 |     For more details on Azure spot instances see:
45 | https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
46 |
47 | Parameters
48 | ----------
49 | poll_interval_s: int (optional)
50 | The rate at which the plugin will poll the metadata service in seconds.
51 |
52 | Defaults to ``1``
53 |
54 | metadata_url: str (optional)
55 | The url of the metadata service to poll.
56 |
57 | Defaults to "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01"
58 |
59 | termination_events: List[str] (optional)
60 |         The type of events that will trigger the graceful shutdown
61 |
62 | Defaults to ``['Preempt', 'Terminate']``
63 |
64 | termination_offset_minutes: int (optional)
65 |         Extra offset to apply to the preemption date. This may be negative, to start
66 |         the graceful shutdown before the ``NotBefore`` date. It can also be positive, to
67 | start the shutdown after the ``NotBefore`` date, but this is at your own risk.
68 |
69 | Defaults to ``0``
70 |
71 | Examples
72 | --------
73 |
74 |     Let's say you have a cluster and a client instance.
75 | For example using :class:`dask_kubernetes.KubeCluster`
76 |
77 | >>> from dask_kubernetes import KubeCluster
78 | >>> from distributed import Client
79 | >>> cluster = KubeCluster()
80 | >>> client = Client(cluster)
81 |
82 | You can add the worker plugin using the following:
83 |
84 | >>> from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin
85 | >>> client.register_worker_plugin(AzurePreemptibleWorkerPlugin())
86 | """
87 |
88 | def __init__(
89 | self,
90 | poll_interval_s=1,
91 | metadata_url=None,
92 | termination_events=None,
93 | termination_offset_minutes=0,
94 | ):
95 | self.callback = None
96 | self.loop = None
97 | self.worker = None
98 | self.poll_interval_s = poll_interval_s
99 | self.metadata_url = metadata_url or AZURE_EVENTS_METADATA_URL
100 | self.termination_events = termination_events or ["Preempt", "Terminate"]
101 | self.termination_offset = datetime.timedelta(minutes=termination_offset_minutes)
102 |
103 | self.terminating = False
104 | self.not_before = None
105 | self._session = None
106 | self._lock = None
107 |
108 | async def _is_terminating(self):
109 | preempt_started = False
110 | async with self._session.get(self.metadata_url) as response:
111 | try:
112 | data = await response.json()
113 |             # Sometimes Azure responds with a text/plain mime type
114 | except aiohttp.ContentTypeError:
115 | return
116 | # Sometimes the response doesn't contain the Events key
117 | events = data.get("Events", [])
118 | if events:
119 | logger.debug(
120 | "Worker {}, got metadata events {}".format(self.worker.name, events)
121 | )
122 | for evt in events:
123 | event_type = evt["EventType"]
124 | if event_type not in self.termination_events:
125 | continue
126 |
127 | event_status = evt.get("EventStatus")
128 | if event_status == "Started":
129 | logger.info(
130 | "Worker {}, node preemption started".format(self.worker.name)
131 | )
132 | preempt_started = True
133 | break
134 |
135 | not_before = evt.get("NotBefore")
136 | if not not_before:
137 | continue
138 |
139 | not_before = datetime.datetime.strptime(
140 | not_before, "%a, %d %b %Y %H:%M:%S GMT"
141 | )
142 | if self.not_before is None:
143 | logger.info(
144 | "Worker {}, node deletion scheduled not before {}".format(
145 |                             self.worker.name, not_before
146 | )
147 | )
148 | self.not_before = not_before
149 | break
150 | if self.not_before < not_before:
151 | logger.info(
152 | "Worker {}, node deletion re-scheduled not before {}".format(
153 | self.worker.name, not_before
154 | )
155 | )
156 | self.not_before = not_before
157 | break
158 |
159 | return preempt_started or (
160 | self.not_before
161 | and (self.not_before + self.termination_offset < datetime.datetime.utcnow())
162 | )
163 |
164 | async def poll_status(self):
165 | if self.terminating:
166 | return
167 | if self._session is None:
168 | self._session = aiohttp.ClientSession(headers={"Metadata": "true"})
169 | if self._lock is None:
170 | self._lock = asyncio.Lock()
171 |
172 | async with self._lock:
173 | is_terminating = await self._is_terminating()
174 | if not is_terminating:
175 | return
176 |
177 | logger.info(
178 | "Worker {}, node is being deleted, attempting graceful shutdown".format(
179 | self.worker.name
180 | )
181 | )
182 | self.terminating = True
183 | await self._session.close()
184 | await self.worker.close_gracefully()
185 |
186 | def setup(self, worker):
187 | self.worker = worker
188 | self.loop = IOLoop.current()
189 | self.callback = PeriodicCallback(
190 | self.poll_status, callback_time=self.poll_interval_s * 1_000
191 | )
192 | self.loop.add_callback(self.callback.start)
193 | logger.debug(
194 | "Worker {}, registering preemptible plugin".format(self.worker.name)
195 | )
196 |
197 | def teardown(self, worker):
198 | logger.debug("Worker {}, tearing down plugin".format(self.worker.name))
199 | if self.callback:
200 | self.callback.stop()
201 | self.callback = None
202 |
--------------------------------------------------------------------------------
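Editor's note: building on the docstring example above, a hedged sketch of registering the plugin with non-default settings (the scheduler address is a placeholder); a negative ``termination_offset_minutes`` starts the graceful shutdown before the scheduled ``NotBefore`` time:

    from distributed import Client
    from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin

    client = Client("tcp://scheduler-address:8786")  # placeholder scheduler address
    client.register_worker_plugin(
        AzurePreemptibleWorkerPlugin(
            poll_interval_s=5,              # poll the metadata service every 5 seconds
            termination_offset_minutes=-2,  # begin shutting down 2 minutes before NotBefore
        )
    )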
/dask_cloudprovider/digitalocean/droplet.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import dask
4 | from dask_cloudprovider.generic.vmcluster import (
5 | VMCluster,
6 | VMInterface,
7 | SchedulerMixin,
8 | WorkerMixin,
9 | )
10 |
11 | try:
12 | import digitalocean
13 | except ImportError as e:
14 | msg = (
15 | "Dask Cloud Provider Digital Ocean requirements are not installed.\n\n"
16 | "Please pip install as follows:\n\n"
17 | ' pip install "dask-cloudprovider[digitalocean]" --upgrade # or python -m pip install'
18 | )
19 | raise ImportError(msg) from e
20 |
21 |
22 | class Droplet(VMInterface):
23 | def __init__(
24 | self,
25 | cluster: str,
26 | config,
27 | *args,
28 | region: str = None,
29 | size: str = None,
30 | image: str = None,
31 | docker_image=None,
32 | env_vars=None,
33 | extra_bootstrap=None,
34 | **kwargs,
35 | ):
36 | super().__init__(*args, **kwargs)
37 | self.droplet = None
38 | self.cluster = cluster
39 | self.config = config
40 | self.region = region
41 | self.size = size
42 | self.image = image
43 | self.gpu_instance = False
44 | self.bootstrap = True
45 | self.extra_bootstrap = extra_bootstrap
46 | self.docker_image = docker_image
47 | self.env_vars = env_vars
48 |
49 | async def create_vm(self):
50 | self.droplet = digitalocean.Droplet(
51 | token=self.config.get("token"),
52 | name=self.name,
53 | region=self.region,
54 | image=self.image,
55 | size_slug=self.size,
56 | backups=False,
57 | user_data=self.cluster.render_process_cloud_init(self),
58 | )
59 | await self.call_async(self.droplet.create)
60 | for action in self.droplet.get_actions():
61 | while action.status != "completed":
62 | action.load()
63 | await asyncio.sleep(0.1)
64 | while self.droplet.ip_address is None:
65 | await self.call_async(self.droplet.load)
66 | await asyncio.sleep(0.1)
67 | self.cluster._log(f"Created droplet {self.name}")
68 |
69 | return self.droplet.ip_address, None
70 |
71 | async def destroy_vm(self):
72 | await self.call_async(self.droplet.destroy)
73 | self.cluster._log(f"Terminated droplet {self.name}")
74 |
75 |
76 | class DropletScheduler(SchedulerMixin, Droplet):
77 | """Scheduler running on a DigitalOcean Droplet."""
78 |
79 |
80 | class DropletWorker(WorkerMixin, Droplet):
81 | """Worker running on a DigitalOcean Droplet."""
82 |
83 |
84 | class DropletCluster(VMCluster):
85 | """Cluster running on Digital Ocean droplets.
86 |
87 | VMs in DigitalOcean (DO) are referred to as droplets. This cluster manager constructs a Dask cluster
88 | running on VMs.
89 |
90 | When configuring your cluster you may find it useful to install the ``doctl`` tool for querying the
91 | DO API for available options.
92 |
93 | https://www.digitalocean.com/docs/apis-clis/doctl/how-to/install/
94 |
95 | Parameters
96 | ----------
97 | region: str
98 |         The DO region to launch your cluster in. A full list can be obtained with ``doctl compute region list``.
99 |     size: str
100 |         The VM size slug. You can get a full list with ``doctl compute size list``.
101 |         The default is ``s-1vcpu-1gb`` which is 1GB RAM and 1 vCPU.
102 | image: str
103 | The image ID to use for the host OS. This should be a Ubuntu variant.
104 | You can list available images with ``doctl compute image list --public | grep ubuntu.*x64``.
107 |     n_workers: int
108 |         Number of workers to initialise the cluster with. Defaults to ``0``.
109 |     worker_module: str
110 |         The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
111 | worker_options: dict
112 | Params to be passed to the worker class.
113 | See :class:`distributed.worker.Worker` for default worker class.
114 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
115 | scheduler_options: dict
116 | Params to be passed to the scheduler class.
117 | See :class:`distributed.scheduler.Scheduler`.
118 | docker_image: string (optional)
119 | The Docker image to run on all instances.
120 |
121 | This image must have a valid Python environment and have ``dask`` installed in order for the
122 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended the Python
123 |         environment matches your local environment where ``DropletCluster`` is being created from.
124 |
125 |         For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed.
126 |
127 | By default the ``daskdev/dask:latest`` image will be used.
128 | docker_args: string (optional)
129 | Extra command line arguments to pass to Docker.
130 | extra_bootstrap: list[str] (optional)
131 | Extra commands to be run during the bootstrap phase.
132 | env_vars: dict (optional)
133 | Environment variables to be passed to the worker.
134 | silence_logs: bool
135 | Whether or not we should silence logging when setting up the cluster.
136 | asynchronous: bool
137 | If this is intended to be used directly within an event loop with
138 | async/await
139 | security : Security or bool, optional
140 | Configures communication security in this cluster. Can be a security
141 | object, or True. If True, temporary self-signed credentials will
142 | be created automatically. Default is ``True``.
143 | debug: bool, optional
144 | More information will be printed when constructing clusters to enable debugging.
145 |
146 | Examples
147 | --------
148 |
149 | Create the cluster.
150 |
151 | >>> from dask_cloudprovider.digitalocean import DropletCluster
152 | >>> cluster = DropletCluster(n_workers=1)
153 | Creating scheduler instance
154 | Created droplet dask-38b817c1-scheduler
155 | Waiting for scheduler to run
156 | Scheduler is running
157 | Creating worker instance
158 | Created droplet dask-38b817c1-worker-dc95260d
159 |
160 | Connect a client.
161 |
162 | >>> from dask.distributed import Client
163 | >>> client = Client(cluster)
164 |
165 | Do some work.
166 |
167 | >>> import dask.array as da
168 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
169 | >>> arr.mean().compute()
170 | 0.5001550986751964
171 |
172 | Close the cluster
173 |
174 | >>> client.close()
175 | >>> cluster.close()
176 | Terminated droplet dask-38b817c1-worker-dc95260d
177 | Terminated droplet dask-38b817c1-scheduler
178 |
179 | You can also do this all in one go with context managers to ensure the cluster is
180 | created and cleaned up.
181 |
182 | >>> with DropletCluster(n_workers=1) as cluster:
183 | ... with Client(cluster) as client:
184 | ... print(da.random.random((1000, 1000), chunks=(100, 100)).mean().compute())
185 | Creating scheduler instance
186 | Created droplet dask-48efe585-scheduler
187 | Waiting for scheduler to run
188 | Scheduler is running
189 | Creating worker instance
190 | Created droplet dask-48efe585-worker-5181aaf1
191 | 0.5000558682356162
192 | Terminated droplet dask-48efe585-worker-5181aaf1
193 | Terminated droplet dask-48efe585-scheduler
194 |
195 | """
196 |
197 | def __init__(
198 | self,
199 | region: str = None,
200 | size: str = None,
201 | image: str = None,
202 | debug: bool = False,
203 | **kwargs,
204 | ):
205 | self.config = dask.config.get("cloudprovider.digitalocean", {})
206 | self.scheduler_class = DropletScheduler
207 | self.worker_class = DropletWorker
208 | self.debug = debug
209 | self.options = {
210 | "cluster": self,
211 | "config": self.config,
212 | "region": region if region is not None else self.config.get("region"),
213 | "size": size if size is not None else self.config.get("size"),
214 | "image": image if image is not None else self.config.get("image"),
215 | }
216 | self.scheduler_options = {**self.options}
217 | self.worker_options = {**self.options}
218 | super().__init__(debug=debug, **kwargs)
219 |
--------------------------------------------------------------------------------
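Editor's note: a short sketch of driving the options above from Dask configuration rather than keyword arguments (the token is a placeholder; the slugs match the defaults in cloudprovider.yaml, and running this creates a real droplet):

    import dask
    from dask_cloudprovider.digitalocean import DropletCluster

    # Placeholder token; normally set via the cloudprovider.digitalocean.token config key.
    dask.config.set({"cloudprovider.digitalocean.token": "my-do-api-token"})

    cluster = DropletCluster(
        region="nyc3",
        size="s-1vcpu-1gb",
        image="ubuntu-20-04-x64",
        n_workers=1,
    )
    cluster.close()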
/dask_cloudprovider/nebius/instances.py:
--------------------------------------------------------------------------------
1 | import dask
2 |
3 | from dask_cloudprovider.generic.vmcluster import (
4 | VMCluster,
5 | VMInterface,
6 | SchedulerMixin,
7 | WorkerMixin,
8 | )
9 |
10 | try:
11 | from nebius.api.nebius.common.v1 import ResourceMetadata
12 | from nebius.api.nebius.vpc.v1 import SubnetServiceClient, ListSubnetsRequest
13 | from nebius.sdk import SDK
14 | from nebius.api.nebius.compute.v1 import (
15 | InstanceServiceClient,
16 | CreateInstanceRequest,
17 | DiskServiceClient,
18 | CreateDiskRequest,
19 | DiskSpec,
20 | SourceImageFamily,
21 | InstanceSpec,
22 | AttachedDiskSpec,
23 | ExistingDisk,
24 | ResourcesSpec,
25 | NetworkInterfaceSpec,
26 | IPAddress,
27 | PublicIPAddress,
28 | GetInstanceRequest,
29 | DeleteInstanceRequest,
30 | DeleteDiskRequest,
31 | )
32 | except ImportError as e:
33 | msg = (
34 | "Dask Cloud Provider Nebius requirements are not installed.\n\n"
35 | "Please pip install as follows:\n\n"
36 | ' pip install "dask-cloudprovider[nebius]" --upgrade # or python -m pip install'
37 | )
38 | raise ImportError(msg) from e
39 |
40 |
41 | class NebiusInstance(VMInterface):
42 | def __init__(
43 | self,
44 | cluster: str,
45 | config,
46 | env_vars: dict = None,
47 | bootstrap=None,
48 | extra_bootstrap=None,
49 | docker_image: str = None,
50 | image_family: str = None,
51 | project_id: str = None,
52 | server_platform: str = None,
53 | server_preset: str = None,
54 | disk_size: int = None,
55 | *args,
56 | **kwargs,
57 | ):
58 | super().__init__(*args, **kwargs)
59 | self.cluster = cluster
60 | self.config = config
61 | self.extra_bootstrap = extra_bootstrap
62 | self.env_vars = env_vars
63 | self.bootstrap = bootstrap
64 | self.image_family = image_family
65 | self.project_id = project_id
66 | self.docker_image = docker_image
67 | self.server_platform = server_platform
68 | self.server_preset = server_preset
69 | self.sdk = SDK(credentials=self.config.get("token"))
70 | self.disk_size = disk_size
71 | self.instance_id = None
72 | self.disk_id = None
73 |
74 | async def create_vm(self, user_data=None):
75 | service = DiskServiceClient(self.sdk)
76 | operation = await service.create(
77 | CreateDiskRequest(
78 | metadata=ResourceMetadata(
79 | parent_id=self.project_id,
80 | name=self.name + "-disk",
81 | ),
82 | spec=DiskSpec(
83 | source_image_family=SourceImageFamily(
84 | image_family=self.image_family
85 | ),
86 | size_gibibytes=self.disk_size,
87 | type=DiskSpec.DiskType.NETWORK_SSD,
88 | ),
89 | )
90 | )
91 | await operation.wait()
92 | self.disk_id = operation.resource_id
93 |
94 | service = SubnetServiceClient(self.sdk)
95 | sub_net = await service.list(ListSubnetsRequest(parent_id=self.project_id))
96 | subnet_id = sub_net.items[0].metadata.id
97 |
98 | service = InstanceServiceClient(self.sdk)
99 | operation = await service.create(
100 | CreateInstanceRequest(
101 | metadata=ResourceMetadata(
102 | parent_id=self.project_id,
103 | name=self.name,
104 | ),
105 | spec=InstanceSpec(
106 | boot_disk=AttachedDiskSpec(
107 | attach_mode=AttachedDiskSpec.AttachMode(2),
108 | existing_disk=ExistingDisk(id=self.disk_id),
109 | ),
110 | cloud_init_user_data=self.cluster.render_process_cloud_init(self),
111 | resources=ResourcesSpec(
112 | platform=self.server_platform, preset=self.server_preset
113 | ),
114 | network_interfaces=[
115 | NetworkInterfaceSpec(
116 | subnet_id=subnet_id,
117 | ip_address=IPAddress(),
118 | name="network-interface-0",
119 | public_ip_address=PublicIPAddress(),
120 | )
121 | ],
122 | ),
123 | )
124 | )
125 | self.instance_id = operation.resource_id
126 |
127 | self.cluster._log(f"Creating Nebius instance {self.name}")
128 | await operation.wait()
129 | service = InstanceServiceClient(self.sdk)
130 | operation = await service.get(
131 | GetInstanceRequest(
132 | id=self.instance_id,
133 | )
134 | )
135 | internal_ip = operation.status.network_interfaces[0].ip_address.address.split(
136 | "/"
137 | )[0]
138 | external_ip = operation.status.network_interfaces[
139 | 0
140 | ].public_ip_address.address.split("/")[0]
141 | self.cluster._log(
142 | f"Created Nebius instance {self.name} with internal IP {internal_ip} and external IP {external_ip}"
143 | )
144 | return internal_ip, external_ip
145 |
146 | async def destroy_vm(self):
147 | if self.instance_id:
148 | service = InstanceServiceClient(self.sdk)
149 | operation = await service.delete(
150 | DeleteInstanceRequest(
151 | id=self.instance_id,
152 | )
153 | )
154 | await operation.wait()
155 |
156 | if self.disk_id:
157 | service = DiskServiceClient(self.sdk)
158 | await service.delete(
159 | DeleteDiskRequest(
160 | id=self.disk_id,
161 | )
162 | )
163 | self.cluster._log(
164 | f"Terminated instance {self.name} ({self.instance_id}) and deleted disk {self.disk_id}"
165 | )
166 | self.instance_id = None
167 | self.disk_id = None
168 |
169 |
170 | class NebiusScheduler(SchedulerMixin, NebiusInstance):
171 | """Scheduler running on a Nebius server."""
172 |
173 |
174 | class NebiusWorker(WorkerMixin, NebiusInstance):
175 | """Worker running on a Nebius server."""
176 |
177 |
178 | class NebiusCluster(VMCluster):
179 | """Cluster running on Nebius AI Cloud instances.
180 |
181 | VMs in Nebius AI Cloud are referred to as instances. This cluster manager constructs a Dask cluster
182 | running on VMs.
183 |
184 | When configuring your cluster you may find it useful to install the ``nebius`` tool for querying the
185 | Nebius API for available options.
186 |
187 | https://docs.nebius.com/cli/quickstart
188 |
189 | Parameters
190 | ----------
191 | image_family: str
192 | The image to use for the host OS. This should be a Ubuntu variant.
193 |         You can find a list of available images at https://docs.nebius.com/compute/storage/manage#parameters-boot.
194 |     project_id: str
195 |         The Nebius AI Cloud project id. You can find it in the Nebius AI Cloud console.
196 |     server_platform: str
197 |         The server platform to use. A list of all platforms and presets is available at https://docs.nebius.com/compute/virtual-machines/types/.
198 |     server_preset: str
199 |         The server preset to use. A list of all platforms and presets is available at https://docs.nebius.com/compute/virtual-machines/types/.
200 | n_workers: int
201 | Number of workers to initialise the cluster with. Defaults to ``0``.
202 | worker_module: str
203 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
204 | worker_options: dict
205 | Params to be passed to the worker class.
206 | See :class:`distributed.worker.Worker` for default worker class.
207 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
208 | scheduler_options: dict
209 | Params to be passed to the scheduler class.
210 | See :class:`distributed.scheduler.Scheduler`.
211 | env_vars: dict
212 | Environment variables to be passed to the worker.
213 | extra_bootstrap: list[str] (optional)
214 | Extra commands to be run during the bootstrap phase.
215 |
216 | Example
217 | --------
218 |
219 | >>> from dask_cloudprovider.nebius import NebiusCluster
220 | >>> cluster = NebiusCluster(n_workers=1)
221 |
222 | >>> from dask.distributed import Client
223 | >>> client = Client(cluster)
224 |
225 | >>> import dask.array as da
226 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100))
227 | >>> arr.mean().compute()
228 |
229 | >>> client.close()
230 | >>> cluster.close()
231 |
232 | """
233 |
234 | def __init__(
235 | self,
236 | bootstrap: str = None,
237 | image_family: str = None,
238 | project_id: str = None,
239 | disk_size: int = None,
240 | server_platform: str = None,
241 | server_preset: str = None,
242 | docker_image: str = None,
243 | debug: bool = False,
244 | **kwargs,
245 | ):
246 | self.config = dask.config.get("cloudprovider.nebius", {})
247 |
248 | self.scheduler_class = NebiusScheduler
249 | self.worker_class = NebiusWorker
250 |
251 | self.image_family = dask.config.get(
252 | "cloudprovider.nebius.image_family", override_with=image_family
253 | )
254 | self.docker_image = dask.config.get(
255 | "cloudprovider.nebius.docker_image", override_with=docker_image
256 | )
257 | self.project_id = dask.config.get(
258 | "cloudprovider.nebius.project_id", override_with=project_id
259 | )
260 | self.server_platform = dask.config.get(
261 | "cloudprovider.nebius.server_platform", override_with=server_platform
262 | )
263 | self.server_preset = dask.config.get(
264 | "cloudprovider.nebius.server_preset", override_with=server_preset
265 | )
266 | self.bootstrap = dask.config.get(
267 | "cloudprovider.nebius.bootstrap", override_with=bootstrap
268 | )
269 | self.disk_size = dask.config.get(
270 | "cloudprovider.nebius.disk_size", override_with=disk_size
271 | )
272 | self.debug = debug
273 |
274 | self.options = {
275 | "bootstrap": self.bootstrap,
276 | "cluster": self,
277 | "config": self.config,
278 | "docker_image": self.docker_image,
279 | "image_family": self.image_family,
280 | "project_id": self.project_id,
281 | "server_platform": self.server_platform,
282 | "server_preset": self.server_preset,
283 | "disk_size": self.disk_size,
284 | }
285 | self.scheduler_options = {**self.options}
286 | self.worker_options = {**self.options}
287 | super().__init__(debug=debug, **kwargs)
288 |
--------------------------------------------------------------------------------
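Editor's note: a hedged usage sketch for the cluster manager above; the token, project id, platform and preset values below are placeholders and should be replaced with values from the Nebius console and the documentation linked in the docstring (running this creates real instances and disks):

    import dask
    from dask_cloudprovider.nebius import NebiusCluster

    # Placeholder credentials and identifiers.
    dask.config.set({"cloudprovider.nebius.token": "my-nebius-iam-token"})

    cluster = NebiusCluster(
        project_id="project-e00example",  # placeholder project id
        server_platform="cpu-d3",         # placeholder platform name
        server_preset="4vcpu-16gb",       # placeholder preset name
        disk_size=64,
        n_workers=1,
    )
    cluster.close()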
/dask_cloudprovider/cloudprovider.yaml:
--------------------------------------------------------------------------------
1 | cloudprovider:
2 | ecs:
3 | fargate_scheduler: false # Use fargate mode for the scheduler
4 | fargate_spot: false
5 | fargate_workers: false # Use fargate mode for the workers
6 | fargate_use_private_ip: false
7 | scheduler_cpu: 1024 # Millicpu (1024ths of a CPU core)
8 | scheduler_mem: 4096 # Memory in MB
9 | # scheduler_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key"
10 | worker_cpu: 4096 # Millicpu (1024ths of a CPU core)
11 | worker_mem: 16384 # Memory in MB
12 | worker_gpu: 0 # Number of GPUs for each worker
13 | # worker_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key"
14 | n_workers: 0 # Number of workers to start the cluster with
15 | scheduler_timeout: "5 minutes" # Length of inactivity to wait before closing the cluster
16 |
17 | image: "daskdev/dask:latest" # Docker image to use for non GPU tasks
18 | cpu_architecture: "X86_64" # Runtime platform CPU architecture
19 | gpu_image: "rapidsai/rapidsai:latest" # Docker image to use for GPU tasks
20 | cluster_name_template: "dask-{name}" # Template to use when creating a cluster
21 | cluster_arn: "" # ARN of existing ECS cluster to use (if not set one will be created)
22 |     execution_role_arn: "" # ARN of existing execution role to use (if not set one will be created)
23 |     task_role_arn: "" # ARN of existing task role to use (if not set one will be created)
24 | task_role_policies: [] # List of policy arns to attach to tasks (e.g S3 read only access)
25 | # platform_version: "LATEST" # Fargate platformVersion string like "1.4.0" or "LATEST"
26 |
27 | cloudwatch_logs_group: "" # Name of existing cloudwatch logs group to use (if not set one will be created)
28 | cloudwatch_logs_stream_prefix: "{cluster_name}/{name}" # Stream prefix template
29 | cloudwatch_logs_default_retention: 30 # Number of days to retain logs (only applied if not using existing group)
30 |
31 | vpc: "default" # VPC to use for tasks
32 | subnets: [] # VPC subnets to use (will use all available if not set)
33 | security_groups: [] # Security groups to use (if not set one will be created)
34 |
35 | tags: {} # Tags to apply to all AWS resources created by the cluster manager
36 | environment: {} # Environment variables that are set within a task container
37 | skip_cleanup: false # Skip cleaning up of stale resources
38 |
39 | ec2:
40 | region: null # AWS region to create cluster. Defaults to environment or account default region.
41 |     availability_zone: null # The availability zone to start your clusters. By default AWS will select the AZ with the most free capacity.
42 | bootstrap: true # It is assumed that the AMI does not have Docker and needs bootstrapping. Set this to false if using a custom AMI with Docker already installed.
43 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
44 | # worker_command: "dask-worker" # The command for workers to run. If the instance_type is a GPU instance dask-cuda-worker will be used.
45 | ami: null # AMI ID to use for all instances. Defaults to latest Ubuntu 20.04 image.
46 | instance_type: "t2.micro" # Instance type for the scheduler and all workers
47 | scheduler_instance_type: "t2.micro" # Instance type for the scheduler
48 | worker_instance_type: "t2.micro" # Instance type for all workers
49 | docker_image: "daskdev/dask:latest" # docker image to use
50 | vpc: null # VPC id for instances to join. Defaults to default VPC.
51 |     subnet_id: null # Subnet ID for instances to join. Defaults to all subnets in the default VPC.
52 | security_groups: [] # Security groups for instances. Will create a minimal Dask security group by default.
53 | filesystem_size: 40 # Default root filesystem size for scheduler and worker VMs in GB
54 | key_name: null # SSH Key name to assign to instances
55 |     iam_instance_profile: {} # IAM role to assign to instances
56 | # Arn: 'string'
57 | # Name: 'string'
58 | instance_tags:
59 | createdBy: dask-cloudprovider
60 | volume_tags:
61 | createdBy: dask-cloudprovider
62 | enable_detailed_monitoring: false
63 | use_private_ip: false
64 |
65 | azure:
66 | location: null # The Azure location to launch your cluster
67 | resource_group: null # The Azure resource group for the cluster
68 | subscription_id: null # The Azure subscription ID for the cluster
69 | azurevm:
70 | vnet: null # Azure Virtual Network to launch VMs in
71 | subnet: null # Azure Virtual Network subnet to launch VMs in
72 | security_group: null # Network security group to allow 8786 and 8787
73 | public_ingress: true # Assign a public IP address to the scheduler
74 | vm_size: "Standard_DS1_v2" # Azure VM size to use for scheduler and workers
75 | disk_size: 50 # Specifies the size of the VM host OS disk in gigabytes. This value cannot be larger than `1023`.
76 | scheduler_vm_size: null # Set a different VM size for the scheduler. Will use vm_size if not set
77 | docker_image: "daskdev/dask:latest" # docker image to use
78 | vm_image: # OS image to use for the virtual machines
79 | publisher: "Canonical"
80 | offer: "UbuntuServer"
81 | sku: "18.04-LTS"
82 | version: "latest"
83 | bootstrap: true # It is assumed that the VHD does not have Docker and needs bootstrapping. Set this to false if using a custom VHD with Docker already installed.
84 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
85 | marketplace_plan: null # This needs to be passed in if the user wants to use a Marketplace VM with a plan.
86 | # name: "ngc-base-version-21-02-2"
87 | # publisher: "nvidia"
88 | # product: "ngc_azure_17_11"
89 | extra_options: {} # Additional options to provide when creating the VMs.
90 |
91 | digitalocean:
92 | token: null # API token for interacting with the Digital Ocean API
93 | region: "nyc3" # Region to launch Droplets in
94 | size: "s-1vcpu-1gb" # Droplet size to launch, default is 1GB RAM, 1 vCPU
95 | image: "ubuntu-20-04-x64" # Operating System image to use
96 |
97 | gcp:
98 | source_image: "projects/ubuntu-os-cloud/global/images/ubuntu-minimal-1804-bionic-v20201014" # the gcp image to use for all instances
99 | zone: "us-east1-c" # the zone of where to launch the instances
100 | network: "default" # the network/subnetwork in GCP to use
101 | network_projectid: null # GCP project id where the network exists
102 | projectid: "" # name of the google cloud project
103 | on_host_maintenance: "TERMINATE"
104 | machine_type: "n1-standard-1" # size of the machine type to use for the scheduler and all workers
105 | scheduler_machine_type: "n1-standard-1" # size of the machine type to use for the scheduler
106 | worker_machine_type: "n1-standard-1" # size of the machine type to use for all workers
107 | filesystem_size: 50 # amount in GBs of hard drive space to allocate
108 | ngpus: "" # number of GPUs to use. If provided, will be used for both scheduler and worker
109 | gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. If provided, will be used for both scheduler and worker
110 | scheduler_ngpus: "" # number of GPUs to use on scheduler
111 | scheduler_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
112 | worker_ngpus: "" # number of GPUs to use on worker
113 | worker_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``.
114 | disk_type: "pd-standard" # type of disk to use: pd-standard, pd-ssd
115 | docker_image: "daskdev/dask:latest" # docker image to use
116 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out.
117 | public_ingress: true # configure the scheduler to be externally accessible. This assumes firewall rules for 8787 and 8786
118 | instance_labels:
119 | container_vm: "dask-cloudprovider"
120 | service_account: "default"
121 | instance_scopes: # OAuth2 scopes to assign to the service account on instances
122 | - "https://www.googleapis.com/auth/devstorage.read_write"
123 | - "https://www.googleapis.com/auth/logging.write"
124 | - "https://www.googleapis.com/auth/monitoring.write"
125 |
126 | hetzner:
127 | token: null # API token for interacting with the Hetzner cloud API
128 | location: "fsn1" # Location to launch vServer in
129 | server_type: "cx11" # vServer server type to launch, default is 2GB RAM, 1 vCPU
130 | image: "ubuntu-20.04" # Operating System image to use
131 | docker_image: "daskdev/dask:latest" # docker image to use
132 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed.
133 |
134 | ibm:
135 | api_key: null
136 | image: "ghcr.io/dask/dask:latest"
137 | region: us-east
138 | project_id: null
139 | scheduler_cpu: "1.0"
140 | scheduler_mem: 4G
141 | scheduler_disk: 400M
142 | scheduler_timeout: 600 # seconds
143 | scheduler_command: python -m distributed.cli.dask_scheduler --protocol ws
144 | worker_cpu: "2.0"
145 | worker_mem: 8G
146 | worker_disk: 400M
147 | worker_threads: 1
148 | worker_command: python -m distributed.cli.dask_spec
149 | docker_server: ""
150 | docker_username: ""
151 | docker_password: ""
152 |
153 | openstack:
154 | region: "RegionOne" # The name of the region where resources will be allocated in OpenStack. List available regions using: `openstack region list`.
155 | size: null # OpenStack flavors define the compute, memory, and storage capacity of computing instances. List available flavors using: `openstack flavor list`
156 | auth_url: null # The authentication URL for the OpenStack Identity service (Keystone). Example: https://cloud.example.com:5000
157 | application_credential_id: null # The application credential id created in OpenStack. Create application credentials using: openstack application credential create
158 | application_credential_secret: null # The secret associated with the application credential ID for authentication.
159 | auth_type: "v3applicationcredential" # The type of authentication used, typically "v3applicationcredential" for using OpenStack application credentials.
160 | network_id: null # The unique identifier for the internal/private network in OpenStack where the cluster VMs will be connected. List available networks using: `openstack network list`
161 | image: null # The OS image name or id to use for the VM. List available images using: `openstack image list`
162 | keypair_name: null # The name of the SSH keypair used for instance access. Ensure you have created a keypair or use an existing one. List available keypairs using: `openstack keypair list`
163 | security_group: null # The security group name that defines firewall rules for instances. List available security groups using: `openstack security group list`
164 | external_network_id: null # The ID of the external network used for assigning floating IPs. List available external networks using: `openstack network list --external`
165 | create_floating_ip: false # Specifies whether to assign a floating IP to each instance, enabling external access. Set to `True` if external connectivity is needed.
166 | docker_image: "daskdev/dask:latest" # docker image to use
167 | worker_threads: 2 # The number of threads to use on each worker.
168 | worker_command: null # str (optional) The command workers should run when starting. For example, ``dask-cuda-worker`` on GPU-enabled instances.
169 |
170 |
171 | nebius:
172 | token: null # IAM token for interacting with the Nebius AI Cloud
173 | project_id: null # You can find it in the Nebius AI Cloud console
174 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed.
175 | image_family: "ubuntu22.04-driverless" # it should be "ubuntu22.04-driverless" or "ubuntu22.04-cuda12" https://docs.nebius.com/compute/storage/manage#parameters-boot
176 | docker_image: "daskdev/dask:latest" # docker image to use
177 | server_platform: "cpu-d3" # all platforms https://docs.nebius.com/compute/virtual-machines/types
178 | server_preset: "4vcpu-16gb" # all presets https://docs.nebius.com/compute/virtual-machines/types
179 | disk_size: 64 # Specifies the size of the VM host OS disk in gigabytes.
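
# The values above are package defaults. Fields left as null are normally
# supplied as keyword arguments to the cluster manager or set via Dask's
# configuration system. An illustrative override from Python (the key path
# assumes this file is loaded under the top-level "cloudprovider" key):
#
#   import dask.config
#   dask.config.set({"cloudprovider.azure.location": "westus2"})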
--------------------------------------------------------------------------------
/doc/source/packer.rst:
--------------------------------------------------------------------------------
1 | Creating custom OS images with Packer
2 | =====================================
3 |
4 | Many of the cluster managers in Dask Cloudprovider create VMs and install dependencies on those VMs at boot time.
5 |
6 | This can slow down the creation and scaling of clusters, so this page discusses building custom images using `Packer <https://www.packer.io/>`_ to speed up cluster creation.
7 |
8 | Packer is a utility which boots up a VM on your desired cloud, runs any installation steps and then takes a snapshot of the VM for use as a template for creating
9 | new VMs later. This allows us to run through the installation steps once, and then reuse them when starting Dask components.
10 |
11 | Installing Packer
12 | -----------------
13 |
14 | See the `official install docs <https://www.packer.io/docs/install>`_.
15 |
16 | Packer Overview
17 | ---------------
18 |
19 | To create an image with Packer we need to write a JSON config file.
20 |
21 | A Packer config file is broken into a couple of sections, ``builders`` and ``provisioners``.
22 |
23 | A builder configures what type of image you are building (an AWS AMI, a GCP image, etc.). It describes the base
24 | image you are building on top of and connection information for Packer to connect to the build instance.
25 |
26 | When you run ``packer build /path/to/config.json`` a VM (or multiple VMs if you configure more than one) will be
27 | created automatically based on your ``builders`` config section.
28 |
29 | Once your build VM is up and running the ``provisioners`` will be run. These are steps to configure and provision your
30 | machine. In the examples below we are mostly using the ``shell`` provisioner which will run commands on the VM to set things
31 | up.
32 |
33 | Once your provisioning scripts have completed the VM will automatically stop, a snapshot will be taken and you will be provided
34 | with an ID which you can then use as a template in future runs of ``dask-cloudprovider``.
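
As a rough sketch of that two-section layout (every value below is an illustrative
placeholder rather than a working build), a skeleton config file could even be
generated from Python:

.. code-block:: python

    import json

    # Minimal two-section Packer config: one builder and one provisioner.
    skeleton = {
        "builders": [
            {
                "type": "amazon-ebs",         # which cloud plugin builds the image
                "region": "eu-west-2",        # where the temporary build VM runs
                "instance_type": "t2.micro",  # size of the temporary build VM
                "ssh_username": "ubuntu",     # user Packer connects as
                "ami_name": "my-image {{timestamp}}",  # name for the resulting snapshot
            }
        ],
        "provisioners": [
            {"type": "shell", "inline": ["echo 'install dependencies here'"]},
        ],
    }

    with open("config.json", "w") as f:
        json.dump(skeleton, f, indent=2)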
35 |
36 | Image Requirements
37 | ------------------
38 |
39 | Each cluster manager that uses VMs will have specific requirements for the VM image.
40 |
41 | The AWS ``ECSCluster``, for example, requires `ECS optimised AMIs <https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html>`_.
42 |
43 | The VM cluster managers such as ``EC2Cluster`` and ``DropletCluster`` just require `Docker <https://docs.docker.com/engine/install/>`_ to be installed (or `NVIDIA Docker <https://github.com/NVIDIA/nvidia-docker>`_ for GPU VM types).
44 |
45 | Examples
46 | --------
47 |
48 | ``EC2Cluster`` with cloud-init
49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50 |
51 | When any of the ``VMCluster`` based cluster managers, such as ``EC2Cluster``, launches a new VM with the default settings it uses the Ubuntu base image and installs all dependencies
52 | with `cloud-init <https://cloudinit.readthedocs.io/en/latest/>`_.
53 |
54 | Instead of doing this every time we could use Packer to do this once, and then reuse that image every time.
55 |
56 | Each ``VMCluster`` cluster manager has a class method called ``get_cloud_init`` which takes the same keyword arguments as creating the object itself, but instead
57 | returns the cloud-init file that would be generated.
58 |
59 | .. code-block:: python
60 |
61 | from dask_cloudprovider.aws import EC2Cluster
62 |
63 | cloud_init_config = EC2Cluster.get_cloud_init(
64 | # Pass any kwargs here you would normally pass to ``EC2Cluster``
65 | )
66 | print(cloud_init_config)
67 |
68 | We should see some output like this.
69 |
70 | .. code-block:: YAML
71 |
72 | #cloud-config
73 |
74 | packages:
75 | - apt-transport-https
76 | - ca-certificates
77 | - curl
78 | - gnupg-agent
79 | - software-properties-common
80 |
81 | # Enable ipv4 forwarding, required on CIS hardened machines
82 | write_files:
83 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
84 | content: |
85 | net.ipv4.conf.all.forwarding=1
86 |
87 | # create the docker group
88 | groups:
89 | - docker
90 |
91 | # Add default auto created user to docker group
92 | system_info:
93 | default_user:
94 | groups: [docker]
95 |
96 | runcmd:
97 |
98 | # Install Docker
99 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
100 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
101 | - apt-get update -y
102 | - apt-get install -y docker-ce docker-ce-cli containerd.io
103 | - systemctl start docker
104 | - systemctl enable docker
105 |
106 | # Run container
107 | - docker run --net=host daskdev/dask:latest dask-scheduler --version
108 |
109 | We should save this output somewhere for reference later. Let's refer to it as ``/path/to/cloud-init-config.yaml``.
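
For example, continuing from the snippet above, the rendered config can be written
straight to that path (the path itself is just a placeholder):

.. code-block:: python

    # Save the cloud-init config generated by ``get_cloud_init`` for Packer to use
    with open("/path/to/cloud-init-config.yaml", "w") as f:
        f.write(cloud_init_config)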
110 |
111 | Next we need a Packer config file to build our image; let's refer to it as ``/path/to/config.json``.
112 | We will use the official Ubuntu 20.04 image and specify our cloud-init config file in the ``user_data_file`` option.
113 |
114 | Packer will not necessarily wait for our cloud-init config to finish executing before taking a snapshot, so we need to add a provisioner
115 | that will block until cloud-init completes.
116 |
117 | .. code-block:: JSON
118 |
119 | {
120 | "builders": [
121 | {
122 | "type": "amazon-ebs",
123 | "region": "eu-west-2",
124 | "source_ami_filter": {
125 | "filters": {
126 | "virtualization-type": "hvm",
127 | "name": "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
128 | "root-device-type": "ebs"
129 | },
130 | "owners": [
131 | "099720109477"
132 | ],
133 | "most_recent": true
134 | },
135 | "instance_type": "t2.micro",
136 | "ssh_username": "ubuntu",
137 | "ami_name": "dask-cloudprovider {{timestamp}}",
138 | "user_data_file": "/path/to/cloud-init-config.yaml"
139 | }
140 | ],
141 | "provisioners": [
142 | {
143 | "type": "shell",
144 | "inline": [
145 | "echo 'Waiting for cloud-init'; while [ ! -f /var/lib/cloud/instance/boot-finished ]; do sleep 1; done; echo 'Done'"
146 | ]
147 | }
148 | ]
149 | }
150 |
151 | Then we can build our image with ``packer build /path/to/config.json``.
152 |
153 | .. code-block::
154 |
155 | $ packer build /path/to/config.json
156 | amazon-ebs: output will be in this color.
157 |
158 | ==> amazon-ebs: Prevalidating any provided VPC information
159 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider 1600875672
160 | amazon-ebs: Found Image ID: ami-062c2b6de9e9c54d3
161 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b6c99-46b5-6002-3126-8dcb1696f969
162 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b6c9a-bd7d-8bb3-58a8-d983f0e95a96
163 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups...
164 | ==> amazon-ebs: Launching a source AWS instance...
165 | ==> amazon-ebs: Adding tags to source instance
166 | amazon-ebs: Adding tag: "Name": "Packer Builder"
167 | amazon-ebs: Instance ID: i-0531483be973d60d8
168 | ==> amazon-ebs: Waiting for instance (i-0531483be973d60d8) to become ready...
169 | ==> amazon-ebs: Using ssh communicator to connect: 18.133.244.42
170 | ==> amazon-ebs: Waiting for SSH to become available...
171 | ==> amazon-ebs: Connected to SSH!
172 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell512450076
173 | amazon-ebs: Waiting for cloud-init
174 | amazon-ebs: Done
175 | ==> amazon-ebs: Stopping the source instance...
176 | amazon-ebs: Stopping instance
177 | ==> amazon-ebs: Waiting for the instance to stop...
178 | ==> amazon-ebs: Creating AMI dask-cloudprovider 1600875672 from instance i-0531483be973d60d8
179 | amazon-ebs: AMI: ami-064f8db7634d19647
180 | ==> amazon-ebs: Waiting for AMI to become ready...
181 | ==> amazon-ebs: Terminating the source AWS instance...
182 | ==> amazon-ebs: Cleaning up any extra volumes...
183 | ==> amazon-ebs: No volumes to clean up, skipping
184 | ==> amazon-ebs: Deleting temporary security group...
185 | ==> amazon-ebs: Deleting temporary keypair...
186 | Build 'amazon-ebs' finished after 4 minutes 5 seconds.
187 |
188 | ==> Wait completed after 4 minutes 5 seconds
189 |
190 | ==> Builds finished. The artifacts of successful builds are:
191 | --> amazon-ebs: AMIs were created:
192 | eu-west-2: ami-064f8db7634d19647
193 |
194 | Then to use our new image we can create an ``EC2Cluster`` specifying the AMI and disabling the automatic bootstrapping.
195 |
196 | .. code-block:: python
197 |
198 | from dask.distributed import Client
199 | from dask_cloudprovider.aws import EC2Cluster
200 |
201 | cluster = EC2Cluster(
202 | ami="ami-064f8db7634d19647", # AMI ID provided by Packer
203 | bootstrap=False
204 | )
205 | cluster.scale(2)
206 |
207 | client = Client(cluster)
208 | # Your cluster is ready to use
209 |
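When you have finished with the cluster, remember to shut it down so the EC2 instances
are terminated (the ``auto_shutdown`` option also covers the case where the scheduler or
worker services time out); a minimal sketch:

.. code-block:: python

    # Close the client and cluster, destroying the VMs that were created
    client.close()
    cluster.close()
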
210 | ``EC2Cluster`` with RAPIDS
211 | ^^^^^^^^^^^^^^^^^^^^^^^^^^
212 |
213 | To launch `RAPIDS <https://rapids.ai/>`_ on AWS EC2 we can select a GPU instance type, choose the official Deep Learning AMIs that Amazon provides and run the official RAPIDS Docker image.
214 |
215 | .. code-block:: python
216 |
217 | from dask_cloudprovider.aws import EC2Cluster
218 |
219 | cluster = EC2Cluster(
220 | ami="ami-0c7c7d78f752f8f17", # Deep Learning AMI (this ID varies by region so find yours in the AWS Console)
221 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
222 | instance_type="p3.2xlarge",
223 | bootstrap=False, # Docker is already installed on the Deep Learning AMI
224 | filesystem_size=120,
225 | )
226 | cluster.scale(2)
227 |
228 | However every time a VM is created by ``EC2Cluster`` the RAPIDS Docker image will need to be pulled from Docker Hub.
229 | The result is that the above snippet can take ~20 minutes to run, so let's create our own AMI which already has the RAPIDS image pulled.
230 |
231 | In our ``builders`` section we will specify that we want to build on top of the latest Deep Learning AMI by using the filter
232 | ``"Deep Learning AMI (Ubuntu 18.04) Version *"`` to match all versions and ``"most_recent": true`` to use the most recent.
233 |
234 | We also restrict the owners to ``898082745236`` which is the ID for the official image channel.
235 |
236 | The official image already has the NVIDIA drivers and the NVIDIA Docker runtime installed, so the only step we need to perform is to
237 | pull the RAPIDS Docker image. That way, when a scheduler or worker VM is created, the image will already be available on the machine.
238 |
239 | .. code-block:: JSON
240 |
241 | {
242 | "builders": [
243 | {
244 | "type": "amazon-ebs",
245 | "region": "eu-west-2",
246 | "source_ami_filter": {
247 | "filters": {
248 | "virtualization-type": "hvm",
249 | "name": "Deep Learning AMI (Ubuntu 18.04) Version *",
250 | "root-device-type": "ebs"
251 | },
252 | "owners": [
253 | "898082745236"
254 | ],
255 | "most_recent": true
256 | },
257 | "instance_type": "p3.2xlarge",
258 | "ssh_username": "ubuntu",
259 | "ami_name": "dask-cloudprovider-rapids {{timestamp}}"
260 | }
261 | ],
262 | "provisioners": [
263 | {
264 | "type": "shell",
265 | "inline": [
266 | "docker pull rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9"
267 | ]
268 | }
269 | ]
270 | }
271 |
272 | Then we can build our image with ``packer build /path/to/config.json``.
273 |
274 | .. code-block::
275 |
276 | $ packer build /path/to/config.json
277 | ==> amazon-ebs: Prevalidating any provided VPC information
278 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider-gpu 1600868638
279 | amazon-ebs: Found Image ID: ami-0c7c7d78f752f8f17
280 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b511e-d3a3-c607-559f-d466560cd23b
281 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b511f-8f62-cf98-ca54-5771f1423d2d
282 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups...
283 | ==> amazon-ebs: Launching a source AWS instance...
284 | ==> amazon-ebs: Adding tags to source instance
285 | amazon-ebs: Adding tag: "Name": "Packer Builder"
286 | amazon-ebs: Instance ID: i-077f54ed4ae6bcc66
287 | ==> amazon-ebs: Waiting for instance (i-077f54ed4ae6bcc66) to become ready...
288 | ==> amazon-ebs: Using ssh communicator to connect: 52.56.96.165
289 | ==> amazon-ebs: Waiting for SSH to become available...
290 | ==> amazon-ebs: Connected to SSH!
291 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell376445833
292 | amazon-ebs: Waiting for cloud-init
293 | amazon-ebs: Bootstrap complete
294 | ==> amazon-ebs: Stopping the source instance...
295 | amazon-ebs: Stopping instance
296 | ==> amazon-ebs: Waiting for the instance to stop...
297 | ==> amazon-ebs: Creating AMI dask-cloudprovider-gpu 1600868638 from instance i-077f54ed4ae6bcc66
298 | amazon-ebs: AMI: ami-04e5539cb82859e69
299 | ==> amazon-ebs: Waiting for AMI to become ready...
300 | ==> amazon-ebs: Terminating the source AWS instance...
301 | ==> amazon-ebs: Cleaning up any extra volumes...
302 | ==> amazon-ebs: No volumes to clean up, skipping
303 | ==> amazon-ebs: Deleting temporary security group...
304 | ==> amazon-ebs: Deleting temporary keypair...
305 | Build 'amazon-ebs' finished after 20 minutes 35 seconds.
306 |
307 | It took over 20 minutes to build this image, but now that we've done it once we can reuse the image in our RAPIDS powered Dask clusters.
308 |
309 | We can then run our code snippet again but this time it will take less than 5 minutes to get a running cluster.
310 |
311 | .. code-block:: python
312 |
313 | from dask.distributed import Client
314 | from dask_cloudprovider.aws import EC2Cluster
315 |
316 | cluster = EC2Cluster(
317 | ami="ami-04e5539cb82859e69", # AMI ID provided by Packer
318 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9",
319 | instance_type="p3.2xlarge",
320 | bootstrap=False,
321 | filesystem_size=120,
322 | )
323 | cluster.scale(2)
324 |
325 | client = Client(cluster)
326 | # Your cluster is ready to use
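
As an optional sanity check (this assumes ``cudf`` ships in the RAPIDS image used above),
you can confirm that every worker can see its GPU libraries:

.. code-block:: python

    def check_rapids():
        import cudf  # will fail if the RAPIDS libraries or GPU are unavailable
        return cudf.__version__

    # Run the check on every worker in the cluster
    client.run(check_rapids)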
327 |
--------------------------------------------------------------------------------
/dask_cloudprovider/generic/vmcluster.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import os
4 | import uuid
5 |
6 | from jinja2 import Environment, FileSystemLoader
7 |
8 | import dask.config
9 | from distributed.core import Status
10 | from distributed.worker import Worker as _Worker
11 | from distributed.scheduler import Scheduler as _Scheduler
12 | from distributed.security import Security
13 | from distributed.deploy.spec import SpecCluster, ProcessInterface
14 | from distributed.utils import warn_on_duration, cli_keywords
15 |
16 | from dask_cloudprovider.utils.socket import is_socket_open
17 | from dask_cloudprovider.utils.config_helper import serialize_custom_config
18 |
19 |
20 | class VMInterface(ProcessInterface):
21 | """A superclass for VM Schedulers, Workers and Nannies."""
22 |
23 | def __init__(self, docker_args: str = "", extra_bootstrap: list = None, **kwargs):
24 | super().__init__()
25 | self.name = None
26 | self.command = None
27 | self.address = None
28 | self.cluster = None
29 | self.gpu_instance = None
30 | self.bootstrap = None
31 | self.docker_image = "daskdev/dask:latest"
32 | self.docker_args = docker_args
33 | self.extra_bootstrap = extra_bootstrap
34 | self.auto_shutdown = True
35 | self.set_env = f'env DASK_INTERNAL_INHERIT_CONFIG="{serialize_custom_config()}"'
36 | self.kwargs = kwargs
37 |
38 | async def create_vm(self):
39 | raise NotImplementedError("create_vm is a required method of the VMInterface")
40 |
41 | async def destroy_vm(self):
42 | raise NotImplementedError("destroy_vm is a required method of the VMInterface")
43 |
44 | async def wait_for_scheduler(self):
45 | if self.external_address:
46 | _, address = self.external_address.split("://")
47 | else:
48 | _, address = self.address.split("://")
49 | ip, port = address.split(":")
50 |
51 | self.cluster._log(f"Waiting for scheduler to run at {ip}:{port}")
52 | while not is_socket_open(ip, port):
53 | await asyncio.sleep(0.1)
54 | self.cluster._log("Scheduler is running")
55 |
56 | async def start(self):
57 | """Create a VM."""
58 | await super().start()
59 |
60 | async def close(self):
61 | """Destroy a VM."""
62 | await self.destroy_vm()
63 | await super().close()
64 |
65 | async def call_async(self, f, *args, **kwargs):
66 | """Run a blocking function in a thread as a coroutine."""
67 | return await self.cluster.call_async(f, *args, **kwargs)
68 |
69 |
70 | class SchedulerMixin(object):
71 | """A mixin for Schedulers."""
72 |
73 | def __init__(
74 | self,
75 | *args,
76 | scheduler_options: dict = {},
77 | **kwargs,
78 | ):
79 | super().__init__(*args, **kwargs)
80 | self.name = f"dask-{self.cluster.uuid}-scheduler"
81 | self.port = scheduler_options.get("port", 8786)
82 | self.command = " ".join(
83 | [
84 | self.set_env,
85 | "python",
86 | "-m",
87 | "distributed.cli.dask_scheduler",
88 | ]
89 | + cli_keywords(scheduler_options, cls=_Scheduler)
90 | )
91 |
92 | async def start(self):
93 | self.cluster._log("Creating scheduler instance")
94 |
95 | internal_ip, external_ip = await self.create_vm()
96 | self.address = f"{self.cluster.protocol}://{internal_ip}:{self.port}"
97 | if external_ip:
98 | self.external_address = (
99 | f"{self.cluster.protocol}://{external_ip}:{self.port}"
100 | )
101 |
102 | await self.wait_for_scheduler()
103 | await super().start()
104 |
105 |
106 | class WorkerMixin(object):
107 | """A Remote Dask Worker running on a VM."""
108 |
109 | def __init__(
110 | self,
111 | scheduler: str,
112 | *args,
113 | worker_module: str = None,
114 | worker_class: str = None,
115 | worker_options: dict = {},
116 | **kwargs,
117 | ):
118 | super().__init__(*args, **kwargs)
119 | self.scheduler = scheduler
120 | self.name = f"dask-{self.cluster.uuid}-worker-{str(uuid.uuid4())[:8]}"
121 | if worker_module is not None:
122 | self.worker_module = worker_module
123 |
124 | self.command = " ".join(
125 | [
126 | self.set_env,
127 | "python",
128 | "-m",
129 | self.worker_module,
130 | self.scheduler,
131 | "--name",
132 | str(self.name),
133 | ]
134 | + cli_keywords(worker_options, cls=_Worker, cmd=self.worker_module)
135 | )
136 | if worker_class is not None:
137 | self.worker_class = worker_class
138 | self.command = " ".join(
139 | [
140 | self.set_env,
141 | "python",
142 | "-m",
143 | "distributed.cli.dask_spec",
144 | self.scheduler,
145 | "--spec",
146 | "''%s''" # in yaml double single quotes escape the single quote
147 | % json.dumps(
148 | {
149 | "cls": self.worker_class,
150 | "opts": {
151 | **worker_options,
152 | "name": self.name,
153 | },
154 | }
155 | ),
156 | ]
157 | )
158 |
159 | async def start(self):
160 | self.cluster._log("Creating worker instance")
161 | self.address, _ = await self.create_vm()
162 | await super().start()
163 |
164 |
165 | class VMCluster(SpecCluster):
166 | """A base class for Virtual Machine based cluster managers.
167 |
168 | This class holds logic around starting a scheduler and workers as VMs. This class
169 | is not intended to be used directly but instead should be subclassed and the attributes
170 | ``scheduler_class`` and ``worker_class`` should be set.
171 |
172 | The scheduler class should be a subclass of ``VMInterface`` with the ``SchedulerMixin``.
173 | The worker class should be a subclass of ``VMInterface`` with the ``WorkerMixin``.
174 |
175 | See ``VMInterface`` docstring for required methods.
176 |
177 | For a reference implementation see :class:`DropletCluster`.
178 |
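A minimal structural sketch of such a subclass (the names below are placeholders, and
the provider-specific wiring, such as populating ``scheduler_options`` and
``worker_options`` in ``__init__``, is omitted) looks roughly like::

    class MyVM(VMInterface):
        async def create_vm(self):
            ...  # provision a VM, then return (internal_ip, external_ip)

        async def destroy_vm(self):
            ...  # terminate the VM

    class MyScheduler(SchedulerMixin, MyVM):
        pass

    class MyWorker(WorkerMixin, MyVM):
        pass

    class MyCluster(VMCluster):
        scheduler_class = MyScheduler
        worker_class = MyWorker
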
179 | The following parameters section should be copied to the subclass docstring and appended
180 | to the provider-specific parameters.
181 |
182 | Parameters
183 | ----------
184 | n_workers: int
185 | Number of workers to initialise the cluster with. Defaults to ``0``.
186 | worker_module: str
187 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker``
188 | worker_options: dict
189 | Params to be passed to the worker class.
190 | See :class:`distributed.worker.Worker` for default worker class.
191 | If you set ``worker_module`` then refer to the docstring for the custom worker class.
192 | scheduler_options: dict
193 | Params to be passed to the scheduler class.
194 | See :class:`distributed.scheduler.Scheduler`.
195 | docker_image: string (optional)
196 | The Docker image to run on all instances.
197 |
198 | This image must have a valid Python environment and have ``dask`` installed in order for the
199 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended that the Python
200 | environment matches the local environment from which ``EC2Cluster`` is being created.
201 |
202 | For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed.
203 |
204 | By default the ``daskdev/dask:latest`` image will be used.
205 | docker_args: string (optional)
206 | Extra command line arguments to pass to Docker.
207 | extra_bootstrap: list[str] (optional)
208 | Extra commands to be run during the bootstrap phase.
209 | silence_logs: bool
210 | Whether or not we should silence logging when setting up the cluster.
211 | asynchronous: bool
212 | If this is intended to be used directly within an event loop with
213 | async/await
214 | security: Security or bool, optional
215 | Configures communication security in this cluster. Can be a security
216 | object, or True. If True, temporary self-signed credentials will
217 | be created automatically. Default is ``True``.
218 | debug: bool, optional
219 | More information will be printed when constructing clusters to enable debugging.
220 |
221 | """
222 |
223 | scheduler_class = None
224 | worker_class = None
225 | options = {}
226 | scheduler_options = {}
227 | worker_options = {}
228 | docker_image = None
229 | command = None
230 | gpu_instance = None
231 | bootstrap = None
232 | auto_shutdown = None
233 |
234 | def __init__(
235 | self,
236 | n_workers: int = 0,
237 | worker_class: str = "dask.distributed.Nanny",
238 | worker_options: dict = {},
239 | scheduler_options: dict = {},
240 | docker_image="daskdev/dask:latest",
241 | docker_args: str = "",
242 | extra_bootstrap: list = None,
243 | env_vars: dict = {},
244 | security: bool = True,
245 | protocol: str = None,
246 | debug: bool = False,
247 | **kwargs,
248 | ):
249 | if self.scheduler_class is None or self.worker_class is None:
250 | raise RuntimeError(
251 | "VMCluster is not intended to be used directly. See docstring for more info."
252 | )
253 | self._n_workers = n_workers
254 |
255 | if not security:
256 | self.security = None
257 | elif security is True:
258 | # True indicates self-signed temporary credentials should be used
259 | self.security = Security.temporary()
260 | elif not isinstance(security, Security):
261 | raise TypeError("security must be a Security object")
262 | else:
263 | self.security = security
264 |
265 | if protocol is None:
266 | if self.security and self.security.require_encryption:
267 | self.protocol = "tls"
268 | else:
269 | self.protocol = "tcp"
270 | else:
271 | self.protocol = protocol
272 |
273 | self.debug = debug
274 |
275 | if self.security and self.security.require_encryption:
276 | dask.config.set(
277 | {
278 | "distributed.comm.default-scheme": self.protocol,
279 | "distributed.comm.require-encryption": True,
280 | "distributed.comm.tls.ca-file": self.security.tls_ca_file,
281 | "distributed.comm.tls.scheduler.key": self.security.tls_scheduler_key,
282 | "distributed.comm.tls.scheduler.cert": self.security.tls_scheduler_cert,
283 | "distributed.comm.tls.worker.key": self.security.tls_worker_key,
284 | "distributed.comm.tls.worker.cert": self.security.tls_worker_cert,
285 | "distributed.comm.tls.client.key": self.security.tls_client_key,
286 | "distributed.comm.tls.client.cert": self.security.tls_client_cert,
287 | }
288 | )
289 |
290 | image = self.scheduler_options.get("docker_image", False) or docker_image
291 | self.options["docker_image"] = image
292 | self.scheduler_options["docker_image"] = image
293 | self.scheduler_options["env_vars"] = env_vars
294 | self.scheduler_options["protocol"] = protocol
295 | self.scheduler_options["scheduler_options"] = scheduler_options
296 | self.scheduler_options["extra_bootstrap"] = extra_bootstrap
297 | self.worker_options["env_vars"] = env_vars
298 | self.options["docker_args"] = docker_args
299 | self.options["extra_bootstrap"] = extra_bootstrap
300 | self.scheduler_options["docker_args"] = docker_args
301 | self.worker_options["docker_args"] = docker_args
302 | self.worker_options["docker_image"] = image
303 | self.worker_options["worker_class"] = worker_class
304 | self.worker_options["protocol"] = protocol
305 | self.worker_options["worker_options"] = worker_options
306 | self.worker_options["extra_bootstrap"] = extra_bootstrap
307 | self.uuid = str(uuid.uuid4())[:8]
308 |
309 | super().__init__(**kwargs, security=self.security)
310 |
311 | async def call_async(self, f, *args, **kwargs):
312 | """Run a blocking function in a thread as a coroutine.
313 |
314 | This can only be used to make IO-bound operations non-blocking due to the GIL.
315 |
316 | As of Python 3.9 this can be replaced with :func:`asyncio.to_thread`.
317 | Once 3.9 is our minimum supported version this can be removed/replaced.
318 |
319 | """
320 | [done], _ = await asyncio.wait(
321 | fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
322 | return_when=asyncio.ALL_COMPLETED,
323 | )
324 | return done.result()
325 |
326 | async def _start(
327 | self,
328 | ):
329 | while self.status == Status.starting:
330 | await asyncio.sleep(0.01)
331 | if self.status == Status.running:
332 | return
333 | if self.status == Status.closed:
334 | raise ValueError("Cluster is closed")
335 |
336 | self.scheduler_spec = {
337 | "cls": self.scheduler_class,
338 | "options": self.scheduler_options,
339 | }
340 | self.new_spec = {"cls": self.worker_class, "options": self.worker_options}
341 | self.worker_spec = {
342 | self._new_worker_name(i): self.new_spec for i in range(self._n_workers)
343 | }
344 |
345 | with warn_on_duration(
346 | "10s",
347 | "Creating your cluster is taking a surprisingly long time. "
348 | "This is likely due to pending resources. "
349 | "Hang tight! ",
350 | ):
351 | await super()._start()
352 |
353 | def render_process_cloud_init(self, process):
354 | return self.render_cloud_init(
355 | image=process.docker_image,
356 | command=process.command,
357 | docker_args=process.docker_args,
358 | extra_bootstrap=process.extra_bootstrap,
359 | gpu_instance=process.gpu_instance,
360 | bootstrap=process.bootstrap,
361 | auto_shutdown=process.auto_shutdown,
362 | env_vars=process.env_vars,
363 | )
364 |
365 | def render_cloud_init(self, *args, **kwargs):
366 | loader = FileSystemLoader([os.path.dirname(os.path.abspath(__file__))])
367 | environment = Environment(loader=loader)
368 | template = environment.get_template("cloud-init.yaml.j2")
369 | cloud_init = template.render(**kwargs)
370 | if self.debug:
371 | print("\nCloud init\n==========\n\n")
372 | print(cloud_init)
373 | return cloud_init
374 |
375 | @classmethod
376 | def get_cloud_init(cls, *args, **kwargs):
377 | cluster = cls(*args, asynchronous=True, **kwargs)
378 | cluster.auto_shutdown = False
379 | return cluster.render_cloud_init(
380 | image=cluster.options["docker_image"],
381 | command="dask-scheduler --version",
382 | docker_args=cluster.options["docker_args"],
383 | extra_bootstrap=cluster.options["extra_bootstrap"],
384 | gpu_instance=cluster.gpu_instance,
385 | bootstrap=cluster.bootstrap,
386 | auto_shutdown=cluster.auto_shutdown,
387 | env_vars=cluster.worker_options["env_vars"],
388 | )
389 |
390 | def get_tags(self):
391 | """Generate tags to be applied to all resources."""
392 | return {"creator": "dask-cloudprovider", "cluster-id": self.uuid}
393 |
--------------------------------------------------------------------------------