├── .gitattributes ├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── ci ├── environment-3.10.yml ├── environment-3.11.yml ├── environment-3.12.yml └── scripts │ └── test_imports.sh ├── dask_cloudprovider ├── __init__.py ├── _version.py ├── aws │ ├── __init__.py │ ├── ec2.py │ ├── ecs.py │ ├── helper.py │ └── tests │ │ ├── test_ec2.py │ │ ├── test_ecs.py │ │ └── test_helper.py ├── azure │ ├── __init__.py │ ├── azurevm.py │ ├── tests │ │ └── test_azurevm.py │ └── utils.py ├── cli │ └── ecs.py ├── cloudprovider.yaml ├── config.py ├── conftest.py ├── digitalocean │ ├── __init__.py │ ├── droplet.py │ └── tests │ │ └── test_droplet.py ├── exceptions.py ├── gcp │ ├── __init__.py │ ├── instances.py │ ├── tests │ │ ├── test_gcp.py │ │ └── test_utils.py │ └── utils.py ├── generic │ ├── cloud-init.yaml.j2 │ ├── tests │ │ └── test_vmcluster.py │ └── vmcluster.py ├── hetzner │ ├── __init__.py │ ├── tests │ │ └── test_vserver.py │ └── vserver.py ├── ibm │ ├── __init__.py │ ├── code_engine.py │ └── tests │ │ └── test_code_engine.py ├── nebius │ ├── __init__.py │ ├── instances.py │ └── tests │ │ └── test_nebius.py ├── openstack │ ├── __init__.py │ ├── instances.py │ └── tests │ │ └── test_instances.py ├── tests │ └── test_imports.py └── utils │ ├── logs.py │ ├── socket.py │ └── timeout.py ├── doc ├── Makefile ├── make.bat ├── requirements-docs.txt └── source │ ├── alternatives.rst │ ├── aws.rst │ ├── azure.rst │ ├── conf.py │ ├── config.rst │ ├── digitalocean.rst │ ├── gcp.rst │ ├── gpus.rst │ ├── hetzner.rst │ ├── ibm.rst │ ├── index.rst │ ├── installation.rst │ ├── nebius.rst │ ├── openstack.rst │ ├── packer.rst │ ├── releasing.rst │ ├── security.rst │ ├── testing.rst │ └── troubleshooting.rst ├── examples ├── EC2Cluster-randomforest.ipynb └── OpenstackCluster-scorepredict.ipynb ├── pytest.ini ├── requirements.txt ├── requirements_test.txt ├── setup.cfg ├── setup.py └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | dask_cloudprovider/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: true 10 | matrix: 11 | os: ["ubuntu-latest"] 12 | python-version: ["3.10", "3.11", "3.12"] 13 | 14 | steps: 15 | - name: Checkout source 16 | uses: actions/checkout@v2 17 | 18 | - name: Setup Conda Environment 19 | uses: conda-incubator/setup-miniconda@v2 20 | with: 21 | miniconda-version: "latest" 22 | python-version: ${{ matrix.python-version }} 23 | environment-file: ci/environment-${{ matrix.python-version }}.yml 24 | activate-environment: dask-cloudprovider-test 25 | auto-activate-base: false 26 | 27 | - name: Install 28 | shell: bash -l {0} 29 | run: pip install -e .[all] 30 | 31 | - name: Run tests 32 | shell: bash -l {0} 33 | run: py.test dask_cloudprovider 34 | 35 | lint: 36 | name: "pre-commit hooks" 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v2 40 | - uses: actions/setup-python@v2 41 | - uses: pre-commit/action@v2.0.0 42 | 43 | imports: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - name: Checkout source 47 | uses: actions/checkout@v2 48 | 49 | - name: Setup Conda Environment 50 | uses: 
conda-incubator/setup-miniconda@v2 51 | with: 52 | miniconda-version: "latest" 53 | python-version: "3.12" 54 | 55 | - name: Run import tests 56 | shell: bash -l {0} 57 | run: source ci/scripts/test_imports.sh 58 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build distribution 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: "ubuntu-latest" 8 | 9 | steps: 10 | - name: Checkout source 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 3.12 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.12 17 | 18 | - name: Install pypa/build 19 | run: python -m pip install build wheel setuptools 20 | 21 | - name: Build distributions 22 | shell: bash -l {0} 23 | run: python setup.py sdist bdist_wheel 24 | 25 | - name: Publish package to PyPI 26 | if: github.repository == 'dask/dask-cloudprovider' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 27 | uses: pypa/gh-action-pypi-publish@master 28 | with: 29 | user: __token__ 30 | password: ${{ secrets.pypi_password }} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.a 8 | *.dll 9 | *.exe 10 | *.o 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | doc/_build/ 72 | doc/source/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # dotenv 90 | .env 91 | 92 | # virtualenv 93 | .venv 94 | venv/ 95 | ENV/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | # IDE 111 | .vscode/ 112 | .idea 113 | 114 | # MAC 115 | .DS_Store 116 | 117 | # any untitled Jupyter notebooks 118 | Untitled*.ipynb 119 | 120 | # key material 121 | *.pem 122 | *.pub 123 | *_rsa 124 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.10.1 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | exclude: versioneer.py 8 | - repo: https://github.com/pycqa/flake8 9 | rev: 6.1.0 10 | hooks: 11 | - id: flake8 12 | language_version: python3 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: doc/source/conf.py 5 | 6 | formats: all 7 | 8 | python: 9 | install: 10 | - method: pip 11 | path: . 12 | extra_requirements: 13 | - all 14 | - requirements: doc/requirements-docs.txt 15 | 16 | submodules: 17 | include: all 18 | 19 | build: 20 | os: ubuntu-22.04 21 | tools: 22 | python: "3.11" 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask_cloudprovider *.py 2 | recursive-include dask_cloudprovider *.yaml 3 | recursive-include dask_cloudprovider *.j2 4 | 5 | include setup.py 6 | include setup.cfg 7 | include LICENSE 8 | include README.rst 9 | include requirements.txt 10 | include MANIFEST.in 11 | include versioneer.py 12 | 13 | recursive-exclude * __pycache__ 14 | recursive-exclude * *.py[co]include dask_cloudprovider/_version.py 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | Dask Cloud Provider 3 | =================== 4 | 5 | 6 | .. image:: https://github.com/dask/dask-cloudprovider/actions/workflows/ci.yml/badge.svg 7 | :target: https://github.com/dask/dask-cloudprovider/actions?query=workflow%3ACI 8 | :alt: Build Status 9 | 10 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&logo=read-the-docs&logoColor=white 11 | :target: https://cloudprovider.dask.org/ 12 | :alt: Read the Docs 13 | 14 | .. image:: https://img.shields.io/readthedocs/dask-cloudprovider?color=%232980B9&label=developer%20docs&logo=read-the-docs&logoColor=white 15 | :target: https://cloudprovider.dask.org/releasing.html 16 | :alt: Read the Docs Developer 17 | 18 | .. image:: https://img.shields.io/pypi/v/dask-cloudprovider 19 | :target: https://pypi.org/project/dask-cloudprovider/ 20 | :alt: PyPI 21 | 22 | .. image:: https://img.shields.io/conda/vn/conda-forge/dask-cloudprovider 23 | :target: https://anaconda.org/conda-forge/dask-cloudprovider 24 | :alt: Conda Forge 25 | 26 | 27 | Native Cloud integration for Dask. 28 | 29 | This library provides tools to enable Dask clusters to more natively integrate with the cloud. 30 | It includes cluster managers to create dask clusters on a given cloud provider using native resources, 31 | plugins to more closely integrate Dask components with the cloud platform they are running on and documentation to empower all folks running Dask on the cloud. 
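The example below is a minimal illustrative sketch (not taken from the repository) assuming the ``aws`` extra is installed (``pip install dask-cloudprovider[aws]``) and AWS credentials are configured; it shows the typical lifecycle of a cluster manager such as ``FargateCluster``:

.. code-block:: python

    from dask.distributed import Client
    from dask_cloudprovider.aws import FargateCluster

    # Provision a scheduler and two workers as Fargate tasks
    cluster = FargateCluster(n_workers=2)

    # Connect a client and run work on the remote cluster
    client = Client(cluster)
    assert client.submit(lambda x: x + 1, 10).result() == 11

    # Tear down the cloud resources when finished
    client.close()
    cluster.close()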
32 | -------------------------------------------------------------------------------- /ci/environment-3.10.yml: -------------------------------------------------------------------------------- 1 | name: dask-cloudprovider-test 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.10 7 | - nomkl 8 | - pip 9 | # Dask 10 | - dask 11 | # testing / CI 12 | - flake8 13 | - ipywidgets 14 | - pytest 15 | - pytest-asyncio 16 | - black >=20.8b1 17 | - pyyaml 18 | # dask dependencies 19 | - cloudpickle 20 | - toolz 21 | - cytoolz 22 | - numpy 23 | - partd 24 | # distributed dependencies 25 | - click >=6.6 26 | - msgpack-python 27 | - psutil >=5.0 28 | - six 29 | - sortedcontainers !=2.0.0,!=2.0.1 30 | - tblib 31 | - tornado >=5 32 | - zict >=0.1.3 33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212 34 | - pytest-asyncio >=0.14.0 35 | - pytest-timeout 36 | - pip: 37 | - git+https://github.com/dask/dask.git@main 38 | - git+https://github.com/dask/distributed@main 39 | -------------------------------------------------------------------------------- /ci/environment-3.11.yml: -------------------------------------------------------------------------------- 1 | name: dask-cloudprovider-test 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.11 7 | - nomkl 8 | - pip 9 | # Dask 10 | - dask 11 | # testing / CI 12 | - flake8 13 | - ipywidgets 14 | - pytest 15 | - pytest-asyncio 16 | - black >=20.8b1 17 | - pyyaml 18 | # dask dependencies 19 | - cloudpickle 20 | - toolz 21 | - cytoolz 22 | - numpy 23 | - partd 24 | # distributed dependencies 25 | - click >=6.6 26 | - msgpack-python 27 | - psutil >=5.0 28 | - six 29 | - sortedcontainers !=2.0.0,!=2.0.1 30 | - tblib 31 | - tornado >=5 32 | - zict >=0.1.3 33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212 34 | - pytest-asyncio >=0.14.0 35 | - pytest-timeout 36 | - pip: 37 | - git+https://github.com/dask/dask.git@main 38 | - git+https://github.com/dask/distributed@main 39 | -------------------------------------------------------------------------------- /ci/environment-3.12.yml: -------------------------------------------------------------------------------- 1 | name: dask-cloudprovider-test 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.12 7 | - nomkl 8 | - pip 9 | # Dask 10 | - dask 11 | # testing / CI 12 | - flake8 13 | - ipywidgets 14 | - pytest 15 | - pytest-asyncio 16 | - black >=20.8b1 17 | - pyyaml 18 | # dask dependencies 19 | - cloudpickle 20 | - toolz 21 | - cytoolz 22 | - numpy 23 | - partd 24 | # distributed dependencies 25 | - click >=6.6 26 | - msgpack-python 27 | - psutil >=5.0 28 | - six 29 | - sortedcontainers !=2.0.0,!=2.0.1 30 | - tblib 31 | - tornado >=5 32 | - zict >=0.1.3 33 | # `event_loop_policy` change See https://github.com/dask/distributed/pull/4212 34 | - pytest-asyncio >=0.14.0 35 | - pytest-timeout 36 | - pip: 37 | - git+https://github.com/dask/dask.git@main 38 | - git+https://github.com/dask/distributed@main 39 | -------------------------------------------------------------------------------- /ci/scripts/test_imports.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | 4 | 5 | test_import () { 6 | echo "Create environment: python=3.12 $1" 7 | # Create an empty environment 8 | conda create -q -y -n test-imports -c conda-forge python=3.12 9 | conda activate test-imports 10 | pip install -e .[$1] 11 | echo 
"python -c '$2'" 12 | python -c "$2" 13 | echo "Success [$1] 🚀" 14 | conda deactivate 15 | conda env remove -n test-imports 16 | } 17 | 18 | test_import "aws" "import dask_cloudprovider.aws" 19 | test_import "azure" "import dask_cloudprovider.azure" 20 | test_import "digitalocean" "import dask_cloudprovider.digitalocean" 21 | test_import "gcp" "import dask_cloudprovider.gcp" 22 | test_import "ibm" "import dask_cloudprovider.ibm" 23 | test_import "openstack" "import dask_cloudprovider.openstack" 24 | -------------------------------------------------------------------------------- /dask_cloudprovider/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | 3 | from ._version import get_versions 4 | 5 | __version__ = get_versions()["version"] 6 | 7 | del get_versions 8 | 9 | 10 | def __getattr__(name): 11 | """As of dask_cloudprovider v0.5.0 all cluster managers are in cloud provider specific submodules. 12 | 13 | This allows us to more easily separate out optional dependencies. However we maintain some helpful 14 | errors at the top level. 15 | 16 | This is both to help migrate users of any cluster managers that existed before this was changed 17 | and also to help anyone who incorrectly tries to import a cluster manager from the top level. 18 | Perhaps because they saw it used in some documentation but didn't see the import. 19 | 20 | """ 21 | 22 | if name in ["EC2Cluster", "ECSCluster", "FargateCluster"]: 23 | raise ImportError( 24 | "AWS cluster managers must be imported from the aws subpackage. " 25 | f"Please import dask_cloudprovider.aws.{name}" 26 | ) 27 | 28 | if name in ["AzureVMCluster"]: 29 | raise ImportError( 30 | "Azure cluster managers must be imported from the the azure subpackage. " 31 | f"Please import dask_cloudprovider.azure.{name}" 32 | ) 33 | 34 | if name in ["GCPCluster"]: 35 | raise ImportError( 36 | "Google Cloud cluster managers must be imported from the the gcp subpackage. " 37 | f"Please import dask_cloudprovider.gcp.{name}" 38 | ) 39 | 40 | if name in ["DropletCluster"]: 41 | raise ImportError( 42 | "DigitalOcean cluster managers must be imported from the digitalocean subpackage. 
" 43 | f"Please import dask_cloudprovider.digitalocean.{name}" 44 | ) 45 | -------------------------------------------------------------------------------- /dask_cloudprovider/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from .ec2 import EC2Cluster 2 | from .ecs import ECSCluster, FargateCluster 3 | -------------------------------------------------------------------------------- /dask_cloudprovider/aws/helper.py: -------------------------------------------------------------------------------- 1 | """Helper functions for working with AWS services.""" 2 | from datetime import datetime 3 | 4 | DEFAULT_SECURITY_GROUP_NAME = "dask-default" 5 | 6 | 7 | def dict_to_aws(py_dict, upper=False, key_string=None, value_string=None): 8 | key_string = key_string or ("Key" if upper else "key") 9 | value_string = value_string or ("Value" if upper else "value") 10 | return [{key_string: key, value_string: value} for key, value in py_dict.items()] 11 | 12 | 13 | def aws_to_dict(aws_dict): 14 | try: 15 | return {item["key"]: item["value"] for item in aws_dict} 16 | except KeyError: 17 | return {item["Key"]: item["Value"] for item in aws_dict} 18 | 19 | 20 | # https://aws.amazon.com/blogs/messaging-and-targeting/how-to-handle-a-throttling-maximum-sending-rate-exceeded-error/ 21 | def get_sleep_duration(current_try, min_sleep_millis=10, max_sleep_millis=5000): 22 | current_try = max(1, current_try) 23 | current_sleep_millis = min_sleep_millis * current_try**2 24 | return min(current_sleep_millis, max_sleep_millis) / 1000 # return in seconds 25 | 26 | 27 | class ConfigMixin: 28 | def update_attr_from_config(self, attr: str, private: bool): 29 | """Update class attribute of given cluster based on config, if not already set. If `private` is True, the class 30 | attribute will be prefixed with an underscore. 31 | 32 | This mixin can be applied to any class that has a config dict attribute. 
33 | """ 34 | prefix = "_" if private else "" 35 | if getattr(self, f"{prefix}{attr}") is None: 36 | setattr(self, f"{prefix}{attr}", self.config.get(attr)) 37 | 38 | 39 | async def get_latest_ami_id(client, name_glob, owner): 40 | images = await client.describe_images( 41 | Filters=[ 42 | {"Name": "name", "Values": [name_glob]}, 43 | {"Name": "owner-id", "Values": [owner]}, 44 | ] 45 | ) 46 | creation_date = None 47 | image_id = None 48 | 49 | for image in images["Images"]: 50 | image_date = datetime.strptime(image["CreationDate"], "%Y-%m-%dT%H:%M:%S.%fZ") 51 | if creation_date is None or creation_date < image_date: 52 | image_id = image["ImageId"] 53 | creation_date = image_date 54 | return image_id 55 | 56 | 57 | async def get_default_vpc(client): 58 | vpcs = (await client.describe_vpcs())["Vpcs"] 59 | [vpc] = [vpc for vpc in vpcs if vpc["IsDefault"]] 60 | return vpc["VpcId"] 61 | 62 | 63 | async def get_vpc_subnets(client, vpc): 64 | vpcs = (await client.describe_vpcs())["Vpcs"] 65 | [vpc] = [x for x in vpcs if x["VpcId"] == vpc] 66 | subnets = (await client.describe_subnets())["Subnets"] 67 | return [subnet["SubnetId"] for subnet in subnets if subnet["VpcId"] == vpc["VpcId"]] 68 | 69 | 70 | async def get_security_group(client, vpc, create_default=True): 71 | try: 72 | response = await client.describe_security_groups( 73 | GroupNames=[DEFAULT_SECURITY_GROUP_NAME] 74 | ) 75 | groups = response["SecurityGroups"] 76 | except Exception: 77 | groups = [] 78 | if len(groups) > 0: 79 | return groups[0]["GroupId"] 80 | else: 81 | if create_default: 82 | try: 83 | return await create_default_security_group( 84 | client, DEFAULT_SECURITY_GROUP_NAME, vpc 85 | ) 86 | except Exception as e: 87 | raise RuntimeError( 88 | "Unable to create default security group. Please specify manually." 89 | ) from e 90 | else: 91 | raise RuntimeError( 92 | "Unable to find suitable security group. Please specify manually." 93 | ) 94 | 95 | 96 | async def create_default_security_group(client, group_name, vpc): 97 | response = await client.create_security_group( 98 | Description="A default security group for Dask", 99 | GroupName=group_name, 100 | VpcId=vpc, 101 | DryRun=False, 102 | ) 103 | 104 | await client.authorize_security_group_ingress( 105 | GroupId=response["GroupId"], 106 | IpPermissions=[ 107 | { 108 | "IpProtocol": "TCP", 109 | "FromPort": 8786, 110 | "ToPort": 8787, 111 | "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "Anywhere"}], 112 | "Ipv6Ranges": [{"CidrIpv6": "::/0", "Description": "Anywhere"}], 113 | }, 114 | { 115 | "IpProtocol": "TCP", 116 | "FromPort": 0, 117 | "ToPort": 65535, 118 | "UserIdGroupPairs": [{"GroupId": response["GroupId"]}], 119 | }, 120 | ], 121 | DryRun=False, 122 | ) 123 | 124 | return response["GroupId"] 125 | -------------------------------------------------------------------------------- /dask_cloudprovider/aws/tests/test_ec2.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | aiobotocore = pytest.importorskip("aiobotocore") 4 | 5 | from dask_cloudprovider.aws.ec2 import EC2Cluster 6 | from dask_cloudprovider.aws.helper import get_latest_ami_id 7 | from dask.distributed import Client 8 | from distributed.core import Status 9 | 10 | 11 | async def skip_without_credentials(): 12 | try: 13 | async with aiobotocore.get_session().create_client("sts") as client: 14 | await client.get_caller_identity() 15 | except Exception: 16 | pytest.skip( 17 | """ 18 | You must configure Your AWS credentials to run this test. 
19 | 20 | $ aws configure 21 | 22 | """ 23 | ) 24 | 25 | 26 | @pytest.fixture 27 | @pytest.mark.external 28 | async def cluster(): 29 | await skip_without_credentials() 30 | async with EC2Cluster(asynchronous=True) as cluster: 31 | yield cluster 32 | 33 | 34 | @pytest.fixture 35 | @pytest.mark.external 36 | async def cluster_sync(): 37 | await skip_without_credentials() 38 | cluster = EC2Cluster() 39 | yield cluster 40 | 41 | 42 | @pytest.fixture 43 | @pytest.mark.external 44 | async def cluster_rapids(): 45 | await skip_without_credentials() 46 | async with EC2Cluster( 47 | asynchronous=True, 48 | # Deep Learning AMI (Ubuntu 18.04) 49 | ami="ami-0c7c7d78f752f8f17", 50 | # Python version must match local version and CUDA version must match AMI CUDA version 51 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9", 52 | instance_type="p3.2xlarge", 53 | bootstrap=False, 54 | filesystem_size=120, 55 | ) as cluster: 56 | yield cluster 57 | 58 | 59 | @pytest.fixture 60 | @pytest.mark.external 61 | async def cluster_rapids_packer(): 62 | await skip_without_credentials() 63 | async with EC2Cluster( 64 | asynchronous=True, 65 | # Packer AMI 66 | ami="ami-04e5539cb82859e69", 67 | # Python version must match local version and CUDA version must match AMI CUDA version 68 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9", 69 | instance_type="p3.2xlarge", 70 | bootstrap=False, 71 | filesystem_size=120, 72 | ) as cluster: 73 | yield cluster 74 | 75 | 76 | @pytest.fixture 77 | @pytest.mark.external 78 | async def cluster_packer(): 79 | await skip_without_credentials() 80 | async with EC2Cluster( 81 | asynchronous=True, ami="ami-0e6187593ace05a0c", bootstrap=False 82 | ) as cluster: 83 | yield cluster 84 | 85 | 86 | @pytest.fixture 87 | async def ec2_client(): 88 | await skip_without_credentials() 89 | async with aiobotocore.get_session().create_client("ec2") as client: 90 | yield client 91 | 92 | 93 | @pytest.mark.asyncio 94 | @pytest.mark.external 95 | async def test_init(): 96 | cluster = EC2Cluster(asynchronous=True) 97 | assert cluster.status == Status.created 98 | 99 | 100 | @pytest.mark.asyncio 101 | @pytest.mark.timeout(600) 102 | async def test_create_cluster(cluster): 103 | assert cluster.status == Status.running 104 | 105 | cluster.scale(2) 106 | await cluster 107 | assert len(cluster.workers) == 2 108 | 109 | async with Client(cluster, asynchronous=True) as client: 110 | inc = lambda x: x + 1 111 | assert await client.submit(inc, 10).result() == 11 112 | 113 | 114 | @pytest.mark.asyncio 115 | @pytest.mark.timeout(600) 116 | async def test_create_cluster_sync(cluster_sync): 117 | assert cluster_sync.status == Status.running 118 | 119 | cluster_sync.scale(2) 120 | 121 | with Client(cluster_sync) as client: 122 | inc = lambda x: x + 1 123 | assert client.submit(inc, 10).result() == 11 124 | 125 | 126 | @pytest.mark.asyncio 127 | @pytest.mark.timeout(600) 128 | async def test_create_cluster_with_packer(cluster_packer): 129 | assert cluster_packer.status == Status.running 130 | 131 | cluster_packer.scale(2) 132 | await cluster_packer 133 | assert len(cluster_packer.workers) == 2 134 | 135 | async with Client(cluster_packer, asynchronous=True) as client: 136 | inc = lambda x: x + 1 137 | assert await client.submit(inc, 10).result() == 11 138 | 139 | 140 | @pytest.mark.asyncio 141 | @pytest.mark.timeout(1200) 142 | async def test_create_rapids_cluster(cluster_rapids): 143 | assert cluster_rapids.status == Status.running 144 | 145 | cluster_rapids.scale(1) 146 | await 
cluster_rapids 147 | assert len(cluster_rapids.workers) == 1 148 | 149 | async with Client(cluster_rapids, asynchronous=True) as client: 150 | 151 | def f(): 152 | import cupy 153 | 154 | return float(cupy.random.random(100).mean()) 155 | 156 | assert await client.submit(f).result() < 1 157 | 158 | 159 | @pytest.mark.asyncio 160 | @pytest.mark.timeout(1200) 161 | async def test_create_rapids_cluster_with_packer(cluster_rapids_packer): 162 | assert cluster_rapids_packer.status == Status.running 163 | 164 | cluster_rapids_packer.scale(1) 165 | await cluster_rapids_packer 166 | assert len(cluster_rapids_packer.workers) == 1 167 | 168 | async with Client(cluster_rapids_packer, asynchronous=True) as client: 169 | 170 | def f(): 171 | import cupy 172 | 173 | return float(cupy.random.random(100).mean()) 174 | 175 | assert await client.submit(f).result() < 1 176 | 177 | 178 | @pytest.mark.asyncio 179 | async def test_get_ubuntu_image(ec2_client): 180 | image = await get_latest_ami_id( 181 | ec2_client, 182 | "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*", 183 | "099720109477", # Canonical 184 | ) 185 | assert "ami-" in image 186 | 187 | 188 | @pytest.mark.asyncio 189 | async def test_get_cloud_init(): 190 | cloud_init = EC2Cluster.get_cloud_init( 191 | env_vars={"EXTRA_PIP_PACKAGES": "s3fs"}, 192 | docker_args="--privileged", 193 | ) 194 | assert "systemctl start docker" in cloud_init 195 | assert ' -e EXTRA_PIP_PACKAGES="s3fs" ' in cloud_init 196 | assert " --privileged " in cloud_init 197 | 198 | 199 | @pytest.mark.asyncio 200 | async def test_get_cloud_init_rapids(): 201 | cloud_init = EC2Cluster.get_cloud_init( 202 | # Deep Learning AMI (Ubuntu 18.04) 203 | ami="ami-0c7c7d78f752f8f17", 204 | # Python version must match local version and CUDA version must match AMI CUDA version 205 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9", 206 | instance_type="p3.2xlarge", 207 | bootstrap=False, 208 | filesystem_size=120, 209 | ) 210 | assert "rapidsai" in cloud_init 211 | -------------------------------------------------------------------------------- /dask_cloudprovider/aws/tests/test_ecs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | aiobotocore = pytest.importorskip("aiobotocore") 4 | 5 | 6 | def test_import(): 7 | from dask_cloudprovider.aws import ECSCluster # noqa 8 | from dask_cloudprovider.aws import FargateCluster # noqa 9 | -------------------------------------------------------------------------------- /dask_cloudprovider/aws/tests/test_helper.py: -------------------------------------------------------------------------------- 1 | def test_aws_to_dict_and_back(): 2 | from dask_cloudprovider.aws.helper import aws_to_dict, dict_to_aws 3 | 4 | aws_dict = [{"key": "hello", "value": "world"}] 5 | aws_upper_dict = [{"Key": "hello", "Value": "world"}] 6 | py_dict = {"hello": "world"} 7 | 8 | assert dict_to_aws(py_dict) == aws_dict 9 | assert dict_to_aws(py_dict, upper=True) == aws_upper_dict 10 | assert aws_to_dict(aws_dict) == py_dict 11 | 12 | assert aws_to_dict(dict_to_aws(py_dict, upper=True)) == py_dict 13 | assert aws_to_dict(dict_to_aws(py_dict)) == py_dict 14 | assert dict_to_aws(aws_to_dict(aws_dict)) == aws_dict 15 | assert dict_to_aws(aws_to_dict(aws_upper_dict), upper=True) == aws_upper_dict 16 | 17 | 18 | def test_get_sleep_duration_first_try(): 19 | from dask_cloudprovider.aws.helper import get_sleep_duration 20 | 21 | duration = get_sleep_duration( 22 | current_try=0, min_sleep_millis=10, 
max_sleep_millis=5000 23 | ) 24 | assert duration == 0.01 25 | 26 | 27 | def test_get_sleep_duration_max(): 28 | from dask_cloudprovider.aws.helper import get_sleep_duration 29 | 30 | duration = get_sleep_duration( 31 | current_try=23, min_sleep_millis=10, max_sleep_millis=5000 32 | ) 33 | assert duration == 5.0 34 | 35 | 36 | def test_get_sleep_duration_negative_try(): 37 | from dask_cloudprovider.aws.helper import get_sleep_duration 38 | 39 | duration = get_sleep_duration( 40 | current_try=-1, min_sleep_millis=10, max_sleep_millis=5000 41 | ) 42 | assert duration == 0.01 43 | 44 | 45 | def test_config_mixin(): 46 | from dask_cloudprovider.aws.helper import ConfigMixin 47 | 48 | class MockCluster(ConfigMixin): 49 | config = None 50 | _attr1 = "foo" 51 | attr2 = None 52 | 53 | def __init__(self): 54 | self.config = {"attr2": "bar"} 55 | 56 | cluster_with_mixin = MockCluster() 57 | 58 | # Test that nothing happens if attr is already set 59 | attr1 = cluster_with_mixin._attr1 60 | cluster_with_mixin.update_attr_from_config(attr="attr1", private=True) 61 | assert cluster_with_mixin._attr1 == attr1 62 | 63 | # Test that attr is updated if existing value is None 64 | cluster_with_mixin.update_attr_from_config(attr="attr2", private=False) 65 | assert cluster_with_mixin.attr2 == "bar" 66 | -------------------------------------------------------------------------------- /dask_cloudprovider/azure/__init__.py: -------------------------------------------------------------------------------- 1 | from .azurevm import AzureVMCluster 2 | from .utils import AzurePreemptibleWorkerPlugin 3 | -------------------------------------------------------------------------------- /dask_cloudprovider/azure/tests/test_azurevm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | 5 | azure_compute = pytest.importorskip("azure.mgmt.compute") 6 | 7 | from dask_cloudprovider.azure import AzureVMCluster 8 | from dask.distributed import Client 9 | from distributed.core import Status 10 | 11 | 12 | def skip_without_credentials(func): 13 | rg = dask.config.get("cloudprovider.azure.resource_group", None) 14 | vnet = dask.config.get("cloudprovider.azure.azurevm.vnet", None) 15 | security_group = dask.config.get("cloudprovider.azure.azurevm.security_group", None) 16 | location = dask.config.get("cloudprovider.azure.location", None) 17 | if rg is None or vnet is None or security_group is None or location is None: 18 | return pytest.mark.skip( 19 | reason=""" 20 | You must configure your Azure resource group and vnet to run this test. 
21 | 22 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION="" 23 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP="" 24 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET="" 25 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP="" 26 | 27 | """ 28 | )(func) 29 | return func 30 | 31 | 32 | async def get_config(): 33 | return dask.config.get("cloudprovider.azure", {}) 34 | 35 | 36 | @pytest.mark.asyncio 37 | @skip_without_credentials 38 | @pytest.mark.external 39 | async def test_init(): 40 | cluster = AzureVMCluster(asynchronous=True) 41 | assert cluster.status == Status.created 42 | 43 | 44 | @pytest.mark.asyncio 45 | @pytest.mark.timeout(1200) 46 | @skip_without_credentials 47 | @pytest.mark.external 48 | async def test_create_cluster(): 49 | async with AzureVMCluster(asynchronous=True) as cluster: 50 | assert cluster.status == Status.running 51 | 52 | cluster.scale(2) 53 | await cluster 54 | assert len(cluster.workers) == 2 55 | 56 | async with Client(cluster, asynchronous=True) as client: 57 | 58 | def inc(x): 59 | return x + 1 60 | 61 | assert await client.submit(inc, 10).result() == 11 62 | 63 | 64 | @pytest.mark.asyncio 65 | @pytest.mark.timeout(1200) 66 | @skip_without_credentials 67 | @pytest.mark.external 68 | async def test_create_cluster_sync(): 69 | with AzureVMCluster() as cluster: 70 | with Client(cluster) as client: 71 | cluster.scale(1) 72 | client.wait_for_workers(1) 73 | assert len(cluster.workers) == 1 74 | 75 | def inc(x): 76 | return x + 1 77 | 78 | assert client.submit(inc, 10).result() == 11 79 | 80 | 81 | @pytest.mark.asyncio 82 | @pytest.mark.timeout(1200) 83 | @skip_without_credentials 84 | @pytest.mark.external 85 | async def test_create_rapids_cluster_sync(): 86 | with AzureVMCluster( 87 | vm_size="Standard_NC12s_v3", 88 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9", 89 | worker_class="dask_cuda.CUDAWorker", 90 | worker_options={"rmm_pool_size": "15GB"}, 91 | ) as cluster: 92 | with Client(cluster) as client: 93 | cluster.scale(1) 94 | client.wait_for_workers(1) 95 | 96 | def gpu_mem(): 97 | from pynvml.smi import nvidia_smi 98 | 99 | nvsmi = nvidia_smi.getInstance() 100 | return nvsmi.DeviceQuery("memory.free, memory.total") 101 | 102 | results = client.run(gpu_mem) 103 | for w, res in results.items(): 104 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys() 105 | print(res) 106 | 107 | 108 | @pytest.mark.asyncio 109 | @skip_without_credentials 110 | async def test_render_cloud_init(): 111 | cloud_init = AzureVMCluster.get_cloud_init(docker_args="--privileged") 112 | assert " --privileged " in cloud_init 113 | 114 | cloud_init = AzureVMCluster.get_cloud_init( 115 | docker_image="foo/bar:baz", 116 | extra_bootstrap=["echo 'hello world'", "echo 'foo bar'"], 117 | ) 118 | assert "foo/bar:baz" in cloud_init 119 | assert "- echo 'hello world'" in cloud_init 120 | assert "- echo 'foo bar'" in cloud_init 121 | -------------------------------------------------------------------------------- /dask_cloudprovider/azure/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | import json 4 | import subprocess 5 | import logging 6 | 7 | import aiohttp 8 | from distributed.diagnostics.plugin import WorkerPlugin 9 | from tornado.ioloop import IOLoop, PeriodicCallback 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | AZURE_EVENTS_METADATA_URL = ( 15 | "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01" 16 | ) 17 | 18 | 19 | 
def _get_default_subscription() -> str: 20 | """ 21 | Get the default Azure subscription ID, as configured by the Azure CLI. 22 | """ 23 | out = subprocess.check_output(["az", "account", "list", "--query", "[?isDefault]"]) 24 | accounts = json.loads(out) 25 | if accounts: 26 | subscription_id = accounts[0]["id"] 27 | return subscription_id 28 | raise ValueError( 29 | "Could not find a default subscription. " 30 | "Run 'az account set' to set a default subscription." 31 | ) 32 | 33 | 34 | class AzurePreemptibleWorkerPlugin(WorkerPlugin): 35 | """A worker plugin for azure spot instances 36 | 37 | This worker plugin will poll azure's metadata service for preemption notifications. 38 | When a node is preempted, the plugin will attempt to shut down gracefully all workers 39 | on the node. 40 | 41 | This plugin can be used on any worker running on azure spot instances, not just the 42 | ones created by ``dask-cloudprovider``. 43 | 44 | For more details on azure spot instances see: 45 | https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events 46 | 47 | Parameters 48 | ---------- 49 | poll_interval_s: int (optional) 50 | The rate at which the plugin will poll the metadata service in seconds. 51 | 52 | Defaults to ``1`` 53 | 54 | metadata_url: str (optional) 55 | The url of the metadata service to poll. 56 | 57 | Defaults to "http://169.254.169.254/metadata/scheduledevents?api-version=2019-08-01" 58 | 59 | termination_events: List[str] (optional) 60 | The type of events that will trigger the graceful shutdown 61 | 62 | Defaults to ``['Preempt', 'Terminate']`` 63 | 64 | termination_offset_minutes: int (optional) 65 | Extra offset to apply to the preemption date. This may be negative, to start 66 | the graceful shutdown before the ``NotBefore`` date. It can also be positive, to 67 | start the shutdown after the ``NotBefore`` date, but this is at your own risk. 68 | 69 | Defaults to ``0`` 70 | 71 | Examples 72 | -------- 73 | 74 | Let's say you have a cluster and a client instance. 
75 | For example using :class:`dask_kubernetes.KubeCluster` 76 | 77 | >>> from dask_kubernetes import KubeCluster 78 | >>> from distributed import Client 79 | >>> cluster = KubeCluster() 80 | >>> client = Client(cluster) 81 | 82 | You can add the worker plugin using the following: 83 | 84 | >>> from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin 85 | >>> client.register_worker_plugin(AzurePreemptibleWorkerPlugin()) 86 | """ 87 | 88 | def __init__( 89 | self, 90 | poll_interval_s=1, 91 | metadata_url=None, 92 | termination_events=None, 93 | termination_offset_minutes=0, 94 | ): 95 | self.callback = None 96 | self.loop = None 97 | self.worker = None 98 | self.poll_interval_s = poll_interval_s 99 | self.metadata_url = metadata_url or AZURE_EVENTS_METADATA_URL 100 | self.termination_events = termination_events or ["Preempt", "Terminate"] 101 | self.termination_offset = datetime.timedelta(minutes=termination_offset_minutes) 102 | 103 | self.terminating = False 104 | self.not_before = None 105 | self._session = None 106 | self._lock = None 107 | 108 | async def _is_terminating(self): 109 | preempt_started = False 110 | async with self._session.get(self.metadata_url) as response: 111 | try: 112 | data = await response.json() 113 | # Sometime azure responds with text/plain mime type 114 | except aiohttp.ContentTypeError: 115 | return 116 | # Sometimes the response doesn't contain the Events key 117 | events = data.get("Events", []) 118 | if events: 119 | logger.debug( 120 | "Worker {}, got metadata events {}".format(self.worker.name, events) 121 | ) 122 | for evt in events: 123 | event_type = evt["EventType"] 124 | if event_type not in self.termination_events: 125 | continue 126 | 127 | event_status = evt.get("EventStatus") 128 | if event_status == "Started": 129 | logger.info( 130 | "Worker {}, node preemption started".format(self.worker.name) 131 | ) 132 | preempt_started = True 133 | break 134 | 135 | not_before = evt.get("NotBefore") 136 | if not not_before: 137 | continue 138 | 139 | not_before = datetime.datetime.strptime( 140 | not_before, "%a, %d %b %Y %H:%M:%S GMT" 141 | ) 142 | if self.not_before is None: 143 | logger.info( 144 | "Worker {}, node deletion scheduled not before {}".format( 145 | self.worker.name, self.not_before 146 | ) 147 | ) 148 | self.not_before = not_before 149 | break 150 | if self.not_before < not_before: 151 | logger.info( 152 | "Worker {}, node deletion re-scheduled not before {}".format( 153 | self.worker.name, not_before 154 | ) 155 | ) 156 | self.not_before = not_before 157 | break 158 | 159 | return preempt_started or ( 160 | self.not_before 161 | and (self.not_before + self.termination_offset < datetime.datetime.utcnow()) 162 | ) 163 | 164 | async def poll_status(self): 165 | if self.terminating: 166 | return 167 | if self._session is None: 168 | self._session = aiohttp.ClientSession(headers={"Metadata": "true"}) 169 | if self._lock is None: 170 | self._lock = asyncio.Lock() 171 | 172 | async with self._lock: 173 | is_terminating = await self._is_terminating() 174 | if not is_terminating: 175 | return 176 | 177 | logger.info( 178 | "Worker {}, node is being deleted, attempting graceful shutdown".format( 179 | self.worker.name 180 | ) 181 | ) 182 | self.terminating = True 183 | await self._session.close() 184 | await self.worker.close_gracefully() 185 | 186 | def setup(self, worker): 187 | self.worker = worker 188 | self.loop = IOLoop.current() 189 | self.callback = PeriodicCallback( 190 | self.poll_status, callback_time=self.poll_interval_s * 
1_000 191 | ) 192 | self.loop.add_callback(self.callback.start) 193 | logger.debug( 194 | "Worker {}, registering preemptible plugin".format(self.worker.name) 195 | ) 196 | 197 | def teardown(self, worker): 198 | logger.debug("Worker {}, tearing down plugin".format(self.worker.name)) 199 | if self.callback: 200 | self.callback.stop() 201 | self.callback = None 202 | -------------------------------------------------------------------------------- /dask_cloudprovider/cli/ecs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import sleep 3 | import sys 4 | 5 | import click 6 | from distributed.cli.utils import install_signal_handlers 7 | from distributed.core import Status 8 | from tornado.ioloop import IOLoop, TimeoutError 9 | 10 | from dask_cloudprovider.aws import ECSCluster 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @click.command() 17 | @click.option("--fargate", is_flag=True, help="Turn on fargate mode (default off)") 18 | @click.option( 19 | "--fargate-scheduler", 20 | is_flag=True, 21 | help="Turn on fargate mode for scheduler (default off)", 22 | ) 23 | @click.option( 24 | "--fargate-workers", 25 | is_flag=True, 26 | help="Turn on fargate mode for workers (default off)", 27 | ) 28 | @click.option( 29 | "--image", 30 | type=str, 31 | default=None, 32 | help="Docker image to use for scheduler and workers", 33 | ) 34 | @click.option( 35 | "--scheduler-cpu", 36 | type=int, 37 | default=None, 38 | help="Scheduler CPU reservation in milli-CPU", 39 | ) 40 | @click.option( 41 | "--scheduler-mem", type=int, default=None, help="Scheduler memory reservation in MB" 42 | ) 43 | @click.option( 44 | "--scheduler-port", 45 | type=int, 46 | default=8786, 47 | help="The port on which the scheduler will be reachable to the workers and clients", 48 | ) 49 | @click.option( 50 | "--scheduler-timeout", 51 | type=int, 52 | default=None, 53 | help="Scheduler timeout (e.g 5 minutes)", 54 | ) 55 | @click.option( 56 | "--worker-cpu", type=int, default=None, help="Worker CPU reservation in milli-CPU" 57 | ) 58 | @click.option( 59 | "--worker-mem", type=int, default=None, help="Worker memory reservation in MB" 60 | ) 61 | @click.option( 62 | "--n-workers", 63 | type=int, 64 | default=None, 65 | help="Number of workers to start with the cluster", 66 | ) 67 | @click.option( 68 | "--cluster-arn", 69 | type=str, 70 | default=None, 71 | help="The ARN of an existing ECS cluster to use", 72 | ) 73 | @click.option( 74 | "--cluster-name-template", 75 | type=str, 76 | default=None, 77 | help="A template to use for the cluster name if `--cluster-arn` is not set", 78 | ) 79 | @click.option( 80 | "--execution-role-arn", 81 | type=str, 82 | default=None, 83 | help="The ARN of an existing IAM role to use for ECS execution", 84 | ) 85 | @click.option( 86 | "--task-role-arn", 87 | type=str, 88 | default=None, 89 | help="The ARN of an existing IAM role to give to the tasks", 90 | ) 91 | @click.option( 92 | "--task-role-policy", 93 | type=str, 94 | default=None, 95 | multiple=True, 96 | help="Policy to attach to a task if --task-role-arn is not set (can be used multiple times)", 97 | ) 98 | @click.option( 99 | "--cloudwatch-logs-group", type=str, default=None, help="The group to send logs to" 100 | ) 101 | @click.option( 102 | "--cloudwatch-logs-stream-prefix", 103 | type=str, 104 | default=None, 105 | help="An optional prefix to use for log streams", 106 | ) 107 | @click.option( 108 | "--cloudwatch-logs-default-retention", 109 | 
type=int, 110 | default=None, 111 | help="Number of days to retain logs", 112 | ) 113 | @click.option( 114 | "--vpc", 115 | type=str, 116 | default=None, 117 | help="The ID of an existing VPC (uses default if not specified)", 118 | ) 119 | @click.option( 120 | "--subnet", 121 | type=str, 122 | default=None, 123 | multiple=True, 124 | help="VPC subnet to use (can be used multiple times, will default to all if none specified)", 125 | ) 126 | @click.option( 127 | "--security-group", 128 | type=str, 129 | default=None, 130 | multiple=True, 131 | help="Security group to use for task communication (can be used multiple times, will be created if not specified)", 132 | ) 133 | @click.option( 134 | "--environment", 135 | type=str, 136 | default=None, 137 | multiple=True, 138 | help="Environment variable for the scheduler and workers in the form FOO=bar (can be used multiple times)", 139 | ) 140 | @click.option( 141 | "--tag", 142 | type=str, 143 | default=None, 144 | multiple=True, 145 | help="Tag to apply to all resources created automatically in the form FOO=bar (can be used multiple times)", 146 | ) 147 | @click.option("--skip_cleanup", is_flag=True, help="Skip cleanup of stale resources") 148 | @click.version_option() 149 | def main( 150 | fargate, 151 | fargate_scheduler, 152 | fargate_workers, 153 | image, 154 | scheduler_cpu, 155 | scheduler_mem, 156 | scheduler_port, 157 | scheduler_timeout, 158 | worker_cpu, 159 | worker_mem, 160 | n_workers, 161 | cluster_arn, 162 | cluster_name_template, 163 | execution_role_arn, 164 | task_role_arn, 165 | task_role_policy, 166 | cloudwatch_logs_group, 167 | cloudwatch_logs_stream_prefix, 168 | cloudwatch_logs_default_retention, 169 | vpc, 170 | subnet, 171 | security_group, 172 | environment, 173 | tag, 174 | skip_cleanup, 175 | ): 176 | tag = {v.split("=")[0]: v.split("=")[1] for v in tag} if tag else None 177 | environment = ( 178 | {v.split("=")[0]: v.split("=")[1] for v in environment} if environment else None 179 | ) 180 | subnet = subnet or None 181 | security_group = security_group or None 182 | task_role_policy = task_role_policy or None 183 | logger.info("Starting ECS cluster") 184 | try: 185 | cluster = ECSCluster( 186 | fargate_scheduler=fargate_scheduler or fargate, 187 | fargate_workers=fargate_workers or fargate, 188 | image=image, 189 | scheduler_cpu=scheduler_cpu, 190 | scheduler_mem=scheduler_mem, 191 | scheduler_port=scheduler_port, 192 | scheduler_timeout=scheduler_timeout, 193 | worker_cpu=worker_cpu, 194 | worker_mem=worker_mem, 195 | n_workers=n_workers, 196 | cluster_arn=cluster_arn, 197 | cluster_name_template=cluster_name_template, 198 | execution_role_arn=execution_role_arn, 199 | task_role_arn=task_role_arn, 200 | task_role_policies=task_role_policy, 201 | cloudwatch_logs_group=cloudwatch_logs_group, 202 | cloudwatch_logs_stream_prefix=cloudwatch_logs_stream_prefix, 203 | cloudwatch_logs_default_retention=cloudwatch_logs_default_retention, 204 | vpc=vpc, 205 | subnets=subnet, 206 | security_groups=security_group, 207 | environment=environment, 208 | tags=tag, 209 | skip_cleanup=skip_cleanup, 210 | ) 211 | except Exception as e: 212 | ctx = click.get_current_context() 213 | logger.error(str(e) + "\n") 214 | click.echo(ctx.get_help()) 215 | sys.exit(1) 216 | 217 | async def run(): 218 | logger.info("Ready") 219 | while cluster.status != Status.closed: 220 | await sleep(0.2) 221 | 222 | def on_signal(signum): 223 | logger.info("Exiting on signal %d", signum) 224 | cluster.close(timeout=2) 225 | 226 | loop = IOLoop.current() 227 | 
install_signal_handlers(loop, cleanup=on_signal) 228 | 229 | try: 230 | loop.run_sync(run) 231 | except (KeyboardInterrupt, TimeoutError): 232 | logger.info("Shutting down") 233 | finally: 234 | logger.info("End dask-ecs") 235 | 236 | 237 | def go(): 238 | main() 239 | 240 | 241 | if __name__ == "__main__": 242 | go() 243 | -------------------------------------------------------------------------------- /dask_cloudprovider/cloudprovider.yaml: -------------------------------------------------------------------------------- 1 | cloudprovider: 2 | ecs: 3 | fargate_scheduler: false # Use fargate mode for the scheduler 4 | fargate_spot: false 5 | fargate_workers: false # Use fargate mode for the workers 6 | fargate_use_private_ip: false 7 | scheduler_cpu: 1024 # Millicpu (1024ths of a CPU core) 8 | scheduler_mem: 4096 # Memory in MB 9 | # scheduler_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key" 10 | worker_cpu: 4096 # Millicpu (1024ths of a CPU core) 11 | worker_mem: 16384 # Memory in MB 12 | worker_gpu: 0 # Number of GPUs for each worker 13 | # worker_extra_args: "--tls-cert,/path/to/cert.pem,--tls-key,/path/to/cert.key,--tls-ca-file,/path/to/ca.key" 14 | n_workers: 0 # Number of workers to start the cluster with 15 | scheduler_timeout: "5 minutes" # Length of inactivity to wait before closing the cluster 16 | 17 | image: "daskdev/dask:latest" # Docker image to use for non GPU tasks 18 | cpu_architecture: "X86_64" # Runtime platform CPU architecture 19 | gpu_image: "rapidsai/rapidsai:latest" # Docker image to use for GPU tasks 20 | cluster_name_template: "dask-{uuid}" # Template to use when creating a cluster 21 | cluster_arn: "" # ARN of existing ECS cluster to use (if not set one will be created) 22 | execution_role_arn: "" # Arn of existing execution role to use (if not set one will be created) 23 | task_role_arn: "" # Arn of existing task role to use (if not set one will be created) 24 | task_role_policies: [] # List of policy arns to attach to tasks (e.g S3 read only access) 25 | # platform_version: "LATEST" # Fargate platformVersion string like "1.4.0" or "LATEST" 26 | 27 | cloudwatch_logs_group: "" # Name of existing cloudwatch logs group to use (if not set one will be created) 28 | cloudwatch_logs_stream_prefix: "{cluster_name}" # Stream prefix template 29 | cloudwatch_logs_default_retention: 30 # Number of days to retain logs (only applied if not using existing group) 30 | 31 | vpc: "default" # VPC to use for tasks 32 | subnets: [] # VPC subnets to use (will use all available if not set) 33 | security_groups: [] # Security groups to use (if not set one will be created) 34 | 35 | tags: {} # Tags to apply to all AWS resources created by the cluster manager 36 | environment: {} # Environment variables that are set within a task container 37 | skip_cleanup: false # Skip cleaning up of stale resources 38 | 39 | ec2: 40 | region: null # AWS region to create cluster. Defaults to environment or account default region. 41 | availability_zone: null # The availability zone to start you clusters. By default AWS will select the AZ with most free capacity. 42 | bootstrap: true # It is assumed that the AMI does not have Docker and needs bootstrapping. Set this to false if using a custom AMI with Docker already installed. 43 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out. 44 | # worker_command: "dask-worker" # The command for workers to run. 
If the instance_type is a GPU instance dask-cuda-worker will be used. 45 | ami: null # AMI ID to use for all instances. Defaults to latest Ubuntu 20.04 image. 46 | instance_type: "t2.micro" # Instance type for the scheduler and all workers 47 | scheduler_instance_type: "t2.micro" # Instance type for the scheduler 48 | worker_instance_type: "t2.micro" # Instance type for all workers 49 | docker_image: "daskdev/dask:latest" # docker image to use 50 | vpc: null # VPC id for instances to join. Defaults to default VPC. 51 | subnet_id: null # Subnet ID for instances to. Defaults to all subnets in default VPC. 52 | security_groups: [] # Security groups for instances. Will create a minimal Dask security group by default. 53 | filesystem_size: 40 # Default root filesystem size for scheduler and worker VMs in GB 54 | key_name: null # SSH Key name to assign to instances 55 | iam_instance_profile: {} # Iam role to assign to instances 56 | # Arn: 'string' 57 | # Name: 'string' 58 | instance_tags: 59 | createdBy: dask-cloudprovider 60 | volume_tags: 61 | createdBy: dask-cloudprovider 62 | enable_detailed_monitoring: false 63 | use_private_ip: false 64 | 65 | azure: 66 | location: null # The Azure location to launch your cluster 67 | resource_group: null # The Azure resource group for the cluster 68 | subscription_id: null # The Azure subscription ID for the cluster 69 | azurevm: 70 | vnet: null # Azure Virtual Network to launch VMs in 71 | subnet: null # Azure Virtual Network subnet to launch VMs in 72 | security_group: null # Network security group to allow 8786 and 8787 73 | public_ingress: true # Assign a public IP address to the scheduler 74 | vm_size: "Standard_DS1_v2" # Azure VM size to use for scheduler and workers 75 | disk_size: 50 # Specifies the size of the VM host OS disk in gigabytes. This value cannot be larger than `1023`. 76 | scheduler_vm_size: null # Set a different VM size for the scheduler. Will use vm_size if not set 77 | docker_image: "daskdev/dask:latest" # docker image to use 78 | vm_image: # OS image to use for the virtual machines 79 | publisher: "Canonical" 80 | offer: "UbuntuServer" 81 | sku: "18.04-LTS" 82 | version: "latest" 83 | bootstrap: true # It is assumed that the VHD does not have Docker and needs bootstrapping. Set this to false if using a custom VHD with Docker already installed. 84 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out. 85 | marketplace_plan: null # This needs to be passed in if the user wants to use a Marketplace VM with a plan. 86 | # name: "ngc-base-version-21-02-2" 87 | # publisher: "nvidia" 88 | # product: "ngc_azure_17_11" 89 | extra_options: {} # Additional options to provide when creating the VMs. 
90 | 91 | digitalocean: 92 | token: null # API token for interacting with the Digital Ocean API 93 | region: "nyc3" # Region to launch Droplets in 94 | size: "s-1vcpu-1gb" # Droplet size to launch, default is 1GB RAM, 1 vCPU 95 | image: "ubuntu-20-04-x64" # Operating System image to use 96 | 97 | gcp: 98 | source_image: "projects/ubuntu-os-cloud/global/images/ubuntu-minimal-1804-bionic-v20201014" # the gcp image to use for all instances 99 | zone: "us-east1-c" # the zone of where to launch the instances 100 | network: "default" # the network/subnetwork in GCP to use 101 | network_projectid: null # GCP project id where the network exists 102 | projectid: "" # name of the google cloud project 103 | on_host_maintenance: "TERMINATE" 104 | machine_type: "n1-standard-1" # size of the machine type to use for the scheduler and all workers 105 | scheduler_machine_type: "n1-standard-1" # size of the machine type to use for the scheduler 106 | worker_machine_type: "n1-standard-1" # size of the machine type to use for all workers 107 | filesystem_size: 50 # amount in GBs of hard drive space to allocate 108 | ngpus: "" # number of GPUs to use. If provided, will be used for both scheduler and worker 109 | gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. If provided, will be used for both scheduler and worker 110 | scheduler_ngpus: "" # number of GPUs to use on scheduler 111 | scheduler_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. 112 | worker_ngpus: "" # number of GPUs to use on worker 113 | worker_gpu_type: "" # type of gpus to use. (e.g. 'nvidia-tesla-t4'). You can view the possible values through ``gcloud compute accelerator-types list``. 114 | disk_type: "pd-standard" # type of disk to use: pd-standard, pd-ssd 115 | docker_image: "daskdev/dask:latest" # docker image to use 116 | auto_shutdown: true # Shutdown instances automatically if the scheduler or worker services time out. 117 | public_ingress: true # configure the scheduler to be externally accessible. This assumes firefwall rules for 8787 and 8786 118 | instance_labels: 119 | container_vm: "dask-cloudprovider" 120 | service_account: "default" 121 | 122 | hetzner: 123 | token: null # API token for interacting with the Hetzner cloud API 124 | location: "fsn1" # Location to launch vServer in 125 | server_type: "cx11" # vServer server type to launch, default is 2GB RAM, 1 vCPU 126 | image: "ubuntu-20.04" # Operating System image to use 127 | docker_image: "daskdev/dask:latest" # docker image to use 128 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed. 129 | 130 | ibm: 131 | api_key: null 132 | image: "ghcr.io/dask/dask:latest" 133 | region: us-east 134 | project_id: null 135 | scheduler_cpu: "1.0" 136 | scheduler_mem: 4G 137 | scheduler_disk: 400M 138 | scheduler_timeout: 600 # seconds 139 | scheduler_command: python -m distributed.cli.dask_scheduler --protocol ws 140 | worker_cpu: "2.0" 141 | worker_mem: 8G 142 | worker_disk: 400M 143 | worker_threads: 1 144 | worker_command: python -m distributed.cli.dask_spec 145 | docker_server: "" 146 | docker_username: "" 147 | docker_password: "" 148 | 149 | openstack: 150 | region: "RegionOne" # The name of the region where resources will be allocated in OpenStack. 
List available regions using: `openstack region list`. 151 | size: null # Openstack flavors define the compute, memory, and storage capacity of computing instances. List available flavors using: `openstack flavor list` 152 | auth_url: null # The authentication URL for the OpenStack Identity service (Keystone). Example: https://cloud.example.com:5000 153 | application_credential_id: null # The application credential id created in OpenStack. Create application credentials using: openstack application credential create 154 | application_credential_secret: null # The secret associated with the application credential ID for authentication. 155 | auth_type: "v3applicationcredential" # The type of authentication used, typically "v3applicationcredential" for using OpenStack application credentials. 156 | network_id: null # The unique identifier for the internal/private network in OpenStack where the cluster VMs will be connected. List available networks using: `openstack network list` 157 | image: null # The OS image name or id to use for the VM. List available images using: `openstack image list` 158 | keypair_name: null # The name of the SSH keypair used for instance access. Ensure you have created a keypair or use an existing one. List available keypairs using: `openstack keypair list` 159 | security_group: null # The security group name that defines firewall rules for instances. List available security groups using: `openstack security group list` 160 | external_network_id: null # The ID of the external network used for assigning floating IPs. List available external networks using: `openstack network list --external` 161 | create_floating_ip: false # Specifies whether to assign a floating IP to each instance, enabling external access. Set to `True` if external connectivity is needed. 162 | docker_image: "daskdev/dask:latest" # docker image to use 163 | 164 | nebius: 165 | token: null # iam token for interacting with the Nebius AI Cloud 166 | project_id: null # You can find it in Nebius AI Cloud console 167 | bootstrap: true # It is assumed that the OS image does not have Docker and needs bootstrapping. Set this to false if using a custom image with Docker already installed. 168 | image_family: "ubuntu22.04-driverless" # it should be "ubuntu22.04-driverless" or "ubuntu22.04-cuda12" https://docs.nebius.com/compute/storage/manage#parameters-boot 169 | docker_image: "daskdev/dask:latest" # docker image to use 170 | server_platform: "cpu-d3" # all platforms https://docs.nebius.com/compute/virtual-machines/types 171 | server_preset: "4vcpu-16gb" # all presets https://docs.nebius.com/compute/virtual-machines/types 172 | disk_size: 64 # Specifies the size of the VM host OS disk in gigabytes. -------------------------------------------------------------------------------- /dask_cloudprovider/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | import os 4 | 5 | import dask 6 | import yaml 7 | 8 | 9 | class ClusterConfig(dict): 10 | """Simple config interface for dask-cloudprovider clusters, such as `AzureVMCluster`. 11 | 12 | Enables '.' notation for nested access, as per `dask.config.get`. 13 | 14 | Example 15 | ------- 16 | 17 | >>> from dask_cloudprovider.config import ClusterConfig 18 | >>> class RandomCluster(VMCluster): 19 | ... def __init__(self, option=None): 20 | ... self.config = ClusterConfig(dask.config.get("cloudprovider.random", {})) 21 | ... 
self.option = self.config.get("option", override_with=option) 22 | 23 | """ 24 | 25 | def __new__(cls, d): 26 | return super().__new__(cls, d) 27 | 28 | def get(self, key, default=None, override_with=None): 29 | return dask.config.get( 30 | key, default=default, config=self, override_with=override_with 31 | ) 32 | 33 | 34 | fn = os.path.join(os.path.dirname(__file__), "cloudprovider.yaml") 35 | dask.config.ensure_file(source=fn) 36 | 37 | with open(fn) as f: 38 | defaults = yaml.safe_load(f) 39 | 40 | dask.config.update_defaults(defaults) 41 | -------------------------------------------------------------------------------- /dask_cloudprovider/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--create-external-resources", 7 | action="store_true", 8 | default=False, 9 | help="Run tests that create external resources.", 10 | ) 11 | 12 | 13 | def pytest_configure(config): 14 | config.addinivalue_line( 15 | "markers", "external: mark test as creates external resources" 16 | ) 17 | 18 | 19 | def pytest_collection_modifyitems(config, items): 20 | if config.getoption("--create-external-resources"): 21 | # --runslow given in cli: do not skip slow tests 22 | return 23 | skip_slow = pytest.mark.skip( 24 | reason="need --create-external-resources option to run" 25 | ) 26 | for item in items: 27 | if "external" in item.keywords: 28 | item.add_marker(skip_slow) 29 | -------------------------------------------------------------------------------- /dask_cloudprovider/digitalocean/__init__.py: -------------------------------------------------------------------------------- 1 | from .droplet import DropletCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/digitalocean/droplet.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import dask 4 | from dask_cloudprovider.generic.vmcluster import ( 5 | VMCluster, 6 | VMInterface, 7 | SchedulerMixin, 8 | WorkerMixin, 9 | ) 10 | 11 | try: 12 | import digitalocean 13 | except ImportError as e: 14 | msg = ( 15 | "Dask Cloud Provider Digital Ocean requirements are not installed.\n\n" 16 | "Please pip install as follows:\n\n" 17 | ' pip install "dask-cloudprovider[digitalocean]" --upgrade # or python -m pip install' 18 | ) 19 | raise ImportError(msg) from e 20 | 21 | 22 | class Droplet(VMInterface): 23 | def __init__( 24 | self, 25 | cluster: str, 26 | config, 27 | *args, 28 | region: str = None, 29 | size: str = None, 30 | image: str = None, 31 | docker_image=None, 32 | env_vars=None, 33 | extra_bootstrap=None, 34 | **kwargs, 35 | ): 36 | super().__init__(*args, **kwargs) 37 | self.droplet = None 38 | self.cluster = cluster 39 | self.config = config 40 | self.region = region 41 | self.size = size 42 | self.image = image 43 | self.gpu_instance = False 44 | self.bootstrap = True 45 | self.extra_bootstrap = extra_bootstrap 46 | self.docker_image = docker_image 47 | self.env_vars = env_vars 48 | 49 | async def create_vm(self): 50 | self.droplet = digitalocean.Droplet( 51 | token=self.config.get("token"), 52 | name=self.name, 53 | region=self.region, 54 | image=self.image, 55 | size_slug=self.size, 56 | backups=False, 57 | user_data=self.cluster.render_process_cloud_init(self), 58 | ) 59 | await self.call_async(self.droplet.create) 60 | for action in self.droplet.get_actions(): 61 | while action.status != "completed": 62 | 
action.load() 63 | await asyncio.sleep(0.1) 64 | while self.droplet.ip_address is None: 65 | await self.call_async(self.droplet.load) 66 | await asyncio.sleep(0.1) 67 | self.cluster._log(f"Created droplet {self.name}") 68 | 69 | return self.droplet.ip_address, None 70 | 71 | async def destroy_vm(self): 72 | await self.call_async(self.droplet.destroy) 73 | self.cluster._log(f"Terminated droplet {self.name}") 74 | 75 | 76 | class DropletScheduler(SchedulerMixin, Droplet): 77 | """Scheduler running on a DigitalOcean Droplet.""" 78 | 79 | 80 | class DropletWorker(WorkerMixin, Droplet): 81 | """Worker running on a DigitalOcean Droplet.""" 82 | 83 | 84 | class DropletCluster(VMCluster): 85 | """Cluster running on Digital Ocean droplets. 86 | 87 | VMs in DigitalOcean (DO) are referred to as droplets. This cluster manager constructs a Dask cluster 88 | running on VMs. 89 | 90 | When configuring your cluster you may find it useful to install the ``doctl`` tool for querying the 91 | DO API for available options. 92 | 93 | https://www.digitalocean.com/docs/apis-clis/doctl/how-to/install/ 94 | 95 | Parameters 96 | ---------- 97 | region: str 98 | The DO region to launch your cluster in. A full list can be obtained with ``doctl compute region list``. 99 | size: str 100 | The VM size slug. You can get a full list with ``doctl compute size list``. 101 | The default is ``s-1vcpu-1gb`` which is 1GB RAM and 1 vCPU 102 | image: str 103 | The image ID to use for the host OS. This should be a Ubuntu variant. 104 | You can list available images with ``doctl compute image list --public | grep ubuntu.*x64``. 105 | worker_module: str 106 | The Dask worker module to start on worker VMs. 107 | n_workers: int 108 | Number of workers to initialise the cluster with. Defaults to ``0``. 109 | worker_module: str 110 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker`` 111 | worker_options: dict 112 | Params to be passed to the worker class. 113 | See :class:`distributed.worker.Worker` for default worker class. 114 | If you set ``worker_module`` then refer to the docstring for the custom worker class. 115 | scheduler_options: dict 116 | Params to be passed to the scheduler class. 117 | See :class:`distributed.scheduler.Scheduler`. 118 | docker_image: string (optional) 119 | The Docker image to run on all instances. 120 | 121 | This image must have a valid Python environment and have ``dask`` installed in order for the 122 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended the Python 123 | environment matches your local environment where ``DropletCluster`` is being created from. 124 | 125 | For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed. 126 | 127 | By default the ``daskdev/dask:latest`` image will be used. 128 | docker_args: string (optional) 129 | Extra command line arguments to pass to Docker. 130 | extra_bootstrap: list[str] (optional) 131 | Extra commands to be run during the bootstrap phase. 132 | env_vars: dict (optional) 133 | Environment variables to be passed to the worker. 134 | silence_logs: bool 135 | Whether or not we should silence logging when setting up the cluster. 136 | asynchronous: bool 137 | If this is intended to be used directly within an event loop with 138 | async/await 139 | security : Security or bool, optional 140 | Configures communication security in this cluster. Can be a security 141 | object, or True. If True, temporary self-signed credentials will 142 | be created automatically.
Default is ``True``. 143 | debug: bool, optional 144 | More information will be printed when constructing clusters to enable debugging. 145 | 146 | Examples 147 | -------- 148 | 149 | Create the cluster. 150 | 151 | >>> from dask_cloudprovider.digitalocean import DropletCluster 152 | >>> cluster = DropletCluster(n_workers=1) 153 | Creating scheduler instance 154 | Created droplet dask-38b817c1-scheduler 155 | Waiting for scheduler to run 156 | Scheduler is running 157 | Creating worker instance 158 | Created droplet dask-38b817c1-worker-dc95260d 159 | 160 | Connect a client. 161 | 162 | >>> from dask.distributed import Client 163 | >>> client = Client(cluster) 164 | 165 | Do some work. 166 | 167 | >>> import dask.array as da 168 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100)) 169 | >>> arr.mean().compute() 170 | 0.5001550986751964 171 | 172 | Close the cluster 173 | 174 | >>> client.close() 175 | >>> cluster.close() 176 | Terminated droplet dask-38b817c1-worker-dc95260d 177 | Terminated droplet dask-38b817c1-scheduler 178 | 179 | You can also do this all in one go with context managers to ensure the cluster is 180 | created and cleaned up. 181 | 182 | >>> with DropletCluster(n_workers=1) as cluster: 183 | ... with Client(cluster) as client: 184 | ... print(da.random.random((1000, 1000), chunks=(100, 100)).mean().compute()) 185 | Creating scheduler instance 186 | Created droplet dask-48efe585-scheduler 187 | Waiting for scheduler to run 188 | Scheduler is running 189 | Creating worker instance 190 | Created droplet dask-48efe585-worker-5181aaf1 191 | 0.5000558682356162 192 | Terminated droplet dask-48efe585-worker-5181aaf1 193 | Terminated droplet dask-48efe585-scheduler 194 | 195 | """ 196 | 197 | def __init__( 198 | self, 199 | region: str = None, 200 | size: str = None, 201 | image: str = None, 202 | debug: bool = False, 203 | **kwargs, 204 | ): 205 | self.config = dask.config.get("cloudprovider.digitalocean", {}) 206 | self.scheduler_class = DropletScheduler 207 | self.worker_class = DropletWorker 208 | self.debug = debug 209 | self.options = { 210 | "cluster": self, 211 | "config": self.config, 212 | "region": region if region is not None else self.config.get("region"), 213 | "size": size if size is not None else self.config.get("size"), 214 | "image": image if image is not None else self.config.get("image"), 215 | } 216 | self.scheduler_options = {**self.options} 217 | self.worker_options = {**self.options} 218 | super().__init__(debug=debug, **kwargs) 219 | -------------------------------------------------------------------------------- /dask_cloudprovider/digitalocean/tests/test_droplet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | 5 | digitalocean = pytest.importorskip("digitalocean") 6 | 7 | from dask_cloudprovider.digitalocean.droplet import DropletCluster 8 | from dask.distributed import Client 9 | from distributed.core import Status 10 | 11 | 12 | async def skip_without_credentials(config): 13 | if config.get("token") is None: 14 | pytest.skip( 15 | """ 16 | You must configure a Digital Ocean API token to run this test. 
17 | 18 | Either set this in your config 19 | 20 | # cloudprovider.yaml 21 | cloudprovider: 22 | digitalocean: 23 | token: "yourtoken" 24 | 25 | Or by setting it as an environment variable 26 | 27 | export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken" 28 | 29 | """ 30 | ) 31 | 32 | 33 | @pytest.fixture 34 | async def config(): 35 | return dask.config.get("cloudprovider.digitalocean", {}) 36 | 37 | 38 | @pytest.fixture 39 | @pytest.mark.external 40 | async def cluster(config): 41 | await skip_without_credentials(config) 42 | async with DropletCluster(asynchronous=True) as cluster: 43 | yield cluster 44 | 45 | 46 | @pytest.mark.asyncio 47 | @pytest.mark.external 48 | async def test_init(): 49 | cluster = DropletCluster(asynchronous=True) 50 | assert cluster.status == Status.created 51 | 52 | 53 | @pytest.mark.asyncio 54 | @pytest.mark.timeout(600) 55 | @pytest.mark.external 56 | async def test_create_cluster(cluster): 57 | assert cluster.status == Status.running 58 | 59 | cluster.scale(1) 60 | await cluster 61 | assert len(cluster.workers) == 1 62 | 63 | async with Client(cluster, asynchronous=True) as client: 64 | 65 | def inc(x): 66 | return x + 1 67 | 68 | assert await client.submit(inc, 10).result() == 11 69 | 70 | 71 | @pytest.mark.asyncio 72 | async def test_get_cloud_init(): 73 | cloud_init = DropletCluster.get_cloud_init( 74 | docker_args="--privileged", 75 | ) 76 | assert " --privileged " in cloud_init 77 | -------------------------------------------------------------------------------- /dask_cloudprovider/exceptions.py: -------------------------------------------------------------------------------- 1 | class ConfigError(Exception): 2 | """Raised when required config is missing""" 3 | -------------------------------------------------------------------------------- /dask_cloudprovider/gcp/__init__.py: -------------------------------------------------------------------------------- 1 | from .instances import GCPCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/gcp/tests/test_gcp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | from dask_cloudprovider.gcp.instances import ( 5 | GCPCluster, 6 | GCPCompute, 7 | GCPCredentialsError, 8 | ) 9 | from dask.distributed import Client 10 | from distributed.core import Status 11 | 12 | 13 | def skip_without_credentials(): 14 | try: 15 | _ = GCPCompute() 16 | except GCPCredentialsError: 17 | pytest.skip( 18 | """ 19 | You must configure your GCP credentials to run this test. 20 | 21 | $ google auth login 22 | 23 | or 24 | 25 | $ export GOOGLE_APPLICATION_CREDENTIALS= 26 | 27 | """ 28 | ) 29 | 30 | if not dask.config.get("cloudprovider.gcp.projectid"): 31 | pytest.skip( 32 | """ 33 | You must configure your Google project ID to run this test. 
34 | 35 | # ~/.config/dask/cloudprovider.yaml 36 | cloudprovider: 37 | gcp: 38 | projectid: "YOUR PROJECT ID" 39 | 40 | or 41 | 42 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID" 43 | 44 | """ 45 | ) 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_init(): 50 | skip_without_credentials() 51 | 52 | cluster = GCPCluster(asynchronous=True) 53 | assert cluster.status == Status.created 54 | 55 | 56 | @pytest.mark.asyncio 57 | async def test_get_cloud_init(): 58 | skip_without_credentials() 59 | cloud_init = GCPCluster.get_cloud_init( 60 | security=True, 61 | docker_args="--privileged", 62 | extra_bootstrap=["gcloud auth print-access-token"], 63 | ) 64 | assert "dask-scheduler" in cloud_init 65 | assert "# Bootstrap" in cloud_init 66 | assert " --privileged " in cloud_init 67 | assert "- gcloud auth print-access-token" in cloud_init 68 | 69 | 70 | @pytest.mark.asyncio 71 | @pytest.mark.timeout(1200) 72 | @pytest.mark.external 73 | async def test_create_cluster(): 74 | skip_without_credentials() 75 | 76 | async with GCPCluster( 77 | asynchronous=True, env_vars={"FOO": "bar"}, security=True 78 | ) as cluster: 79 | assert cluster.status == Status.running 80 | 81 | cluster.scale(2) 82 | await cluster 83 | assert len(cluster.workers) == 2 84 | 85 | async with Client(cluster, asynchronous=True) as client: 86 | 87 | def inc(x): 88 | return x + 1 89 | 90 | def check_env(): 91 | import os 92 | 93 | return os.environ["FOO"] 94 | 95 | assert await client.submit(inc, 10).result() == 11 96 | assert await client.submit(check_env).result() == "bar" 97 | 98 | 99 | @pytest.mark.asyncio 100 | @pytest.mark.timeout(1200) 101 | @pytest.mark.external 102 | async def test_create_cluster_sync(): 103 | skip_without_credentials() 104 | 105 | cluster = GCPCluster(n_workers=1) 106 | client = Client(cluster) 107 | 108 | def inc(x): 109 | return x + 1 110 | 111 | assert client.submit(inc, 10).result() == 11 112 | 113 | 114 | @pytest.mark.asyncio 115 | @pytest.mark.timeout(1200) 116 | @pytest.mark.external 117 | async def test_create_rapids_cluster(): 118 | skip_without_credentials() 119 | 120 | async with GCPCluster( 121 | source_image="projects/nv-ai-infra/global/images/ngc-docker-11-20200916", 122 | zone="us-east1-c", 123 | machine_type="n1-standard-1", 124 | filesystem_size=50, 125 | ngpus=2, 126 | gpu_type="nvidia-tesla-t4", 127 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9", 128 | worker_class="dask_cuda.CUDAWorker", 129 | worker_options={"rmm_pool_size": "15GB"}, 130 | asynchronous=True, 131 | auto_shutdown=True, 132 | bootstrap=False, 133 | ) as cluster: 134 | assert cluster.status == Status.running 135 | 136 | cluster.scale(1) 137 | 138 | await cluster 139 | 140 | assert len(cluster.workers) == 1 141 | 142 | client = Client(cluster, asynchronous=True) # noqa 143 | await client 144 | 145 | def gpu_mem(): 146 | from pynvml.smi import nvidia_smi 147 | 148 | nvsmi = nvidia_smi.getInstance() 149 | return nvsmi.DeviceQuery("memory.free, memory.total") 150 | 151 | results = await client.run(gpu_mem) 152 | for w, res in results.items(): 153 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys() 154 | print(res) 155 | 156 | 157 | @pytest.mark.timeout(1200) 158 | @pytest.mark.external 159 | def test_create_rapids_cluster_sync(): 160 | skip_without_credentials() 161 | cluster = GCPCluster( 162 | source_image="projects/nv-ai-infra/global/images/packer-1607527229", 163 | network="dask-gcp-network-test", 164 | zone="us-east1-c", 165 | machine_type="n1-standard-1", 166 | 
filesystem_size=50, 167 | ngpus=2, 168 | gpu_type="nvidia-tesla-t4", 169 | docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.9", 170 | worker_class="dask_cuda.CUDAWorker", 171 | worker_options={"rmm_pool_size": "15GB"}, 172 | asynchronous=False, 173 | bootstrap=False, 174 | ) 175 | 176 | cluster.scale(1) 177 | 178 | client = Client(cluster) # noqa 179 | client.wait_for_workers(2) 180 | 181 | def gpu_mem(): 182 | from pynvml.smi import nvidia_smi 183 | 184 | nvsmi = nvidia_smi.getInstance() 185 | return nvsmi.DeviceQuery("memory.free, memory.total") 186 | 187 | results = client.run(gpu_mem) 188 | for w, res in results.items(): 189 | assert "total" in res["gpu"][0]["fb_memory_usage"].keys() 190 | print(res) 191 | cluster.close() 192 | -------------------------------------------------------------------------------- /dask_cloudprovider/gcp/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dask_cloudprovider.gcp.utils import build_request, is_inside_gce 4 | 5 | 6 | def test_build_request(): 7 | assert build_request()(None, lambda x: x, "https://example.com") 8 | 9 | 10 | @pytest.mark.xfail( 11 | is_inside_gce(), reason="Fails if you run this test on GCE environment" 12 | ) 13 | def test_is_gce_env(): 14 | # Note: this test isn't super valuable, but at least we run the code 15 | assert is_inside_gce() is False 16 | -------------------------------------------------------------------------------- /dask_cloudprovider/gcp/utils.py: -------------------------------------------------------------------------------- 1 | import httplib2 2 | import googleapiclient.http 3 | import google_auth_httplib2 4 | 5 | 6 | def build_request(credentials=None): 7 | def inner(http, *args, **kwargs): 8 | new_http = httplib2.Http() 9 | if credentials is not None: 10 | new_http = google_auth_httplib2.AuthorizedHttp(credentials, http=new_http) 11 | 12 | return googleapiclient.http.HttpRequest(new_http, *args, **kwargs) 13 | 14 | return inner 15 | 16 | 17 | def is_inside_gce() -> bool: 18 | """ 19 | Returns True is the client is running in the GCE environment, 20 | False otherwise. 
21 | 22 | Doc: https://cloud.google.com/compute/docs/storing-retrieving-metadata 23 | """ 24 | h = httplib2.Http() 25 | try: 26 | resp_headers, _ = h.request( 27 | "http://metadata.google.internal/computeMetadata/v1/", 28 | headers={"metadata-flavor": "Google"}, 29 | method="GET", 30 | ) 31 | except (httplib2.HttpLib2Error, OSError): 32 | return False 33 | return True 34 | -------------------------------------------------------------------------------- /dask_cloudprovider/generic/cloud-init.yaml.j2: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | 3 | {% if bootstrap %} 4 | # Bootstrap 5 | packages: 6 | - apt-transport-https 7 | - ca-certificates 8 | - curl 9 | - gnupg-agent 10 | - software-properties-common 11 | - ubuntu-drivers-common 12 | 13 | # Enable ipv4 forwarding, required on CIS hardened machines 14 | write_files: 15 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf 16 | content: | 17 | net.ipv4.conf.all.forwarding=1 18 | 19 | # create the docker group 20 | groups: 21 | - docker 22 | 23 | # Add default auto created user to docker group 24 | system_info: 25 | default_user: 26 | groups: [docker] 27 | {% endif %} 28 | 29 | runcmd: 30 | {% if bootstrap %} 31 | # Install Docker 32 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - 33 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 34 | - apt-get update -y 35 | - apt-get install -y docker-ce docker-ce-cli containerd.io 36 | - systemctl start docker 37 | - systemctl enable docker 38 | {% endif %} 39 | 40 | {% if bootstrap and gpu_instance %} 41 | # Install NVIDIA driver 42 | - DEBIAN_FRONTEND=noninteractive ubuntu-drivers install 43 | 44 | # Install NVIDIA docker 45 | - curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 46 | - curl -s -L https://nvidia.github.io/nvidia-docker/$(. 
/etc/os-release;echo $ID$VERSION_ID)/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 47 | - apt-get update -y 48 | - apt-get install -y nvidia-docker2 49 | - systemctl restart docker 50 | {% endif %} 51 | 52 | {% if extra_bootstrap %} 53 | {% for command in extra_bootstrap %} 54 | - {{ command }} 55 | {% endfor %} 56 | {% endif %} 57 | 58 | # Run container 59 | - 'docker run --net=host {%+ if gpu_instance %}--gpus=all{% endif %} {% for key in env_vars %} -e {{key}}="{{env_vars[key]}}" {% endfor %}{%+ if docker_args %}{{docker_args}}{% endif %} {{image}} {{ command }}' 60 | 61 | {% if auto_shutdown %} 62 | # Shutdown when command is done 63 | - shutdown -h now 64 | {% endif %} 65 | -------------------------------------------------------------------------------- /dask_cloudprovider/generic/tests/test_vmcluster.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import asyncio 4 | import time 5 | 6 | from dask_cloudprovider.generic.vmcluster import VMCluster, VMInterface 7 | 8 | 9 | class DummyWorker(VMInterface): 10 | """A dummy worker for testing.""" 11 | 12 | 13 | class DummyScheduler(VMInterface): 14 | """A dummy scheduler for testing.""" 15 | 16 | 17 | class DummyCluster(VMCluster): 18 | """A dummy cluster for testing.""" 19 | 20 | scheduler_class = DummyScheduler 21 | worker_class = DummyWorker 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_init(): 26 | with pytest.raises(RuntimeError): 27 | _ = VMCluster(asynchronous=True) 28 | 29 | 30 | @pytest.mark.asyncio 31 | async def test_call_async(): 32 | cluster = DummyCluster(asynchronous=True) 33 | 34 | def blocking(string): 35 | time.sleep(0.1) 36 | return string 37 | 38 | start = time.time() 39 | 40 | a, b, c, d = await asyncio.gather( 41 | cluster.call_async(blocking, "hello"), 42 | cluster.call_async(blocking, "world"), 43 | cluster.call_async(blocking, "foo"), 44 | cluster.call_async(blocking, "bar"), 45 | ) 46 | 47 | assert a == "hello" 48 | assert b == "world" 49 | assert c == "foo" 50 | assert d == "bar" 51 | 52 | # Each call to ``blocking`` takes 0.1 seconds, but they should've been run concurrently. 
53 | assert time.time() - start < 0.2 54 | 55 | await cluster.close() 56 | -------------------------------------------------------------------------------- /dask_cloudprovider/generic/vmcluster.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import uuid 5 | 6 | from jinja2 import Environment, FileSystemLoader 7 | 8 | import dask.config 9 | from distributed.core import Status 10 | from distributed.worker import Worker as _Worker 11 | from distributed.scheduler import Scheduler as _Scheduler 12 | from distributed.security import Security 13 | from distributed.deploy.spec import SpecCluster, ProcessInterface 14 | from distributed.utils import warn_on_duration, cli_keywords 15 | 16 | from dask_cloudprovider.utils.socket import is_socket_open 17 | 18 | 19 | class VMInterface(ProcessInterface): 20 | """A superclass for VM Schedulers, Workers and Nannies.""" 21 | 22 | def __init__(self, docker_args: str = "", extra_bootstrap: list = None, **kwargs): 23 | super().__init__() 24 | self.name = None 25 | self.command = None 26 | self.address = None 27 | self.cluster = None 28 | self.gpu_instance = None 29 | self.bootstrap = None 30 | self.docker_image = "daskdev/dask:latest" 31 | self.docker_args = docker_args 32 | self.extra_bootstrap = extra_bootstrap 33 | self.auto_shutdown = True 34 | self.set_env = 'env DASK_INTERNAL_INHERIT_CONFIG="{}"'.format( 35 | dask.config.serialize(dask.config.global_config) 36 | ) 37 | self.kwargs = kwargs 38 | 39 | async def create_vm(self): 40 | raise NotImplementedError("create_vm is a required method of the VMInterface") 41 | 42 | async def destroy_vm(self): 43 | raise NotImplementedError("destroy_vm is a required method of the VMInterface") 44 | 45 | async def wait_for_scheduler(self): 46 | if self.external_address: 47 | _, address = self.external_address.split("://") 48 | else: 49 | _, address = self.address.split("://") 50 | ip, port = address.split(":") 51 | 52 | self.cluster._log(f"Waiting for scheduler to run at {ip}:{port}") 53 | while not is_socket_open(ip, port): 54 | await asyncio.sleep(0.1) 55 | self.cluster._log("Scheduler is running") 56 | 57 | async def start(self): 58 | """Create a VM.""" 59 | await super().start() 60 | 61 | async def close(self): 62 | """Destroy a VM.""" 63 | await self.destroy_vm() 64 | await super().close() 65 | 66 | async def call_async(self, f, *args, **kwargs): 67 | """Run a blocking function in a thread as a coroutine.""" 68 | return await self.cluster.call_async(f, *args, **kwargs) 69 | 70 | 71 | class SchedulerMixin(object): 72 | """A mixin for Schedulers.""" 73 | 74 | def __init__( 75 | self, 76 | *args, 77 | scheduler_options: dict = {}, 78 | **kwargs, 79 | ): 80 | super().__init__(*args, **kwargs) 81 | self.name = f"dask-{self.cluster.uuid}-scheduler" 82 | self.port = scheduler_options.get("port", 8786) 83 | self.command = " ".join( 84 | [ 85 | self.set_env, 86 | "python", 87 | "-m", 88 | "distributed.cli.dask_scheduler", 89 | ] 90 | + cli_keywords(scheduler_options, cls=_Scheduler) 91 | ) 92 | 93 | async def start(self): 94 | self.cluster._log("Creating scheduler instance") 95 | 96 | internal_ip, external_ip = await self.create_vm() 97 | self.address = f"{self.cluster.protocol}://{internal_ip}:{self.port}" 98 | if external_ip: 99 | self.external_address = ( 100 | f"{self.cluster.protocol}://{external_ip}:{self.port}" 101 | ) 102 | 103 | await self.wait_for_scheduler() 104 | await super().start() 105 | 106 | 107 | class WorkerMixin(object): 108
| """A Remote Dask Worker running on a VM.""" 109 | 110 | def __init__( 111 | self, 112 | scheduler: str, 113 | *args, 114 | worker_module: str = None, 115 | worker_class: str = None, 116 | worker_options: dict = {}, 117 | **kwargs, 118 | ): 119 | super().__init__(*args, **kwargs) 120 | self.scheduler = scheduler 121 | self.name = f"dask-{self.cluster.uuid}-worker-{str(uuid.uuid4())[:8]}" 122 | if worker_module is not None: 123 | self.worker_module = worker_module 124 | 125 | self.command = " ".join( 126 | [ 127 | self.set_env, 128 | "python", 129 | "-m", 130 | self.worker_module, 131 | self.scheduler, 132 | "--name", 133 | str(self.name), 134 | ] 135 | + cli_keywords(worker_options, cls=_Worker, cmd=self.worker_module) 136 | ) 137 | if worker_class is not None: 138 | self.worker_class = worker_class 139 | self.command = " ".join( 140 | [ 141 | self.set_env, 142 | "python", 143 | "-m", 144 | "distributed.cli.dask_spec", 145 | self.scheduler, 146 | "--spec", 147 | "''%s''" # in yaml double single quotes escape the single quote 148 | % json.dumps( 149 | { 150 | "cls": self.worker_class, 151 | "opts": { 152 | **worker_options, 153 | "name": self.name, 154 | }, 155 | } 156 | ), 157 | ] 158 | ) 159 | 160 | async def start(self): 161 | self.cluster._log("Creating worker instance") 162 | self.address, _ = await self.create_vm() 163 | await super().start() 164 | 165 | 166 | class VMCluster(SpecCluster): 167 | """A base class for Virtual Machine based cluster managers. 168 | 169 | This class holds logic around starting a scheduler and workers as VMs. This class 170 | is not intended to be used directly but instead should be subclassed and the attributes 171 | ``scheduler_class`` and ``worker_class`` should be set. 172 | 173 | The scheduler class should be a subclass of ``VMInterface`` with the ``SchedulerMixin``. 174 | The worker class should be a subclass of ``VMInterface`` with the ``WorkerMixin``. 175 | 176 | See ``VMInterface`` docstring for required methods. 177 | 178 | For a reference implementation see :class:`DropletCluster`. 179 | 180 | The following paramaters section should be copied to the subclass docstring and appended 181 | to the provider specific paramaters. 182 | 183 | Parameters 184 | ---------- 185 | n_workers: int 186 | Number of workers to initialise the cluster with. Defaults to ``0``. 187 | worker_module: str 188 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker`` 189 | worker_options: dict 190 | Params to be passed to the worker class. 191 | See :class:`distributed.worker.Worker` for default worker class. 192 | If you set ``worker_module`` then refer to the docstring for the custom worker class. 193 | scheduler_options: dict 194 | Params to be passed to the scheduler class. 195 | See :class:`distributed.scheduler.Scheduler`. 196 | docker_image: string (optional) 197 | The Docker image to run on all instances. 198 | 199 | This image must have a valid Python environment and have ``dask`` installed in order for the 200 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended the Python 201 | environment matches your local environment where ``EC2Cluster`` is being created from. 202 | 203 | For GPU instance types the Docker image much have NVIDIA drivers and ``dask-cuda`` installed. 204 | 205 | By default the ``daskdev/dask:latest`` image will be used. 206 | docker_args: string (optional) 207 | Extra command line arguments to pass to Docker. 
208 | extra_bootstrap: list[str] (optional) 209 | Extra commands to be run during the bootstrap phase. 210 | silence_logs: bool 211 | Whether or not we should silence logging when setting up the cluster. 212 | asynchronous: bool 213 | If this is intended to be used directly within an event loop with 214 | async/await 215 | security: Security or bool, optional 216 | Configures communication security in this cluster. Can be a security 217 | object, or True. If True, temporary self-signed credentials will 218 | be created automatically. Default is ``True``. 219 | debug: bool, optional 220 | More information will be printed when constructing clusters to enable debugging. 221 | 222 | """ 223 | 224 | scheduler_class = None 225 | worker_class = None 226 | options = {} 227 | scheduler_options = {} 228 | worker_options = {} 229 | docker_image = None 230 | command = None 231 | gpu_instance = None 232 | bootstrap = None 233 | auto_shutdown = None 234 | 235 | def __init__( 236 | self, 237 | n_workers: int = 0, 238 | worker_class: str = "dask.distributed.Nanny", 239 | worker_options: dict = {}, 240 | scheduler_options: dict = {}, 241 | docker_image="daskdev/dask:latest", 242 | docker_args: str = "", 243 | extra_bootstrap: list = None, 244 | env_vars: dict = {}, 245 | security: bool = True, 246 | protocol: str = None, 247 | debug: bool = False, 248 | **kwargs, 249 | ): 250 | if self.scheduler_class is None or self.worker_class is None: 251 | raise RuntimeError( 252 | "VMCluster is not intended to be used directly. See docstring for more info." 253 | ) 254 | self._n_workers = n_workers 255 | 256 | if not security: 257 | self.security = None 258 | elif security is True: 259 | # True indicates self-signed temporary credentials should be used 260 | self.security = Security.temporary() 261 | elif not isinstance(security, Security): 262 | raise TypeError("security must be a Security object") 263 | else: 264 | self.security = security 265 | 266 | if protocol is None: 267 | if self.security and self.security.require_encryption: 268 | self.protocol = "tls" 269 | else: 270 | self.protocol = "tcp" 271 | else: 272 | self.protocol = protocol 273 | 274 | self.debug = debug 275 | 276 | if self.security and self.security.require_encryption: 277 | dask.config.set( 278 | { 279 | "distributed.comm.default-scheme": self.protocol, 280 | "distributed.comm.require-encryption": True, 281 | "distributed.comm.tls.ca-file": self.security.tls_ca_file, 282 | "distributed.comm.tls.scheduler.key": self.security.tls_scheduler_key, 283 | "distributed.comm.tls.scheduler.cert": self.security.tls_scheduler_cert, 284 | "distributed.comm.tls.worker.key": self.security.tls_worker_key, 285 | "distributed.comm.tls.worker.cert": self.security.tls_worker_cert, 286 | "distributed.comm.tls.client.key": self.security.tls_client_key, 287 | "distributed.comm.tls.client.cert": self.security.tls_client_cert, 288 | } 289 | ) 290 | 291 | image = self.scheduler_options.get("docker_image", False) or docker_image 292 | self.options["docker_image"] = image 293 | self.scheduler_options["docker_image"] = image 294 | self.scheduler_options["env_vars"] = env_vars 295 | self.scheduler_options["protocol"] = protocol 296 | self.scheduler_options["scheduler_options"] = scheduler_options 297 | self.scheduler_options["extra_bootstrap"] = extra_bootstrap 298 | self.worker_options["env_vars"] = env_vars 299 | self.options["docker_args"] = docker_args 300 | self.options["extra_bootstrap"] = extra_bootstrap 301 | self.scheduler_options["docker_args"] = docker_args 302 | 
self.worker_options["docker_args"] = docker_args 303 | self.worker_options["docker_image"] = image 304 | self.worker_options["worker_class"] = worker_class 305 | self.worker_options["protocol"] = protocol 306 | self.worker_options["worker_options"] = worker_options 307 | self.worker_options["extra_bootstrap"] = extra_bootstrap 308 | self.uuid = str(uuid.uuid4())[:8] 309 | 310 | super().__init__(**kwargs, security=self.security) 311 | 312 | async def call_async(self, f, *args, **kwargs): 313 | """Run a blocking function in a thread as a coroutine. 314 | 315 | This can only be used to make IO-bound operations non-blocking due to the GIL. 316 | 317 | As of Python 3.9 this can be replaced with :func:`asyncio.to_thread`. 318 | Once 3.9 is our minimum supported version this can be removed/replaced. 319 | 320 | """ 321 | [done], _ = await asyncio.wait( 322 | fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))}, 323 | return_when=asyncio.ALL_COMPLETED, 324 | ) 325 | return done.result() 326 | 327 | async def _start( 328 | self, 329 | ): 330 | while self.status == Status.starting: 331 | await asyncio.sleep(0.01) 332 | if self.status == Status.running: 333 | return 334 | if self.status == Status.closed: 335 | raise ValueError("Cluster is closed") 336 | 337 | self.scheduler_spec = { 338 | "cls": self.scheduler_class, 339 | "options": self.scheduler_options, 340 | } 341 | self.new_spec = {"cls": self.worker_class, "options": self.worker_options} 342 | self.worker_spec = { 343 | self._new_worker_name(i): self.new_spec for i in range(self._n_workers) 344 | } 345 | 346 | with warn_on_duration( 347 | "10s", 348 | "Creating your cluster is taking a surprisingly long time. " 349 | "This is likely due to pending resources. " 350 | "Hang tight! ", 351 | ): 352 | await super()._start() 353 | 354 | def render_process_cloud_init(self, process): 355 | return self.render_cloud_init( 356 | image=process.docker_image, 357 | command=process.command, 358 | docker_args=process.docker_args, 359 | extra_bootstrap=process.extra_bootstrap, 360 | gpu_instance=process.gpu_instance, 361 | bootstrap=process.bootstrap, 362 | auto_shutdown=process.auto_shutdown, 363 | env_vars=process.env_vars, 364 | ) 365 | 366 | def render_cloud_init(self, *args, **kwargs): 367 | loader = FileSystemLoader([os.path.dirname(os.path.abspath(__file__))]) 368 | environment = Environment(loader=loader) 369 | template = environment.get_template("cloud-init.yaml.j2") 370 | cloud_init = template.render(**kwargs) 371 | if self.debug: 372 | print("\nCloud init\n==========\n\n") 373 | print(cloud_init) 374 | return cloud_init 375 | 376 | @classmethod 377 | def get_cloud_init(cls, *args, **kwargs): 378 | cluster = cls(*args, asynchronous=True, **kwargs) 379 | cluster.auto_shutdown = False 380 | return cluster.render_cloud_init( 381 | image=cluster.options["docker_image"], 382 | command="dask-scheduler --version", 383 | docker_args=cluster.options["docker_args"], 384 | extra_bootstrap=cluster.options["extra_bootstrap"], 385 | gpu_instance=cluster.gpu_instance, 386 | bootstrap=cluster.bootstrap, 387 | auto_shutdown=cluster.auto_shutdown, 388 | env_vars=cluster.worker_options["env_vars"], 389 | ) 390 | 391 | def get_tags(self): 392 | """Generate tags to be applied to all resources.""" 393 | return {"creator": "dask-cloudprovider", "cluster-id": self.uuid} 394 | -------------------------------------------------------------------------------- /dask_cloudprovider/hetzner/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .vserver import HetznerCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/hetzner/tests/test_vserver.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | 5 | hetzner = pytest.importorskip("hcloud") 6 | 7 | from dask_cloudprovider.hetzner.vserver import HetznerCluster 8 | from dask.distributed import Client 9 | from distributed.core import Status 10 | 11 | 12 | async def skip_without_credentials(config): 13 | if config.get("token") is None: 14 | pytest.skip( 15 | """ 16 | You must configure a Hetzner API token to run this test. 17 | 18 | Either set this in your config 19 | 20 | # cloudprovider.yaml 21 | cloudprovider: 22 | hetzner: 23 | token: "yourtoken" 24 | 25 | Or by setting it as an environment variable 26 | 27 | export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken" 28 | 29 | """ 30 | ) 31 | 32 | 33 | @pytest.fixture 34 | async def config(): 35 | return dask.config.get("cloudprovider.hetzner", {}) 36 | 37 | 38 | @pytest.fixture 39 | @pytest.mark.external 40 | async def cluster(config): 41 | await skip_without_credentials(config) 42 | async with HetznerCluster(asynchronous=True) as cluster: 43 | yield cluster 44 | 45 | 46 | @pytest.mark.asyncio 47 | async def test_init(): 48 | cluster = HetznerCluster(asynchronous=True) 49 | assert cluster.status == Status.created 50 | 51 | 52 | @pytest.mark.asyncio 53 | @pytest.mark.timeout(600) 54 | async def test_create_cluster(cluster): 55 | assert cluster.status == Status.running 56 | 57 | cluster.scale(1) 58 | await cluster 59 | assert len(cluster.workers) == 1 60 | 61 | async with Client(cluster, asynchronous=True) as client: 62 | 63 | def inc(x): 64 | return x + 1 65 | 66 | assert await client.submit(inc, 10).result() == 11 67 | 68 | 69 | @pytest.mark.asyncio 70 | async def test_get_cloud_init(): 71 | cloud_init = HetznerCluster.get_cloud_init( 72 | docker_args="--privileged", 73 | ) 74 | assert " --privileged " in cloud_init 75 | -------------------------------------------------------------------------------- /dask_cloudprovider/hetzner/vserver.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dask 3 | 4 | from dask_cloudprovider.generic.vmcluster import ( 5 | VMCluster, 6 | VMInterface, 7 | SchedulerMixin, 8 | WorkerMixin, 9 | ) 10 | 11 | try: 12 | import hcloud 13 | except ImportError as e: 14 | msg = ( 15 | "Dask Cloud Provider Hetzner requirements are not installed.\n\n" 16 | "Please pip install as follows:\n\n" 17 | ' pip install "dask-cloudprovider[hcloud]" --upgrade # or python -m pip install' 18 | ) 19 | raise ImportError(msg) from e 20 | 21 | from hcloud.images.domain import Image 22 | from hcloud.server_types.domain import ServerType 23 | from hcloud.actions.domain import Action 24 | 25 | 26 | class VServer(VMInterface): 27 | def __init__( 28 | self, 29 | cluster: str, 30 | config, 31 | env_vars: dict = None, 32 | bootstrap=None, 33 | extra_bootstrap=None, 34 | docker_image: str = None, 35 | image: str = None, 36 | location: str = None, 37 | server_type: str = None, 38 | *args, 39 | **kwargs, 40 | ): 41 | super().__init__(*args, **kwargs) 42 | self.cluster = cluster 43 | self.config = config 44 | self.location = location 45 | self.bootstrap = bootstrap 46 | self.extra_bootstrap = extra_bootstrap 47 | self.env_vars = env_vars 48 | self.client = 
hcloud.Client(self.config.get("token")) 49 | self.server_type = ServerType(server_type) 50 | self.image = Image(name=image) 51 | self.docker_image = docker_image 52 | 53 | async def create_vm(self): 54 | await self.call_async( 55 | self.client.servers.create, 56 | server_type=self.server_type, 57 | image=self.image, 58 | name=self.name, 59 | user_data=self.cluster.render_process_cloud_init(self), 60 | ) 61 | 62 | self.server = self.client.servers.get_by_name(self.name) 63 | for action in self.server.get_actions(): 64 | while action.status != Action.STATUS_SUCCESS: 65 | await self.call_async(action.reload) 66 | await asyncio.sleep(0.1) 67 | self.cluster._log(f"Created Hetzner vServer {self.name}") 68 | 69 | return self.server.public_net.ipv4.ip, None 70 | 71 | async def destroy_vm(self): 72 | await self.call_async(self.client.servers.delete, server=self.server) 73 | self.cluster._log(f"Terminated vServer {self.name}") 74 | 75 | 76 | class HetznerScheduler(SchedulerMixin, VServer): 77 | """Scheduler running on a Hetzner server.""" 78 | 79 | 80 | class HetznerWorker(WorkerMixin, VServer): 81 | """Worker running on a Hetzner server.""" 82 | 83 | 84 | class HetznerCluster(VMCluster): 85 | """Cluster running on Hetzner cloud vServers. 86 | 87 | VMs in Hetzner are referred to as vServers. This cluster manager constructs a Dask cluster 88 | running on VMs. 89 | 90 | When configuring your cluster you may find it useful to install the ``hcloud`` tool for querying the 91 | Hetzner API for available options. 92 | 93 | https://github.com/hetznercloud/cli 94 | 95 | Parameters 96 | ---------- 97 | image: str 98 | The image to use for the host OS. This should be a Ubuntu variant. 99 | You can list available images with ``hcloud image list|grep Ubuntu``. 100 | location: str 101 | The Hetzner location to launch you cluster in. A full list can be obtained with ``hcloud location list``. 102 | server_type: str 103 | The VM server type. You can get a full list with ``hcloud server-type list``. 104 | The default is ``cx11`` which is vServer with 2GB RAM and 1 vCPU. 105 | n_workers: int 106 | Number of workers to initialise the cluster with. Defaults to ``0``. 107 | worker_module: str 108 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker`` 109 | worker_options: dict 110 | Params to be passed to the worker class. 111 | See :class:`distributed.worker.Worker` for default worker class. 112 | If you set ``worker_module`` then refer to the docstring for the custom worker class. 113 | scheduler_options: dict 114 | Params to be passed to the scheduler class. 115 | See :class:`distributed.scheduler.Scheduler`. 116 | env_vars: dict 117 | Environment variables to be passed to the worker. 118 | extra_bootstrap: list[str] (optional) 119 | Extra commands to be run during the bootstrap phase. 
120 | 121 | Example 122 | -------- 123 | 124 | >>> from dask_cloudprovider.hetzner import HetznerCluster 125 | >>> cluster = HetznerCluster(n_workers=1) 126 | 127 | >>> from dask.distributed import Client 128 | >>> client = Client(cluster) 129 | 130 | >>> import dask.array as da 131 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100)) 132 | >>> arr.mean().compute() 133 | 134 | >>> client.close() 135 | >>> cluster.close() 136 | 137 | """ 138 | 139 | def __init__( 140 | self, 141 | bootstrap: str = None, 142 | image: str = None, 143 | location: str = None, 144 | server_type: str = None, 145 | docker_image: str = None, 146 | **kwargs, 147 | ): 148 | self.config = dask.config.get("cloudprovider.hetzner", {}) 149 | 150 | self.scheduler_class = HetznerScheduler 151 | self.worker_class = HetznerWorker 152 | 153 | self.image = dask.config.get("cloudprovider.hetzner.image", override_with=image) 154 | self.docker_image = dask.config.get( 155 | "cloudprovider.hetzner.docker_image", override_with=docker_image 156 | ) 157 | self.location = dask.config.get( 158 | "cloudprovider.hetzner.location", override_with=location 159 | ) 160 | self.server_type = dask.config.get( 161 | "cloudprovider.hetzner.server_type", override_with=server_type 162 | ) 163 | self.bootstrap = dask.config.get( 164 | "cloudprovider.hetzner.bootstrap", override_with=bootstrap 165 | ) 166 | 167 | self.options = { 168 | "bootstrap": self.bootstrap, 169 | "cluster": self, 170 | "config": self.config, 171 | "docker_image": self.docker_image, 172 | "image": self.image, 173 | "location": self.location, 174 | "server_type": self.server_type, 175 | } 176 | self.scheduler_options = {**self.options} 177 | self.worker_options = {**self.options} 178 | super().__init__(**kwargs) 179 | -------------------------------------------------------------------------------- /dask_cloudprovider/ibm/__init__.py: -------------------------------------------------------------------------------- 1 | from .code_engine import IBMCodeEngineCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/ibm/tests/test_code_engine.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | 5 | codeengine = pytest.importorskip("ibm_code_engine_sdk.code_engine_v2") 6 | 7 | from dask_cloudprovider.ibm.code_engine import IBMCodeEngineCluster 8 | from dask.distributed import Client 9 | from distributed.core import Status 10 | 11 | 12 | async def skip_without_credentials(): 13 | if dask.config.get("cloudprovider.ibm.api_key") is None: 14 | pytest.skip( 15 | """ 16 | You must configure a IBM API key to run this test. 17 | 18 | Either set this in your config 19 | 20 | # cloudprovider.yaml 21 | cloudprovider: 22 | ibm: 23 | api_key: "your_api_key" 24 | 25 | Or by setting it as an environment variable 26 | 27 | export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key" 28 | 29 | """ 30 | ) 31 | 32 | if dask.config.get("cloudprovider.ibm.project_id") is None: 33 | pytest.skip( 34 | """ 35 | You must configure a IBM project id to run this test. 
36 | 37 | Either set this in your config 38 | 39 | # cloudprovider.yaml 40 | cloudprovider: 41 | ibm: 42 | project_id: "your_project_id" 43 | 44 | Or by setting it as an environment variable 45 | 46 | export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id" 47 | 48 | """ 49 | ) 50 | 51 | if dask.config.get("cloudprovider.ibm.region") is None: 52 | pytest.skip( 53 | """ 54 | You must configure a IBM project id to run this test. 55 | 56 | Either set this in your config 57 | 58 | # cloudprovider.yaml 59 | cloudprovider: 60 | ibm: 61 | region: "your_region" 62 | 63 | Or by setting it as an environment variable 64 | 65 | export DASK_CLOUDPROVIDER__IBM__REGION="your_region" 66 | 67 | """ 68 | ) 69 | 70 | 71 | @pytest.mark.asyncio 72 | async def test_init(): 73 | await skip_without_credentials() 74 | cluster = IBMCodeEngineCluster(asynchronous=True) 75 | assert cluster.status == Status.created 76 | 77 | 78 | @pytest.mark.asyncio 79 | @pytest.mark.timeout(1200) 80 | @pytest.mark.external 81 | async def test_create_cluster(): 82 | async with IBMCodeEngineCluster(asynchronous=True) as cluster: 83 | cluster.scale(2) 84 | await cluster 85 | assert len(cluster.workers) == 2 86 | 87 | async with Client(cluster, asynchronous=True) as client: 88 | 89 | def inc(x): 90 | return x + 1 91 | 92 | assert await client.submit(inc, 10).result() == 11 93 | 94 | 95 | @pytest.mark.asyncio 96 | @pytest.mark.timeout(1200) 97 | @pytest.mark.external 98 | async def test_create_cluster_sync(): 99 | with IBMCodeEngineCluster() as cluster: 100 | with Client(cluster) as client: 101 | cluster.scale(1) 102 | client.wait_for_workers(1) 103 | assert len(cluster.workers) == 1 104 | 105 | def inc(x): 106 | return x + 1 107 | 108 | assert client.submit(inc, 10).result() == 11 109 | -------------------------------------------------------------------------------- /dask_cloudprovider/nebius/__init__.py: -------------------------------------------------------------------------------- 1 | from .instances import NebiusCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/nebius/instances.py: -------------------------------------------------------------------------------- 1 | import dask 2 | 3 | from dask_cloudprovider.generic.vmcluster import ( 4 | VMCluster, 5 | VMInterface, 6 | SchedulerMixin, 7 | WorkerMixin, 8 | ) 9 | 10 | try: 11 | from nebius.api.nebius.common.v1 import ResourceMetadata 12 | from nebius.api.nebius.vpc.v1 import SubnetServiceClient, ListSubnetsRequest 13 | from nebius.sdk import SDK 14 | from nebius.api.nebius.compute.v1 import ( 15 | InstanceServiceClient, 16 | CreateInstanceRequest, 17 | DiskServiceClient, 18 | CreateDiskRequest, 19 | DiskSpec, 20 | SourceImageFamily, 21 | InstanceSpec, 22 | AttachedDiskSpec, 23 | ExistingDisk, 24 | ResourcesSpec, 25 | NetworkInterfaceSpec, 26 | IPAddress, 27 | PublicIPAddress, 28 | GetInstanceRequest, 29 | DeleteInstanceRequest, 30 | DeleteDiskRequest, 31 | ) 32 | except ImportError as e: 33 | msg = ( 34 | "Dask Cloud Provider Nebius requirements are not installed.\n\n" 35 | "Please pip install as follows:\n\n" 36 | ' pip install "dask-cloudprovider[nebius]" --upgrade # or python -m pip install' 37 | ) 38 | raise ImportError(msg) from e 39 | 40 | 41 | class NebiusInstance(VMInterface): 42 | def __init__( 43 | self, 44 | cluster: str, 45 | config, 46 | env_vars: dict = None, 47 | bootstrap=None, 48 | extra_bootstrap=None, 49 | docker_image: str = None, 50 | image_family: str = None, 51 | project_id: str = None, 52 | 
server_platform: str = None, 53 | server_preset: str = None, 54 | disk_size: int = None, 55 | *args, 56 | **kwargs, 57 | ): 58 | super().__init__(*args, **kwargs) 59 | self.cluster = cluster 60 | self.config = config 61 | self.extra_bootstrap = extra_bootstrap 62 | self.env_vars = env_vars 63 | self.bootstrap = bootstrap 64 | self.image_family = image_family 65 | self.project_id = project_id 66 | self.docker_image = docker_image 67 | self.server_platform = server_platform 68 | self.server_preset = server_preset 69 | self.sdk = SDK(credentials=self.config.get("token")) 70 | self.disk_size = disk_size 71 | self.instance_id = None 72 | self.disk_id = None 73 | 74 | async def create_vm(self, user_data=None): 75 | service = DiskServiceClient(self.sdk) 76 | operation = await service.create( 77 | CreateDiskRequest( 78 | metadata=ResourceMetadata( 79 | parent_id=self.project_id, 80 | name=self.name + "-disk", 81 | ), 82 | spec=DiskSpec( 83 | source_image_family=SourceImageFamily( 84 | image_family=self.image_family 85 | ), 86 | size_gibibytes=self.disk_size, 87 | type=DiskSpec.DiskType.NETWORK_SSD, 88 | ), 89 | ) 90 | ) 91 | await operation.wait() 92 | self.disk_id = operation.resource_id 93 | 94 | service = SubnetServiceClient(self.sdk) 95 | sub_net = await service.list(ListSubnetsRequest(parent_id=self.project_id)) 96 | subnet_id = sub_net.items[0].metadata.id 97 | 98 | service = InstanceServiceClient(self.sdk) 99 | operation = await service.create( 100 | CreateInstanceRequest( 101 | metadata=ResourceMetadata( 102 | parent_id=self.project_id, 103 | name=self.name, 104 | ), 105 | spec=InstanceSpec( 106 | boot_disk=AttachedDiskSpec( 107 | attach_mode=AttachedDiskSpec.AttachMode(2), 108 | existing_disk=ExistingDisk(id=self.disk_id), 109 | ), 110 | cloud_init_user_data=self.cluster.render_process_cloud_init(self), 111 | resources=ResourcesSpec( 112 | platform=self.server_platform, preset=self.server_preset 113 | ), 114 | network_interfaces=[ 115 | NetworkInterfaceSpec( 116 | subnet_id=subnet_id, 117 | ip_address=IPAddress(), 118 | name="network-interface-0", 119 | public_ip_address=PublicIPAddress(), 120 | ) 121 | ], 122 | ), 123 | ) 124 | ) 125 | self.instance_id = operation.resource_id 126 | 127 | self.cluster._log(f"Creating Nebius instance {self.name}") 128 | await operation.wait() 129 | service = InstanceServiceClient(self.sdk) 130 | operation = await service.get( 131 | GetInstanceRequest( 132 | id=self.instance_id, 133 | ) 134 | ) 135 | internal_ip = operation.status.network_interfaces[0].ip_address.address.split( 136 | "/" 137 | )[0] 138 | external_ip = operation.status.network_interfaces[ 139 | 0 140 | ].public_ip_address.address.split("/")[0] 141 | self.cluster._log( 142 | f"Created Nebius instance {self.name} with internal IP {internal_ip} and external IP {external_ip}" 143 | ) 144 | return internal_ip, external_ip 145 | 146 | async def destroy_vm(self): 147 | if self.instance_id: 148 | service = InstanceServiceClient(self.sdk) 149 | operation = await service.delete( 150 | DeleteInstanceRequest( 151 | id=self.instance_id, 152 | ) 153 | ) 154 | await operation.wait() 155 | 156 | if self.disk_id: 157 | service = DiskServiceClient(self.sdk) 158 | await service.delete( 159 | DeleteDiskRequest( 160 | id=self.disk_id, 161 | ) 162 | ) 163 | self.cluster._log( 164 | f"Terminated instance {self.name} ({self.instance_id}) and deleted disk {self.disk_id}" 165 | ) 166 | self.instance_id = None 167 | self.disk_id = None 168 | 169 | 170 | class NebiusScheduler(SchedulerMixin, NebiusInstance): 171 | 
"""Scheduler running on a Nebius server.""" 172 | 173 | 174 | class NebiusWorker(WorkerMixin, NebiusInstance): 175 | """Worker running on a Nebius server.""" 176 | 177 | 178 | class NebiusCluster(VMCluster): 179 | """Cluster running on Nebius AI Cloud instances. 180 | 181 | VMs in Nebius AI Cloud are referred to as instances. This cluster manager constructs a Dask cluster 182 | running on VMs. 183 | 184 | When configuring your cluster you may find it useful to install the ``nebius`` tool for querying the 185 | Nebius API for available options. 186 | 187 | https://docs.nebius.com/cli/quickstart 188 | 189 | Parameters 190 | ---------- 191 | image_family: str 192 | The image to use for the host OS. This should be a Ubuntu variant. 193 | You find list available images here https://docs.nebius.com/compute/storage/manage#parameters-boot. 194 | project_id: str 195 | The Nebius AI Cloud project id. You can find in Nebius AI Cloud console. 196 | server_platform: str 197 | List of all platforms and presets here https://docs.nebius.com/compute/virtual-machines/types/. 198 | server_preset: str 199 | List of all platforms and presets here https://docs.nebius.com/compute/virtual-machines/types/. 200 | n_workers: int 201 | Number of workers to initialise the cluster with. Defaults to ``0``. 202 | worker_module: str 203 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker`` 204 | worker_options: dict 205 | Params to be passed to the worker class. 206 | See :class:`distributed.worker.Worker` for default worker class. 207 | If you set ``worker_module`` then refer to the docstring for the custom worker class. 208 | scheduler_options: dict 209 | Params to be passed to the scheduler class. 210 | See :class:`distributed.scheduler.Scheduler`. 211 | env_vars: dict 212 | Environment variables to be passed to the worker. 213 | extra_bootstrap: list[str] (optional) 214 | Extra commands to be run during the bootstrap phase. 
215 | 216 | Example 217 | -------- 218 | 219 | >>> from dask_cloudprovider.nebius import NebiusCluster 220 | >>> cluster = NebiusCluster(n_workers=1) 221 | 222 | >>> from dask.distributed import Client 223 | >>> client = Client(cluster) 224 | 225 | >>> import dask.array as da 226 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100)) 227 | >>> arr.mean().compute() 228 | 229 | >>> client.close() 230 | >>> cluster.close() 231 | 232 | """ 233 | 234 | def __init__( 235 | self, 236 | bootstrap: str = None, 237 | image_family: str = None, 238 | project_id: str = None, 239 | disk_size: int = None, 240 | server_platform: str = None, 241 | server_preset: str = None, 242 | docker_image: str = None, 243 | debug: bool = False, 244 | **kwargs, 245 | ): 246 | self.config = dask.config.get("cloudprovider.nebius", {}) 247 | 248 | self.scheduler_class = NebiusScheduler 249 | self.worker_class = NebiusWorker 250 | 251 | self.image_family = dask.config.get( 252 | "cloudprovider.nebius.image_family", override_with=image_family 253 | ) 254 | self.docker_image = dask.config.get( 255 | "cloudprovider.nebius.docker_image", override_with=docker_image 256 | ) 257 | self.project_id = dask.config.get( 258 | "cloudprovider.nebius.project_id", override_with=project_id 259 | ) 260 | self.server_platform = dask.config.get( 261 | "cloudprovider.nebius.server_platform", override_with=server_platform 262 | ) 263 | self.server_preset = dask.config.get( 264 | "cloudprovider.nebius.server_preset", override_with=server_preset 265 | ) 266 | self.bootstrap = dask.config.get( 267 | "cloudprovider.nebius.bootstrap", override_with=bootstrap 268 | ) 269 | self.disk_size = dask.config.get( 270 | "cloudprovider.nebius.disk_size", override_with=disk_size 271 | ) 272 | self.debug = debug 273 | 274 | self.options = { 275 | "bootstrap": self.bootstrap, 276 | "cluster": self, 277 | "config": self.config, 278 | "docker_image": self.docker_image, 279 | "image_family": self.image_family, 280 | "project_id": self.project_id, 281 | "server_platform": self.server_platform, 282 | "server_preset": self.server_preset, 283 | "disk_size": self.disk_size, 284 | } 285 | self.scheduler_options = {**self.options} 286 | self.worker_options = {**self.options} 287 | super().__init__(debug=debug, **kwargs) 288 | -------------------------------------------------------------------------------- /dask_cloudprovider/nebius/tests/test_nebius.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dask 4 | 5 | nebius = pytest.importorskip("nebius") 6 | 7 | from dask_cloudprovider.nebius.instances import NebiusCluster 8 | from dask.distributed import Client 9 | from distributed.core import Status 10 | 11 | 12 | async def skip_without_credentials(config): 13 | if config.get("token") is None or config.get("project_id") is None: 14 | pytest.skip( 15 | """ 16 | You must configure a Nebius AI Cloud API token to run this test. 
17 | 18 | Either set this in your config 19 | 20 | # cloudprovider.yaml 21 | cloudprovider: 22 | nebius: 23 | token: "yourtoken" 24 | project_id: "yourprojectid" 25 | 26 | Or by setting it as an environment variable 27 | 28 | export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token) 29 | export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID=project_id 30 | 31 | """ 32 | ) 33 | 34 | 35 | @pytest.fixture 36 | async def config(): 37 | return dask.config.get("cloudprovider.nebius", {}) 38 | 39 | 40 | @pytest.fixture 41 | @pytest.mark.external 42 | async def cluster(config): 43 | await skip_without_credentials(config) 44 | async with NebiusCluster(asynchronous=True, debug=True) as cluster: 45 | yield cluster 46 | 47 | 48 | @pytest.mark.asyncio 49 | @pytest.mark.external 50 | async def test_init(): 51 | cluster = NebiusCluster(asynchronous=True, debug=True) 52 | assert cluster.status == Status.created 53 | 54 | 55 | @pytest.mark.asyncio 56 | @pytest.mark.external 57 | async def test_create_cluster(cluster): 58 | assert cluster.status == Status.running 59 | 60 | cluster.scale(1) 61 | await cluster 62 | assert len(cluster.workers) == 1 63 | 64 | async with Client(cluster, asynchronous=True) as client: 65 | 66 | def inc(x): 67 | return x + 1 68 | 69 | assert await client.submit(inc, 10).result() == 11 70 | 71 | 72 | @pytest.mark.asyncio 73 | async def test_get_cloud_init(): 74 | cloud_init = NebiusCluster.get_cloud_init( 75 | docker_args="--privileged", 76 | ) 77 | assert " --privileged " in cloud_init 78 | -------------------------------------------------------------------------------- /dask_cloudprovider/openstack/__init__.py: -------------------------------------------------------------------------------- 1 | from .instances import OpenStackCluster 2 | -------------------------------------------------------------------------------- /dask_cloudprovider/openstack/instances.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dask 3 | 4 | from dask_cloudprovider.generic.vmcluster import ( 5 | VMCluster, 6 | VMInterface, 7 | SchedulerMixin, 8 | WorkerMixin, 9 | ) 10 | 11 | from distributed.core import Status 12 | 13 | try: 14 | from openstack import connection 15 | except ImportError as e: 16 | msg = ( 17 | "Dask Cloud Provider OpenStack requirements are not installed.\n\n" 18 | "Please pip install as follows:\n\n" 19 | ' pip install "openstacksdk" ' 20 | ) 21 | raise ImportError(msg) from e 22 | 23 | 24 | class OpenStackInstance(VMInterface): 25 | def __init__( 26 | self, 27 | cluster, 28 | config, 29 | region: str = None, 30 | size: str = None, 31 | image: str = None, 32 | docker_image: str = None, 33 | env_vars: str = None, 34 | extra_bootstrap: str = None, 35 | **kwargs, 36 | ): 37 | super().__init__(**kwargs) 38 | self.instance = None 39 | self.cluster = cluster 40 | self.config = config 41 | self.region = region 42 | self.size = size 43 | self.image = image 44 | self.env_vars = env_vars 45 | self.bootstrap = True 46 | self.docker_image = docker_image 47 | self.extra_bootstrap = extra_bootstrap 48 | 49 | async def create_vm(self): 50 | conn = connection.Connection( 51 | region_name=self.region, 52 | auth_url=self.config["auth_url"], 53 | application_credential_id=self.config["application_credential_id"], 54 | application_credential_secret=self.config["application_credential_secret"], 55 | compute_api_version="2", 56 | identity_interface="public", 57 | auth_type="v3applicationcredential", 58 | ) 59 | 60 | self.instance = 
conn.create_server( 61 | name=self.name, 62 | image=self.image, 63 | flavor=self.size, # Changed 'flavor_id' to 'flavor' 64 | key_name=self.config["keypair_name"], # Add the keypair name here 65 | nics=[ 66 | {"net-id": self.config["network_id"]} 67 | ], # Changed from 'networks' to 'nics' 68 | userdata=self.cluster.render_process_cloud_init(self), 69 | security_groups=[self.config["security_group"]], 70 | ) 71 | 72 | # Wait for the instance to be up and running 73 | while self.instance.status.lower() != "active": 74 | await asyncio.sleep(0.1) 75 | self.instance = conn.compute.get_server(self.instance.id) 76 | 77 | # Retrieve the internal IP address 78 | self.internal_ip = await self.get_internal_ip(conn) 79 | 80 | # Check if a floating IP should be created and assigned 81 | if self.config.get("create_floating_ip", False): 82 | self.external_ip = await self.create_and_assign_floating_ip(conn) 83 | else: 84 | self.external_ip = await self.get_external_ip(conn) 85 | 86 | self.cluster._log( 87 | f"{self.name}\n\tInternal IP: {self.internal_ip}\n\tExternal IP: " 88 | f"{self.external_ip if self.external_ip else 'None'}" 89 | ) 90 | return self.internal_ip, self.external_ip 91 | 92 | async def get_internal_ip(self, conn): 93 | """Fetch the internal IP address from the OpenStack instance.""" 94 | instance = conn.compute.get_server(self.instance.id) 95 | for network in instance.addresses.values(): 96 | for addr in network: 97 | if addr["OS-EXT-IPS:type"] == "fixed": 98 | return addr["addr"] 99 | return None 100 | 101 | async def get_external_ip(self, conn): 102 | """Fetch the external IP address from the OpenStack instance, if it exists.""" 103 | instance = conn.compute.get_server(self.instance.id) 104 | for network in instance.addresses.values(): 105 | for addr in network: 106 | if addr["OS-EXT-IPS:type"] == "floating": 107 | return addr["addr"] 108 | return None 109 | 110 | async def create_and_assign_floating_ip(self, conn): 111 | """Create and assign a floating IP to the instance.""" 112 | try: 113 | # Create a floating IP 114 | floating_ip = await self.call_async( 115 | conn.network.create_ip, 116 | floating_network_id=self.config["external_network_id"], 117 | ) 118 | 119 | # Find the first port of the instance 120 | ports = await self.call_async( 121 | conn.network.ports, 122 | device_id=self.instance.id 123 | ) 124 | ports = list(ports) 125 | if not ports: 126 | raise RuntimeError(f"No network ports found for instance {self.instance.id}") 127 | 128 | # Assign the floating IP to the instance's port 129 | await self.call_async( 130 | conn.network.update_ip, 131 | floating_ip, 132 | port_id=ports[0].id 133 | ) 134 | 135 | return floating_ip.floating_ip_address 136 | except Exception as e: 137 | self.cluster._log(f"Failed to create or assign floating IP: {str(e)}") 138 | return None 139 | 140 | async def destroy_vm(self): 141 | conn = connection.Connection( 142 | region_name=self.region, 143 | auth_url=self.config["auth_url"], 144 | application_credential_id=self.config["application_credential_id"], 145 | application_credential_secret=self.config["application_credential_secret"], 146 | compute_api_version="2", 147 | identity_interface="public", 148 | auth_type="v3applicationcredential", 149 | ) 150 | 151 | # Handle floating IP disassociation and deletion if applicable 152 | if self.config.get( 153 | "create_floating_ip", False 154 | ): # Checks if floating IPs were configured to be created 155 | try: 156 | # Retrieve all floating IPs associated with the instance 157 | floating_ips = 
conn.network.ips(port_id=self.instance.id) 158 | for ip in floating_ips: 159 | # Disassociate and delete the floating IP 160 | conn.network.update_ip(ip, port_id=None) 161 | conn.network.delete_ip(ip.id) 162 | self.cluster._log(f"Deleted floating IP {ip.floating_ip_address}") 163 | except Exception as e: 164 | self.cluster._log( 165 | f"Failed to clean up floating IPs for instance {self.name}: {str(e)}" 166 | ) 167 | return # Exit if floating IP cleanup fails 168 | 169 | # Then, attempt to delete the instance 170 | try: 171 | instance = conn.compute.get_server(self.instance.id) 172 | if instance: 173 | await self.call_async(conn.compute.delete_server, instance.id) 174 | self.cluster._log(f"Terminated instance {self.name}") 175 | else: 176 | self.cluster._log(f"Instance {self.name} not found or already deleted.") 177 | except Exception as e: 178 | self.cluster._log(f"Failed to terminate instance {self.name}: {str(e)}") 179 | 180 | async def start_vm(self): 181 | # Code to start the instance 182 | pass # Placeholder to ensure correct indentation 183 | 184 | async def stop_vm(self): 185 | # Code to stop the instance 186 | pass # Placeholder to ensure correct indentation 187 | 188 | 189 | class OpenStackScheduler(SchedulerMixin, OpenStackInstance): 190 | """Scheduler running on an OpenStack Instance.""" 191 | 192 | def __init__(self, *args, **kwargs): 193 | super().__init__(*args, **kwargs) 194 | 195 | async def start(self): 196 | await self.start_scheduler() 197 | self.status = Status.running 198 | 199 | async def start_scheduler(self): 200 | self.cluster._log( 201 | f"Launching cluster with the following configuration: " 202 | f"\n OS Image: {self.image} " 203 | f"\n Flavor: {self.size} " 204 | f"\n Docker Image: {self.docker_image} " 205 | f"\n Security Group: {self.config['security_group']} " 206 | ) 207 | self.cluster._log("Creating scheduler instance") 208 | self.internal_ip, self.external_ip = await self.create_vm() 209 | 210 | # Choose the IP based on the access type configuration 211 | if self.config.get("create_floating_ip", True): 212 | # If public access is required and a floating IP is created 213 | self.address = f"{self.cluster.protocol}://{self.external_ip}:{self.port}" 214 | else: 215 | # Use internal IP if no external access is configured 216 | self.address = f"{self.cluster.protocol}://{self.internal_ip}:{self.port}" 217 | 218 | await self.wait_for_scheduler() 219 | 220 | # Storing IPs for cluster-wide use, if necessary 221 | self.cluster.scheduler_internal_ip = self.internal_ip 222 | self.cluster.scheduler_external_ip = self.external_ip 223 | self.cluster.scheduler_port = self.port 224 | 225 | 226 | class OpenStackWorker(WorkerMixin, OpenStackInstance): 227 | """Worker running on a OpenStack Instance.""" 228 | 229 | 230 | class OpenStackCluster(VMCluster): 231 | """Cluster running on Openstack VM Instances 232 | 233 | This cluster manager constructs a Dask cluster running on generic Openstack cloud 234 | 235 | When configuring your cluster you may find it useful to install the 'python-openstackclient' 236 | client for querying the Openstack APIs for available options. 237 | 238 | https://github.com/openstack/python-openstackclient 239 | 240 | Parameters 241 | ---------- 242 | 243 | region: str 244 | The name of the region where resources will be allocated in OpenStack. 245 | Typically set to 'default' unless specified in your cloud configuration. 246 | 247 | List available regions using: `openstack region list`. 
248 | auth_url: str 249 | The authentication URL for the OpenStack Identity service (Keystone). 250 | Example: https://cloud.example.com:5000 251 | application_credential_id: str 252 | The application credential ID created in OpenStack. 253 | 254 | Create application credentials using: openstack application credential create 255 | application_credential_secret: str 256 | The secret associated with the application credential ID for authentication. 257 | auth_type: str 258 | The type of authentication used, typically "v3applicationcredential" for 259 | using OpenStack application credentials. 260 | network_id: str 261 | The unique identifier for the internal/private network in OpenStack where the cluster 262 | VMs will be connected. 263 | 264 | List available networks using: `openstack network list` 265 | image: str 266 | The OS image name or ID to use for the VM. Dask Cloudprovider will bootstrap Ubuntu 267 | based images automatically. Other images require Docker and for GPUs 268 | the NVIDIA Drivers and NVIDIA Docker. 269 | 270 | List available images using: `openstack image list` 271 | keypair_name: str 272 | The name of the SSH keypair used for instance access. Ensure you have created a keypair 273 | or use an existing one. 274 | 275 | List available keypairs using: `openstack keypair list` 276 | security_group: str 277 | The security group name that defines firewall rules for instances. 278 | 279 | The default is `default`. Please ensure the following accesses are configured: 280 | - egress 0.0.0.0/0 on all ports for downloading docker images and general data access 281 | - ingress /8 on all ports for internal communication of workers 282 | - ingress 0.0.0.0/0 on 8786-8787 for external accessibility of the dashboard/scheduler 283 | - (optional) ingress 0.0.0.0/0 on 22 for ssh access 284 | 285 | List available security groups using: `openstack security group list` 286 | create_floating_ip: bool 287 | Specifies whether to assign a floating IP to each instance, enabling external 288 | access. Set to `True` if external connectivity is needed. 289 | external_network_id: str 290 | The ID of the external network used for assigning floating IPs. 291 | 292 | List available external networks using: `openstack network list --external` 293 | n_workers: int (optional) 294 | Number of workers to initialise the cluster with. Defaults to ``0``. 295 | worker_module: str 296 | The Python module to run for the worker. Defaults to ``distributed.cli.dask_worker`` 297 | worker_options: dict 298 | Params to be passed to the worker class. 299 | See :class:`distributed.worker.Worker` for default worker class. 300 | If you set ``worker_module`` then refer to the docstring for the custom worker class. 301 | scheduler_options: dict 302 | Params to be passed to the scheduler class. 303 | See :class:`distributed.scheduler.Scheduler`. 304 | env_vars: dict 305 | Environment variables to be passed to the worker. 306 | extra_bootstrap: list[str] (optional) 307 | Extra commands to be run during the bootstrap phase. 308 | docker_image: string (optional) 309 | The Docker image to run on all instances. 310 | 311 | This image must have a valid Python environment and have ``dask`` installed in order for the 312 | ``dask-scheduler`` and ``dask-worker`` commands to be available. It is recommended the Python 313 | environment matches your local environment where ``OpenStackCluster`` is being created from. 314 | 315 | For GPU instance types the Docker image must have NVIDIA drivers and ``dask-cuda`` installed.
316 | 317 | By default the ``daskdev/dask:latest`` image will be used. 318 | 319 | Example 320 | -------- 321 | 322 | >>> from dask_cloudprovider.openstack import OpenStackCluster 323 | >>> cluster = OpenStackCluster(n_workers=1) 324 | Launching cluster with the following configuration: 325 | OS Image: ubuntu-22-04 326 | Flavor: 4vcpu-8gbram-50gbdisk 327 | Docker Image: daskdev/dask:latest 328 | Security Group: all-open 329 | Creating scheduler instance 330 | dask-9b85a5f8-scheduler 331 | Internal IP: 10.0.30.148 332 | External IP: None 333 | Waiting for scheduler to run at 10.0.30.148:8786 334 | Scheduler is running 335 | Creating worker instance 336 | 337 | >>> from dask.distributed import Client 338 | >>> client = Client(cluster) 339 | 340 | >>> import dask.array as da 341 | >>> arr = da.random.random((1000, 1000), chunks=(100, 100)) 342 | >>> arr.mean().compute() 343 | 344 | >>> client.close() 345 | >>> cluster.close() 346 | Terminated instance dask-07280176-worker-319005a2 347 | Terminated instance dask-07280176-scheduler 348 | """ 349 | 350 | def __init__( 351 | self, 352 | region: str = None, 353 | size: str = None, 354 | image: str = None, 355 | docker_image: str = None, 356 | debug: bool = False, 357 | bootstrap: bool = True, 358 | **kwargs, 359 | ): 360 | self.config = dask.config.get("cloudprovider.openstack", {}) 361 | self.scheduler_class = OpenStackScheduler 362 | self.worker_class = OpenStackWorker 363 | self.debug = debug 364 | self.bootstrap = ( 365 | bootstrap if bootstrap is not None else self.config.get("bootstrap") 366 | ) 367 | self.options = { 368 | "cluster": self, 369 | "config": self.config, 370 | "region": region if region is not None else self.config.get("region"), 371 | "size": size if size is not None else self.config.get("size"), 372 | "image": image if image is not None else self.config.get("image"), 373 | "docker_image": docker_image or self.config.get("docker_image"), 374 | } 375 | self.scheduler_options = {**self.options} 376 | self.worker_options = {**self.options} 377 | 378 | if "extra_bootstrap" not in kwargs: 379 | kwargs["extra_bootstrap"] = self.config.get("extra_bootstrap") 380 | 381 | super().__init__(debug=debug, **kwargs) 382 | -------------------------------------------------------------------------------- /dask_cloudprovider/openstack/tests/test_instances.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import dask 3 | from dask_cloudprovider.openstack.instances import OpenStackCluster 4 | from dask.distributed import Client 5 | from distributed.core import Status 6 | 7 | # Optional: Skips tests if OpenStack credentials are not set 8 | 9 | 10 | async def skip_without_credentials(config): 11 | if ( 12 | config.get("auth_url") is None 13 | or config.get("application_credential_secret") is None 14 | ): 15 | pytest.skip( 16 | """ 17 | You must configure OpenStack credentials to run this test. 
18 | 19 | Set this in your config file or environment variables: 20 | 21 | # cloudprovider.yaml 22 | cloudprovider: 23 | openstack: 24 | auth_url: "your_auth_url" 25 | application_credential_id: "your_app_cred_id" 26 | application_credential_secret: "your_app_cred_secret" 27 | """ 28 | ) 29 | 30 | 31 | @pytest.fixture 32 | async def config(): 33 | return dask.config.get("cloudprovider.openstack", {}) 34 | 35 | 36 | @pytest.fixture 37 | @pytest.mark.external 38 | async def cluster(config): 39 | await skip_without_credentials(config) 40 | 41 | async with OpenStackCluster(asynchronous=True) as cluster: 42 | yield cluster 43 | 44 | 45 | @pytest.mark.asyncio 46 | async def test_init(): 47 | cluster = OpenStackCluster(asynchronous=True) 48 | assert cluster.status == Status.created 49 | 50 | 51 | @pytest.mark.asyncio 52 | @pytest.mark.timeout(600) 53 | async def test_create_cluster(cluster): 54 | assert cluster.status == Status.running 55 | cluster.scale(1) 56 | await cluster 57 | assert len(cluster.workers) == 1 58 | 59 | async with Client(cluster, asynchronous=True) as client: 60 | 61 | def inc(x): 62 | return x + 1 63 | 64 | assert await client.submit(inc, 10).result() == 11 65 | 66 | 67 | @pytest.mark.asyncio 68 | async def test_get_cloud_init(): 69 | cloud_init = OpenStackCluster.get_cloud_init( 70 | docker_args="--privileged", 71 | ) 72 | assert " --privileged " in cloud_init 73 | -------------------------------------------------------------------------------- /dask_cloudprovider/tests/test_imports.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_imports(): 5 | from dask_cloudprovider.aws import EC2Cluster # noqa 6 | from dask_cloudprovider.aws import ECSCluster # noqa 7 | from dask_cloudprovider.aws import FargateCluster # noqa 8 | from dask_cloudprovider.azure import AzureVMCluster # noqa 9 | from dask_cloudprovider.gcp import GCPCluster # noqa 10 | from dask_cloudprovider.digitalocean import DropletCluster # noqa 11 | from dask_cloudprovider.hetzner import HetznerCluster # noqa 12 | 13 | 14 | def test_import_exceptions(): 15 | with pytest.raises(ImportError): 16 | from dask_cloudprovider import EC2Cluster # noqa 17 | with pytest.raises(ImportError): 18 | from dask_cloudprovider import ECSCluster # noqa 19 | with pytest.raises(ImportError): 20 | from dask_cloudprovider import FargateCluster # noqa 21 | with pytest.raises(ImportError): 22 | from dask_cloudprovider import AzureVMCluster # noqa 23 | with pytest.raises(ImportError): 24 | from dask_cloudprovider import GCPCluster # noqa 25 | with pytest.raises(ImportError): 26 | from dask_cloudprovider import DropletCluster # noqa 27 | -------------------------------------------------------------------------------- /dask_cloudprovider/utils/logs.py: -------------------------------------------------------------------------------- 1 | class Log(str): 2 | """A container for logs.""" 3 | 4 | def _widget(self): 5 | from ipywidgets import HTML 6 | 7 | return HTML(value="
<pre><code>{logs}</code></pre>
".format(logs=self)) 8 | 9 | def _ipython_display_(self, **kwargs): 10 | return self._widget()._ipython_display_(**kwargs) 11 | 12 | 13 | class Logs(dict): 14 | """A container for multiple logs.""" 15 | 16 | def _widget(self): 17 | from ipywidgets import Accordion 18 | 19 | accordion = Accordion(children=[log._widget() for log in self.values()]) 20 | [accordion.set_title(i, title) for i, title in enumerate(self.keys())] 21 | return accordion 22 | 23 | def _ipython_display_(self, **kwargs): 24 | return self._widget()._ipython_display_(**kwargs) 25 | -------------------------------------------------------------------------------- /dask_cloudprovider/utils/socket.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | 4 | def is_socket_open(ip, port): 5 | connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 6 | try: 7 | connection.connect((ip, int(port))) 8 | connection.shutdown(2) 9 | return True 10 | except Exception: 11 | return False 12 | -------------------------------------------------------------------------------- /dask_cloudprovider/utils/timeout.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import warnings 3 | 4 | 5 | class TimeoutException(RuntimeError): 6 | """Raised when a loop times out.""" 7 | 8 | 9 | class Timeout: 10 | """A timeout object for use in ``while True`` loops instead of ``True``. 11 | 12 | Create an instance of this class before beginning an infinite loop and 13 | call ``run()`` instead of ``True``. 14 | 15 | 16 | Parameters 17 | ---------- 18 | timeout: int 19 | Seconds before loop should timeout. 20 | 21 | error_message: str 22 | Error message to raise in an exception if timeout occurs. 23 | 24 | warn: bool 25 | Only raise a warning instead of a TimeoutException. 26 | 27 | Default ``False``. 28 | Examples 29 | -------- 30 | >>> timeout = Timeout(10, "Oh no! We timed out.") 31 | >>> while timeout.run(): 32 | ... time.sleep(1) # Will timeout after 10 iterations 33 | TimeoutException: Oh no! We timed out. 34 | 35 | You can also pass an exception to raise if you are suppressing for a set 36 | amount of time. 37 | 38 | >>> timeout = Timeout(10, "Oh no! We timed out.") 39 | >>> while timeout.run(): 40 | ... try: 41 | ... some_function_that_raises() 42 | ... break 43 | ... except Exception as e: 44 | ... timeout.set_exception(e) 45 | ... time.sleep(1) # Will timeout after 10 iterations 46 | Exception: The exception from ``some_function_that_raises`` 47 | 48 | 49 | """ 50 | 51 | def __init__(self, timeout, error_message, warn=False): 52 | self.start = None 53 | self.running = False 54 | self.timeout = timeout 55 | self.error_message = error_message 56 | self.warn = warn 57 | self.exception = TimeoutException(self.error_message) 58 | 59 | def run(self): 60 | """Run the timeout. 61 | 62 | This method when called repeatedly will return ``True`` until the 63 | timeout has elapsed. It will then raise or return ``False``. 64 | """ 65 | if not self.running: 66 | self.start = datetime.now() 67 | self.running = True 68 | 69 | if self.start + timedelta(seconds=self.timeout) < datetime.now(): 70 | if self.warn: 71 | warnings.warn(self.error_message) 72 | return False 73 | else: 74 | raise self.exception 75 | return True 76 | 77 | def set_exception(self, e): 78 | """Modify the default timeout exception. 
79 | 80 | This would be useful if you are trying something repeatedly but if it 81 | never succeeds before the timeout you want to raise the exception from 82 | the thing you are trying rather than a TimeoutException. 83 | """ 84 | self.exception = e 85 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -a 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx 3 | dask-sphinx-theme>=3.0.5 4 | # FIXME: This workaround is required until we have sphinx>=5, as enabled by 5 | # dask-sphinx-theme no longer pinning sphinx-book-theme==0.2.0. This is 6 | # tracked in https://github.com/dask/dask-sphinx-theme/issues/68. 7 | # 8 | sphinxcontrib-applehelp<1.0.5 9 | sphinxcontrib-devhelp<1.0.6 10 | sphinxcontrib-htmlhelp<2.0.5 11 | sphinxcontrib-serializinghtml<1.1.10 12 | sphinxcontrib-qthelp<1.0.7 13 | -------------------------------------------------------------------------------- /doc/source/alternatives.rst: -------------------------------------------------------------------------------- 1 | Alternatives 2 | ============ 3 | 4 | Many tools and services exist today for deploying Dask clusters, many of which are commonly used on the cloud. 5 | This project aims to provide cloud native plugins and tools for Dask which can often compliment other approaches. 6 | 7 | Community tools 8 | --------------- 9 | 10 | Dask has a `vibrant ecosystem of community tooling for deploying Dask `_ on various platforms. Many of which can be used on public cloud. 
11 | 12 | Kubernetes 13 | ^^^^^^^^^^ 14 | 15 | `Kubernetes `_ is an extremely popular project for managing cloud workloads and is part of the broader `Cloud Native Computing Foundation (CNCF) `_ ecosystem. 16 | 17 | Dask has many options for `deploying clusters on Kubernetes `_. 18 | 19 | HPC on Cloud 20 | ^^^^^^^^^^^^ 21 | 22 | Many popular HPC scheduling tools are used on the cloud and support features such as elastic scaling. 23 | If you are already leveraging HPC tools like `SLURM on the cloud `_ then `Dask has great integration with HPC schedulers `_. 24 | 25 | Hadoop/Spark/Yarn 26 | ^^^^^^^^^^^^^^^^^ 27 | 28 | Many cloud platforms have popular managed services for running Apache Spark workloads. 29 | 30 | If you're already using a managed map-reduce service like `Amazon EMR `_ then check out `dask-yarn `_. 31 | 32 | Nebari 33 | ^^^^^^ 34 | 35 | `Nebari `_ is an open source data science platform which can be run locally or on a cloud platform of your choice. 36 | It includes a managed Dask service built on `Dask Gateway `_ for managing Dask clusters. 37 | 38 | Managed Services 39 | ---------------- 40 | 41 | Cloud vendors and third-party companies also offer managed Dask clusters as a service 42 | 43 | Coiled 44 | ^^^^^^ 45 | 46 | `Coiled `_ is a mature managed Dask service that spawns clusters in your cloud account and allows you to manage them via a central control plane. 47 | 48 | Saturn Cloud 49 | ^^^^^^^^^^^^ 50 | 51 | `Saturn Cloud `_ is a managed data science platform with hosted Dask clusters or the option to deploy them in your own AWS account. 52 | -------------------------------------------------------------------------------- /doc/source/aws.rst: -------------------------------------------------------------------------------- 1 | Amazon Web Services (AWS) 2 | ========================= 3 | 4 | .. currentmodule:: dask_cloudprovider.aws 5 | 6 | .. autosummary:: 7 | EC2Cluster 8 | ECSCluster 9 | FargateCluster 10 | 11 | Overview 12 | -------- 13 | 14 | Authentication 15 | ^^^^^^^^^^^^^^ 16 | 17 | In order to create clusters on AWS you need to set your access key, secret key 18 | and region. The simplest way is to use the aws command line tool. 19 | 20 | .. code-block:: console 21 | 22 | $ pip install awscli 23 | $ aws configure 24 | 25 | 26 | Credentials 27 | ^^^^^^^^^^^ 28 | 29 | In order for your Dask workers to be able to connect to other AWS resources such as S3 they will need credentials. 30 | 31 | This can be done by attaching IAM roles to individual resources or by passing credentials as environment variables. See 32 | each cluster manager docstring for more information. 33 | 34 | Elastic Compute Cloud (EC2) 35 | --------------------------- 36 | 37 | .. autoclass:: EC2Cluster 38 | :members: 39 | 40 | Elastic Container Service (ECS) 41 | ------------------------------- 42 | 43 | .. autoclass:: ECSCluster 44 | :members: 45 | 46 | Fargate 47 | ------- 48 | 49 | .. autoclass:: FargateCluster 50 | :members: 51 | -------------------------------------------------------------------------------- /doc/source/azure.rst: -------------------------------------------------------------------------------- 1 | Microsoft Azure 2 | =============== 3 | 4 | .. currentmodule:: dask_cloudprovider.azure 5 | 6 | .. autosummary:: 7 | AzureVMCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | In order to create clusters on Azure you need to set your authentication credentials. 16 | You can do this via the ``az`` `command line tool `_. 17 | 18 | .. 
code-block:: console 19 | 20 | $ az login 21 | 22 | .. note:: 23 | 24 | Setting the default output to ``table`` with ``az configure`` will make the ``az`` tool much easier to use. 25 | 26 | Resource Groups 27 | ^^^^^^^^^^^^^^^ 28 | 29 | To create resources on Azure they must be placed in a resource group. Dask Cloudprovider will need a group to create 30 | Dask components in. 31 | 32 | You can list existing groups via the cli. 33 | 34 | .. code-block:: console 35 | 36 | $ az group list 37 | 38 | You can also create a new resource group if you do not have an existing one. 39 | 40 | .. code-block:: console 41 | 42 | $ az group create --location --name --subscription 43 | 44 | You can get a full list of locations with ``az account list-locations`` and subscriptions with ``az account list``. 45 | 46 | Take note of your resource group name for later. 47 | 48 | Virtual Networks 49 | ^^^^^^^^^^^^^^^^ 50 | 51 | Compute resources on Azure must be placed in virtual networks (vnet). Dask Cloudprovider will require an existing vnet to connect 52 | compute resources to. 53 | 54 | You can list existing vnets via the cli. 55 | 56 | .. code-block:: console 57 | 58 | $ az network vnet list 59 | 60 | You can also create a new vnet via the cli. 61 | 62 | .. code-block:: console 63 | 64 | $ az network vnet create -g -n --address-prefix 10.0.0.0/16 \ 65 | --subnet-name --subnet-prefix 10.0.0.0/24 66 | 67 | This command will create a new vnet in your resource group with one subnet with the ``10.0.0.0/24`` prefix. For more than 255 compute resources you will need additional subnets. 68 | 69 | Take note of your vnet name for later. 70 | 71 | Security Groups 72 | ^^^^^^^^^^^^^^^ 73 | 74 | To allow network traffic to reach your Dask cluster you will need to create a security group which allows traffic on ports 8786-8787 from wherever you are. 75 | 76 | You can list existing security groups via the cli. 77 | 78 | .. code-block:: console 79 | 80 | $ az network nsg list 81 | 82 | Or you can create a new security group. 83 | 84 | .. code-block:: console 85 | 86 | $ az network nsg create -g --name 87 | $ az network nsg rule create -g --nsg-name -n MyNsgRuleWithAsg \ 88 | --priority 500 --source-address-prefixes Internet --destination-port-ranges 8786 8787 \ 89 | --destination-address-prefixes '*' --access Allow --protocol Tcp --description "Allow Internet to Dask on ports 8786,8787." 90 | 91 | This example allows all traffic to 8786-8787 from the internet. It is recommended you make your rules more restrictive than this by limiting it to your corporate network 92 | or specific IP. 93 | 94 | Again take note of this security group name for later. 95 | 96 | Extra options 97 | ^^^^^^^^^^^^^ 98 | 99 | To further customize the VMs created, you can provide ``extra_vm_options`` to :class:`AzureVMCluster`. For example, to set the identity 100 | of the virtual machines to a (previously created) user assigned identity, create an ``azure.mgmt.compute.models.VirtualMachineIdentity`` 101 | 102 | .. code-block:: python 103 | 104 | >>> import os 105 | >>> import azure.identity 106 | >>> import dask_cloudprovider.azure 107 | >>> import azure.mgmt.compute.models 108 | 109 | >>> subscription_id = os.environ["DASK_CLOUDPROVIDER__AZURE__SUBSCRIPTION_ID"] 110 | >>> rg_name = os.environ["DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP"] 111 | >>> identity_name = "dask-cloudprovider-identity" 112 | >>> v = azure.mgmt.compute.models.UserAssignedIdentitiesValue() 113 | >>> user_assigned_identities = { 114 | ... 
f"/subscriptions/{subscription_id}/resourcegroups/{rg_name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{identity_name}": v 115 | ... } 116 | >>> identity = azure.mgmt.compute.models.VirtualMachineIdentity( 117 | ... type="UserAssigned", 118 | ... user_assigned_identities=user_assigned_identities 119 | ... ) 120 | 121 | 122 | And then provide that to :class:`AzureVMCluster` 123 | 124 | .. code-block:: python 125 | 126 | >>> cluster = dask_cloudprovider.azure.AzureVMCluster(extra_vm_options={"identity": identity.as_dict()}) 127 | >>> cluster.scale(1) 128 | 129 | Dask Configuration 130 | ^^^^^^^^^^^^^^^^^^ 131 | 132 | You'll provide the names or IDs of the Azure resources when you create a :class:`AzureVMCluster`. You can specify 133 | these values manually, or use Dask's `configuration system `_ 134 | system. For example, the ``resource_group`` value can be specified using an environment variable: 135 | 136 | .. code-block:: console 137 | 138 | $ export DASK_CLOUDPROVIDER__AZURE__RESOURCE_GROUP="" 139 | $ python 140 | 141 | Or you can set it in a YAML configuration file. 142 | 143 | .. code-block:: yaml 144 | 145 | cloudprovider: 146 | azure: 147 | resource_group: "" 148 | azurevm: 149 | vnet: "" 150 | 151 | Note that the options controlling the VMs are under the `cloudprovider.azure.azurevm` key. 152 | 153 | See :doc:`config` for more. 154 | 155 | AzureVM 156 | ------- 157 | 158 | .. autoclass:: AzureVMCluster 159 | :members: 160 | 161 | Azure Spot Instance Plugin 162 | -------------------------- 163 | 164 | .. autoclass:: AzurePreemptibleWorkerPlugin 165 | :members: 166 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Dask-kubernetes documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Feb 8 17:56:16 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | from datetime import datetime 24 | 25 | sys.path.insert(0, os.path.abspath("..")) 26 | 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.todo", 40 | "sphinx.ext.ifconfig", 41 | "sphinx.ext.viewcode", 42 | "sphinx.ext.autosummary", 43 | "sphinx.ext.extlinks", 44 | "sphinx.ext.intersphinx", 45 | "numpydoc", 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 
52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = ".rst" 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # General information about the project. 61 | project = "Dask Cloud Provider" 62 | copyright = f"{datetime.now().year}, Dask Developers" 63 | author = "Dask Developers" 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 70 | from dask_cloudprovider import __version__ 71 | 72 | version = __version__ 73 | # The full version, including alpha/beta/rc tags. 74 | release = __version__ 75 | 76 | # The language for content autogenerated by Sphinx. Refer to documentation 77 | # for a list of supported languages. 78 | # 79 | # This is also used if you do content translation via gettext catalogs. 80 | # Usually you set "language" from the command line for these cases. 81 | language = None 82 | 83 | # List of patterns, relative to source directory, that match files and 84 | # directories to ignore when looking for source files. 85 | # This patterns also effect to html_static_path and html_extra_path 86 | exclude_patterns = [] 87 | 88 | # The name of the Pygments (syntax highlighting) style to use. 89 | # Commenting this out for now, if we register dask pygments, 90 | # then eventually this line can be: 91 | # pygments_style = "dask" 92 | 93 | # If true, `todo` and `todoList` produce output, else they produce nothing. 94 | todo_include_todos = False 95 | 96 | 97 | # -- Options for HTML output ---------------------------------------------- 98 | 99 | # The theme to use for HTML and HTML Help pages. See the documentation for 100 | # a list of builtin themes. 101 | # 102 | # html_theme = 'alabaster' 103 | 104 | html_theme = "dask_sphinx_theme" 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # 110 | # html_theme_options = {} 111 | 112 | # Add any paths that contain custom static files (such as style sheets) here, 113 | # relative to this directory. They are copied after the builtin static files, 114 | # so a file named "default.css" will overwrite the builtin "default.css". 115 | html_static_path = ["_static"] 116 | 117 | # Custom sidebar templates, must be a dictionary that maps document names 118 | # to template names. 119 | # 120 | # This is required for the alabaster theme 121 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 122 | # html_sidebars = { 123 | # "**": [ 124 | # "relations.html", # needs 'show_related': True theme option to display 125 | # "searchbox.html", 126 | # ] 127 | # } 128 | 129 | 130 | # -- Options for HTMLHelp output ------------------------------------------ 131 | 132 | # Output file base name for HTML help builder. 133 | htmlhelp_basename = "dask-cloudprovider-doc" 134 | 135 | 136 | # -- Options for LaTeX output --------------------------------------------- 137 | 138 | latex_elements = { 139 | # The paper size ('letterpaper' or 'a4paper'). 140 | # 141 | # 'papersize': 'letterpaper', 142 | # The font size ('10pt', '11pt' or '12pt'). 143 | # 144 | # 'pointsize': '10pt', 145 | # Additional stuff for the LaTeX preamble. 
146 | # 147 | # 'preamble': '', 148 | # Latex figure (float) alignment 149 | # 150 | # 'figure_align': 'htbp', 151 | } 152 | 153 | # Grouping the document tree into LaTeX files. List of tuples 154 | # (source start file, target name, title, 155 | # author, documentclass [howto, manual, or own class]). 156 | latex_documents = [ 157 | ( 158 | master_doc, 159 | "dask-cloudprovider.tex", 160 | "Dask Cloud Provider Documentation", 161 | "Dask Cloud Provider Developers", 162 | "manual", 163 | ) 164 | ] 165 | 166 | 167 | # -- Options for manual page output --------------------------------------- 168 | 169 | # One entry per manual page. List of tuples 170 | # (source start file, name, description, authors, manual section). 171 | man_pages = [ 172 | (master_doc, "dask-cloudprovider", "Dask Cloud Provider Documentation", [author], 1) 173 | ] 174 | 175 | 176 | # -- Options for Texinfo output ------------------------------------------- 177 | 178 | # Grouping the document tree into Texinfo files. List of tuples 179 | # (source start file, target name, title, author, 180 | # dir menu entry, description, category) 181 | texinfo_documents = [ 182 | ( 183 | master_doc, 184 | "Dask Cloud Provider", 185 | "Dask Cloud Provider Documentation", 186 | author, 187 | "Dask-CloudProvider", 188 | "One line description of project.", 189 | "Miscellaneous", 190 | ) 191 | ] 192 | 193 | 194 | intersphinx_mapping = { 195 | "python": ("https://docs.python.org/3", None), 196 | "dask": ("https://docs.dask.org/en/latest/", None), 197 | "distributed": ("https://distributed.dask.org/en/latest/", None), 198 | "dask_kubernetes": ("https://kubernetes.dask.org/en/latest/", None), 199 | } 200 | -------------------------------------------------------------------------------- /doc/source/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | 4 | Each cluster manager in Dask Cloudprovider will require some configuration specific to the cloud 5 | services you wish to use. Many config options will have sensible defaults and often you can create 6 | a cluster with just your authentication credentials configured. 7 | 8 | Authentication 9 | -------------- 10 | 11 | All cluster managers assume you have already configured your credentials for the cloud you are using. 12 | 13 | For AWS this would mean storing your access key and secret key in ``~/.aws/credentials``. The AWS CLI 14 | can create this for you by running the command ``aws configure``. 15 | 16 | See each cluster manager for specific details. 17 | 18 | .. warning:: 19 | Most cluster managers also allow passing credentials as keyword arguments, although this would result in 20 | credentials being stored in code and is not advised. 21 | 22 | Cluster config 23 | -------------- 24 | 25 | Configuration can be passed to a cluster manager via keyword arguments, YAML config or environment variables. 26 | 27 | For example the ``FargateCluster`` manager for AWS ECS takes a ``scheduler_mem`` configuration option to set how much memory 28 | to give the scheduler in megabytes. This can be configured in the following ways. 29 | 30 | .. code-block:: python 31 | 32 | from dask_cloudprovider.aws import FargateCluster 33 | 34 | cluster = FargateCluster( 35 | scheduler_mem=8192 36 | ) 37 | 38 | .. code-block:: yaml 39 | 40 | # ~/.config/dask/cloudprovider.yaml 41 | 42 | cloudprovider: 43 | ecs: 44 | scheduler_mem: 8192 45 | 46 | .. 
code-block:: console 47 | 48 | $ export DASK_CLOUDPROVIDER__ECS__SCHEDULER_MEM=8192 49 | 50 | See each cluster manager and the `Dask configuration docs `_ for more information. -------------------------------------------------------------------------------- /doc/source/digitalocean.rst: -------------------------------------------------------------------------------- 1 | DigitalOcean 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider.digitalocean 5 | 6 | .. autosummary:: 7 | DropletCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | To authenticate with DigitalOcean you must first generate a 16 | `personal access token `_. 17 | 18 | Then you must put this in your Dask configuration at ``cloudprovider.digitalocean.token``. This can be done by 19 | adding the token to your YAML configuration or exporting an environment variable. 20 | 21 | .. code-block:: yaml 22 | 23 | # ~/.config/dask/cloudprovider.yaml 24 | 25 | cloudprovider: 26 | digitalocean: 27 | token: "yourtoken" 28 | 29 | .. code-block:: console 30 | 31 | $ export DASK_CLOUDPROVIDER__DIGITALOCEAN__TOKEN="yourtoken" 32 | 33 | Droplet 34 | ------- 35 | 36 | .. autoclass:: DropletCluster 37 | :members: -------------------------------------------------------------------------------- /doc/source/gcp.rst: -------------------------------------------------------------------------------- 1 | Google Cloud Platform 2 | ===================== 3 | 4 | .. currentmodule:: dask_cloudprovider.gcp 5 | 6 | .. autosummary:: 7 | GCPCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | In order to create clusters on GCP you need to set your authentication credentials. 16 | You can do this via the ``gcloud`` `command line tool `_. 17 | 18 | .. code-block:: console 19 | 20 | $ gcloud auth login 21 | 22 | Alternatively you can use a `service account `_ which provides credentials in a JSON file. 23 | You must set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to the path to the JSON file. 24 | 25 | .. code-block:: console 26 | 27 | $ export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json 28 | 29 | Project ID 30 | ^^^^^^^^^^ 31 | 32 | To use Dask Cloudprovider with GCP you must also configure your `Project ID `_. 33 | Generally when creating a GCP account you will create a default project. This can be found at the top of the GCP dashboard. 34 | 35 | Your Project ID must be added to your Dask config file. 36 | 37 | .. code-block:: yaml 38 | 39 | # ~/.config/dask/cloudprovider.yaml 40 | cloudprovider: 41 | gcp: 42 | projectid: "YOUR PROJECT ID" 43 | 44 | Or via an environment variable. 45 | 46 | .. code-block:: console 47 | 48 | $ export DASK_CLOUDPROVIDER__GCP__PROJECTID="YOUR PROJECT ID" 49 | 50 | Google Cloud VMs 51 | ---------------- 52 | 53 | .. autoclass:: GCPCluster 54 | :members: -------------------------------------------------------------------------------- /doc/source/gpus.rst: -------------------------------------------------------------------------------- 1 | GPU clusters 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider 5 | 6 | Many cloud providers have GPU offerings and so it is possible to launch GPU enabled Dask clusters 7 | with Dask Cloudprovider. 8 | 9 | Each cluster manager handles this differently but generally you will need to configure the following settings: 10 | 11 | - Configure the hardware to include GPUs. This may be by changing the hardware type or adding accelerators. 
12 | - Ensure the OS/Docker image has the NVIDIA drivers. For Docker images it is recommended to use the [RAPIDS images](https://hub.docker.com/r/rapidsai/rapidsai/). 13 | - Set the ``worker_module`` config option to ``dask_cuda.cli.dask_cuda_worker`` or ``worker_command`` option to ``dask-cuda-worker``. 14 | 15 | In the following AWS :class:`dask_cloudprovider.aws.EC2Cluster` example we set the ``ami`` to be a Deep Learning AMI with NVIDIA drivers, the ``docker_image`` to RAPIDS, the ``instance_type`` 16 | to ``p3.2xlarge`` which has one NVIDIA Tesla V100 and the ``worker_module`` to ``dask_cuda.cli.dask_cuda_worker``. 17 | 18 | .. code-block:: python 19 | 20 | >>> cluster = EC2Cluster(ami="ami-0c7c7d78f752f8f17", # Example Deep Learning AMI (Ubuntu 18.04) 21 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04", 22 | instance_type="p3.2xlarge", 23 | worker_module="dask_cuda.cli.dask_cuda_worker", 24 | bootstrap=False, 25 | filesystem_size=120) 26 | 27 | See each cluster manager's example sections for info on starting a GPU cluster. -------------------------------------------------------------------------------- /doc/source/hetzner.rst: -------------------------------------------------------------------------------- 1 | Hetzner 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider.hetzner 5 | 6 | .. autosummary:: 7 | HetznerCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | To authenticate with Hetzner you must first generate a 16 | `personal access token `_. 17 | 18 | Then you must put this in your Dask configuration at ``cloudprovider.hetzner.token``. This can be done by 19 | adding the token to your YAML configuration or exporting an environment variable. 20 | 21 | .. code-block:: yaml 22 | 23 | # ~/.config/dask/cloudprovider.yaml 24 | 25 | cloudprovider: 26 | hetzner: 27 | token: "yourtoken" 28 | 29 | .. code-block:: console 30 | 31 | $ export DASK_CLOUDPROVIDER__HETZNER__TOKEN="yourtoken" 32 | 33 | 34 | .. autoclass:: HetznerCluster 35 | :members: 36 | -------------------------------------------------------------------------------- /doc/source/ibm.rst: -------------------------------------------------------------------------------- 1 | IBM Cloud 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider.ibm 5 | 6 | .. autosummary:: 7 | IBMCodeEngineCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | To authenticate with IBM Cloud you must first generate an 16 | `API key `_. 17 | 18 | Then you must put this in your Dask configuration at ``cloudprovider.ibm.api_key``. This can be done by 19 | adding the API key to your YAML configuration or exporting an environment variable. 20 | 21 | .. code-block:: yaml 22 | 23 | # ~/.config/dask/cloudprovider.yaml 24 | 25 | cloudprovider: 26 | ibm: 27 | api_key: "your_api_key" 28 | 29 | .. code-block:: console 30 | 31 | $ export DASK_CLOUDPROVIDER__IBM__API_KEY="your_api_key" 32 | 33 | Project ID 34 | ^^^^^^^^^^ 35 | 36 | To use Dask Cloudprovider with IBM Cloud you must also configure your `Project ID `_. 37 | This can be found at the top of the IBM Cloud dashboard. 38 | 39 | Your Project ID must be added to your Dask config file. 40 | 41 | .. code-block:: yaml 42 | 43 | # ~/.config/dask/cloudprovider.yaml 44 | cloudprovider: 45 | ibm: 46 | project_id: "your_project_id" 47 | 48 | Or via an environment variable. 49 | 50 | .. 
code-block:: console 51 | 52 | $ export DASK_CLOUDPROVIDER__IBM__PROJECT_ID="your_project_id" 53 | 54 | Code Engine 55 | ------- 56 | 57 | .. autoclass:: IBMCodeEngineCluster 58 | :members: -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | Dask Cloud Provider 2 | =================== 3 | 4 | *Native Cloud integration for Dask.* 5 | 6 | This package contains open source tools to help you deploy and operate Dask clusters on the cloud. 7 | It contains cluster managers which can help you launch clusters using native cloud resources like VMs or containers, 8 | it has tools and plugins for use in ANY cluster running on the cloud and is a great source of documentation for Dask cloud deployments. 9 | 10 | It is by no means the "complete" or "only" way to run Dask on the cloud, check out the :doc:`alternatives` page for more tools. 11 | 12 | Cluster managers 13 | ---------------- 14 | 15 | This package provides classes for constructing and managing ephemeral Dask clusters on various 16 | cloud platforms. 17 | 18 | Dask Cloud Provider is one of many options for deploying Dask clusters, see `Deploying Dask `_ in the Dask documentation for an overview of additional options. 19 | 20 | To use a cloud provider cluster manager you can import it and instantiate it. Instantiating the class 21 | will result in cloud resources being created for you. 22 | 23 | .. code-block:: python 24 | 25 | from dask_cloudprovider.aws import FargateCluster 26 | cluster = FargateCluster( 27 | # Cluster manager specific config kwargs 28 | ) 29 | 30 | You can then construct a Dask client with that cluster object to use the cluster. 31 | 32 | .. code-block:: python 33 | 34 | from dask.distributed import Client 35 | client = Client(cluster) 36 | 37 | Once you are connected to the cluster you can go ahead and use Dask and all computation will take 38 | place on your cloud resource. 39 | 40 | Once you are finished be sure to close out your cluster to shut down any cloud resources you have and end any charges. 41 | 42 | .. code-block:: python 43 | 44 | cluster.close() 45 | 46 | .. warning:: 47 | 48 | Cluster managers will attempt to automatically remove hanging cloud resources on garbage collection if the cluster 49 | object is destroyed without calling ``cluster.close()``, however this is not guaranteed. 50 | 51 | To implicitly close your cluster when you are done with it you can optionally contruct the cluster manager via a 52 | context manager. However this will result in the creation and destruction of the whole cluster whenever you run 53 | this code. 54 | 55 | .. code-block:: python 56 | 57 | from dask_cloudprovider.aws import FargateCluster 58 | from dask.distributed import Client 59 | 60 | with FargateCluster(...) as cluster: 61 | with Client(cluster) as client: 62 | # Do some Dask things 63 | 64 | Plugins 65 | ------- 66 | 67 | Dask components like Schedulers and Workers can benefit from being cloud-aware. 68 | This project has plugins and tools that extend these components. 69 | 70 | One example is having the workers check for termination warnings when running on ephemeral/spot instances and begin migrating data to other workers. 71 | 72 | For Azure VMs you could use the :class:`dask_cloudprovider.azure.AzurePreemptibleWorkerPlugin` to do this. 
73 | It can be used on any cluster that has workers running on Azure VMs, not just ones created with :class:`dask_cloudprovider.azure.AzureVMCluster`. 74 | 75 | .. code-block:: python 76 | 77 | from distributed import Client 78 | client = Client("") 79 | 80 | from dask_cloudprovider.azure import AzurePreemptibleWorkerPlugin 81 | client.register_worker_plugin(AzurePreemptibleWorkerPlugin()) 82 | 83 | 84 | .. toctree:: 85 | :maxdepth: 2 86 | :hidden: 87 | :caption: Overview 88 | 89 | installation.rst 90 | config.rst 91 | alternatives.rst 92 | 93 | .. toctree:: 94 | :maxdepth: 2 95 | :hidden: 96 | :caption: Providers 97 | 98 | aws.rst 99 | digitalocean.rst 100 | gcp.rst 101 | azure.rst 102 | hetzner.rst 103 | ibm.rst 104 | openstack.rst 105 | nebius.rst 106 | 107 | .. toctree:: 108 | :maxdepth: 2 109 | :hidden: 110 | :caption: Advanced 111 | 112 | troubleshooting.rst 113 | security.rst 114 | gpus.rst 115 | packer.rst 116 | 117 | .. toctree:: 118 | :maxdepth: 2 119 | :hidden: 120 | :caption: Developer 121 | 122 | testing.rst 123 | releasing.rst 124 | -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Pip 5 | --- 6 | 7 | .. code-block:: console 8 | 9 | $ pip install dask-cloudprovider[all] 10 | 11 | You can also restrict your install to just a specific cloud provider by giving their name instead of ``all``. 12 | 13 | .. code-block:: console 14 | 15 | $ pip install dask-cloudprovider[aws]  # or 16 | $ pip install dask-cloudprovider[azure]  # or 17 | $ pip install dask-cloudprovider[digitalocean]  # or 18 | $ pip install dask-cloudprovider[gcp]  # or 19 | $ pip install dask-cloudprovider[hetzner]  # or 20 | $ pip install dask-cloudprovider[ibm]  # or 21 | $ pip install dask-cloudprovider[openstack]  # or 22 | $ pip install dask-cloudprovider[nebius] 23 | 24 | Conda 25 | ----- 26 | 27 | .. code-block:: console 28 | 29 | $ conda install -c conda-forge dask-cloudprovider -------------------------------------------------------------------------------- /doc/source/nebius.rst: -------------------------------------------------------------------------------- 1 | Nebius 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider.nebius 5 | 6 | .. autosummary:: 7 | NebiusCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | 16 | Before creating clusters on Nebius, you must configure your authentication credentials. You can do this using the ``nebius`` `command line tool `_. 17 | 18 | After obtaining your credentials, add them to your Dask configuration under: 19 | 20 | * cloudprovider.nebius.token 21 | * cloudprovider.nebius.project_id 22 | 23 | You can specify these values by either: 24 | 25 | #. Adding ``token`` and ``project_id`` (for example, the values of the environment variables ``NB_IAM_TOKEN`` and ``NB_PROJECT_ID``) to your YAML configuration. 26 | 27 | .. code-block:: yaml 28 | 29 | # ~/.config/dask/cloudprovider.yaml 30 | 31 | cloudprovider: 32 | nebius: 33 | token: "your_iam_token" 34 | project_id: "your_project_id" 35 | 36 | #. Exporting them as environment variables in your shell. 37 | 38 | .. code-block:: console 39 | 40 | $ export DASK_CLOUDPROVIDER__NEBIUS__TOKEN=$(nebius iam get-access-token) 41 | $ export DASK_CLOUDPROVIDER__NEBIUS__PROJECT_ID="your_project_id" 42 | 43 | Dask Configuration 44 | ^^^^^^^^^^^^^^^^^^ 45 | 46 | You can change configuration of ``server_platform``, ``server_preset`` and ``image_family``.
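For example, here is a minimal sketch that overrides these options through the Dask config before creating a cluster (the platform and preset names below are illustrative assumptions; pick values that exist in your project):

.. code-block:: python

    import dask.config
    from dask_cloudprovider.nebius import NebiusCluster

    # Hypothetical platform/preset values; check the Nebius docs for ones
    # that are actually available to your project.
    dask.config.set({
        "cloudprovider.nebius.server_platform": "cpu-d3",
        "cloudprovider.nebius.server_preset": "16vcpu-64gb",
    })

    cluster = NebiusCluster(n_workers=1)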
A list of all available platforms and presets can be found in the `Nebius docs `_. 47 | 48 | .. autoclass:: NebiusCluster 49 | :members: -------------------------------------------------------------------------------- /doc/source/openstack.rst: -------------------------------------------------------------------------------- 1 | OpenStack 2 | ============ 3 | 4 | .. currentmodule:: dask_cloudprovider.openstack 5 | 6 | .. autosummary:: 7 | OpenStackCluster 8 | 9 | Overview 10 | -------- 11 | 12 | Authentication 13 | ^^^^^^^^^^^^^^ 14 | 15 | To authenticate with the OpenStack Identity service (Keystone): 16 | 17 | 1) Get your Authentication URL (auth_url) for OpenStack Identity service (Keystone) and put it in your Dask configuration at ``cloudprovider.openstack.auth_url``. 18 | 19 | 2) Get your `region `_ and put it in your Dask configuration at ``cloudprovider.openstack.region``. 20 | .. code-block:: console 21 | 22 | $ openstack region list 23 | +-----------+---------------+-------------+ 24 | | Region | Parent Region | Description | 25 | +-----------+---------------+-------------+ 26 | | RegionOne | None | | 27 | +-----------+---------------+-------------+ 28 | 29 | 3) Generate an `application credential `_. 30 | 31 | .. code-block:: console 32 | 33 | $ openstack application credential create dask --unrestricted 34 | +--------------+----------------------------------------------------------------------------------------+ 35 | | Field | Value | 36 | +--------------+----------------------------------------------------------------------------------------+ 37 | | description | None | 38 | | expires_at | None | 39 | | id | 0a0372dbedfb4e82ab66449c3316ef1e | 40 | | name | dask | 41 | | project_id | e99b6f4b9bf84a9da27e20c9cbfe887a | 42 | | roles | Member anotherrole | 43 | | secret | ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg | 44 | | unrestricted | True | 45 | +--------------+----------------------------------------------------------------------------------------+ 46 | 47 | and put ``application_credential_id`` and ``application_credential_secret`` in your Dask configuration at ``cloudprovider.openstack.application_credential_id`` 48 | and ``cloudprovider.openstack.application_credential_secret``. 49 | 50 | All of these variables can be gathered from either `OpenStack RC file `_ 51 | or `clouds.yaml file `_. 52 | 53 | Example Config File 54 | ^^^^^^^^^^^^^^^^^^^ 55 | .. code-block:: yaml 56 | 57 | # ~/.config/dask/cloudprovider.yaml 58 | 59 | cloudprovider: 60 | openstack: 61 | region: "RegionOne" 62 | auth_url: "https://cloud.home.karatosun.xyz:5000" 63 | application_credential_id: "0a0372dbedfb4e82ab66449c3316ef1e" 64 | application_credential_secret: "ArOy6DYcLeLTRlTmfvF1TH1QmRzYbmD91cbVPOHL3ckyRaLXlaq5pTGJqvCvqg6leEvTI1SQeX3QK-3iwmdPxg" 65 | auth_type: "v3applicationcredential" 66 | 67 | You can also export them as environment variables. 68 | 69 | .. code-block:: console 70 | 71 | $ export DASK_CLOUDPROVIDER__OPENSTACK__APPLICATION_CREDENTIAL_ID="0a0372dbedfb4e82ab66449c3316ef1e" 72 | 73 | 74 | .. autoclass:: OpenStackCluster 75 | :members: 76 | -------------------------------------------------------------------------------- /doc/source/packer.rst: -------------------------------------------------------------------------------- 1 | Creating custom OS images with Packer 2 | ===================================== 3 | 4 | Many cloud providers in Dask Cloudprovider involve creating VMs and installing dependencies on those VMs at boot time.
5 | 6 | This can slow down the creation and scaling of clusters, so this page discusses building custom images using `Packer `_ to speed up cluster creation. 7 | 8 | Packer is a utility which boots up a VM on your desired cloud, runs any installation steps and then takes a snapshot of the VM for use as a template for creating 9 | new VMs later. This allows us to run through the installation steps once, and then reuse them when starting Dask components. 10 | 11 | Installing Packer 12 | ----------------- 13 | 14 | See the `official install docs `_. 15 | 16 | Packer Overview 17 | --------------- 18 | 19 | To create an image with Packer we need to create a JSON config file. 20 | 21 | A Packer config file is broken into a couple of sections: ``builders`` and ``provisioners``. 22 | 23 | A builder configures what type of image you are building (AWS AMI, GCP VM image, etc.). It describes the base 24 | image you are building on top of and connection information for Packer to connect to the build instance. 25 | 26 | When you run ``packer build /path/to/config.json``, a VM (or multiple VMs if you configure more than one) will be 27 | created automatically based on your ``builders`` config section. 28 | 29 | Once your build VM is up and running, the ``provisioners`` will be run. These are steps to configure and provision your 30 | machine. In the examples below we are mostly using the ``shell`` provisioner which will run commands on the VM to set things 31 | up. 32 | 33 | Once your provisioning scripts have completed, the VM will automatically stop, a snapshot will be taken and you will be provided 34 | with an ID which you can then use as a template in future runs of ``dask-cloudprovider``. 35 | 36 | Image Requirements 37 | ------------------ 38 | 39 | Each cluster manager that uses VMs will have specific requirements for the VM image. 40 | 41 | The AWS ``ECSCluster`` for example requires `ECS optimised AMIs `_. 42 | 43 | The VM cluster managers such as ``EC2Cluster`` and ``DropletCluster`` just require `Docker `_ to be installed (or `NVIDIA Docker `_ for GPU VM types). 44 | 45 | Examples 46 | -------- 47 | 48 | ``EC2Cluster`` with cloud-init 49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 50 | 51 | When any of the ``VMCluster`` based cluster managers, such as ``EC2Cluster``, launches a new default VM it uses the Ubuntu base image and installs all dependencies 52 | with `cloud-init `_. 53 | 54 | Instead of doing this every time we could use Packer to do this once, and then reuse that image every time. 55 | 56 | Each ``VMCluster`` cluster manager has a class method called ``get_cloud_init`` which takes the same keyword arguments as creating the object itself, but instead 57 | returns the cloud-init file that would be generated. 58 | 59 | .. code-block:: python 60 | 61 | from dask_cloudprovider.aws import EC2Cluster 62 | 63 | cloud_init_config = EC2Cluster.get_cloud_init( 64 | # Pass any kwargs here you would normally pass to ``EC2Cluster`` 65 | ) 66 | print(cloud_init_config) 67 | 68 | We should see some output like this. 69 | 70 | ..
code-block:: YAML 71 | 72 | #cloud-config 73 | 74 | packages: 75 | - apt-transport-https 76 | - ca-certificates 77 | - curl 78 | - gnupg-agent 79 | - software-properties-common 80 | 81 | # Enable ipv4 forwarding, required on CIS hardened machines 82 | write_files: 83 | - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf 84 | content: | 85 | net.ipv4.conf.all.forwarding=1 86 | 87 | # create the docker group 88 | groups: 89 | - docker 90 | 91 | # Add default auto created user to docker group 92 | system_info: 93 | default_user: 94 | groups: [docker] 95 | 96 | runcmd: 97 | 98 | # Install Docker 99 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - 100 | - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 101 | - apt-get update -y 102 | - apt-get install -y docker-ce docker-ce-cli containerd.io 103 | - systemctl start docker 104 | - systemctl enable docker 105 | 106 | # Run container 107 | - docker run --net=host daskdev/dask:latest dask-scheduler --version 108 | 109 | We should save this output somewhere for reference later. Let's refer to it as ``/path/to/cloud-init-config.yaml``. 110 | 111 | Next we need a Packer config file to build our image; let's refer to it as ``/path/to/config.json``. 112 | We will use the official Ubuntu 20.04 image and specify our cloud-init config file in the ``user_data_file`` option. 113 | 114 | Packer will not necessarily wait for our cloud-init config to finish executing before taking a snapshot, so we need to add a provisioner 115 | that will block until the cloud-init completes. 116 | 117 | .. code-block:: JSON 118 | 119 | { 120 | "builders": [ 121 | { 122 | "type": "amazon-ebs", 123 | "region": "eu-west-2", 124 | "source_ami_filter": { 125 | "filters": { 126 | "virtualization-type": "hvm", 127 | "name": "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*", 128 | "root-device-type": "ebs" 129 | }, 130 | "owners": [ 131 | "099720109477" 132 | ], 133 | "most_recent": true 134 | }, 135 | "instance_type": "t2.micro", 136 | "ssh_username": "ubuntu", 137 | "ami_name": "dask-cloudprovider {{timestamp}}", 138 | "user_data_file": "/path/to/cloud-init-config.yaml" 139 | } 140 | ], 141 | "provisioners": [ 142 | { 143 | "type": "shell", 144 | "inline": [ 145 | "echo 'Waiting for cloud-init'; while [ ! -f /var/lib/cloud/instance/boot-finished ]; do sleep 1; done; echo 'Done'" 146 | ] 147 | } 148 | ] 149 | } 150 | 151 | Then we can build our image with ``packer build /path/to/config.json``. 152 | 153 | .. code-block:: 154 | 155 | $ packer build /path/to/config.json 156 | amazon-ebs: output will be in this color. 157 | 158 | ==> amazon-ebs: Prevalidating any provided VPC information 159 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider 1600875672 160 | amazon-ebs: Found Image ID: ami-062c2b6de9e9c54d3 161 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b6c99-46b5-6002-3126-8dcb1696f969 162 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b6c9a-bd7d-8bb3-58a8-d983f0e95a96 163 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups... 164 | ==> amazon-ebs: Launching a source AWS instance... 165 | ==> amazon-ebs: Adding tags to source instance 166 | amazon-ebs: Adding tag: "Name": "Packer Builder" 167 | amazon-ebs: Instance ID: i-0531483be973d60d8 168 | ==> amazon-ebs: Waiting for instance (i-0531483be973d60d8) to become ready...
169 | ==> amazon-ebs: Using ssh communicator to connect: 18.133.244.42 170 | ==> amazon-ebs: Waiting for SSH to become available... 171 | ==> amazon-ebs: Connected to SSH! 172 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell512450076 173 | amazon-ebs: Waiting for cloud-init 174 | amazon-ebs: Done 175 | ==> amazon-ebs: Stopping the source instance... 176 | amazon-ebs: Stopping instance 177 | ==> amazon-ebs: Waiting for the instance to stop... 178 | ==> amazon-ebs: Creating AMI dask-cloudprovider 1600875672 from instance i-0531483be973d60d8 179 | amazon-ebs: AMI: ami-064f8db7634d19647 180 | ==> amazon-ebs: Waiting for AMI to become ready... 181 | ==> amazon-ebs: Terminating the source AWS instance... 182 | ==> amazon-ebs: Cleaning up any extra volumes... 183 | ==> amazon-ebs: No volumes to clean up, skipping 184 | ==> amazon-ebs: Deleting temporary security group... 185 | ==> amazon-ebs: Deleting temporary keypair... 186 | Build 'amazon-ebs' finished after 4 minutes 5 seconds. 187 | 188 | ==> Wait completed after 4 minutes 5 seconds 189 | 190 | ==> Builds finished. The artifacts of successful builds are: 191 | --> amazon-ebs: AMIs were created: 192 | eu-west-2: ami-064f8db7634d19647 193 | 194 | Then to use our new image we can create an ``EC2Cluster`` specifying the AMI and disabling the automatic bootstrapping. 195 | 196 | .. code-block:: python 197 | 198 | from dask.distributed import Client 199 | from dask_cloudprovider.aws import EC2Cluster 200 | 201 | cluster = EC2Cluster( 202 | ami="ami-064f8db7634d19647", # AMI ID provided by Packer 203 | bootstrap=False 204 | ) 205 | cluster.scale(2) 206 | 207 | client = Client(cluster) 208 | # Your cluster is ready to use 209 | 210 | ``EC2Cluster`` with RAPIDS 211 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 212 | 213 | To launch `RAPIDS `_ on AWS EC2 we can select a GPU instance type, choose the official Deep Learning AMIs that Amazon provides and run the official RAPIDS Docker image. 214 | 215 | .. code-block:: python 216 | 217 | from dask_cloudprovider.aws import EC2Cluster 218 | 219 | cluster = EC2Cluster( 220 | ami="ami-0c7c7d78f752f8f17", # Deep Learning AMI (this ID varies by region so find yours in the AWS Console) 221 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9", 222 | instance_type="p3.2xlarge", 223 | bootstrap=False, # Docker is already installed on the Deep Learning AMI 224 | filesystem_size=120, 225 | ) 226 | cluster.scale(2) 227 | 228 | However every time a VM is created by ``EC2Cluster`` the RAPIDS Docker image will need to be pulled from Docker Hub. 229 | The result is that the above snippet can take ~20 minutes to run, so let's create our own AMI which already has the RAPIDS image pulled. 230 | 231 | In our builders section we will specify we want to build on top of the latest Deep Learning AMI by specifying 232 | ``"Deep Learning AMI (Ubuntu 18.04) Version *"`` to list all versions and ``"most_recent": true`` to use the most recent. 233 | 234 | We also restrict the owners to ``898082745236`` which is the ID for the official image channel. 235 | 236 | The official image already has the NVIDIA drivers and NVIDIA Docker runtime installed so the only step we need to do is to 237 | pull the RAPIDS Docker image. That way when a scheduler or worker VM is created the image will already be available on the machine. 238 | 239 | .. 
code-block:: JSON 240 | 241 | { 242 | "builders": [ 243 | { 244 | "type": "amazon-ebs", 245 | "region": "eu-west-2", 246 | "source_ami_filter": { 247 | "filters": { 248 | "virtualization-type": "hvm", 249 | "name": "Deep Learning AMI (Ubuntu 18.04) Version *", 250 | "root-device-type": "ebs" 251 | }, 252 | "owners": [ 253 | "898082745236" 254 | ], 255 | "most_recent": true 256 | }, 257 | "instance_type": "p3.2xlarge", 258 | "ssh_username": "ubuntu", 259 | "ami_name": "dask-cloudprovider-rapids {{timestamp}}" 260 | } 261 | ], 262 | "provisioners": [ 263 | { 264 | "type": "shell", 265 | "inline": [ 266 | "docker pull rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9" 267 | ] 268 | } 269 | ] 270 | } 271 | 272 | Then we can build our image with ``packer build /path/to/config.json``. 273 | 274 | .. code-block:: 275 | 276 | $ packer build /path/to/config.json 277 | ==> amazon-ebs: Prevalidating any provided VPC information 278 | ==> amazon-ebs: Prevalidating AMI Name: dask-cloudprovider-gpu 1600868638 279 | amazon-ebs: Found Image ID: ami-0c7c7d78f752f8f17 280 | ==> amazon-ebs: Creating temporary keypair: packer_5f6b511e-d3a3-c607-559f-d466560cd23b 281 | ==> amazon-ebs: Creating temporary security group for this instance: packer_5f6b511f-8f62-cf98-ca54-5771f1423d2d 282 | ==> amazon-ebs: Authorizing access to port 22 from [0.0.0.0/0] in the temporary security groups... 283 | ==> amazon-ebs: Launching a source AWS instance... 284 | ==> amazon-ebs: Adding tags to source instance 285 | amazon-ebs: Adding tag: "Name": "Packer Builder" 286 | amazon-ebs: Instance ID: i-077f54ed4ae6bcc66 287 | ==> amazon-ebs: Waiting for instance (i-077f54ed4ae6bcc66) to become ready... 288 | ==> amazon-ebs: Using ssh communicator to connect: 52.56.96.165 289 | ==> amazon-ebs: Waiting for SSH to become available... 290 | ==> amazon-ebs: Connected to SSH! 291 | ==> amazon-ebs: Provisioning with shell script: /var/folders/0l/fmwbqvqn1tq96xf20rlz6xmm0000gp/T/packer-shell376445833 292 | amazon-ebs: Waiting for cloud-init 293 | amazon-ebs: Bootstrap complete 294 | ==> amazon-ebs: Stopping the source instance... 295 | amazon-ebs: Stopping instance 296 | ==> amazon-ebs: Waiting for the instance to stop... 297 | ==> amazon-ebs: Creating AMI dask-cloudprovider-gpu 1600868638 from instance i-077f54ed4ae6bcc66 298 | amazon-ebs: AMI: ami-04e5539cb82859e69 299 | ==> amazon-ebs: Waiting for AMI to become ready... 300 | ==> amazon-ebs: Terminating the source AWS instance... 301 | ==> amazon-ebs: Cleaning up any extra volumes... 302 | ==> amazon-ebs: No volumes to clean up, skipping 303 | ==> amazon-ebs: Deleting temporary security group... 304 | ==> amazon-ebs: Deleting temporary keypair... 305 | Build 'amazon-ebs' finished after 20 minutes 35 seconds. 306 | 307 | It took over 20 minutes to build this image, but now that we've done it once we can reuse the image in our RAPIDS powered Dask clusters. 308 | 309 | We can then run our code snippet again but this time it will take less than 5 minutes to get a running cluster. 310 | 311 | .. 
code-block:: python 312 | 313 | from dask.distributed import Client 314 | from dask_cloudprovider.aws import EC2Cluster 315 | 316 | cluster = EC2Cluster( 317 | ami="ami-04e5539cb82859e69", # AMI ID provided by Packer 318 | docker_image="rapidsai/rapidsai:cuda10.1-runtime-ubuntu18.04-py3.9", 319 | instance_type="p3.2xlarge", 320 | bootstrap=False, 321 | filesystem_size=120, 322 | ) 323 | cluster.scale(2) 324 | 325 | client = Client(cluster) 326 | # Your cluster is ready to use 327 | -------------------------------------------------------------------------------- /doc/source/releasing.rst: -------------------------------------------------------------------------------- 1 | Releasing 2 | ========= 3 | 4 | Releases are published automatically when a tag is pushed to GitHub. 5 | 6 | .. code-block:: bash 7 | 8 | # Set next version number 9 | export RELEASE=x.x.x 10 | 11 | # Create tags 12 | git commit --allow-empty -m "Release $RELEASE" 13 | git tag -a $RELEASE -m "Version $RELEASE" 14 | 15 | # Push 16 | git push upstream --tags -------------------------------------------------------------------------------- /doc/source/security.rst: -------------------------------------------------------------------------------- 1 | Security 2 | ======== 3 | 4 | Dask Cloudprovider aims to balance ease of use with security best practices. The two are not always compatible so this document aims to outline the compromises and decisions made in this library. 5 | 6 | Public Schedulers 7 | ----------------- 8 | 9 | For each cluster manager to work correctly it must be able to make a connection to the Dask scheduler on port ``8786``. 10 | In many cluster managers the default option is to expose the Dask scheduler and dashboard to the internet via a public IP address. 11 | This makes things quick and easy for new users to get up and running, but may pose a security risk long term. 12 | 13 | Many organisations have policies which do not allow users to assign public IP addresses or open ports. Our best practices 14 | advice is to use Dask Cloudprovider from within a cloud platform, either from a VM or a managed environment. Then disable public 15 | networking. For example: 16 | 17 | .. code-block:: python 18 | 19 | >>> import dask.config, dask_cloudprovider 20 | >>> dask.config.set({"cloudprovider.gcp.public_ingress": False}) 21 | 22 | See each cluster manager for configuration options. 23 | 24 | Authentication and encryption 25 | ----------------------------- 26 | 27 | Cluster managers such as :class:`dask_cloudprovider.aws.EC2Cluster`, :class:`dask_cloudprovider.azure.AzureVMCluster`, 28 | :class:`dask_cloudprovider.gcp.GCPCluster` and :class:`dask_cloudprovider.digitalocean.DropletCluster` enable certificate based authentication 29 | and encryption by default. 30 | 31 | When a cluster is launched with any of these cluster managers a set of temporary keys will be generated and distributed to the cluster nodes 32 | via their startup script. All communication between the client, scheduler and workers will then be encrypted and only clients and workers with 33 | valid certificates will be able to connect to the scheduler. 34 | 35 | You can also specify your own certificates using the :class:`distributed.security.Security` object. 36 | 37 | .. code-block:: python 38 | 39 | >>> from dask_cloudprovider.gcp import GCPCluster 40 | >>> from dask.distributed import Client 41 | >>> from distributed.security import Security 42 | >>> sec = Security(tls_ca_file='cluster_ca.pem', 43 | ... tls_client_cert='cli_cert.pem', 44 | ... 
tls_client_key='cli_key.pem', 45 | ... require_encryption=True) 46 | >>> cluster = GCPCluster(n_workers=1, security=sec) 47 | >>> client = Client(cluster) 48 | >>> client 49 | 50 | 51 | You can disable secure connections by setting the ``security`` keyword argument to ``False``. This may be desirable when troubleshooting or 52 | when running on a trusted network (entirely inside a VPC for example). 53 | -------------------------------------------------------------------------------- /doc/source/testing.rst: -------------------------------------------------------------------------------- 1 | Testing 2 | ======= 3 | 4 | Tests in ``dask-cloudprovider`` are written and run using ``pytest``. 5 | 6 | To set up your testing environment run: 7 | 8 | .. code-block:: bash 9 | 10 | pip install -r requirements_test.txt 11 | 12 | To run the tests, run ``pytest`` from the root directory: 13 | 14 | .. code-block:: bash 15 | 16 | pytest 17 | 18 | You may notice that many tests will be skipped. This is because those tests create external resources on cloud providers. You can set those tests to run with the 19 | ``--create-external-resources`` flag. 20 | 21 | .. warning:: 22 | 23 | Running tests that create external resources is slow and will cost a small amount of credit on each cloud provider. 24 | 25 | .. code-block:: bash 26 | 27 | pytest -rs --create-external-resources 28 | 29 | It is also helpful to set the ``-rs`` flag here because tests may also skip if you do not have appropriate credentials to create those external resources. 30 | If this is the case, the skip reason will contain instructions on how to set up those credentials. For example: 31 | 32 | .. code-block:: 33 | 34 | SKIPPED [1] dask_cloudprovider/azure/tests/test_azurevm.py:49: 35 | You must configure your Azure resource group and vnet to run this test. 36 | 37 | $ export DASK_CLOUDPROVIDER__AZURE__LOCATION="" 38 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__RESOURCE_GROUP="" 39 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__VNET="" 40 | $ export DASK_CLOUDPROVIDER__AZURE__AZUREVM__SECURITY_GROUP="" 41 | 42 | -------------------------------------------------------------------------------- /doc/source/troubleshooting.rst: -------------------------------------------------------------------------------- 1 | Troubleshooting 2 | =============== 3 | 4 | This document covers common troubleshooting problems. 5 | 6 | Unable to connect to scheduler 7 | ------------------------------ 8 | 9 | The most common issue is not being able to connect to the cluster once it has been constructed. 10 | 11 | Each cluster manager will construct a Dask scheduler and by default expose it via a public IP address. You must be able 12 | to connect to that address on ports ``8786`` and ``8787`` from wherever your Python session is. 13 | 14 | If you are unable to connect to this address it is likely that there is something wrong with your network configuration; 15 | for example, you may have corporate policies implementing additional firewall rules on your account. 16 | 17 | To reduce the chances of this happening it is often simplest to run Dask Cloudprovider from within the cloud you are trying 18 | to use and configure private networking only. See your specific cluster manager docs for info. 19 | 20 | Invalid CPU or Memory 21 | --------------------- 22 | 23 | When working with ``FargateCluster`` or ``ECSCluster``, CPU and memory arguments can only take values from a fixed set of combinations.
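For reference, here is a minimal sketch of a request that uses one of the accepted pairings (1 vCPU, i.e. ``1024`` CPU units, with 4 GB of memory). The exact numbers are an assumption here; always confirm the pairing against the AWS table linked at the end of this section.

.. code-block:: python

    from dask.distributed import Client
    from dask_cloudprovider.aws import FargateCluster

    # 1024 CPU units may be paired with 2-8 GB of memory on Fargate,
    # so this combination is accepted (values are illustrative).
    cluster = FargateCluster(
        image="daskdev/dask:latest",
        worker_cpu=1024,
        worker_mem=4096,
        n_workers=2,
    )
    client = Client(cluster)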
24 | 25 | By contrast, code like this requests a combination outside that set and will result in an error: 26 | 27 | .. code-block:: python 28 | 29 | from dask_cloudprovider.aws import FargateCluster 30 | cluster = FargateCluster( 31 | image="daskdev/dask:latest", 32 | worker_cpu=256, 33 | worker_mem=30720, 34 | n_workers=2, 35 | fargate_use_private_ip=False, 36 | scheduler_timeout="15 minutes" 37 | ) 38 | client = Client(cluster) 39 | cluster 40 | 41 | # botocore.errorfactory.ClientException: 42 | # An error occurred (ClientException) when calling the RegisterTaskDefinition operation: 43 | # No Fargate configuration exists for given values. 44 | 45 | 46 | This is because ECS and Fargate task definitions with ``CPU=256`` cannot have as much memory as that code is requesting. 47 | 48 | The AWS-accepted set of combinations is documented at 49 | https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html. 50 | 51 | Requested CPU Configuration Above Limit 52 | --------------------------------------- 53 | When creating a ``FargateCluster`` or ``ECSCluster``, or adding additional workers, you may receive an error response with 54 | "The requested CPU configuration is above your limit". This means that the scheduler and workers requested and any other 55 | EC2 resources you have running in that region use up more than your current service quota 56 | `limit for vCPUs `_. 57 | 58 | You can adjust the scheduler and/or worker CPUs with the ``scheduler_cpu`` and ``worker_cpu`` 59 | `arguments `_. See the "Invalid CPU or Memory" 60 | section in this document for more information. 61 | 62 | However, to get the desired cluster configuration you'll need to request a service limit quota increase. 63 | 64 | Go to ``https://.aws.amazon.com/servicequotas/home/services/ec2/quotas`` and 65 | `request an increase `_ for 66 | "Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances". 67 | 68 | Pulling private Docker images 69 | ----------------------------------- 70 | 71 | For cluster managers like ``EC2Cluster``, ``AzureVMCluster`` and ``GCPCluster``, Docker images will be pulled onto VMs created on the cloud of your choice. 72 | 73 | If you need to pull a private Docker image which requires authentication, each VM will need to be configured with credentials. These cluster managers accept 74 | an ``extra_bootstrap`` argument where you can provide additional bash commands to be run during startup. This is a good place to log into your Docker registry. 75 | 76 | .. code-block:: python 77 | 78 | from dask_cloudprovider.azure import AzureVMCluster 79 | cluster = AzureVMCluster(... 80 | docker_image="my_private_image:latest", 81 | extra_bootstrap=["docker login -u 'username' -p 'password'"]) 82 | 83 | If you need to access Artifact/Container Registry in GCP, one way of doing it would be to authenticate Docker with 84 | `gcloud credential helper `_ by adding extra bootstrap params similar to 85 | the ones below: 86 | 87 | .. code-block:: python 88 | 89 | from dask_cloudprovider.gcp import GCPCluster 90 | cluster = GCPCluster(...
91 | docker_image=f"{region}-docker.pkg.dev/{project}/{repo}/{image}:{tag}", 92 | extra_bootstrap=[f"gcloud auth configure-docker {region}-docker.pkg.dev"]) 93 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | asyncio_mode = auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp>=3.7.3 2 | dask>=2021.01.1 3 | distributed>=2021.01.1 4 | jinja2 5 | tornado>=5 -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-asyncio 3 | pytest-timeout 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # References: 3 | # https://flake8.readthedocs.io/en/latest/user/configuration.html 4 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html 5 | 6 | # Note: there cannot be spaces after comma's here 7 | exclude = __init__.py,versioneer.py,dask_cloudprovider/_version.py 8 | ignore = 9 | # Extra space in brackets 10 | E20, 11 | # Multiple spaces around "," 12 | E231,E241, 13 | # Comments 14 | E26, 15 | # Import formatting 16 | E4, 17 | # Comparing types instead of isinstance 18 | E721, 19 | # Assigning lambda expression 20 | E731, 21 | # continuation line under-indented for hanging indent 22 | E121, 23 | # continuation line over-indented for hanging indent 24 | E126, 25 | # continuation line over-indented for visual indent 26 | E127, 27 | # E128 continuation line under-indented for visual indent 28 | E128, 29 | # multiple statements on one line (semicolon) 30 | E702, 31 | # line break before binary operator 32 | W503, 33 | # visually indented line with same indent as next logical line 34 | E129, 35 | # unexpected indentation 36 | E116, 37 | # redefinition of unused 'loop' from line 10 38 | F811, 39 | # local variable is assigned to but never used 40 | F841, 41 | # Ambiguous variable names 42 | E741 43 | 44 | max-line-length = 120 45 | 46 | [versioneer] 47 | VCS = git 48 | style = pep440 49 | versionfile_source = dask_cloudprovider/_version.py 50 | versionfile_build = dask_cloudprovider/_version.py 51 | tag_prefix = 52 | parentdir_prefix = dask-cloudprovider- 53 | 54 | [tool:pytest] 55 | timeout = 300 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os.path import exists 4 | from setuptools import setup, find_packages 5 | 6 | import versioneer 7 | 8 | extras_require = { 9 | "aws": ["aiobotocore>=0.10.2"], 10 | "azure": [ 11 | "azure-mgmt-compute>=18.0.0", 12 | "azure-mgmt-network>=16.0.0", 13 | "azure-identity", 14 | ], 15 | "digitalocean": ["python-digitalocean>=1.15.0"], 16 | "gcp": ["google-api-python-client>=1.12.5", "google-auth>=1.23.0"], 17 | "hetzner": ["hcloud>=1.10.0"], 18 | "ibm": ["ibm_code_engine_sdk>=3.1.0", "kubernetes>=25.3.0"], 19 | "openstack": ["openstacksdk>=3.3.0"], 20 | "nebius": ["nebius>=0.2.0"], 21 | } 22 | extras_require["all"] = set(pkg for pkgs in extras_require.values() for pkg in pkgs) 23 | 24 | setup( 25 | 
name="dask-cloudprovider", 26 | cmdclass=versioneer.get_cmdclass(), 27 | version=versioneer.get_version(), 28 | description="Native Cloud Provider integration for Dask", 29 | url="https://github.com/dask/dask-cloudprovider", 30 | keywords="dask,cloud,distributed", 31 | license="BSD", 32 | packages=find_packages(), 33 | include_package_data=True, 34 | long_description=(open("README.rst").read() if exists("README.rst") else ""), 35 | long_description_content_type="text/x-rst", 36 | zip_safe=False, 37 | install_requires=list(open("requirements.txt").read().strip().split("\n")), 38 | extras_require=extras_require, 39 | entry_points=""" 40 | [console_scripts] 41 | dask-ecs=dask_cloudprovider.cli.ecs:go 42 | """, 43 | python_requires=">=3.10", 44 | ) 45 | --------------------------------------------------------------------------------