├── tests ├── __init__.py ├── e2e │ ├── rayjob │ │ └── __init__.py │ ├── mnist_pip_requirements.txt │ ├── install-codeflare-sdk.sh │ ├── start_ray_cluster.py │ ├── mnist_rayjob.py │ ├── mnist_sleep.py │ ├── heterogeneous_clusters_kind_test.py │ ├── local_interactive_sdk_oauth_test.py │ ├── heterogeneous_clusters_oauth_test.py │ ├── minio_deployment.yaml │ ├── mnist_raycluster_sdk_kind_test.py │ ├── mnist_raycluster_sdk_aw_kind_test.py │ ├── cluster_apply_kind_test.py │ └── local_interactive_sdk_kind_test.py ├── upgrade │ ├── __init__.py │ └── conftest.py ├── e2e_v2 │ ├── security │ │ ├── __init__.py │ │ ├── test_mtls.py │ │ └── test_network_policies.py │ ├── upgrade │ │ ├── __init__.py │ │ └── conftest.py │ ├── cluster_management │ │ ├── __init__.py │ │ ├── creation │ │ │ ├── __init__.py │ │ │ ├── test_cluster_kueue.py │ │ │ └── test_cluster_creation.py │ │ ├── configuration │ │ │ ├── __init__.py │ │ │ ├── test_advanced.py │ │ │ ├── test_images.py │ │ │ ├── test_resources.py │ │ │ ├── test_volumes.py │ │ │ └── test_heterogeneous.py │ │ └── interactive │ │ │ ├── __init__.py │ │ │ ├── test_remote.py │ │ │ └── test_in_cluster.py │ ├── job_submission │ │ ├── __init__.py │ │ ├── rayjob_client │ │ │ ├── __init__.py │ │ │ ├── test_remote.py │ │ │ └── test_in_cluster.py │ │ └── rayjob_cr │ │ │ ├── __init__.py │ │ │ └── test_lifecycled_cluster.py │ ├── kueue_integration │ │ ├── __init__.py │ │ ├── test_admission.py │ │ ├── test_queueing.py │ │ └── test_resource_flavors.py │ ├── utils │ │ ├── __init__.py │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ ├── cpu_script.py │ │ │ └── gpu_script.py │ │ └── in_cluster │ │ │ ├── __init__.py │ │ │ └── setup.py │ ├── __init__.py │ └── ui │ │ ├── __init__.py │ │ └── pages │ │ └── __init__.py ├── ui │ ├── __init__.py │ └── pages │ │ └── __init__.py ├── auth-test.crt └── test_cluster_yamls │ ├── appwrapper │ └── test-case-bad.yaml │ └── support_clusters │ ├── test-rc-b.yaml │ └── test-rc-a.yaml ├── .gitattributes ├── src └── codeflare_sdk │ 
├── vendored │ ├── python_client │ │ ├── utils │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── constants.py │ ├── .gitignore │ ├── __init__.py │ ├── python_client_test │ │ ├── README.md │ │ └── test_director.py │ ├── pyproject.toml │ └── examples │ │ ├── use-builder.py │ │ ├── use-director.py │ │ └── use-utils.py │ ├── ray │ ├── client │ │ └── __init__.py │ ├── appwrapper │ │ ├── __init__.py │ │ ├── status.py │ │ ├── test_awload.py │ │ ├── awload.py │ │ └── test_status.py │ ├── rayjobs │ │ ├── __init__.py │ │ ├── test │ │ │ └── conftest.py │ │ ├── status.py │ │ └── pretty_print.py │ ├── cluster │ │ ├── __init__.py │ │ ├── status.py │ │ ├── test_status.py │ │ └── test_build_ray_cluster.py │ └── __init__.py │ ├── common │ ├── widgets │ │ └── __init__.py │ ├── kueue │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── k8s_utils.py │ │ ├── demos.py │ │ ├── utils.py │ │ └── test_demos.py │ ├── __init__.py │ └── kubernetes_cluster │ │ ├── __init__.py │ │ └── kube_api_helpers.py │ └── __init__.py ├── assets └── images │ └── sdk-diagram.png ├── demo-notebooks ├── additional-demos │ ├── batch-inference │ │ ├── requirements.txt │ │ └── simple_batch_inf.py │ ├── requirements.txt │ └── remote_ray_job_client.ipynb └── guided-demos │ ├── requirements.txt │ ├── preview_nbs │ └── requirements.txt │ ├── notebook-ex-outputs │ └── requirements.txt │ ├── download_mnist_datasets.py │ ├── mnist_fashion.py │ ├── 5_submit_rayjob_cr.ipynb │ └── 3_widget_example.ipynb ├── docs ├── images │ ├── codeflare_sdk.png │ └── codeflare_stack_arch.png ├── sphinx │ ├── user-docs │ │ ├── images │ │ │ ├── ui-buttons.png │ │ │ └── ui-view-clusters.png │ │ ├── ui-widgets.rst │ │ ├── authentication.rst │ │ ├── s3-compatible-storage.rst │ │ ├── setup-kueue.rst │ │ └── ray-cluster-interaction.rst │ ├── Makefile │ ├── make.bat │ ├── index.rst │ └── conf.py └── generate-documentation.md ├── ui-tests ├── .yarnrc ├── jupyter_server_config.py ├── tests │ └── 
widget_notebook_example.test.ts-snapshots │ │ ├── widgets-cell-0-linux.png │ │ ├── widgets-cell-2-linux.png │ │ ├── widgets-cell-3-linux.png │ │ ├── widgets-cell-4-linux.png │ │ └── widgets-cell-5-linux.png ├── playwright.config.js └── package.json ├── images └── tests │ └── entrypoint.sh ├── .github ├── build │ ├── README.md │ └── Containerfile ├── workflows │ ├── pre-commit.yaml │ ├── snyk-security.yaml │ ├── dependabot-labeler.yaml │ ├── unit-tests.yml │ ├── publish-documentation.yaml │ ├── build-test-image.yaml │ ├── coverage-badge.yaml │ └── ui_notebooks_test.yaml ├── resources │ ├── wait_for_job_cell.json │ └── minio_remote_config_cell.json └── dependabot.yml ├── codecov.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Makefile ├── OWNERS ├── coverage.svg ├── CONTRIBUTING.md ├── README.md ├── target_users.md └── pyproject.toml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/rayjob/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/upgrade/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/upgrade/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/test_mtls.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_admission.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_queueing.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/test_network_policies.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-detectable=false 2 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_cr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/test_remote.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/test_remote.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_resource_flavors.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_advanced.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_images.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_resources.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_volumes.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/test_cluster_kueue.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/test_in_cluster.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/test_in_cluster.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_heterogeneous.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/test_cluster_creation.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_cr/test_lifecycled_cluster.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Utility modules for E2E tests 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray_jobs import RayJobClient 2 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Placeholder scripts for RayJob entrypoints 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from .widgets import ( 2 | view_clusters, 3 | ) 4 | -------------------------------------------------------------------------------- /tests/e2e_v2/__init__.py: -------------------------------------------------------------------------------- 1 | # E2E Test Suite v2 2 | # Restructured pytest-based E2E tests for CodeFlare SDK 3 | -------------------------------------------------------------------------------- /assets/images/sdk-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/assets/images/sdk-diagram.png -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/batch-inference/requirements.txt: 
-------------------------------------------------------------------------------- 1 | vllm 2 | transformers 3 | triton>=2.0.0 4 | torch>=2.0.0 5 | -------------------------------------------------------------------------------- /docs/images/codeflare_sdk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_sdk.png -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /docs/images/codeflare_stack_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_stack_arch.png -------------------------------------------------------------------------------- /ui-tests/.yarnrc: -------------------------------------------------------------------------------- 1 | disable-self-update-check true 2 | ignore-optional true 3 | network-timeout "300000" 4 | registry "https://registry.npmjs.org/" 5 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.9.5 2 | ray_lightning 3 | torchmetrics==0.9.1 4 | torchvision==0.19.0 5 | minio 6 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/preview_nbs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | 
torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-buttons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-buttons.png -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .awload import AWManager 2 | 3 | from .status import ( 4 | AppWrapperStatus, 5 | AppWrapper, 6 | ) 7 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-view-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-view-clusters.png -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kueue/__init__.py: -------------------------------------------------------------------------------- 1 | from .kueue import ( 2 | get_default_kueue_name, 3 | local_queue_exists, 4 | add_queue_label, 5 | list_local_queues, 6 | ) 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utilities for the CodeFlare SDK. 
3 | """ 4 | 5 | from .k8s_utils import get_current_namespace 6 | 7 | __all__ = ["get_current_namespace"] 8 | -------------------------------------------------------------------------------- /tests/e2e/mnist_pip_requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | torch==2.7.1+cu118 3 | torchvision==0.22.1+cu118 4 | pytorch_lightning==1.9.5 5 | torchmetrics==1.8.2 6 | minio 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/__init__.py: -------------------------------------------------------------------------------- 1 | from .rayjob import RayJob, ManagedClusterConfig 2 | from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo 3 | from .config import ManagedClusterConfig 4 | -------------------------------------------------------------------------------- /ui-tests/jupyter_server_config.py: -------------------------------------------------------------------------------- 1 | from jupyterlab.galata import configure_jupyter_server 2 | 3 | configure_jupyter_server(c) 4 | 5 | # Uncomment to set server log level to debug level 6 | # c.ServerApp.log_level = "DEBUG" 7 | -------------------------------------------------------------------------------- /images/tests/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Entrypoint script that handles -- separator in podman commands 3 | # Passes all arguments to run-tests.sh which will forward them to pytest 4 | 5 | exec /codeflare-sdk/run-tests.sh "$@" 6 | -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png -------------------------------------------------------------------------------- /.github/build/README.md: -------------------------------------------------------------------------------- 1 | # Pre-Commit Build Artifacts 2 | 3 | This directory 
contains the artifacts required to build the codeflare-sdk pre-commit image. 4 | 5 | To build the image run `podman build -f .github/build/Containerfile .` from the root directory. 6 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Importing everything from the kubernetes_cluster module 2 | from .kubernetes_cluster import ( 3 | Authentication, 4 | KubeConfiguration, 5 | TokenAuthentication, 6 | KubeConfigFileAuthentication, 7 | _kube_api_error_handling, 8 | ) 9 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .status import ( 2 | RayClusterStatus, 3 | CodeFlareClusterStatus, 4 | RayCluster, 5 | ) 6 | 7 | from .cluster import ( 8 | Cluster, 9 | ClusterConfiguration, 10 | get_cluster, 11 | list_all_queued, 12 | list_all_clusters, 13 | ) 14 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .auth import ( 2 | Authentication, 3 | KubeConfiguration, 4 | TokenAuthentication, 5 | KubeConfigFileAuthentication, 6 | config_check, 7 | get_api_client, 8 | ) 9 | 10 | from .kube_api_helpers import _kube_api_error_handling 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "**/*.ipynb" 3 | - "demo-notebooks/**" 4 | - "**/__init__.py" 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | status: 10 | project: 11 | default: 12 | target: auto 13 | threshold: 2.5% 14 | patch: 15 | default: 16 | target: 85% 17 | threshold: 2.5% 18 | 
-------------------------------------------------------------------------------- /tests/e2e/install-codeflare-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd .. 4 | 5 | # Install Poetry and configure virtualenvs 6 | pip install poetry 7 | poetry config virtualenvs.create false 8 | 9 | cd codeflare-sdk 10 | 11 | # Lock dependencies and install them 12 | poetry lock 13 | poetry install --with test,docs 14 | 15 | # Return to the workdir 16 | cd .. 17 | cd workdir 18 | -------------------------------------------------------------------------------- /ui-tests/playwright.config.js: -------------------------------------------------------------------------------- 1 | const baseConfig = require('@jupyterlab/galata/lib/playwright-config'); 2 | 3 | module.exports = { 4 | ...baseConfig, 5 | timeout: 600000, 6 | webServer: { 7 | command: 'yarn start', 8 | url: 'http://localhost:8888/lab', 9 | timeout: 120 * 1000, 10 | reuseExistingServer: !process.env.CI, 11 | }, 12 | retries: 0, 13 | }; 14 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/constants.py: -------------------------------------------------------------------------------- 1 | # Declares the constants that are used by the client 2 | import logging 3 | 4 | # Group, Version, Plural 5 | GROUP = "ray.io" 6 | CLUSTER_VERSION = "v1" 7 | JOB_VERSION = "v1" 8 | CLUSTER_PLURAL = "rayclusters" 9 | JOB_PLURAL = "rayjobs" 10 | CLUSTER_KIND = "RayCluster" 11 | JOB_KIND = "RayJob" 12 | # log level 13 | LOGLEVEL = logging.INFO 14 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | name: Pre-commit 2 | on: 3 | pull_request: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | precommit: 8 | runs-on: ubuntu-latest 9 | container: 10 | image: 
quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.1 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Run pre-commit checks 15 | run: pre-commit run --all-files 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | .python-version 3 | __pycache__/ 4 | .coverage 5 | Pipfile 6 | Pipfile.lock 7 | .venv* 8 | build/ 9 | tls-cluster-namespace 10 | quicktest.yaml 11 | node_modules 12 | .DS_Store 13 | ui-tests/playwright-report 14 | ui-tests/test-results 15 | /src/codeflare_sdk.egg-info/ 16 | docs/sphinx/_build 17 | docs/sphinx/codeflare_sdk.*.rst 18 | docs/sphinx/codeflare_sdk.rst 19 | docs/sphinx/modules.rst 20 | .idea/ 21 | .cursor/plans/ 22 | .cursor/commands/ 23 | /results 24 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | 10 | # Distribution / packaging 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | .tox/ 31 | htmlcov 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - 
id: end-of-file-fixer 9 | - id: check-yaml 10 | args: [--allow-multiple-documents] 11 | - id: check-added-large-files 12 | - repo: https://github.com/psf/black 13 | rev: 23.3.0 14 | hooks: 15 | - id: black 16 | language_version: python3.9 17 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/__init__.py: -------------------------------------------------------------------------------- 1 | from .appwrapper import AppWrapper, AppWrapperStatus, AWManager 2 | 3 | from .client import ( 4 | RayJobClient, 5 | ) 6 | 7 | from .rayjobs import ( 8 | RayJob, 9 | ManagedClusterConfig, 10 | RayJobDeploymentStatus, 11 | CodeflareRayJobStatus, 12 | RayJobInfo, 13 | ) 14 | 15 | from .cluster import ( 16 | Cluster, 17 | ClusterConfiguration, 18 | get_cluster, 19 | list_all_queued, 20 | list_all_clusters, 21 | RayClusterStatus, 22 | CodeFlareClusterStatus, 23 | RayCluster, 24 | ) 25 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vendored third-party dependencies. 3 | 4 | This directory contains code from external projects that are bundled 5 | with codeflare-sdk to avoid PyPI publishing restrictions. 6 | 7 | Contents: 8 | - python_client: KubeRay Python client from ray-project/kuberay 9 | Source: https://github.com/ray-project/kuberay @ b2fd91b58c2bbe22f9b4f730c5a8f3180c05e570 10 | License: Apache 2.0 (see LICENSE file) 11 | 12 | Vendored because the python-client is not published to PyPI and PyPI 13 | does not allow direct git dependencies. 
14 | """ 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Image tag for image containing e2e tests 2 | E2E_TEST_IMAGE_VERSION ?= latest 3 | E2E_TEST_IMAGE ?= quay.io/opendatahub/codeflare-sdk-tests:${E2E_TEST_IMAGE_VERSION} 4 | 5 | # Build the test image 6 | .PHONY: build-test-image 7 | build-test-image: 8 | @echo "Building test image: $(E2E_TEST_IMAGE)" 9 | # Build the Docker image using podman 10 | podman build -f images/tests/Dockerfile -t $(E2E_TEST_IMAGE) . 11 | 12 | # Push the test image 13 | .PHONY: push-test-image 14 | push-test-image: 15 | @echo "Pushing test image: $(E2E_TEST_IMAGE)" 16 | podman push $(E2E_TEST_IMAGE) 17 | -------------------------------------------------------------------------------- /tests/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/e2e_v2/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/ui/pages/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/e2e_v2/ui/pages/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - astefanutti 3 | - Bobbins228 4 | - CathalOConnorRH 5 | - chipspeak 6 | - ChristianZaccaria 7 | - dimakis 8 | - Fiona-Waters 9 | - franciscojavierarceo 10 | - kpostoffice 11 | - kryanbeane 12 | - laurafitzgerald 13 | - pawelpaszki 14 | - pmccarthy 15 | - szaher 16 | - varshaprasad96 17 | reviewers: 18 | - astefanutti 19 | - Bobbins228 20 | - CathalOConnorRH 21 | - chipspeak 22 | - ChristianZaccaria 23 | - dimakis 24 | - Fiona-Waters 25 | - franciscojavierarceo 26 | - kpostoffice 27 | - kryanbeane 28 | - laurafitzgerald 29 | - pawelpaszki 30 | - pmccarthy 31 | - szaher 32 | - varshaprasad96 33 | - Ygnas 34 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/constants.py: -------------------------------------------------------------------------------- 1 | RAY_VERSION = "2.52.1" 2 | """ 3 | The below are used to define the default runtime image for the Ray Cluster. 
4 | * For python 3.11:ray:2.52.1-py311-cu121 5 | * For python 3.12:ray:2.52.1-py312-cu128 6 | """ 7 | CUDA_PY311_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:595b3acd10244e33fca1ed5469dccb08df66f470df55ae196f80e56edf35ad5a" 8 | CUDA_PY312_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:6b135421b6e756593a58b4df6664f82fc4b55237ca81475f2867518f15fe6d84" 9 | 10 | # Centralized image selection 11 | SUPPORTED_PYTHON_VERSIONS = { 12 | "3.11": CUDA_PY311_RUNTIME_IMAGE, 13 | "3.12": CUDA_PY312_RUNTIME_IMAGE, 14 | } 15 | MOUNT_PATH = "/home/ray/files" 16 | -------------------------------------------------------------------------------- /.github/resources/wait_for_job_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "from time import sleep\n", 8 | "\n", 9 | "finished = False\n", 10 | "while not finished:\n", 11 | " sleep(5)\n", 12 | " status = client.get_job_status(submission_id)\n", 13 | " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n", 14 | " print(status)\n", 15 | "print(\"Job status \" + status)\n", 16 | "print(\"Logs: \")\n", 17 | "print(client.get_job_logs(submission_id))\n", 18 | "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\"" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /docs/sphinx/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/build/Containerfile: -------------------------------------------------------------------------------- 1 | FROM registry.redhat.io/ubi9/python-39:latest 2 | 3 | LABEL summary="Toolchain for running pre-commit hooks." \ 4 | description="Toolchain for running pre-commit hooks" \ 5 | io.k8s.display-name="Pre-Commit Toolchain" 6 | 7 | USER root 8 | RUN dnf install nodejs -y && \ 9 | dnf clean all && \ 10 | rm -rf /var/cache/dnf 11 | ADD https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz $TMPDIR/ 12 | RUN tar -C /usr/local/bin -xvf $TMPDIR/oc.tar.gz && \ 13 | chmod +x /usr/local/bin/oc && \ 14 | rm $TMPDIR/oc.tar.gz 15 | USER $USERID 16 | 17 | RUN pip3 install poetry && \ 18 | poetry config virtualenvs.create false 19 | COPY pyproject.toml ./ 20 | RUN poetry install 21 | 22 | CMD bash 23 | -------------------------------------------------------------------------------- /.github/resources/minio_remote_config_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "@ray.remote\n", 8 | "def get_minio_run_config():\n", 9 | " import s3fs\n", 10 | " import pyarrow\n", 11 | " s3_fs = s3fs.S3FileSystem(\n", 12 | " key = \"minio\",\n", 13 | " secret = \"minio123\",\n", 14 | " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n", 15 | " )\n", 16 | " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n", 17 | " 
run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n", 18 | " return run_config" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client_test/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ## For developers 4 | 5 | 1. `pip install -U pip setuptools` 6 | 1. `cd clients/python-client && pip install -e .` 7 | 8 | Uninstall with `pip uninstall python-client`. 9 | 10 | ## For testing run 11 | 12 | `python -m unittest discover 'clients/python-client/python_client_test/'` 13 | 14 | ### Coverage report 15 | 16 | #### Pre-requisites 17 | 18 | * `sudo apt install libsqlite3-dev` 19 | * `pyenv install 3.6.5` # or your Python version 20 | * `pip install db-sqlite3 coverage` 21 | 22 | __To gather data__ 23 | `python -m coverage run -m unittest` 24 | 25 | __to generate a coverage report__ 26 | `python -m coverage report` 27 | 28 | __to generate the test coverage report in HTML format__ 29 | `python -m coverage html` 30 | -------------------------------------------------------------------------------- /docs/generate-documentation.md: -------------------------------------------------------------------------------- 1 | # Generate CodeFlare Documentation with Sphinx 2 | The following is a short guide on how you can use Sphinx to auto-generate code documentation. Documentation for the latest SDK release can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html). 3 | 4 | 1. Clone the CodeFlare SDK 5 | ``` bash 6 | git clone https://github.com/project-codeflare/codeflare-sdk.git 7 | ``` 8 | 2. [Install Sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html) 9 | 3. 
Run the below command to generate code documentation 10 | ``` bash 11 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generates RST files 12 | make html -C docs/sphinx # Builds HTML files 13 | ``` 14 | 4. You can access the docs locally at `docs/sphinx/_build/html/index.html` 15 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "python-client" 3 | version = "0.0.0-dev" 4 | description = "Python Client for Kuberay" 5 | license = "Apache-2.0" 6 | 7 | readme = "README.md" 8 | repository = "https://github.com/ray-project/kuberay" 9 | homepage = "https://github.com/ray-project/kuberay" 10 | keywords = ["kuberay", "python", "client"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: OS Independent" 15 | ] 16 | packages = [ 17 | { include = "python_client" } 18 | ] 19 | 20 | [tool.poetry.dependencies] 21 | python = "^3.11" 22 | kubernetes = ">=25.0.0" 23 | 24 | [build-system] 25 | requires = ["poetry-core>=1.0.0"] 26 | build-backend = "poetry.core.masonry.api" 27 | -------------------------------------------------------------------------------- /ui-tests/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@jupyter-widgets/ui-tests", 3 | "private": true, 4 | "version": "0.1.0", 5 | "description": "ipywidgets UI Tests", 6 | "scripts": { 7 | "start": "jupyter lab --config ./jupyter_server_config.py", 8 | "start:detached": "jlpm start&", 9 | "test": "npx playwright test", 10 | "test:debug": "PWDEBUG=1 npx playwright test", 11 | "test:report": "http-server ./playwright-report -a localhost -o", 12 | "test:update": "npx playwright test --update-snapshots", 13 | "deduplicate": "jlpm && yarn-deduplicate -s fewer 
--fail" 14 | }, 15 | "author": "Project Jupyter", 16 | "license": "BSD-3-Clause", 17 | "devDependencies": { 18 | "@jupyterlab/galata": "^5.3.0", 19 | "@playwright/test": "^1.57.0", 20 | "yarn-deduplicate": "^6.0.1" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/snyk-security.yaml: -------------------------------------------------------------------------------- 1 | name: Snyk Security 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | snyk-scan: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | - name: Install Snyk CLI 15 | run: npm install -g snyk 16 | 17 | - name: Snyk Monitor and Test multiple projects 18 | env: 19 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 20 | SNYK_ORG: ${{ secrets.SNYK_ORG }} 21 | run: | 22 | echo "Fetching tags" 23 | git fetch origin 'refs/tags/*:refs/tags/*' 24 | 25 | echo "Authenticating with Snyk" 26 | snyk auth ${SNYK_TOKEN} 27 | 28 | echo "Scanning project: codeflare-sdk/main" 29 | snyk monitor --all-projects --exclude=requirements.txt --org=${SNYK_ORG} --target-reference="main" 30 | -------------------------------------------------------------------------------- /tests/e2e/start_ray_cluster.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from time import sleep 5 | 6 | from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration 7 | 8 | namespace = sys.argv[1] 9 | ray_image = os.getenv("RAY_IMAGE") 10 | 11 | cluster = Cluster( 12 | ClusterConfiguration( 13 | name="mnist", 14 | namespace=namespace, 15 | num_workers=1, 16 | head_cpu_requests="500m", 17 | head_cpu_limits="500m", 18 | head_memory_requests=2, 19 | head_memory_limits=2, 20 | worker_cpu_requests="500m", 21 | worker_cpu_limits=1, 22 | worker_memory_requests=1, 23 | worker_memory_limits=2, 24 | image=ray_image, 25 | appwrapper=True, 26 | ) 27 | ) 28 | 29 | 
cluster.apply() 30 | 31 | cluster.status() 32 | 33 | cluster.wait_ready() 34 | 35 | cluster.status() 36 | 37 | cluster.details() 38 | -------------------------------------------------------------------------------- /docs/sphinx/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 94% 19 | 94% 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/codeflare_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray import ( 2 | Cluster, 3 | ClusterConfiguration, 4 | RayClusterStatus, 5 | CodeFlareClusterStatus, 6 | RayCluster, 7 | get_cluster, 8 | list_all_queued, 9 | list_all_clusters, 10 | AWManager, 11 | AppWrapperStatus, 12 | RayJobClient, 13 | RayJob, 14 | 
ManagedClusterConfig, 15 | ) 16 | 17 | from .common.widgets import view_clusters 18 | 19 | from .common import ( 20 | Authentication, 21 | KubeConfiguration, 22 | TokenAuthentication, 23 | KubeConfigFileAuthentication, 24 | ) 25 | 26 | from .common.kueue import ( 27 | list_local_queues, 28 | ) 29 | 30 | from .common.utils import generate_cert 31 | from .common.utils.demos import copy_demo_nbs 32 | 33 | from importlib.metadata import version, PackageNotFoundError 34 | 35 | try: 36 | __version__ = version("codeflare-sdk") # use metadata associated with built package 37 | 38 | except PackageNotFoundError: 39 | __version__ = "v0.0.0" 40 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-labeler.yaml: -------------------------------------------------------------------------------- 1 | # This workflow file adds the 'lgtm' and 'approved' labels to Dependabot PRs 2 | # This is done to ensure that the PRs that pass required status checks are automatically merged by the CodeFlare bot 3 | name: Dependabot Labeler 4 | 5 | on: 6 | pull_request_target: 7 | branches: [ main ] 8 | 9 | jobs: 10 | add-approve-lgtm-label: 11 | if: ${{ github.actor == 'dependabot[bot]' && contains(github.event.pull_request.labels.*.name, 'dependabot') }} 12 | runs-on: ubuntu-latest 13 | 14 | # Permission required to edit a PR 15 | permissions: 16 | pull-requests: write 17 | issues: write 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v4 22 | 23 | - name: Add approve and lgtm labels to Dependabot PR 24 | run: | 25 | gh pr edit ${{ github.event.pull_request.number }} --add-label "lgtm" --add-label "approved" 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GH_CLI_TOKEN }} 28 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/k8s_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubernetes utility functions for 
def get_current_namespace():  # pragma: no cover
    """
    Retrieves the current Kubernetes namespace.

    Resolution order:
      1. The in-cluster service-account namespace file (when running in a pod).
      2. The ``namespace`` field of the active context in the local kubeconfig.

    Returns:
        str:
            The current namespace, or None if not found.
    """
    ns_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
    if os.path.isfile(ns_file):
        try:
            # Use a context manager so the file handle is always closed
            # (the previous implementation leaked the handle on success).
            with open(ns_file, "r") as file:
                return file.readline().strip("\n")
        except Exception:
            print("Unable to find current namespace")
            print("trying to gather from current context")
    # Fall back to the kubeconfig's active context.
    try:
        _, active_context = config.list_kube_config_contexts(config_check())
    except Exception as e:
        return _kube_api_error_handling(e)
    try:
        return active_context["context"]["namespace"]
    except KeyError:
        # Active context has no namespace set.
        return None
7 | """ 8 | 9 | from .rbac import ( 10 | create_test_service_account, 11 | create_rayjob_rbac, 12 | delete_test_service_account, 13 | ) 14 | from .setup import ( 15 | setup_in_cluster_test_environment, 16 | cleanup_in_cluster_test_environment, 17 | ) 18 | from .pod_execution import ( 19 | PodExecutionResult, 20 | create_test_pod, 21 | create_sdk_test_pod, 22 | run_code_in_pod, 23 | wait_for_pod_completion, 24 | get_pod_logs, 25 | delete_test_pod, 26 | cleanup_test_pods, 27 | ) 28 | 29 | __all__ = [ 30 | "create_test_service_account", 31 | "create_rayjob_rbac", 32 | "delete_test_service_account", 33 | "setup_in_cluster_test_environment", 34 | "cleanup_in_cluster_test_environment", 35 | "PodExecutionResult", 36 | "create_test_pod", 37 | "create_sdk_test_pod", 38 | "run_code_in_pod", 39 | "wait_for_pod_completion", 40 | "get_pod_logs", 41 | "delete_test_pod", 42 | "cleanup_test_pods", 43 | ] 44 | -------------------------------------------------------------------------------- /tests/auth-test.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDOTCCAiGgAwIBAgIUENjaZDrvhc5uV3j7GI8deZJwc+YwDQYJKoZIhvcNAQEL 3 | BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM 4 | GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDAeFw0yNDA1MTMxMTE1NDZaFw0yNTA1 5 | MTMxMTE1NDZaMEUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw 6 | HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQwggEiMA0GCSqGSIb3DQEB 7 | AQUAA4IBDwAwggEKAoIBAQDEYYk81jvPijZXXeI9cByf5EIbOVaBTH7I51J9EKG5 8 | Y/KRXI43WgvVEiZ3jP8LJnSD79WhBiL6TgadQZje5ndroRYDM9vyqz1OUZapnOO+ 9 | yzl01y/qSsH8Kn88eLAzkE9HSu4QN9PuJtySyksjDFQJ6kjyE8ZHUSorur0FlLLf 10 | IToFgTuaIPDYjvFRchOCfZ7sV/MF7LxqFfFnaWOYvH41ZdvqJiRcVsMi+mYs9/I/ 11 | I72IMXwVnQDVnK8H84ntEmHNN6NoVuMKla0So4/wKcHJSCgS3axLI2Ka2aaaJo9K 12 | l2cn21NOyodF+DaSFy7qaGRXxoTQ2k9tUrSvxkBJvRmBAgMBAAGjITAfMB0GA1Ud 13 | DgQWBBRTK8mO5XMcmR+Xg/PVNFnvz4eubDANBgkqhkiG9w0BAQsFAAOCAQEAlZva 14 | 
6ws3zRff7u0tWT2JJaE1uPqsuAdHtVvEyAMp2QvYfyrgADTroUTaSU4p6ppX/t7v 15 | ynHhuzR6UOVkuY0/CH1P3UUGrEPNOXT8i2BDwL+j4y2K2aRN8zU0Nu/IVePBhu+4 16 | Jdt+3P7/MuwiCON5JukgxUYlQKhVhzFj7GOd2+Ca+fh8Siq3tkWDSN54+90fgylQ 17 | +74Yfya1NVabpzLqP3Isqu2XQhEVaBFvj8Yu0h83e3D8LeQToC3mVMF4yy5BZ9Ty 18 | K66YGlGQgszWEUFPEdsB8Dj/iJMhkWXuyc3u/w0s3t7rXeMYYgr+xrEeK+g0oyB5 19 | xeZuMjd567Znmu5oMw== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main, ray-jobs-feature ] 6 | push: 7 | branches: [ main, ray-jobs-feature ] 8 | 9 | jobs: 10 | unit-tests: 11 | 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.11' 20 | - name: Install poetry 21 | run: pip install poetry 22 | - name: Install dependencies with poetry 23 | run: | 24 | poetry config virtualenvs.create false 25 | poetry lock 26 | poetry install --with test 27 | - name: Test with pytest and check coverage 28 | run: | 29 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest 30 | coverage=$(coverage report -m | tail -1 | tail -c 4 | head -c 2) 31 | if (( $coverage < 90 )); then echo "Coverage failed at ${coverage}%"; exit 1; else echo "Coverage passed, ${coverage}%"; fi 32 | - name: Upload to Codecov 33 | uses: codecov/codecov-action@v4 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/demos.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | 4 | package_dir = 
def copy_demo_nbs(dir: str = "./demo-notebooks", overwrite: bool = False):
    """
    Copy the demo notebooks bundled with the package into ``dir``.

    With ``overwrite=True``, files in the target directory whose paths collide
    with the bundled notebooks are overwritten; any other files already in the
    directory are left untouched.

    Args:
        dir (str):
            The directory to copy the demo notebooks to. Defaults to
            "./demo-notebooks".
        overwrite (bool):
            Whether to copy into (and overwrite matching files in) a directory
            that already exists. Defaults to False.

    Raises:
        FileExistsError:
            If the target directory already exists and ``overwrite`` is False.
    """
    # Refuse to touch an existing directory unless the caller opted in.
    if not overwrite and pathlib.Path(dir).exists():
        raise FileExistsError(
            f"Directory {dir} already exists. Please remove it or provide a different location."
        )

    # dirs_exist_ok=True allows merging into an existing tree when overwrite=True.
    shutil.copytree(demo_dir, dir, dirs_exist_ok=True)
toctree:: 19 | :maxdepth: 1 20 | :caption: User Documentation: 21 | 22 | user-docs/authentication 23 | user-docs/cluster-configuration 24 | user-docs/ray-cluster-interaction 25 | user-docs/e2e 26 | user-docs/s3-compatible-storage 27 | user-docs/setup-kueue 28 | user-docs/ui-widgets 29 | 30 | Quick Links 31 | =========== 32 | - `PyPi `__ 33 | - `GitHub `__ 34 | - `OpenShift AI Documentation `__ 35 | -------------------------------------------------------------------------------- /docs/sphinx/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert(0, os.path.abspath("..")) 10 | 11 | # -- Project information ----------------------------------------------------- 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 13 | 14 | project = "CodeFlare SDK" 15 | copyright = "2024, Project CodeFlare" 16 | author = "Project CodeFlare" 17 | release = "v0.21.1" 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = [ 23 | "sphinx.ext.autodoc", 24 | "sphinx.ext.todo", 25 | "sphinx.ext.viewcode", 26 | "sphinx.ext.autosummary", 27 | "sphinx_rtd_theme", 28 | ] 29 | 30 | templates_path = ["_templates"] 31 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 32 | 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = "sphinx_rtd_theme" 38 | html_static_path = ["_static"] 39 | 
# E2E driver: submit the MNIST training script as a Ray job against the
# existing "mnist" cluster and fail the process if the job does not succeed.
# Usage: python mnist_rayjob.py <namespace>
import sys

from time import sleep

from support import *

from codeflare_sdk.ray.cluster.cluster import get_cluster
from codeflare_sdk.ray.client import RayJobClient

namespace = sys.argv[1]

cluster = get_cluster("mnist", namespace)

cluster.details()

# Authenticate the dashboard client with the current OpenShift user's token.
auth_token = run_oc_command(["whoami", "--show-token=true"])
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)

# Submit the job
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={"working_dir": "/", "pip": "requirements.txt"},
)
print(f"Submitted job with ID: {submission_id}")

# Poll every 5s until the job reaches a terminal state, up to `timeout` seconds.
# (The previous version carried a `done` flag that was never set — the loop
# only ever exited via `break` — and shadowed readability with `time`.)
elapsed = 0
timeout = 900
while True:
    status = client.get_job_status(submission_id)
    if status.is_terminal():
        break
    print(status)
    if elapsed >= timeout:
        raise TimeoutError(f"job has timed out after waiting {timeout}s")
    sleep(5)
    elapsed += 5

logs = client.get_job_logs(submission_id)
print(logs)

# Clean up the job record and tear the cluster down before reporting.
client.delete_job(submission_id)
cluster.down()

# Non-zero exit code signals failure to the calling test harness.
sys.exit(0 if status == "SUCCEEDED" else 1)
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The status sub-module defines Enums containing information for
AppWrapper states, as well as dataclasses to store information for AppWrappers.
"""

from dataclasses import dataclass
from enum import Enum


class AppWrapperStatus(Enum):
    """
    Defines the possible reportable phases of an AppWrapper.
    """

    # NOTE(review): values appear to be the lower-cased phase strings of the
    # AppWrapper custom resource — confirm against the AppWrapper controller.
    SUSPENDED = "suspended"
    RESUMING = "resuming"
    RUNNING = "running"
    RESETTING = "resetting"
    SUSPENDING = "suspending"
    SUCCEEDED = "succeeded"
    FAILED = "failed"
    TERMINATING = "terminating"


@dataclass
class AppWrapper:
    """
    For storing information about an AppWrapper.
    """

    # Name of the AppWrapper resource.
    name: str
    # Current reported phase of the AppWrapper.
    status: AppWrapperStatus
7 | - package-ecosystem: "pip" 8 | directories: 9 | - "**/demo-notebooks/guided-demos*" 10 | - "/tests/e2e" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "*" 15 | update-types: ["version-update:semver-patch"] 16 | open-pull-requests-limit: 1 17 | labels: 18 | - "dependabot" 19 | - "test-guided-notebooks" 20 | 21 | # pip means poetry in this case, this keeps poetry.lock up to date with constraints in pyproject.toml. 22 | - package-ecosystem: "pip" 23 | directory: "/" 24 | schedule: 25 | interval: "daily" 26 | ignore: 27 | - dependency-name: "*" 28 | update-types: ["version-update:semver-patch"] 29 | open-pull-requests-limit: 1 30 | labels: 31 | - "dependabot" 32 | - "test-guided-notebooks" 33 | 34 | # npm means yarn in this case, this keeps yarn.lock up to date with constraints in package.json. 35 | - package-ecosystem: "npm" 36 | directory: "/ui-tests" 37 | schedule: 38 | interval: "daily" 39 | ignore: 40 | - dependency-name: "*" 41 | update-types: ["version-update:semver-patch"] 42 | open-pull-requests-limit: 1 43 | labels: 44 | - "dependabot" 45 | - "test-ui-notebooks" 46 | -------------------------------------------------------------------------------- /.github/workflows/publish-documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | codeflare_sdk_release_version: 7 | type: string 8 | required: true 9 | description: 'Version number (for example: 0.1.0)' 10 | 11 | permissions: 12 | contents: write 13 | 14 | jobs: 15 | docs: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Install Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: 3.11 23 | - name: Install Sphinx 24 | run: | 25 | sudo apt-get update 26 | sudo apt-get install python3-sphinx 27 | - name: Install Poetry 28 | uses: abatilo/actions-poetry@v2 29 | with: 30 | poetry-version: 1.8.3 31 | - 
name: Create new documentation 32 | run: | 33 | python3 -m venv .venv 34 | source .venv/bin/activate 35 | poetry install --with docs 36 | sed -i 's/release = "v[0-9]\+\.[0-9]\+\.[0-9]\+"/release = "${{ github.event.inputs.codeflare_sdk_release_version }}"/' docs/sphinx/conf.py 37 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generate docs but ignore test files 38 | make html -C docs/sphinx 39 | - name: Deploy to GitHub Pages 40 | uses: peaceiris/actions-gh-pages@v3 41 | with: 42 | publish_branch: gh-pages 43 | github_token: ${{ secrets.GITHUB_TOKEN }} 44 | publish_dir: docs/sphinx/_build/html 45 | force_orphan: true 46 | -------------------------------------------------------------------------------- /.github/workflows/build-test-image.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Push Test Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | inputs: 9 | E2E_TEST_IMAGE_VERSION: 10 | description: 'Tag for the test image (defaults to latest)' 11 | required: false 12 | default: 'latest' 13 | type: string 14 | BRANCH: 15 | description: 'Branch to build from (defaults to main)' 16 | required: false 17 | default: 'main' 18 | type: string 19 | 20 | jobs: 21 | build-and-push: 22 | runs-on: ubuntu-latest 23 | env: 24 | E2E_TEST_IMAGE_VERSION: ${{ github.event.inputs.E2E_TEST_IMAGE_VERSION || 'latest' }} 25 | steps: 26 | - name: Checkout code 27 | uses: actions/checkout@v5 28 | with: 29 | ref: ${{ github.event.inputs.BRANCH || 'main' }} 30 | submodules: recursive 31 | 32 | - name: Login to Quay.io 33 | id: podman-login-quay 34 | env: 35 | QUAY_USERNAME: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_USERNAME }} 36 | QUAY_PASSWORD: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_PASSWORD }} 37 | run: | 38 | set -euo pipefail 39 | printf '%s' "$QUAY_PASSWORD" | podman login --username "$QUAY_USERNAME" --password-stdin quay.io 40 | 41 | - name: Build test image 42 | run: make 
build-test-image 43 | 44 | - name: Push test image 45 | run: make push-test-image 46 | 47 | - name: Logout from Quay.io 48 | if: always() && steps.podman-login-quay.outcome == 'success' 49 | run: podman logout quay.io 50 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/download_mnist_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 
15 | import os
16 | from torchvision.datasets import MNIST
17 | from torchvision import transforms
18 | 
19 | 
20 | def download_mnist_dataset(destination_dir):
21 |     # Ensure the destination directory exists
22 |     if not os.path.exists(destination_dir):
23 |         os.makedirs(destination_dir)
24 | 
25 |     # Define transformations
26 |     transform = transforms.Compose(
27 |         [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
28 |     )
29 | 
30 |     # Download the training data
31 |     train_set = MNIST(
32 |         root=destination_dir, train=True, download=True, transform=transform
33 |     )
34 | 
35 |     # Download the test data
36 |     test_set = MNIST(
37 |         root=destination_dir, train=False, download=True, transform=transform
38 |     )
39 | 
40 |     print(f"MNIST dataset downloaded in {destination_dir}")
41 | 
42 | 
43 | # Specify the directory where you want to download the MNIST dataset
44 | destination_dir = os.path.dirname(os.path.abspath(__file__))
45 | 
46 | download_mnist_dataset(destination_dir)
47 | 
--------------------------------------------------------------------------------
/tests/upgrade/conftest.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 IBM, Red Hat
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | """ 16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests 17 | """ 18 | 19 | import sys 20 | import os 21 | import pytest 22 | 23 | # Add parent test directory to path 24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) 25 | 26 | # Import all fixtures from ui/conftest.py 27 | from ui.conftest import ( 28 | selenium_driver, 29 | dashboard_url, 30 | test_credentials, 31 | login_to_dashboard, 32 | ) 33 | 34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"] 35 | 36 | 37 | # Hook to capture test results for teardown methods 38 | @pytest.hookimpl(tryfirst=True, hookwrapper=True) 39 | def pytest_runtest_makereport(item, call): 40 | """ 41 | Hook to capture test results and make them available to teardown methods. 42 | This allows teardown_method to check if the test failed. 43 | """ 44 | outcome = yield 45 | rep = outcome.get_result() 46 | 47 | # Store the result in the item so teardown can access it 48 | setattr(item, f"rep_{rep.when}", rep) 49 | -------------------------------------------------------------------------------- /tests/e2e_v2/upgrade/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests 17 | """ 18 | 19 | import sys 20 | import os 21 | import pytest 22 | 23 | # Add parent test directory to path 24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) 25 | 26 | # Import all fixtures from ui/conftest.py 27 | from ui.conftest import ( 28 | selenium_driver, 29 | dashboard_url, 30 | test_credentials, 31 | login_to_dashboard, 32 | ) 33 | 34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"] 35 | 36 | 37 | # Hook to capture test results for teardown methods 38 | @pytest.hookimpl(tryfirst=True, hookwrapper=True) 39 | def pytest_runtest_makereport(item, call): 40 | """ 41 | Hook to capture test results and make them available to teardown methods. 42 | This allows teardown_method to check if the test failed. 43 | """ 44 | outcome = yield 45 | rep = outcome.get_result() 46 | 47 | # Store the result in the item so teardown can access it 48 | setattr(item, f"rep_{rep.when}", rep) 49 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/test/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures for rayjobs tests.""" 2 | 3 | import pytest 4 | from unittest.mock import MagicMock 5 | 6 | 7 | # Global test setup that runs automatically for ALL tests 8 | @pytest.fixture(autouse=True) 9 | def auto_mock_setup(mocker): 10 | """Automatically mock common dependencies for all tests.""" 11 | mocker.patch("kubernetes.config.load_kube_config") 12 | 13 | # Always mock get_default_kueue_name to prevent K8s API calls 14 | mocker.patch( 15 | "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name", 16 | return_value="default-queue", 17 | ) 18 | 19 | mock_get_ns = mocker.patch( 20 | "codeflare_sdk.ray.rayjobs.rayjob.get_current_namespace", 21 | return_value="test-namespace", 22 | ) 23 | 24 | mock_rayjob_api = 
mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") 25 | mock_rayjob_instance = MagicMock() 26 | mock_rayjob_api.return_value = mock_rayjob_instance 27 | 28 | mock_cluster_api = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi") 29 | mock_cluster_instance = MagicMock() 30 | mock_cluster_api.return_value = mock_cluster_instance 31 | 32 | mock_k8s_api = mocker.patch("kubernetes.client.CoreV1Api") 33 | mock_k8s_instance = MagicMock() 34 | mock_k8s_api.return_value = mock_k8s_instance 35 | 36 | # Mock get_api_client in runtime_env module where it's actually used 37 | mocker.patch("codeflare_sdk.ray.rayjobs.runtime_env.get_api_client") 38 | 39 | # Return the mocked instances so tests can configure them as needed 40 | return { 41 | "rayjob_api": mock_rayjob_instance, 42 | "cluster_api": mock_cluster_instance, 43 | "k8s_api": mock_k8s_instance, 44 | "get_current_namespace": mock_get_ns, 45 | } 46 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | The status sub-module defines Enums containing information for Ray job 17 | deployment states and CodeFlare job states, as well as 18 | dataclasses to store information for Ray jobs. 
19 | """ 20 | 21 | from dataclasses import dataclass 22 | from enum import Enum 23 | from typing import Optional 24 | 25 | 26 | class RayJobDeploymentStatus(Enum): 27 | """ 28 | Defines the possible deployment states of a Ray job (from the KubeRay RayJob API). 29 | """ 30 | 31 | COMPLETE = "Complete" 32 | RUNNING = "Running" 33 | FAILED = "Failed" 34 | SUSPENDED = "Suspended" 35 | UNKNOWN = "Unknown" 36 | 37 | 38 | class CodeflareRayJobStatus(Enum): 39 | """ 40 | Defines the possible reportable states of a CodeFlare Ray job. 41 | """ 42 | 43 | COMPLETE = 1 44 | RUNNING = 2 45 | FAILED = 3 46 | SUSPENDED = 4 47 | UNKNOWN = 5 48 | 49 | 50 | @dataclass 51 | class RayJobInfo: 52 | """ 53 | For storing information about a Ray job. 54 | """ 55 | 56 | name: str 57 | job_id: str 58 | status: RayJobDeploymentStatus 59 | namespace: str 60 | cluster_name: str 61 | start_time: Optional[str] = None 62 | end_time: Optional[str] = None 63 | failed_attempts: int = 0 64 | succeeded_attempts: int = 0 65 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the CodeFlare SDK 2 | 3 | Thank you for your interest in contributing to the CodeFlare SDK! 4 | 5 | ## Getting Started 6 | 7 | ### Prerequisites 8 | 9 | - Python 3.11 10 | - [Poetry](https://python-poetry.org/) 11 | 12 | ### Setting Up Your Development Environment 13 | 14 | 1. **Clone the repository:** 15 | 16 | ```sh 17 | git clone https://github.com/project-codeflare/codeflare-sdk.git 18 | cd codeflare-sdk 19 | ``` 20 | 21 | 2. Create a Poetry virtual environment: 22 | 23 | ```sh 24 | poetry shell 25 | ``` 26 | 27 | 3. 
Install dependencies: 28 | 29 | ```sh 30 | poetry install 31 | ``` 32 | 33 | - To include test dependencies, run: 34 | 35 | ```sh 36 | poetry install --with test 37 | ``` 38 | 39 | - To include docs dependencies, run: 40 | 41 | ```sh 42 | poetry install --with docs 43 | ``` 44 | 45 | - To include both test and docs dependencies, run: 46 | 47 | ```sh 48 | poetry install --with test,docs 49 | ``` 50 | 51 | ## Development Workflow 52 | 53 | ### Pre-commit 54 | 55 | We use pre-commit to ensure consistent code formatting. To enable pre-commit hooks, run: 56 | 57 | ```sh 58 | pre-commit install 59 | ``` 60 | 61 | ## Testing 62 | 63 | To install CodeFlare SDK in editable mode, run: 64 | 65 | ```sh 66 | pip install -e . 67 | ``` 68 | 69 | ### Unit Testing 70 | 71 | To run the unit tests, execute: 72 | 73 | ```sh 74 | pytest -v src/codeflare_sdk 75 | ``` 76 | 77 | ### Local e2e Testing 78 | 79 | - Please follow the [e2e documentation](https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/sphinx/user-docs/e2e.rst) 80 | 81 | #### Code Coverage 82 | 83 | - Run tests with the following command: `coverage run -m pytest` 84 | - To then view a code coverage report w/ missing lines, run `coverage report -m` 85 | 86 | ### Code Formatting 87 | 88 | - To check file formatting, in top-level dir run `black --check .` 89 | - To auto-reformat all files, remove the `--check` flag 90 | - To reformat an individual file, run `black ` 91 | -------------------------------------------------------------------------------- /.github/workflows/coverage-badge.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will generate and push an updated coverage badge 2 | 3 | name: Coverage Badge 4 | 5 | on: 6 | push: 7 | branches: [ main, ray-jobs-feature ] 8 | 9 | jobs: 10 | report: 11 | 12 | permissions: 13 | contents: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: 
Set up Python 3.11 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: 3.11 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install poetry 27 | poetry config virtualenvs.create false 28 | poetry lock 29 | poetry install --with test 30 | - name: Generate coverage report 31 | run: | 32 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest 33 | 34 | - name: Coverage Badge 35 | uses: tj-actions/coverage-badge-py@v2 36 | 37 | - name: Verify Changed files 38 | uses: tj-actions/verify-changed-files@v18 39 | id: changed_files 40 | with: 41 | files: coverage.svg 42 | 43 | - name: Commit files 44 | if: steps.changed_files.outputs.files_changed == 'true' 45 | run: | 46 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 47 | git config --local user.name "github-actions[bot]" 48 | git add coverage.svg 49 | git commit -m "Updated coverage.svg" 50 | 51 | - name: Create Pull Request 52 | if: steps.changed_files.outputs.files_changed == 'true' 53 | uses: peter-evans/create-pull-request@v6 54 | with: 55 | token: ${{ secrets.GITHUB_TOKEN }} 56 | title: "[Automatic] Coverage Badge Update" 57 | commit-message: "Updated coverage.svg" 58 | branch: create-pull-request/coverage-badge-update 59 | delete-branch: true 60 | body: | 61 | This is an automated pull request to update the coverage badge. 62 | 63 | - Updated coverage.svg based on latest test results 64 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/cpu_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | CPU-optimized RayJob validation script using Ray Train. 
3 | """ 4 | 5 | import ray 6 | import sys 7 | import traceback 8 | from ray import train 9 | 10 | 11 | def train_func(config): 12 | """Minimal training function for CPU execution.""" 13 | worker_rank = config.get("worker_rank", 0) 14 | result = sum(i * i for i in range(1000)) 15 | 16 | try: 17 | train.report({"loss": result, "worker_rank": worker_rank}) 18 | except RuntimeError: 19 | pass 20 | 21 | print(f"Worker {worker_rank} completed CPU training task. Result: {result}") 22 | 23 | 24 | def main(): 25 | """Run a minimal Ray Train task on CPU.""" 26 | try: 27 | ray.init() 28 | print("Starting CPU training task...") 29 | print(f"Ray initialized. Cluster resources: {ray.cluster_resources()}") 30 | 31 | @ray.remote 32 | def train_worker(worker_id): 33 | try: 34 | train_func({"worker_rank": worker_id}) 35 | result = sum(i * i for i in range(1000)) 36 | return {"loss": result, "worker_rank": worker_id} 37 | except Exception as e: 38 | print(f"Ray Train context not available, using fallback: {e}") 39 | result = sum(i * i for i in range(1000)) 40 | print( 41 | f"Worker {worker_id} completed CPU training task. Result: {result}" 42 | ) 43 | return {"loss": result, "worker_rank": worker_id} 44 | 45 | results = ray.get([train_worker.remote(i) for i in range(1)]) 46 | all_metrics = {} 47 | for result in results: 48 | if isinstance(result, dict): 49 | all_metrics.update(result) 50 | 51 | print(f"Training completed successfully. 
Metrics: {all_metrics}") 52 | print("EXISTING_CLUSTER_JOB_SUCCESS") 53 | return 0 54 | 55 | except Exception as e: 56 | print(f"FAILURE: Exception occurred: {e}") 57 | traceback.print_exc() 58 | return 1 59 | finally: 60 | ray.shutdown() 61 | 62 | 63 | if __name__ == "__main__": 64 | sys.exit(main()) 65 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/batch-inference/simple_batch_inf.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig 3 | 4 | 5 | # 1. Construct a vLLM processor config. 6 | processor_config = vLLMEngineProcessorConfig( 7 | # The base model. 8 | model_source="unsloth/Llama-3.2-1B-Instruct", 9 | # vLLM engine config. 10 | engine_kwargs=dict( 11 | enable_lora=False, 12 | # # Older GPUs (e.g. T4) don't support bfloat16. You should remove 13 | # # this line if you're using later GPUs. 14 | dtype="half", 15 | # Reduce the model length to fit small GPUs. You should remove 16 | # this line if you're using large GPUs. 17 | max_model_len=1024, 18 | ), 19 | # The batch size used in Ray Data. 20 | batch_size=8, 21 | # Use one GPU in this example. 22 | concurrency=1, 23 | # If you save the LoRA adapter in S3, you can set the following path. 24 | # dynamic_lora_loading_path="s3://your-lora-bucket/", 25 | ) 26 | 27 | # 2. Construct a processor using the processor config. 28 | processor = build_llm_processor( 29 | processor_config, 30 | preprocess=lambda row: dict( 31 | # Remove the LoRA model specification 32 | messages=[ 33 | { 34 | "role": "system", 35 | "content": "You are a calculator. 
Please only output the answer "
36 |             "of the given equation.",
37 |         },
38 |         {"role": "user", "content": f"{row['id']} ** 3 = ?"},
39 |     ],
40 |     sampling_params=dict(
41 |         temperature=0.3,
42 |         max_tokens=20,
43 |         detokenize=False,
44 |     ),
45 | ),
46 | postprocess=lambda row: {
47 |     "resp": row["generated_text"],
48 | },
49 | )
50 | 
51 | # 3. Synthesize a dataset with 32 rows.
52 | ds = ray.data.range(32)
53 | # 4. Apply the processor to the dataset. Note that this line won't kick off
54 | # anything because the processor executes lazily.
55 | ds = processor(ds)
56 | # Materialization kicks off the pipeline execution.
57 | ds = ds.materialize()
58 | 
59 | # 5. Print all outputs.
60 | for out in ds.take_all():
61 |     print(out)
62 |     print("==========")
63 | 
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/status.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 IBM, Red Hat
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """
16 | The status sub-module defines Enums containing information for Ray cluster
17 | states, and CodeFlare cluster states, as well as
18 | dataclasses to store information for Ray clusters.
19 | """ 20 | 21 | from dataclasses import dataclass, field 22 | from enum import Enum 23 | import typing 24 | from typing import Union 25 | 26 | 27 | class RayClusterStatus(Enum): 28 | """ 29 | Defines the possible reportable states of a Ray cluster. 30 | """ 31 | 32 | # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1/raycluster_types.go#L112-L117 33 | READY = "ready" 34 | UNHEALTHY = "unhealthy" 35 | FAILED = "failed" 36 | UNKNOWN = "unknown" 37 | SUSPENDED = "suspended" 38 | 39 | 40 | class CodeFlareClusterStatus(Enum): 41 | """ 42 | Defines the possible reportable states of a Codeflare cluster. 43 | """ 44 | 45 | READY = 1 46 | STARTING = 2 47 | QUEUED = 3 48 | QUEUEING = 4 49 | FAILED = 5 50 | UNKNOWN = 6 51 | SUSPENDED = 7 52 | 53 | 54 | @dataclass 55 | class RayCluster: 56 | """ 57 | For storing information about a Ray cluster. 58 | """ 59 | 60 | name: str 61 | status: RayClusterStatus 62 | head_cpu_requests: int 63 | head_cpu_limits: int 64 | head_mem_requests: str 65 | head_mem_limits: str 66 | num_workers: int 67 | worker_mem_requests: str 68 | worker_mem_limits: str 69 | worker_cpu_requests: Union[int, str] 70 | worker_cpu_limits: Union[int, str] 71 | namespace: str 72 | dashboard: str 73 | worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) 74 | head_extended_resources: typing.Dict[str, int] = field(default_factory=dict) 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeFlare SDK 2 | 3 | [![Python application](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml) 4 | ![coverage badge](./coverage.svg) 5 | 6 | An intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. 
Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem. 7 | 8 | For guided demos and basics walkthroughs, check out the following links: 9 | 10 | - Guided demo notebooks available [here](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos), and copies of the notebooks with [expected output](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos/notebook-ex-outputs) also available 11 | - these demos can be copied into your current working directory when using the `codeflare-sdk` by using the `codeflare_sdk.copy_demo_nbs()` function 12 | - Additionally, we have a [video walkthrough](https://www.youtube.com/watch?v=U76iIfd9EmE) of these basic demos from June, 2023 13 | 14 | Full documentation can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html) 15 | 16 | ## Installation 17 | 18 | Can be installed via `pip`: `pip install codeflare-sdk` 19 | 20 | ## Development 21 | 22 | Please see our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed instructions. 23 | 24 | ## Release Instructions 25 | 26 | ### Automated Releases 27 | 28 | It is possible to use the Release Github workflow to do the release. This is generally the process we follow for releases 29 | 30 | ### Manual Releases 31 | 32 | The following instructions apply when doing release manually. This may be required in instances where the automation is failing. 33 | 34 | - Check and update the version in "pyproject.toml" file. 35 | - Commit all the changes to the repository. 36 | - Create Github release (). 37 | - Build the Python package. `poetry build` 38 | - If not present already, add the API token to Poetry. 39 | `poetry config pypi-token.pypi API_TOKEN` 40 | - Publish the Python package. 
`poetry publish` 41 | - Trigger the [Publish Documentation](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/publish-documentation.yaml) workflow 42 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from codeflare_sdk.common.utils.constants import ( 17 | SUPPORTED_PYTHON_VERSIONS, 18 | CUDA_PY312_RUNTIME_IMAGE, 19 | ) 20 | 21 | 22 | def update_image(image) -> str: 23 | """ 24 | The update_image() function automatically sets the image config parameter to a preset image based on Python version if not specified. 25 | This now points to the centralized function in utils.py. 26 | """ 27 | if not image: 28 | # Pull the image based on the matching Python version (or output a warning if not supported) 29 | image = get_ray_image_for_python_version(warn_on_unsupported=True) 30 | return image 31 | 32 | 33 | def get_ray_image_for_python_version(python_version=None, warn_on_unsupported=True): 34 | """ 35 | Get the appropriate Ray image for a given Python version. 36 | If no version is provided, uses the current runtime Python version. 37 | This prevents us needing to hard code image versions for tests. 
38 | 39 | Args: 40 | python_version: Python version string (e.g. "3.11"). If None, detects current version. 41 | warn_on_unsupported: If True, warns and returns None for unsupported versions. 42 | If False, silently falls back to Python 3.12 image. 43 | """ 44 | if python_version is None: 45 | python_version = f"{sys.version_info.major}.{sys.version_info.minor}" 46 | 47 | if python_version in SUPPORTED_PYTHON_VERSIONS: 48 | return SUPPORTED_PYTHON_VERSIONS[python_version] 49 | elif warn_on_unsupported: 50 | import warnings 51 | 52 | warnings.warn( 53 | f"No default Ray image defined for {python_version}. Please provide your own image or use one of the following python versions: {', '.join(SUPPORTED_PYTHON_VERSIONS.keys())}." 54 | ) 55 | return None 56 | else: 57 | return CUDA_PY312_RUNTIME_IMAGE 58 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/mnist_fashion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import ray 4 | from torch.utils.data import DataLoader 5 | from torchvision import datasets 6 | from torchvision.transforms import ToTensor 7 | from ray.train.torch import TorchTrainer 8 | from ray.train import ScalingConfig 9 | 10 | 11 | class NeuralNetwork(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | self.flatten = nn.Flatten() 15 | self.linear_relu_stack = nn.Sequential( 16 | nn.Linear(28 * 28, 512), 17 | nn.ReLU(), 18 | nn.Linear(512, 512), 19 | nn.ReLU(), 20 | nn.Linear(512, 10), 21 | ) 22 | 23 | def forward(self, inputs): 24 | inputs = self.flatten(inputs) 25 | logits = self.linear_relu_stack(inputs) 26 | return logits 27 | 28 | 29 | def get_dataset(): 30 | return datasets.FashionMNIST( 31 | root="/tmp/data", 32 | train=True, 33 | download=True, 34 | transform=ToTensor(), 35 | ) 36 | 37 | 38 | def train_func_distributed(): 39 | num_epochs = 3 40 | batch_size = 64 41 | 42 | dataset = 
get_dataset() 43 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 44 | dataloader = ray.train.torch.prepare_data_loader(dataloader) 45 | 46 | model = NeuralNetwork() 47 | model = ray.train.torch.prepare_model(model) 48 | 49 | criterion = nn.CrossEntropyLoss() 50 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 51 | 52 | for epoch in range(num_epochs): 53 | if ray.train.get_context().get_world_size() > 1: 54 | dataloader.sampler.set_epoch(epoch) 55 | 56 | for inputs, labels in dataloader: 57 | optimizer.zero_grad() 58 | pred = model(inputs) 59 | loss = criterion(pred, labels) 60 | loss.backward() 61 | optimizer.step() 62 | print(f"epoch: {epoch}, loss: {loss.item()}") 63 | 64 | 65 | # For GPU Training, set `use_gpu` to True. 66 | use_gpu = True 67 | 68 | # To learn more about configuring S3 compatible storage check out our docs -> https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/s3-compatible-storage.md 69 | trainer = TorchTrainer( 70 | train_func_distributed, 71 | scaling_config=ScalingConfig( 72 | # num_workers = number of worker nodes with the ray head node included 73 | num_workers=3, 74 | use_gpu=use_gpu, 75 | resources_per_worker={ 76 | "CPU": 1, 77 | }, 78 | ), 79 | ) 80 | 81 | results = trainer.fit() 82 | -------------------------------------------------------------------------------- /tests/e2e/mnist_sleep.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | import torch 17 | import torch.nn as nn 18 | from torch.utils.data import DataLoader 19 | from torchvision import datasets, transforms 20 | 21 | 22 | # Define a simple neural network 23 | class NeuralNetwork(nn.Module): 24 | def __init__(self): 25 | super(NeuralNetwork, self).__init__() 26 | self.flatten = nn.Flatten() 27 | self.linear_relu_stack = nn.Sequential( 28 | nn.Linear(28 * 28, 512), 29 | nn.ReLU(), 30 | nn.Linear(512, 512), 31 | nn.ReLU(), 32 | nn.Linear(512, 10), 33 | ) 34 | 35 | def forward(self, x): 36 | x = self.flatten(x) 37 | logits = self.linear_relu_stack(x) 38 | return logits 39 | 40 | 41 | # Define the training function 42 | def train(): 43 | # Sleeping for 24 hours for upgrade test scenario 44 | print("Sleeping for 24 hours before starting the training for upgrade testing...") 45 | time.sleep(24 * 60 * 60) 46 | 47 | # Load dataset 48 | transform = transforms.Compose([transforms.ToTensor()]) 49 | train_dataset = datasets.FashionMNIST( 50 | root="./data", train=True, download=True, transform=transform 51 | ) 52 | train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) 53 | 54 | # Initialize the neural network, loss function, and optimizer 55 | model = NeuralNetwork() 56 | criterion = nn.CrossEntropyLoss() 57 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 58 | 59 | # Train the model 60 | num_epochs = 3 61 | for epoch in range(num_epochs): 62 | for inputs, labels in train_loader: 63 | optimizer.zero_grad() 64 | outputs = model(inputs) 65 | loss = criterion(outputs, labels) 66 | loss.backward() 67 | optimizer.step() 68 | print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") 69 | 70 | 71 | if __name__ == "__main__": 72 | train() 73 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/test_demos.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Tests for demos module. 17 | """ 18 | 19 | import pytest 20 | import tempfile 21 | from pathlib import Path 22 | from unittest.mock import patch, MagicMock 23 | from codeflare_sdk.common.utils.demos import copy_demo_nbs 24 | 25 | 26 | class TestCopyDemoNbs: 27 | """Test cases for copy_demo_nbs function.""" 28 | 29 | def test_copy_demo_nbs_directory_exists_error(self): 30 | """Test that FileExistsError is raised when directory exists and overwrite=False.""" 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | # Create a subdirectory that will conflict 33 | conflict_dir = Path(temp_dir) / "demo-notebooks" 34 | conflict_dir.mkdir() 35 | 36 | with pytest.raises(FileExistsError, match="Directory.*already exists"): 37 | copy_demo_nbs(dir=str(conflict_dir), overwrite=False) 38 | 39 | def test_copy_demo_nbs_overwrite_true(self): 40 | """Test that overwrite=True allows copying to existing directory.""" 41 | with tempfile.TemporaryDirectory() as temp_dir: 42 | # Create a subdirectory that will conflict 43 | conflict_dir = Path(temp_dir) / "demo-notebooks" 44 | conflict_dir.mkdir() 45 | 46 | # Mock the demo_dir to point to a real directory 47 | with patch("codeflare_sdk.common.utils.demos.demo_dir", temp_dir): 48 | # Should not raise an error with 
overwrite=True 49 | copy_demo_nbs(dir=str(conflict_dir), overwrite=True) 50 | 51 | def test_copy_demo_nbs_default_parameters(self): 52 | """Test copy_demo_nbs with default parameters.""" 53 | with tempfile.TemporaryDirectory() as temp_dir: 54 | # Mock the demo_dir to point to a real directory 55 | with patch("codeflare_sdk.common.utils.demos.demo_dir", temp_dir): 56 | # Should work with default parameters 57 | copy_demo_nbs(dir=temp_dir, overwrite=True) 58 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/ui-widgets.rst: -------------------------------------------------------------------------------- 1 | Jupyter UI Widgets 2 | ================== 3 | 4 | Below are some examples of the Jupyter UI Widgets that are included in 5 | the CodeFlare SDK. 6 | 7 | .. note:: 8 | To use the widgets functionality you must be using the CodeFlare SDK in a Jupyter Notebook environment. 9 | 10 | Cluster Up/Down Buttons 11 | ----------------------- 12 | 13 | The Cluster Up/Down buttons appear after successfully initialising your 14 | `ClusterConfiguration `__. 15 | There are two buttons and a checkbox ``Cluster Up``, ``Cluster Down`` 16 | and ``Wait for Cluster?`` which mimic the 17 | `cluster.apply() `__, 18 | `cluster.down() `__ and 19 | `cluster.wait_ready() `__ 20 | functionality. 21 | 22 | After initialising their ``ClusterConfiguration`` a user can select the 23 | ``Wait for Cluster?`` checkbox then click the ``Cluster Up`` button to 24 | create their Ray Cluster and wait until it is ready. The cluster can be 25 | deleted by clicking the ``Cluster Down`` button. 26 | 27 | .. image:: images/ui-buttons.png 28 | :alt: An image of the up/down ui buttons 29 | 30 | View Clusters UI Table 31 | ---------------------- 32 | 33 | The View Clusters UI Table allows a user to see a list of Ray Clusters 34 | with information on their configuration including number of workers, CPU 35 | requests and limits along with the clusters status. 
36 | 37 | .. image:: images/ui-view-clusters.png 38 | :alt: An image of the view clusters ui table 39 | 40 | Above is a list of two Ray Clusters ``raytest`` and ``raytest2`` each of 41 | those headings is clickable and will update the table to view the 42 | selected Cluster's information. There are four buttons under the table 43 | ``Cluster Down``, ``View Jobs``, ``Open Ray Dashboard``, and ``Refresh Data``. \* The 44 | ``Cluster Down`` button will delete the selected Cluster. \* The 45 | ``View Jobs`` button will try to open the Ray Dashboard's Jobs view in a 46 | Web Browser. The link will also be printed to the console. \* The 47 | ``Open Ray Dashboard`` button will try to open the Ray Dashboard view in 48 | a Web Browser. The link will also be printed to the console. \* The 49 | ``Refresh Data`` button will refresh the list of RayClusters, the spec, and 50 | the status of the Ray Cluster. 51 | 52 | The UI Table can be viewed by calling the following function. 53 | 54 | .. code:: python 55 | 56 | from codeflare_sdk import view_clusters 57 | view_clusters() # Accepts namespace parameter but will try to gather the namespace from the current context 58 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/authentication.rst: -------------------------------------------------------------------------------- 1 | Authentication via the CodeFlare SDK 2 | ==================================== 3 | 4 | Currently there are four ways of authenticating to your cluster via the 5 | SDK. Authenticating with your cluster allows you to perform actions such 6 | as creating Ray Clusters and Job Submission. 7 | 8 | Method 1 Token Authentication 9 | ----------------------------- 10 | 11 | This is how a typical user would authenticate to their cluster using 12 | ``TokenAuthentication``. 
13 | 14 | :: 15 | 16 | from codeflare_sdk import TokenAuthentication 17 | 18 | auth = TokenAuthentication( 19 | token = "XXXXX", 20 | server = "XXXXX", 21 | skip_tls=False, 22 | # ca_cert_path="/path/to/cert" 23 | ) 24 | auth.login() 25 | # log out with auth.logout() 26 | 27 | Setting ``skip_tls=True`` allows interaction with an HTTPS server 28 | bypassing the server certificate checks although this is not secure. You 29 | can pass a custom certificate to ``TokenAuthentication`` by using 30 | ``ca_cert_path="/path/to/cert"`` when authenticating provided 31 | ``skip_tls=False``. Alternatively you can set the environment variable 32 | ``CF_SDK_CA_CERT_PATH`` to the path of your custom certificate. 33 | 34 | Method 2 Kubernetes Config File Authentication (Default location) 35 | ----------------------------------------------------------------- 36 | 37 | If a user has authenticated to their cluster by alternate means e.g. run 38 | a login command like ``oc login --token= --server=`` 39 | their kubernetes config file should have updated. If the user has not 40 | specifically authenticated through the SDK by other means such as 41 | ``TokenAuthentication`` then the SDK will try to use their default 42 | Kubernetes config file located at ``"$HOME/.kube/config"``. 43 | 44 | Method 3 Specifying a Kubernetes Config File 45 | -------------------------------------------- 46 | 47 | A user can specify a config file via a different authentication class 48 | ``KubeConfigFileAuthentication`` for authenticating with the SDK. This 49 | is what loading a custom config file would typically look like. 
50 | 51 | :: 52 | 53 | from codeflare_sdk import KubeConfigFileAuthentication 54 | 55 | auth = KubeConfigFileAuthentication( 56 | kube_config_path="/path/to/config", 57 | ) 58 | auth.load_kube_config() 59 | # log out with auth.logout() 60 | 61 | Method 4 In-Cluster Authentication 62 | ---------------------------------- 63 | 64 | If a user does not authenticate by any of the means detailed above and 65 | does not have a config file at ``"$HOME/.kube/config"`` the SDK will try 66 | to authenticate with the in-cluster configuration file. 67 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/kube_api_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | This sub-module exists primarily to be used internally for any Kubernetes 17 | API error handling or wrapping. 
18 | """ 19 | 20 | import executing 21 | from kubernetes import client, config 22 | 23 | ERROR_MESSAGES = { 24 | "Not Found": "The requested resource could not be located.\n" 25 | "Please verify the resource name and namespace.", 26 | "Unauthorized": "Access to the API is unauthorized.\n" 27 | "Check your credentials or permissions.", 28 | "Forbidden": "Access denied to the Kubernetes resource.\n" 29 | "Ensure your role has sufficient permissions for this operation.", 30 | "Conflict": "A conflict occurred with the RayCluster resource.\n" 31 | "Only one RayCluster with the same name is allowed. " 32 | "Please delete or rename the existing RayCluster before creating a new one with the desired name.", 33 | } 34 | 35 | 36 | # private methods 37 | def _kube_api_error_handling( 38 | e: Exception, print_error: bool = True 39 | ): # pragma: no cover 40 | def print_message(message: str): 41 | if print_error: 42 | print(message) 43 | 44 | if isinstance(e, client.ApiException): 45 | # Retrieve message based on reason, defaulting if reason is not known 46 | message = ERROR_MESSAGES.get( 47 | e.reason, f"Unexpected API error encountered (Reason: {e.reason})" 48 | ) 49 | full_message = f"{message}\nResponse: {e.body}" 50 | print_message(full_message) 51 | 52 | elif isinstance(e, config.ConfigException): 53 | message = "Configuration error: Unable to load Kubernetes configuration. Verify the config file path and format." 54 | print_message(message) 55 | 56 | elif isinstance(e, executing.executing.NotOneValueFound): 57 | message = "Execution error: Expected exactly one value in the operation but found none or multiple." 
58 | print_message(message) 59 | 60 | else: 61 | message = f"Unexpected error:\n{str(e)}" 62 | print_message(message) 63 | raise e 64 | -------------------------------------------------------------------------------- /target_users.md: -------------------------------------------------------------------------------- 1 | # CodeFlare Stack Target Users 2 | 3 | [Cluster Admin](#cluster-administrator) 4 | 5 | [Data Scientist I](#data-scientist-i) 6 | 7 | [Data Scientist II](#data-scientist-ii) 8 | 9 | 10 | 11 | ## Cluster Administrator 12 | 13 | * Quota Management 14 | * Gang-Scheduling for Distributed Compute 15 | * Job/Infrastructure Queuing 16 | 17 | I want to enable a team of data scientists to have self-serve, but limited, access to a shared pool of distributed compute resources such as GPUs for large scale machine learning model training jobs. If the existing pool of resources is insufficient, I want my cluster to scale up (to a defined quota) to meet my users’ needs and scale back down automatically when their jobs have completed. I want these features to be made available through simple installation of generic modules via a user-friendly interface. I also want the ability to monitor current queue of pending tasks, the utilization of active resources, and the progress of all current jobs visualized in a simple dashboard. 18 | 19 | ## Data Scientist I 20 | 21 | * Training Mid-Size Models (less than 1,000 nodes) 22 | * Fine-Tuning Existing Models 23 | * Distributed Compute Framework 24 | 25 | I need temporary access to a reasonably large set of GPU enabled nodes on my team’s shared cluster for short term experimentation, parallelizing my existing ML workflow, or fine-tuning existing large scale models. I’d prefer to work from a notebook environment with access to a python sdk that I can use to request the creation of Framework Clusters that I can distribute my workloads across. 
In addition to interactive experimentation work, I also want the ability to “fire-and-forget” longer running ML jobs onto temporarily deployed Framework Clusters with the ability to monitor these jobs while they are running and access to all of their artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard. 26 | 27 | ## Data Scientist II 28 | * Training Foundation Models (1,000+ nodes) 29 | * Distributed Compute Framework 30 | 31 | I need temporary (but long term) access to a massive amount of GPU enabled infrastructure to train a foundation model. I want to be able to “fire-and-forget” my ML Job into this environment. Due to the size and cost associated with this job, it has already been well tested and validated, so access to jupyter notebooks is unnecessary. I would prefer to write my job as a bash script leveraging a CLI, or as a python script leveraging an SDK. I need the ability to monitor the job while it is running, as well as access to all of its artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard. 32 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/examples/use-builder.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from os import path 4 | import json 5 | 6 | 7 | """ 8 | in case you are working directly with the source, and don't wish to 9 | install the module with pip install, you can directly import the packages by uncommenting the following code. 
10 | """ 11 | 12 | """ 13 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 14 | 15 | current_dir = os.path.dirname(os.path.abspath(__file__)) 16 | parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir)) 17 | sibling_dirs = [ 18 | d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d)) 19 | ] 20 | for sibling_dir in sibling_dirs: 21 | sys.path.append(os.path.join(parent_dir, sibling_dir)) 22 | """ 23 | 24 | from codeflare_sdk.vendored.python_client import kuberay_cluster_api 25 | 26 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder 27 | 28 | 29 | def main(): 30 | print("starting cluster handler...") 31 | my_kuberay_api = kuberay_cluster_api.RayClusterApi() 32 | 33 | my_cluster_builder = kuberay_cluster_builder.ClusterBuilder() 34 | 35 | cluster1 = ( 36 | my_cluster_builder.build_meta( 37 | name="new-cluster1", labels={"demo-cluster": "yes"} 38 | ) 39 | .build_head() 40 | .build_worker(group_name="workers") 41 | .get_cluster() 42 | ) 43 | 44 | if not my_cluster_builder.succeeded: 45 | print("error building the cluster, aborting...") 46 | return 47 | 48 | print("creating raycluster = {}".format(cluster1["metadata"]["name"])) 49 | my_kuberay_api.create_ray_cluster(body=cluster1) 50 | 51 | # the rest of the code is simply to list and cleanup the created cluster 52 | kube_ray_list = my_kuberay_api.list_ray_clusters( 53 | k8s_namespace="default", label_selector="demo-cluster=yes" 54 | ) 55 | if "items" in kube_ray_list: 56 | line = "-" * 72 57 | print(line) 58 | print("{:<63s}{:>2s}".format("Name", "Namespace")) 59 | print(line) 60 | for cluster in kube_ray_list["items"]: 61 | print( 62 | "{:<63s}{:>2s}".format( 63 | cluster["metadata"]["name"], 64 | cluster["metadata"]["namespace"], 65 | ) 66 | ) 67 | print(line) 68 | 69 | if "items" in kube_ray_list: 70 | for cluster in kube_ray_list["items"]: 71 | print("deleting raycluster = {}".format(cluster["metadata"]["name"])) 72 | 
my_kuberay_api.delete_ray_cluster( 73 | name=cluster["metadata"]["name"], 74 | k8s_namespace=cluster["metadata"]["namespace"], 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/in_cluster/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | High-level setup and cleanup functions for in-cluster test execution. 3 | 4 | This module provides convenient functions that combine service account creation 5 | and RBAC setup for easy use in test setup/teardown methods. 6 | """ 7 | 8 | from kubernetes import client 9 | from .rbac import ( 10 | create_test_service_account, 11 | create_rayjob_rbac, 12 | delete_test_service_account, 13 | ) 14 | 15 | 16 | def setup_in_cluster_test_environment( 17 | api_instance: client.CoreV1Api, 18 | custom_api: client.CustomObjectsApi, 19 | namespace: str, 20 | name_prefix: str = "test-pod", 21 | ) -> str: 22 | """ 23 | Set up a complete in-cluster test environment with service account and RBAC. 24 | 25 | This function: 26 | 1. Creates a ServiceAccount 27 | 2. Creates a Role with permissions for RayJob operations 28 | 3. Creates a RoleBinding linking the Role to the ServiceAccount 29 | 30 | Args: 31 | api_instance: Kubernetes CoreV1Api instance. 32 | custom_api: CustomObjectsApi instance. 33 | namespace: Namespace to create resources in. 34 | name_prefix: Prefix for resource names. 35 | 36 | Returns: 37 | The service account name to use in pod creation. 
38 | """ 39 | service_account_name = create_test_service_account( 40 | api_instance=api_instance, 41 | namespace=namespace, 42 | name_prefix=name_prefix, 43 | ) 44 | 45 | try: 46 | create_rayjob_rbac( 47 | api_instance=api_instance, 48 | custom_api=custom_api, 49 | namespace=namespace, 50 | service_account_name=service_account_name, 51 | ) 52 | except Exception: 53 | try: 54 | api_instance.delete_namespaced_service_account( 55 | service_account_name, namespace 56 | ) 57 | except Exception: 58 | pass 59 | raise 60 | 61 | return service_account_name 62 | 63 | 64 | def cleanup_in_cluster_test_environment( 65 | api_instance: client.CoreV1Api, 66 | custom_api: client.CustomObjectsApi, 67 | namespace: str, 68 | service_account_name: str, 69 | ) -> None: 70 | """ 71 | Clean up in-cluster test environment (ServiceAccount, Role, RoleBinding). 72 | 73 | Args: 74 | api_instance: Kubernetes CoreV1Api instance. 75 | custom_api: CustomObjectsApi instance. 76 | namespace: Namespace where resources exist. 77 | service_account_name: Name of the service account to clean up. 
78 | """ 79 | delete_test_service_account( 80 | api_instance=api_instance, 81 | custom_api=custom_api, 82 | namespace=namespace, 83 | service_account_name=service_account_name, 84 | ) 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "codeflare-sdk" 3 | version = "0.33.0" 4 | 5 | [tool.poetry] 6 | name = "codeflare-sdk" 7 | version = "0.33.0" 8 | description = "Python SDK for codeflare client" 9 | 10 | license = "Apache-2.0" 11 | 12 | # Exclude vendored tests, examples, and build files from the package 13 | exclude = [ 14 | "src/codeflare_sdk/vendored/python_client_test", 15 | "src/codeflare_sdk/vendored/examples", 16 | "src/codeflare_sdk/vendored/pyproject.toml", 17 | "src/codeflare_sdk/vendored/poetry.lock", 18 | "src/codeflare_sdk/vendored/README.md" 19 | ] 20 | 21 | authors = [ 22 | "Michael Clifford ", 23 | "Mustafa Eyceoz ", 24 | "Abhishek Malvankar ", 25 | "Atin Sood ", 26 | ] 27 | 28 | readme = 'README.md' 29 | 30 | repository = "https://github.com/project-codeflare/codeflare-sdk" 31 | homepage = "https://github.com/project-codeflare/codeflare-sdk" 32 | 33 | keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale'] 34 | 35 | [tool.poetry.dependencies] 36 | python = "^3.11" 37 | openshift-client = "1.0.18" 38 | rich = ">=12.5,<14.0" 39 | ray = {version = "2.52.1", extras = ["data", "default"]} 40 | kubernetes = ">= 27.2.0" 41 | cryptography = "43.0.3" 42 | executing = "1.2.0" 43 | pydantic = ">= 2.10.6" 44 | ipywidgets = "8.1.2" 45 | 46 | [[tool.poetry.source]] 47 | name = "pypi" 48 | 49 | [[tool.poetry.source]] 50 | name = "testpypi" 51 | url = "https://test.pypi.org/simple/" 52 | 53 | [tool.poetry.group.docs] 54 | optional = true 55 | 56 | [tool.poetry.group.docs.dependencies] 57 | sphinx = "7.4.7" 58 | sphinx-rtd-theme = "3.0.1" 59 | 60 | [tool.poetry.group.test] 61 | optional = true 62 
@pytest.mark.skip(reason="Skipping heterogenous cluster kind test")
@pytest.mark.kind
class TestHeterogeneousClustersKind:
    """E2E test: RayClusters bound to Kueue resource flavors land on the nodes
    those flavors select.

    Relies on helpers star-imported from ``support`` (namespace/Kueue setup,
    node lookups). Currently skipped.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Remove the test namespace and every Kueue object the test created.
        delete_namespace(self)
        delete_kueue_resources(self)

    @pytest.mark.nvidia_gpu
    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Two resource flavors with node labels + tolerations so the scheduler
        # can steer clusters onto distinct node sets.
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each flavor: create a cluster on that flavor's local queue and
        assert its pod is scheduled on a node matching the flavor's labels.

        :param gpu_resource_name: extended resource key requested on workers.
        :param number_of_gpus: GPUs per worker (0 keeps the request inert).
        """
        for flavor in self.resource_flavors:
            # The flavor's nodeLabels determine which nodes are eligible.
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            # Pick the local queue backed by this flavor (None falls back to
            # the SDK's default queue resolution).
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    name=cluster_name,
                    namespace=self.namespace,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=2,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=1,
                    worker_memory_limits=4,
                    worker_extended_resource_requests={
                        gpu_resource_name: number_of_gpus
                    },
                    write_to_file=True,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # NOTE(review): fixed 5s sleeps assume the pod is scheduled quickly;
            # a readiness poll would be more robust — confirm before relying on
            # this in slower environments.
            sleep(5)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            sleep(5)
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
@pytest.mark.skip(reason="Remote ray.init() is temporarily unsupported")
@pytest.mark.openshift
@pytest.mark.tier1
class TestRayLocalInteractiveOauth:
    """E2E test: connect to an OAuth-protected RayCluster through the local
    interactive Ray client and run a distributed computation.

    Relies on helpers star-imported from ``support``. Currently skipped
    because remote ``ray.init()`` is unsupported.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        # Fix: pytest already invokes setup_method() before every test, so the
        # previous explicit self.setup_method() call here was redundant and
        # initialized the Kubernetes client twice. It has been removed to
        # match the sibling e2e test classes.
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    def run_local_interactives(self):
        """Create a TLS-enabled cluster, connect via the Ray client and verify
        a deterministic distributed computation end to end."""
        ray_image = get_ray_image()

        # Authenticate using the token/server of the currently logged-in
        # ``oc`` user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        cluster_name = "test-ray-cluster-li"

        cluster = Cluster(
            ClusterConfiguration(
                namespace=self.namespace,
                name=cluster_name,
                num_workers=1,
                head_memory_requests=6,
                head_memory_limits=8,
                head_cpu_requests=1,
                head_cpu_limits=1,
                worker_cpu_requests=1,
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                image=ray_image,
                verify_tls=False,
            )
        )
        cluster.apply()
        cluster.wait_ready()

        # Generate and export TLS material so the local client can reach the
        # cluster's interactive endpoint.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        # Reset any prior Ray state before connecting to the remote cluster.
        ray.shutdown()
        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

        @ray.remote
        def heavy_calculation_part(num_iterations):
            # Deterministic triple-nested trig workload; the result depends
            # only on num_iterations, enabling an exact assertion below.
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote
        def heavy_calculation(num_iterations):
            # Fan the work out over 30 tasks and sum the partial results.
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        ref = heavy_calculation.remote(3000)
        result = ray.get(ref)
        # Exact float comparison is intentional: the workload is deterministic
        # for a fixed iteration count on IEEE-754 doubles.
        assert result == 1789.4644387076714
        ray.cancel(ref)
        ray.shutdown()

        cluster.down()
code:: python 24 | 25 | submission_id = client.submit_job( 26 | entrypoint=..., 27 | runtime_env={ 28 | "env_vars": { 29 | "AWS_ACCESS_KEY_ID": os.environ.get('AWS_ACCESS_KEY_ID'), 30 | "AWS_SECRET_ACCESS_KEY": os.environ.get('AWS_SECRET_ACCESS_KEY'), 31 | "AWS_DEFAULT_REGION": os.environ.get('AWS_DEFAULT_REGION') 32 | }, 33 | } 34 | ) 35 | 36 | In your Trainer configuration you can specify a ``run_config`` which 37 | will utilise your external storage. 38 | 39 | .. code:: python 40 | 41 | trainer = TorchTrainer( 42 | train_func_distributed, 43 | scaling_config=scaling_config, 44 | run_config = ray.train.RunConfig(storage_path="s3://BUCKET_NAME/SUB_PATH/", name="unique_run_name") 45 | ) 46 | 47 | To learn more about Amazon S3 Storage you can find information 48 | `here `__. 49 | 50 | Minio Bucket 51 | ------------ 52 | 53 | In your Python Script add the following function for configuring your 54 | run_config: 55 | 56 | .. code:: python 57 | 58 | import s3fs 59 | import pyarrow 60 | 61 | def get_minio_run_config(): 62 | s3_fs = s3fs.S3FileSystem( 63 | key = os.getenv('MINIO_ACCESS_KEY', "XXXXX"), 64 | secret = os.getenv('MINIO_SECRET_ACCESS_KEY', "XXXXX"), 65 | endpoint_url = os.getenv('MINIO_URL', "XXXXX") 66 | ) 67 | custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs)) 68 | run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs) 69 | return run_config 70 | 71 | You can update the ``run_config`` to further suit your needs above. 72 | Lastly the new ``run_config`` must be added to the Trainer: 73 | 74 | .. code:: python 75 | 76 | trainer = TorchTrainer( 77 | train_func_distributed, 78 | scaling_config=scaling_config, 79 | run_config = get_minio_run_config() 80 | ) 81 | 82 | To find more information on creating a Minio Bucket compatible with 83 | RHOAI you can refer to this 84 | `documentation `__. 85 | Note: You must have ``s3fs`` and ``pyarrow`` installed in your 86 | environment for this method. 
87 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/remote_ray_job_client.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Submit a training job remotely to Ray Dashboard protected by oAuth.\n", 8 | "This notebook will demonstrate how to submit Ray jobs to an existing Raycluster, using the CodeFlare SDK.\n", 9 | "\n", 10 | "### Requirements\n", 11 | "* Ray Cluster running in OpenShift protected by oAuth.\n", 12 | "* The Ray Dashboard URL for the Ray Cluster.\n", 13 | "* An OpenShift authorization token with permissions to access the Route.\n", 14 | "* A training job, defined in python, within the working directory.\n", 15 | "* A requirements.txt or equivalent file containing any additional packages to install onto the Ray images." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Import dependencies from codeflare-sdk\n", 25 | "from codeflare_sdk import RayJobClient" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Setup Authentication Configuration \n", 35 | "auth_token = \"XXXX\" # Replace with the actual token\n", 36 | "header = {\n", 37 | " 'Authorization': f'Bearer {auth_token}'\n", 38 | "}" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Gather the dashboard URL (provided by the creator of the RayCluster)\n", 48 | "ray_dashboard = \"XXXX\" # Replace with the Ray dashboard URL" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "#Initialize the RayJobClient\n", 58 | "client = 
@pytest.mark.skip(reason="Temporarily skipped due to needed investigation")
@pytest.mark.openshift
@pytest.mark.tier1
class TestHeterogeneousClustersOauth:
    """E2E test (OpenShift/OAuth variant): RayClusters bound to Kueue resource
    flavors land on the nodes those flavors select.

    Relies on helpers star-imported from ``support``. Currently skipped.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Two resource flavors with node labels + tolerations so the scheduler
        # can steer clusters onto distinct node sets.
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each flavor: create a cluster on that flavor's local queue and
        assert its pod is scheduled on a node matching the flavor's labels.

        :param gpu_resource_name: extended resource key (unused here beyond
            the default signature shared with the kind variant).
        :param number_of_gpus: GPUs per worker (0 keeps the request inert).
        """
        ray_image = get_ray_image()

        # Authenticate using the token/server of the currently logged-in
        # ``oc`` user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        for flavor in self.resource_flavors:
            # The flavor's nodeLabels determine which nodes are eligible.
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            # Pick the local queue backed by this flavor (None falls back to
            # the SDK's default queue resolution).
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    namespace=self.namespace,
                    name=cluster_name,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=4,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=2,
                    worker_memory_limits=4,
                    image=ray_image,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # Wait for the cluster to be scheduled and ready, we don't need the dashboard for this check
            cluster.wait_ready(dashboard_check=False)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
parent = Path(__file__).resolve().parents[4]  # project directory
aw_dir = os.path.expanduser("~/.codeflare/resources/")  # where generated AppWrapper yamls are written


def test_AWManager_creation(mocker):
    """AWManager must load a generated AppWrapper yaml and reject bad input."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")
    # Create test.yaml
    Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            write_to_file=True,
            appwrapper=True,
        )
    )

    testaw = AWManager(f"{aw_dir}test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted == False
    # A non-existent path must surface as FileNotFoundError.
    # NOTE(review): if no exception is raised these try/except checks pass
    # silently — a pytest.raises block would be stricter; confirm intent.
    try:
        testaw = AWManager("fake")
    except Exception as e:
        assert type(e) == FileNotFoundError
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    # A structurally invalid AppWrapper yaml must raise ValueError.
    try:
        testaw = apply_template(
            AWManager(
                f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml"
            ),
            get_template_variables(),
        )
    except Exception as e:
        assert type(e) == ValueError
        assert (
            str(e)
            == f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )


def test_AWManager_submit_remove(mocker, capsys):
    """submit()/remove() must toggle ``submitted`` and call the Kubernetes API
    with the expected arguments (verified by the arg_check_* side effects)."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    testaw = AWManager(f"{aw_dir}test.yaml")
    # remove() before submit() is a no-op that prints a friendly message.
    testaw.remove()
    captured = capsys.readouterr()
    assert (
        captured.out
        == "AppWrapper not submitted by this manager yet, nothing to remove\n"
    )
    assert testaw.submitted == False
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object",
        side_effect=arg_check_aw_apply_effect,
    )
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
        side_effect=arg_check_aw_del_effect,
    )
    testaw.submit()
    assert testaw.submitted == True
    testaw.remove()
    assert testaw.submitted == False


# Make sure to always keep this function last
def test_cleanup():
    # Removes the yaml written by test_AWManager_creation; relies on test
    # execution order, hence the "keep last" requirement above.
    os.remove(f"{aw_dir}test.yaml")
def wait(duration: int = 5, step_name: str = "next"):
    """Countdown for ``duration`` seconds before the named step, one tick per second."""
    print("waiting for {} seconds before {} step".format(duration, step_name))
    for i in range(duration, 0, -1):
        sys.stdout.write(str(i) + " ")
        sys.stdout.flush()
        time.sleep(1)
    print()


def main():
    """Create a small RayCluster, print it, wait for it to run, then delete it."""
    print("starting cluster handler...")

    my_kube_ray_api = kuberay_cluster_api.RayClusterApi()

    my_cluster_director = kuberay_cluster_builder.Director()

    # building the raycluster representation
    cluster_body = my_cluster_director.build_small_cluster(
        name="new-small-cluster", k8s_namespace="default"
    )

    # The director returns a falsy body when the spec could not be built;
    # without this guard the code below would crash on cluster_body["metadata"].
    if not cluster_body:
        print("failed to build the raycluster representation, aborting...")
        return

    # creating the raycluster in k8s
    print("creating the cluster...")
    my_kube_ray_api.create_ray_cluster(body=cluster_body)

    # now the cluster should be created.
    # the rest of the code is simply to fetch, print and cleanup the created cluster

    print("fetching the cluster...")
    # fetching the raycluster from k8s api-server
    kube_ray_cluster = my_kube_ray_api.get_ray_cluster(
        name=cluster_body["metadata"]["name"], k8s_namespace="default"
    )

    # Guard against a failed fetch: the original continued and would have
    # raised a TypeError on kube_ray_cluster["metadata"] below.
    if not kube_ray_cluster:
        print("failed to fetch the created raycluster, aborting...")
        return

    print(
        "try: kubectl -n {} get raycluster {} -o yaml".format(
            kube_ray_cluster["metadata"]["namespace"],
            kube_ray_cluster["metadata"]["name"],
        )
    )
    wait(step_name="print created cluster in JSON")
    print("printing the raycluster JSON representation...")
    json_formatted_str = json.dumps(kube_ray_cluster, indent=2)
    print(json_formatted_str)

    # waiting until the cluster is running, and has its status updated
    is_running = my_kube_ray_api.wait_until_ray_cluster_running(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )

    print(
        "raycluster {} status is {}".format(
            kube_ray_cluster["metadata"]["name"], "Running" if is_running else "unknown"
        )
    )

    wait(step_name="cleaning up")
    print("deleting raycluster {}.".format(kube_ray_cluster["metadata"]["name"]))

    my_kube_ray_api.delete_ray_cluster(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )


if __name__ == "__main__":
    main()
| labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: unit-test-cluster 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | rayStartParams: 32 | block: 'true' 33 | dashboard-host: 0.0.0.0 34 | num-gpus: '0' 35 | resources: '"{}"' 36 | serviceType: ClusterIP 37 | template: 38 | spec: 39 | containers: 40 | - env: 41 | - name: MY_POD_IP 42 | valueFrom: 43 | fieldRef: 44 | fieldPath: status.podIP 45 | image: "${image}" 46 | imagePullPolicy: IfNotPresent 47 | lifecycle: 48 | preStop: 49 | exec: 50 | command: 51 | - /bin/sh 52 | - -c 53 | - ray stop 54 | name: ray-head 55 | ports: 56 | - containerPort: 6379 57 | name: gcs 58 | - containerPort: 8265 59 | name: dashboard 60 | - containerPort: 10001 61 | name: client 62 | resources: 63 | limits: 64 | cpu: 2 65 | memory: 8G 66 | requests: 67 | cpu: 2 68 | memory: 8G 69 | rayVersion: 2.52.1 70 | workerGroupSpecs: 71 | - groupName: small-group-unit-test-cluster 72 | maxReplicas: 2 73 | minReplicas: 2 74 | rayStartParams: 75 | block: 'true' 76 | num-gpus: '7' 77 | resources: '"{}"' 78 | replicas: 2 79 | template: 80 | metadata: 81 | annotations: 82 | key: value 83 | labels: 84 | key: value 85 | spec: 86 | containers: 87 | - env: 88 | - name: MY_POD_IP 89 | valueFrom: 90 | fieldRef: 91 | fieldPath: status.podIP 92 | image: "${image}" 93 | lifecycle: 94 | preStop: 95 | exec: 96 | command: 97 | - /bin/sh 98 | - -c 99 | - ray stop 100 | name: machine-learning 101 | resources: 102 | limits: 103 | cpu: 4 104 | memory: 6G 105 | nvidia.com/gpu: 7 106 | requests: 107 | cpu: 3 108 | memory: 5G 109 | nvidia.com/gpu: 7 110 | -------------------------------------------------------------------------------- /tests/e2e/minio_deployment.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: minio-pvc 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 20Gi 12 | volumeMode: Filesystem 13 | --- 14 | kind: Secret 15 | apiVersion: v1 16 | metadata: 17 | name: minio-secret 18 | stringData: 19 | # change the username and password to your own values. 20 | # ensure that the user is at least 3 characters long and the password at least 8 21 | minio_root_user: minio 22 | minio_root_password: minio123 23 | --- 24 | kind: Deployment 25 | apiVersion: apps/v1 26 | metadata: 27 | name: minio 28 | spec: 29 | replicas: 1 30 | selector: 31 | matchLabels: 32 | app: minio 33 | template: 34 | metadata: 35 | creationTimestamp: null 36 | labels: 37 | app: minio 38 | spec: 39 | volumes: 40 | - name: data 41 | persistentVolumeClaim: 42 | claimName: minio-pvc 43 | containers: 44 | - resources: 45 | limits: 46 | cpu: 250m 47 | memory: 1Gi 48 | requests: 49 | cpu: 20m 50 | memory: 100Mi 51 | readinessProbe: 52 | tcpSocket: 53 | port: 9000 54 | initialDelaySeconds: 5 55 | timeoutSeconds: 1 56 | periodSeconds: 5 57 | successThreshold: 1 58 | failureThreshold: 3 59 | terminationMessagePath: /dev/termination-log 60 | name: minio 61 | livenessProbe: 62 | tcpSocket: 63 | port: 9000 64 | initialDelaySeconds: 30 65 | timeoutSeconds: 1 66 | periodSeconds: 5 67 | successThreshold: 1 68 | failureThreshold: 3 69 | env: 70 | - name: MINIO_ROOT_USER 71 | valueFrom: 72 | secretKeyRef: 73 | name: minio-secret 74 | key: minio_root_user 75 | - name: MINIO_ROOT_PASSWORD 76 | valueFrom: 77 | secretKeyRef: 78 | name: minio-secret 79 | key: minio_root_password 80 | ports: 81 | - containerPort: 9000 82 | protocol: TCP 83 | - containerPort: 9090 84 | protocol: TCP 85 | imagePullPolicy: IfNotPresent 86 | volumeMounts: 87 | - name: data 88 | mountPath: /data 89 | subPath: minio 90 | 
terminationMessagePolicy: File 91 | image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z 92 | args: 93 | - server 94 | - /data 95 | - --console-address 96 | - :9090 97 | restartPolicy: Always 98 | terminationGracePeriodSeconds: 30 99 | dnsPolicy: ClusterFirst 100 | securityContext: {} 101 | schedulerName: default-scheduler 102 | strategy: 103 | type: Recreate 104 | revisionHistoryLimit: 10 105 | progressDeadlineSeconds: 600 106 | --- 107 | kind: Service 108 | apiVersion: v1 109 | metadata: 110 | name: minio-service 111 | spec: 112 | ipFamilies: 113 | - IPv4 114 | ports: 115 | - name: api 116 | protocol: TCP 117 | port: 9000 118 | targetPort: 9000 119 | - name: ui 120 | protocol: TCP 121 | port: 9090 122 | targetPort: 9090 123 | internalTrafficPolicy: Cluster 124 | type: ClusterIP 125 | ipFamilyPolicy: SingleStack 126 | sessionAffinity: None 127 | selector: 128 | app: minio 129 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/awload.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | The awload sub-module contains the definition of the AWManager object, which handles 17 | submission and deletion of existing AppWrappers from a user's file system. 
class AWManager:
    """
    An object for submitting and removing existing AppWrapper yamls
    to be added to the Kueue localqueue.
    """

    def __init__(self, filename: str) -> None:
        """
        Create the AppWrapper Manager object by passing in an
        AppWrapper yaml file.

        Raises:
            FileNotFoundError: if ``filename`` does not exist.
            ValueError: if the file is not a well-formed AppWrapper yaml.
        """
        if not isfile(filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
        self.filename = filename
        try:
            with open(self.filename) as f:
                # safe_load avoids arbitrary Python object construction from yaml.
                self.awyaml = yaml.safe_load(f)
            # Explicit check instead of `assert`, which is stripped under -O.
            if self.awyaml["kind"] != "AppWrapper":
                raise TypeError("yaml 'kind' is not AppWrapper")
            self.name = self.awyaml["metadata"]["name"]
            self.namespace = self.awyaml["metadata"]["namespace"]
        except Exception as e:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); chain the cause for debugging.
            raise ValueError(
                f"{filename} is not a correctly formatted AppWrapper yaml"
            ) from e
        self.submitted = False

    def submit(self) -> None:
        """
        Attempts to create the AppWrapper custom resource using the yaml file.

        Kubernetes API errors are routed through _kube_api_error_handling; on
        success the manager records that it owns the submission.
        """
        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.create_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                body=self.awyaml,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = True
        print(f"AppWrapper {self.filename} submitted!")

    def remove(self) -> None:
        """
        Attempts to delete the AppWrapper custom resource matching the name in
        the yaml, if submitted by this manager.

        A no-op (with a message) when submit() has not been called first.
        """
        if not self.submitted:
            print("AppWrapper not submitted by this manager yet, nothing to remove")
            return

        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.delete_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                name=self.name,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = False
        print(f"AppWrapper {self.name} removed!")
14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | _app_wrapper_status, 17 | Cluster, 18 | ClusterConfiguration, 19 | ) 20 | from codeflare_sdk.ray.appwrapper import AppWrapper, AppWrapperStatus 21 | from codeflare_sdk.ray.cluster.status import CodeFlareClusterStatus 22 | from codeflare_sdk.common.utils.unit_test_support import get_local_queue 23 | import os 24 | 25 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 26 | 27 | 28 | def test_cluster_status(mocker): 29 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 30 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 31 | mocker.patch( 32 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 33 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 34 | ) 35 | fake_aw = AppWrapper("test", AppWrapperStatus.FAILED) 36 | 37 | cf = Cluster( 38 | ClusterConfiguration( 39 | name="test", 40 | namespace="ns", 41 | write_to_file=True, 42 | appwrapper=True, 43 | local_queue="local-queue-default", 44 | ) 45 | ) 46 | mocker.patch( 47 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=None 48 | ) 49 | mocker.patch( 50 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 51 | ) 52 | status, ready = cf.status() 53 | assert status == CodeFlareClusterStatus.UNKNOWN 54 | assert ready == False 55 | 56 | mocker.patch( 57 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=fake_aw 58 | ) 59 | status, ready = cf.status() 60 | assert status == CodeFlareClusterStatus.FAILED 61 | assert ready == False 62 | 63 | fake_aw.status = AppWrapperStatus.SUSPENDED 64 | status, ready = cf.status() 65 | assert status == CodeFlareClusterStatus.QUEUED 66 | assert ready == False 67 | 68 | fake_aw.status = AppWrapperStatus.RESUMING 69 | status, ready = cf.status() 70 | assert status == CodeFlareClusterStatus.STARTING 71 | assert ready == False 72 | 73 | fake_aw.status = 
AppWrapperStatus.RESETTING 74 | status, ready = cf.status() 75 | assert status == CodeFlareClusterStatus.STARTING 76 | assert ready == False 77 | 78 | fake_aw.status = AppWrapperStatus.RUNNING 79 | status, ready = cf.status() 80 | assert status == CodeFlareClusterStatus.UNKNOWN 81 | assert ready == False 82 | 83 | 84 | def aw_status_fields(group, version, namespace, plural, *args): 85 | assert group == "workload.codeflare.dev" 86 | assert version == "v1beta2" 87 | assert namespace == "test-ns" 88 | assert plural == "appwrappers" 89 | assert args == tuple() 90 | return {"items": []} 91 | 92 | 93 | def test_aw_status(mocker): 94 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 95 | mocker.patch( 96 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 97 | side_effect=aw_status_fields, 98 | ) 99 | aw = _app_wrapper_status("test-aw", "test-ns") 100 | assert aw == None 101 | 102 | 103 | # Make sure to always keep this function last 104 | def test_cleanup(): 105 | os.remove(f"{aw_dir}test.yaml") 106 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/test_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | Cluster, 17 | ClusterConfiguration, 18 | _ray_cluster_status, 19 | ) 20 | from codeflare_sdk.ray.cluster.status import ( 21 | CodeFlareClusterStatus, 22 | RayClusterStatus, 23 | RayCluster, 24 | ) 25 | import os 26 | from ...common.utils.unit_test_support import get_local_queue 27 | 28 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 29 | 30 | 31 | def test_cluster_status(mocker): 32 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 33 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 34 | 35 | fake_ray = RayCluster( 36 | name="test", 37 | status=RayClusterStatus.UNKNOWN, 38 | num_workers=1, 39 | worker_mem_requests=2, 40 | worker_mem_limits=2, 41 | worker_cpu_requests=1, 42 | worker_cpu_limits=1, 43 | namespace="ns", 44 | dashboard="fake-uri", 45 | head_cpu_requests=2, 46 | head_cpu_limits=2, 47 | head_mem_requests=8, 48 | head_mem_limits=8, 49 | ) 50 | 51 | mocker.patch( 52 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 53 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 54 | ) 55 | 56 | cf = Cluster( 57 | ClusterConfiguration( 58 | name="test", 59 | namespace="ns", 60 | write_to_file=True, 61 | appwrapper=False, 62 | local_queue="local-queue-default", 63 | ) 64 | ) 65 | mocker.patch( 66 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 67 | ) 68 | status, ready = cf.status() 69 | assert status == CodeFlareClusterStatus.UNKNOWN 70 | assert ready == False 71 | 72 | mocker.patch( 73 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=fake_ray 74 | ) 75 | 76 | status, ready = cf.status() 77 | assert status == CodeFlareClusterStatus.STARTING 78 | assert ready == False 79 | 80 | fake_ray.status = RayClusterStatus.FAILED 81 | status, ready = cf.status() 82 | assert status == CodeFlareClusterStatus.FAILED 83 | assert ready == False 84 | 85 | fake_ray.status = 
RayClusterStatus.UNHEALTHY 86 | status, ready = cf.status() 87 | assert status == CodeFlareClusterStatus.FAILED 88 | assert ready == False 89 | 90 | fake_ray.status = RayClusterStatus.READY 91 | status, ready = cf.status() 92 | assert status == CodeFlareClusterStatus.READY 93 | assert ready == True 94 | 95 | 96 | def rc_status_fields(group, version, namespace, plural, *args): 97 | assert group == "ray.io" 98 | assert version == "v1" 99 | assert namespace == "test-ns" 100 | assert plural == "rayclusters" 101 | assert args == tuple() 102 | return {"items": []} 103 | 104 | 105 | def test_rc_status(mocker): 106 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 107 | mocker.patch( 108 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 109 | side_effect=rc_status_fields, 110 | ) 111 | rc = _ray_cluster_status("test-rc", "test-ns") 112 | assert rc == None 113 | 114 | 115 | # Make sure to always keep this function last 116 | def test_cleanup(): 117 | os.remove(f"{aw_dir}test.yaml") 118 | -------------------------------------------------------------------------------- /tests/e2e/mnist_raycluster_sdk_kind_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from time import sleep 4 | 5 | from codeflare_sdk import Cluster, ClusterConfiguration 6 | from codeflare_sdk.ray.client import RayJobClient 7 | 8 | import pytest 9 | 10 | from support import * 11 | 12 | # This test creates a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster 13 | 14 | 15 | @pytest.mark.kind 16 | class TestRayClusterSDKKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | 
self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | ) 55 | ) 56 | 57 | cluster.apply() 58 | 59 | cluster.status() 60 | 61 | cluster.wait_ready() 62 | 63 | cluster.status() 64 | 65 | cluster.details() 66 | 67 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 68 | 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 84 | "env_vars": get_setup_env_variables(ACCELERATOR=accelerator), 85 | }, 86 | entrypoint_num_gpus=number_of_gpus, 87 | ) 88 | print(f"Submitted job with ID: {submission_id}") 89 | done = False 90 | time = 0 91 | timeout = 900 92 | while not done: 93 | status = client.get_job_status(submission_id) 94 | if status.is_terminal(): 95 | break 96 | if not done: 97 | 
print(status) 98 | if timeout and time >= timeout: 99 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 100 | sleep(5) 101 | time += 5 102 | 103 | logs = client.get_job_logs(submission_id) 104 | print(logs) 105 | 106 | self.assert_job_completion(status) 107 | 108 | client.delete_job(submission_id) 109 | 110 | def assert_job_completion(self, status): 111 | if status == "SUCCEEDED": 112 | print(f"Job has completed: '{status}'") 113 | assert True 114 | else: 115 | print(f"Job has completed: '{status}'") 116 | assert False 117 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/gpu_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPU-optimized RayJob validation script using Ray Train. 3 | 4 | This script performs a minimal Ray Train task suitable for GPU execution 5 | to validate that a RayJob can successfully connect to and use an existing Ray cluster 6 | with GPU resources. 7 | 8 | Usage as RayJob entrypoint: 9 | python gpu_script.py 10 | """ 11 | 12 | import ray 13 | import sys 14 | from ray import train 15 | from ray.train import ScalingConfig 16 | from ray.train.torch import TorchTrainer 17 | 18 | 19 | def train_func(config): 20 | """ 21 | Minimal training function for GPU execution. 22 | 23 | This performs a simple computation task that validates: 24 | 1. Ray Train can initialize with GPU 25 | 2. GPU workers can execute tasks 26 | 3. 
def main():
    """
    Run a minimal Ray Train task on GPU.

    This validates that:
    1. Ray can be initialized (auto-connects to cluster when run as RayJob)
    2. Ray Train can execute a distributed task with GPU
    3. The job can complete successfully

    Returns:
        0 on success, 1 on failure
    """
    try:
        # When run as a RayJob entrypoint, init() attaches to the cluster.
        ray.init()

        print("Starting GPU training task...")

        # Report what the cluster exposes before requesting a GPU worker.
        resources = ray.cluster_resources()
        print(f"Cluster resources: {resources}")

        gpu_available = "GPU" in resources and resources.get("GPU", 0) > 0
        print(f"GPU available in cluster: {gpu_available}")

        # Single-worker TorchTrainer with a GPU request keeps the
        # validation minimal while still exercising the GPU path.
        trainer = TorchTrainer(
            train_func,
            scaling_config=ScalingConfig(
                num_workers=1,
                use_gpu=True,
            ),
        )

        result = trainer.fit()
        print(f"Training completed successfully. Metrics: {result.metrics}")

        # Success marker the e2e tests grep the job logs for.
        print("EXISTING_CLUSTER_JOB_SUCCESS")

        return 0

    except Exception as e:
        print(f"FAILURE: Exception occurred: {e}")
        import traceback

        traceback.print_exc()
        return 1
    finally:
        # Always detach from the cluster, even on failure.
        ray.shutdown()


if __name__ == "__main__":
    sys.exit(main())
TestRayClusterSDKAppWrapperKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | appwrapper=True, 55 | ) 56 | ) 57 | 58 | cluster.apply() 59 | 60 | cluster.status() 61 | 62 | cluster.wait_ready() 63 | 64 | cluster.status() 65 | 66 | cluster.details() 67 | 68 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 
Resource Flavors allow the cluster admin to reflect differing resource capabilities
of nodes within a cluster, such as CPU, memory, GPU, etc. These can then be assigned
to workloads to ensure they are executed on nodes with appropriate resources.
21 | 22 | .. code:: yaml 23 | 24 | apiVersion: kueue.x-k8s.io/v1beta1 25 | kind: ResourceFlavor 26 | metadata: 27 | name: default-flavor 28 | 29 | For more detailed information on Resource Flavor configuration options, 30 | refer to the Kueue documentation: `Resource Flavor 31 | Configuration `__ 32 | 33 | 2. Cluster Queue: 34 | ----------------- 35 | 36 | A Cluster Queue represents a shared queue across the entire cluster. It 37 | allows the cluster admin to define global settings for workload 38 | prioritization and resource allocation. 39 | 40 | When setting up a Cluster Queue in Kueue, it’s crucial that the resource 41 | specifications match the actual capacities and operational requirements 42 | of your cluster. The example provided outlines a basic setup; however, 43 | each cluster may have different resource availabilities and needs. 44 | 45 | .. code:: yaml 46 | 47 | apiVersion: kueue.x-k8s.io/v1beta1 48 | kind: ClusterQueue 49 | metadata: 50 | name: "cluster-queue" 51 | spec: 52 | namespaceSelector: {} # match all. 53 | resourceGroups: 54 | - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"] 55 | flavors: 56 | - name: "default-flavor" 57 | resources: 58 | - name: "cpu" 59 | nominalQuota: 9 60 | - name: "memory" 61 | nominalQuota: 36Gi 62 | - name: "pods" 63 | nominalQuota: 5 64 | - name: "nvidia.com/gpu" 65 | nominalQuota: '0' 66 | 67 | For more detailed information on Cluster Queue configuration options, 68 | refer to the Kueue documentation: `Cluster Queue 69 | Configuration `__ 70 | 71 | 3. Local Queue (With Default Annotation): 72 | ----------------------------------------- 73 | 74 | A Local Queue represents a queue associated with a specific namespace 75 | within the cluster. It allows namespace-level control over workload 76 | prioritization and resource allocation. 77 | 78 | .. 
code:: yaml 79 | 80 | apiVersion: kueue.x-k8s.io/v1beta1 81 | kind: LocalQueue 82 | metadata: 83 | namespace: team-a 84 | name: team-a-queue 85 | annotations: 86 | kueue.x-k8s.io/default-queue: "true" 87 | spec: 88 | clusterQueue: cluster-queue 89 | 90 | In the LocalQueue configuration provided above, the annotations field 91 | specifies ``kueue.x-k8s.io/default-queue: "true"``. This annotation 92 | indicates that the team-a-queue is designated as the default queue for 93 | the team-a namespace. When this is set, any workloads submitted to the 94 | team-a namespace without explicitly specifying a queue will 95 | automatically be routed to the team-a-queue. 96 | 97 | For more detailed information on Local Queue configuration options, 98 | refer to the Kueue documentation: `Local Queue 99 | Configuration `__ 100 | 101 | Conclusion: 102 | ----------- 103 | 104 | By following the steps outlined in this document, the cluster admin can 105 | successfully create the basic Kueue resources necessary for workload 106 | management in the cluster. For more advanced configurations and 107 | features, please refer to the comprehensive `Kueue 108 | documentation `__. 
from codeflare_sdk import Cluster, ClusterConfiguration
import pytest
import time
from kubernetes import client
from codeflare_sdk.common.utils import constants

from support import (
    initialize_kubernetes_client,
    create_namespace,
    delete_namespace,
    get_ray_cluster,
)


@pytest.mark.kind
class TestRayClusterApply:
    """E2E test: Cluster.apply() must create, then in-place update, a RayCluster."""

    def setup_method(self):
        # Bind a Kubernetes client to this test instance (sets self.api etc.).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Remove the per-test namespace created in test_cluster_apply.
        delete_namespace(self)

    def test_cluster_apply(self):
        """Apply a 1-worker cluster, re-apply with 2 workers, then delete it."""
        self.setup_method()
        create_namespace(self)

        cluster_name = "test-cluster-apply"
        namespace = self.namespace

        # Initial configuration with 1 worker
        initial_config = ClusterConfiguration(
            name=cluster_name,
            namespace=namespace,
            num_workers=1,
            head_cpu_requests="500m",
            head_cpu_limits="1",
            head_memory_requests="1Gi",
            head_memory_limits="2Gi",
            worker_cpu_requests="500m",
            worker_cpu_limits="1",
            worker_memory_requests="1Gi",
            worker_memory_limits="2Gi",
            image=f"rayproject/ray:{constants.RAY_VERSION}",
            write_to_file=True,
            verify_tls=False,
        )

        # Create the cluster
        cluster = Cluster(initial_config)
        cluster.apply()

        # Wait for the cluster to be ready (dashboard is not exposed on KinD,
        # so skip the dashboard reachability check).
        cluster.wait_ready(dashboard_check=False)
        status, ready = cluster.status()
        assert ready, f"Cluster {cluster_name} is not ready: {status}"

        # Verify the cluster is created by reading the RayCluster CR back.
        ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert ray_cluster is not None, "Cluster was not created successfully"
        assert (
            ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1
        ), "Initial worker count does not match"

        # Update configuration with 2 workers; everything else is unchanged so
        # apply() should patch the existing CR rather than recreate it.
        updated_config = ClusterConfiguration(
            name=cluster_name,
            namespace=namespace,
            num_workers=2,
            head_cpu_requests="500m",
            head_cpu_limits="1",
            head_memory_requests="1Gi",
            head_memory_limits="2Gi",
            worker_cpu_requests="500m",
            worker_cpu_limits="1",
            worker_memory_requests="1Gi",
            worker_memory_limits="2Gi",
            image=f"rayproject/ray:{constants.RAY_VERSION}",
            write_to_file=True,
            verify_tls=False,
        )

        # Apply the updated configuration
        cluster.config = updated_config
        cluster.apply()

        # Give Kubernetes a moment to process the update
        time.sleep(5)

        # Wait for the updated cluster to be ready
        cluster.wait_ready(dashboard_check=False)
        updated_status, updated_ready = cluster.status()
        assert (
            updated_ready
        ), f"Cluster {cluster_name} is not ready after update: {updated_status}"

        # Verify the cluster is updated (replica count bumped to 2).
        updated_ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert (
            updated_ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 2
        ), "Worker count was not updated"

        # Clean up
        cluster.down()

        # Wait for deletion to complete (finalizers may delay deletion)
        max_wait = 30  # seconds
        wait_interval = 2
        elapsed = 0

        while elapsed < max_wait:
            ray_cluster = get_ray_cluster(cluster_name, namespace)
            if ray_cluster is None:
                break
            time.sleep(wait_interval)
            elapsed += wait_interval

        assert (
            ray_cluster is None
        ), f"Cluster was not deleted successfully after {max_wait}s"
def test_gen_names_with_name(mocker):
    """When a name is supplied, gen_names must echo it for both resources."""
    # Pin uuid4 so name generation is deterministic even if a suffix is used.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    requested = "myname"
    appwrapper_name, cluster_name = gen_names(requested)
    assert (appwrapper_name, cluster_name) == (requested, requested)


def test_gen_names_without_name(mocker):
    """When no name is supplied, gen_names must synthesize prefixed names."""
    # Pin uuid4 so the generated suffix is deterministic.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    appwrapper_name, cluster_name = gen_names(None)
    for generated, expected_prefix in (
        (appwrapper_name, "appwrapper-"),
        (cluster_name, "cluster-"),
    ):
        assert generated.startswith(expected_prefix)
def test_build_ray_cluster_with_gcs_ft(mocker):
    """Enabling GCS fault tolerance must emit spec.gcsFaultToleranceOptions
    with the configured Redis address, password secret ref, and namespace."""
    # Stub kube config loading / CR listing so no real cluster is contacted.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    cluster = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            enable_gcs_ft=True,
            redis_address="redis:6379",
            redis_password_secret={"name": "redis-password-secret", "key": "password"},
            external_storage_namespace="new-ns",
        )
    )

    # Neutralize environment-dependent steps inside build_ray_cluster so the
    # generated resource reflects only the configuration above.
    mocker.patch("codeflare_sdk.ray.cluster.build_ray_cluster.config_check")
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.get_api_client", return_value=None
    )
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.update_image", return_value=None
    )

    resource = build_ray_cluster(cluster)

    assert "spec" in resource
    assert "gcsFaultToleranceOptions" in resource["spec"]

    gcs_ft_options = resource["spec"]["gcsFaultToleranceOptions"]

    # The password must be wired as a secretKeyRef, never as a literal value.
    assert gcs_ft_options["redisAddress"] == "redis:6379"
    assert gcs_ft_options["externalStorageNamespace"] == "new-ns"
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["name"]
        == "redis-password-secret"
    )
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["key"]
        == "password"
    )
@pytest.mark.kind
class TestRayLocalInteractiveKind:
    """E2E test: interactive Ray client access to a KinD-hosted cluster via
    a kubectl port-forward of the Ray client port."""

    def setup_method(self):
        """Bind a Kubernetes client and reset port-forward bookkeeping."""
        initialize_kubernetes_client(self)
        self.port_forward_process = None

    def cleanup_port_forward(self):
        """Terminate the kubectl port-forward subprocess, if one is running.

        Bug fix: `wait(timeout=10)` can raise subprocess.TimeoutExpired; the
        original let that propagate, leaking the kubectl process and aborting
        teardown_method before the namespace was deleted. Fall back to kill().
        """
        if self.port_forward_process:
            self.port_forward_process.terminate()
            try:
                self.port_forward_process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # terminate() was ignored; force-kill and reap the process so
                # teardown can continue.
                self.port_forward_process.kill()
                self.port_forward_process.wait()
            self.port_forward_process = None

    def teardown_method(self):
        self.cleanup_port_forward()
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    @pytest.mark.nvidia_gpu
    def test_local_interactives_nvidia_gpu(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives(number_of_gpus=1)

    def run_local_interactives(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """Spin up a Ray cluster, port-forward its client port, and run a
        distributed calculation through ray.init over the forwarded socket.

        Args:
            gpu_resource_name: extended-resource key used for GPU requests.
            number_of_gpus: GPUs requested per worker; split across the two
                remote task definitions below.
        """
        cluster_name = "test-ray-cluster-li"

        # Make sure no stale Ray session from a previous test is attached.
        ray.shutdown()

        cluster = Cluster(
            ClusterConfiguration(
                name=cluster_name,
                namespace=self.namespace,
                num_workers=1,
                head_cpu_requests="500m",
                head_cpu_limits="500m",
                worker_cpu_requests="500m",
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                verify_tls=False,
            )
        )

        cluster.apply()

        cluster.wait_ready()
        cluster.status()

        # mTLS material for the interactive client connection.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        print(cluster.local_client_url())

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation_part(num_iterations):
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation(num_iterations):
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        # Attempt to port forward the Ray client port to localhost.
        try:
            local_port = "20001"
            ray_client_port = "10001"

            port_forward_cmd = [
                "kubectl",
                "port-forward",
                "-n",
                self.namespace,
                f"svc/{cluster_name}-head-svc",
                f"{local_port}:{ray_client_port}",
            ]
            self.port_forward_process = subprocess.Popen(
                port_forward_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )

            client_url = f"ray://localhost:{local_port}"
            cluster.status()

            ray.init(address=client_url, logging_level="INFO")

            ref = heavy_calculation.remote(3000)
            result = ray.get(ref)
            assert (
                result == 1789.4644387076728
            )  # Updated result after moving to Python 3.12 (0.0000000000008% difference to old assertion)
            ray.cancel(ref)
            ray.shutdown()

            cluster.down()
        finally:
            # Always reap the port-forward even if the Ray work above failed.
            self.cleanup_port_forward()
9 | "\n", 10 | "In this notebook, we will go through the basics of using the SDK to:\n", 11 | " * Define a RayCluster configuration\n", 12 | " * Use this configuration alongside a RayJob definition\n", 13 | " * Submit the RayJob, and allow Kuberay Operator to lifecycle the RayCluster for the RayJob" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "18136ea7", 19 | "metadata": {}, 20 | "source": [ 21 | "## Defining and Submitting the RayJob\n", 22 | "First, we'll need to import the relevant CodeFlare SDK packages. You can do this by executing the below cell." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "51e18292", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from codeflare_sdk import RayJob, ManagedClusterConfig" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "649c5911", 38 | "metadata": {}, 39 | "source": [ 40 | "Run the below `oc login` command using your Token and Server URL. Ensure the command is prepended by `!` and not `%`. This will work when running both locally and within RHOAI." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "dc364888", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "!oc login --token= --server=" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "5581eca9", 56 | "metadata": {}, 57 | "source": [ 58 | "Next we'll need to define the ManagedClusterConfig. 
Kuberay will use this to spin up a short-lived RayCluster that will only exist as long as the job" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "3094c60a", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "cluster_config = ManagedClusterConfig(\n", 69 | " head_memory_requests=6,\n", 70 | " head_memory_limits=8,\n", 71 | " num_workers=2,\n", 72 | " worker_cpu_requests=1,\n", 73 | " worker_cpu_limits=1,\n", 74 | " worker_memory_requests=4,\n", 75 | " worker_memory_limits=6,\n", 76 | " head_accelerators={'nvidia.com/gpu': 0},\n", 77 | " worker_accelerators={'nvidia.com/gpu': 0},\n", 78 | ")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "02a2b32b", 84 | "metadata": {}, 85 | "source": [ 86 | "Lastly we can pass the ManagedClusterConfig into the RayJob and submit it. You do not need to worry about tearing down the cluster when the job has completed, that is handled for you!" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "e905ccea", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "job = RayJob(\n", 97 | " job_name=\"demo-rayjob\",\n", 98 | " entrypoint=\"python -c 'print(\\\"Hello from RayJob!\\\")'\",\n", 99 | " cluster_config=cluster_config,\n", 100 | " namespace=\"your-namespace\"\n", 101 | ")\n", 102 | "\n", 103 | "job.submit()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "f3612de2", 109 | "metadata": {}, 110 | "source": [ 111 | "We can check the status of our job by executing the below cell. The status may appear as `unknown` for a time while the RayCluster spins up." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "96d92f93", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "job.status()" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.11.11" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 5 146 | } 147 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/pretty_print.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | This sub-module exists primarily to be used internally by the RayJob object 17 | (in the rayjob sub-module) for pretty-printing job status and details. 
def print_job_status(job_info: RayJobInfo):
    """
    Pretty print the job status in a format similar to cluster status.

    Args:
        job_info: Snapshot of the RayJob's name, id, status, cluster and
            namespace, plus optional timing/attempt information.
    """
    # Map the deployment status onto a display string and a header colour.
    status_display, header_color = _get_status_display(job_info.status)

    # Create main info table
    table = _create_info_table(header_color, job_info.name, status_display)
    table.add_row(f"[bold]Job ID:[/bold] {job_info.job_id}")
    table.add_row(f"[bold]Status:[/bold] {job_info.status.value}")
    table.add_row(f"[bold]RayCluster:[/bold] {job_info.cluster_name}")
    table.add_row(f"[bold]Namespace:[/bold] {job_info.namespace}")

    # Add timing information if available
    if job_info.start_time:
        table.add_row()
        table.add_row(f"[bold]Started:[/bold] {job_info.start_time}")

    # Add attempt counts if there are failures
    if job_info.failed_attempts > 0:
        table.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(table)


def print_no_job_found(job_name: str, namespace: str):
    """
    Print a message when no job is found.

    Args:
        job_name: Name of the RayJob that was looked up.
        namespace: Namespace that was searched.
    """
    # Create table with error message styled like the regular status output.
    table = _create_info_table(
        "[white on red][bold]Name", job_name, "[bold red]No RayJob found"
    )
    table.add_row()
    table.add_row("Please run rayjob.submit() to submit a job.")
    table.add_row()
    table.add_row(f"[bold]Namespace:[/bold] {namespace}")

    _print_table_in_panel(table)
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Build the standardized info table used by the status printers.

    Returns:
        Table containing the coloured header row, a row pairing the job name
        with its status display, and an empty spacer row.
    """
    info_table = Table(box=None, show_header=False)
    info_table.add_row(header_color)
    info_table.add_row(f"[bold underline]{name}", status_display)
    info_table.add_row()  # spacer between the header block and detail rows
    return info_table


def _print_table_in_panel(table: Table):
    """
    Render the given table inside the shared CodeFlare RayJob status panel.
    """
    outer = Table(
        box=None, title="[bold] :package: CodeFlare RayJob Status :package:"
    )
    outer.add_row(Panel.fit(table))
    Console().print(outer)
6 | 7 | get_cluster() 8 | ------------- 9 | 10 | The ``get_cluster()`` function is used to initialise a ``Cluster`` 11 | object from a pre-existing Ray Cluster/AppWrapper. Below is an example 12 | of it's usage: 13 | 14 | :: 15 | 16 | from codeflare_sdk import get_cluster 17 | cluster = get_cluster(cluster_name="raytest", namespace="example", is_appwrapper=False, write_to_file=False) 18 | -> output: Yaml resources loaded for raytest 19 | cluster.status() 20 | -> output: 21 | 🚀 CodeFlare Cluster Status 🚀 22 | ╭─────────────────────────────────────────────────────────────────╮ 23 | │ Name │ 24 | │ raytest Active ✅ │ 25 | │ │ 26 | │ URI: ray://raytest-head-svc.example.svc:10001 │ 27 | │ │ 28 | │ Dashboard🔗 │ 29 | │ │ 30 | ╰─────────────────────────────────────────────────────────────────╯ 31 | (, True) 32 | cluster.down() 33 | cluster.apply() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted. 34 | 35 | | These are the parameters the ``get_cluster()`` function accepts: 36 | | ``cluster_name: str # Required`` -> The name of the Ray Cluster. 37 | | ``namespace: str # Default: "default"`` -> The namespace of the Ray Cluster. 38 | | ``is_appwrapper: bool # Default: False`` -> When set to 39 | | ``True`` the function will attempt to retrieve an AppWrapper instead of a Ray Cluster. 40 | | ``write_to_file: bool # Default: False`` -> When set to ``True`` the Ray Cluster/AppWrapper will be written to a file similar to how it is done in ``ClusterConfiguration``. 41 | 42 | list_all_queued() 43 | ----------------- 44 | 45 | | The ``list_all_queued()`` function returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. 46 | | It accepts the following parameters: 47 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 48 | | ``print_to_console: bool # Default: True`` -> Allows the user to print the list to their console. 
49 | | ``appwrapper: bool # Default: False`` -> When set to ``True`` allows the user to list queued AppWrappers. 50 | 51 | list_all_clusters() 52 | ------------------- 53 | 54 | | The ``list_all_clusters()`` function will return a list of detailed descriptions of Ray Clusters to the console by default. 55 | | It accepts the following parameters: 56 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 57 | | ``print_to_console: bool # Default: True`` -> A boolean that allows the user to print the list to their console. 58 | 59 | .. note:: 60 | 61 | The following methods require a ``Cluster`` object to be 62 | initialized. See :doc:`./cluster-configuration` 63 | 64 | cluster.apply() 65 | ------------ 66 | 67 | | The ``cluster.apply()`` function applies a Ray Cluster in the given namespace. If the cluster already exists, it is updated. 68 | | If it does not exist it is created. 69 | 70 | cluster.down() 71 | -------------- 72 | 73 | | The ``cluster.down()`` function deletes the Ray Cluster in the given namespace. 74 | 75 | cluster.status() 76 | ---------------- 77 | 78 | | The ``cluster.status()`` function prints out the status of the Ray Cluster's state with a link to the Ray Dashboard. 79 | 80 | cluster.details() 81 | ----------------- 82 | 83 | | The ``cluster.details()`` function prints out a detailed description of the Ray Cluster's status, worker resources and a link to the Ray Dashboard. 84 | 85 | cluster.wait_ready() 86 | -------------------- 87 | 88 | | The ``cluster.wait_ready()`` function waits for the requested cluster to be ready, up to an optional timeout and checks every 5 seconds. 89 | | It accepts the following parameters: 90 | | ``timeout: Optional[int] # Default: None`` -> Allows the user to define a timeout for the ``wait_ready()`` function. 91 | | ``dashboard_check: bool # Default: True`` -> If enabled the ``wait_ready()`` function will wait until the Ray Dashboard is ready too. 
92 | -------------------------------------------------------------------------------- /.github/workflows/ui_notebooks_test.yaml: -------------------------------------------------------------------------------- 1 | name: UI notebooks tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | types: [ labeled ] 7 | 8 | concurrency: 9 | group: ${{ github.head_ref }}-${{ github.workflow }} 10 | cancel-in-progress: true 11 | 12 | env: 13 | CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" 14 | 15 | jobs: 16 | verify-3_widget_example: 17 | if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }} 18 | runs-on: ubuntu-latest-4core 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | with: 24 | submodules: recursive 25 | 26 | - name: Checkout common repo code 27 | uses: actions/checkout@v4 28 | with: 29 | repository: "project-codeflare/codeflare-common" 30 | ref: "main" 31 | path: "common" 32 | 33 | - name: Checkout CodeFlare operator repository 34 | uses: actions/checkout@v4 35 | with: 36 | repository: project-codeflare/codeflare-operator 37 | path: codeflare-operator 38 | 39 | - name: Set Go 40 | uses: actions/setup-go@v5 41 | with: 42 | go-version-file: "./codeflare-operator/go.mod" 43 | cache-dependency-path: "./codeflare-operator/go.sum" 44 | 45 | - name: Set up gotestfmt 46 | uses: gotesttools/gotestfmt-action@v2 47 | with: 48 | token: ${{ secrets.GITHUB_TOKEN }} 49 | 50 | - name: Set up specific Python version 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: "3.11" 54 | cache: "pip" # caching pip dependencies 55 | 56 | - name: Setup and start KinD cluster 57 | uses: ./common/github-actions/kind 58 | 59 | - name: Deploy CodeFlare stack 60 | id: deploy 61 | run: | 62 | cd codeflare-operator 63 | echo Setting up CodeFlare stack 64 | make setup-e2e 65 | echo Deploying CodeFlare operator 66 | make 
deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" 67 | kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager 68 | cd .. 69 | 70 | - name: Setup Guided notebooks execution 71 | run: | 72 | echo "Installing papermill and dependencies..." 73 | pip install poetry ipython ipykernel 74 | poetry config virtualenvs.create false 75 | echo "Installing SDK..." 76 | poetry install --with test,docs 77 | 78 | - name: Install Yarn dependencies 79 | run: | 80 | poetry run yarn install 81 | poetry run yarn playwright install chromium 82 | working-directory: ui-tests 83 | 84 | - name: Fix 3_widget_example.ipynb notebook for test 85 | run: | 86 | # Remove login/logout cells, as KinD doesn't support authentication using token 87 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 88 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 89 | # Set explicit namespace as SDK need it (currently) to resolve local queues 90 | sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default',|" 3_widget_example.ipynb 91 | sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb 92 | working-directory: demo-notebooks/guided-demos 93 | 94 | - name: Run UI notebook tests 95 | run: | 96 | set -euo pipefail 97 | 98 | poetry run yarn test 99 | working-directory: ui-tests 100 | 101 | - name: Upload Playwright Test assets 102 | if: always() 103 | uses: actions/upload-artifact@v4 104 | with: 105 | name: ipywidgets-test-assets 106 | path: | 107 | ui-tests/test-results 108 | 109 | - name: Upload Playwright Test report 110 | if: always() 111 | uses: actions/upload-artifact@v4 112 | with: 113 | name: ipywidgets-test-report 
def main():
    """Demo driver: build, create, patch, list, and delete a RayCluster
    using the vendored python-client API, builder, and utils."""
    print("starting cluster handler...")

    # Main API object for talking to the RayCluster custom resource.
    api = kuberay_cluster_api.RayClusterApi()

    # Builder object, to create a cluster spec with more granular control.
    builder = kuberay_cluster_builder.ClusterBuilder()

    # Utils object, to perform read-modify operations on a cluster dict.
    cluster_utils = kuberay_cluster_utils.ClusterUtils()

    # The cluster object is a plain dict describing the RayCluster.
    demo_cluster = (
        builder.build_meta(name="new-cluster1", labels={"demo-cluster": "yes"})
        .build_head()
        .build_worker(group_name="workers")
        .get_cluster()
    )

    if not builder.succeeded:
        print("error building the cluster, aborting...")
        return

    print(f"creating raycluster = {demo_cluster['metadata']['name']}")
    # API call to create the cluster in Kubernetes.
    api.create_ray_cluster(body=demo_cluster)

    # Scale the "workers" group, then patch the change into the live CR.
    patched_cluster, succeeded = cluster_utils.update_worker_group_replicas(
        demo_cluster, group_name="workers", max_replicas=4, min_replicas=1, replicas=2
    )
    if succeeded:
        print(f"trying to patch raycluster = {patched_cluster['metadata']['name']}")
        api.patch_ray_cluster(
            name=patched_cluster["metadata"]["name"], ray_patch=patched_cluster
        )

    # Duplicate the worker group, then patch that change in as well.
    patched_cluster, succeeded = cluster_utils.duplicate_worker_group(
        demo_cluster, group_name="workers", new_group_name="duplicate-workers"
    )
    if succeeded:
        print(f"trying to patch raycluster = {patched_cluster['metadata']['name']}")
        api.patch_ray_cluster(
            name=patched_cluster["metadata"]["name"], ray_patch=patched_cluster
        )

    # The rest simply lists and cleans up the clusters created by this demo.
    kube_ray_list = api.list_ray_clusters(
        k8s_namespace="default", label_selector="demo-cluster=yes"
    )
    if "items" in kube_ray_list:
        divider = "-" * 72
        print(divider)
        print("{:<63s}{:>2s}".format("Name", "Namespace"))
        print(divider)
        for listed in kube_ray_list["items"]:
            print(
                "{:<63s}{:>2s}".format(
                    listed["metadata"]["name"],
                    listed["metadata"]["namespace"],
                )
            )
        print(divider)

    if "items" in kube_ray_list:
        for listed in kube_ray_list["items"]:
            print(f"deleting raycluster = {listed['metadata']['name']}")
            api.delete_ray_cluster(
                name=listed["metadata"]["name"],
                k8s_namespace=listed["metadata"]["namespace"],
            )


if __name__ == "__main__":
    main()
Below, we define our cluster object (which generates a corresponding RayCluster).\n", 49 | "\n", 50 | "NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:\n", 51 | "\n", 52 | "- For Python 3.11: 'quay.io/modh/ray:2.52.1-py311-cu121'\n", 53 | "- For Python 3.12: 'quay.io/modh/ray:2.52.1-py312-cu128'\n", 54 | "\n", 55 | "If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "0f4bc870-091f-4e11-9642-cba145710159", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Create and configure our cluster object\n", 66 | "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", 67 | "cluster = Cluster(ClusterConfiguration(\n", 68 | " name='widgettest',\n", 69 | " head_cpu_requests='500m',\n", 70 | " head_cpu_limits='500m',\n", 71 | " head_memory_requests=6,\n", 72 | " head_memory_limits=8,\n", 73 | " head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", 74 | " worker_extended_resource_requests={'nvidia.com/gpu':0},\n", 75 | " num_workers=2,\n", 76 | " worker_cpu_requests='250m',\n", 77 | " worker_cpu_limits=1,\n", 78 | " worker_memory_requests=4,\n", 79 | " worker_memory_limits=6,\n", 80 | " # image=\"\", # Optional Field\n", 81 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n", 82 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", 83 | "))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "3de6403c", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | 
"view_clusters()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "2d8e6ce3", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "cluster.status()" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.9.18" 124 | }, 125 | "vscode": { 126 | "interpreter": { 127 | "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" 128 | } 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 5 133 | } 134 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client_test/test_director.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder 3 | 4 | 5 | class TestDirector(unittest.TestCase): 6 | def __init__(self, methodName: str = ...) 
-> None: 7 | super().__init__(methodName) 8 | self.director = kuberay_cluster_builder.Director() 9 | 10 | def test_build_basic_cluster(self): 11 | cluster = self.director.build_basic_cluster(name="basic-cluster") 12 | # testing meta 13 | actual = cluster["metadata"]["name"] 14 | expected = "basic-cluster" 15 | self.assertEqual(actual, expected) 16 | 17 | actual = cluster["metadata"]["namespace"] 18 | expected = "default" 19 | self.assertEqual(actual, expected) 20 | 21 | # testing the head pod 22 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 23 | "resources" 24 | ]["requests"]["cpu"] 25 | expected = "2" 26 | self.assertEqual(actual, expected) 27 | 28 | def test_build_small_cluster(self): 29 | cluster = self.director.build_small_cluster(name="small-cluster") 30 | # testing meta 31 | actual = cluster["metadata"]["name"] 32 | expected = "small-cluster" 33 | self.assertEqual(actual, expected) 34 | 35 | actual = cluster["metadata"]["namespace"] 36 | expected = "default" 37 | self.assertEqual(actual, expected) 38 | 39 | # testing the head pod 40 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 41 | "resources" 42 | ]["requests"]["cpu"] 43 | expected = "2" 44 | self.assertEqual(actual, expected) 45 | 46 | # testing the workergroup 47 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 48 | expected = 1 49 | self.assertEqual(actual, expected) 50 | 51 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 52 | "containers" 53 | ][0]["resources"]["requests"]["cpu"] 54 | expected = "1" 55 | self.assertEqual(actual, expected) 56 | 57 | def test_build_medium_cluster(self): 58 | cluster = self.director.build_medium_cluster(name="medium-cluster") 59 | # testing meta 60 | actual = cluster["metadata"]["name"] 61 | expected = "medium-cluster" 62 | self.assertEqual(actual, expected) 63 | 64 | actual = cluster["metadata"]["namespace"] 65 | expected = "default" 66 | self.assertEqual(actual, 
expected) 67 | 68 | # testing the head pod 69 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 70 | "resources" 71 | ]["requests"]["cpu"] 72 | expected = "2" 73 | self.assertEqual(actual, expected) 74 | 75 | # testing the workergroup 76 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 77 | expected = 3 78 | self.assertEqual(actual, expected) 79 | 80 | actual = cluster["spec"]["workerGroupSpecs"][0]["groupName"] 81 | expected = "medium-cluster-workers" 82 | self.assertEqual(actual, expected) 83 | 84 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 85 | "containers" 86 | ][0]["resources"]["requests"]["cpu"] 87 | expected = "2" 88 | self.assertEqual(actual, expected) 89 | 90 | def test_build_large_cluster(self): 91 | cluster = self.director.build_large_cluster(name="large-cluster") 92 | # testing meta 93 | actual = cluster["metadata"]["name"] 94 | expected = "large-cluster" 95 | self.assertEqual(actual, expected) 96 | 97 | actual = cluster["metadata"]["namespace"] 98 | expected = "default" 99 | self.assertEqual(actual, expected) 100 | 101 | # testing the head pod 102 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 103 | "resources" 104 | ]["requests"]["cpu"] 105 | expected = "2" 106 | self.assertEqual(actual, expected) 107 | 108 | # testing the workergroup 109 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 110 | expected = 6 111 | self.assertEqual(actual, expected) 112 | 113 | actual = cluster["spec"]["workerGroupSpecs"][0]["groupName"] 114 | expected = "large-cluster-workers" 115 | self.assertEqual(actual, expected) 116 | 117 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 118 | "containers" 119 | ][0]["resources"]["requests"]["cpu"] 120 | expected = "3" 121 | self.assertEqual(actual, expected) 122 | -------------------------------------------------------------------------------- 
/tests/test_cluster_yamls/support_clusters/test-rc-b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-rc-b 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: [] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: 
odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.52.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-rc-b 88 | maxReplicas: 1 89 | minReplicas: 1 90 | rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-rc-a.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | 
kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-cluster-a 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: [] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.52.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-cluster-a 88 | maxReplicas: 1 89 | minReplicas: 1 90 | 
rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | --------------------------------------------------------------------------------