├── tests
├── __init__.py
├── e2e
│ ├── rayjob
│ │ └── __init__.py
│ ├── mnist_pip_requirements.txt
│ ├── install-codeflare-sdk.sh
│ ├── start_ray_cluster.py
│ ├── mnist_rayjob.py
│ ├── mnist_sleep.py
│ ├── heterogeneous_clusters_kind_test.py
│ ├── local_interactive_sdk_oauth_test.py
│ ├── heterogeneous_clusters_oauth_test.py
│ ├── minio_deployment.yaml
│ ├── mnist_raycluster_sdk_kind_test.py
│ ├── mnist_raycluster_sdk_aw_kind_test.py
│ ├── cluster_apply_kind_test.py
│ └── local_interactive_sdk_kind_test.py
├── upgrade
│ ├── __init__.py
│ └── conftest.py
├── e2e_v2
│ ├── security
│ │ ├── __init__.py
│ │ ├── test_mtls.py
│ │ └── test_network_policies.py
│ ├── upgrade
│ │ ├── __init__.py
│ │ └── conftest.py
│ ├── cluster_management
│ │ ├── __init__.py
│ │ ├── creation
│ │ │ ├── __init__.py
│ │ │ ├── test_cluster_kueue.py
│ │ │ └── test_cluster_creation.py
│ │ ├── configuration
│ │ │ ├── __init__.py
│ │ │ ├── test_advanced.py
│ │ │ ├── test_images.py
│ │ │ ├── test_resources.py
│ │ │ ├── test_volumes.py
│ │ │ └── test_heterogeneous.py
│ │ └── interactive
│ │   ├── __init__.py
│ │   ├── test_remote.py
│ │   └── test_in_cluster.py
│ ├── job_submission
│ │ ├── __init__.py
│ │ ├── rayjob_client
│ │ │ ├── __init__.py
│ │ │ ├── test_remote.py
│ │ │ └── test_in_cluster.py
│ │ └── rayjob_cr
│ │   ├── __init__.py
│ │   └── test_lifecycled_cluster.py
│ ├── kueue_integration
│ │ ├── __init__.py
│ │ ├── test_admission.py
│ │ ├── test_queueing.py
│ │ └── test_resource_flavors.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── scripts
│ │ │ ├── __init__.py
│ │ │ ├── cpu_script.py
│ │ │ └── gpu_script.py
│ │ └── in_cluster
│ │   ├── __init__.py
│ │   └── setup.py
│ ├── __init__.py
│ └── ui
│   ├── __init__.py
│   └── pages
│     └── __init__.py
├── ui
│ ├── __init__.py
│ └── pages
│   └── __init__.py
├── auth-test.crt
└── test_cluster_yamls
  ├── appwrapper
  │ └── test-case-bad.yaml
  └── support_clusters
    ├── test-rc-b.yaml
    └── test-rc-a.yaml
├── .gitattributes
├── src
└── codeflare_sdk
│ ├── vendored
│ ├── python_client
│ │ ├── utils
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ └── constants.py
│ ├── .gitignore
│ ├── __init__.py
│ ├── python_client_test
│ │ ├── README.md
│ │ └── test_director.py
│ ├── pyproject.toml
│ └── examples
│ │ ├── use-builder.py
│ │ ├── use-director.py
│ │ └── use-utils.py
│ ├── ray
│ ├── client
│ │ └── __init__.py
│ ├── appwrapper
│ │ ├── __init__.py
│ │ ├── status.py
│ │ ├── test_awload.py
│ │ ├── awload.py
│ │ └── test_status.py
│ ├── rayjobs
│ │ ├── __init__.py
│ │ ├── test
│ │ │ └── conftest.py
│ │ ├── status.py
│ │ └── pretty_print.py
│ ├── cluster
│ │ ├── __init__.py
│ │ ├── status.py
│ │ ├── test_status.py
│ │ └── test_build_ray_cluster.py
│ └── __init__.py
│ ├── common
│ ├── widgets
│ │ └── __init__.py
│ ├── kueue
│ │ └── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── k8s_utils.py
│ │ ├── demos.py
│ │ ├── utils.py
│ │ └── test_demos.py
│ ├── __init__.py
│ └── kubernetes_cluster
│ │ ├── __init__.py
│ │ └── kube_api_helpers.py
│ └── __init__.py
├── assets
└── images
│ └── sdk-diagram.png
├── demo-notebooks
├── additional-demos
│ ├── batch-inference
│ │ ├── requirements.txt
│ │ └── simple_batch_inf.py
│ ├── requirements.txt
│ └── remote_ray_job_client.ipynb
└── guided-demos
│ ├── requirements.txt
│ ├── preview_nbs
│ └── requirements.txt
│ ├── notebook-ex-outputs
│ └── requirements.txt
│ ├── download_mnist_datasets.py
│ ├── mnist_fashion.py
│ ├── 5_submit_rayjob_cr.ipynb
│ └── 3_widget_example.ipynb
├── docs
├── images
│ ├── codeflare_sdk.png
│ └── codeflare_stack_arch.png
├── sphinx
│ ├── user-docs
│ │ ├── images
│ │ │ ├── ui-buttons.png
│ │ │ └── ui-view-clusters.png
│ │ ├── ui-widgets.rst
│ │ ├── authentication.rst
│ │ ├── s3-compatible-storage.rst
│ │ ├── setup-kueue.rst
│ │ └── ray-cluster-interaction.rst
│ ├── Makefile
│ ├── make.bat
│ ├── index.rst
│ └── conf.py
└── generate-documentation.md
├── ui-tests
├── .yarnrc
├── jupyter_server_config.py
├── tests
│ └── widget_notebook_example.test.ts-snapshots
│ │ ├── widgets-cell-0-linux.png
│ │ ├── widgets-cell-2-linux.png
│ │ ├── widgets-cell-3-linux.png
│ │ ├── widgets-cell-4-linux.png
│ │ └── widgets-cell-5-linux.png
├── playwright.config.js
└── package.json
├── images
└── tests
│ └── entrypoint.sh
├── .github
├── build
│ ├── README.md
│ └── Containerfile
├── workflows
│ ├── pre-commit.yaml
│ ├── snyk-security.yaml
│ ├── dependabot-labeler.yaml
│ ├── unit-tests.yml
│ ├── publish-documentation.yaml
│ ├── build-test-image.yaml
│ ├── coverage-badge.yaml
│ └── ui_notebooks_test.yaml
├── resources
│ ├── wait_for_job_cell.json
│ └── minio_remote_config_cell.json
└── dependabot.yml
├── codecov.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Makefile
├── OWNERS
├── coverage.svg
├── CONTRIBUTING.md
├── README.md
├── target_users.md
└── pyproject.toml
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e/rayjob/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/upgrade/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/security/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/upgrade/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/security/test_mtls.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/kueue_integration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/kueue_integration/test_admission.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/kueue_integration/test_queueing.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/security/test_network_policies.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-detectable=false
2 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/creation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/rayjob_client/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/rayjob_cr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/python_client/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/interactive/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/interactive/test_remote.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/rayjob_client/test_remote.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/kueue_integration/test_resource_flavors.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/test_advanced.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/test_images.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/test_resources.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/test_volumes.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/creation/test_cluster_kueue.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/interactive/test_in_cluster.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/rayjob_client/test_in_cluster.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/configuration/test_heterogeneous.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/cluster_management/creation/test_cluster_creation.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/job_submission/rayjob_cr/test_lifecycled_cluster.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Utility modules for E2E tests
2 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/python_client/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.1.0"
2 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/client/__init__.py:
--------------------------------------------------------------------------------
1 | from .ray_jobs import RayJobClient
2 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | # Placeholder scripts for RayJob entrypoints
2 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/widgets/__init__.py:
--------------------------------------------------------------------------------
1 | from .widgets import (
2 | view_clusters,
3 | )
4 |
--------------------------------------------------------------------------------
/tests/e2e_v2/__init__.py:
--------------------------------------------------------------------------------
1 | # E2E Test Suite v2
2 | # Restructured pytest-based E2E tests for CodeFlare SDK
3 |
--------------------------------------------------------------------------------
/assets/images/sdk-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/assets/images/sdk-diagram.png
--------------------------------------------------------------------------------
/demo-notebooks/additional-demos/batch-inference/requirements.txt:
--------------------------------------------------------------------------------
1 | vllm
2 | transformers
3 | triton>=2.0.0
4 | torch>=2.0.0
5 |
--------------------------------------------------------------------------------
/docs/images/codeflare_sdk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_sdk.png
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch_lightning==2.4.0
2 | ray_lightning
3 | torchmetrics==1.8.2
4 | torchvision==0.20.1
5 |
--------------------------------------------------------------------------------
/docs/images/codeflare_stack_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_stack_arch.png
--------------------------------------------------------------------------------
/ui-tests/.yarnrc:
--------------------------------------------------------------------------------
1 | disable-self-update-check true
2 | ignore-optional true
3 | network-timeout "300000"
4 | registry "https://registry.npmjs.org/"
5 |
--------------------------------------------------------------------------------
/demo-notebooks/additional-demos/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch_lightning==1.9.5
2 | ray_lightning
3 | torchmetrics==0.9.1
4 | torchvision==0.19.0
5 | minio
6 |
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/preview_nbs/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch_lightning==2.4.0
2 | ray_lightning
3 | torchmetrics==1.8.2
4 | torchvision==0.20.1
5 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/images/ui-buttons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-buttons.png
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch_lightning==2.4.0
2 | ray_lightning
3 | torchmetrics==1.8.2
4 | torchvision==0.20.1
5 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/appwrapper/__init__.py:
--------------------------------------------------------------------------------
1 | from .awload import AWManager
2 |
3 | from .status import (
4 | AppWrapperStatus,
5 | AppWrapper,
6 | )
7 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/images/ui-view-clusters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-view-clusters.png
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/kueue/__init__.py:
--------------------------------------------------------------------------------
1 | from .kueue import (
2 | get_default_kueue_name,
3 | local_queue_exists,
4 | add_queue_label,
5 | list_local_queues,
6 | )
7 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Common utilities for the CodeFlare SDK.
3 | """
4 |
5 | from .k8s_utils import get_current_namespace
6 |
7 | __all__ = ["get_current_namespace"]
8 |
--------------------------------------------------------------------------------
/tests/e2e/mnist_pip_requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cu118
2 | torch==2.7.1+cu118
3 | torchvision==0.22.1+cu118
4 | pytorch_lightning==1.9.5
5 | torchmetrics==1.8.2
6 | minio
7 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/rayjobs/__init__.py:
--------------------------------------------------------------------------------
1 | from .rayjob import RayJob, ManagedClusterConfig
2 | from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo
3 | from .config import ManagedClusterConfig
4 |
--------------------------------------------------------------------------------
/ui-tests/jupyter_server_config.py:
--------------------------------------------------------------------------------
1 | from jupyterlab.galata import configure_jupyter_server
2 |
3 | configure_jupyter_server(c)
4 |
5 | # Uncomment to set server log level to debug level
6 | # c.ServerApp.log_level = "DEBUG"
7 |
--------------------------------------------------------------------------------
/images/tests/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Entrypoint script that handles -- separator in podman commands
3 | # Passes all arguments to run-tests.sh which will forward them to pytest
4 |
5 | exec /codeflare-sdk/run-tests.sh "$@"
6 |
--------------------------------------------------------------------------------
/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png
--------------------------------------------------------------------------------
/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png
--------------------------------------------------------------------------------
/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png
--------------------------------------------------------------------------------
/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png
--------------------------------------------------------------------------------
/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png
--------------------------------------------------------------------------------
/.github/build/README.md:
--------------------------------------------------------------------------------
1 | # Pre-Commit Build Artifacts
2 |
3 | This directory contains the artifacts required to build the codeflare-sdk pre-commit image.
4 |
5 | To build the image run `podman build -f .github/build/Containerfile .` from the root directory.
6 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/__init__.py:
--------------------------------------------------------------------------------
1 | # Importing everything from the kubernetes_cluster module
2 | from .kubernetes_cluster import (
3 | Authentication,
4 | KubeConfiguration,
5 | TokenAuthentication,
6 | KubeConfigFileAuthentication,
7 | _kube_api_error_handling,
8 | )
9 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/__init__.py:
--------------------------------------------------------------------------------
1 | from .status import (
2 | RayClusterStatus,
3 | CodeFlareClusterStatus,
4 | RayCluster,
5 | )
6 |
7 | from .cluster import (
8 | Cluster,
9 | ClusterConfiguration,
10 | get_cluster,
11 | list_all_queued,
12 | list_all_clusters,
13 | )
14 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/kubernetes_cluster/__init__.py:
--------------------------------------------------------------------------------
1 | from .auth import (
2 | Authentication,
3 | KubeConfiguration,
4 | TokenAuthentication,
5 | KubeConfigFileAuthentication,
6 | config_check,
7 | get_api_client,
8 | )
9 |
10 | from .kube_api_helpers import _kube_api_error_handling
11 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | ignore:
2 | - "**/*.ipynb"
3 | - "demo-notebooks/**"
4 | - "**/__init__.py"
5 |
6 | coverage:
7 | precision: 2
8 | round: down
9 | status:
10 | project:
11 | default:
12 | target: auto
13 | threshold: 2.5%
14 | patch:
15 | default:
16 | target: 85%
17 | threshold: 2.5%
18 |
--------------------------------------------------------------------------------
/tests/e2e/install-codeflare-sdk.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd ..
4 |
5 | # Install Poetry and configure virtualenvs
6 | pip install poetry
7 | poetry config virtualenvs.create false
8 |
9 | cd codeflare-sdk
10 |
11 | # Lock dependencies and install them
12 | poetry lock
13 | poetry install --with test,docs
14 |
15 | # Return to the workdir
16 | cd ..
17 | cd workdir
18 |
--------------------------------------------------------------------------------
/ui-tests/playwright.config.js:
--------------------------------------------------------------------------------
1 | const baseConfig = require('@jupyterlab/galata/lib/playwright-config');
2 |
3 | module.exports = {
4 | ...baseConfig,
5 | timeout: 600000,
6 | webServer: {
7 | command: 'yarn start',
8 | url: 'http://localhost:8888/lab',
9 | timeout: 120 * 1000,
10 | reuseExistingServer: !process.env.CI,
11 | },
12 | retries: 0,
13 | };
14 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/python_client/constants.py:
--------------------------------------------------------------------------------
1 | # Declares the constants that are used by the client
2 | import logging
3 |
4 | # Group, Version, Plural
5 | GROUP = "ray.io"
6 | CLUSTER_VERSION = "v1"
7 | JOB_VERSION = "v1"
8 | CLUSTER_PLURAL = "rayclusters"
9 | JOB_PLURAL = "rayjobs"
10 | CLUSTER_KIND = "RayCluster"
11 | JOB_KIND = "RayJob"
12 | # log level
13 | LOGLEVEL = logging.INFO
14 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yaml:
--------------------------------------------------------------------------------
1 | name: Pre-commit
2 | on:
3 | pull_request:
4 | workflow_dispatch:
5 |
6 | jobs:
7 | precommit:
8 | runs-on: ubuntu-latest
9 | container:
10 | image: quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.1
11 | steps:
12 | - uses: actions/checkout@v4
13 |
14 | - name: Run pre-commit checks
15 | run: pre-commit run --all-files
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | .python-version
3 | __pycache__/
4 | .coverage
5 | Pipfile
6 | Pipfile.lock
7 | .venv*
8 | build/
9 | tls-cluster-namespace
10 | quicktest.yaml
11 | node_modules
12 | .DS_Store
13 | ui-tests/playwright-report
14 | ui-tests/test-results
15 | /src/codeflare_sdk.egg-info/
16 | docs/sphinx/_build
17 | docs/sphinx/codeflare_sdk.*.rst
18 | docs/sphinx/codeflare_sdk.rst
19 | docs/sphinx/modules.rst
20 | .idea/
21 | .cursor/plans/
22 | .cursor/commands/
23 | /results
24 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 |
10 | # Distribution / packaging
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 |
29 | # Unit test / coverage reports
30 | .tox/
31 | htmlcov
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: end-of-file-fixer
9 | - id: check-yaml
10 | args: [--allow-multiple-documents]
11 | - id: check-added-large-files
12 | - repo: https://github.com/psf/black
13 | rev: 23.3.0
14 | hooks:
15 | - id: black
16 | language_version: python3.9
17 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/__init__.py:
--------------------------------------------------------------------------------
1 | from .appwrapper import AppWrapper, AppWrapperStatus, AWManager
2 |
3 | from .client import (
4 | RayJobClient,
5 | )
6 |
7 | from .rayjobs import (
8 | RayJob,
9 | ManagedClusterConfig,
10 | RayJobDeploymentStatus,
11 | CodeflareRayJobStatus,
12 | RayJobInfo,
13 | )
14 |
15 | from .cluster import (
16 | Cluster,
17 | ClusterConfiguration,
18 | get_cluster,
19 | list_all_queued,
20 | list_all_clusters,
21 | RayClusterStatus,
22 | CodeFlareClusterStatus,
23 | RayCluster,
24 | )
25 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Vendored third-party dependencies.
3 |
4 | This directory contains code from external projects that are bundled
5 | with codeflare-sdk to avoid PyPI publishing restrictions.
6 |
7 | Contents:
8 | - python_client: KubeRay Python client from ray-project/kuberay
9 | Source: https://github.com/ray-project/kuberay @ b2fd91b58c2bbe22f9b4f730c5a8f3180c05e570
10 | License: Apache 2.0 (see LICENSE file)
11 |
12 | Vendored because the python-client is not published to PyPI and PyPI
13 | does not allow direct git dependencies.
14 | """
15 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Image tag for image containing e2e tests
2 | E2E_TEST_IMAGE_VERSION ?= latest
3 | E2E_TEST_IMAGE ?= quay.io/opendatahub/codeflare-sdk-tests:${E2E_TEST_IMAGE_VERSION}
4 |
5 | # Build the test image
6 | .PHONY: build-test-image
7 | build-test-image:
8 | @echo "Building test image: $(E2E_TEST_IMAGE)"
9 | # Build the Docker image using podman
10 | podman build -f images/tests/Dockerfile -t $(E2E_TEST_IMAGE) .
11 |
12 | # Push the test image
13 | .PHONY: push-test-image
14 | push-test-image:
15 | @echo "Pushing test image: $(E2E_TEST_IMAGE)"
16 | podman push $(E2E_TEST_IMAGE)
17 |
--------------------------------------------------------------------------------
/tests/ui/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/e2e_v2/ui/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/ui/pages/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/e2e_v2/ui/pages/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | approvers:
2 | - astefanutti
3 | - Bobbins228
4 | - CathalOConnorRH
5 | - chipspeak
6 | - ChristianZaccaria
7 | - dimakis
8 | - Fiona-Waters
9 | - franciscojavierarceo
10 | - kpostoffice
11 | - kryanbeane
12 | - laurafitzgerald
13 | - pawelpaszki
14 | - pmccarthy
15 | - szaher
16 | - varshaprasad96
17 | reviewers:
18 | - astefanutti
19 | - Bobbins228
20 | - CathalOConnorRH
21 | - chipspeak
22 | - ChristianZaccaria
23 | - dimakis
24 | - Fiona-Waters
25 | - franciscojavierarceo
26 | - kpostoffice
27 | - kryanbeane
28 | - laurafitzgerald
29 | - pawelpaszki
30 | - pmccarthy
31 | - szaher
32 | - varshaprasad96
33 | - Ygnas
34 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/constants.py:
--------------------------------------------------------------------------------
1 | RAY_VERSION = "2.52.1"
2 | """
3 | The below are used to define the default runtime image for the Ray Cluster.
4 | * For python 3.11:ray:2.52.1-py311-cu121
5 | * For python 3.12:ray:2.52.1-py312-cu128
6 | """
7 | CUDA_PY311_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:595b3acd10244e33fca1ed5469dccb08df66f470df55ae196f80e56edf35ad5a"
8 | CUDA_PY312_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:6b135421b6e756593a58b4df6664f82fc4b55237ca81475f2867518f15fe6d84"
9 |
10 | # Centralized image selection
11 | SUPPORTED_PYTHON_VERSIONS = {
12 | "3.11": CUDA_PY311_RUNTIME_IMAGE,
13 | "3.12": CUDA_PY312_RUNTIME_IMAGE,
14 | }
15 | MOUNT_PATH = "/home/ray/files"
16 |
--------------------------------------------------------------------------------
/.github/resources/wait_for_job_cell.json:
--------------------------------------------------------------------------------
1 | {
2 | "cell_type": "code",
3 | "execution_count": null,
4 | "metadata": {},
5 | "outputs": [],
6 | "source": [
7 | "from time import sleep\n",
8 | "\n",
9 | "finished = False\n",
10 | "while not finished:\n",
11 | " sleep(5)\n",
12 | " status = client.get_job_status(submission_id)\n",
13 | " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n",
14 | " print(status)\n",
15 | "print(\"Job status \" + status)\n",
16 | "print(\"Logs: \")\n",
17 | "print(client.get_job_logs(submission_id))\n",
18 | "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\""
19 | ]
20 | }
21 |
--------------------------------------------------------------------------------
/docs/sphinx/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/.github/build/Containerfile:
--------------------------------------------------------------------------------
1 | FROM registry.redhat.io/ubi9/python-39:latest
2 |
3 | LABEL summary="Toolchain for running pre-commit hooks." \
4 | description="Toolchain for running pre-commit hooks" \
5 | io.k8s.display-name="Pre-Commit Toolchain"
6 |
7 | USER root
8 | RUN dnf install nodejs -y && \
9 | dnf clean all && \
10 | rm -rf /var/cache/dnf
11 | ADD https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz $TMPDIR/
12 | RUN tar -C /usr/local/bin -xvf $TMPDIR/oc.tar.gz && \
13 | chmod +x /usr/local/bin/oc && \
14 | rm $TMPDIR/oc.tar.gz
15 | USER $USERID
16 |
17 | RUN pip3 install poetry && \
18 | poetry config virtualenvs.create false
19 | COPY pyproject.toml ./
20 | RUN poetry install
21 |
22 | CMD bash
23 |
--------------------------------------------------------------------------------
/.github/resources/minio_remote_config_cell.json:
--------------------------------------------------------------------------------
1 | {
2 | "cell_type": "code",
3 | "execution_count": null,
4 | "metadata": {},
5 | "outputs": [],
6 | "source": [
7 | "@ray.remote\n",
8 | "def get_minio_run_config():\n",
9 | " import s3fs\n",
10 | " import pyarrow\n",
11 | " s3_fs = s3fs.S3FileSystem(\n",
12 | " key = \"minio\",\n",
13 | " secret = \"minio123\",\n",
14 | " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n",
15 | " )\n",
16 | " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n",
17 | " run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n",
18 | " return run_config"
19 | ]
20 | }
21 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/python_client_test/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | ## For developers
4 |
5 | 1. `pip install -U pip setuptools`
6 | 1. `cd clients/python-client && pip install -e .`
7 |
8 | Uninstall with `pip uninstall python-client`.
9 |
10 | ## For testing run
11 |
12 | `python -m unittest discover 'clients/python-client/python_client_test/'`
13 |
14 | ### Coverage report
15 |
16 | #### Pre-requisites
17 |
18 | * `sudo apt install libsqlite3-dev`
19 | * `pyenv install 3.6.5` # or your Python version
20 | * `pip install db-sqlite3 coverage`
21 |
22 | __To gather data__
23 | `python -m coverage run -m unittest`
24 |
25 | __to generate a coverage report__
26 | `python -m coverage report`
27 |
28 | __to generate the test coverage report in HTML format__
29 | `python -m coverage html`
30 |
--------------------------------------------------------------------------------
/docs/generate-documentation.md:
--------------------------------------------------------------------------------
1 | # Generate CodeFlare Documentation with Sphinx
2 | The following is a short guide on how you can use Sphinx to auto-generate code documentation. Documentation for the latest SDK release can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html).
3 |
4 | 1. Clone the CodeFlare SDK
5 | ``` bash
6 | git clone https://github.com/project-codeflare/codeflare-sdk.git
7 | ```
8 | 2. [Install Sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html)
9 | 3. Run the below command to generate code documentation
10 | ``` bash
11 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generates RST files
12 | make html -C docs/sphinx # Builds HTML files
13 | ```
14 | 4. You can access the docs locally at `docs/sphinx/_build/html/index.html`
15 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "python-client"
3 | version = "0.0.0-dev"
4 | description = "Python Client for Kuberay"
5 | license = "Apache-2.0"
6 |
7 | readme = "README.md"
8 | repository = "https://github.com/ray-project/kuberay"
9 | homepage = "https://github.com/ray-project/kuberay"
10 | keywords = ["kuberay", "python", "client"]
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "License :: OSI Approved :: Apache Software License",
14 | "Operating System :: OS Independent"
15 | ]
16 | packages = [
17 | { include = "python_client" }
18 | ]
19 |
20 | [tool.poetry.dependencies]
21 | python = "^3.11"
22 | kubernetes = ">=25.0.0"
23 |
24 | [build-system]
25 | requires = ["poetry-core>=1.0.0"]
26 | build-backend = "poetry.core.masonry.api"
27 |
--------------------------------------------------------------------------------
/ui-tests/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@jupyter-widgets/ui-tests",
3 | "private": true,
4 | "version": "0.1.0",
5 | "description": "ipywidgets UI Tests",
6 | "scripts": {
7 | "start": "jupyter lab --config ./jupyter_server_config.py",
8 | "start:detached": "jlpm start&",
9 | "test": "npx playwright test",
10 | "test:debug": "PWDEBUG=1 npx playwright test",
11 | "test:report": "http-server ./playwright-report -a localhost -o",
12 | "test:update": "npx playwright test --update-snapshots",
13 | "deduplicate": "jlpm && yarn-deduplicate -s fewer --fail"
14 | },
15 | "author": "Project Jupyter",
16 | "license": "BSD-3-Clause",
17 | "devDependencies": {
18 | "@jupyterlab/galata": "^5.3.0",
19 | "@playwright/test": "^1.57.0",
20 | "yarn-deduplicate": "^6.0.1"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/.github/workflows/snyk-security.yaml:
--------------------------------------------------------------------------------
1 | name: Snyk Security
2 | on:
3 | push:
4 | branches:
5 | - main
6 |
7 | jobs:
8 | snyk-scan:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout code
12 | uses: actions/checkout@v4
13 |
14 | - name: Install Snyk CLI
15 | run: npm install -g snyk
16 |
17 | - name: Snyk Monitor and Test multiple projects
18 | env:
19 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
20 | SNYK_ORG: ${{ secrets.SNYK_ORG }}
21 | run: |
22 | echo "Fetching tags"
23 | git fetch origin 'refs/tags/*:refs/tags/*'
24 |
25 | echo "Authenticating with Snyk"
26 | snyk auth ${SNYK_TOKEN}
27 |
28 | echo "Scanning project: codeflare-sdk/main"
29 | snyk monitor --all-projects --exclude=requirements.txt --org=${SNYK_ORG} --target-reference="main"
30 |
--------------------------------------------------------------------------------
/tests/e2e/start_ray_cluster.py:
--------------------------------------------------------------------------------
import sys
import os

# NOTE(review): `sleep` appears unused in this script — candidate for removal.
from time import sleep

from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration

# Usage: start_ray_cluster.py <namespace>
# Target namespace comes from the first CLI argument; the container image
# from the RAY_IMAGE environment variable (None when unset).
namespace = sys.argv[1]
ray_image = os.getenv("RAY_IMAGE")

# Minimal single-worker cluster definition used by the e2e tests,
# submitted via an AppWrapper (appwrapper=True).
cluster = Cluster(
    ClusterConfiguration(
        name="mnist",
        namespace=namespace,
        num_workers=1,
        head_cpu_requests="500m",
        head_cpu_limits="500m",
        head_memory_requests=2,
        head_memory_limits=2,
        worker_cpu_requests="500m",
        worker_cpu_limits=1,
        worker_memory_requests=1,
        worker_memory_limits=2,
        image=ray_image,
        appwrapper=True,
    )
)

# Create the cluster, then block until it is ready, printing status
# before and after the wait for easier debugging in CI logs.
cluster.apply()

cluster.status()

cluster.wait_ready()

cluster.status()

cluster.details()
38 |
--------------------------------------------------------------------------------
/docs/sphinx/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/coverage.svg:
--------------------------------------------------------------------------------
1 |
2 |
22 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/__init__.py:
--------------------------------------------------------------------------------
1 | from .ray import (
2 | Cluster,
3 | ClusterConfiguration,
4 | RayClusterStatus,
5 | CodeFlareClusterStatus,
6 | RayCluster,
7 | get_cluster,
8 | list_all_queued,
9 | list_all_clusters,
10 | AWManager,
11 | AppWrapperStatus,
12 | RayJobClient,
13 | RayJob,
14 | ManagedClusterConfig,
15 | )
16 |
17 | from .common.widgets import view_clusters
18 |
19 | from .common import (
20 | Authentication,
21 | KubeConfiguration,
22 | TokenAuthentication,
23 | KubeConfigFileAuthentication,
24 | )
25 |
26 | from .common.kueue import (
27 | list_local_queues,
28 | )
29 |
30 | from .common.utils import generate_cert
31 | from .common.utils.demos import copy_demo_nbs
32 |
from importlib.metadata import version, PackageNotFoundError

try:
    __version__ = version("codeflare-sdk")  # use metadata associated with built package

except PackageNotFoundError:
    # Fallback for source checkouts where the distribution metadata is absent.
    __version__ = "v0.0.0"
40 |
--------------------------------------------------------------------------------
/.github/workflows/dependabot-labeler.yaml:
--------------------------------------------------------------------------------
1 | # This workflow file adds the 'lgtm' and 'approved' labels to Dependabot PRs
2 | # This is done to ensure that the PRs that pass required status checks are automatically merged by the CodeFlare bot
3 | name: Dependabot Labeler
4 |
5 | on:
6 | pull_request_target:
7 | branches: [ main ]
8 |
9 | jobs:
10 | add-approve-lgtm-label:
11 | if: ${{ github.actor == 'dependabot[bot]' && contains(github.event.pull_request.labels.*.name, 'dependabot') }}
12 | runs-on: ubuntu-latest
13 |
14 | # Permission required to edit a PR
15 | permissions:
16 | pull-requests: write
17 | issues: write
18 |
19 | steps:
20 | - name: Checkout code
21 | uses: actions/checkout@v4
22 |
23 | - name: Add approve and lgtm labels to Dependabot PR
24 | run: |
25 | gh pr edit ${{ github.event.pull_request.number }} --add-label "lgtm" --add-label "approved"
26 | env:
27 | GITHUB_TOKEN: ${{ secrets.GH_CLI_TOKEN }}
28 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/k8s_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Kubernetes utility functions for the CodeFlare SDK.
3 | """
4 |
5 | import os
6 | from kubernetes import config
7 | from ..kubernetes_cluster import config_check, _kube_api_error_handling
8 |
9 |
def get_current_namespace():  # pragma: no cover
    """
    Retrieves the current Kubernetes namespace.

    Tries the in-cluster service-account namespace file first; when that file
    is absent or unreadable, falls back to the namespace of the active kube
    config context.

    Returns:
        str:
            The current namespace or None if not found.
    """
    ns_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
    if os.path.isfile(ns_file):
        try:
            # Running inside a pod: the mounted service account exposes the namespace.
            # Use a context manager so the file handle is always closed.
            with open(ns_file, "r") as file:
                return file.readline().strip("\n")
        except Exception:
            print("Unable to find current namespace")
    # Fall back to the local kube config (previously this branch was only
    # reachable when the namespace file existed but could not be read,
    # leaving out-of-cluster callers with an implicit None).
    print("trying to gather from current context")
    try:
        _, active_context = config.list_kube_config_contexts(config_check())
    except Exception as e:
        return _kube_api_error_handling(e)
    try:
        return active_context["context"]["namespace"]
    except KeyError:
        # The active context does not declare a namespace.
        return None
34 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/in_cluster/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | In-cluster test execution utilities.
3 |
4 | This package provides functions for setting up and managing test execution
5 | inside Kubernetes pods, including RBAC setup, service account management,
6 | and pod execution.
7 | """
8 |
9 | from .rbac import (
10 | create_test_service_account,
11 | create_rayjob_rbac,
12 | delete_test_service_account,
13 | )
14 | from .setup import (
15 | setup_in_cluster_test_environment,
16 | cleanup_in_cluster_test_environment,
17 | )
18 | from .pod_execution import (
19 | PodExecutionResult,
20 | create_test_pod,
21 | create_sdk_test_pod,
22 | run_code_in_pod,
23 | wait_for_pod_completion,
24 | get_pod_logs,
25 | delete_test_pod,
26 | cleanup_test_pods,
27 | )
28 |
# Public API re-exported by this package; keep in sync with the imports above.
__all__ = [
    "create_test_service_account",
    "create_rayjob_rbac",
    "delete_test_service_account",
    "setup_in_cluster_test_environment",
    "cleanup_in_cluster_test_environment",
    "PodExecutionResult",
    "create_test_pod",
    "create_sdk_test_pod",
    "run_code_in_pod",
    "wait_for_pod_completion",
    "get_pod_logs",
    "delete_test_pod",
    "cleanup_test_pods",
]
44 |
--------------------------------------------------------------------------------
/tests/auth-test.crt:
--------------------------------------------------------------------------------
1 | -----BEGIN CERTIFICATE-----
2 | MIIDOTCCAiGgAwIBAgIUENjaZDrvhc5uV3j7GI8deZJwc+YwDQYJKoZIhvcNAQEL
3 | BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
4 | GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDAeFw0yNDA1MTMxMTE1NDZaFw0yNTA1
5 | MTMxMTE1NDZaMEUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw
6 | HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQwggEiMA0GCSqGSIb3DQEB
7 | AQUAA4IBDwAwggEKAoIBAQDEYYk81jvPijZXXeI9cByf5EIbOVaBTH7I51J9EKG5
8 | Y/KRXI43WgvVEiZ3jP8LJnSD79WhBiL6TgadQZje5ndroRYDM9vyqz1OUZapnOO+
9 | yzl01y/qSsH8Kn88eLAzkE9HSu4QN9PuJtySyksjDFQJ6kjyE8ZHUSorur0FlLLf
10 | IToFgTuaIPDYjvFRchOCfZ7sV/MF7LxqFfFnaWOYvH41ZdvqJiRcVsMi+mYs9/I/
11 | I72IMXwVnQDVnK8H84ntEmHNN6NoVuMKla0So4/wKcHJSCgS3axLI2Ka2aaaJo9K
12 | l2cn21NOyodF+DaSFy7qaGRXxoTQ2k9tUrSvxkBJvRmBAgMBAAGjITAfMB0GA1Ud
13 | DgQWBBRTK8mO5XMcmR+Xg/PVNFnvz4eubDANBgkqhkiG9w0BAQsFAAOCAQEAlZva
14 | 6ws3zRff7u0tWT2JJaE1uPqsuAdHtVvEyAMp2QvYfyrgADTroUTaSU4p6ppX/t7v
15 | ynHhuzR6UOVkuY0/CH1P3UUGrEPNOXT8i2BDwL+j4y2K2aRN8zU0Nu/IVePBhu+4
16 | Jdt+3P7/MuwiCON5JukgxUYlQKhVhzFj7GOd2+Ca+fh8Siq3tkWDSN54+90fgylQ
17 | +74Yfya1NVabpzLqP3Isqu2XQhEVaBFvj8Yu0h83e3D8LeQToC3mVMF4yy5BZ9Ty
18 | K66YGlGQgszWEUFPEdsB8Dj/iJMhkWXuyc3u/w0s3t7rXeMYYgr+xrEeK+g0oyB5
19 | xeZuMjd567Znmu5oMw==
20 | -----END CERTIFICATE-----
21 |
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Python Tests
2 |
3 | on:
4 | pull_request:
5 | branches: [ main, ray-jobs-feature ]
6 | push:
7 | branches: [ main, ray-jobs-feature ]
8 |
9 | jobs:
10 | unit-tests:
11 |
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 |
16 | - name: Set up python
17 | uses: actions/setup-python@v5
18 | with:
19 | python-version: '3.11'
20 | - name: Install poetry
21 | run: pip install poetry
22 | - name: Install dependencies with poetry
23 | run: |
24 | poetry config virtualenvs.create false
25 | poetry lock
26 | poetry install --with test
27 | - name: Test with pytest and check coverage
28 | run: |
29 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest
30 | coverage=$(coverage report -m | tail -1 | tail -c 4 | head -c 2)
31 | if (( $coverage < 90 )); then echo "Coverage failed at ${coverage}%"; exit 1; else echo "Coverage passed, ${coverage}%"; fi
32 | - name: Upload to Codecov
33 | uses: codecov/codecov-action@v4
34 | with:
35 | token: ${{ secrets.CODECOV_TOKEN }}
36 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/demos.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import shutil
3 |
4 | package_dir = pathlib.Path(__file__).parent.parent.parent.resolve()
5 | demo_dir = f"{package_dir}/demo-notebooks"
6 |
7 |
def copy_demo_nbs(dir: str = "./demo-notebooks", overwrite: bool = False):
    """
    Copy the demo notebooks from the package to the current working directory

    overwrite=True will overwrite any files that exactly match files written by copy_demo_nbs in the target directory.
    Any files that exist in the directory that don't match these values will remain untouched.

    Args:
        dir (str):
            The directory to copy the demo notebooks to. Defaults to "./demo-notebooks".
        overwrite (bool):
            Whether to overwrite files in the directory if it already exists. Defaults to False.

    Raises:
        FileExistsError:
            If the target directory already exists and overwrite is False.
    """
    # Refuse to touch a pre-existing directory unless the caller opted in.
    if not overwrite and pathlib.Path(dir).exists():
        raise FileExistsError(
            f"Directory {dir} already exists. Please remove it or provide a different location."
        )

    # dirs_exist_ok allows merging into the directory on the overwrite path.
    shutil.copytree(demo_dir, dir, dirs_exist_ok=True)
32 |
--------------------------------------------------------------------------------
/docs/sphinx/index.rst:
--------------------------------------------------------------------------------
1 | .. CodeFlare SDK documentation master file, created by
2 | sphinx-quickstart on Thu Oct 10 11:27:58 2024.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | CodeFlare SDK documentation
7 | ===========================
8 |
9 | The CodeFlare SDK is an intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem.
10 |
11 |
12 | .. toctree::
13 | :maxdepth: 2
14 | :caption: Code Documentation:
15 |
16 | modules
17 |
18 | .. toctree::
19 | :maxdepth: 1
20 | :caption: User Documentation:
21 |
22 | user-docs/authentication
23 | user-docs/cluster-configuration
24 | user-docs/ray-cluster-interaction
25 | user-docs/e2e
26 | user-docs/s3-compatible-storage
27 | user-docs/setup-kueue
28 | user-docs/ui-widgets
29 |
30 | Quick Links
31 | ===========
- `PyPi <https://pypi.org/project/codeflare-sdk/>`__
- `GitHub <https://github.com/project-codeflare/codeflare-sdk>`__
- `OpenShift AI Documentation <https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed>`__
35 |
--------------------------------------------------------------------------------
/docs/sphinx/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# Make the repository root importable so autodoc can resolve the package.
sys.path.insert(0, os.path.abspath(".."))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "CodeFlare SDK"
copyright = "2024, Project CodeFlare"
author = "Project CodeFlare"
# NOTE: rewritten automatically by the publish-documentation workflow on release.
release = "v0.21.1"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.todo",
    "sphinx.ext.viewcode",
    "sphinx.ext.autosummary",
    "sphinx_rtd_theme",
]

templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
39 |
--------------------------------------------------------------------------------
/tests/e2e/mnist_rayjob.py:
--------------------------------------------------------------------------------
import sys

from time import sleep

from support import *

from codeflare_sdk.ray.cluster.cluster import get_cluster
from codeflare_sdk.ray.client import RayJobClient

# Usage: mnist_rayjob.py <namespace>
namespace = sys.argv[1]

# Look up the RayCluster created earlier in the e2e flow.
cluster = get_cluster("mnist", namespace)

cluster.details()

# Authenticate against the protected Ray dashboard with the current user's token.
auth_token = run_oc_command(["whoami", "--show-token=true"])
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)

# Submit the job
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={"working_dir": "/", "pip": "requirements.txt"},
)
print(f"Submitted job with ID: {submission_id}")

# Poll until the job reaches a terminal state, failing after `timeout` seconds.
# (The previous `done` flag was dead: it was never set, and the loop exited
# only via `break`, so the `while not done` / `if not done` checks were no-ops.)
time = 0
timeout = 900
while True:
    status = client.get_job_status(submission_id)
    if status.is_terminal():
        break
    print(status)
    if timeout and time >= timeout:
        raise TimeoutError(f"job has timed out after waiting {timeout}s")
    sleep(5)
    time += 5

logs = client.get_job_logs(submission_id)
print(logs)

# Clean up the job and tear the cluster down before reporting the result.
client.delete_job(submission_id)
cluster.down()


# Non-zero exit status signals failure to the surrounding test harness.
if status != "SUCCEEDED":
    exit(1)
else:
    exit(0)
52 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/appwrapper/status.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | The status sub-module defines Enums containing information for
17 | AppWrapper states, as well as dataclasses to store information for AppWrappers.
18 | """
19 |
20 | from dataclasses import dataclass
21 | from enum import Enum
22 |
23 |
class AppWrapperStatus(Enum):
    """
    Defines the possible reportable phases of an AppWrapper.

    Each member's value is the lower-cased phase string — presumably as
    reported by the AppWrapper controller; confirm against the CRD.
    """

    SUSPENDED = "suspended"
    RESUMING = "resuming"
    RUNNING = "running"
    RESETTING = "resetting"
    SUSPENDING = "suspending"
    SUCCEEDED = "succeeded"
    FAILED = "failed"
    TERMINATING = "terminating"
37 |
38 |
@dataclass
class AppWrapper:
    """
    For storing information about an AppWrapper.
    """

    # Name of the AppWrapper resource.
    name: str
    # Last observed phase of the AppWrapper.
    status: AppWrapperStatus
47 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Please see the documentation for all configuration options:
2 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
3 |
4 | version: 2
5 | updates:
6 | # This is to update requirements.txt files in the guided-demos, and e2e directories.
7 | - package-ecosystem: "pip"
8 | directories:
9 | - "**/demo-notebooks/guided-demos*"
10 | - "/tests/e2e"
11 | schedule:
12 | interval: "daily"
13 | ignore:
14 | - dependency-name: "*"
15 | update-types: ["version-update:semver-patch"]
16 | open-pull-requests-limit: 1
17 | labels:
18 | - "dependabot"
19 | - "test-guided-notebooks"
20 |
21 | # pip means poetry in this case, this keeps poetry.lock up to date with constraints in pyproject.toml.
22 | - package-ecosystem: "pip"
23 | directory: "/"
24 | schedule:
25 | interval: "daily"
26 | ignore:
27 | - dependency-name: "*"
28 | update-types: ["version-update:semver-patch"]
29 | open-pull-requests-limit: 1
30 | labels:
31 | - "dependabot"
32 | - "test-guided-notebooks"
33 |
34 | # npm means yarn in this case, this keeps yarn.lock up to date with constraints in package.json.
35 | - package-ecosystem: "npm"
36 | directory: "/ui-tests"
37 | schedule:
38 | interval: "daily"
39 | ignore:
40 | - dependency-name: "*"
41 | update-types: ["version-update:semver-patch"]
42 | open-pull-requests-limit: 1
43 | labels:
44 | - "dependabot"
45 | - "test-ui-notebooks"
46 |
--------------------------------------------------------------------------------
/.github/workflows/publish-documentation.yaml:
--------------------------------------------------------------------------------
1 | name: Publish Documentation
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | codeflare_sdk_release_version:
7 | type: string
8 | required: true
9 | description: 'Version number (for example: 0.1.0)'
10 |
11 | permissions:
12 | contents: write
13 |
14 | jobs:
15 | docs:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Install Python
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: 3.11
23 | - name: Install Sphinx
24 | run: |
25 | sudo apt-get update
26 | sudo apt-get install python3-sphinx
27 | - name: Install Poetry
28 | uses: abatilo/actions-poetry@v2
29 | with:
30 | poetry-version: 1.8.3
31 | - name: Create new documentation
32 | run: |
33 | python3 -m venv .venv
34 | source .venv/bin/activate
35 | poetry install --with docs
36 | sed -i 's/release = "v[0-9]\+\.[0-9]\+\.[0-9]\+"/release = "${{ github.event.inputs.codeflare_sdk_release_version }}"/' docs/sphinx/conf.py
37 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generate docs but ignore test files
38 | make html -C docs/sphinx
39 | - name: Deploy to GitHub Pages
40 | uses: peaceiris/actions-gh-pages@v3
41 | with:
42 | publish_branch: gh-pages
43 | github_token: ${{ secrets.GITHUB_TOKEN }}
44 | publish_dir: docs/sphinx/_build/html
45 | force_orphan: true
46 |
--------------------------------------------------------------------------------
/.github/workflows/build-test-image.yaml:
--------------------------------------------------------------------------------
1 | name: Build and Push Test Image
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 | inputs:
9 | E2E_TEST_IMAGE_VERSION:
10 | description: 'Tag for the test image (defaults to latest)'
11 | required: false
12 | default: 'latest'
13 | type: string
14 | BRANCH:
15 | description: 'Branch to build from (defaults to main)'
16 | required: false
17 | default: 'main'
18 | type: string
19 |
20 | jobs:
21 | build-and-push:
22 | runs-on: ubuntu-latest
23 | env:
24 | E2E_TEST_IMAGE_VERSION: ${{ github.event.inputs.E2E_TEST_IMAGE_VERSION || 'latest' }}
25 | steps:
26 | - name: Checkout code
27 | uses: actions/checkout@v5
28 | with:
29 | ref: ${{ github.event.inputs.BRANCH || 'main' }}
30 | submodules: recursive
31 |
32 | - name: Login to Quay.io
33 | id: podman-login-quay
34 | env:
35 | QUAY_USERNAME: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_USERNAME }}
36 | QUAY_PASSWORD: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_PASSWORD }}
37 | run: |
38 | set -euo pipefail
39 | printf '%s' "$QUAY_PASSWORD" | podman login --username "$QUAY_USERNAME" --password-stdin quay.io
40 |
41 | - name: Build test image
42 | run: make build-test-image
43 |
44 | - name: Push test image
45 | run: make push-test-image
46 |
47 | - name: Logout from Quay.io
48 | if: always() && steps.podman-login-quay.outcome == 'success'
49 | run: podman logout quay.io
50 |
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/download_mnist_datasets.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from torchvision.datasets import MNIST
17 | from torchvision import transforms
18 |
19 |
def download_mnist_dataset(destination_dir):
    """
    Download the MNIST train and test splits into *destination_dir*.

    Args:
        destination_dir: Directory in which the raw MNIST files are stored.
            Created (including parents) if it does not already exist.
    """
    # exist_ok avoids a race between an existence check and creation.
    os.makedirs(destination_dir, exist_ok=True)

    # Standard MNIST normalization constants (dataset mean / std).
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )

    # Instantiating the datasets with download=True fetches the files;
    # the dataset objects themselves are not needed afterwards.
    MNIST(root=destination_dir, train=True, download=True, transform=transform)
    MNIST(root=destination_dir, train=False, download=True, transform=transform)

    print(f"MNIST dataset downloaded in {destination_dir}")
41 |
42 |
# Download into the directory that contains this script.
destination_dir = os.path.dirname(os.path.abspath(__file__))

download_mnist_dataset(destination_dir)
47 |
--------------------------------------------------------------------------------
/tests/upgrade/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests
17 | """
18 |
19 | import sys
20 | import os
21 | import pytest
22 |
23 | # Add parent test directory to path
24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
25 |
26 | # Import all fixtures from ui/conftest.py
27 | from ui.conftest import (
28 | selenium_driver,
29 | dashboard_url,
30 | test_credentials,
31 | login_to_dashboard,
32 | )
33 |
34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"]
35 |
36 |
37 | # Hook to capture test results for teardown methods
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """
    Capture each test phase's report on the test item.

    Stores the report as ``item.rep_<phase>`` (e.g. ``rep_call``) so that
    teardown methods can check whether the test failed.
    """
    outcome = yield
    report = outcome.get_result()

    # Exposes item.rep_setup / item.rep_call / item.rep_teardown to teardown.
    setattr(item, "rep_" + report.when, report)
49 |
--------------------------------------------------------------------------------
/tests/e2e_v2/upgrade/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests
17 | """
18 |
19 | import sys
20 | import os
21 | import pytest
22 |
23 | # Add parent test directory to path
24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
25 |
26 | # Import all fixtures from ui/conftest.py
27 | from ui.conftest import (
28 | selenium_driver,
29 | dashboard_url,
30 | test_credentials,
31 | login_to_dashboard,
32 | )
33 |
34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"]
35 |
36 |
37 | # Hook to capture test results for teardown methods
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """
    Capture each test phase's report on the test item.

    Stores the report as ``item.rep_<phase>`` (e.g. ``rep_call``) so that
    teardown methods can check whether the test failed.
    """
    outcome = yield
    report = outcome.get_result()

    # Exposes item.rep_setup / item.rep_call / item.rep_teardown to teardown.
    setattr(item, "rep_" + report.when, report)
49 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/rayjobs/test/conftest.py:
--------------------------------------------------------------------------------
1 | """Shared pytest fixtures for rayjobs tests."""
2 |
3 | import pytest
4 | from unittest.mock import MagicMock
5 |
6 |
7 | # Global test setup that runs automatically for ALL tests
@pytest.fixture(autouse=True)
def auto_mock_setup(mocker):
    """Mock common Kubernetes/RayJob dependencies for every test.

    Returns a dict of the mocked API instances so individual tests can
    tune their behavior (return values, side effects, ...).
    """
    # Never talk to a real cluster during unit tests.
    mocker.patch("kubernetes.config.load_kube_config")

    # Always mock get_default_kueue_name to prevent K8s API calls.
    mocker.patch(
        "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name",
        return_value="default-queue",
    )

    namespace_mock = mocker.patch(
        "codeflare_sdk.ray.rayjobs.rayjob.get_current_namespace",
        return_value="test-namespace",
    )

    def _patch_api_class(target):
        # Patch the class and hand back the instance its constructor returns.
        instance = MagicMock()
        mocker.patch(target, return_value=instance)
        return instance

    rayjob_instance = _patch_api_class("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
    cluster_instance = _patch_api_class("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi")
    k8s_instance = _patch_api_class("kubernetes.client.CoreV1Api")

    # get_api_client is used inside the runtime_env module, so patch it there.
    mocker.patch("codeflare_sdk.ray.rayjobs.runtime_env.get_api_client")

    # Hand the mocks back so tests can configure them as needed.
    return {
        "rayjob_api": rayjob_instance,
        "cluster_api": cluster_instance,
        "k8s_api": k8s_instance,
        "get_current_namespace": namespace_mock,
    }
46 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/rayjobs/status.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | The status sub-module defines Enums containing information for Ray job
17 | deployment states and CodeFlare job states, as well as
18 | dataclasses to store information for Ray jobs.
19 | """
20 |
21 | from dataclasses import dataclass
22 | from enum import Enum
23 | from typing import Optional
24 |
25 |
class RayJobDeploymentStatus(Enum):
    """
    Defines the possible deployment states of a Ray job (from the KubeRay RayJob API).
    """

    COMPLETE = "Complete"  # Job finished successfully
    RUNNING = "Running"  # Job is currently executing
    FAILED = "Failed"  # Job terminated unsuccessfully
    SUSPENDED = "Suspended"  # Job execution is suspended
    UNKNOWN = "Unknown"  # State could not be determined
36 |
37 |
class CodeflareRayJobStatus(Enum):
    """
    Defines the possible reportable states of a CodeFlare Ray job.
    """

    COMPLETE = 1  # Job finished successfully
    RUNNING = 2  # Job is currently executing
    FAILED = 3  # Job terminated unsuccessfully
    SUSPENDED = 4  # Job execution is suspended
    UNKNOWN = 5  # State could not be determined
48 |
49 |
@dataclass
class RayJobInfo:
    """
    For storing information about a Ray job.
    """

    name: str  # RayJob resource name
    job_id: str  # Ray job identifier
    status: RayJobDeploymentStatus  # Current deployment status
    namespace: str  # Kubernetes namespace the job runs in
    cluster_name: str  # Name of the Ray cluster serving the job
    start_time: Optional[str] = None  # Start timestamp, if the job has started
    end_time: Optional[str] = None  # End timestamp, if the job has finished
    failed_attempts: int = 0  # Count of failed run attempts
    succeeded_attempts: int = 0  # Count of successful run attempts
65 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to the CodeFlare SDK
2 |
3 | Thank you for your interest in contributing to the CodeFlare SDK!
4 |
5 | ## Getting Started
6 |
7 | ### Prerequisites
8 |
9 | - Python 3.11
10 | - [Poetry](https://python-poetry.org/)
11 |
12 | ### Setting Up Your Development Environment
13 |
14 | 1. **Clone the repository:**
15 |
16 | ```sh
17 | git clone https://github.com/project-codeflare/codeflare-sdk.git
18 | cd codeflare-sdk
19 | ```
20 |
21 | 2. Create a Poetry virtual environment:
22 |
23 | ```sh
24 | poetry shell
25 | ```
26 |
27 | 3. Install dependencies:
28 |
29 | ```sh
30 | poetry install
31 | ```
32 |
33 | - To include test dependencies, run:
34 |
35 | ```sh
36 | poetry install --with test
37 | ```
38 |
39 | - To include docs dependencies, run:
40 |
41 | ```sh
42 | poetry install --with docs
43 | ```
44 |
45 | - To include both test and docs dependencies, run:
46 |
47 | ```sh
48 | poetry install --with test,docs
49 | ```
50 |
51 | ## Development Workflow
52 |
53 | ### Pre-commit
54 |
55 | We use pre-commit to ensure consistent code formatting. To enable pre-commit hooks, run:
56 |
57 | ```sh
58 | pre-commit install
59 | ```
60 |
61 | ## Testing
62 |
63 | To install CodeFlare SDK in editable mode, run:
64 |
65 | ```sh
66 | pip install -e .
67 | ```
68 |
69 | ### Unit Testing
70 |
71 | To run the unit tests, execute:
72 |
73 | ```sh
74 | pytest -v src/codeflare_sdk
75 | ```
76 |
77 | ### Local e2e Testing
78 |
79 | - Please follow the [e2e documentation](https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/sphinx/user-docs/e2e.rst)
80 |
81 | #### Code Coverage
82 |
83 | - Run tests with the following command: `coverage run -m pytest`
84 | - To then view a code coverage report w/ missing lines, run `coverage report -m`
85 |
86 | ### Code Formatting
87 |
88 | - To check file formatting, in top-level dir run `black --check .`
89 | - To auto-reformat all files, remove the `--check` flag
90 | - To reformat an individual file, run `black <path/to/file>`
91 |
--------------------------------------------------------------------------------
/.github/workflows/coverage-badge.yaml:
--------------------------------------------------------------------------------
1 | # This workflow will generate and push an updated coverage badge
2 |
3 | name: Coverage Badge
4 |
5 | on:
6 | push:
7 | branches: [ main, ray-jobs-feature ]
8 |
9 | jobs:
10 | report:
11 |
12 | permissions:
13 | contents: write
14 | pull-requests: write
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Set up Python 3.11
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: 3.11
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install poetry
27 | poetry config virtualenvs.create false
28 | poetry lock
29 | poetry install --with test
30 | - name: Generate coverage report
31 | run: |
32 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest
33 |
34 | - name: Coverage Badge
35 | uses: tj-actions/coverage-badge-py@v2
36 |
37 | - name: Verify Changed files
38 | uses: tj-actions/verify-changed-files@v18
39 | id: changed_files
40 | with:
41 | files: coverage.svg
42 |
43 | - name: Commit files
44 | if: steps.changed_files.outputs.files_changed == 'true'
45 | run: |
46 | git config --local user.email "github-actions[bot]@users.noreply.github.com"
47 | git config --local user.name "github-actions[bot]"
48 | git add coverage.svg
49 | git commit -m "Updated coverage.svg"
50 |
51 | - name: Create Pull Request
52 | if: steps.changed_files.outputs.files_changed == 'true'
53 | uses: peter-evans/create-pull-request@v6
54 | with:
55 | token: ${{ secrets.GITHUB_TOKEN }}
56 | title: "[Automatic] Coverage Badge Update"
57 | commit-message: "Updated coverage.svg"
58 | branch: create-pull-request/coverage-badge-update
59 | delete-branch: true
60 | body: |
61 | This is an automated pull request to update the coverage badge.
62 |
63 | - Updated coverage.svg based on latest test results
64 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/scripts/cpu_script.py:
--------------------------------------------------------------------------------
1 | """
2 | CPU-optimized RayJob validation script using Ray Train.
3 | """
4 |
5 | import ray
6 | import sys
7 | import traceback
8 | from ray import train
9 |
10 |
def train_func(config):
    """Minimal training function for CPU execution."""
    rank = config.get("worker_rank", 0)
    # Cheap deterministic workload standing in for real training.
    loss = sum(i * i for i in range(1000))

    try:
        train.report({"loss": loss, "worker_rank": rank})
    except RuntimeError:
        # Not running inside a Ray Train session; reporting is best-effort.
        pass

    print(f"Worker {rank} completed CPU training task. Result: {loss}")
22 |
23 |
def main():
    """Run a minimal Ray Train task on CPU."""
    try:
        ray.init()
        print("Starting CPU training task...")
        print(f"Ray initialized. Cluster resources: {ray.cluster_resources()}")

        @ray.remote
        def train_worker(worker_id):
            # Prefer the Ray Train path; fall back to a plain computation
            # when the Train context is unavailable.
            try:
                train_func({"worker_rank": worker_id})
                result = sum(i * i for i in range(1000))
                return {"loss": result, "worker_rank": worker_id}
            except Exception as e:
                print(f"Ray Train context not available, using fallback: {e}")
                result = sum(i * i for i in range(1000))
                print(
                    f"Worker {worker_id} completed CPU training task. Result: {result}"
                )
                return {"loss": result, "worker_rank": worker_id}

        worker_outputs = ray.get([train_worker.remote(i) for i in range(1)])

        # Merge every worker's metrics into a single dict.
        all_metrics = {}
        for output in worker_outputs:
            if isinstance(output, dict):
                all_metrics.update(output)

        print(f"Training completed successfully. Metrics: {all_metrics}")
        print("EXISTING_CLUSTER_JOB_SUCCESS")
        return 0

    except Exception as e:
        print(f"FAILURE: Exception occurred: {e}")
        traceback.print_exc()
        return 1
    finally:
        ray.shutdown()
61 |
62 |
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
65 |
--------------------------------------------------------------------------------
/demo-notebooks/additional-demos/batch-inference/simple_batch_inf.py:
--------------------------------------------------------------------------------
1 | import ray
2 | from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
3 |
4 |
# 1. Build the vLLM engine processor configuration.
vllm_config = vLLMEngineProcessorConfig(
    # The base model.
    model_source="unsloth/Llama-3.2-1B-Instruct",
    # vLLM engine config.
    engine_kwargs=dict(
        enable_lora=False,
        # Older GPUs (e.g. T4) don't support bfloat16; remove this line
        # when running on newer hardware.
        dtype="half",
        # Shrink the context window so the model fits on small GPUs;
        # remove this line on large GPUs.
        max_model_len=1024,
    ),
    # Batch size used by Ray Data.
    batch_size=8,
    # A single GPU is enough for this example.
    concurrency=1,
    # If the LoRA adapter lives in S3, set the following path.
    # dynamic_lora_loading_path="s3://your-lora-bucket/",
)

# 2. Build the processor from the config.
processor = build_llm_processor(
    vllm_config,
    preprocess=lambda row: dict(
        # Remove the LoRA model specification
        messages=[
            {
                "role": "system",
                "content": "You are a calculator. Please only output the answer "
                "of the given equation.",
            },
            {"role": "user", "content": f"{row['id']} ** 3 = ?"},
        ],
        sampling_params=dict(
            temperature=0.3,
            max_tokens=20,
            detokenize=False,
        ),
    ),
    postprocess=lambda row: {
        "resp": row["generated_text"],
    },
)

# 3. Synthesize a 32-row dataset.
dataset = ray.data.range(32)
# 4. Attach the processor. Execution is lazy, so nothing runs yet.
dataset = processor(dataset)
# Materialization kicks off the pipeline execution.
dataset = dataset.materialize()

# 5. Print every output row.
for row_out in dataset.take_all():
    print(row_out)
    print("==========")
63 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/status.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | The status sub-module defines Enums containing information for Ray cluster
17 | states, and CodeFlare cluster states, as well as
18 | dataclasses to store information for Ray clusters.
19 | """
20 |
21 | from dataclasses import dataclass, field
22 | from enum import Enum
23 | import typing
24 | from typing import Union
25 |
26 |
class RayClusterStatus(Enum):
    """
    Defines the possible reportable states of a Ray cluster.
    """

    # Mirrors the KubeRay RayCluster state values:
    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1/raycluster_types.go#L112-L117
    READY = "ready"  # Cluster is up and ready for use
    UNHEALTHY = "unhealthy"  # Cluster is in a degraded state
    FAILED = "failed"  # Cluster failed
    UNKNOWN = "unknown"  # State could not be determined
    SUSPENDED = "suspended"  # Cluster is suspended
38 |
39 |
class CodeFlareClusterStatus(Enum):
    """
    Defines the possible reportable states of a Codeflare cluster.
    """

    READY = 1  # Cluster is up and ready for use
    STARTING = 2  # Cluster is starting up
    QUEUED = 3  # Cluster request is queued
    QUEUEING = 4  # Cluster request is being queued
    FAILED = 5  # Cluster failed
    UNKNOWN = 6  # State could not be determined
    SUSPENDED = 7  # Cluster is suspended
52 |
53 |
@dataclass
class RayCluster:
    """
    For storing information about a Ray cluster.
    """

    name: str  # RayCluster resource name
    status: RayClusterStatus  # Current reported cluster status
    head_cpu_requests: int  # CPU requests for the head node
    head_cpu_limits: int  # CPU limits for the head node
    head_mem_requests: str  # Memory requests for the head node
    head_mem_limits: str  # Memory limits for the head node
    num_workers: int  # Number of worker nodes
    worker_mem_requests: str  # Memory requests per worker node
    worker_mem_limits: str  # Memory limits per worker node
    worker_cpu_requests: Union[int, str]  # CPU requests per worker node
    worker_cpu_limits: Union[int, str]  # CPU limits per worker node
    namespace: str  # Kubernetes namespace the cluster lives in
    dashboard: str  # Ray dashboard URL
    worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)  # e.g. GPUs per worker
    head_extended_resources: typing.Dict[str, int] = field(default_factory=dict)  # e.g. GPUs on the head
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CodeFlare SDK
2 |
3 | [](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml)
4 | 
5 |
6 | An intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem.
7 |
8 | For guided demos and basics walkthroughs, check out the following links:
9 |
10 | - Guided demo notebooks available [here](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos), and copies of the notebooks with [expected output](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos/notebook-ex-outputs) also available
11 | - these demos can be copied into your current working directory when using the `codeflare-sdk` by using the `codeflare_sdk.copy_demo_nbs()` function
12 | - Additionally, we have a [video walkthrough](https://www.youtube.com/watch?v=U76iIfd9EmE) of these basic demos from June, 2023
13 |
14 | Full documentation can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html)
15 |
16 | ## Installation
17 |
18 | Can be installed via `pip`: `pip install codeflare-sdk`
19 |
20 | ## Development
21 |
22 | Please see our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed instructions.
23 |
24 | ## Release Instructions
25 |
26 | ### Automated Releases
27 |
28 | It is possible to use the Release Github workflow to do the release. This is generally the process we follow for releases
29 |
30 | ### Manual Releases
31 |
32 | The following instructions apply when doing release manually. This may be required in instances where the automation is failing.
33 |
34 | - Check and update the version in "pyproject.toml" file.
35 | - Commit all the changes to the repository.
36 | - Create a GitHub release for the new version.
37 | - Build the Python package. `poetry build`
38 | - If not present already, add the API token to Poetry.
39 | `poetry config pypi-token.pypi API_TOKEN`
40 | - Publish the Python package. `poetry publish`
41 | - Trigger the [Publish Documentation](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/publish-documentation.yaml) workflow
42 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import sys
15 |
16 | from codeflare_sdk.common.utils.constants import (
17 | SUPPORTED_PYTHON_VERSIONS,
18 | CUDA_PY312_RUNTIME_IMAGE,
19 | )
20 |
21 |
def update_image(image) -> str:
    """
    Return *image* unchanged when provided; otherwise select a preset Ray
    image matching the current Python runtime version.

    Delegates to get_ray_image_for_python_version(), which emits a warning
    when the running Python version has no preset image.
    """
    if image:
        return image
    # No image given: pick one based on the interpreter's Python version.
    return get_ray_image_for_python_version(warn_on_unsupported=True)
31 |
32 |
def get_ray_image_for_python_version(python_version=None, warn_on_unsupported=True):
    """
    Get the appropriate Ray image for a given Python version.

    If no version is provided, the current runtime Python version is used,
    which prevents hard-coding image versions for tests.

    Args:
        python_version: Python version string (e.g. "3.11"). If None, detects current version.
        warn_on_unsupported: If True, warns and returns None for unsupported versions.
            If False, silently falls back to the Python 3.12 CUDA image.
    """
    if python_version is None:
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

    try:
        # Fast path: a preset image exists for this Python version.
        return SUPPORTED_PYTHON_VERSIONS[python_version]
    except KeyError:
        pass

    if warn_on_unsupported:
        import warnings

        warnings.warn(
            f"No default Ray image defined for {python_version}. Please provide your own image or use one of the following python versions: {', '.join(SUPPORTED_PYTHON_VERSIONS.keys())}."
        )
        return None
    return CUDA_PY312_RUNTIME_IMAGE
58 |
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/mnist_fashion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import ray
4 | from torch.utils.data import DataLoader
5 | from torchvision import datasets
6 | from torchvision.transforms import ToTensor
7 | from ray.train.torch import TorchTrainer
8 | from ray.train import ScalingConfig
9 |
10 |
class NeuralNetwork(nn.Module):
    """Fully connected classifier for flattened 28x28 images (10 classes)."""

    def __init__(self):
        super().__init__()
        # Flatten (N, 28, 28) inputs into (N, 784) before the linear stack.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, inputs):
        flat = self.flatten(inputs)
        return self.linear_relu_stack(flat)
27 |
28 |
def get_dataset():
    """Return the FashionMNIST training split, downloading it if needed."""
    data_root = "/tmp/data"
    return datasets.FashionMNIST(
        root=data_root,
        train=True,
        download=True,
        transform=ToTensor(),
    )
36 |
37 |
def train_func_distributed():
    """Per-worker training loop executed by Ray Train's TorchTrainer."""
    epochs = 3
    batch = 64

    # Wrap the loader so Ray Train shards/moves batches appropriately.
    loader = DataLoader(get_dataset(), batch_size=batch, shuffle=True)
    loader = ray.train.torch.prepare_data_loader(loader)

    # prepare_model wraps the model for distributed execution.
    model = ray.train.torch.prepare_model(NeuralNetwork())

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(epochs):
        # With multiple workers, give the sampler the epoch so shuffling
        # stays in sync across workers.
        if ray.train.get_context().get_world_size() > 1:
            loader.sampler.set_epoch(epoch)

        for features, targets in loader:
            optimizer.zero_grad()
            predictions = model(features)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")
63 |
64 |
# For GPU Training, set `use_gpu` to True.
use_gpu = True

# To learn more about configuring S3 compatible storage check out our docs -> https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/s3-compatible-storage.md
trainer = TorchTrainer(
    train_func_distributed,
    scaling_config=ScalingConfig(
        # num_workers = number of worker nodes with the ray head node included
        num_workers=3,
        use_gpu=use_gpu,
        resources_per_worker={
            "CPU": 1,
        },
    ),
)

# Launch distributed training and block until all workers complete.
results = trainer.fit()
82 |
--------------------------------------------------------------------------------
/tests/e2e/mnist_sleep.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | import torch
17 | import torch.nn as nn
18 | from torch.utils.data import DataLoader
19 | from torchvision import datasets, transforms
20 |
21 |
22 | # Define a simple neural network
class NeuralNetwork(nn.Module):
    """Three-layer fully connected network for 28x28 inputs (10 classes)."""

    def __init__(self):
        super().__init__()
        # Flatten (N, 28, 28) inputs into (N, 784) for the linear layers.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        return self.linear_relu_stack(self.flatten(x))
39 |
40 |
41 | # Define the training function
def train():
    """Sleep 24 hours (upgrade-test scenario), then train FashionMNIST briefly."""
    # Sleeping for 24 hours for upgrade test scenario
    print("Sleeping for 24 hours before starting the training for upgrade testing...")
    time.sleep(24 * 60 * 60)

    # Data pipeline: download the training split and batch it.
    to_tensor = transforms.Compose([transforms.ToTensor()])
    dataset = datasets.FashionMNIST(
        root="./data", train=True, download=True, transform=to_tensor
    )
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    # Model, loss function and optimizer.
    model = NeuralNetwork()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # Short training run.
    num_epochs = 3
    for epoch in range(num_epochs):
        for inputs, labels in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(inputs), labels)
            loss.backward()
            optimizer.step()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
69 |
70 |
if __name__ == "__main__":
    # Script entry point: run the (24h-delayed) training job.
    train()
73 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/utils/test_demos.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Tests for demos module.
17 | """
18 |
19 | import pytest
20 | import tempfile
21 | from pathlib import Path
22 | from unittest.mock import patch, MagicMock
23 | from codeflare_sdk.common.utils.demos import copy_demo_nbs
24 |
25 |
class TestCopyDemoNbs:
    """Unit tests for the copy_demo_nbs helper."""

    def test_copy_demo_nbs_directory_exists_error(self):
        """A pre-existing target directory must raise FileExistsError when overwrite=False."""
        with tempfile.TemporaryDirectory() as workdir:
            # Pre-create the destination so the copy has to collide with it.
            existing = Path(workdir) / "demo-notebooks"
            existing.mkdir()

            with pytest.raises(FileExistsError, match="Directory.*already exists"):
                copy_demo_nbs(dir=str(existing), overwrite=False)

    def test_copy_demo_nbs_overwrite_true(self):
        """overwrite=True must permit copying into a directory that already exists."""
        with tempfile.TemporaryDirectory() as workdir:
            existing = Path(workdir) / "demo-notebooks"
            existing.mkdir()

            # Point the packaged demo_dir at a real directory so the copy succeeds.
            with patch("codeflare_sdk.common.utils.demos.demo_dir", workdir):
                copy_demo_nbs(dir=str(existing), overwrite=True)

    def test_copy_demo_nbs_default_parameters(self):
        """The helper should succeed when invoked with its defaults (plus overwrite)."""
        with tempfile.TemporaryDirectory() as workdir:
            with patch("codeflare_sdk.common.utils.demos.demo_dir", workdir):
                copy_demo_nbs(dir=workdir, overwrite=True)
58 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/ui-widgets.rst:
--------------------------------------------------------------------------------
1 | Jupyter UI Widgets
2 | ==================
3 |
4 | Below are some examples of the Jupyter UI Widgets that are included in
5 | the CodeFlare SDK.
6 |
7 | .. note::
8 | To use the widgets functionality you must be using the CodeFlare SDK in a Jupyter Notebook environment.
9 |
10 | Cluster Up/Down Buttons
11 | -----------------------
12 |
13 | The Cluster Up/Down buttons appear after successfully initialising your
14 | `ClusterConfiguration `__.
15 | There are two buttons and a checkbox ``Cluster Up``, ``Cluster Down``
16 | and ``Wait for Cluster?`` which mimic the
17 | `cluster.apply() `__,
18 | `cluster.down() `__ and
19 | `cluster.wait_ready() `__
20 | functionality.
21 |
22 | After initialising their ``ClusterConfiguration`` a user can select the
23 | ``Wait for Cluster?`` checkbox then click the ``Cluster Up`` button to
24 | create their Ray Cluster and wait until it is ready. The cluster can be
25 | deleted by clicking the ``Cluster Down`` button.
26 |
27 | .. image:: images/ui-buttons.png
28 | :alt: An image of the up/down ui buttons
29 |
30 | View Clusters UI Table
31 | ----------------------
32 |
33 | The View Clusters UI Table allows a user to see a list of Ray Clusters
34 | with information on their configuration including number of workers, CPU
35 | requests and limits along with the clusters status.
36 |
37 | .. image:: images/ui-view-clusters.png
38 | :alt: An image of the view clusters ui table
39 |
40 | Above is a list of two Ray Clusters ``raytest`` and ``raytest2`` each of
41 | those headings is clickable and will update the table to view the
selected Cluster's information. There are four buttons under the table:
``Cluster Down``, ``View Jobs``, ``Open Ray Dashboard``, and ``Refresh Data``.

* The ``Cluster Down`` button will delete the selected Cluster.
* The ``View Jobs`` button will try to open the Ray Dashboard's Jobs view
  in a Web Browser. The link will also be printed to the console.
* The ``Open Ray Dashboard`` button will try to open the Ray Dashboard
  view in a Web Browser. The link will also be printed to the console.
* The ``Refresh Data`` button will refresh the list of RayClusters, the
  spec, and the status of the Ray Cluster.
51 |
52 | The UI Table can be viewed by calling the following function.
53 |
54 | .. code:: python
55 |
56 | from codeflare_sdk import view_clusters
57 | view_clusters() # Accepts namespace parameter but will try to gather the namespace from the current context
58 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/authentication.rst:
--------------------------------------------------------------------------------
1 | Authentication via the CodeFlare SDK
2 | ====================================
3 |
4 | Currently there are four ways of authenticating to your cluster via the
5 | SDK. Authenticating with your cluster allows you to perform actions such
6 | as creating Ray Clusters and Job Submission.
7 |
8 | Method 1 Token Authentication
9 | -----------------------------
10 |
11 | This is how a typical user would authenticate to their cluster using
12 | ``TokenAuthentication``.
13 |
14 | ::
15 |
16 | from codeflare_sdk import TokenAuthentication
17 |
18 | auth = TokenAuthentication(
19 | token = "XXXXX",
20 | server = "XXXXX",
21 | skip_tls=False,
22 | # ca_cert_path="/path/to/cert"
23 | )
24 | auth.login()
25 | # log out with auth.logout()
26 |
27 | Setting ``skip_tls=True`` allows interaction with an HTTPS server
28 | bypassing the server certificate checks although this is not secure. You
29 | can pass a custom certificate to ``TokenAuthentication`` by using
30 | ``ca_cert_path="/path/to/cert"`` when authenticating provided
31 | ``skip_tls=False``. Alternatively you can set the environment variable
32 | ``CF_SDK_CA_CERT_PATH`` to the path of your custom certificate.
33 |
34 | Method 2 Kubernetes Config File Authentication (Default location)
35 | -----------------------------------------------------------------
36 |
If a user has authenticated to their cluster by alternate means, e.g. by
running a login command like ``oc login --token= --server=``,
their Kubernetes config file should have been updated. If the user has not
40 | specifically authenticated through the SDK by other means such as
41 | ``TokenAuthentication`` then the SDK will try to use their default
42 | Kubernetes config file located at ``"$HOME/.kube/config"``.
43 |
44 | Method 3 Specifying a Kubernetes Config File
45 | --------------------------------------------
46 |
47 | A user can specify a config file via a different authentication class
48 | ``KubeConfigFileAuthentication`` for authenticating with the SDK. This
49 | is what loading a custom config file would typically look like.
50 |
51 | ::
52 |
53 | from codeflare_sdk import KubeConfigFileAuthentication
54 |
55 | auth = KubeConfigFileAuthentication(
56 | kube_config_path="/path/to/config",
57 | )
58 | auth.load_kube_config()
59 | # log out with auth.logout()
60 |
61 | Method 4 In-Cluster Authentication
62 | ----------------------------------
63 |
64 | If a user does not authenticate by any of the means detailed above and
65 | does not have a config file at ``"$HOME/.kube/config"`` the SDK will try
66 | to authenticate with the in-cluster configuration file.
67 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/common/kubernetes_cluster/kube_api_helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | This sub-module exists primarily to be used internally for any Kubernetes
17 | API error handling or wrapping.
18 | """
19 |
20 | import executing
21 | from kubernetes import client, config
22 |
# Maps Kubernetes ApiException reasons to friendlier, actionable messages;
# consumed by _kube_api_error_handling when an ApiException is caught.
ERROR_MESSAGES = {
    "Not Found": "The requested resource could not be located.\n"
    "Please verify the resource name and namespace.",
    "Unauthorized": "Access to the API is unauthorized.\n"
    "Check your credentials or permissions.",
    "Forbidden": "Access denied to the Kubernetes resource.\n"
    "Ensure your role has sufficient permissions for this operation.",
    "Conflict": "A conflict occurred with the RayCluster resource.\n"
    "Only one RayCluster with the same name is allowed. "
    "Please delete or rename the existing RayCluster before creating a new one with the desired name.",
}
34 |
35 |
# private methods
def _kube_api_error_handling(
    e: Exception, print_error: bool = True
):  # pragma: no cover
    """Translate common Kubernetes client errors into readable console messages.

    Recognised exception types are reported (when ``print_error`` is True)
    and swallowed; anything unrecognised is reported and re-raised.
    """

    def _emit(message: str):
        # Honour the caller's choice to suppress console output.
        if print_error:
            print(message)

    if isinstance(e, client.ApiException):
        # Look up a friendly message by HTTP reason, with a generic fallback.
        base = ERROR_MESSAGES.get(
            e.reason, f"Unexpected API error encountered (Reason: {e.reason})"
        )
        _emit(f"{base}\nResponse: {e.body}")
    elif isinstance(e, config.ConfigException):
        _emit(
            "Configuration error: Unable to load Kubernetes configuration. Verify the config file path and format."
        )
    elif isinstance(e, executing.executing.NotOneValueFound):
        _emit(
            "Execution error: Expected exactly one value in the operation but found none or multiple."
        )
    else:
        # Unknown failure mode: report it and let the caller handle it.
        _emit(f"Unexpected error:\n{str(e)}")
        raise e
64 |
--------------------------------------------------------------------------------
/target_users.md:
--------------------------------------------------------------------------------
1 | # CodeFlare Stack Target Users
2 |
3 | [Cluster Admin](#cluster-administrator)
4 |
5 | [Data Scientist I](#data-scientist-i)
6 |
7 | [Data Scientist II](#data-scientist-ii)
8 |
9 |
10 |
11 | ## Cluster Administrator
12 |
13 | * Quota Management
14 | * Gang-Scheduling for Distributed Compute
15 | * Job/Infrastructure Queuing
16 |
17 | I want to enable a team of data scientists to have self-serve, but limited, access to a shared pool of distributed compute resources such as GPUs for large scale machine learning model training jobs. If the existing pool of resources is insufficient, I want my cluster to scale up (to a defined quota) to meet my users’ needs and scale back down automatically when their jobs have completed. I want these features to be made available through simple installation of generic modules via a user-friendly interface. I also want the ability to monitor current queue of pending tasks, the utilization of active resources, and the progress of all current jobs visualized in a simple dashboard.
18 |
19 | ## Data Scientist I
20 |
21 | * Training Mid-Size Models (less than 1,000 nodes)
22 | * Fine-Tuning Existing Models
23 | * Distributed Compute Framework
24 |
25 | I need temporary access to a reasonably large set of GPU enabled nodes on my team’s shared cluster for short term experimentation, parallelizing my existing ML workflow, or fine-tuning existing large scale models. I’d prefer to work from a notebook environment with access to a python sdk that I can use to request the creation of Framework Clusters that I can distribute my workloads across. In addition to interactive experimentation work, I also want the ability to “fire-and-forget” longer running ML jobs onto temporarily deployed Framework Clusters with the ability to monitor these jobs while they are running and access to all of their artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard.
26 |
27 | ## Data Scientist II
28 | * Training Foundation Models (1,000+ nodes)
29 | * Distributed Compute Framework
30 |
31 | I need temporary (but long term) access to a massive amount of GPU enabled infrastructure to train a foundation model. I want to be able to “fire-and-forget” my ML Job into this environment. Due to the size and cost associated with this job, it has already been well tested and validated, so access to jupyter notebooks is unnecessary. I would prefer to write my job as a bash script leveraging a CLI, or as a python script leveraging an SDK. I need the ability to monitor the job while it is running, as well as access to all of its artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard.
32 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/examples/use-builder.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from os import path
4 | import json
5 |
6 |
7 | """
8 | in case you are working directly with the source, and don't wish to
9 | install the module with pip install, you can directly import the packages by uncommenting the following code.
10 | """
11 |
12 | """
13 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
14 |
15 | current_dir = os.path.dirname(os.path.abspath(__file__))
16 | parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
17 | sibling_dirs = [
18 | d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))
19 | ]
20 | for sibling_dir in sibling_dirs:
21 | sys.path.append(os.path.join(parent_dir, sibling_dir))
22 | """
23 |
24 | from codeflare_sdk.vendored.python_client import kuberay_cluster_api
25 |
26 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder
27 |
28 |
def main():
    """Build, create, list, and finally delete a demo RayCluster."""
    print("starting cluster handler...")
    api = kuberay_cluster_api.RayClusterApi()
    builder = kuberay_cluster_builder.ClusterBuilder()

    # Assemble a one-head, one-worker-group cluster definition.
    cluster1 = (
        builder.build_meta(name="new-cluster1", labels={"demo-cluster": "yes"})
        .build_head()
        .build_worker(group_name="workers")
        .get_cluster()
    )
    if not builder.succeeded:
        print("error building the cluster, aborting...")
        return

    print("creating raycluster = {}".format(cluster1["metadata"]["name"]))
    api.create_ray_cluster(body=cluster1)

    # the rest of the code is simply to list and cleanup the created cluster
    listing = api.list_ray_clusters(
        k8s_namespace="default", label_selector="demo-cluster=yes"
    )

    # Print a small table of the clusters carrying the demo label.
    if "items" in listing:
        rule = "-" * 72
        print(rule)
        print("{:<63s}{:>2s}".format("Name", "Namespace"))
        print(rule)
        for found in listing["items"]:
            print(
                "{:<63s}{:>2s}".format(
                    found["metadata"]["name"],
                    found["metadata"]["namespace"],
                )
            )
        print(rule)

    # Delete everything we just listed.
    if "items" in listing:
        for found in listing["items"]:
            print("deleting raycluster = {}".format(found["metadata"]["name"]))
            api.delete_ray_cluster(
                name=found["metadata"]["name"],
                k8s_namespace=found["metadata"]["namespace"],
            )
77 |
# Script entry point: run the demo cluster lifecycle.
if __name__ == "__main__":
    main()
80 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/in_cluster/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | High-level setup and cleanup functions for in-cluster test execution.
3 |
4 | This module provides convenient functions that combine service account creation
5 | and RBAC setup for easy use in test setup/teardown methods.
6 | """
7 |
8 | from kubernetes import client
9 | from .rbac import (
10 | create_test_service_account,
11 | create_rayjob_rbac,
12 | delete_test_service_account,
13 | )
14 |
15 |
def setup_in_cluster_test_environment(
    api_instance: client.CoreV1Api,
    custom_api: client.CustomObjectsApi,
    namespace: str,
    name_prefix: str = "test-pod",
) -> str:
    """Provision the ServiceAccount plus RayJob RBAC needed for in-cluster tests.

    Creates a ServiceAccount, then a Role with RayJob permissions and a
    RoleBinding tying the two together. If the RBAC step fails, the freshly
    created ServiceAccount is removed (best effort) before re-raising.

    Args:
        api_instance: Kubernetes CoreV1Api instance.
        custom_api: CustomObjectsApi instance.
        namespace: Namespace to create resources in.
        name_prefix: Prefix for resource names.

    Returns:
        The service account name to use in pod creation.
    """
    sa_name = create_test_service_account(
        api_instance=api_instance,
        namespace=namespace,
        name_prefix=name_prefix,
    )

    try:
        create_rayjob_rbac(
            api_instance=api_instance,
            custom_api=custom_api,
            namespace=namespace,
            service_account_name=sa_name,
        )
    except Exception:
        # RBAC setup failed: roll back the ServiceAccount (ignoring any
        # cleanup error) and surface the original failure to the caller.
        try:
            api_instance.delete_namespaced_service_account(sa_name, namespace)
        except Exception:
            pass
        raise

    return sa_name
62 |
63 |
def cleanup_in_cluster_test_environment(
    api_instance: client.CoreV1Api,
    custom_api: client.CustomObjectsApi,
    namespace: str,
    service_account_name: str,
) -> None:
    """
    Clean up in-cluster test environment (ServiceAccount, Role, RoleBinding).

    Thin wrapper that delegates the whole teardown to
    delete_test_service_account; counterpart to
    setup_in_cluster_test_environment.

    Args:
        api_instance: Kubernetes CoreV1Api instance.
        custom_api: CustomObjectsApi instance.
        namespace: Namespace where resources exist.
        service_account_name: Name of the service account to clean up.
    """
    delete_test_service_account(
        api_instance=api_instance,
        custom_api=custom_api,
        namespace=namespace,
        service_account_name=service_account_name,
    )
85 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "codeflare-sdk"
3 | version = "0.33.0"
4 |
5 | [tool.poetry]
6 | name = "codeflare-sdk"
7 | version = "0.33.0"
8 | description = "Python SDK for codeflare client"
9 |
10 | license = "Apache-2.0"
11 |
12 | # Exclude vendored tests, examples, and build files from the package
13 | exclude = [
14 | "src/codeflare_sdk/vendored/python_client_test",
15 | "src/codeflare_sdk/vendored/examples",
16 | "src/codeflare_sdk/vendored/pyproject.toml",
17 | "src/codeflare_sdk/vendored/poetry.lock",
18 | "src/codeflare_sdk/vendored/README.md"
19 | ]
20 |
21 | authors = [
22 | "Michael Clifford ",
23 | "Mustafa Eyceoz ",
24 | "Abhishek Malvankar ",
25 | "Atin Sood ",
26 | ]
27 |
28 | readme = 'README.md'
29 |
30 | repository = "https://github.com/project-codeflare/codeflare-sdk"
31 | homepage = "https://github.com/project-codeflare/codeflare-sdk"
32 |
33 | keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale']
34 |
35 | [tool.poetry.dependencies]
36 | python = "^3.11"
37 | openshift-client = "1.0.18"
38 | rich = ">=12.5,<14.0"
39 | ray = {version = "2.52.1", extras = ["data", "default"]}
40 | kubernetes = ">= 27.2.0"
41 | cryptography = "43.0.3"
42 | executing = "1.2.0"
43 | pydantic = ">= 2.10.6"
44 | ipywidgets = "8.1.2"
45 |
46 | [[tool.poetry.source]]
47 | name = "pypi"
48 |
49 | [[tool.poetry.source]]
50 | name = "testpypi"
51 | url = "https://test.pypi.org/simple/"
52 |
53 | [tool.poetry.group.docs]
54 | optional = true
55 |
56 | [tool.poetry.group.docs.dependencies]
57 | sphinx = "7.4.7"
58 | sphinx-rtd-theme = "3.0.1"
59 |
60 | [tool.poetry.group.test]
61 | optional = true
62 |
63 | [tool.poetry.group.test.dependencies]
64 | pytest = "7.4.0"
65 | coverage = "7.6.4"
66 | pytest-mock = "3.11.1"
67 | pytest-timeout = "2.3.1"
68 | jupyterlab = "4.5.0"
69 | selenium = "4.27.1"
70 | webdriver-manager = "4.0.2"
71 |
72 |
73 | [tool.poetry.group.dev.dependencies]
74 | diff-cover = "^9.6.0"
75 |
76 | [tool.pytest.ini_options]
77 | filterwarnings = [
78 | "ignore::DeprecationWarning:pkg_resources",
79 | "ignore:pkg_resources is deprecated as an API:DeprecationWarning",
80 | ]
81 | markers = [
82 | "kind",
83 | "openshift",
84 | "nvidia_gpu",
85 | "smoke: Smoke tests - quick validation tests",
86 | "tier1: Tier1 tests - standard test suite",
87 | "pre_upgrade: Tests to run before upgrade",
88 | "post_upgrade: Tests to run after upgrade",
89 | "ui: UI tests requiring browser automation"
90 | ]
91 | addopts = "--timeout=900 --ignore=src/codeflare_sdk/vendored"
92 | testpaths = ["src/codeflare_sdk"]
93 | collect_ignore = ["src/codeflare_sdk/common/utils/unit_test_support.py"]
94 |
95 | [build-system]
96 | requires = ["poetry-core>=1.6.0"]
97 | build-backend = "poetry.core.masonry.api"
98 |
--------------------------------------------------------------------------------
/tests/e2e/heterogeneous_clusters_kind_test.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | import time
3 | from codeflare_sdk import (
4 | Cluster,
5 | ClusterConfiguration,
6 | )
7 |
8 | from codeflare_sdk.common.kueue.kueue import list_local_queues
9 |
10 | import pytest
11 |
12 | from support import *
13 |
14 |
@pytest.mark.skip(reason="Skipping heterogenous cluster kind test")
@pytest.mark.kind
class TestHeterogeneousClustersKind:
    """E2E (kind): clusters submitted through flavor-specific Kueue local
    queues must be scheduled onto the nodes matching that flavor's
    nodeLabels."""

    def setup_method(self):
        # Fresh Kubernetes API clients for every test.
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Drop the test namespace and all Kueue resources created by the test.
        delete_namespace(self)
        delete_kueue_resources(self)

    @pytest.mark.nvidia_gpu
    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Two resource flavors, each carrying node labels and tolerations.
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each resource flavor, create a cluster on its local queue and
        assert the cluster pod landed on a node carrying the flavor's labels."""
        for flavor in self.resource_flavors:
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            # Nodes carrying this flavor's labels are the only valid placements.
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            # Pick the local queue backed by this flavor (None if none found).
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    name=cluster_name,
                    namespace=self.namespace,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=2,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=1,
                    worker_memory_limits=4,
                    worker_extended_resource_requests={
                        gpu_resource_name: number_of_gpus
                    },
                    write_to_file=True,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # Give the scheduler time to place the pod before inspecting it.
            sleep(5)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            sleep(5)
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
75 |
--------------------------------------------------------------------------------
/tests/e2e/local_interactive_sdk_oauth_test.py:
--------------------------------------------------------------------------------
1 | from codeflare_sdk import (
2 | Cluster,
3 | ClusterConfiguration,
4 | TokenAuthentication,
5 | generate_cert,
6 | )
7 |
8 | import math
9 | import pytest
10 | import ray
11 |
12 | from support import *
13 |
14 |
@pytest.mark.skip(reason="Remote ray.init() is temporarily unsupported")
@pytest.mark.openshift
@pytest.mark.tier1
class TestRayLocalInteractiveOauth:
    """E2E (OpenShift): connect a local Ray client to an OAuth-protected
    cluster over TLS and verify remote tasks actually execute there."""

    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        # NOTE(review): pytest already invokes setup_method before each test;
        # this explicit call runs it a second time — presumably harmless,
        # but worth confirming.
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    def run_local_interactives(self):
        ray_image = get_ray_image()

        # Authenticate using the current oc session's token and server.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        cluster_name = "test-ray-cluster-li"

        cluster = Cluster(
            ClusterConfiguration(
                namespace=self.namespace,
                name=cluster_name,
                num_workers=1,
                head_memory_requests=6,
                head_memory_limits=8,
                head_cpu_requests=1,
                head_cpu_limits=1,
                worker_cpu_requests=1,
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                image=ray_image,
                verify_tls=False,
            )
        )
        cluster.apply()
        cluster.wait_ready()

        # Generate client TLS certs and export the env vars the local Ray
        # client needs to reach the interactive endpoint.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        ray.shutdown()
        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

        @ray.remote
        def heavy_calculation_part(num_iterations):
            # Deterministic CPU-bound work: triple-nested trig accumulation.
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote
        def heavy_calculation(num_iterations):
            # Fan the work out across 30 parallel tasks and sum the pieces.
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        ref = heavy_calculation.remote(3000)
        result = ray.get(ref)
        # Known value for this deterministic workload — proves the computation
        # really ran to completion on the remote cluster.
        assert result == 1789.4644387076714
        ray.cancel(ref)
        ray.shutdown()

        cluster.down()
93 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/s3-compatible-storage.rst:
--------------------------------------------------------------------------------
1 | S3 compatible storage with Ray Train examples
2 | =============================================
3 |
4 | Some of our distributed training examples require an external storage
5 | solution so that all nodes can access the same data. The following are
6 | examples for configuring S3 or Minio storage for your Ray Train script
7 | or interactive session.
8 |
9 | S3 Bucket
10 | ---------
11 |
12 | In your Python Script add the following environment variables:
13 |
14 | .. code:: python
15 |
16 | os.environ["AWS_ACCESS_KEY_ID"] = "XXXXXXXX"
17 | os.environ["AWS_SECRET_ACCESS_KEY"] = "XXXXXXXX"
18 | os.environ["AWS_DEFAULT_REGION"] = "XXXXXXXX"
19 |
20 | Alternatively you can specify these variables in your runtime
21 | environment on Job Submission.
22 |
23 | .. code:: python
24 |
25 | submission_id = client.submit_job(
26 | entrypoint=...,
27 | runtime_env={
28 | "env_vars": {
29 | "AWS_ACCESS_KEY_ID": os.environ.get('AWS_ACCESS_KEY_ID'),
30 | "AWS_SECRET_ACCESS_KEY": os.environ.get('AWS_SECRET_ACCESS_KEY'),
31 | "AWS_DEFAULT_REGION": os.environ.get('AWS_DEFAULT_REGION')
32 | },
33 | }
34 | )
35 |
36 | In your Trainer configuration you can specify a ``run_config`` which
37 | will utilise your external storage.
38 |
39 | .. code:: python
40 |
41 | trainer = TorchTrainer(
42 | train_func_distributed,
43 | scaling_config=scaling_config,
44 | run_config = ray.train.RunConfig(storage_path="s3://BUCKET_NAME/SUB_PATH/", name="unique_run_name")
45 | )
46 |
47 | To learn more about Amazon S3 Storage you can find information
48 | `here `__.
49 |
50 | Minio Bucket
51 | ------------
52 |
53 | In your Python Script add the following function for configuring your
54 | run_config:
55 |
56 | .. code:: python
57 |
58 | import s3fs
59 | import pyarrow
60 |
61 | def get_minio_run_config():
62 | s3_fs = s3fs.S3FileSystem(
63 | key = os.getenv('MINIO_ACCESS_KEY', "XXXXX"),
64 | secret = os.getenv('MINIO_SECRET_ACCESS_KEY', "XXXXX"),
65 | endpoint_url = os.getenv('MINIO_URL', "XXXXX")
66 | )
67 | custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))
68 | run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)
69 | return run_config
70 |
71 | You can update the ``run_config`` to further suit your needs above.
72 | Lastly the new ``run_config`` must be added to the Trainer:
73 |
74 | .. code:: python
75 |
76 | trainer = TorchTrainer(
77 | train_func_distributed,
78 | scaling_config=scaling_config,
79 | run_config = get_minio_run_config()
80 | )
81 |
82 | To find more information on creating a Minio Bucket compatible with
83 | RHOAI you can refer to this
84 | `documentation `__.
85 | Note: You must have ``s3fs`` and ``pyarrow`` installed in your
86 | environment for this method.
87 |
--------------------------------------------------------------------------------
/demo-notebooks/additional-demos/remote_ray_job_client.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Submit a training job remotely to Ray Dashboard protected by oAuth.\n",
8 | "This notebook will demonstrate how to submit Ray jobs to an existing Raycluster, using the CodeFlare SDK.\n",
9 | "\n",
10 | "### Requirements\n",
11 | "* Ray Cluster running in OpenShift protected by oAuth.\n",
12 | "* The Ray Dashboard URL for the Ray Cluster.\n",
13 | "* An OpenShift authorization token with permissions to access the Route.\n",
14 | "* A training job, defined in python, within the working directory.\n",
15 | "* A requirements.txt or equivalent file containing any additional packages to install onto the Ray images."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Import dependencies from codeflare-sdk\n",
25 | "from codeflare_sdk import RayJobClient"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Setup Authentication Configuration \n",
35 | "auth_token = \"XXXX\" # Replace with the actual token\n",
36 | "header = {\n",
37 | " 'Authorization': f'Bearer {auth_token}'\n",
38 | "}"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# Gather the dashboard URL (provided by the creator of the RayCluster)\n",
48 | "ray_dashboard = \"XXXX\" # Replace with the Ray dashboard URL"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "#Initialize the RayJobClient\n",
58 | "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# Submit a job using the RayJobClient\n",
68 | "entrypoint_command = \"python XXXX\" # Replace with the training script name\n",
69 | "submission_id = client.submit_job(\n",
70 | " entrypoint=entrypoint_command,\n",
71 | " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
72 | ")"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# Get the job's status\n",
82 | "client.get_job_status(submission_id)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Get the job's logs\n",
92 | "client.get_job_logs(submission_id)"
93 | ]
94 | }
95 | ],
96 | "metadata": {
97 | "language_info": {
98 | "name": "python"
99 | }
100 | },
101 | "nbformat": 4,
102 | "nbformat_minor": 2
103 | }
104 |
--------------------------------------------------------------------------------
/tests/e2e/heterogeneous_clusters_oauth_test.py:
--------------------------------------------------------------------------------
1 | from codeflare_sdk import (
2 | Cluster,
3 | ClusterConfiguration,
4 | TokenAuthentication,
5 | )
6 |
7 | from codeflare_sdk.common.kueue.kueue import list_local_queues
8 |
9 | import pytest
10 |
11 | from support import *
12 |
13 |
@pytest.mark.skip(reason="Temporarily skipped due to needed investigation")
@pytest.mark.openshift
@pytest.mark.tier1
class TestHeterogeneousClustersOauth:
    """E2E test: Ray clusters are scheduled onto the nodes selected by their
    Kueue resource flavor.

    Requires an OpenShift cluster (token auth via `oc whoami`); helper
    functions come star-imported from tests/e2e support module.
    """

    def setup_method(self):
        # Pytest hook: build the Kubernetes API clients used by the helpers below.
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Pytest hook: remove everything created by test_heterogeneous_clusters.
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Create 2 resource flavors with node labels and tolerations so clusters
        # are steered to distinct node sets (helper stores them on
        # self.resource_flavors for the loop below).
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each resource flavor: create a cluster queued to that flavor's
        LocalQueue and assert its pod lands on a node carrying the flavor's
        nodeLabels.

        NOTE(review): gpu_resource_name and number_of_gpus are accepted but not
        used in the ClusterConfiguration below — presumably intended for a GPU
        variant; confirm before relying on them.
        """
        ray_image = get_ray_image()

        # Authenticate as the currently logged-in `oc` user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        for flavor in self.resource_flavors:
            # Nodes carrying this flavor's nodeLabels are the only valid placements.
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            # Last 5 chars of the flavor name keep cluster names unique per flavor.
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    namespace=self.namespace,
                    name=cluster_name,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=4,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=2,
                    worker_memory_limits=4,
                    image=ray_image,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # Wait for the cluster to be scheduled and ready, we don't need the dashboard for this check
            cluster.wait_ready(dashboard_check=False)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
80 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/appwrapper/test_awload.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from codeflare_sdk.common.utils.unit_test_support import (
15 | apply_template,
16 | arg_check_aw_apply_effect,
17 | arg_check_aw_del_effect,
18 | get_template_variables,
19 | )
20 | from codeflare_sdk.ray.appwrapper import AWManager
21 | from codeflare_sdk.ray.cluster import Cluster, ClusterConfiguration
22 | import os
23 | from pathlib import Path
24 |
# Project root (four directory levels above this test module); used to reach
# the tests/test_cluster_yamls fixtures.
parent = Path(__file__).resolve().parents[4]  # project directory
# Directory where the SDK writes generated AppWrapper yamls (write_to_file=True).
aw_dir = os.path.expanduser("~/.codeflare/resources/")
27 |
28 |
def test_AWManager_creation(mocker):
    """AWManager loads a valid AppWrapper yaml and rejects missing/malformed files.

    Fix: the two try/except blocks previously passed silently if no exception
    was raised at all; an `else: assert False` now makes the expected-failure
    paths mandatory.
    """
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")
    # Create test.yaml (side effect of building a Cluster with write_to_file=True).
    Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            write_to_file=True,
            appwrapper=True,
        )
    )

    testaw = AWManager(f"{aw_dir}test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted == False
    # A nonexistent file must raise FileNotFoundError.
    try:
        testaw = AWManager("fake")
    except Exception as e:
        assert type(e) == FileNotFoundError
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    else:
        assert False, "AWManager('fake') should have raised FileNotFoundError"
    # A malformed AppWrapper yaml must raise ValueError.
    try:
        testaw = apply_template(
            AWManager(
                f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml"
            ),
            get_template_variables(),
        )
    except Exception as e:
        assert type(e) == ValueError
        assert (
            str(e)
            == f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )
    else:
        assert False, "expected ValueError for a malformed AppWrapper yaml"
64 |
65 |
def test_AWManager_submit_remove(mocker, capsys):
    """Submit/remove round-trip: remove() is a no-op until submit() succeeds."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    manager = AWManager(f"{aw_dir}test.yaml")

    # Removing before submitting only prints a notice and leaves state untouched.
    manager.remove()
    out = capsys.readouterr().out
    assert out == "AppWrapper not submitted by this manager yet, nothing to remove\n"
    assert not manager.submitted

    # Stub the k8s custom-object API with argument-checking fakes.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object",
        side_effect=arg_check_aw_apply_effect,
    )
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
        side_effect=arg_check_aw_del_effect,
    )

    manager.submit()
    assert manager.submitted
    manager.remove()
    assert not manager.submitted
89 |
90 |
# Make sure to always keep this function last
def test_cleanup():
    """Delete the yaml generated by the tests above (must run last in the file)."""
    os.unlink(os.path.join(aw_dir, "test.yaml"))
94 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/examples/use-director.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from os import path
4 | import json
5 | import time
6 |
7 | """
8 | If you are working directly with the source and don't wish to install the
9 | module with `pip install`, you can import the packages directly by uncommenting the following code.
10 | """
11 |
12 | """
13 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
14 |
15 | current_dir = os.path.dirname(os.path.abspath(__file__))
16 | parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
17 | sibling_dirs = [
18 | d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))
19 | ]
20 | for sibling_dir in sibling_dirs:
21 | sys.path.append(os.path.join(parent_dir, sibling_dir))
22 |
23 | """
24 | from codeflare_sdk.vendored.python_client import kuberay_cluster_api
25 |
26 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder
27 |
28 |
def wait(duration: int = 5, step_name: str = "next"):
    """Sleep for `duration` seconds, printing a visible countdown to stdout."""
    print("waiting for {} seconds before {} step".format(duration, step_name))
    remaining = duration
    while remaining > 0:
        # Flush after each number so the countdown is visible in real time.
        sys.stdout.write(str(remaining) + " ")
        sys.stdout.flush()
        time.sleep(1)
        remaining -= 1
    print()
36 |
37 |
def main():
    """Demo: build a small RayCluster with the Director, create it in k8s,
    print it, wait for it to run, then delete it.

    Talks to the live cluster the current kubeconfig points at.
    """
    print("starting cluster handler...")

    my_kube_ray_api = kuberay_cluster_api.RayClusterApi()

    my_cluster_director = kuberay_cluster_builder.Director()

    # building the raycluster representation
    cluster_body = my_cluster_director.build_small_cluster(
        name="new-small-cluster", k8s_namespace="default"
    )

    # creating the raycluster in k8s
    if cluster_body:
        print("creating the cluster...")
        my_kube_ray_api.create_ray_cluster(body=cluster_body)

    # now the cluster should be created.
    # the rest of the code is simply to fetch, print and cleanup the created cluster

    print("fetching the cluster...")
    # fetching the raycluster from k8s api-server
    # NOTE(review): if build_small_cluster returned a falsy body above, this
    # subscript would raise — the guard does not cover this line; confirm intent.
    kube_ray_cluster = my_kube_ray_api.get_ray_cluster(
        name=cluster_body["metadata"]["name"], k8s_namespace="default"
    )

    if kube_ray_cluster:
        print(
            "try: kubectl -n {} get raycluster {} -o yaml".format(
                kube_ray_cluster["metadata"]["namespace"],
                kube_ray_cluster["metadata"]["name"],
            )
        )
        wait(step_name="print created cluster in JSON")
        print("printing the raycluster JSON representation...")
        json_formatted_str = json.dumps(kube_ray_cluster, indent=2)
        print(json_formatted_str)

    # waiting until the cluster is running, and has its status updated
    is_running = my_kube_ray_api.wait_until_ray_cluster_running(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )

    print(
        "raycluster {} status is {}".format(
            kube_ray_cluster["metadata"]["name"], "Running" if is_running else "unknown"
        )
    )

    # Pause so the user can inspect the cluster before it is deleted.
    wait(step_name="cleaning up")
    print("deleting raycluster {}.".format(kube_ray_cluster["metadata"]["name"]))

    my_kube_ray_api.delete_ray_cluster(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )
95 |
96 |
# Script entry point: run the demo when executed directly.
if __name__ == "__main__":
    main()
99 |
--------------------------------------------------------------------------------
/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: workload.codeflare.dev/v1beta2
2 | kind: AppsWrapper
3 | metadata:
4 | labels:
5 | orderedinstance: cpu.small_gpu.large
6 | nam: unit-test-cluster
7 | namspace: ns
8 | spec:
9 | components:
10 | - template:
11 | apiVersion: ray.io/v1
12 | kind: RayCluster
13 | metadata:
14 | labels:
15 | controller-tools.k8s.io: '1.0'
16 | name: unit-test-cluster
17 | namespace: ns
18 | spec:
19 | autoscalerOptions:
20 | idleTimeoutSeconds: 60
21 | resources:
22 | limits:
23 | cpu: 500m
24 | memory: 512Mi
25 | requests:
26 | cpu: 500m
27 | memory: 512Mi
28 | upscalingMode: Default
29 | enableInTreeAutoscaling: false
30 | headGroupSpec:
31 | rayStartParams:
32 | block: 'true'
33 | dashboard-host: 0.0.0.0
34 | num-gpus: '0'
35 | resources: '"{}"'
36 | serviceType: ClusterIP
37 | template:
38 | spec:
39 | containers:
40 | - env:
41 | - name: MY_POD_IP
42 | valueFrom:
43 | fieldRef:
44 | fieldPath: status.podIP
45 | image: "${image}"
46 | imagePullPolicy: IfNotPresent
47 | lifecycle:
48 | preStop:
49 | exec:
50 | command:
51 | - /bin/sh
52 | - -c
53 | - ray stop
54 | name: ray-head
55 | ports:
56 | - containerPort: 6379
57 | name: gcs
58 | - containerPort: 8265
59 | name: dashboard
60 | - containerPort: 10001
61 | name: client
62 | resources:
63 | limits:
64 | cpu: 2
65 | memory: 8G
66 | requests:
67 | cpu: 2
68 | memory: 8G
69 | rayVersion: 2.52.1
70 | workerGroupSpecs:
71 | - groupName: small-group-unit-test-cluster
72 | maxReplicas: 2
73 | minReplicas: 2
74 | rayStartParams:
75 | block: 'true'
76 | num-gpus: '7'
77 | resources: '"{}"'
78 | replicas: 2
79 | template:
80 | metadata:
81 | annotations:
82 | key: value
83 | labels:
84 | key: value
85 | spec:
86 | containers:
87 | - env:
88 | - name: MY_POD_IP
89 | valueFrom:
90 | fieldRef:
91 | fieldPath: status.podIP
92 | image: "${image}"
93 | lifecycle:
94 | preStop:
95 | exec:
96 | command:
97 | - /bin/sh
98 | - -c
99 | - ray stop
100 | name: machine-learning
101 | resources:
102 | limits:
103 | cpu: 4
104 | memory: 6G
105 | nvidia.com/gpu: 7
106 | requests:
107 | cpu: 3
108 | memory: 5G
109 | nvidia.com/gpu: 7
110 |
--------------------------------------------------------------------------------
/tests/e2e/minio_deployment.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: PersistentVolumeClaim
3 | apiVersion: v1
4 | metadata:
5 | name: minio-pvc
6 | spec:
7 | accessModes:
8 | - ReadWriteOnce
9 | resources:
10 | requests:
11 | storage: 20Gi
12 | volumeMode: Filesystem
13 | ---
14 | kind: Secret
15 | apiVersion: v1
16 | metadata:
17 | name: minio-secret
18 | stringData:
19 |   # Change the username and password to your own values.
20 |   # Ensure the username is at least 3 characters long and the password at least 8 characters.
21 | minio_root_user: minio
22 | minio_root_password: minio123
23 | ---
24 | kind: Deployment
25 | apiVersion: apps/v1
26 | metadata:
27 | name: minio
28 | spec:
29 | replicas: 1
30 | selector:
31 | matchLabels:
32 | app: minio
33 | template:
34 | metadata:
35 | creationTimestamp: null
36 | labels:
37 | app: minio
38 | spec:
39 | volumes:
40 | - name: data
41 | persistentVolumeClaim:
42 | claimName: minio-pvc
43 | containers:
44 | - resources:
45 | limits:
46 | cpu: 250m
47 | memory: 1Gi
48 | requests:
49 | cpu: 20m
50 | memory: 100Mi
51 | readinessProbe:
52 | tcpSocket:
53 | port: 9000
54 | initialDelaySeconds: 5
55 | timeoutSeconds: 1
56 | periodSeconds: 5
57 | successThreshold: 1
58 | failureThreshold: 3
59 | terminationMessagePath: /dev/termination-log
60 | name: minio
61 | livenessProbe:
62 | tcpSocket:
63 | port: 9000
64 | initialDelaySeconds: 30
65 | timeoutSeconds: 1
66 | periodSeconds: 5
67 | successThreshold: 1
68 | failureThreshold: 3
69 | env:
70 | - name: MINIO_ROOT_USER
71 | valueFrom:
72 | secretKeyRef:
73 | name: minio-secret
74 | key: minio_root_user
75 | - name: MINIO_ROOT_PASSWORD
76 | valueFrom:
77 | secretKeyRef:
78 | name: minio-secret
79 | key: minio_root_password
80 | ports:
81 | - containerPort: 9000
82 | protocol: TCP
83 | - containerPort: 9090
84 | protocol: TCP
85 | imagePullPolicy: IfNotPresent
86 | volumeMounts:
87 | - name: data
88 | mountPath: /data
89 | subPath: minio
90 | terminationMessagePolicy: File
91 | image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z
92 | args:
93 | - server
94 | - /data
95 | - --console-address
96 | - :9090
97 | restartPolicy: Always
98 | terminationGracePeriodSeconds: 30
99 | dnsPolicy: ClusterFirst
100 | securityContext: {}
101 | schedulerName: default-scheduler
102 | strategy:
103 | type: Recreate
104 | revisionHistoryLimit: 10
105 | progressDeadlineSeconds: 600
106 | ---
107 | kind: Service
108 | apiVersion: v1
109 | metadata:
110 | name: minio-service
111 | spec:
112 | ipFamilies:
113 | - IPv4
114 | ports:
115 | - name: api
116 | protocol: TCP
117 | port: 9000
118 | targetPort: 9000
119 | - name: ui
120 | protocol: TCP
121 | port: 9090
122 | targetPort: 9090
123 | internalTrafficPolicy: Cluster
124 | type: ClusterIP
125 | ipFamilyPolicy: SingleStack
126 | sessionAffinity: None
127 | selector:
128 | app: minio
129 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/appwrapper/awload.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | The awload sub-module contains the definition of the AWManager object, which handles
17 | submission and deletion of existing AppWrappers from a user's file system.
18 | """
19 |
20 | from os.path import isfile
21 | import errno
22 | import os
23 | import yaml
24 |
25 | from kubernetes import client
26 | from ...common import _kube_api_error_handling
27 | from ...common.kubernetes_cluster.auth import (
28 | config_check,
29 | get_api_client,
30 | )
31 |
32 |
class AWManager:
    """
    An object for submitting and removing existing AppWrapper yamls
    to be added to the Kueue localqueue.
    """

    def __init__(self, filename: str) -> None:
        """
        Create the AppWrapper Manager object by passing in an
        AppWrapper yaml file.

        Args:
            filename: Path to an AppWrapper yaml on the local file system.

        Raises:
            FileNotFoundError: If `filename` does not exist.
            ValueError: If the file is not a well-formed AppWrapper yaml.
        """
        if not isfile(filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
        self.filename = filename
        try:
            with open(self.filename) as f:
                self.awyaml = yaml.load(f, Loader=yaml.FullLoader)
                assert self.awyaml["kind"] == "AppWrapper"
                self.name = self.awyaml["metadata"]["name"]
                self.namespace = self.awyaml["metadata"]["namespace"]
        # Fix: previously a bare `except:` which also swallowed unrelated
        # errors such as KeyboardInterrupt; narrowed to the exceptions the
        # parse/validation above can raise, with the cause chained.
        except (AssertionError, KeyError, TypeError, yaml.YAMLError) as e:
            raise ValueError(
                f"{filename} is not a correctly formatted AppWrapper yaml"
            ) from e
        self.submitted = False

    def submit(self) -> None:
        """
        Attempts to create the AppWrapper custom resource using the yaml file.

        On success sets `self.submitted` to True; Kubernetes API errors are
        routed through `_kube_api_error_handling` and its result returned.
        """
        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.create_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                body=self.awyaml,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = True
        print(f"AppWrapper {self.filename} submitted!")

    def remove(self) -> None:
        """
        Attempts to delete the AppWrapper custom resource matching the name in
        the yaml, if submitted by this manager.

        No-op (with a notice) when nothing was submitted by this instance.
        """
        if not self.submitted:
            print("AppWrapper not submitted by this manager yet, nothing to remove")
            return

        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.delete_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                name=self.name,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = False
        print(f"AppWrapper {self.name} removed!")
103 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/appwrapper/test_status.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from codeflare_sdk.ray.cluster.cluster import (
16 | _app_wrapper_status,
17 | Cluster,
18 | ClusterConfiguration,
19 | )
20 | from codeflare_sdk.ray.appwrapper import AppWrapper, AppWrapperStatus
21 | from codeflare_sdk.ray.cluster.status import CodeFlareClusterStatus
22 | from codeflare_sdk.common.utils.unit_test_support import get_local_queue
23 | import os
24 |
# Directory where the SDK writes generated resource yamls (removed in test_cleanup).
aw_dir = os.path.expanduser("~/.codeflare/resources/")
26 |
27 |
def test_cluster_status(mocker):
    """Cluster.status() must map AppWrapper phases onto CodeFlare statuses."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
    )
    fake_aw = AppWrapper("test", AppWrapperStatus.FAILED)

    cf = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            write_to_file=True,
            appwrapper=True,
            local_queue="local-queue-default",
        )
    )

    # With neither an AppWrapper nor a RayCluster visible, status is unknown.
    mocker.patch(
        "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=None
    )
    mocker.patch(
        "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None
    )
    assert cf.status() == (CodeFlareClusterStatus.UNKNOWN, False)

    # Each AppWrapper phase maps to a CodeFlare status; none of them is "ready".
    mocker.patch(
        "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=fake_aw
    )
    phase_to_status = [
        (AppWrapperStatus.FAILED, CodeFlareClusterStatus.FAILED),
        (AppWrapperStatus.SUSPENDED, CodeFlareClusterStatus.QUEUED),
        (AppWrapperStatus.RESUMING, CodeFlareClusterStatus.STARTING),
        (AppWrapperStatus.RESETTING, CodeFlareClusterStatus.STARTING),
        (AppWrapperStatus.RUNNING, CodeFlareClusterStatus.UNKNOWN),
    ]
    for aw_phase, expected in phase_to_status:
        fake_aw.status = aw_phase
        status, ready = cf.status()
        assert status == expected
        assert ready == False
82 |
83 |
def aw_status_fields(group, version, namespace, plural, *args):
    """Fake for list_namespaced_custom_object: check the AppWrapper GVR args
    and return an empty item list."""
    expected = ("workload.codeflare.dev", "v1beta2", "test-ns", "appwrappers")
    assert (group, version, namespace, plural) == expected
    assert not args
    return {"items": []}
91 |
92 |
def test_aw_status(mocker):
    """_app_wrapper_status returns None when the API lists no AppWrappers."""
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        side_effect=aw_status_fields,
    )
    assert _app_wrapper_status("test-aw", "test-ns") is None
101 |
102 |
# Make sure to always keep this function last
def test_cleanup():
    """Delete the generated yaml left behind by the tests above (runs last)."""
    os.unlink(os.path.join(aw_dir, "test.yaml"))
106 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/test_status.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from codeflare_sdk.ray.cluster.cluster import (
16 | Cluster,
17 | ClusterConfiguration,
18 | _ray_cluster_status,
19 | )
20 | from codeflare_sdk.ray.cluster.status import (
21 | CodeFlareClusterStatus,
22 | RayClusterStatus,
23 | RayCluster,
24 | )
25 | import os
26 | from ...common.utils.unit_test_support import get_local_queue
27 |
# Directory where the SDK writes generated resource yamls (removed in test_cleanup).
aw_dir = os.path.expanduser("~/.codeflare/resources/")
29 |
30 |
def test_cluster_status(mocker):
    """Cluster.status() must map RayCluster states onto CodeFlare statuses."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")

    fake_ray = RayCluster(
        name="test",
        status=RayClusterStatus.UNKNOWN,
        num_workers=1,
        worker_mem_requests=2,
        worker_mem_limits=2,
        worker_cpu_requests=1,
        worker_cpu_limits=1,
        namespace="ns",
        dashboard="fake-uri",
        head_cpu_requests=2,
        head_cpu_limits=2,
        head_mem_requests=8,
        head_mem_limits=8,
    )

    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
    )

    cf = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            write_to_file=True,
            appwrapper=False,
            local_queue="local-queue-default",
        )
    )

    # No RayCluster visible at all -> unknown, not ready.
    mocker.patch(
        "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None
    )
    assert cf.status() == (CodeFlareClusterStatus.UNKNOWN, False)

    # Each RayCluster state maps to a CodeFlare status; only READY is "ready".
    mocker.patch(
        "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=fake_ray
    )
    state_to_status = [
        (RayClusterStatus.UNKNOWN, CodeFlareClusterStatus.STARTING, False),
        (RayClusterStatus.FAILED, CodeFlareClusterStatus.FAILED, False),
        (RayClusterStatus.UNHEALTHY, CodeFlareClusterStatus.FAILED, False),
        (RayClusterStatus.READY, CodeFlareClusterStatus.READY, True),
    ]
    for ray_state, expected_status, expected_ready in state_to_status:
        fake_ray.status = ray_state
        assert cf.status() == (expected_status, expected_ready)
94 |
95 |
def rc_status_fields(group, version, namespace, plural, *args):
    """Fake for list_namespaced_custom_object: check the RayCluster GVR args
    and return an empty item list."""
    expected = ("ray.io", "v1", "test-ns", "rayclusters")
    assert (group, version, namespace, plural) == expected
    assert not args
    return {"items": []}
103 |
104 |
def test_rc_status(mocker):
    """_ray_cluster_status returns None when the API lists no RayClusters."""
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        side_effect=rc_status_fields,
    )
    assert _ray_cluster_status("test-rc", "test-ns") is None
113 |
114 |
# Make sure to always keep this function last
def test_cleanup():
    """Delete the generated yaml left behind by the tests above (runs last)."""
    os.unlink(os.path.join(aw_dir, "test.yaml"))
118 |
--------------------------------------------------------------------------------
/tests/e2e/mnist_raycluster_sdk_kind_test.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from time import sleep
4 |
5 | from codeflare_sdk import Cluster, ClusterConfiguration
6 | from codeflare_sdk.ray.client import RayJobClient
7 |
8 | import pytest
9 |
10 | from support import *
11 |
12 | # This test creates a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster
13 |
14 |
@pytest.mark.kind
class TestRayClusterSDKKind:
    """E2E: create a Ray cluster on Kind via the SDK and run an MNIST training job.

    Fixes:
    - `assert_get_cluster_and_jobsubmit` previously hard-coded
      accelerator="gpu"/number_of_gpus=1 even in the CPU test; it now receives
      the values the test actually ran with.
    - Removed redundant explicit `self.setup_method()` calls — pytest invokes
      setup_method automatically before each test.
    """

    def setup_method(self):
        # Pytest hook: build the Kubernetes API clients used by the helpers below.
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_mnist_ray_cluster_sdk_kind(self):
        create_namespace(self)
        create_kueue_resources(self)
        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

    @pytest.mark.nvidia_gpu
    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
        create_namespace(self)
        create_kueue_resources(self)
        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

    def run_mnist_raycluster_sdk_kind(
        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """Create the cluster, wait for readiness, then submit and verify the job."""
        cluster = Cluster(
            ClusterConfiguration(
                name="mnist",
                namespace=self.namespace,
                num_workers=1,
                head_cpu_requests="500m",
                head_cpu_limits="500m",
                worker_cpu_requests="500m",
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                write_to_file=True,
                verify_tls=False,
            )
        )

        cluster.apply()

        cluster.status()

        cluster.wait_ready()

        cluster.status()

        cluster.details()

        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

        # Pass through the actual accelerator settings instead of the previous
        # hard-coded GPU values.
        assert_get_cluster_and_jobsubmit(
            self, "mnist", accelerator=accelerator, number_of_gpus=number_of_gpus
        )

    # Assertions

    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
        """Submit mnist.py via the dashboard and poll (<=900s) for a terminal state."""
        ray_dashboard = cluster.cluster_dashboard_uri()
        client = RayJobClient(address=ray_dashboard, verify=False)

        submission_id = client.submit_job(
            entrypoint="python mnist.py",
            runtime_env={
                "working_dir": "./tests/e2e/",
                "pip": "./tests/e2e/mnist_pip_requirements.txt",
                "env_vars": get_setup_env_variables(ACCELERATOR=accelerator),
            },
            entrypoint_num_gpus=number_of_gpus,
        )
        print(f"Submitted job with ID: {submission_id}")

        # Poll every 5s until the job reaches a terminal state or we time out.
        timeout = 900
        elapsed = 0
        while True:
            status = client.get_job_status(submission_id)
            if status.is_terminal():
                break
            print(status)
            if elapsed >= timeout:
                raise TimeoutError(f"job has timed out after waiting {timeout}s")
            sleep(5)
            elapsed += 5

        logs = client.get_job_logs(submission_id)
        print(logs)

        self.assert_job_completion(status)

        client.delete_job(submission_id)

    def assert_job_completion(self, status):
        """Fail the test unless the Ray job finished in the SUCCEEDED state."""
        print(f"Job has completed: '{status}'")
        assert status == "SUCCEEDED"
117 |
--------------------------------------------------------------------------------
/tests/e2e_v2/utils/scripts/gpu_script.py:
--------------------------------------------------------------------------------
1 | """
2 | GPU-optimized RayJob validation script using Ray Train.
3 |
4 | This script performs a minimal Ray Train task suitable for GPU execution
5 | to validate that a RayJob can successfully connect to and use an existing Ray cluster
6 | with GPU resources.
7 |
8 | Usage as RayJob entrypoint:
9 | python gpu_script.py
10 | """
11 |
12 | import ray
13 | import sys
14 | from ray import train
15 | from ray.train import ScalingConfig
16 | from ray.train.torch import TorchTrainer
17 |
18 |
def train_func(config):
    """
    Minimal training function for GPU execution.

    Runs one small matmul on the GPU when CUDA is available, otherwise falls
    back to a trivial CPU sum, and reports the result through Ray Train.

    Args:
        config: Training configuration dict (unused here).
    """
    # Rank of this worker within the Ray Train worker group.
    rank = train.get_context().get_world_rank()

    def _cpu_workload():
        # Deterministic CPU fallback used when CUDA or torch is unavailable.
        return sum(i * i for i in range(1000))

    try:
        import torch

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Worker {rank} using device: {device}")

        if torch.cuda.is_available():
            # Small GPU matmul to prove kernels actually execute.
            lhs = torch.randn(100, 100, device=device)
            rhs = torch.randn(100, 100, device=device)
            result = torch.matmul(lhs, rhs).sum().item()
            print(f"Worker {rank} completed GPU computation. Result: {result}")
        else:
            print(f"Worker {rank}: GPU not available, using CPU fallback")
            result = _cpu_workload()
    except ImportError:
        print(f"Worker {rank}: PyTorch not available, using CPU computation")
        result = _cpu_workload()

    # Report metrics back
    train.report({"loss": result, "worker_rank": rank})
59 |
60 |
def main():
    """
    Run a minimal Ray Train task on GPU.

    Validates that Ray can be initialized (auto-connecting to the cluster
    when run as a RayJob), that Ray Train can execute a distributed task
    with GPU, and that the job completes successfully.

    Returns:
        0 on success, 1 on failure
    """
    try:
        # Auto-connects to the cluster when executed as a RayJob entrypoint.
        ray.init()

        print("Starting GPU training task...")

        # Log what the cluster actually offers before requesting a GPU.
        resources = ray.cluster_resources()
        print(f"Cluster resources: {resources}")

        gpu_present = "GPU" in resources and resources.get("GPU", 0) > 0
        print(f"GPU available in cluster: {gpu_present}")

        # Single worker with GPU requested keeps the validation minimal
        # while still exercising the current Ray Train (TorchTrainer) API.
        scaling = ScalingConfig(
            num_workers=1,
            use_gpu=True,
        )
        trainer = TorchTrainer(train_func, scaling_config=scaling)

        fit_result = trainer.fit()
        print(f"Training completed successfully. Metrics: {fit_result.metrics}")

        # Marker string that e2e tests grep the job logs for.
        print("EXISTING_CLUSTER_JOB_SUCCESS")

        return 0

    except Exception as e:
        print(f"FAILURE: Exception occurred: {e}")
        import traceback

        traceback.print_exc()
        return 1
    finally:
        # Always detach from the cluster, success or failure.
        ray.shutdown()
115 |
116 |
# Script entrypoint: the process exit code propagates the job result
# (0 = success, 1 = failure) to the RayJob status.
if __name__ == "__main__":
    sys.exit(main())
119 |
--------------------------------------------------------------------------------
/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from time import sleep
4 |
5 | from codeflare_sdk import Cluster, ClusterConfiguration
6 | from codeflare_sdk.ray.client import RayJobClient
7 |
8 | import pytest
9 |
10 | from support import *
11 |
12 | # This test creates an AppWrapper containing a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster
13 |
14 |
@pytest.mark.kind
class TestRayClusterSDKAppWrapperKind:
    """E2E test: AppWrapper-wrapped Ray Cluster and Ray Job submission on Kind."""

    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_mnist_ray_cluster_sdk_kind(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

    @pytest.mark.nvidia_gpu
    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

    def run_mnist_raycluster_sdk_kind(
        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """Create an AppWrapper-backed cluster, then submit and verify a job."""
        cluster = Cluster(
            ClusterConfiguration(
                name="mnist",
                namespace=self.namespace,
                num_workers=1,
                head_cpu_requests="500m",
                head_cpu_limits="500m",
                worker_cpu_requests="500m",
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                write_to_file=True,
                verify_tls=False,
                appwrapper=True,
            )
        )

        cluster.apply()

        cluster.status()

        cluster.wait_ready()

        cluster.status()

        cluster.details()

        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)
        # Bug fix: forward the requested accelerator settings instead of
        # hard-coding GPU values — the CPU variant previously requested 1 GPU.
        assert_get_cluster_and_jobsubmit(
            self, "mnist", accelerator=accelerator, number_of_gpus=number_of_gpus
        )

    # Assertions

    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
        """Submit the MNIST job via RayJobClient and wait for a terminal status."""
        ray_dashboard = cluster.cluster_dashboard_uri()
        client = RayJobClient(address=ray_dashboard, verify=False)

        submission_id = client.submit_job(
            entrypoint="python mnist.py",
            runtime_env={
                "working_dir": "./tests/e2e/",
                "pip": "./tests/e2e/mnist_pip_requirements.txt",
                "env_vars": get_setup_env_variables(ACCELERATOR=accelerator),
            },
            entrypoint_num_gpus=number_of_gpus,
        )
        print(f"Submitted job with ID: {submission_id}")
        elapsed = 0
        timeout = 900
        # Poll every 5 seconds until the job reaches a terminal state or the
        # timeout elapses. ('done' flag and dead 'if not done' branch removed.)
        while True:
            status = client.get_job_status(submission_id)
            if status.is_terminal():
                break
            print(status)
            if elapsed >= timeout:
                raise TimeoutError(f"job has timed out after waiting {timeout}s")
            sleep(5)
            elapsed += 5

        logs = client.get_job_logs(submission_id)
        print(logs)

        self.assert_job_completion(status)

        client.delete_job(submission_id)

    def assert_job_completion(self, status):
        """Fail the test unless the terminal status is SUCCEEDED."""
        print(f"Job has completed: '{status}'")
        assert status == "SUCCEEDED", f"Job failed with status: '{status}'"
117 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/setup-kueue.rst:
--------------------------------------------------------------------------------
1 | Basic Kueue Resources configuration
2 | ===================================
3 |
4 | Introduction:
5 | -------------
6 |
7 | This document is designed for administrators who have Kueue installed on
8 | their cluster. We will walk through the process of setting up essential
9 | Kueue resources, namely Cluster Queue, Resource Flavor, and Local Queue.
10 |
11 | 1. Resource Flavor:
12 | -------------------
13 |
14 | Resource Flavors allow the cluster admin to reflect differing resource capabilities
of nodes within a cluster, such as CPU, memory, GPU, etc. These can then be assigned
16 | to workloads to ensure they are executed on nodes with appropriate resources.
17 |
18 | The YAML configuration provided below creates an empty Resource Flavor
19 | named default-flavor. It serves as a starting point and does not specify
20 | any detailed resource characteristics.
21 |
22 | .. code:: yaml
23 |
24 | apiVersion: kueue.x-k8s.io/v1beta1
25 | kind: ResourceFlavor
26 | metadata:
27 | name: default-flavor
28 |
29 | For more detailed information on Resource Flavor configuration options,
30 | refer to the Kueue documentation: `Resource Flavor
Configuration <https://kueue.sigs.k8s.io/docs/concepts/resource_flavor/>`__
32 |
33 | 2. Cluster Queue:
34 | -----------------
35 |
36 | A Cluster Queue represents a shared queue across the entire cluster. It
37 | allows the cluster admin to define global settings for workload
38 | prioritization and resource allocation.
39 |
40 | When setting up a Cluster Queue in Kueue, it’s crucial that the resource
41 | specifications match the actual capacities and operational requirements
42 | of your cluster. The example provided outlines a basic setup; however,
43 | each cluster may have different resource availabilities and needs.
44 |
45 | .. code:: yaml
46 |
47 | apiVersion: kueue.x-k8s.io/v1beta1
48 | kind: ClusterQueue
49 | metadata:
50 | name: "cluster-queue"
51 | spec:
52 | namespaceSelector: {} # match all.
53 | resourceGroups:
54 | - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"]
55 | flavors:
56 | - name: "default-flavor"
57 | resources:
58 | - name: "cpu"
59 | nominalQuota: 9
60 | - name: "memory"
61 | nominalQuota: 36Gi
62 | - name: "pods"
63 | nominalQuota: 5
64 | - name: "nvidia.com/gpu"
65 | nominalQuota: '0'
66 |
67 | For more detailed information on Cluster Queue configuration options,
68 | refer to the Kueue documentation: `Cluster Queue
Configuration <https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/>`__
70 |
71 | 3. Local Queue (With Default Annotation):
72 | -----------------------------------------
73 |
74 | A Local Queue represents a queue associated with a specific namespace
75 | within the cluster. It allows namespace-level control over workload
76 | prioritization and resource allocation.
77 |
78 | .. code:: yaml
79 |
80 | apiVersion: kueue.x-k8s.io/v1beta1
81 | kind: LocalQueue
82 | metadata:
83 | namespace: team-a
84 | name: team-a-queue
85 | annotations:
86 | kueue.x-k8s.io/default-queue: "true"
87 | spec:
88 | clusterQueue: cluster-queue
89 |
90 | In the LocalQueue configuration provided above, the annotations field
91 | specifies ``kueue.x-k8s.io/default-queue: "true"``. This annotation
92 | indicates that the team-a-queue is designated as the default queue for
93 | the team-a namespace. When this is set, any workloads submitted to the
94 | team-a namespace without explicitly specifying a queue will
95 | automatically be routed to the team-a-queue.
96 |
97 | For more detailed information on Local Queue configuration options,
98 | refer to the Kueue documentation: `Local Queue
Configuration <https://kueue.sigs.k8s.io/docs/concepts/local_queue/>`__
100 |
101 | Conclusion:
102 | -----------
103 |
104 | By following the steps outlined in this document, the cluster admin can
105 | successfully create the basic Kueue resources necessary for workload
106 | management in the cluster. For more advanced configurations and
107 | features, please refer to the comprehensive `Kueue
documentation <https://kueue.sigs.k8s.io/docs/>`__.
109 |
--------------------------------------------------------------------------------
/tests/e2e/cluster_apply_kind_test.py:
--------------------------------------------------------------------------------
1 | from codeflare_sdk import Cluster, ClusterConfiguration
2 | import pytest
3 | import time
4 | from kubernetes import client
5 | from codeflare_sdk.common.utils import constants
6 |
7 | from support import (
8 | initialize_kubernetes_client,
9 | create_namespace,
10 | delete_namespace,
11 | get_ray_cluster,
12 | )
13 |
14 |
@pytest.mark.kind
class TestRayClusterApply:
    """E2E test: cluster.apply() creates, updates, and deletes a RayCluster on Kind."""

    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)

    def _build_config(self, cluster_name, namespace, num_workers):
        """Build a ClusterConfiguration; only the worker count varies per call.

        Extracted to remove the duplicated 15-line configuration literal that
        previously appeared twice, differing only in ``num_workers``.
        """
        return ClusterConfiguration(
            name=cluster_name,
            namespace=namespace,
            num_workers=num_workers,
            head_cpu_requests="500m",
            head_cpu_limits="1",
            head_memory_requests="1Gi",
            head_memory_limits="2Gi",
            worker_cpu_requests="500m",
            worker_cpu_limits="1",
            worker_memory_requests="1Gi",
            worker_memory_limits="2Gi",
            image=f"rayproject/ray:{constants.RAY_VERSION}",
            write_to_file=True,
            verify_tls=False,
        )

    def test_cluster_apply(self):
        self.setup_method()
        create_namespace(self)

        cluster_name = "test-cluster-apply"
        namespace = self.namespace

        # Create the cluster with the initial configuration (1 worker)
        cluster = Cluster(self._build_config(cluster_name, namespace, num_workers=1))
        cluster.apply()

        # Wait for the cluster to be ready
        cluster.wait_ready(dashboard_check=False)
        status, ready = cluster.status()
        assert ready, f"Cluster {cluster_name} is not ready: {status}"

        # Verify the cluster is created
        ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert ray_cluster is not None, "Cluster was not created successfully"
        assert (
            ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1
        ), "Initial worker count does not match"

        # Apply an updated configuration with 2 workers
        cluster.config = self._build_config(cluster_name, namespace, num_workers=2)
        cluster.apply()

        # Give Kubernetes a moment to process the update
        time.sleep(5)

        # Wait for the updated cluster to be ready
        cluster.wait_ready(dashboard_check=False)
        updated_status, updated_ready = cluster.status()
        assert (
            updated_ready
        ), f"Cluster {cluster_name} is not ready after update: {updated_status}"

        # Verify the worker count was updated
        updated_ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert (
            updated_ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 2
        ), "Worker count was not updated"

        # Clean up
        cluster.down()

        # Wait for deletion to complete (finalizers may delay deletion)
        max_wait = 30  # seconds
        wait_interval = 2
        elapsed = 0

        while elapsed < max_wait:
            ray_cluster = get_ray_cluster(cluster_name, namespace)
            if ray_cluster is None:
                break
            time.sleep(wait_interval)
            elapsed += wait_interval

        assert (
            ray_cluster is None
        ), f"Cluster was not deleted successfully after {max_wait}s"
120 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/test_build_ray_cluster.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from collections import namedtuple
15 | import sys
16 | from .build_ray_cluster import gen_names, update_image, build_ray_cluster
17 | import uuid
18 | from codeflare_sdk.ray.cluster.cluster import ClusterConfiguration, Cluster
19 |
20 |
def test_gen_names_with_name(mocker):
    """gen_names must echo a user-supplied name for both generated resources."""
    # Pin uuid4 so the helper cannot fall back to a random suffix.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    expected = "myname"
    aw_name, rc_name = gen_names(expected)
    assert (aw_name, rc_name) == (expected, expected)
29 |
30 |
def test_gen_names_without_name(mocker):
    """gen_names must synthesize prefixed names when no name is provided."""
    # Deterministic uuid4 keeps the generated suffix stable for the assertion.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    generated = gen_names(None)
    for name, prefix in zip(generated, ("appwrapper-", "cluster-")):
        assert name.startswith(prefix)
38 |
39 |
def test_update_image_without_supported_python_version(mocker):
    """update_image must warn and return None on an unsupported interpreter."""
    # Pretend only 3.11 and 3.12 ship a default Ray image.
    mocker.patch.dict(
        "codeflare_sdk.common.utils.constants.SUPPORTED_PYTHON_VERSIONS",
        {"3.11": "ray-py3.11", "3.12": "ray-py3.12"},
    )

    # Fake an interpreter version (3.8) with no default image available.
    VersionInfo = namedtuple(
        "version_info", ["major", "minor", "micro", "releaselevel", "serial"]
    )
    mocker.patch.object(sys, "version_info", VersionInfo(3, 8, 0, "final", 0))

    # Capture the warning rather than letting it reach the test output.
    warn_mock = mocker.patch("warnings.warn")

    image = update_image(None)

    # The user must be told exactly which versions have default images.
    warn_mock.assert_called_once_with(
        "No default Ray image defined for 3.8. Please provide your own image or use one of the following python versions: 3.11, 3.12."
    )

    # Unsupported version: no image is chosen on the caller's behalf.
    assert image is None
69 |
70 |
def test_build_ray_cluster_with_gcs_ft(mocker):
    """GCS fault-tolerance settings must be rendered into the RayCluster spec.

    Builds a Cluster configured with GCS FT (external Redis, password secret,
    external storage namespace) and asserts that build_ray_cluster emits the
    corresponding ``gcsFaultToleranceOptions`` block.
    """
    # Patch k8s config/API access BEFORE constructing Cluster — the
    # constructor touches the Kubernetes client.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    cluster = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            enable_gcs_ft=True,
            redis_address="redis:6379",
            redis_password_secret={"name": "redis-password-secret", "key": "password"},
            external_storage_namespace="new-ns",
        )
    )

    # Neutralize the build-time dependencies so only YAML generation runs.
    mocker.patch("codeflare_sdk.ray.cluster.build_ray_cluster.config_check")
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.get_api_client", return_value=None
    )
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.update_image", return_value=None
    )

    resource = build_ray_cluster(cluster)

    assert "spec" in resource
    assert "gcsFaultToleranceOptions" in resource["spec"]

    gcs_ft_options = resource["spec"]["gcsFaultToleranceOptions"]

    # Redis connection details must round-trip from the configuration.
    assert gcs_ft_options["redisAddress"] == "redis:6379"
    assert gcs_ft_options["externalStorageNamespace"] == "new-ns"
    # The password must be referenced via a secretKeyRef, never inlined.
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["name"]
        == "redis-password-secret"
    )
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["key"]
        == "password"
    )
111 |
--------------------------------------------------------------------------------
/tests/e2e/local_interactive_sdk_kind_test.py:
--------------------------------------------------------------------------------
1 | from codeflare_sdk import (
2 | Cluster,
3 | ClusterConfiguration,
4 | generate_cert,
5 | )
6 |
7 | import pytest
8 | import ray
9 | import math
10 | import subprocess
11 |
12 | from support import *
13 |
14 |
@pytest.mark.kind
class TestRayLocalInteractiveKind:
    """E2E test: local interactive Ray client access to a Kind-hosted cluster."""

    def setup_method(self):
        initialize_kubernetes_client(self)
        self.port_forward_process = None

    def cleanup_port_forward(self):
        """Terminate the kubectl port-forward, escalating to kill on timeout.

        Bug fix: previously a hung port-forward made ``wait(timeout=10)``
        raise TimeoutExpired, which aborted teardown and leaked the process.
        """
        if self.port_forward_process:
            self.port_forward_process.terminate()
            try:
                self.port_forward_process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # SIGTERM ignored — force-kill so teardown always completes.
                self.port_forward_process.kill()
                self.port_forward_process.wait()
            self.port_forward_process = None

    def teardown_method(self):
        self.cleanup_port_forward()
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    @pytest.mark.nvidia_gpu
    def test_local_interactives_nvidia_gpu(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives(number_of_gpus=1)

    def run_local_interactives(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """Spin up a cluster, port-forward the Ray client port, and run remote tasks."""
        cluster_name = "test-ray-cluster-li"

        # Ensure no stale local Ray session interferes with the client connect.
        ray.shutdown()

        cluster = Cluster(
            ClusterConfiguration(
                name=cluster_name,
                namespace=self.namespace,
                num_workers=1,
                head_cpu_requests="500m",
                head_cpu_limits="500m",
                worker_cpu_requests="500m",
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                verify_tls=False,
            )
        )

        cluster.apply()

        cluster.wait_ready()
        cluster.status()

        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        print(cluster.local_client_url())

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation_part(num_iterations):
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation(num_iterations):
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        # Attempt to port forward
        try:
            local_port = "20001"
            ray_client_port = "10001"

            port_forward_cmd = [
                "kubectl",
                "port-forward",
                "-n",
                self.namespace,
                f"svc/{cluster_name}-head-svc",
                f"{local_port}:{ray_client_port}",
            ]
            self.port_forward_process = subprocess.Popen(
                port_forward_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )

            client_url = f"ray://localhost:{local_port}"
            cluster.status()

            ray.init(address=client_url, logging_level="INFO")

            ref = heavy_calculation.remote(3000)
            result = ray.get(ref)
            assert (
                result == 1789.4644387076728
            )  # Updated result after moving to Python 3.12 (0.0000000000008% difference to old assertion)
            ray.cancel(ref)
            ray.shutdown()

            cluster.down()
        finally:
            self.cleanup_port_forward()
127 |
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/5_submit_rayjob_cr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9259e514",
6 | "metadata": {},
7 | "source": [
8 | "# Submitting a RayJob CR\n",
9 | "\n",
10 | "In this notebook, we will go through the basics of using the SDK to:\n",
11 | " * Define a RayCluster configuration\n",
12 | " * Use this configuration alongside a RayJob definition\n",
    " * Submit the RayJob, and allow the KubeRay Operator to manage the RayCluster lifecycle for the RayJob"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "18136ea7",
19 | "metadata": {},
20 | "source": [
21 | "## Defining and Submitting the RayJob\n",
22 | "First, we'll need to import the relevant CodeFlare SDK packages. You can do this by executing the below cell."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "51e18292",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from codeflare_sdk import RayJob, ManagedClusterConfig"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "649c5911",
38 | "metadata": {},
39 | "source": [
40 | "Run the below `oc login` command using your Token and Server URL. Ensure the command is prepended by `!` and not `%`. This will work when running both locally and within RHOAI."
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "id": "dc364888",
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "!oc login --token= --server="
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "id": "5581eca9",
56 | "metadata": {},
57 | "source": [
    "Next we'll need to define the ManagedClusterConfig. KubeRay will use this to spin up a short-lived RayCluster that will only exist as long as the job."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "id": "3094c60a",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "cluster_config = ManagedClusterConfig(\n",
69 | " head_memory_requests=6,\n",
70 | " head_memory_limits=8,\n",
71 | " num_workers=2,\n",
72 | " worker_cpu_requests=1,\n",
73 | " worker_cpu_limits=1,\n",
74 | " worker_memory_requests=4,\n",
75 | " worker_memory_limits=6,\n",
76 | " head_accelerators={'nvidia.com/gpu': 0},\n",
77 | " worker_accelerators={'nvidia.com/gpu': 0},\n",
78 | ")"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "id": "02a2b32b",
84 | "metadata": {},
85 | "source": [
86 | "Lastly we can pass the ManagedClusterConfig into the RayJob and submit it. You do not need to worry about tearing down the cluster when the job has completed, that is handled for you!"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "e905ccea",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "job = RayJob(\n",
97 | " job_name=\"demo-rayjob\",\n",
98 | " entrypoint=\"python -c 'print(\\\"Hello from RayJob!\\\")'\",\n",
99 | " cluster_config=cluster_config,\n",
100 | " namespace=\"your-namespace\"\n",
101 | ")\n",
102 | "\n",
103 | "job.submit()"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "id": "f3612de2",
109 | "metadata": {},
110 | "source": [
111 | "We can check the status of our job by executing the below cell. The status may appear as `unknown` for a time while the RayCluster spins up."
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "id": "96d92f93",
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "job.status()"
122 | ]
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.11.11"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 5
146 | }
147 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/rayjobs/pretty_print.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 IBM, Red Hat
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | This sub-module exists primarily to be used internally by the RayJob object
17 | (in the rayjob sub-module) for pretty-printing job status and details.
18 | """
19 |
20 | from rich.console import Console
21 | from rich.table import Table
22 | from rich.panel import Panel
23 | from typing import Tuple, Optional
24 |
25 | from .status import RayJobDeploymentStatus, RayJobInfo
26 |
27 |
def print_job_status(job_info: RayJobInfo):
    """
    Pretty print the job status in a format similar to cluster status.
    """
    status_display, header_color = _get_status_display(job_info.status)

    # Core details, rendered one labeled row at a time.
    table = _create_info_table(header_color, job_info.name, status_display)
    for label, value in (
        ("Job ID", job_info.job_id),
        ("Status", job_info.status.value),
        ("RayCluster", job_info.cluster_name),
        ("Namespace", job_info.namespace),
    ):
        table.add_row(f"[bold]{label}:[/bold] {value}")

    # Timing row only when a start time is known.
    if job_info.start_time:
        table.add_row()
        table.add_row(f"[bold]Started:[/bold] {job_info.start_time}")

    # Surface retries only when something actually failed.
    if job_info.failed_attempts > 0:
        table.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(table)
51 |
52 |
def print_no_job_found(job_name: str, namespace: str):
    """
    Pretty print an informational panel when the named RayJob does not exist.
    """
    # Header styled as an error; status cell carries the not-found message.
    table = _create_info_table(
        "[white on red][bold]Name", job_name, "[bold red]No RayJob found"
    )
    for row in (
        None,  # blank spacer
        "Please run rayjob.submit() to submit a job.",
        None,  # blank spacer
        f"[bold]Namespace:[/bold] {namespace}",
    ):
        if row is None:
            table.add_row()
        else:
            table.add_row(row)

    _print_table_in_panel(table)
67 |
68 |
def _get_status_display(status: RayJobDeploymentStatus) -> Tuple[str, str]:
    """
    Map a deployment status to its display string and header color.

    Returns:
        Tuple of (status_display, header_color)
    """
    if status == RayJobDeploymentStatus.COMPLETE:
        return "Complete :white_heavy_check_mark:", "[white on green][bold]Name"
    if status == RayJobDeploymentStatus.RUNNING:
        return "Running :gear:", "[white on blue][bold]Name"
    if status == RayJobDeploymentStatus.FAILED:
        return "Failed :x:", "[white on red][bold]Name"
    if status == RayJobDeploymentStatus.SUSPENDED:
        return "Suspended :pause_button:", "[white on yellow][bold]Name"
    # Anything unrecognized is rendered as unknown with the error header.
    return "Unknown :question:", "[white on red][bold]Name"
92 |
93 |
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Create a standardized info table with header and status.

    Returns:
        Table with header row, name/status row, and empty separator row
    """
    info_table = Table(box=None, show_header=False)
    info_table.add_row(header_color)
    info_table.add_row(f"[bold underline]{name}", status_display)
    # Blank row visually separates the header from the detail rows.
    info_table.add_row()
    return info_table
106 |
107 |
def _print_table_in_panel(table: Table):
    """
    Render the given table inside the standard CodeFlare RayJob panel.
    """
    # Wrap the details table in a fitted panel under the shared title banner.
    outer = Table(
        box=None, title="[bold] :package: CodeFlare RayJob Status :package:"
    )
    outer.add_row(Panel.fit(table))
    Console().print(outer)
118 |
--------------------------------------------------------------------------------
/docs/sphinx/user-docs/ray-cluster-interaction.rst:
--------------------------------------------------------------------------------
1 | Ray Cluster Interaction
2 | =======================
3 |
4 | The CodeFlare SDK offers multiple ways to interact with Ray Clusters
5 | including the below methods.
6 |
7 | get_cluster()
8 | -------------
9 |
10 | The ``get_cluster()`` function is used to initialise a ``Cluster``
11 | object from a pre-existing Ray Cluster/AppWrapper. Below is an example
of its usage:
13 |
14 | ::
15 |
16 | from codeflare_sdk import get_cluster
17 | cluster = get_cluster(cluster_name="raytest", namespace="example", is_appwrapper=False, write_to_file=False)
18 | -> output: Yaml resources loaded for raytest
19 | cluster.status()
20 | -> output:
21 | 🚀 CodeFlare Cluster Status 🚀
22 | ╭─────────────────────────────────────────────────────────────────╮
23 | │ Name │
24 | │ raytest Active ✅ │
25 | │ │
26 | │ URI: ray://raytest-head-svc.example.svc:10001 │
27 | │ │
28 | │ Dashboard🔗 │
29 | │ │
30 | ╰─────────────────────────────────────────────────────────────────╯
31 | (, True)
32 | cluster.down()
33 | cluster.apply() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted.
34 |
35 | | These are the parameters the ``get_cluster()`` function accepts:
36 | | ``cluster_name: str # Required`` -> The name of the Ray Cluster.
37 | | ``namespace: str # Default: "default"`` -> The namespace of the Ray Cluster.
38 | | ``is_appwrapper: bool # Default: False`` -> When set to
39 | | ``True`` the function will attempt to retrieve an AppWrapper instead of a Ray Cluster.
40 | | ``write_to_file: bool # Default: False`` -> When set to ``True`` the Ray Cluster/AppWrapper will be written to a file similar to how it is done in ``ClusterConfiguration``.
41 |
42 | list_all_queued()
43 | -----------------
44 |
45 | | The ``list_all_queued()`` function returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace.
46 | | It accepts the following parameters:
47 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from.
48 | | ``print_to_console: bool # Default: True`` -> Allows the user to print the list to their console.
49 | | ``appwrapper: bool # Default: False`` -> When set to ``True`` allows the user to list queued AppWrappers.
50 |
51 | list_all_clusters()
52 | -------------------
53 |
54 | | The ``list_all_clusters()`` function will return a list of detailed descriptions of Ray Clusters to the console by default.
55 | | It accepts the following parameters:
56 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from.
57 | | ``print_to_console: bool # Default: True`` -> A boolean that allows the user to print the list to their console.
58 |
59 | .. note::
60 |
61 | The following methods require a ``Cluster`` object to be
62 | initialized. See :doc:`./cluster-configuration`
63 |
64 | cluster.apply()
---------------
66 |
67 | | The ``cluster.apply()`` function applies a Ray Cluster in the given namespace. If the cluster already exists, it is updated.
68 | | If it does not exist it is created.
69 |
70 | cluster.down()
71 | --------------
72 |
73 | | The ``cluster.down()`` function deletes the Ray Cluster in the given namespace.
74 |
75 | cluster.status()
76 | ----------------
77 |
78 | | The ``cluster.status()`` function prints out the status of the Ray Cluster's state with a link to the Ray Dashboard.
79 |
80 | cluster.details()
81 | -----------------
82 |
83 | | The ``cluster.details()`` function prints out a detailed description of the Ray Cluster's status, worker resources and a link to the Ray Dashboard.
84 |
85 | cluster.wait_ready()
86 | --------------------
87 |
88 | | The ``cluster.wait_ready()`` function waits for the requested cluster to be ready, up to an optional timeout and checks every 5 seconds.
89 | | It accepts the following parameters:
90 | | ``timeout: Optional[int] # Default: None`` -> Allows the user to define a timeout for the ``wait_ready()`` function.
91 | | ``dashboard_check: bool # Default: True`` -> If enabled the ``wait_ready()`` function will wait until the Ray Dashboard is ready too.
92 |
--------------------------------------------------------------------------------
/.github/workflows/ui_notebooks_test.yaml:
--------------------------------------------------------------------------------
1 | name: UI notebooks tests
2 |
3 | on:
4 | pull_request:
5 | branches: [ main ]
6 | types: [ labeled ]
7 |
8 | concurrency:
9 | group: ${{ github.head_ref }}-${{ github.workflow }}
10 | cancel-in-progress: true
11 |
12 | env:
13 | CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
14 |
15 | jobs:
16 | verify-3_widget_example:
17 | if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }}
18 | runs-on: ubuntu-latest-4core
19 |
20 | steps:
21 | - name: Checkout code
22 | uses: actions/checkout@v4
23 | with:
24 | submodules: recursive
25 |
26 | - name: Checkout common repo code
27 | uses: actions/checkout@v4
28 | with:
29 | repository: "project-codeflare/codeflare-common"
30 | ref: "main"
31 | path: "common"
32 |
33 | - name: Checkout CodeFlare operator repository
34 | uses: actions/checkout@v4
35 | with:
36 | repository: project-codeflare/codeflare-operator
37 | path: codeflare-operator
38 |
39 | - name: Set Go
40 | uses: actions/setup-go@v5
41 | with:
42 | go-version-file: "./codeflare-operator/go.mod"
43 | cache-dependency-path: "./codeflare-operator/go.sum"
44 |
45 | - name: Set up gotestfmt
46 | uses: gotesttools/gotestfmt-action@v2
47 | with:
48 | token: ${{ secrets.GITHUB_TOKEN }}
49 |
50 | - name: Set up specific Python version
51 | uses: actions/setup-python@v5
52 | with:
53 | python-version: "3.11"
54 | cache: "pip" # caching pip dependencies
55 |
56 | - name: Setup and start KinD cluster
57 | uses: ./common/github-actions/kind
58 |
59 | - name: Deploy CodeFlare stack
60 | id: deploy
61 | run: |
62 | cd codeflare-operator
63 | echo Setting up CodeFlare stack
64 | make setup-e2e
65 | echo Deploying CodeFlare operator
66 | make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
67 | kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
68 | cd ..
69 |
70 | - name: Setup Guided notebooks execution
71 | run: |
72 | echo "Installing papermill and dependencies..."
73 | pip install poetry ipython ipykernel
74 | poetry config virtualenvs.create false
75 | echo "Installing SDK..."
76 | poetry install --with test,docs
77 |
78 | - name: Install Yarn dependencies
79 | run: |
80 | poetry run yarn install
81 | poetry run yarn playwright install chromium
82 | working-directory: ui-tests
83 |
84 | - name: Fix 3_widget_example.ipynb notebook for test
85 | run: |
86 | # Remove login/logout cells, as KinD doesn't support authentication using token
87 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb
88 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb
89 | # Set explicit namespace as SDK need it (currently) to resolve local queues
90 | sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default',|" 3_widget_example.ipynb
91 | sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb
92 | working-directory: demo-notebooks/guided-demos
93 |
94 | - name: Run UI notebook tests
95 | run: |
96 | set -euo pipefail
97 |
98 | poetry run yarn test
99 | working-directory: ui-tests
100 |
101 | - name: Upload Playwright Test assets
102 | if: always()
103 | uses: actions/upload-artifact@v4
104 | with:
105 | name: ipywidgets-test-assets
106 | path: |
107 | ui-tests/test-results
108 |
109 | - name: Upload Playwright Test report
110 | if: always()
111 | uses: actions/upload-artifact@v4
112 | with:
113 | name: ipywidgets-test-report
114 | path: |
115 | ui-tests/playwright-report
116 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/examples/use-utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from os import path
4 | import json
5 |
6 |
"""
In case you are working directly with the source and don't wish to
install the module with pip install, you can import the packages directly by uncommenting the following code.
"""
11 |
12 | """
13 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
14 |
15 | current_dir = os.path.dirname(os.path.abspath(__file__))
16 | parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
17 | sibling_dirs = [
18 | d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))
19 | ]
20 | for sibling_dir in sibling_dirs:
21 | sys.path.append(os.path.join(parent_dir, sibling_dir))
22 | """
23 |
24 | from codeflare_sdk.vendored.python_client import kuberay_cluster_api
25 |
26 | from codeflare_sdk.vendored.python_client.utils import (
27 | kuberay_cluster_utils,
28 | kuberay_cluster_builder,
29 | )
30 |
31 |
def main():
    """Demo driver: build a RayCluster spec, create it in Kubernetes,
    patch its worker groups, then list and delete the demo clusters.
    """
    print("starting cluster handler...")

    # Main API object for talking to the KubeRay operator.
    api = kuberay_cluster_api.RayClusterApi()
    # Builder object for assembling a cluster spec with granular control.
    builder = kuberay_cluster_builder.ClusterBuilder()
    # Utils object for in-memory operations on a cluster spec.
    cluster_utils = kuberay_cluster_utils.ClusterUtils()

    # Assemble the cluster spec (a plain dict) step by step.
    cluster = (
        builder.build_meta(name="new-cluster1", labels={"demo-cluster": "yes"})
        .build_head()
        .build_worker(group_name="workers")
        .get_cluster()
    )
    if not builder.succeeded:
        print("error building the cluster, aborting...")
        return

    print("creating raycluster = {}".format(cluster["metadata"]["name"]))
    # API call that creates the cluster in Kubernetes.
    api.create_ray_cluster(body=cluster)

    def _patch(patched):
        """Announce and apply a patched cluster spec to Kubernetes."""
        print(
            "trying to patch raycluster = {}".format(
                patched["metadata"]["name"]
            )
        )
        api.patch_ray_cluster(
            name=patched["metadata"]["name"], ray_patch=patched
        )

    # Scale the existing worker group, then push the change if it succeeded.
    patched_cluster, ok = cluster_utils.update_worker_group_replicas(
        cluster, group_name="workers", max_replicas=4, min_replicas=1, replicas=2
    )
    if ok:
        _patch(patched_cluster)

    # Duplicate the worker group, then push the change if it succeeded.
    patched_cluster, ok = cluster_utils.duplicate_worker_group(
        cluster, group_name="workers", new_group_name="duplicate-workers"
    )
    if ok:
        _patch(patched_cluster)

    # The rest of the code simply lists and cleans up the created clusters.
    listing = api.list_ray_clusters(
        k8s_namespace="default", label_selector="demo-cluster=yes"
    )

    if "items" in listing:
        divider = "-" * 72
        print(divider)
        print("{:<63s}{:>2s}".format("Name", "Namespace"))
        print(divider)
        for item in listing["items"]:
            print(
                "{:<63s}{:>2s}".format(
                    item["metadata"]["name"],
                    item["metadata"]["namespace"],
                )
            )
        print(divider)

    if "items" in listing:
        for item in listing["items"]:
            print("deleting raycluster = {}".format(item["metadata"]["name"]))
            api.delete_ray_cluster(
                name=item["metadata"]["name"],
                k8s_namespace=item["metadata"]["namespace"],
            )


if __name__ == "__main__":
    main()
118 |
--------------------------------------------------------------------------------
/demo-notebooks/guided-demos/3_widget_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8d4a42f6",
6 | "metadata": {},
7 | "source": [
8 | "In this notebook, we will go through the basics of using the SDK to:\n",
9 | " - Spin up a Ray cluster with our desired resources\n",
10 | " - View the status and specs of our Ray cluster\n",
11 | " - Take down the Ray cluster when finished"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# Import pieces from codeflare-sdk\n",
22 | "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, view_clusters"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "614daa0c",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Create authentication object for user permissions\n",
33 | "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
34 | "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
35 | "auth = TokenAuthentication(\n",
36 | " token = \"XXXXX\",\n",
37 | " server = \"XXXXX\",\n",
38 | " skip_tls=False\n",
39 | ")\n",
40 | "auth.login()"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "id": "bc27f84c",
46 | "metadata": {},
47 | "source": [
48 | "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n",
49 | "\n",
50 | "NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:\n",
51 | "\n",
52 | "- For Python 3.11: 'quay.io/modh/ray:2.52.1-py311-cu121'\n",
53 | "- For Python 3.12: 'quay.io/modh/ray:2.52.1-py312-cu128'\n",
54 | "\n",
55 | "If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default."
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "id": "0f4bc870-091f-4e11-9642-cba145710159",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# Create and configure our cluster object\n",
66 | "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
67 | "cluster = Cluster(ClusterConfiguration(\n",
68 | " name='widgettest',\n",
69 | " head_cpu_requests='500m',\n",
70 | " head_cpu_limits='500m',\n",
71 | " head_memory_requests=6,\n",
72 | " head_memory_limits=8,\n",
73 | " head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
74 | " worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
75 | " num_workers=2,\n",
76 | " worker_cpu_requests='250m',\n",
77 | " worker_cpu_limits=1,\n",
78 | " worker_memory_requests=4,\n",
79 | " worker_memory_limits=6,\n",
80 | " # image=\"\", # Optional Field\n",
81 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
82 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
83 | "))"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "3de6403c",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "view_clusters()"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "id": "2d8e6ce3",
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "cluster.status()"
104 | ]
105 | }
106 | ],
107 | "metadata": {
108 | "kernelspec": {
109 | "display_name": "Python 3 (ipykernel)",
110 | "language": "python",
111 | "name": "python3"
112 | },
113 | "language_info": {
114 | "codemirror_mode": {
115 | "name": "ipython",
116 | "version": 3
117 | },
118 | "file_extension": ".py",
119 | "mimetype": "text/x-python",
120 | "name": "python",
121 | "nbconvert_exporter": "python",
122 | "pygments_lexer": "ipython3",
123 | "version": "3.9.18"
124 | },
125 | "vscode": {
126 | "interpreter": {
127 | "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
128 | }
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 5
133 | }
134 |
--------------------------------------------------------------------------------
/src/codeflare_sdk/vendored/python_client_test/test_director.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder
3 |
4 |
class TestDirector(unittest.TestCase):
    """Tests for the preset cluster builders on ``kuberay_cluster_builder.Director``.

    Each preset (basic/small/medium/large) is checked for its metadata,
    head-pod CPU request, and (where applicable) worker-group settings.
    """

    def __init__(self, methodName: str = "runTest") -> None:
        # Fix: the default was ``...`` (Ellipsis), which is not a valid
        # method name and would break direct instantiation ``TestDirector()``.
        # ``"runTest"`` is the documented unittest.TestCase default.
        super().__init__(methodName)
        self.director = kuberay_cluster_builder.Director()

    # --- shared assertion helpers -------------------------------------

    def _assert_meta(self, cluster: dict, name: str) -> None:
        """Assert the cluster's metadata name and the 'default' namespace."""
        self.assertEqual(cluster["metadata"]["name"], name)
        self.assertEqual(cluster["metadata"]["namespace"], "default")

    def _head_cpu_request(self, cluster: dict) -> str:
        """Return the head pod's CPU request string."""
        return cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
            0
        ]["resources"]["requests"]["cpu"]

    def _worker_group(self, cluster: dict) -> dict:
        """Return the first (and only) worker group spec."""
        return cluster["spec"]["workerGroupSpecs"][0]

    def _worker_cpu_request(self, cluster: dict) -> str:
        """Return the worker container's CPU request string."""
        return self._worker_group(cluster)["template"]["spec"]["containers"][0][
            "resources"
        ]["requests"]["cpu"]

    # --- tests --------------------------------------------------------

    def test_build_basic_cluster(self):
        cluster = self.director.build_basic_cluster(name="basic-cluster")
        # testing meta
        self._assert_meta(cluster, "basic-cluster")
        # testing the head pod
        self.assertEqual(self._head_cpu_request(cluster), "2")

    def test_build_small_cluster(self):
        cluster = self.director.build_small_cluster(name="small-cluster")
        # testing meta
        self._assert_meta(cluster, "small-cluster")
        # testing the head pod
        self.assertEqual(self._head_cpu_request(cluster), "2")
        # testing the workergroup
        self.assertEqual(self._worker_group(cluster)["replicas"], 1)
        self.assertEqual(self._worker_cpu_request(cluster), "1")

    def test_build_medium_cluster(self):
        cluster = self.director.build_medium_cluster(name="medium-cluster")
        # testing meta
        self._assert_meta(cluster, "medium-cluster")
        # testing the head pod
        self.assertEqual(self._head_cpu_request(cluster), "2")
        # testing the workergroup
        worker_group = self._worker_group(cluster)
        self.assertEqual(worker_group["replicas"], 3)
        self.assertEqual(worker_group["groupName"], "medium-cluster-workers")
        self.assertEqual(self._worker_cpu_request(cluster), "2")

    def test_build_large_cluster(self):
        cluster = self.director.build_large_cluster(name="large-cluster")
        # testing meta
        self._assert_meta(cluster, "large-cluster")
        # testing the head pod
        self.assertEqual(self._head_cpu_request(cluster), "2")
        # testing the workergroup
        worker_group = self._worker_group(cluster)
        self.assertEqual(worker_group["replicas"], 6)
        self.assertEqual(worker_group["groupName"], "large-cluster-workers")
        self.assertEqual(self._worker_cpu_request(cluster), "3")
--------------------------------------------------------------------------------
/tests/test_cluster_yamls/support_clusters/test-rc-b.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: ray.io/v1
2 | kind: RayCluster
3 | metadata:
4 | labels:
5 | controller-tools.k8s.io: '1.0'
6 | kueue.x-k8s.io/queue-name: local_default_queue
7 | name: test-rc-b
8 | namespace: ns
9 | spec:
10 | autoscalerOptions:
11 | idleTimeoutSeconds: 60
12 | resources:
13 | limits:
14 | cpu: 500m
15 | memory: 512Mi
16 | requests:
17 | cpu: 500m
18 | memory: 512Mi
19 | upscalingMode: Default
20 | enableInTreeAutoscaling: false
21 | headGroupSpec:
22 | enableIngress: false
23 | rayStartParams:
24 | block: 'true'
25 | dashboard-host: 0.0.0.0
26 | num-gpus: '0'
27 | resources: '"{}"'
28 | serviceType: ClusterIP
29 | template:
30 | spec:
31 | containers:
32 | - image: "${image}"
33 | imagePullPolicy: IfNotPresent
34 | lifecycle:
35 | preStop:
36 | exec:
37 | command:
38 | - /bin/sh
39 | - -c
40 | - ray stop
41 | name: ray-head
42 | ports:
43 | - containerPort: 6379
44 | name: gcs
45 | - containerPort: 8265
46 | name: dashboard
47 | - containerPort: 10001
48 | name: client
49 | resources:
50 | limits:
51 | cpu: 2
52 | memory: 8G
53 | requests:
54 | cpu: 2
55 | memory: 8G
56 | volumeMounts:
57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
58 | name: odh-trusted-ca-cert
59 | subPath: odh-trusted-ca-bundle.crt
60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
61 | name: odh-trusted-ca-cert
62 | subPath: odh-trusted-ca-bundle.crt
63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
64 | name: odh-ca-cert
65 | subPath: odh-ca-bundle.crt
66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
67 | name: odh-ca-cert
68 | subPath: odh-ca-bundle.crt
69 | imagePullSecrets: []
70 | volumes:
71 | - configMap:
72 | items:
73 | - key: ca-bundle.crt
74 | path: odh-trusted-ca-bundle.crt
75 | name: odh-trusted-ca-bundle
76 | optional: true
77 | name: odh-trusted-ca-cert
78 | - configMap:
79 | items:
80 | - key: odh-ca-bundle.crt
81 | path: odh-ca-bundle.crt
82 | name: odh-trusted-ca-bundle
83 | optional: true
84 | name: odh-ca-cert
85 | rayVersion: 2.52.1
86 | workerGroupSpecs:
87 | - groupName: small-group-test-rc-b
88 | maxReplicas: 1
89 | minReplicas: 1
90 | rayStartParams:
91 | block: 'true'
92 | num-gpus: '0'
93 | resources: '"{}"'
94 | replicas: 1
95 | template:
96 | metadata:
97 | annotations:
98 | key: value
99 | labels:
100 | key: value
101 | spec:
102 | containers:
103 | - image: "${image}"
104 | lifecycle:
105 | preStop:
106 | exec:
107 | command:
108 | - /bin/sh
109 | - -c
110 | - ray stop
111 | name: machine-learning
112 | resources:
113 | limits:
114 | cpu: 1
115 | memory: 2G
116 | requests:
117 | cpu: 1
118 | memory: 2G
119 | volumeMounts:
120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
121 | name: odh-trusted-ca-cert
122 | subPath: odh-trusted-ca-bundle.crt
123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
124 | name: odh-trusted-ca-cert
125 | subPath: odh-trusted-ca-bundle.crt
126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
127 | name: odh-ca-cert
128 | subPath: odh-ca-bundle.crt
129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
130 | name: odh-ca-cert
131 | subPath: odh-ca-bundle.crt
132 | imagePullSecrets: []
133 | volumes:
134 | - configMap:
135 | items:
136 | - key: ca-bundle.crt
137 | path: odh-trusted-ca-bundle.crt
138 | name: odh-trusted-ca-bundle
139 | optional: true
140 | name: odh-trusted-ca-cert
141 | - configMap:
142 | items:
143 | - key: odh-ca-bundle.crt
144 | path: odh-ca-bundle.crt
145 | name: odh-trusted-ca-bundle
146 | optional: true
147 | name: odh-ca-cert
148 |
--------------------------------------------------------------------------------
/tests/test_cluster_yamls/support_clusters/test-rc-a.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: ray.io/v1
2 | kind: RayCluster
3 | metadata:
4 | labels:
5 | controller-tools.k8s.io: '1.0'
6 | kueue.x-k8s.io/queue-name: local_default_queue
7 | name: test-cluster-a
8 | namespace: ns
9 | spec:
10 | autoscalerOptions:
11 | idleTimeoutSeconds: 60
12 | resources:
13 | limits:
14 | cpu: 500m
15 | memory: 512Mi
16 | requests:
17 | cpu: 500m
18 | memory: 512Mi
19 | upscalingMode: Default
20 | enableInTreeAutoscaling: false
21 | headGroupSpec:
22 | enableIngress: false
23 | rayStartParams:
24 | block: 'true'
25 | dashboard-host: 0.0.0.0
26 | num-gpus: '0'
27 | resources: '"{}"'
28 | serviceType: ClusterIP
29 | template:
30 | spec:
31 | containers:
32 | - image: "${image}"
33 | imagePullPolicy: IfNotPresent
34 | lifecycle:
35 | preStop:
36 | exec:
37 | command:
38 | - /bin/sh
39 | - -c
40 | - ray stop
41 | name: ray-head
42 | ports:
43 | - containerPort: 6379
44 | name: gcs
45 | - containerPort: 8265
46 | name: dashboard
47 | - containerPort: 10001
48 | name: client
49 | resources:
50 | limits:
51 | cpu: 2
52 | memory: 8G
53 | requests:
54 | cpu: 2
55 | memory: 8G
56 | volumeMounts:
57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
58 | name: odh-trusted-ca-cert
59 | subPath: odh-trusted-ca-bundle.crt
60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
61 | name: odh-trusted-ca-cert
62 | subPath: odh-trusted-ca-bundle.crt
63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
64 | name: odh-ca-cert
65 | subPath: odh-ca-bundle.crt
66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
67 | name: odh-ca-cert
68 | subPath: odh-ca-bundle.crt
69 | imagePullSecrets: []
70 | volumes:
71 | - configMap:
72 | items:
73 | - key: ca-bundle.crt
74 | path: odh-trusted-ca-bundle.crt
75 | name: odh-trusted-ca-bundle
76 | optional: true
77 | name: odh-trusted-ca-cert
78 | - configMap:
79 | items:
80 | - key: odh-ca-bundle.crt
81 | path: odh-ca-bundle.crt
82 | name: odh-trusted-ca-bundle
83 | optional: true
84 | name: odh-ca-cert
85 | rayVersion: 2.52.1
86 | workerGroupSpecs:
87 | - groupName: small-group-test-cluster-a
88 | maxReplicas: 1
89 | minReplicas: 1
90 | rayStartParams:
91 | block: 'true'
92 | num-gpus: '0'
93 | resources: '"{}"'
94 | replicas: 1
95 | template:
96 | metadata:
97 | annotations:
98 | key: value
99 | labels:
100 | key: value
101 | spec:
102 | containers:
103 | - image: "${image}"
104 | lifecycle:
105 | preStop:
106 | exec:
107 | command:
108 | - /bin/sh
109 | - -c
110 | - ray stop
111 | name: machine-learning
112 | resources:
113 | limits:
114 | cpu: 1
115 | memory: 2G
116 | requests:
117 | cpu: 1
118 | memory: 2G
119 | volumeMounts:
120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
121 | name: odh-trusted-ca-cert
122 | subPath: odh-trusted-ca-bundle.crt
123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
124 | name: odh-trusted-ca-cert
125 | subPath: odh-trusted-ca-bundle.crt
126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
127 | name: odh-ca-cert
128 | subPath: odh-ca-bundle.crt
129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
130 | name: odh-ca-cert
131 | subPath: odh-ca-bundle.crt
132 | imagePullSecrets: []
133 | volumes:
134 | - configMap:
135 | items:
136 | - key: ca-bundle.crt
137 | path: odh-trusted-ca-bundle.crt
138 | name: odh-trusted-ca-bundle
139 | optional: true
140 | name: odh-trusted-ca-cert
141 | - configMap:
142 | items:
143 | - key: odh-ca-bundle.crt
144 | path: odh-ca-bundle.crt
145 | name: odh-trusted-ca-bundle
146 | optional: true
147 | name: odh-ca-cert
148 |
--------------------------------------------------------------------------------