├── .gitattributes ├── .github ├── build │ ├── Containerfile │ └── README.md ├── dependabot.yml ├── resources │ ├── minio_remote_config_cell.json │ └── wait_for_job_cell.json └── workflows │ ├── additional_demo_notebook_tests.yaml │ ├── coverage-badge.yaml │ ├── dependabot-labeler.yaml │ ├── e2e_tests.yaml │ ├── guided_notebook_tests.yaml │ ├── odh-notebooks-sync.yml │ ├── pre-commit.yaml │ ├── publish-documentation.yaml │ ├── release.yaml │ ├── snyk-security.yaml │ ├── ui_notebooks_test.yaml │ └── unit-tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── OWNERS ├── README.md ├── assets └── images │ └── sdk-diagram.png ├── coverage.svg ├── demo-notebooks ├── additional-demos │ ├── hf_interactive.ipynb │ ├── local_interactive.ipynb │ ├── mnist.py │ ├── ray_job_client.ipynb │ ├── remote_ray_job_client.ipynb │ └── requirements.txt └── guided-demos │ ├── 0_basic_ray.ipynb │ ├── 1_cluster_job_client.ipynb │ ├── 2_basic_interactive.ipynb │ ├── 3_widget_example.ipynb │ ├── download_mnist_datasets.py │ ├── mnist.py │ ├── mnist_disconnected.py │ ├── mnist_fashion.py │ ├── notebook-ex-outputs │ ├── 0_basic_ray.ipynb │ ├── 1_cluster_job_client.ipynb │ ├── 2_basic_interactive.ipynb │ ├── interactivetest.yaml │ ├── jobtest.yaml │ ├── mnist.py │ ├── raytest.yaml │ └── requirements.txt │ ├── preview_nbs │ ├── 0_basic_ray.ipynb │ ├── 1_cluster_job_client.ipynb │ ├── 2_basic_interactive.ipynb │ ├── mnist.py │ └── requirements.txt │ └── requirements.txt ├── docs ├── designs │ ├── CodeFlare-SDK-design-doc.md │ └── History │ │ └── CodeFlareSDK_Design_Doc.md ├── generate-documentation.md ├── images │ ├── codeflare_sdk.png │ └── codeflare_stack_arch.png └── sphinx │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── make.bat │ └── user-docs │ ├── authentication.rst │ ├── cluster-configuration.rst │ ├── e2e.rst │ ├── images │ ├── ui-buttons.png │ └── ui-view-clusters.png │ ├── ray-cluster-interaction.rst │ ├── s3-compatible-storage.rst │ ├── 
setup-kueue.rst │ └── ui-widgets.rst ├── poetry.lock ├── pyproject.toml ├── src └── codeflare_sdk │ ├── __init__.py │ ├── common │ ├── __init__.py │ ├── kubernetes_cluster │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── kube_api_helpers.py │ │ └── test_auth.py │ ├── kueue │ │ ├── __init__.py │ │ ├── kueue.py │ │ └── test_kueue.py │ ├── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── demos.py │ │ ├── generate_cert.py │ │ ├── test_generate_cert.py │ │ └── unit_test_support.py │ └── widgets │ │ ├── __init__.py │ │ ├── test_widgets.py │ │ └── widgets.py │ └── ray │ ├── __init__.py │ ├── appwrapper │ ├── __init__.py │ ├── awload.py │ ├── status.py │ ├── test_awload.py │ └── test_status.py │ ├── client │ ├── __init__.py │ ├── ray_jobs.py │ └── test_ray_jobs.py │ └── cluster │ ├── __init__.py │ ├── build_ray_cluster.py │ ├── cluster.py │ ├── config.py │ ├── pretty_print.py │ ├── status.py │ ├── test_build_ray_cluster.py │ ├── test_cluster.py │ ├── test_config.py │ ├── test_pretty_print.py │ └── test_status.py ├── target_users.md ├── tests ├── __init__.py ├── auth-test.crt ├── e2e │ ├── cluster_apply_kind_test.py │ ├── heterogeneous_clusters_kind_test.py │ ├── heterogeneous_clusters_oauth_test.py │ ├── install-codeflare-sdk.sh │ ├── local_interactive_sdk_kind_test.py │ ├── local_interactive_sdk_oauth_test.py │ ├── minio_deployment.yaml │ ├── mnist.py │ ├── mnist_pip_requirements.txt │ ├── mnist_raycluster_sdk_aw_kind_test.py │ ├── mnist_raycluster_sdk_kind_test.py │ ├── mnist_raycluster_sdk_oauth_test.py │ ├── mnist_rayjob.py │ ├── mnist_sleep.py │ ├── start_ray_cluster.py │ └── support.py ├── test_cluster_yamls │ ├── appwrapper │ │ ├── test-case-bad.yaml │ │ └── unit-test-all-params.yaml │ ├── kueue │ │ ├── aw_kueue.yaml │ │ └── ray_cluster_kueue.yaml │ ├── ray │ │ ├── default-appwrapper.yaml │ │ ├── default-ray-cluster.yaml │ │ └── unit-test-all-params.yaml │ └── support_clusters │ │ ├── test-aw-a.yaml │ │ ├── test-aw-b.yaml │ │ ├── test-rc-a.yaml │ │ └── 
test-rc-b.yaml └── upgrade │ ├── __init__.py │ ├── raycluster_sdk_upgrade_sleep_test.py │ └── raycluster_sdk_upgrade_test.py └── ui-tests ├── .yarnrc ├── jupyter_server_config.py ├── package.json ├── playwright.config.js ├── tests ├── widget_notebook_example.test.ts └── widget_notebook_example.test.ts-snapshots │ ├── widgets-cell-0-linux.png │ ├── widgets-cell-2-linux.png │ ├── widgets-cell-3-linux.png │ ├── widgets-cell-4-linux.png │ └── widgets-cell-5-linux.png └── yarn.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-detectable=false 2 | -------------------------------------------------------------------------------- /.github/build/Containerfile: -------------------------------------------------------------------------------- 1 | FROM registry.redhat.io/ubi9/python-39:latest 2 | 3 | LABEL summary="Toolchain for running pre-commit hooks." \ 4 | description="Toolchain for running pre-commit hooks" \ 5 | io.k8s.display-name="Pre-Commit Toolchain" 6 | 7 | USER root 8 | RUN dnf install nodejs -y && \ 9 | dnf clean all && \ 10 | rm -rf /var/cache/dnf 11 | ADD https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz $TMPDIR/ 12 | RUN tar -C /usr/local/bin -xvf $TMPDIR/oc.tar.gz && \ 13 | chmod +x /usr/local/bin/oc && \ 14 | rm $TMPDIR/oc.tar.gz 15 | USER $USERID 16 | 17 | RUN pip3 install poetry && \ 18 | poetry config virtualenvs.create false 19 | COPY pyproject.toml ./ 20 | RUN poetry install 21 | 22 | CMD bash 23 | -------------------------------------------------------------------------------- /.github/build/README.md: -------------------------------------------------------------------------------- 1 | # Pre-Commit Build Artifacts 2 | 3 | This directory contains the artifacts required to build the codeflare-sdk pre-commit image. 4 | 5 | To build the image run `podman build -f .github/build/Containerfile .` from the root directory. 
6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 3 | 4 | version: 2 5 | updates: 6 | # This is to update requirements.txt files in the guided-demos, and e2e directories. 7 | - package-ecosystem: "pip" 8 | directories: 9 | - "**/demo-notebooks/guided-demos*" 10 | - "/tests/e2e" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "*" 15 | update-types: ["version-update:semver-patch"] 16 | open-pull-requests-limit: 1 17 | labels: 18 | - "dependabot" 19 | - "test-guided-notebooks" 20 | 21 | # pip means poetry in this case, this keeps poetry.lock up to date with constraints in pyproject.toml. 22 | - package-ecosystem: "pip" 23 | directory: "/" 24 | schedule: 25 | interval: "daily" 26 | ignore: 27 | - dependency-name: "*" 28 | update-types: ["version-update:semver-patch"] 29 | open-pull-requests-limit: 1 30 | labels: 31 | - "dependabot" 32 | - "test-guided-notebooks" 33 | 34 | # npm means yarn in this case, this keeps yarn.lock up to date with constraints in package.json. 
35 | - package-ecosystem: "npm" 36 | directory: "/ui-tests" 37 | schedule: 38 | interval: "daily" 39 | ignore: 40 | - dependency-name: "*" 41 | update-types: ["version-update:semver-patch"] 42 | open-pull-requests-limit: 1 43 | labels: 44 | - "dependabot" 45 | - "test-ui-notebooks" 46 | -------------------------------------------------------------------------------- /.github/resources/minio_remote_config_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "@ray.remote\n", 8 | "def get_minio_run_config():\n", 9 | " import s3fs\n", 10 | " import pyarrow\n", 11 | " s3_fs = s3fs.S3FileSystem(\n", 12 | " key = \"minio\",\n", 13 | " secret = \"minio123\",\n", 14 | " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n", 15 | " )\n", 16 | " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n", 17 | " run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n", 18 | " return run_config" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /.github/resources/wait_for_job_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "from time import sleep\n", 8 | "\n", 9 | "finished = False\n", 10 | "while not finished:\n", 11 | " sleep(5)\n", 12 | " status = client.get_job_status(submission_id)\n", 13 | " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n", 14 | " print(status)\n", 15 | "print(\"Job status \" + status)\n", 16 | "print(\"Logs: \")\n", 17 | "print(client.get_job_logs(submission_id))\n", 18 | "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\"" 19 | ] 20 | } 21 | 
-------------------------------------------------------------------------------- /.github/workflows/coverage-badge.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will generate and push an updated coverage badge 2 | 3 | name: Coverage Badge 4 | 5 | on: 6 | push: 7 | branches: [ main ] 8 | 9 | jobs: 10 | report: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 3.11 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.11 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install poetry 24 | poetry config virtualenvs.create false 25 | poetry lock 26 | poetry install --with test 27 | - name: Generate coverage report 28 | run: | 29 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py" -m pytest 30 | 31 | - name: Coverage Badge 32 | uses: tj-actions/coverage-badge-py@v2 33 | 34 | - name: Verify Changed files 35 | uses: tj-actions/verify-changed-files@v18 36 | id: changed_files 37 | with: 38 | files: coverage.svg 39 | 40 | - name: Commit files 41 | if: steps.changed_files.outputs.files_changed == 'true' 42 | run: | 43 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 44 | git config --local user.name "github-actions[bot]" 45 | git add coverage.svg 46 | git commit -m "Updated coverage.svg" 47 | 48 | - name: Create Pull Request 49 | if: steps.changed_files.outputs.files_changed == 'true' 50 | uses: peter-evans/create-pull-request@v4 51 | with: 52 | token: ${{ secrets.GITHUB_TOKEN }} 53 | title: "[Automatic] Coverage Badge Update" 54 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-labeler.yaml: -------------------------------------------------------------------------------- 1 | # This workflow file adds the 'lgtm' and 'approved' labels to Dependabot PRs 2 | # This is 
done to ensure that the PRs that pass required status checks are automatically merged by the CodeFlare bot 3 | name: Dependabot Labeler 4 | 5 | on: 6 | pull_request_target: 7 | branches: [ main ] 8 | 9 | jobs: 10 | add-approve-lgtm-label: 11 | if: ${{ github.actor == 'dependabot[bot]' && contains(github.event.pull_request.labels.*.name, 'dependabot') }} 12 | runs-on: ubuntu-latest 13 | 14 | # Permission required to edit a PR 15 | permissions: 16 | pull-requests: write 17 | issues: write 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v4 22 | 23 | - name: Add approve and lgtm labels to Dependabot PR 24 | run: | 25 | gh pr edit ${{ github.event.pull_request.number }} --add-label "lgtm" --add-label "approved" 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GH_CLI_TOKEN }} 28 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | name: Pre-commit 2 | on: 3 | pull_request: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | precommit: 8 | runs-on: ubuntu-latest 9 | container: 10 | image: quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.1 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Run pre-commit checks 15 | run: pre-commit run --all-files 16 | -------------------------------------------------------------------------------- /.github/workflows/publish-documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | codeflare_sdk_release_version: 7 | type: string 8 | required: true 9 | description: 'Version number (for example: 0.1.0)' 10 | 11 | permissions: 12 | contents: write 13 | 14 | jobs: 15 | docs: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Install Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: 3.11 23 | 
- name: Install Sphinx 24 | run: | 25 | sudo apt-get update 26 | sudo apt-get install python3-sphinx 27 | - name: Install Poetry 28 | uses: abatilo/actions-poetry@v2 29 | with: 30 | poetry-version: 1.8.3 31 | - name: Create new documentation 32 | run: | 33 | python3 -m venv .venv 34 | source .venv/bin/activate 35 | poetry install --with docs 36 | sed -i 's/release = "v[0-9]\+\.[0-9]\+\.[0-9]\+"/release = "${{ github.event.inputs.codeflare_sdk_release_version }}"/' docs/sphinx/conf.py 37 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generate docs but ignore test files 38 | make html -C docs/sphinx 39 | - name: Deploy to GitHub Pages 40 | uses: peaceiris/actions-gh-pages@v3 41 | with: 42 | publish_branch: gh-pages 43 | github_token: ${{ secrets.GITHUB_TOKEN }} 44 | publish_dir: docs/sphinx/_build/html 45 | force_orphan: true 46 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release-version: 7 | type: string 8 | required: true 9 | description: 'Version number (for example: 0.1.0)' 10 | is-stable: 11 | description: 'Select if the built image should be tagged as stable' 12 | required: true 13 | type: boolean 14 | quay-organization: 15 | description: 'Quay organization used to push the built images to' 16 | required: true 17 | default: 'project-codeflare' 18 | python_version: 19 | type: string 20 | default: "3.11" 21 | required: true 22 | poetry_version: 23 | type: string 24 | default: "1.8.3" 25 | required: true 26 | codeflare-repository-organization: 27 | type: string 28 | default: "project-codeflare" 29 | 30 | env: 31 | PR_BRANCH_NAME: snyk-tag-monitoring-${{ github.run_id }} 32 | 33 | jobs: 34 | release: 35 | runs-on: ubuntu-latest 36 | permissions: 37 | contents: write 38 | id-token: write # This permission is required for 
trusted publishing 39 | pull-requests: write # This permission is required for creating PRs 40 | actions: write # This permission is required for running actions 41 | steps: 42 | - name: Checkout the repository 43 | uses: actions/checkout@v4 44 | with: 45 | token: ${{ secrets.GH_CLI_TOKEN }} 46 | - name: Install Python 47 | uses: actions/setup-python@v5 48 | with: 49 | python-version: ${{ github.event.inputs.python_version }} 50 | - name: Install Poetry 51 | uses: abatilo/actions-poetry@v2 52 | with: 53 | poetry-version: ${{ github.event.inputs.poetry_version }} 54 | - name: Change version in pyproject.toml 55 | run: poetry version "${{ github.event.inputs.release-version }}" 56 | - name: Run poetry install 57 | run: poetry install --with docs 58 | - name: Create new documentation 59 | run: | 60 | gh workflow run publish-documentation.yaml \ 61 | --repo ${{ github.event.inputs.codeflare-repository-organization }}/codeflare-sdk \ 62 | --ref ${{ github.ref }} \ 63 | --field codeflare_sdk_release_version=${{ github.event.inputs.release-version }} 64 | env: 65 | GITHUB_TOKEN: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} 66 | - name: Copy demo notebooks into SDK package 67 | run: cp -r demo-notebooks src/codeflare_sdk/demo-notebooks 68 | - name: Run poetry build 69 | run: poetry build 70 | env: 71 | GITHUB_TOKEN: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} 72 | - name: Create Github release 73 | uses: ncipollo/release-action@v1 74 | with: 75 | tag: "v${{ github.event.inputs.release-version }}" 76 | generateReleaseNotes: true 77 | - name: Publish package distributions to PyPI 78 | uses: pypa/gh-action-pypi-publish@release/v1 79 | 80 | - name: Sync ODH Notebooks 81 | run: | 82 | gh workflow run odh-notebooks-sync.yml \ 83 | --repo ${{ github.event.inputs.codeflare-repository-organization }}/codeflare-sdk \ 84 | --ref ${{ github.ref }} \ 85 | --field upstream-repository-organization=opendatahub-io \ 86 | --field codeflare-repository-organization=${{ 
github.event.inputs.codeflare-repository-organization }} \ 87 | --field codeflare_sdk_release_version=${{ github.event.inputs.release-version }} 88 | env: 89 | GITHUB_TOKEN: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} 90 | shell: bash 91 | 92 | - name: Install Snyk CLI and setup monitoring for new release tag 93 | env: 94 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 95 | SNYK_ORG: ${{ secrets.SNYK_ORG }} 96 | run: | 97 | echo "Installing Snyk CLI" 98 | npm install -g snyk 99 | 100 | echo "Fetching tags" 101 | git fetch origin 'refs/tags/*:refs/tags/*' 102 | 103 | echo "Authenticating with Snyk" 104 | snyk auth ${SNYK_TOKEN} 105 | 106 | echo "Scanning project: codeflare-sdk/v${{ github.event.inputs.release-version }}" 107 | git checkout v${{ github.event.inputs.release-version }} 108 | snyk monitor --all-projects --exclude=requirements.txt --org=${SNYK_ORG} --target-reference="$(git describe --tags)" 109 | -------------------------------------------------------------------------------- /.github/workflows/snyk-security.yaml: -------------------------------------------------------------------------------- 1 | name: Snyk Security 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | snyk-scan: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | - name: Install Snyk CLI 15 | run: npm install -g snyk 16 | 17 | - name: Snyk Monitor and Test multiple projects 18 | env: 19 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 20 | SNYK_ORG: ${{ secrets.SNYK_ORG }} 21 | run: | 22 | echo "Fetching tags" 23 | git fetch origin 'refs/tags/*:refs/tags/*' 24 | 25 | echo "Authenticating with Snyk" 26 | snyk auth ${SNYK_TOKEN} 27 | 28 | echo "Scanning project: codeflare-sdk/main" 29 | snyk monitor --all-projects --exclude=requirements.txt --org=${SNYK_ORG} --target-reference="main" 30 | -------------------------------------------------------------------------------- /.github/workflows/ui_notebooks_test.yaml: 
-------------------------------------------------------------------------------- 1 | name: UI notebooks tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | types: [ labeled ] 7 | 8 | concurrency: 9 | group: ${{ github.head_ref }}-${{ github.workflow }} 10 | cancel-in-progress: true 11 | 12 | env: 13 | CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" 14 | 15 | jobs: 16 | verify-3_widget_example: 17 | if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }} 18 | runs-on: ubuntu-latest-4core 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | with: 24 | submodules: recursive 25 | 26 | - name: Checkout common repo code 27 | uses: actions/checkout@v4 28 | with: 29 | repository: "project-codeflare/codeflare-common" 30 | ref: "main" 31 | path: "common" 32 | 33 | - name: Checkout CodeFlare operator repository 34 | uses: actions/checkout@v4 35 | with: 36 | repository: project-codeflare/codeflare-operator 37 | path: codeflare-operator 38 | 39 | - name: Set Go 40 | uses: actions/setup-go@v5 41 | with: 42 | go-version-file: "./codeflare-operator/go.mod" 43 | cache-dependency-path: "./codeflare-operator/go.sum" 44 | 45 | - name: Set up gotestfmt 46 | uses: gotesttools/gotestfmt-action@v2 47 | with: 48 | token: ${{ secrets.GITHUB_TOKEN }} 49 | 50 | - name: Set up specific Python version 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: "3.11" 54 | cache: "pip" # caching pip dependencies 55 | 56 | - name: Setup and start KinD cluster 57 | uses: ./common/github-actions/kind 58 | 59 | - name: Deploy CodeFlare stack 60 | id: deploy 61 | run: | 62 | cd codeflare-operator 63 | echo Setting up CodeFlare stack 64 | make setup-e2e 65 | echo Deploying CodeFlare operator 66 | make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" 67 | kubectl wait --timeout=120s --for=condition=Available=true deployment 
-n openshift-operators codeflare-operator-manager 68 | cd .. 69 | 70 | - name: Setup Guided notebooks execution 71 | run: | 72 | echo "Installing papermill and dependencies..." 73 | pip install poetry ipython ipykernel 74 | poetry config virtualenvs.create false 75 | echo "Installing SDK..." 76 | poetry install --with test,docs 77 | 78 | - name: Install Yarn dependencies 79 | run: | 80 | poetry run yarn install 81 | poetry run yarn playwright install chromium 82 | working-directory: ui-tests 83 | 84 | - name: Fix 3_widget_example.ipynb notebook for test 85 | run: | 86 | # Remove login/logout cells, as KinD doesn't support authentication using token 87 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 88 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 89 | # Set explicit namespace as SDK need it (currently) to resolve local queues 90 | sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default',|" 3_widget_example.ipynb 91 | sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb 92 | working-directory: demo-notebooks/guided-demos 93 | 94 | - name: Run UI notebook tests 95 | run: | 96 | set -euo pipefail 97 | 98 | poetry run yarn test 99 | working-directory: ui-tests 100 | 101 | - name: Upload Playwright Test assets 102 | if: always() 103 | uses: actions/upload-artifact@v4 104 | with: 105 | name: ipywidgets-test-assets 106 | path: | 107 | ui-tests/test-results 108 | 109 | - name: Upload Playwright Test report 110 | if: always() 111 | uses: actions/upload-artifact@v4 112 | with: 113 | name: ipywidgets-test-report 114 | path: | 115 | ui-tests/playwright-report 116 | 
-------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | push: 7 | branches: [ main ] 8 | 9 | jobs: 10 | unit-tests: 11 | 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.11' 20 | - name: Install poetry 21 | run: pip install poetry 22 | - name: Install dependencies with poetry 23 | run: | 24 | poetry config virtualenvs.create false 25 | poetry lock 26 | poetry install --with test 27 | - name: Test with pytest and check coverage 28 | run: | 29 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py" -m pytest 30 | coverage=$(coverage report -m | tail -1 | tail -c 4 | head -c 2) 31 | if (( $coverage < 90 )); then echo "Coverage failed at ${coverage}%"; exit 1; else echo "Coverage passed, ${coverage}%"; fi 32 | - name: Upload to Codecov 33 | uses: codecov/codecov-action@v4 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | .python-version 3 | __pycache__/ 4 | .coverage 5 | Pipfile 6 | Pipfile.lock 7 | .venv* 8 | build/ 9 | tls-cluster-namespace 10 | quicktest.yaml 11 | node_modules 12 | .DS_Store 13 | ui-tests/playwright-report 14 | ui-tests/test-results 15 | /src/codeflare_sdk.egg-info/ 16 | docs/sphinx/_build 17 | docs/sphinx/codeflare_sdk.*.rst 18 | docs/sphinx/codeflare_sdk.rst 19 | docs/sphinx/modules.rst 20 | .idea/ 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: 
-------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | args: [--allow-multiple-documents] 11 | - id: check-added-large-files 12 | - repo: https://github.com/psf/black 13 | rev: 23.3.0 14 | hooks: 15 | - id: black 16 | language_version: python3.9 17 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the CodeFlare SDK 2 | 3 | Thank you for your interest in contributing to the CodeFlare SDK! 4 | 5 | ## Getting Started 6 | 7 | ### Prerequisites 8 | 9 | - Python 3.11 10 | - [Poetry](https://python-poetry.org/) 11 | 12 | ### Setting Up Your Development Environment 13 | 14 | 1. **Clone the repository:** 15 | 16 | ```sh 17 | git clone https://github.com/project-codeflare/codeflare-sdk.git 18 | cd codeflare-sdk 19 | ``` 20 | 21 | 2. Create a Poetry virtual environment: 22 | 23 | ```sh 24 | poetry shell 25 | ``` 26 | 27 | 3. Install dependencies: 28 | 29 | ```sh 30 | poetry install 31 | ``` 32 | 33 | - To include test dependencies, run: 34 | 35 | ```sh 36 | poetry install --with test 37 | ``` 38 | 39 | - To include docs dependencies, run: 40 | 41 | ```sh 42 | poetry install --with docs 43 | ``` 44 | 45 | - To include both test and docs dependencies, run: 46 | 47 | ```sh 48 | poetry install --with test,docs 49 | ``` 50 | 51 | ## Development Workflow 52 | 53 | ### Pre-commit 54 | 55 | We use pre-commit to ensure consistent code formatting. 
To enable pre-commit hooks, run: 56 | 57 | ```sh 58 | pre-commit install 59 | ``` 60 | 61 | ## Testing 62 | 63 | To install CodeFlare SDK in editable mode, run: 64 | 65 | ```sh 66 | pip install -e . 67 | ``` 68 | 69 | ### Unit Testing 70 | 71 | To run the unit tests, execute: 72 | 73 | ```sh 74 | pytest -v src/codeflare_sdk 75 | ``` 76 | 77 | ### Local e2e Testing 78 | 79 | - Please follow the [e2e documentation](https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/sphinx/user-docs/e2e.rst) 80 | 81 | #### Code Coverage 82 | 83 | - Run tests with the following command: `coverage run -m pytest` 84 | - To then view a code coverage report w/ missing lines, run `coverage report -m` 85 | 86 | ### Code Formatting 87 | 88 | - To check file formatting, in top-level dir run `black --check .` 89 | - To auto-reformat all files, remove the `--check` flag 90 | - To reformat an individual file, run `black ` 91 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - astefanutti 3 | - Bobbins228 4 | - chipspeak 5 | - ChristianZaccaria 6 | - dimakis 7 | - Fiona-Waters 8 | - franciscojavierarceo 9 | - kpostoffice 10 | - kryanbeane 11 | - laurafitzgerald 12 | - pawelpaszki 13 | - pmccarthy 14 | - szaher 15 | - varshaprasad96 16 | reviewers: 17 | - astefanutti 18 | - Bobbins228 19 | - chipspeak 20 | - ChristianZaccaria 21 | - dimakis 22 | - Fiona-Waters 23 | - franciscojavierarceo 24 | - kpostoffice 25 | - kryanbeane 26 | - laurafitzgerald 27 | - pawelpaszki 28 | - pmccarthy 29 | - szaher 30 | - varshaprasad96 31 | - Ygnas 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeFlare SDK 2 | 3 | [![Python 
application](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml) 4 | ![coverage badge](./coverage.svg) 5 | 6 | An intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem. 7 | 8 | For guided demos and basics walkthroughs, check out the following links: 9 | 10 | - Guided demo notebooks available [here](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos), and copies of the notebooks with [expected output](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos/notebook-ex-outputs) also available 11 | - these demos can be copied into your current working directory when using the `codeflare-sdk` by using the `codeflare_sdk.copy_demo_nbs()` function 12 | - Additionally, we have a [video walkthrough](https://www.youtube.com/watch?v=U76iIfd9EmE) of these basic demos from June, 2023 13 | 14 | Full documentation can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html) 15 | 16 | ## Installation 17 | 18 | Can be installed via `pip`: `pip install codeflare-sdk` 19 | 20 | ## Development 21 | 22 | Please see our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed instructions. 23 | 24 | ## Release Instructions 25 | 26 | ### Automated Releases 27 | 28 | It is possible to use the Release Github workflow to do the release. This is generally the process we follow for releases 29 | 30 | ### Manual Releases 31 | 32 | The following instructions apply when doing release manually. This may be required in instances where the automation is failing. 33 | 34 | - Check and update the version in "pyproject.toml" file. 35 | - Commit all the changes to the repository. 
36 | - Create Github release (). 37 | - Build the Python package. `poetry build` 38 | - If not present already, add the API token to Poetry. 39 | `poetry config pypi-token.pypi API_TOKEN` 40 | - Publish the Python package. `poetry publish` 41 | - Trigger the [Publish Documentation](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/publish-documentation.yaml) workflow 42 | -------------------------------------------------------------------------------- /assets/images/sdk-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/assets/images/sdk-diagram.png -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 92% 19 | 92% 20 | 21 | 22 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/remote_ray_job_client.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Submit a training job remotely to Ray Dashboard protected by oAuth.\n", 8 | "This notebook will demonstrate how to submit Ray jobs to an existing Raycluster, using the CodeFlare SDK.\n", 9 | "\n", 10 | "### Requirements\n", 11 | "* Ray Cluster running in OpenShift protected by oAuth.\n", 12 | "* The Ray Dashboard URL for the Ray Cluster.\n", 13 | "* An OpenShift authorization token with permissions to access the Route.\n", 14 | "* A training job, defined in python, within the working directory.\n", 15 | "* A requirements.txt or equivalent file containing any additional packages to install onto the Ray images." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Import dependencies from codeflare-sdk\n", 25 | "from codeflare_sdk import RayJobClient" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Setup Authentication Configuration \n", 35 | "auth_token = \"XXXX\" # Replace with the actual token\n", 36 | "header = {\n", 37 | " 'Authorization': f'Bearer {auth_token}'\n", 38 | "}" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Gather the dashboard URL (provided by the creator of the RayCluster)\n", 48 | "ray_dashboard = \"XXXX\" # Replace with the Ray dashboard URL" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "#Initialize the RayJobClient\n", 58 | "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Submit a job using the RayJobClient\n", 68 | "entrypoint_command = \"python XXXX\" # Replace with the training script name\n", 69 | "submission_id = client.submit_job(\n", 70 | " entrypoint=entrypoint_command,\n", 71 | " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", 72 | ")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# Get the job's status\n", 82 | "client.get_job_status(submission_id)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Get the job's logs\n", 92 | "client.get_job_logs(submission_id)" 93 | ] 94 | } 95 | ], 96 | 
"metadata": { 97 | "language_info": { 98 | "name": "python" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 2 103 | } 104 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.9.5 2 | ray_lightning 3 | torchmetrics==0.9.1 4 | torchvision==0.19.0 5 | minio 6 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/3_widget_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8d4a42f6", 6 | "metadata": {}, 7 | "source": [ 8 | "In this notebook, we will go through the basics of using the SDK to:\n", 9 | " - Spin up a Ray cluster with our desired resources\n", 10 | " - View the status and specs of our Ray cluster\n", 11 | " - Take down the Ray cluster when finished" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# Import pieces from codeflare-sdk\n", 22 | "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, view_clusters" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "614daa0c", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Create authentication object for user permissions\n", 33 | "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", 34 | "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", 35 | "auth = TokenAuthentication(\n", 36 | " token = \"XXXXX\",\n", 37 | " server = \"XXXXX\",\n", 38 | " skip_tls=False\n", 39 | ")\n", 40 | "auth.login()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": 
"bc27f84c", 46 | "metadata": {}, 47 | "source": [ 48 | "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n", 49 | "\n", 50 | "NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:\n", 51 | "\n", 52 | "- For Python 3.11: 'quay.io/modh/ray:2.47.1-py311-cu121'\n", 53 | "\n", 54 | "If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "0f4bc870-091f-4e11-9642-cba145710159", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Create and configure our cluster object\n", 65 | "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", 66 | "cluster = Cluster(ClusterConfiguration(\n", 67 | " name='widgettest',\n", 68 | " head_cpu_requests='500m',\n", 69 | " head_cpu_limits='500m',\n", 70 | " head_memory_requests=2,\n", 71 | " head_memory_limits=2,\n", 72 | " head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", 73 | " worker_extended_resource_requests={'nvidia.com/gpu':0},\n", 74 | " num_workers=2,\n", 75 | " worker_cpu_requests='250m',\n", 76 | " worker_cpu_limits=1,\n", 77 | " worker_memory_requests=2,\n", 78 | " worker_memory_limits=2,\n", 79 | " # image=\"\", # Optional Field\n", 80 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n", 81 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", 82 | "))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 
| "id": "3de6403c", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "view_clusters()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "2d8e6ce3", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "cluster.status()" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3 (ipykernel)", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.9.18" 123 | }, 124 | "vscode": { 125 | "interpreter": { 126 | "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" 127 | } 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 5 132 | } 133 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/download_mnist_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | from torchvision.datasets import MNIST 17 | from torchvision import transforms 18 | 19 | 20 | def download_mnist_dataset(destination_dir): 21 | # Ensure the destination directory exists 22 | if not os.path.exists(destination_dir): 23 | os.makedirs(destination_dir) 24 | 25 | # Define transformations 26 | transform = transforms.Compose( 27 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 28 | ) 29 | 30 | # Download the training data 31 | train_set = MNIST( 32 | root=destination_dir, train=True, download=True, transform=transform 33 | ) 34 | 35 | # Download the test data 36 | test_set = MNIST( 37 | root=destination_dir, train=False, download=True, transform=transform 38 | ) 39 | 40 | print(f"MNIST dataset downloaded in {destination_dir}") 41 | 42 | 43 | # Specify the directory where you 44 | destination_dir = os.path.dirname(os.path.abspath(__file__)) 45 | 46 | download_mnist_dataset(destination_dir) 47 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # In[] 16 | import os 17 | 18 | import torch 19 | from pytorch_lightning import LightningModule, Trainer 20 | from pytorch_lightning.callbacks.progress import TQDMProgressBar 21 | from pytorch_lightning.loggers import CSVLogger 22 | from torch import nn 23 | from torch.nn import functional as F 24 | from torch.utils.data import DataLoader, random_split 25 | from torchmetrics import Accuracy 26 | from torchvision import transforms 27 | from torchvision.datasets import MNIST 28 | 29 | PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") 30 | BATCH_SIZE = 256 if torch.cuda.is_available() else 64 31 | # %% 32 | 33 | print("prior to running the trainer") 34 | print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) 35 | print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) 36 | 37 | 38 | class LitMNIST(LightningModule): 39 | def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): 40 | super().__init__() 41 | 42 | # Set our init args as class attributes 43 | self.data_dir = data_dir 44 | self.hidden_size = hidden_size 45 | self.learning_rate = learning_rate 46 | 47 | # Hardcode some dataset specific attributes 48 | self.num_classes = 10 49 | self.dims = (1, 28, 28) 50 | channels, width, height = self.dims 51 | self.transform = transforms.Compose( 52 | [ 53 | transforms.ToTensor(), 54 | transforms.Normalize((0.1307,), (0.3081,)), 55 | ] 56 | ) 57 | 58 | # Define PyTorch model 59 | self.model = nn.Sequential( 60 | nn.Flatten(), 61 | nn.Linear(channels * width * height, hidden_size), 62 | nn.ReLU(), 63 | nn.Dropout(0.1), 64 | nn.Linear(hidden_size, hidden_size), 65 | nn.ReLU(), 66 | nn.Dropout(0.1), 67 | nn.Linear(hidden_size, self.num_classes), 68 | ) 69 | 70 | self.val_accuracy = Accuracy() 71 | self.test_accuracy = Accuracy() 72 | 73 | def forward(self, x): 74 | x = self.model(x) 75 | return F.log_softmax(x, dim=1) 76 | 77 | def training_step(self, batch, batch_idx): 78 | x, y = batch 79 | logits = self(x) 80 | loss = F.nll_loss(logits, y) 81 | 
return loss 82 | 83 | def validation_step(self, batch, batch_idx): 84 | x, y = batch 85 | logits = self(x) 86 | loss = F.nll_loss(logits, y) 87 | preds = torch.argmax(logits, dim=1) 88 | self.val_accuracy.update(preds, y) 89 | 90 | # Calling self.log will surface up scalars for you in TensorBoard 91 | self.log("val_loss", loss, prog_bar=True) 92 | self.log("val_acc", self.val_accuracy, prog_bar=True) 93 | 94 | def test_step(self, batch, batch_idx): 95 | x, y = batch 96 | logits = self(x) 97 | loss = F.nll_loss(logits, y) 98 | preds = torch.argmax(logits, dim=1) 99 | self.test_accuracy.update(preds, y) 100 | 101 | # Calling self.log will surface up scalars for you in TensorBoard 102 | self.log("test_loss", loss, prog_bar=True) 103 | self.log("test_acc", self.test_accuracy, prog_bar=True) 104 | 105 | def configure_optimizers(self): 106 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 107 | return optimizer 108 | 109 | #################### 110 | # DATA RELATED HOOKS 111 | #################### 112 | 113 | def prepare_data(self): 114 | # download 115 | print("Downloading MNIST dataset...") 116 | MNIST(self.data_dir, train=True, download=True) 117 | MNIST(self.data_dir, train=False, download=True) 118 | 119 | def setup(self, stage=None): 120 | # Assign train/val datasets for use in dataloaders 121 | if stage == "fit" or stage is None: 122 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 123 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 124 | 125 | # Assign test dataset for use in dataloader(s) 126 | if stage == "test" or stage is None: 127 | self.mnist_test = MNIST( 128 | self.data_dir, train=False, transform=self.transform 129 | ) 130 | 131 | def train_dataloader(self): 132 | return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) 133 | 134 | def val_dataloader(self): 135 | return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) 136 | 137 | def test_dataloader(self): 138 | return 
DataLoader(self.mnist_test, batch_size=BATCH_SIZE) 139 | 140 | 141 | # Init DataLoader from MNIST Dataset 142 | 143 | model = LitMNIST() 144 | 145 | print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) 146 | print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) 147 | 148 | # Initialize a trainer 149 | trainer = Trainer( 150 | accelerator="auto", 151 | # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs 152 | max_epochs=5, 153 | callbacks=[TQDMProgressBar(refresh_rate=20)], 154 | num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), 155 | devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), 156 | strategy="ddp", 157 | ) 158 | 159 | # Train the model ⚡ 160 | trainer.fit(model) 161 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/mnist_fashion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import ray 4 | from torch.utils.data import DataLoader 5 | from torchvision import datasets 6 | from torchvision.transforms import ToTensor 7 | from ray.train.torch import TorchTrainer 8 | from ray.train import ScalingConfig 9 | 10 | 11 | def get_dataset(): 12 | return datasets.FashionMNIST( 13 | root="/tmp/data", 14 | train=True, 15 | download=True, 16 | transform=ToTensor(), 17 | ) 18 | 19 | 20 | class NeuralNetwork(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.flatten = nn.Flatten() 24 | self.linear_relu_stack = nn.Sequential( 25 | nn.Linear(28 * 28, 512), 26 | nn.ReLU(), 27 | nn.Linear(512, 512), 28 | nn.ReLU(), 29 | nn.Linear(512, 10), 30 | ) 31 | 32 | def forward(self, inputs): 33 | inputs = self.flatten(inputs) 34 | logits = self.linear_relu_stack(inputs) 35 | return logits 36 | 37 | 38 | def get_dataset(): 39 | return datasets.FashionMNIST( 40 | root="/tmp/data", 41 | train=True, 42 | download=True, 43 | transform=ToTensor(), 44 | ) 45 | 46 | 47 | def 
train_func_distributed(): 48 | num_epochs = 3 49 | batch_size = 64 50 | 51 | dataset = get_dataset() 52 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 53 | dataloader = ray.train.torch.prepare_data_loader(dataloader) 54 | 55 | model = NeuralNetwork() 56 | model = ray.train.torch.prepare_model(model) 57 | 58 | criterion = nn.CrossEntropyLoss() 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 60 | 61 | for epoch in range(num_epochs): 62 | if ray.train.get_context().get_world_size() > 1: 63 | dataloader.sampler.set_epoch(epoch) 64 | 65 | for inputs, labels in dataloader: 66 | optimizer.zero_grad() 67 | pred = model(inputs) 68 | loss = criterion(pred, labels) 69 | loss.backward() 70 | optimizer.step() 71 | print(f"epoch: {epoch}, loss: {loss.item()}") 72 | 73 | 74 | # For GPU Training, set `use_gpu` to True. 75 | use_gpu = True 76 | 77 | # To learn more about configuring S3 compatible storage check out our docs -> https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/s3-compatible-storage.md 78 | trainer = TorchTrainer( 79 | train_func_distributed, 80 | scaling_config=ScalingConfig( 81 | # num_workers = number of worker nodes with the ray head node included 82 | num_workers=3, 83 | use_gpu=use_gpu, 84 | resources_per_worker={ 85 | "CPU": 1, 86 | }, 87 | trainer_resources={ 88 | "CPU": 0, 89 | }, 90 | ), 91 | ) 92 | 93 | results = trainer.fit() 94 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/notebook-ex-outputs/jobtest.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta1 2 | kind: AppWrapper 3 | metadata: 4 | name: jobtest 5 | namespace: default 6 | spec: 7 | priority: 9 8 | resources: 9 | GenericItems: 10 | - custompodresources: 11 | - limits: 12 | cpu: 2 13 | memory: 8G 14 | nvidia.com/gpu: 0 15 | replicas: 1 16 | requests: 17 | cpu: 2 18 | memory: 8G 19 | nvidia.com/gpu: 0 
20 | - limits: 21 | cpu: 1 22 | memory: 4G 23 | nvidia.com/gpu: 0 24 | replicas: 2 25 | requests: 26 | cpu: 1 27 | memory: 4G 28 | nvidia.com/gpu: 0 29 | generictemplate: 30 | apiVersion: ray.io/v1 31 | kind: RayCluster 32 | metadata: 33 | labels: 34 | appwrapper.mcad.ibm.com: jobtest 35 | controller-tools.k8s.io: '1.0' 36 | name: jobtest 37 | namespace: default 38 | spec: 39 | autoscalerOptions: 40 | idleTimeoutSeconds: 60 41 | imagePullPolicy: Always 42 | resources: 43 | limits: 44 | cpu: 500m 45 | memory: 512Mi 46 | requests: 47 | cpu: 500m 48 | memory: 512Mi 49 | upscalingMode: Default 50 | enableInTreeAutoscaling: false 51 | headGroupSpec: 52 | rayStartParams: 53 | block: 'true' 54 | dashboard-host: 0.0.0.0 55 | num-gpus: '0' 56 | serviceType: ClusterIP 57 | template: 58 | spec: 59 | containers: 60 | - env: 61 | - name: MY_POD_IP 62 | valueFrom: 63 | fieldRef: 64 | fieldPath: status.podIP 65 | - name: RAY_USE_TLS 66 | value: '0' 67 | - name: RAY_TLS_SERVER_CERT 68 | value: /home/ray/workspace/tls/server.crt 69 | - name: RAY_TLS_SERVER_KEY 70 | value: /home/ray/workspace/tls/server.key 71 | - name: RAY_TLS_CA_CERT 72 | value: /home/ray/workspace/tls/ca.crt 73 | image: quay.io/modh/ray:2.47.1-py311-cu121 74 | imagePullPolicy: Always 75 | lifecycle: 76 | preStop: 77 | exec: 78 | command: 79 | - /bin/sh 80 | - -c 81 | - ray stop 82 | name: ray-head 83 | ports: 84 | - containerPort: 6379 85 | name: gcs 86 | - containerPort: 8265 87 | name: dashboard 88 | - containerPort: 10001 89 | name: client 90 | resources: 91 | limits: 92 | cpu: 2 93 | memory: 8G 94 | nvidia.com/gpu: 0 95 | requests: 96 | cpu: 2 97 | memory: 8G 98 | nvidia.com/gpu: 0 99 | imagePullSecrets: [] 100 | rayVersion: 2.47.1 101 | workerGroupSpecs: 102 | - groupName: small-group-jobtest 103 | maxReplicas: 2 104 | minReplicas: 2 105 | rayStartParams: 106 | block: 'true' 107 | num-gpus: '0' 108 | replicas: 2 109 | template: 110 | metadata: 111 | annotations: 112 | key: value 113 | labels: 114 | key: 
value 115 | spec: 116 | containers: 117 | - env: 118 | - name: MY_POD_IP 119 | valueFrom: 120 | fieldRef: 121 | fieldPath: status.podIP 122 | - name: RAY_USE_TLS 123 | value: '0' 124 | - name: RAY_TLS_SERVER_CERT 125 | value: /home/ray/workspace/tls/server.crt 126 | - name: RAY_TLS_SERVER_KEY 127 | value: /home/ray/workspace/tls/server.key 128 | - name: RAY_TLS_CA_CERT 129 | value: /home/ray/workspace/tls/ca.crt 130 | image: quay.io/modh/ray:2.47.1-py311-cu121 131 | lifecycle: 132 | preStop: 133 | exec: 134 | command: 135 | - /bin/sh 136 | - -c 137 | - ray stop 138 | name: machine-learning 139 | resources: 140 | limits: 141 | cpu: 1 142 | memory: 4G 143 | nvidia.com/gpu: 0 144 | requests: 145 | cpu: 1 146 | memory: 4G 147 | nvidia.com/gpu: 0 148 | imagePullSecrets: [] 149 | replicas: 1 150 | - generictemplate: 151 | apiVersion: route.openshift.io/v1 152 | kind: Route 153 | metadata: 154 | labels: 155 | odh-ray-cluster-service: jobtest-head-svc 156 | name: ray-dashboard-jobtest 157 | namespace: default 158 | spec: 159 | port: 160 | targetPort: dashboard 161 | to: 162 | kind: Service 163 | name: jobtest-head-svc 164 | replicas: 1 165 | Items: [] 166 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/notebook-ex-outputs/raytest.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta1 2 | kind: AppWrapper 3 | metadata: 4 | name: raytest 5 | namespace: default 6 | spec: 7 | priority: 9 8 | resources: 9 | GenericItems: 10 | - custompodresources: 11 | - limits: 12 | cpu: 2 13 | memory: 8G 14 | nvidia.com/gpu: 0 15 | replicas: 1 16 | requests: 17 | cpu: 2 18 | memory: 8G 19 | nvidia.com/gpu: 0 20 | - limits: 21 | cpu: 1 22 | memory: 4G 23 | nvidia.com/gpu: 0 24 | replicas: 2 25 | requests: 26 | cpu: 1 27 | memory: 4G 28 | nvidia.com/gpu: 0 29 | generictemplate: 30 | apiVersion: ray.io/v1 31 | kind: RayCluster 32 | metadata: 33 | labels: 
34 | appwrapper.mcad.ibm.com: raytest 35 | controller-tools.k8s.io: '1.0' 36 | name: raytest 37 | namespace: default 38 | spec: 39 | autoscalerOptions: 40 | idleTimeoutSeconds: 60 41 | imagePullPolicy: Always 42 | resources: 43 | limits: 44 | cpu: 500m 45 | memory: 512Mi 46 | requests: 47 | cpu: 500m 48 | memory: 512Mi 49 | upscalingMode: Default 50 | enableInTreeAutoscaling: false 51 | headGroupSpec: 52 | rayStartParams: 53 | block: 'true' 54 | dashboard-host: 0.0.0.0 55 | num-gpus: '0' 56 | serviceType: ClusterIP 57 | template: 58 | spec: 59 | containers: 60 | - env: 61 | - name: MY_POD_IP 62 | valueFrom: 63 | fieldRef: 64 | fieldPath: status.podIP 65 | - name: RAY_USE_TLS 66 | value: '0' 67 | - name: RAY_TLS_SERVER_CERT 68 | value: /home/ray/workspace/tls/server.crt 69 | - name: RAY_TLS_SERVER_KEY 70 | value: /home/ray/workspace/tls/server.key 71 | - name: RAY_TLS_CA_CERT 72 | value: /home/ray/workspace/tls/ca.crt 73 | image: quay.io/modh/ray:2.47.1-py311-cu121 74 | imagePullPolicy: Always 75 | lifecycle: 76 | preStop: 77 | exec: 78 | command: 79 | - /bin/sh 80 | - -c 81 | - ray stop 82 | name: ray-head 83 | ports: 84 | - containerPort: 6379 85 | name: gcs 86 | - containerPort: 8265 87 | name: dashboard 88 | - containerPort: 10001 89 | name: client 90 | resources: 91 | limits: 92 | cpu: 2 93 | memory: 8G 94 | nvidia.com/gpu: 0 95 | requests: 96 | cpu: 2 97 | memory: 8G 98 | nvidia.com/gpu: 0 99 | imagePullSecrets: [] 100 | rayVersion: 2.47.1 101 | workerGroupSpecs: 102 | - groupName: small-group-raytest 103 | maxReplicas: 2 104 | minReplicas: 2 105 | rayStartParams: 106 | block: 'true' 107 | num-gpus: '0' 108 | replicas: 2 109 | template: 110 | metadata: 111 | annotations: 112 | key: value 113 | labels: 114 | key: value 115 | spec: 116 | containers: 117 | - env: 118 | - name: MY_POD_IP 119 | valueFrom: 120 | fieldRef: 121 | fieldPath: status.podIP 122 | - name: RAY_USE_TLS 123 | value: '0' 124 | - name: RAY_TLS_SERVER_CERT 125 | value: 
/home/ray/workspace/tls/server.crt 126 | - name: RAY_TLS_SERVER_KEY 127 | value: /home/ray/workspace/tls/server.key 128 | - name: RAY_TLS_CA_CERT 129 | value: /home/ray/workspace/tls/ca.crt 130 | image: quay.io/modh/ray:2.47.1-py311-cu121 131 | lifecycle: 132 | preStop: 133 | exec: 134 | command: 135 | - /bin/sh 136 | - -c 137 | - ray stop 138 | name: machine-learning 139 | resources: 140 | limits: 141 | cpu: 1 142 | memory: 4G 143 | nvidia.com/gpu: 0 144 | requests: 145 | cpu: 1 146 | memory: 4G 147 | nvidia.com/gpu: 0 148 | imagePullSecrets: [] 149 | replicas: 1 150 | - generictemplate: 151 | apiVersion: route.openshift.io/v1 152 | kind: Route 153 | metadata: 154 | labels: 155 | odh-ray-cluster-service: raytest-head-svc 156 | name: ray-dashboard-raytest 157 | namespace: default 158 | spec: 159 | port: 160 | targetPort: dashboard 161 | to: 162 | kind: Service 163 | name: raytest-head-svc 164 | replicas: 1 165 | Items: [] 166 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.6.0 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/preview_nbs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.6.0 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.6.0 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- 
/docs/generate-documentation.md: -------------------------------------------------------------------------------- 1 | # Generate CodeFlare Documentation with Sphinx 2 | The following is a short guide on how you can use Sphinx to auto-generate code documentation. Documentation for the latest SDK release can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html). 3 | 4 | 1. Clone the CodeFlare SDK 5 | ``` bash 6 | git clone https://github.com/project-codeflare/codeflare-sdk.git 7 | ``` 8 | 2. [Install Sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html) 9 | 3. Run the below command to generate code documentation 10 | ``` bash 11 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generates RST files 12 | make html -C docs/sphinx # Builds HTML files 13 | ``` 14 | 4. You can access the docs locally at `docs/sphinx/_build/html/index.html` 15 | -------------------------------------------------------------------------------- /docs/images/codeflare_sdk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/docs/images/codeflare_sdk.png -------------------------------------------------------------------------------- /docs/images/codeflare_stack_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/docs/images/codeflare_stack_arch.png -------------------------------------------------------------------------------- /docs/sphinx/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/sphinx/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert(0, os.path.abspath("..")) 10 | 11 | # -- Project information ----------------------------------------------------- 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 13 | 14 | project = "CodeFlare SDK" 15 | copyright = "2024, Project CodeFlare" 16 | author = "Project CodeFlare" 17 | release = "v0.21.1" 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = [ 23 | "sphinx.ext.autodoc", 24 | "sphinx.ext.todo", 25 | "sphinx.ext.viewcode", 26 | "sphinx.ext.autosummary", 27 | "sphinx_rtd_theme", 28 | ] 29 | 30 | templates_path = ["_templates"] 31 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 32 | 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = 
"sphinx_rtd_theme" 38 | html_static_path = ["_static"] 39 | -------------------------------------------------------------------------------- /docs/sphinx/index.rst: -------------------------------------------------------------------------------- 1 | .. CodeFlare SDK documentation master file, created by 2 | sphinx-quickstart on Thu Oct 10 11:27:58 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | CodeFlare SDK documentation 7 | =========================== 8 | 9 | The CodeFlare SDK is an intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem. 10 | 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | :caption: Code Documentation: 15 | 16 | modules 17 | 18 | .. toctree:: 19 | :maxdepth: 1 20 | :caption: User Documentation: 21 | 22 | user-docs/authentication 23 | user-docs/cluster-configuration 24 | user-docs/ray-cluster-interaction 25 | user-docs/e2e 26 | user-docs/s3-compatible-storage 27 | user-docs/setup-kueue 28 | user-docs/ui-widgets 29 | 30 | Quick Links 31 | =========== 32 | - `PyPi `__ 33 | - `GitHub `__ 34 | - `OpenShift AI Documentation `__ 35 | -------------------------------------------------------------------------------- /docs/sphinx/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/authentication.rst: -------------------------------------------------------------------------------- 1 | Authentication via the CodeFlare SDK 2 | ==================================== 3 | 4 | Currently there are four ways of authenticating to your cluster via the 5 | SDK. Authenticating with your cluster allows you to perform actions such 6 | as creating Ray Clusters and Job Submission. 7 | 8 | Method 1 Token Authentication 9 | ----------------------------- 10 | 11 | This is how a typical user would authenticate to their cluster using 12 | ``TokenAuthentication``. 13 | 14 | :: 15 | 16 | from codeflare_sdk import TokenAuthentication 17 | 18 | auth = TokenAuthentication( 19 | token = "XXXXX", 20 | server = "XXXXX", 21 | skip_tls=False, 22 | # ca_cert_path="/path/to/cert" 23 | ) 24 | auth.login() 25 | # log out with auth.logout() 26 | 27 | Setting ``skip_tls=True`` allows interaction with an HTTPS server 28 | bypassing the server certificate checks although this is not secure. You 29 | can pass a custom certificate to ``TokenAuthentication`` by using 30 | ``ca_cert_path="/path/to/cert"`` when authenticating provided 31 | ``skip_tls=False``. Alternatively you can set the environment variable 32 | ``CF_SDK_CA_CERT_PATH`` to the path of your custom certificate. 
33 | 34 | Method 2 Kubernetes Config File Authentication (Default location) 35 | ----------------------------------------------------------------- 36 | 37 | If a user has authenticated to their cluster by alternate means e.g. run 38 | a login command like ``oc login --token= --server=`` 39 | their kubernetes config file should have updated. If the user has not 40 | specifically authenticated through the SDK by other means such as 41 | ``TokenAuthentication`` then the SDK will try to use their default 42 | Kubernetes config file located at ``"$HOME/.kube/config"``. 43 | 44 | Method 3 Specifying a Kubernetes Config File 45 | -------------------------------------------- 46 | 47 | A user can specify a config file via a different authentication class 48 | ``KubeConfigFileAuthentication`` for authenticating with the SDK. This 49 | is what loading a custom config file would typically look like. 50 | 51 | :: 52 | 53 | from codeflare_sdk import KubeConfigFileAuthentication 54 | 55 | auth = KubeConfigFileAuthentication( 56 | kube_config_path="/path/to/config", 57 | ) 58 | auth.load_kube_config() 59 | # log out with auth.logout() 60 | 61 | Method 4 In-Cluster Authentication 62 | ---------------------------------- 63 | 64 | If a user does not authenticate by any of the means detailed above and 65 | does not have a config file at ``"$HOME/.kube/config"`` the SDK will try 66 | to authenticate with the in-cluster configuration file. 
67 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-buttons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/docs/sphinx/user-docs/images/ui-buttons.png -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-view-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/docs/sphinx/user-docs/images/ui-view-clusters.png -------------------------------------------------------------------------------- /docs/sphinx/user-docs/ray-cluster-interaction.rst: -------------------------------------------------------------------------------- 1 | Ray Cluster Interaction 2 | ======================= 3 | 4 | The CodeFlare SDK offers multiple ways to interact with Ray Clusters 5 | including the below methods. 6 | 7 | get_cluster() 8 | ------------- 9 | 10 | The ``get_cluster()`` function is used to initialise a ``Cluster`` 11 | object from a pre-existing Ray Cluster/AppWrapper. 
Below is an example 12 | of it's usage: 13 | 14 | :: 15 | 16 | from codeflare_sdk import get_cluster 17 | cluster = get_cluster(cluster_name="raytest", namespace="example", is_appwrapper=False, write_to_file=False) 18 | -> output: Yaml resources loaded for raytest 19 | cluster.status() 20 | -> output: 21 | 🚀 CodeFlare Cluster Status 🚀 22 | ╭─────────────────────────────────────────────────────────────────╮ 23 | │ Name │ 24 | │ raytest Active ✅ │ 25 | │ │ 26 | │ URI: ray://raytest-head-svc.example.svc:10001 │ 27 | │ │ 28 | │ Dashboard🔗 │ 29 | │ │ 30 | ╰─────────────────────────────────────────────────────────────────╯ 31 | (, True) 32 | cluster.down() 33 | cluster.up() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted. 34 | 35 | | These are the parameters the ``get_cluster()`` function accepts: 36 | | ``cluster_name: str # Required`` -> The name of the Ray Cluster. 37 | | ``namespace: str # Default: "default"`` -> The namespace of the Ray Cluster. 38 | | ``is_appwrapper: bool # Default: False`` -> When set to 39 | | ``True`` the function will attempt to retrieve an AppWrapper instead of a Ray Cluster. 40 | | ``write_to_file: bool # Default: False`` -> When set to ``True`` the Ray Cluster/AppWrapper will be written to a file similar to how it is done in ``ClusterConfiguration``. 41 | 42 | list_all_queued() 43 | ----------------- 44 | 45 | | The ``list_all_queued()`` function returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. 46 | | It accepts the following parameters: 47 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 48 | | ``print_to_console: bool # Default: True`` -> Allows the user to print the list to their console. 49 | | ``appwrapper: bool # Default: False`` -> When set to ``True`` allows the user to list queued AppWrappers. 
50 | 51 | list_all_clusters() 52 | ------------------- 53 | 54 | | The ``list_all_clusters()`` function will return a list of detailed descriptions of Ray Clusters to the console by default. 55 | | It accepts the following parameters: 56 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 57 | | ``print_to_console: bool # Default: True`` -> A boolean that allows the user to print the list to their console. 58 | 59 | .. note:: 60 | 61 | The following methods require a ``Cluster`` object to be 62 | initialized. See :doc:`./cluster-configuration` 63 | 64 | cluster.up() 65 | ------------ 66 | 67 | | The ``cluster.up()`` function creates a Ray Cluster in the given namespace. 68 | 69 | cluster.apply() 70 | --------------- 71 | 72 | | The ``cluster.apply()`` function applies a Ray Cluster in the given namespace. If the cluster already exists, it is updated. 73 | | If it does not exist it is created. 74 | 75 | cluster.down() 76 | -------------- 77 | 78 | | The ``cluster.down()`` function deletes the Ray Cluster in the given namespace. 79 | 80 | cluster.status() 81 | ---------------- 82 | 83 | | The ``cluster.status()`` function prints out the status of the Ray Cluster's state with a link to the Ray Dashboard. 84 | 85 | cluster.details() 86 | ----------------- 87 | 88 | | The ``cluster.details()`` function prints out a detailed description of the Ray Cluster's status, worker resources and a link to the Ray Dashboard. 89 | 90 | cluster.wait_ready() 91 | -------------------- 92 | 93 | | The ``cluster.wait_ready()`` function waits for the requested cluster to be ready, up to an optional timeout and checks every 5 seconds. 94 | | It accepts the following parameters: 95 | | ``timeout: Optional[int] # Default: None`` -> Allows the user to define a timeout for the ``wait_ready()`` function. 96 | | ``dashboard_check: bool # Default: True`` -> If enabled the ``wait_ready()`` function will wait until the Ray Dashboard is ready too.
97 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/s3-compatible-storage.rst: -------------------------------------------------------------------------------- 1 | S3 compatible storage with Ray Train examples 2 | ============================================= 3 | 4 | Some of our distributed training examples require an external storage 5 | solution so that all nodes can access the same data. The following are 6 | examples for configuring S3 or Minio storage for your Ray Train script 7 | or interactive session. 8 | 9 | S3 Bucket 10 | --------- 11 | 12 | In your Python Script add the following environment variables: 13 | 14 | .. code:: python 15 | 16 | os.environ["AWS_ACCESS_KEY_ID"] = "XXXXXXXX" 17 | os.environ["AWS_SECRET_ACCESS_KEY"] = "XXXXXXXX" 18 | os.environ["AWS_DEFAULT_REGION"] = "XXXXXXXX" 19 | 20 | Alternatively you can specify these variables in your runtime 21 | environment on Job Submission. 22 | 23 | .. code:: python 24 | 25 | submission_id = client.submit_job( 26 | entrypoint=..., 27 | runtime_env={ 28 | "env_vars": { 29 | "AWS_ACCESS_KEY_ID": os.environ.get('AWS_ACCESS_KEY_ID'), 30 | "AWS_SECRET_ACCESS_KEY": os.environ.get('AWS_SECRET_ACCESS_KEY'), 31 | "AWS_DEFAULT_REGION": os.environ.get('AWS_DEFAULT_REGION') 32 | }, 33 | } 34 | ) 35 | 36 | In your Trainer configuration you can specify a ``run_config`` which 37 | will utilise your external storage. 38 | 39 | .. code:: python 40 | 41 | trainer = TorchTrainer( 42 | train_func_distributed, 43 | scaling_config=scaling_config, 44 | run_config = ray.train.RunConfig(storage_path="s3://BUCKET_NAME/SUB_PATH/", name="unique_run_name") 45 | ) 46 | 47 | To learn more about Amazon S3 Storage you can find information 48 | `here `__. 49 | 50 | Minio Bucket 51 | ------------ 52 | 53 | In your Python Script add the following function for configuring your 54 | run_config: 55 | 56 | .. 
code:: python 57 | 58 | import s3fs 59 | import pyarrow 60 | 61 | def get_minio_run_config(): 62 | s3_fs = s3fs.S3FileSystem( 63 | key = os.getenv('MINIO_ACCESS_KEY', "XXXXX"), 64 | secret = os.getenv('MINIO_SECRET_ACCESS_KEY', "XXXXX"), 65 | endpoint_url = os.getenv('MINIO_URL', "XXXXX") 66 | ) 67 | custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs)) 68 | run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs) 69 | return run_config 70 | 71 | You can update the ``run_config`` to further suit your needs above. 72 | Lastly the new ``run_config`` must be added to the Trainer: 73 | 74 | .. code:: python 75 | 76 | trainer = TorchTrainer( 77 | train_func_distributed, 78 | scaling_config=scaling_config, 79 | run_config = get_minio_run_config() 80 | ) 81 | 82 | To find more information on creating a Minio Bucket compatible with 83 | RHOAI you can refer to this 84 | `documentation `__. 85 | Note: You must have ``s3fs`` and ``pyarrow`` installed in your 86 | environment for this method. 87 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/setup-kueue.rst: -------------------------------------------------------------------------------- 1 | Basic Kueue Resources configuration 2 | =================================== 3 | 4 | Introduction: 5 | ------------- 6 | 7 | This document is designed for administrators who have Kueue installed on 8 | their cluster. We will walk through the process of setting up essential 9 | Kueue resources, namely Cluster Queue, Resource Flavor, and Local Queue. 10 | 11 | 1. Resource Flavor: 12 | ------------------- 13 | 14 | Resource Flavors allow the cluster admin to reflect differing resource capabilities 15 | of nodes within a clusters, such as CPU, memory, GPU, etc. These can then be assigned 16 | to workloads to ensure they are executed on nodes with appropriate resources. 
17 | 18 | The YAML configuration provided below creates an empty Resource Flavor 19 | named default-flavor. It serves as a starting point and does not specify 20 | any detailed resource characteristics. 21 | 22 | .. code:: yaml 23 | 24 | apiVersion: kueue.x-k8s.io/v1beta1 25 | kind: ResourceFlavor 26 | metadata: 27 | name: default-flavor 28 | 29 | For more detailed information on Resource Flavor configuration options, 30 | refer to the Kueue documentation: `Resource Flavor 31 | Configuration `__ 32 | 33 | 2. Cluster Queue: 34 | ----------------- 35 | 36 | A Cluster Queue represents a shared queue across the entire cluster. It 37 | allows the cluster admin to define global settings for workload 38 | prioritization and resource allocation. 39 | 40 | When setting up a Cluster Queue in Kueue, it’s crucial that the resource 41 | specifications match the actual capacities and operational requirements 42 | of your cluster. The example provided outlines a basic setup; however, 43 | each cluster may have different resource availabilities and needs. 44 | 45 | .. code:: yaml 46 | 47 | apiVersion: kueue.x-k8s.io/v1beta1 48 | kind: ClusterQueue 49 | metadata: 50 | name: "cluster-queue" 51 | spec: 52 | namespaceSelector: {} # match all. 53 | resourceGroups: 54 | - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"] 55 | flavors: 56 | - name: "default-flavor" 57 | resources: 58 | - name: "cpu" 59 | nominalQuota: 9 60 | - name: "memory" 61 | nominalQuota: 36Gi 62 | - name: "pods" 63 | nominalQuota: 5 64 | - name: "nvidia.com/gpu" 65 | nominalQuota: '0' 66 | 67 | For more detailed information on Cluster Queue configuration options, 68 | refer to the Kueue documentation: `Cluster Queue 69 | Configuration `__ 70 | 71 | 3. Local Queue (With Default Annotation): 72 | ----------------------------------------- 73 | 74 | A Local Queue represents a queue associated with a specific namespace 75 | within the cluster. 
It allows namespace-level control over workload 76 | prioritization and resource allocation. 77 | 78 | .. code:: yaml 79 | 80 | apiVersion: kueue.x-k8s.io/v1beta1 81 | kind: LocalQueue 82 | metadata: 83 | namespace: team-a 84 | name: team-a-queue 85 | annotations: 86 | kueue.x-k8s.io/default-queue: "true" 87 | spec: 88 | clusterQueue: cluster-queue 89 | 90 | In the LocalQueue configuration provided above, the annotations field 91 | specifies ``kueue.x-k8s.io/default-queue: "true"``. This annotation 92 | indicates that the team-a-queue is designated as the default queue for 93 | the team-a namespace. When this is set, any workloads submitted to the 94 | team-a namespace without explicitly specifying a queue will 95 | automatically be routed to the team-a-queue. 96 | 97 | For more detailed information on Local Queue configuration options, 98 | refer to the Kueue documentation: `Local Queue 99 | Configuration `__ 100 | 101 | Conclusion: 102 | ----------- 103 | 104 | By following the steps outlined in this document, the cluster admin can 105 | successfully create the basic Kueue resources necessary for workload 106 | management in the cluster. For more advanced configurations and 107 | features, please refer to the comprehensive `Kueue 108 | documentation `__. 109 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/ui-widgets.rst: -------------------------------------------------------------------------------- 1 | Jupyter UI Widgets 2 | ================== 3 | 4 | Below are some examples of the Jupyter UI Widgets that are included in 5 | the CodeFlare SDK. 6 | 7 | .. note:: 8 | To use the widgets functionality you must be using the CodeFlare SDK in a Jupyter Notebook environment. 9 | 10 | Cluster Up/Down Buttons 11 | ----------------------- 12 | 13 | The Cluster Up/Down buttons appear after successfully initialising your 14 | `ClusterConfiguration `__. 
15 | There are two buttons and a checkbox ``Cluster Up``, ``Cluster Down`` 16 | and ``Wait for Cluster?`` which mimic the 17 | `cluster.up() `__, 18 | `cluster.down() `__ and 19 | `cluster.wait_ready() `__ 20 | functionality. 21 | 22 | After initialising their ``ClusterConfiguration`` a user can select the 23 | ``Wait for Cluster?`` checkbox then click the ``Cluster Up`` button to 24 | create their Ray Cluster and wait until it is ready. The cluster can be 25 | deleted by clicking the ``Cluster Down`` button. 26 | 27 | .. image:: images/ui-buttons.png 28 | :alt: An image of the up/down ui buttons 29 | 30 | View Clusters UI Table 31 | ---------------------- 32 | 33 | The View Clusters UI Table allows a user to see a list of Ray Clusters 34 | with information on their configuration including number of workers, CPU 35 | requests and limits along with the clusters status. 36 | 37 | .. image:: images/ui-view-clusters.png 38 | :alt: An image of the view clusters ui table 39 | 40 | Above is a list of two Ray Clusters ``raytest`` and ``raytest2`` each of 41 | those headings is clickable and will update the table to view the 42 | selected Cluster's information. There are four buttons under the table 43 | ``Cluster Down``, ``View Jobs``, ``Open Ray Dashboard``, and ``Refresh Data``. \* The 44 | ``Cluster Down`` button will delete the selected Cluster. \* The 45 | ``View Jobs`` button will try to open the Ray Dashboard's Jobs view in a 46 | Web Browser. The link will also be printed to the console. \* The 47 | ``Open Ray Dashboard`` button will try to open the Ray Dashboard view in 48 | a Web Browser. The link will also be printed to the console. \* The 49 | ``Refresh Data`` button will refresh the list of RayClusters, the spec, and 50 | the status of the Ray Cluster. 51 | 52 | The UI Table can be viewed by calling the following function. 53 | 54 | .. 
code:: python 55 | 56 | from codeflare_sdk import view_clusters 57 | view_clusters() # Accepts namespace parameter but will try to gather the namespace from the current context 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "codeflare-sdk" 3 | version = "0.0.0-dev" 4 | description = "Python SDK for codeflare client" 5 | 6 | license = "Apache-2.0" 7 | 8 | authors = [ 9 | "Michael Clifford ", 10 | "Mustafa Eyceoz ", 11 | "Abhishek Malvankar ", 12 | "Atin Sood ", 13 | ] 14 | 15 | readme = 'README.md' 16 | 17 | repository = "https://github.com/project-codeflare/codeflare-sdk" 18 | homepage = "https://github.com/project-codeflare/codeflare-sdk" 19 | 20 | keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale'] 21 | 22 | [tool.poetry.dependencies] 23 | python = "^3.11" 24 | openshift-client = "1.0.18" 25 | rich = ">=12.5,<14.0" 26 | ray = {version = "2.47.1", extras = ["data", "default"]} 27 | kubernetes = ">= 27.2.0" 28 | cryptography = "43.0.3" 29 | executing = "1.2.0" 30 | pydantic = "< 2" 31 | ipywidgets = "8.1.2" 32 | 33 | [tool.poetry.group.docs] 34 | optional = true 35 | 36 | [tool.poetry.group.docs.dependencies] 37 | sphinx = "7.4.7" 38 | sphinx-rtd-theme = "3.0.1" 39 | 40 | [tool.poetry.group.test] 41 | optional = true 42 | 43 | [tool.poetry.group.test.dependencies] 44 | pytest = "7.4.0" 45 | coverage = "7.6.4" 46 | pytest-mock = "3.11.1" 47 | pytest-timeout = "2.3.1" 48 | jupyterlab = "4.3.1" 49 | 50 | [tool.pytest.ini_options] 51 | filterwarnings = [ 52 | "ignore::DeprecationWarning:pkg_resources", 53 | "ignore:pkg_resources is deprecated as an API:DeprecationWarning", 54 | ] 55 | markers = [ 56 | "kind", 57 | "openshift", 58 | "nvidia_gpu" 59 | ] 60 | addopts = "--timeout=900" 61 | testpaths = ["src/codeflare_sdk"] 62 | collect_ignore = 
["src/codeflare_sdk/common/utils/unit_test_support.py"] 63 | -------------------------------------------------------------------------------- /src/codeflare_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray import ( 2 | Cluster, 3 | ClusterConfiguration, 4 | RayClusterStatus, 5 | CodeFlareClusterStatus, 6 | RayCluster, 7 | get_cluster, 8 | list_all_queued, 9 | list_all_clusters, 10 | AWManager, 11 | AppWrapperStatus, 12 | RayJobClient, 13 | ) 14 | 15 | from .common.widgets import view_clusters 16 | 17 | from .common import ( 18 | Authentication, 19 | KubeConfiguration, 20 | TokenAuthentication, 21 | KubeConfigFileAuthentication, 22 | ) 23 | 24 | from .common.kueue import ( 25 | list_local_queues, 26 | ) 27 | 28 | from .common.utils import generate_cert 29 | from .common.utils.demos import copy_demo_nbs 30 | 31 | from importlib.metadata import version, PackageNotFoundError 32 | 33 | try: 34 | __version__ = version("codeflare-sdk") # use metadata associated with built package 35 | 36 | except PackageNotFoundError: 37 | __version__ = "v0.0.0" 38 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Importing everything from the kubernetes_cluster module 2 | from .kubernetes_cluster import ( 3 | Authentication, 4 | KubeConfiguration, 5 | TokenAuthentication, 6 | KubeConfigFileAuthentication, 7 | _kube_api_error_handling, 8 | ) 9 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .auth import ( 2 | Authentication, 3 | KubeConfiguration, 4 | TokenAuthentication, 5 | KubeConfigFileAuthentication, 6 | config_check, 7 | get_api_client, 8 | ) 9 | 10 | from .kube_api_helpers import 
_kube_api_error_handling 11 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/kube_api_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | This sub-module exists primarily to be used internally for any Kubernetes 17 | API error handling or wrapping. 18 | """ 19 | 20 | import executing 21 | from kubernetes import client, config 22 | 23 | ERROR_MESSAGES = { 24 | "Not Found": "The requested resource could not be located.\n" 25 | "Please verify the resource name and namespace.", 26 | "Unauthorized": "Access to the API is unauthorized.\n" 27 | "Check your credentials or permissions.", 28 | "Forbidden": "Access denied to the Kubernetes resource.\n" 29 | "Ensure your role has sufficient permissions for this operation.", 30 | "Conflict": "A conflict occurred with the RayCluster resource.\n" 31 | "Only one RayCluster with the same name is allowed. 
" 32 | "Please delete or rename the existing RayCluster before creating a new one with the desired name.", 33 | } 34 | 35 | 36 | # private methods 37 | def _kube_api_error_handling( 38 | e: Exception, print_error: bool = True 39 | ): # pragma: no cover 40 | def print_message(message: str): 41 | if print_error: 42 | print(message) 43 | 44 | if isinstance(e, client.ApiException): 45 | # Retrieve message based on reason, defaulting if reason is not known 46 | message = ERROR_MESSAGES.get( 47 | e.reason, f"Unexpected API error encountered (Reason: {e.reason})" 48 | ) 49 | full_message = f"{message}\nResponse: {e.body}" 50 | print_message(full_message) 51 | 52 | elif isinstance(e, config.ConfigException): 53 | message = "Configuration error: Unable to load Kubernetes configuration. Verify the config file path and format." 54 | print_message(message) 55 | 56 | elif isinstance(e, executing.executing.NotOneValueFound): 57 | message = "Execution error: Expected exactly one value in the operation but found none or multiple." 
58 | print_message(message) 59 | 60 | else: 61 | message = f"Unexpected error:\n{str(e)}" 62 | print_message(message) 63 | raise e 64 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kueue/__init__.py: -------------------------------------------------------------------------------- 1 | from .kueue import ( 2 | get_default_kueue_name, 3 | local_queue_exists, 4 | add_queue_label, 5 | list_local_queues, 6 | ) 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/src/codeflare_sdk/common/utils/__init__.py -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/constants.py: -------------------------------------------------------------------------------- 1 | RAY_VERSION = "2.47.1" 2 | # Below references ray:2.47.1-py311-cu121 3 | CUDA_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:6d076aeb38ab3c34a6a2ef0f58dc667089aa15826fa08a73273c629333e12f1e" 4 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/demos.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | 4 | package_dir = pathlib.Path(__file__).parent.parent.parent.resolve() 5 | demo_dir = f"{package_dir}/demo-notebooks" 6 | 7 | 8 | def copy_demo_nbs(dir: str = "./demo-notebooks", overwrite: bool = False): 9 | """ 10 | Copy the demo notebooks from the package to the current working directory 11 | 12 | overwrite=True will overwrite any files that exactly match files written by copy_demo_nbs in the target directory. 13 | Any files that exist in the directory that don't match these values will remain untouched. 
14 | 15 | Args: 16 | dir (str): 17 | The directory to copy the demo notebooks to. Defaults to "./demo-notebooks". 18 | overwrite (bool): 19 | Whether to overwrite files in the directory if it already exists. Defaults to False. 20 | 21 | Raises: 22 | FileExistsError: 23 | If the directory already exists. 24 | """ 25 | # does dir exist already? 26 | if overwrite is False and pathlib.Path(dir).exists(): 27 | raise FileExistsError( 28 | f"Directory {dir} already exists. Please remove it or provide a different location." 29 | ) 30 | 31 | shutil.copytree(demo_dir, dir, dirs_exist_ok=True) 32 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/test_generate_cert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import base64 16 | 17 | from cryptography.hazmat.primitives.serialization import ( 18 | Encoding, 19 | PublicFormat, 20 | load_pem_private_key, 21 | ) 22 | from cryptography.x509 import load_pem_x509_certificate 23 | import os 24 | from codeflare_sdk.common.utils.generate_cert import ( 25 | export_env, 26 | generate_ca_cert, 27 | generate_tls_cert, 28 | ) 29 | from kubernetes import client 30 | 31 | 32 | def test_generate_ca_cert(): 33 | """ 34 | test the function codeflare_sdk.common.utils.generate_ca_cert generates the correct outputs 35 | """ 36 | key, certificate = generate_ca_cert() 37 | cert = load_pem_x509_certificate(base64.b64decode(certificate)) 38 | private_pub_key_bytes = ( 39 | load_pem_private_key(base64.b64decode(key), password=None) 40 | .public_key() 41 | .public_bytes(Encoding.PEM, PublicFormat.SubjectPublicKeyInfo) 42 | ) 43 | cert_pub_key_bytes = cert.public_key().public_bytes( 44 | Encoding.PEM, PublicFormat.SubjectPublicKeyInfo 45 | ) 46 | assert type(key) == str 47 | assert type(certificate) == str 48 | # Verify ca.cert is self signed 49 | assert cert.verify_directly_issued_by(cert) == None 50 | # Verify cert has the public key bytes from the private key 51 | assert cert_pub_key_bytes == private_pub_key_bytes 52 | 53 | 54 | def secret_ca_retreival(secret_name, namespace): 55 | ca_private_key_bytes, ca_cert = generate_ca_cert() 56 | data = {"ca.crt": ca_cert, "ca.key": ca_private_key_bytes} 57 | assert secret_name == "ca-secret-cluster" 58 | assert namespace == "namespace" 59 | return client.models.V1Secret(data=data) 60 | 61 | 62 | def test_generate_tls_cert(mocker): 63 | """ 64 | test the function codeflare_sdk.common.utils.generate_tls_cert generates the correct outputs 65 | """ 66 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 67 | mocker.patch( 68 | "codeflare_sdk.common.utils.generate_cert.get_secret_name", 69 | return_value="ca-secret-cluster", 70 | ) 71 | mocker.patch( 72 | 
"kubernetes.client.CoreV1Api.read_namespaced_secret", 73 | side_effect=secret_ca_retreival, 74 | ) 75 | 76 | generate_tls_cert("cluster", "namespace") 77 | assert os.path.exists("tls-cluster-namespace") 78 | assert os.path.exists(os.path.join("tls-cluster-namespace", "ca.crt")) 79 | assert os.path.exists(os.path.join("tls-cluster-namespace", "tls.crt")) 80 | assert os.path.exists(os.path.join("tls-cluster-namespace", "tls.key")) 81 | 82 | # verify the that the signed tls.crt is issued by the ca_cert (root cert) 83 | with open(os.path.join("tls-cluster-namespace", "tls.crt"), "r") as f: 84 | tls_cert = load_pem_x509_certificate(f.read().encode("utf-8")) 85 | with open(os.path.join("tls-cluster-namespace", "ca.crt"), "r") as f: 86 | root_cert = load_pem_x509_certificate(f.read().encode("utf-8")) 87 | assert tls_cert.verify_directly_issued_by(root_cert) == None 88 | 89 | 90 | def test_export_env(): 91 | """ 92 | test the function codeflare_sdk.common.utils.generate_ca_cert.export_ev generates the correct outputs 93 | """ 94 | tls_dir = "cluster" 95 | ns = "namespace" 96 | export_env(tls_dir, ns) 97 | assert os.environ["RAY_USE_TLS"] == "1" 98 | assert os.environ["RAY_TLS_SERVER_CERT"] == os.path.join( 99 | os.getcwd(), f"tls-{tls_dir}-{ns}", "tls.crt" 100 | ) 101 | assert os.environ["RAY_TLS_SERVER_KEY"] == os.path.join( 102 | os.getcwd(), f"tls-{tls_dir}-{ns}", "tls.key" 103 | ) 104 | assert os.environ["RAY_TLS_CA_CERT"] == os.path.join( 105 | os.getcwd(), f"tls-{tls_dir}-{ns}", "ca.crt" 106 | ) 107 | 108 | 109 | # Make sure to always keep this function last 110 | def test_cleanup(): 111 | os.remove("tls-cluster-namespace/ca.crt") 112 | os.remove("tls-cluster-namespace/tls.crt") 113 | os.remove("tls-cluster-namespace/tls.key") 114 | os.rmdir("tls-cluster-namespace") 115 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/widgets/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .widgets import ( 2 | view_clusters, 3 | ) 4 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/__init__.py: -------------------------------------------------------------------------------- 1 | from .appwrapper import AppWrapper, AppWrapperStatus, AWManager 2 | 3 | from .client import ( 4 | RayJobClient, 5 | ) 6 | 7 | from .cluster import ( 8 | Cluster, 9 | ClusterConfiguration, 10 | get_cluster, 11 | list_all_queued, 12 | list_all_clusters, 13 | RayClusterStatus, 14 | CodeFlareClusterStatus, 15 | RayCluster, 16 | ) 17 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .awload import AWManager 2 | 3 | from .status import ( 4 | AppWrapperStatus, 5 | AppWrapper, 6 | ) 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/awload.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | The awload sub-module contains the definition of the AWManager object, which handles 17 | submission and deletion of existing AppWrappers from a user's file system. 18 | """ 19 | 20 | from os.path import isfile 21 | import errno 22 | import os 23 | import yaml 24 | 25 | from kubernetes import client 26 | from ...common import _kube_api_error_handling 27 | from ...common.kubernetes_cluster.auth import ( 28 | config_check, 29 | get_api_client, 30 | ) 31 | 32 | 33 | class AWManager: 34 | """ 35 | An object for submitting and removing existing AppWrapper yamls 36 | to be added to the Kueue localqueue. 37 | """ 38 | 39 | def __init__(self, filename: str) -> None: 40 | """ 41 | Create the AppWrapper Manager object by passing in an 42 | AppWrapper yaml file 43 | """ 44 | if not isfile(filename): 45 | raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename) 46 | self.filename = filename 47 | try: 48 | with open(self.filename) as f: 49 | self.awyaml = yaml.load(f, Loader=yaml.FullLoader) 50 | assert self.awyaml["kind"] == "AppWrapper" 51 | self.name = self.awyaml["metadata"]["name"] 52 | self.namespace = self.awyaml["metadata"]["namespace"] 53 | except: 54 | raise ValueError( 55 | f"{filename } is not a correctly formatted AppWrapper yaml" 56 | ) 57 | self.submitted = False 58 | 59 | def submit(self) -> None: 60 | """ 61 | Attempts to create the AppWrapper custom resource using the yaml file 62 | """ 63 | try: 64 | config_check() 65 | api_instance = client.CustomObjectsApi(get_api_client()) 66 | api_instance.create_namespaced_custom_object( 67 | group="workload.codeflare.dev", 68 | version="v1beta2", 69 | namespace=self.namespace, 70 | plural="appwrappers", 71 | body=self.awyaml, 72 | ) 73 | except Exception as e: 74 | return _kube_api_error_handling(e) 75 | 76 | self.submitted = True 77 | print(f"AppWrapper {self.filename} submitted!") 78 | 79 | def remove(self) -> None: 80 | """ 81 | Attempts to delete the AppWrapper custom resource 
matching the name in the yaml, 82 | if submitted by this manager. 83 | """ 84 | if not self.submitted: 85 | print("AppWrapper not submitted by this manager yet, nothing to remove") 86 | return 87 | 88 | try: 89 | config_check() 90 | api_instance = client.CustomObjectsApi(get_api_client()) 91 | api_instance.delete_namespaced_custom_object( 92 | group="workload.codeflare.dev", 93 | version="v1beta2", 94 | namespace=self.namespace, 95 | plural="appwrappers", 96 | name=self.name, 97 | ) 98 | except Exception as e: 99 | return _kube_api_error_handling(e) 100 | 101 | self.submitted = False 102 | print(f"AppWrapper {self.name} removed!") 103 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | The status sub-module defines Enums containing information for 17 | AppWrapper states, as well as dataclasses to store information for AppWrappers. 18 | """ 19 | 20 | from dataclasses import dataclass 21 | from enum import Enum 22 | 23 | 24 | class AppWrapperStatus(Enum): 25 | """ 26 | Defines the possible reportable phases of an AppWrapper. 
27 | """ 28 | 29 | SUSPENDED = "suspended" 30 | RESUMING = "resuming" 31 | RUNNING = "running" 32 | RESETTING = "resetting" 33 | SUSPENDING = "suspending" 34 | SUCCEEDED = "succeeded" 35 | FAILED = "failed" 36 | TERMINATING = "terminating" 37 | 38 | 39 | @dataclass 40 | class AppWrapper: 41 | """ 42 | For storing information about an AppWrapper. 43 | """ 44 | 45 | name: str 46 | status: AppWrapperStatus 47 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/test_awload.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from codeflare_sdk.common.utils.unit_test_support import ( 15 | apply_template, 16 | arg_check_aw_apply_effect, 17 | arg_check_aw_del_effect, 18 | get_template_variables, 19 | ) 20 | from codeflare_sdk.ray.appwrapper import AWManager 21 | from codeflare_sdk.ray.cluster import Cluster, ClusterConfiguration 22 | import os 23 | from pathlib import Path 24 | 25 | parent = Path(__file__).resolve().parents[4] # project directory 26 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 27 | 28 | 29 | def test_AWManager_creation(mocker): 30 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 31 | mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object") 32 | # Create test.yaml 33 | Cluster( 34 | ClusterConfiguration( 35 | name="test", 36 | namespace="ns", 37 | write_to_file=True, 38 | appwrapper=True, 39 | ) 40 | ) 41 | 42 | testaw = AWManager(f"{aw_dir}test.yaml") 43 | assert testaw.name == "test" 44 | assert testaw.namespace == "ns" 45 | assert testaw.submitted == False 46 | try: 47 | testaw = AWManager("fake") 48 | except Exception as e: 49 | assert type(e) == FileNotFoundError 50 | assert str(e) == "[Errno 2] No such file or directory: 'fake'" 51 | try: 52 | testaw = apply_template( 53 | AWManager( 54 | f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml" 55 | ), 56 | get_template_variables(), 57 | ) 58 | except Exception as e: 59 | assert type(e) == ValueError 60 | assert ( 61 | str(e) 62 | == f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml is not a correctly formatted AppWrapper yaml" 63 | ) 64 | 65 | 66 | def test_AWManager_submit_remove(mocker, capsys): 67 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 68 | testaw = AWManager(f"{aw_dir}test.yaml") 69 | testaw.remove() 70 | captured = capsys.readouterr() 71 | assert ( 72 | captured.out 73 | == "AppWrapper not submitted by this manager yet, nothing to remove\n" 74 | ) 75 | assert testaw.submitted == False 76 | 
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 77 | mocker.patch( 78 | "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", 79 | side_effect=arg_check_aw_apply_effect, 80 | ) 81 | mocker.patch( 82 | "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", 83 | side_effect=arg_check_aw_del_effect, 84 | ) 85 | testaw.submit() 86 | assert testaw.submitted == True 87 | testaw.remove() 88 | assert testaw.submitted == False 89 | 90 | 91 | # Make sure to always keep this function last 92 | def test_cleanup(): 93 | os.remove(f"{aw_dir}test.yaml") 94 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/test_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | _app_wrapper_status, 17 | Cluster, 18 | ClusterConfiguration, 19 | ) 20 | from codeflare_sdk.ray.appwrapper import AppWrapper, AppWrapperStatus 21 | from codeflare_sdk.ray.cluster.status import CodeFlareClusterStatus 22 | from codeflare_sdk.common.utils.unit_test_support import get_local_queue 23 | import os 24 | 25 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 26 | 27 | 28 | def test_cluster_status(mocker): 29 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 30 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 31 | mocker.patch( 32 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 33 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 34 | ) 35 | fake_aw = AppWrapper("test", AppWrapperStatus.FAILED) 36 | 37 | cf = Cluster( 38 | ClusterConfiguration( 39 | name="test", 40 | namespace="ns", 41 | write_to_file=True, 42 | appwrapper=True, 43 | local_queue="local-queue-default", 44 | ) 45 | ) 46 | mocker.patch( 47 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=None 48 | ) 49 | mocker.patch( 50 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 51 | ) 52 | status, ready = cf.status() 53 | assert status == CodeFlareClusterStatus.UNKNOWN 54 | assert ready == False 55 | 56 | mocker.patch( 57 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=fake_aw 58 | ) 59 | status, ready = cf.status() 60 | assert status == CodeFlareClusterStatus.FAILED 61 | assert ready == False 62 | 63 | fake_aw.status = AppWrapperStatus.SUSPENDED 64 | status, ready = cf.status() 65 | assert status == CodeFlareClusterStatus.QUEUED 66 | assert ready == False 67 | 68 | fake_aw.status = AppWrapperStatus.RESUMING 69 | status, ready = cf.status() 70 | assert status == CodeFlareClusterStatus.STARTING 71 | assert ready == False 72 | 73 | fake_aw.status = 
AppWrapperStatus.RESETTING 74 | status, ready = cf.status() 75 | assert status == CodeFlareClusterStatus.STARTING 76 | assert ready == False 77 | 78 | fake_aw.status = AppWrapperStatus.RUNNING 79 | status, ready = cf.status() 80 | assert status == CodeFlareClusterStatus.UNKNOWN 81 | assert ready == False 82 | 83 | 84 | def aw_status_fields(group, version, namespace, plural, *args): 85 | assert group == "workload.codeflare.dev" 86 | assert version == "v1beta2" 87 | assert namespace == "test-ns" 88 | assert plural == "appwrappers" 89 | assert args == tuple() 90 | return {"items": []} 91 | 92 | 93 | def test_aw_status(mocker): 94 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 95 | mocker.patch( 96 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 97 | side_effect=aw_status_fields, 98 | ) 99 | aw = _app_wrapper_status("test-aw", "test-ns") 100 | assert aw == None 101 | 102 | 103 | # Make sure to always keep this function last 104 | def test_cleanup(): 105 | os.remove(f"{aw_dir}test.yaml") 106 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray_jobs import RayJobClient 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .status import ( 2 | RayClusterStatus, 3 | CodeFlareClusterStatus, 4 | RayCluster, 5 | ) 6 | 7 | from .cluster import ( 8 | Cluster, 9 | ClusterConfiguration, 10 | get_cluster, 11 | list_all_queued, 12 | list_all_clusters, 13 | ) 14 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The status sub-module defines Enums containing information for Ray cluster
states, and CodeFlare cluster states, as well as
dataclasses to store information for Ray clusters.
"""

from dataclasses import dataclass, field
from enum import Enum
import typing
from typing import Union


class RayClusterStatus(Enum):
    """
    Defines the possible reportable states of a Ray cluster.
    """

    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1/raycluster_types.go#L112-L117
    READY = "ready"
    UNHEALTHY = "unhealthy"
    FAILED = "failed"
    UNKNOWN = "unknown"
    SUSPENDED = "suspended"


class CodeFlareClusterStatus(Enum):
    """
    Defines the possible reportable states of a Codeflare cluster.
    """

    READY = 1
    STARTING = 2
    QUEUED = 3
    QUEUEING = 4
    FAILED = 5
    UNKNOWN = 6
    SUSPENDED = 7


@dataclass
class RayCluster:
    """
    For storing information about a Ray cluster.
    """

    name: str
    status: RayClusterStatus
    # Head-node CPU/memory requests and limits.
    # NOTE(review): head_cpu_* are annotated int while worker_cpu_* allow
    # Union[int, str] (e.g. "500m") — confirm whether the asymmetry is intended.
    head_cpu_requests: int
    head_cpu_limits: int
    head_mem_requests: str
    head_mem_limits: str
    num_workers: int
    # Per-worker CPU/memory requests and limits.
    worker_mem_requests: str
    worker_mem_limits: str
    worker_cpu_requests: Union[int, str]
    worker_cpu_limits: Union[int, str]
    namespace: str
    # Dashboard endpoint for the cluster (a URI string in the unit tests).
    dashboard: str
    # Extended resource name -> count (e.g. GPUs); default to empty mappings.
    worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
    head_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/test_build_ray_cluster.py:
--------------------------------------------------------------------------------
# Copyright 2024 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
import sys
from .build_ray_cluster import gen_names, update_image, build_ray_cluster
import uuid
from codeflare_sdk.ray.cluster.cluster import ClusterConfiguration, Cluster


def test_gen_names_with_name(mocker):
    # With an explicit name, both the appwrapper and cluster names use it as-is.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    name = "myname"
    appwrapper_name, cluster_name = gen_names(name)
    assert appwrapper_name == name
    assert cluster_name == name


def test_gen_names_without_name(mocker):
    # With no name, gen_names falls back to uuid-derived, prefixed names.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    appwrapper_name, cluster_name = gen_names(None)
    assert appwrapper_name.startswith("appwrapper-")
    assert cluster_name.startswith("cluster-")


def test_update_image_without_supported_python_version(mocker):
    # An unsupported interpreter version should warn and leave the image unset.
    # Mock SUPPORTED_PYTHON_VERSIONS
    mocker.patch.dict(
        "codeflare_sdk.ray.cluster.build_ray_cluster.SUPPORTED_PYTHON_VERSIONS",
        {
            "3.11": "ray-py3.11",
        },
    )

    # Create a namedtuple to mock sys.version_info
    VersionInfo = namedtuple(
        "version_info", ["major", "minor", "micro", "releaselevel", "serial"]
    )
    mocker.patch.object(sys, "version_info", VersionInfo(3, 8, 0, "final", 0))

    # Mock warnings.warn to check if it gets called
    warn_mock = mocker.patch("warnings.warn")

    # Call the update_image function with no image provided
    image = update_image(None)

    # Assert that the warning was called with the expected message
    warn_mock.assert_called_once_with(
        "No default Ray image defined for 3.8. Please provide your own image or use one of the following python versions: 3.11."
    )

    # Assert that no image was set since the Python version is not supported
    assert image is None


def test_build_ray_cluster_with_gcs_ft(mocker):
    # GCS fault-tolerance settings from ClusterConfiguration must be copied
    # into the generated resource's spec.gcsFaultToleranceOptions.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    cluster = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            enable_gcs_ft=True,
            redis_address="redis:6379",
            redis_password_secret={"name": "redis-password-secret", "key": "password"},
            external_storage_namespace="new-ns",
        )
    )

    mocker.patch("codeflare_sdk.ray.cluster.build_ray_cluster.config_check")
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.get_api_client", return_value=None
    )
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.update_image", return_value=None
    )

    resource = build_ray_cluster(cluster)

    assert "spec" in resource
    assert "gcsFaultToleranceOptions" in resource["spec"]

    gcs_ft_options = resource["spec"]["gcsFaultToleranceOptions"]

    assert gcs_ft_options["redisAddress"] == "redis:6379"
    assert gcs_ft_options["externalStorageNamespace"] == "new-ns"
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["name"]
        == "redis-password-secret"
    )
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["key"]
        == "password"
    )
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/test_status.py:
--------------------------------------------------------------------------------
# Copyright 2024 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | Cluster, 17 | ClusterConfiguration, 18 | _ray_cluster_status, 19 | ) 20 | from codeflare_sdk.ray.cluster.status import ( 21 | CodeFlareClusterStatus, 22 | RayClusterStatus, 23 | RayCluster, 24 | ) 25 | import os 26 | from ...common.utils.unit_test_support import get_local_queue 27 | 28 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 29 | 30 | 31 | def test_cluster_status(mocker): 32 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 33 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 34 | 35 | fake_ray = RayCluster( 36 | name="test", 37 | status=RayClusterStatus.UNKNOWN, 38 | num_workers=1, 39 | worker_mem_requests=2, 40 | worker_mem_limits=2, 41 | worker_cpu_requests=1, 42 | worker_cpu_limits=1, 43 | namespace="ns", 44 | dashboard="fake-uri", 45 | head_cpu_requests=2, 46 | head_cpu_limits=2, 47 | head_mem_requests=8, 48 | head_mem_limits=8, 49 | ) 50 | 51 | mocker.patch( 52 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 53 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 54 | ) 55 | 56 | cf = Cluster( 57 | ClusterConfiguration( 58 | name="test", 59 | namespace="ns", 60 | write_to_file=True, 61 | appwrapper=False, 62 | local_queue="local-queue-default", 63 | ) 64 | ) 65 | mocker.patch( 66 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 67 | ) 68 | status, ready = cf.status() 69 | assert status == CodeFlareClusterStatus.UNKNOWN 70 | 
assert ready == False 71 | 72 | mocker.patch( 73 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=fake_ray 74 | ) 75 | 76 | status, ready = cf.status() 77 | assert status == CodeFlareClusterStatus.STARTING 78 | assert ready == False 79 | 80 | fake_ray.status = RayClusterStatus.FAILED 81 | status, ready = cf.status() 82 | assert status == CodeFlareClusterStatus.FAILED 83 | assert ready == False 84 | 85 | fake_ray.status = RayClusterStatus.UNHEALTHY 86 | status, ready = cf.status() 87 | assert status == CodeFlareClusterStatus.FAILED 88 | assert ready == False 89 | 90 | fake_ray.status = RayClusterStatus.READY 91 | status, ready = cf.status() 92 | assert status == CodeFlareClusterStatus.READY 93 | assert ready == True 94 | 95 | 96 | def rc_status_fields(group, version, namespace, plural, *args): 97 | assert group == "ray.io" 98 | assert version == "v1" 99 | assert namespace == "test-ns" 100 | assert plural == "rayclusters" 101 | assert args == tuple() 102 | return {"items": []} 103 | 104 | 105 | def test_rc_status(mocker): 106 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 107 | mocker.patch( 108 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 109 | side_effect=rc_status_fields, 110 | ) 111 | rc = _ray_cluster_status("test-rc", "test-ns") 112 | assert rc == None 113 | 114 | 115 | # Make sure to always keep this function last 116 | def test_cleanup(): 117 | os.remove(f"{aw_dir}test.yaml") 118 | -------------------------------------------------------------------------------- /target_users.md: -------------------------------------------------------------------------------- 1 | # CodeFlare Stack Target Users 2 | 3 | [Cluster Admin](#cluster-administrator) 4 | 5 | [Data Scientist I](#data-scientist-i) 6 | 7 | [Data Scientist II](#data-scientist-ii) 8 | 9 | 10 | 11 | ## Cluster Administrator 12 | 13 | * Quota Management 14 | * Gang-Scheduling for Distributed Compute 15 | * Job/Infrastructure Queuing 

I want to enable a team of data scientists to have self-serve, but limited, access to a shared pool of distributed compute resources such as GPUs for large-scale machine learning model training jobs. If the existing pool of resources is insufficient, I want my cluster to scale up (to a defined quota) to meet my users’ needs and scale back down automatically when their jobs have completed. I want these features to be made available through simple installation of generic modules via a user-friendly interface. I also want the ability to monitor the current queue of pending tasks, the utilization of active resources, and the progress of all current jobs visualized in a simple dashboard.

## Data Scientist I

* Training Mid-Size Models (fewer than 1,000 nodes)
* Fine-Tuning Existing Models
* Distributed Compute Framework

I need temporary access to a reasonably large set of GPU-enabled nodes on my team’s shared cluster for short-term experimentation, parallelizing my existing ML workflow, or fine-tuning existing large-scale models. I’d prefer to work from a notebook environment with access to a Python SDK that I can use to request the creation of Framework Clusters that I can distribute my workloads across. In addition to interactive experimentation work, I also want the ability to “fire-and-forget” longer-running ML jobs onto temporarily deployed Framework Clusters, with the ability to monitor these jobs while they are running and access to all of their artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard.

## Data Scientist II
* Training Foundation Models (1,000+ nodes)
* Distributed Compute Framework

I need temporary (but long-term) access to a massive amount of GPU-enabled infrastructure to train a foundation model. I want to be able to “fire-and-forget” my ML Job into this environment.
Due to the size and cost associated with this job, it has already been well tested and validated, so access to jupyter notebooks is unnecessary. I would prefer to write my job as a bash script leveraging a CLI, or as a python script leveraging an SDK. I need the ability to monitor the job while it is running, as well as access to all of its artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard. 32 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/tests/__init__.py -------------------------------------------------------------------------------- /tests/auth-test.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDOTCCAiGgAwIBAgIUENjaZDrvhc5uV3j7GI8deZJwc+YwDQYJKoZIhvcNAQEL 3 | BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM 4 | GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDAeFw0yNDA1MTMxMTE1NDZaFw0yNTA1 5 | MTMxMTE1NDZaMEUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw 6 | HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQwggEiMA0GCSqGSIb3DQEB 7 | AQUAA4IBDwAwggEKAoIBAQDEYYk81jvPijZXXeI9cByf5EIbOVaBTH7I51J9EKG5 8 | Y/KRXI43WgvVEiZ3jP8LJnSD79WhBiL6TgadQZje5ndroRYDM9vyqz1OUZapnOO+ 9 | yzl01y/qSsH8Kn88eLAzkE9HSu4QN9PuJtySyksjDFQJ6kjyE8ZHUSorur0FlLLf 10 | IToFgTuaIPDYjvFRchOCfZ7sV/MF7LxqFfFnaWOYvH41ZdvqJiRcVsMi+mYs9/I/ 11 | I72IMXwVnQDVnK8H84ntEmHNN6NoVuMKla0So4/wKcHJSCgS3axLI2Ka2aaaJo9K 12 | l2cn21NOyodF+DaSFy7qaGRXxoTQ2k9tUrSvxkBJvRmBAgMBAAGjITAfMB0GA1Ud 13 | DgQWBBRTK8mO5XMcmR+Xg/PVNFnvz4eubDANBgkqhkiG9w0BAQsFAAOCAQEAlZva 14 | 6ws3zRff7u0tWT2JJaE1uPqsuAdHtVvEyAMp2QvYfyrgADTroUTaSU4p6ppX/t7v 15 | ynHhuzR6UOVkuY0/CH1P3UUGrEPNOXT8i2BDwL+j4y2K2aRN8zU0Nu/IVePBhu+4 
16 | Jdt+3P7/MuwiCON5JukgxUYlQKhVhzFj7GOd2+Ca+fh8Siq3tkWDSN54+90fgylQ 17 | +74Yfya1NVabpzLqP3Isqu2XQhEVaBFvj8Yu0h83e3D8LeQToC3mVMF4yy5BZ9Ty 18 | K66YGlGQgszWEUFPEdsB8Dj/iJMhkWXuyc3u/w0s3t7rXeMYYgr+xrEeK+g0oyB5 19 | xeZuMjd567Znmu5oMw== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /tests/e2e/cluster_apply_kind_test.py: -------------------------------------------------------------------------------- 1 | from codeflare_sdk import Cluster, ClusterConfiguration 2 | import pytest 3 | from kubernetes import client 4 | 5 | from support import ( 6 | initialize_kubernetes_client, 7 | create_namespace, 8 | delete_namespace, 9 | get_ray_cluster, 10 | ) 11 | 12 | 13 | @pytest.mark.kind 14 | class TestRayClusterApply: 15 | def setup_method(self): 16 | initialize_kubernetes_client(self) 17 | 18 | def teardown_method(self): 19 | delete_namespace(self) 20 | 21 | def test_cluster_apply(self): 22 | self.setup_method() 23 | create_namespace(self) 24 | 25 | cluster_name = "test-cluster-apply" 26 | namespace = self.namespace 27 | 28 | # Initial configuration with 1 worker 29 | initial_config = ClusterConfiguration( 30 | name=cluster_name, 31 | namespace=namespace, 32 | num_workers=1, 33 | head_cpu_requests="500m", 34 | head_cpu_limits="1", 35 | head_memory_requests="1Gi", 36 | head_memory_limits="2Gi", 37 | worker_cpu_requests="500m", 38 | worker_cpu_limits="1", 39 | worker_memory_requests="1Gi", 40 | worker_memory_limits="2Gi", 41 | write_to_file=True, 42 | verify_tls=False, 43 | ) 44 | 45 | # Create the cluster 46 | cluster = Cluster(initial_config) 47 | cluster.apply() 48 | 49 | # Wait for the cluster to be ready 50 | cluster.wait_ready() 51 | status = cluster.status() 52 | assert status["ready"], f"Cluster {cluster_name} is not ready: {status}" 53 | 54 | # Verify the cluster is created 55 | ray_cluster = get_ray_cluster(cluster_name, namespace) 56 | assert ray_cluster is not None, "Cluster was not created 
successfully" 57 | assert ( 58 | ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1 59 | ), "Initial worker count does not match" 60 | 61 | # Update configuration with 3 workers 62 | updated_config = ClusterConfiguration( 63 | name=cluster_name, 64 | namespace=namespace, 65 | num_workers=2, 66 | head_cpu_requests="500m", 67 | head_cpu_limits="1", 68 | head_memory_requests="1Gi", 69 | head_memory_limits="2Gi", 70 | worker_cpu_requests="500m", 71 | worker_cpu_limits="1", 72 | worker_memory_requests="1Gi", 73 | worker_memory_limits="2Gi", 74 | write_to_file=True, 75 | verify_tls=False, 76 | ) 77 | 78 | # Apply the updated configuration 79 | cluster.config = updated_config 80 | cluster.apply() 81 | 82 | # Wait for the updated cluster to be ready 83 | cluster.wait_ready() 84 | updated_status = cluster.status() 85 | assert updated_status[ 86 | "ready" 87 | ], f"Cluster {cluster_name} is not ready after update: {updated_status}" 88 | 89 | # Verify the cluster is updated 90 | updated_ray_cluster = get_ray_cluster(cluster_name, namespace) 91 | assert ( 92 | updated_ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 2 93 | ), "Worker count was not updated" 94 | 95 | # Clean up 96 | cluster.down() 97 | ray_cluster = get_ray_cluster(cluster_name, namespace) 98 | assert ray_cluster is None, "Cluster was not deleted successfully" 99 | 100 | def test_apply_invalid_update(self): 101 | self.setup_method() 102 | create_namespace(self) 103 | 104 | cluster_name = "test-cluster-apply-invalid" 105 | namespace = self.namespace 106 | 107 | # Initial configuration 108 | initial_config = ClusterConfiguration( 109 | name=cluster_name, 110 | namespace=namespace, 111 | num_workers=1, 112 | head_cpu_requests="500m", 113 | head_cpu_limits="1", 114 | head_memory_requests="1Gi", 115 | head_memory_limits="2Gi", 116 | worker_cpu_requests="500m", 117 | worker_cpu_limits="1", 118 | worker_memory_requests="1Gi", 119 | worker_memory_limits="2Gi", 120 | write_to_file=True, 121 | 
verify_tls=False, 122 | ) 123 | 124 | # Create the cluster 125 | cluster = Cluster(initial_config) 126 | cluster.apply() 127 | 128 | # Wait for the cluster to be ready 129 | cluster.wait_ready() 130 | status = cluster.status() 131 | assert status["ready"], f"Cluster {cluster_name} is not ready: {status}" 132 | 133 | # Update with an invalid configuration (e.g., immutable field change) 134 | invalid_config = ClusterConfiguration( 135 | name=cluster_name, 136 | namespace=namespace, 137 | num_workers=2, 138 | head_cpu_requests="1", 139 | head_cpu_limits="2", # Changing CPU limits (immutable) 140 | head_memory_requests="1Gi", 141 | head_memory_limits="2Gi", 142 | worker_cpu_requests="500m", 143 | worker_cpu_limits="1", 144 | worker_memory_requests="1Gi", 145 | worker_memory_limits="2Gi", 146 | write_to_file=True, 147 | verify_tls=False, 148 | ) 149 | 150 | # Try to apply the invalid configuration and expect failure 151 | cluster.config = invalid_config 152 | with pytest.raises(RuntimeError, match="Immutable fields detected"): 153 | cluster.apply() 154 | 155 | # Clean up 156 | cluster.down() 157 | -------------------------------------------------------------------------------- /tests/e2e/heterogeneous_clusters_kind_test.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | import time 3 | from codeflare_sdk import ( 4 | Cluster, 5 | ClusterConfiguration, 6 | ) 7 | 8 | from codeflare_sdk.common.kueue.kueue import list_local_queues 9 | 10 | import pytest 11 | 12 | from support import * 13 | 14 | 15 | @pytest.mark.skip(reason="Skipping heterogenous cluster kind test") 16 | @pytest.mark.kind 17 | class TestHeterogeneousClustersKind: 18 | def setup_method(self): 19 | initialize_kubernetes_client(self) 20 | 21 | def teardown_method(self): 22 | delete_namespace(self) 23 | delete_kueue_resources(self) 24 | 25 | @pytest.mark.nvidia_gpu 26 | def test_heterogeneous_clusters(self): 27 | create_namespace(self) 28 | 
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        # One cluster per resource flavor: submit to that flavor's local queue
        # and verify the pod actually landed on a node carrying the flavor's labels.
        for flavor in self.resource_flavors:
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    name=cluster_name,
                    namespace=self.namespace,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=2,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=1,
                    worker_memory_limits=4,
                    worker_extended_resource_requests={
                        gpu_resource_name: number_of_gpus
                    },
                    write_to_file=True,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.up()
            # Fixed sleeps to let scheduling settle before inspecting the pod.
            sleep(5)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            sleep(5)
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
--------------------------------------------------------------------------------
/tests/e2e/heterogeneous_clusters_oauth_test.py:
--------------------------------------------------------------------------------
from time import sleep
import time
from codeflare_sdk import (
    Cluster,
    ClusterConfiguration,
    TokenAuthentication,
)

from codeflare_sdk.common.kueue.kueue import list_local_queues

import pytest

from support import *


@pytest.mark.openshift
class TestHeterogeneousClustersOauth:
    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_heterogeneous_clusters(self):
        create_namespace(self)
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        # OpenShift variant of the kind test above: authenticates via an
        # oc-issued token before creating flavored clusters.
        ray_image = get_ray_image()

        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        for flavor in self.resource_flavors:
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    namespace=self.namespace,
                    name=cluster_name,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=1,
                    worker_memory_limits=4,
                    image=ray_image,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.up()
            sleep(5)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            sleep(5)
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
--------------------------------------------------------------------------------
/tests/e2e/install-codeflare-sdk.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Installs the codeflare-sdk (with test/docs extras) via Poetry from a sibling
# checkout.  NOTE(review): failures in cd/pip/poetry are not checked; consider
# `set -euo pipefail` so a failed step aborts the install.

cd ..

# Install Poetry and configure virtualenvs
pip install poetry
poetry config virtualenvs.create false

cd codeflare-sdk

# Lock dependencies and install them
poetry lock
poetry install --with test,docs

# Return to the workdir
cd ..
17 | cd workdir 18 | -------------------------------------------------------------------------------- /tests/e2e/local_interactive_sdk_kind_test.py: -------------------------------------------------------------------------------- 1 | from codeflare_sdk import ( 2 | Cluster, 3 | ClusterConfiguration, 4 | generate_cert, 5 | ) 6 | 7 | import pytest 8 | import ray 9 | import math 10 | import subprocess 11 | 12 | from support import * 13 | 14 | 15 | @pytest.mark.kind 16 | class TestRayLocalInteractiveKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | self.port_forward_process = None 20 | 21 | def cleanup_port_forward(self): 22 | if self.port_forward_process: 23 | self.port_forward_process.terminate() 24 | self.port_forward_process.wait(timeout=10) 25 | self.port_forward_process = None 26 | 27 | def teardown_method(self): 28 | self.cleanup_port_forward() 29 | delete_namespace(self) 30 | delete_kueue_resources(self) 31 | 32 | def test_local_interactives(self): 33 | self.setup_method() 34 | create_namespace(self) 35 | create_kueue_resources(self) 36 | self.run_local_interactives() 37 | 38 | @pytest.mark.nvidia_gpu 39 | def test_local_interactives_nvidia_gpu(self): 40 | self.setup_method() 41 | create_namespace(self) 42 | create_kueue_resources(self) 43 | self.run_local_interactives(number_of_gpus=1) 44 | 45 | def run_local_interactives( 46 | self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 47 | ): 48 | cluster_name = "test-ray-cluster-li" 49 | 50 | ray.shutdown() 51 | 52 | cluster = Cluster( 53 | ClusterConfiguration( 54 | name=cluster_name, 55 | namespace=self.namespace, 56 | num_workers=1, 57 | head_cpu_requests="500m", 58 | head_cpu_limits="500m", 59 | worker_cpu_requests="500m", 60 | worker_cpu_limits=1, 61 | worker_memory_requests=1, 62 | worker_memory_limits=4, 63 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 64 | verify_tls=False, 65 | ) 66 | ) 67 | 68 | cluster.up() 69 | 70 | cluster.wait_ready() 71 
| cluster.status() 72 | 73 | generate_cert.generate_tls_cert(cluster_name, self.namespace) 74 | generate_cert.export_env(cluster_name, self.namespace) 75 | 76 | print(cluster.local_client_url()) 77 | 78 | @ray.remote(num_gpus=number_of_gpus / 2) 79 | def heavy_calculation_part(num_iterations): 80 | result = 0.0 81 | for i in range(num_iterations): 82 | for j in range(num_iterations): 83 | for k in range(num_iterations): 84 | result += math.sin(i) * math.cos(j) * math.tan(k) 85 | return result 86 | 87 | @ray.remote(num_gpus=number_of_gpus / 2) 88 | def heavy_calculation(num_iterations): 89 | results = ray.get( 90 | [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)] 91 | ) 92 | return sum(results) 93 | 94 | # Attempt to port forward 95 | try: 96 | local_port = "20001" 97 | ray_client_port = "10001" 98 | 99 | port_forward_cmd = [ 100 | "kubectl", 101 | "port-forward", 102 | "-n", 103 | self.namespace, 104 | f"svc/{cluster_name}-head-svc", 105 | f"{local_port}:{ray_client_port}", 106 | ] 107 | self.port_forward_process = subprocess.Popen( 108 | port_forward_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 109 | ) 110 | 111 | client_url = f"ray://localhost:{local_port}" 112 | cluster.status() 113 | 114 | ray.init(address=client_url, logging_level="INFO") 115 | 116 | ref = heavy_calculation.remote(3000) 117 | result = ray.get(ref) 118 | assert result == 1789.4644387076714 119 | ray.cancel(ref) 120 | ray.shutdown() 121 | 122 | cluster.down() 123 | finally: 124 | self.cleanup_port_forward() 125 | -------------------------------------------------------------------------------- /tests/e2e/local_interactive_sdk_oauth_test.py: -------------------------------------------------------------------------------- 1 | from codeflare_sdk import ( 2 | Cluster, 3 | ClusterConfiguration, 4 | TokenAuthentication, 5 | generate_cert, 6 | ) 7 | 8 | import math 9 | import pytest 10 | import ray 11 | 12 | from support import * 13 | 14 | 15 | @pytest.mark.openshift 
class TestRayLocalInteractiveOauth:
    """E2E test: local interactive Ray client against an OpenShift-hosted cluster."""

    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        # NOTE: pytest invokes setup_method() automatically before each test;
        # the previous explicit self.setup_method() call here was redundant
        # (it re-initialized the Kubernetes client a second time) and removed.
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    def run_local_interactives(self):
        """Bring up a Ray cluster, connect with the local interactive client,
        run a small distributed computation, and tear the cluster down.

        Raises AssertionError if the distributed result deviates from the
        expected value beyond floating-point tolerance.
        """
        ray_image = get_ray_image()

        # Authenticate against the OpenShift cluster as the current oc user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        cluster_name = "test-ray-cluster-li"

        cluster = Cluster(
            ClusterConfiguration(
                namespace=self.namespace,
                name=cluster_name,
                num_workers=1,
                worker_cpu_requests=1,
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                image=ray_image,
                verify_tls=False,
            )
        )
        cluster.up()
        cluster.wait_ready()

        # TLS material so the local Ray client can reach the remote head node.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        ray.shutdown()
        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

        @ray.remote
        def heavy_calculation_part(num_iterations):
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote
        def heavy_calculation(num_iterations):
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        ref = heavy_calculation.remote(3000)
        result = ray.get(ref)
        # BUG FIX: exact `==` on a float accumulated across 30 remote tasks is
        # brittle (libm / platform rounding differences); compare with a
        # relative tolerance instead.
        assert result == pytest.approx(1789.4644387076714)
        ray.cancel(ref)
        ray.shutdown()

        cluster.down()
-------------------------------------------------------------------------------- /tests/e2e/minio_deployment.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: minio-pvc 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 20Gi 12 | volumeMode: Filesystem 13 | --- 14 | kind: Secret 15 | apiVersion: v1 16 | metadata: 17 | name: minio-secret 18 | stringData: 19 | # change the username and password to your own values. 20 | # ensure that the user is at least 3 characters long and the password at least 8 21 | minio_root_user: minio 22 | minio_root_password: minio123 23 | --- 24 | kind: Deployment 25 | apiVersion: apps/v1 26 | metadata: 27 | name: minio 28 | spec: 29 | replicas: 1 30 | selector: 31 | matchLabels: 32 | app: minio 33 | template: 34 | metadata: 35 | creationTimestamp: null 36 | labels: 37 | app: minio 38 | spec: 39 | volumes: 40 | - name: data 41 | persistentVolumeClaim: 42 | claimName: minio-pvc 43 | containers: 44 | - resources: 45 | limits: 46 | cpu: 250m 47 | memory: 1Gi 48 | requests: 49 | cpu: 20m 50 | memory: 100Mi 51 | readinessProbe: 52 | tcpSocket: 53 | port: 9000 54 | initialDelaySeconds: 5 55 | timeoutSeconds: 1 56 | periodSeconds: 5 57 | successThreshold: 1 58 | failureThreshold: 3 59 | terminationMessagePath: /dev/termination-log 60 | name: minio 61 | livenessProbe: 62 | tcpSocket: 63 | port: 9000 64 | initialDelaySeconds: 30 65 | timeoutSeconds: 1 66 | periodSeconds: 5 67 | successThreshold: 1 68 | failureThreshold: 3 69 | env: 70 | - name: MINIO_ROOT_USER 71 | valueFrom: 72 | secretKeyRef: 73 | name: minio-secret 74 | key: minio_root_user 75 | - name: MINIO_ROOT_PASSWORD 76 | valueFrom: 77 | secretKeyRef: 78 | name: minio-secret 79 | key: minio_root_password 80 | ports: 81 | - containerPort: 9000 82 | protocol: TCP 83 | - containerPort: 9090 84 | protocol: TCP 85 | imagePullPolicy: 
IfNotPresent 86 | volumeMounts: 87 | - name: data 88 | mountPath: /data 89 | subPath: minio 90 | terminationMessagePolicy: File 91 | image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z 92 | args: 93 | - server 94 | - /data 95 | - --console-address 96 | - :9090 97 | restartPolicy: Always 98 | terminationGracePeriodSeconds: 30 99 | dnsPolicy: ClusterFirst 100 | securityContext: {} 101 | schedulerName: default-scheduler 102 | strategy: 103 | type: Recreate 104 | revisionHistoryLimit: 10 105 | progressDeadlineSeconds: 600 106 | --- 107 | kind: Service 108 | apiVersion: v1 109 | metadata: 110 | name: minio-service 111 | spec: 112 | ipFamilies: 113 | - IPv4 114 | ports: 115 | - name: api 116 | protocol: TCP 117 | port: 9000 118 | targetPort: 9000 119 | - name: ui 120 | protocol: TCP 121 | port: 9090 122 | targetPort: 9090 123 | internalTrafficPolicy: Cluster 124 | type: ClusterIP 125 | ipFamilyPolicy: SingleStack 126 | sessionAffinity: None 127 | selector: 128 | app: minio 129 | -------------------------------------------------------------------------------- /tests/e2e/mnist_pip_requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.9.5 2 | torchmetrics==0.9.1 3 | torchvision==0.20.1 4 | minio 5 | -------------------------------------------------------------------------------- /tests/e2e/mnist_raycluster_sdk_aw_kind_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from time import sleep 4 | 5 | from codeflare_sdk import Cluster, ClusterConfiguration 6 | from codeflare_sdk.ray.client import RayJobClient 7 | 8 | import pytest 9 | 10 | from support import * 11 | 12 | # This test creates an AppWrapper containing a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster 13 | 14 | 15 | @pytest.mark.kind 16 | class TestRayClusterSDKAppWrapperKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 
| 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | appwrapper=True, 55 | ) 56 | ) 57 | 58 | cluster.up() 59 | 60 | cluster.status() 61 | 62 | cluster.wait_ready() 63 | 64 | cluster.status() 65 | 66 | cluster.details() 67 | 68 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 84 | "env_vars": get_setup_env_variables(ACCELERATOR=accelerator), 85 | }, 86 | 
entrypoint_num_gpus=number_of_gpus, 87 | ) 88 | print(f"Submitted job with ID: {submission_id}") 89 | done = False 90 | time = 0 91 | timeout = 900 92 | while not done: 93 | status = client.get_job_status(submission_id) 94 | if status.is_terminal(): 95 | break 96 | if not done: 97 | print(status) 98 | if timeout and time >= timeout: 99 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 100 | sleep(5) 101 | time += 5 102 | 103 | logs = client.get_job_logs(submission_id) 104 | print(logs) 105 | 106 | self.assert_job_completion(status) 107 | 108 | client.delete_job(submission_id) 109 | 110 | def assert_job_completion(self, status): 111 | if status == "SUCCEEDED": 112 | print(f"Job has completed: '{status}'") 113 | assert True 114 | else: 115 | print(f"Job has completed: '{status}'") 116 | assert False 117 | -------------------------------------------------------------------------------- /tests/e2e/mnist_raycluster_sdk_kind_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from time import sleep 4 | 5 | from codeflare_sdk import Cluster, ClusterConfiguration 6 | from codeflare_sdk.ray.client import RayJobClient 7 | 8 | import pytest 9 | 10 | from support import * 11 | 12 | # This test creates a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster 13 | 14 | 15 | @pytest.mark.kind 16 | class TestRayClusterSDKKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | 
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | ) 55 | ) 56 | 57 | cluster.up() 58 | 59 | cluster.status() 60 | 61 | cluster.wait_ready() 62 | 63 | cluster.status() 64 | 65 | cluster.details() 66 | 67 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 68 | 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 84 | "env_vars": get_setup_env_variables(ACCELERATOR=accelerator), 85 | }, 86 | entrypoint_num_gpus=number_of_gpus, 87 | ) 88 | print(f"Submitted job with ID: {submission_id}") 89 | done = False 90 | time = 0 91 | timeout = 900 92 | while not done: 93 | status = client.get_job_status(submission_id) 94 | if status.is_terminal(): 95 | break 96 | if not done: 97 | print(status) 98 | if timeout and time >= timeout: 99 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 100 | sleep(5) 101 | time += 5 102 | 103 | logs = client.get_job_logs(submission_id) 104 | print(logs) 105 | 106 | 
self.assert_job_completion(status) 107 | 108 | client.delete_job(submission_id) 109 | 110 | def assert_job_completion(self, status): 111 | if status == "SUCCEEDED": 112 | print(f"Job has completed: '{status}'") 113 | assert True 114 | else: 115 | print(f"Job has completed: '{status}'") 116 | assert False 117 | -------------------------------------------------------------------------------- /tests/e2e/mnist_raycluster_sdk_oauth_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from time import sleep 4 | 5 | from codeflare_sdk import ( 6 | Cluster, 7 | ClusterConfiguration, 8 | TokenAuthentication, 9 | ) 10 | from codeflare_sdk.ray.client import RayJobClient 11 | 12 | import pytest 13 | 14 | from support import * 15 | 16 | # This test creates a Ray Cluster and covers the Ray Job submission with authentication and without authentication functionality on Openshift Cluster 17 | 18 | 19 | @pytest.mark.openshift 20 | class TestRayClusterSDKOauth: 21 | def setup_method(self): 22 | initialize_kubernetes_client(self) 23 | 24 | def teardown_method(self): 25 | delete_namespace(self) 26 | delete_kueue_resources(self) 27 | 28 | def test_mnist_ray_cluster_sdk_auth(self): 29 | self.setup_method() 30 | create_namespace(self) 31 | create_kueue_resources(self) 32 | self.run_mnist_raycluster_sdk_oauth() 33 | 34 | def run_mnist_raycluster_sdk_oauth(self): 35 | ray_image = get_ray_image() 36 | 37 | auth = TokenAuthentication( 38 | token=run_oc_command(["whoami", "--show-token=true"]), 39 | server=run_oc_command(["whoami", "--show-server=true"]), 40 | skip_tls=True, 41 | ) 42 | auth.login() 43 | 44 | cluster = Cluster( 45 | ClusterConfiguration( 46 | name="mnist", 47 | namespace=self.namespace, 48 | num_workers=1, 49 | head_cpu_requests="500m", 50 | head_cpu_limits="500m", 51 | worker_cpu_requests=1, 52 | worker_cpu_limits=1, 53 | worker_memory_requests=1, 54 | worker_memory_limits=4, 55 | image=ray_image, 56 | 
write_to_file=True, 57 | verify_tls=False, 58 | ) 59 | ) 60 | 61 | cluster.up() 62 | 63 | cluster.status() 64 | 65 | cluster.wait_ready() 66 | 67 | cluster.status() 68 | 69 | cluster.details() 70 | 71 | self.assert_jobsubmit_withoutLogin(cluster) 72 | self.assert_jobsubmit_withlogin(cluster) 73 | assert_get_cluster_and_jobsubmit(self, "mnist") 74 | 75 | # Assertions 76 | 77 | def assert_jobsubmit_withoutLogin(self, cluster): 78 | dashboard_url = cluster.cluster_dashboard_uri() 79 | jobdata = { 80 | "entrypoint": "python mnist.py", 81 | "runtime_env": { 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 84 | "env_vars": get_setup_env_variables(), 85 | }, 86 | } 87 | try: 88 | response = requests.post( 89 | dashboard_url + "/api/jobs/", verify=False, json=jobdata 90 | ) 91 | if response.status_code == 403: 92 | assert True 93 | else: 94 | response.raise_for_status() 95 | assert False 96 | 97 | except Exception as e: 98 | print(f"An unexpected error occurred. 
Error: {e}") 99 | assert False 100 | 101 | def assert_jobsubmit_withlogin(self, cluster): 102 | auth_token = run_oc_command(["whoami", "--show-token=true"]) 103 | ray_dashboard = cluster.cluster_dashboard_uri() 104 | header = {"Authorization": f"Bearer {auth_token}"} 105 | client = RayJobClient(address=ray_dashboard, headers=header, verify=False) 106 | 107 | submission_id = client.submit_job( 108 | entrypoint="python mnist.py", 109 | runtime_env={ 110 | "working_dir": "./tests/e2e/", 111 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 112 | "env_vars": get_setup_env_variables(), 113 | }, 114 | entrypoint_num_cpus=1, 115 | ) 116 | print(f"Submitted job with ID: {submission_id}") 117 | done = False 118 | time = 0 119 | timeout = 900 120 | while not done: 121 | status = client.get_job_status(submission_id) 122 | if status.is_terminal(): 123 | break 124 | if not done: 125 | print(status) 126 | if timeout and time >= timeout: 127 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 128 | sleep(5) 129 | time += 5 130 | 131 | logs = client.get_job_logs(submission_id) 132 | print(logs) 133 | 134 | self.assert_job_completion(status) 135 | 136 | client.delete_job(submission_id) 137 | 138 | def assert_job_completion(self, status): 139 | if status == "SUCCEEDED": 140 | print(f"Job has completed: '{status}'") 141 | assert True 142 | else: 143 | print(f"Job has completed: '{status}'") 144 | assert False 145 | -------------------------------------------------------------------------------- /tests/e2e/mnist_rayjob.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from time import sleep 4 | 5 | from support import * 6 | 7 | from codeflare_sdk.ray.cluster.cluster import get_cluster 8 | from codeflare_sdk.ray.client import RayJobClient 9 | 10 | namespace = sys.argv[1] 11 | 12 | cluster = get_cluster("mnist", namespace) 13 | 14 | cluster.details() 15 | 16 | auth_token = run_oc_command(["whoami", 
"--show-token=true"]) 17 | ray_dashboard = cluster.cluster_dashboard_uri() 18 | header = {"Authorization": f"Bearer {auth_token}"} 19 | client = RayJobClient(address=ray_dashboard, headers=header, verify=True) 20 | 21 | # Submit the job 22 | submission_id = client.submit_job( 23 | entrypoint="python mnist.py", 24 | runtime_env={"working_dir": "/", "pip": "requirements.txt"}, 25 | ) 26 | print(f"Submitted job with ID: {submission_id}") 27 | done = False 28 | time = 0 29 | timeout = 900 30 | while not done: 31 | status = client.get_job_status(submission_id) 32 | if status.is_terminal(): 33 | break 34 | if not done: 35 | print(status) 36 | if timeout and time >= timeout: 37 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 38 | sleep(5) 39 | time += 5 40 | 41 | logs = client.get_job_logs(submission_id) 42 | print(logs) 43 | 44 | client.delete_job(submission_id) 45 | cluster.down() 46 | 47 | 48 | if not status == "SUCCEEDED": 49 | exit(1) 50 | else: 51 | exit(0) 52 | -------------------------------------------------------------------------------- /tests/e2e/mnist_sleep.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import time 16 | import torch 17 | import torch.nn as nn 18 | from torch.utils.data import DataLoader 19 | from torchvision import datasets, transforms 20 | 21 | 22 | # Define a simple neural network 23 | class NeuralNetwork(nn.Module): 24 | def __init__(self): 25 | super(NeuralNetwork, self).__init__() 26 | self.flatten = nn.Flatten() 27 | self.linear_relu_stack = nn.Sequential( 28 | nn.Linear(28 * 28, 512), 29 | nn.ReLU(), 30 | nn.Linear(512, 512), 31 | nn.ReLU(), 32 | nn.Linear(512, 10), 33 | ) 34 | 35 | def forward(self, x): 36 | x = self.flatten(x) 37 | logits = self.linear_relu_stack(x) 38 | return logits 39 | 40 | 41 | # Define the training function 42 | def train(): 43 | # Sleeping for 24 hours for upgrade test scenario 44 | print("Sleeping for 24 hours before starting the training for upgrade testing...") 45 | time.sleep(24 * 60 * 60) 46 | 47 | # Load dataset 48 | transform = transforms.Compose([transforms.ToTensor()]) 49 | train_dataset = datasets.FashionMNIST( 50 | root="./data", train=True, download=True, transform=transform 51 | ) 52 | train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) 53 | 54 | # Initialize the neural network, loss function, and optimizer 55 | model = NeuralNetwork() 56 | criterion = nn.CrossEntropyLoss() 57 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 58 | 59 | # Train the model 60 | num_epochs = 3 61 | for epoch in range(num_epochs): 62 | for inputs, labels in train_loader: 63 | optimizer.zero_grad() 64 | outputs = model(inputs) 65 | loss = criterion(outputs, labels) 66 | loss.backward() 67 | optimizer.step() 68 | print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") 69 | 70 | 71 | if __name__ == "__main__": 72 | train() 73 | -------------------------------------------------------------------------------- /tests/e2e/start_ray_cluster.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from time import sleep 5 
| 6 | from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration 7 | 8 | namespace = sys.argv[1] 9 | ray_image = os.getenv("RAY_IMAGE") 10 | 11 | cluster = Cluster( 12 | ClusterConfiguration( 13 | name="mnist", 14 | namespace=namespace, 15 | num_workers=1, 16 | head_cpu_requests="500m", 17 | head_cpu_limits="500m", 18 | head_memory_requests=2, 19 | head_memory_limits=2, 20 | worker_cpu_requests="500m", 21 | worker_cpu_limits=1, 22 | worker_memory_requests=1, 23 | worker_memory_limits=2, 24 | image=ray_image, 25 | appwrapper=True, 26 | ) 27 | ) 28 | 29 | cluster.up() 30 | 31 | cluster.status() 32 | 33 | cluster.wait_ready() 34 | 35 | cluster.status() 36 | 37 | cluster.details() 38 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/appwrapper/test-case-bad.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppsWrapper 3 | metadata: 4 | labels: 5 | orderedinstance: cpu.small_gpu.large 6 | nam: unit-test-cluster 7 | namspace: ns 8 | spec: 9 | components: 10 | - template: 11 | apiVersion: ray.io/v1 12 | kind: RayCluster 13 | metadata: 14 | labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: unit-test-cluster 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | rayStartParams: 32 | block: 'true' 33 | dashboard-host: 0.0.0.0 34 | num-gpus: '0' 35 | resources: '"{}"' 36 | serviceType: ClusterIP 37 | template: 38 | spec: 39 | containers: 40 | - env: 41 | - name: MY_POD_IP 42 | valueFrom: 43 | fieldRef: 44 | fieldPath: status.podIP 45 | image: "${image}" 46 | imagePullPolicy: IfNotPresent 47 | lifecycle: 48 | preStop: 49 | exec: 50 | command: 51 | - /bin/sh 52 | - -c 53 | - ray 
stop 54 | name: ray-head 55 | ports: 56 | - containerPort: 6379 57 | name: gcs 58 | - containerPort: 8265 59 | name: dashboard 60 | - containerPort: 10001 61 | name: client 62 | resources: 63 | limits: 64 | cpu: 2 65 | memory: 8G 66 | requests: 67 | cpu: 2 68 | memory: 8G 69 | rayVersion: 2.47.1 70 | workerGroupSpecs: 71 | - groupName: small-group-unit-test-cluster 72 | maxReplicas: 2 73 | minReplicas: 2 74 | rayStartParams: 75 | block: 'true' 76 | num-gpus: '7' 77 | resources: '"{}"' 78 | replicas: 2 79 | template: 80 | metadata: 81 | annotations: 82 | key: value 83 | labels: 84 | key: value 85 | spec: 86 | containers: 87 | - env: 88 | - name: MY_POD_IP 89 | valueFrom: 90 | fieldRef: 91 | fieldPath: status.podIP 92 | image: "${image}" 93 | lifecycle: 94 | preStop: 95 | exec: 96 | command: 97 | - /bin/sh 98 | - -c 99 | - ray stop 100 | name: machine-learning 101 | resources: 102 | limits: 103 | cpu: 4 104 | memory: 6G 105 | nvidia.com/gpu: 7 106 | requests: 107 | cpu: 3 108 | memory: 5G 109 | nvidia.com/gpu: 7 110 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/kueue/aw_kueue.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | labels: 5 | kueue.x-k8s.io/queue-name: local-queue-default 6 | name: unit-test-aw-kueue 7 | namespace: ns 8 | spec: 9 | components: 10 | - template: 11 | apiVersion: ray.io/v1 12 | kind: RayCluster 13 | metadata: 14 | labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: unit-test-aw-kueue 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | enableIngress: false 32 | rayStartParams: 33 | block: 'true' 34 | dashboard-host: 0.0.0.0 35 | 
num-gpus: '0' 36 | resources: '"{}"' 37 | serviceType: ClusterIP 38 | template: 39 | spec: 40 | containers: 41 | - image: "${image}" 42 | imagePullPolicy: Always 43 | lifecycle: 44 | preStop: 45 | exec: 46 | command: 47 | - /bin/sh 48 | - -c 49 | - ray stop 50 | name: ray-head 51 | ports: 52 | - containerPort: 6379 53 | name: gcs 54 | - containerPort: 8265 55 | name: dashboard 56 | - containerPort: 10001 57 | name: client 58 | resources: 59 | limits: 60 | cpu: 2 61 | memory: 8G 62 | requests: 63 | cpu: 2 64 | memory: 8G 65 | volumeMounts: 66 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 67 | name: odh-trusted-ca-cert 68 | subPath: odh-trusted-ca-bundle.crt 69 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 70 | name: odh-trusted-ca-cert 71 | subPath: odh-trusted-ca-bundle.crt 72 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 73 | name: odh-ca-cert 74 | subPath: odh-ca-bundle.crt 75 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 76 | name: odh-ca-cert 77 | subPath: odh-ca-bundle.crt 78 | env: 79 | - name: RAY_USAGE_STATS_ENABLED 80 | value: '0' 81 | volumes: 82 | - configMap: 83 | items: 84 | - key: ca-bundle.crt 85 | path: odh-trusted-ca-bundle.crt 86 | name: odh-trusted-ca-bundle 87 | optional: true 88 | name: odh-trusted-ca-cert 89 | - configMap: 90 | items: 91 | - key: odh-ca-bundle.crt 92 | path: odh-ca-bundle.crt 93 | name: odh-trusted-ca-bundle 94 | optional: true 95 | name: odh-ca-cert 96 | rayVersion: 2.47.1 97 | workerGroupSpecs: 98 | - groupName: small-group-unit-test-aw-kueue 99 | maxReplicas: 2 100 | minReplicas: 2 101 | rayStartParams: 102 | block: 'true' 103 | num-gpus: '0' 104 | resources: '"{}"' 105 | replicas: 2 106 | template: 107 | spec: 108 | containers: 109 | - image: "${image}" 110 | imagePullPolicy: Always 111 | lifecycle: 112 | preStop: 113 | exec: 114 | command: 115 | - /bin/sh 116 | - -c 117 | - ray stop 118 | name: machine-learning 119 | resources: 120 | limits: 121 | cpu: 4 122 | memory: 6G 123 | requests: 124 
| cpu: 3 125 | memory: 5G 126 | volumeMounts: 127 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 128 | name: odh-trusted-ca-cert 129 | subPath: odh-trusted-ca-bundle.crt 130 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 131 | name: odh-trusted-ca-cert 132 | subPath: odh-trusted-ca-bundle.crt 133 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 134 | name: odh-ca-cert 135 | subPath: odh-ca-bundle.crt 136 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 137 | name: odh-ca-cert 138 | subPath: odh-ca-bundle.crt 139 | env: 140 | - name: RAY_USAGE_STATS_ENABLED 141 | value: '0' 142 | volumes: 143 | - configMap: 144 | items: 145 | - key: ca-bundle.crt 146 | path: odh-trusted-ca-bundle.crt 147 | name: odh-trusted-ca-bundle 148 | optional: true 149 | name: odh-trusted-ca-cert 150 | - configMap: 151 | items: 152 | - key: odh-ca-bundle.crt 153 | path: odh-ca-bundle.crt 154 | name: odh-trusted-ca-bundle 155 | optional: true 156 | name: odh-ca-cert 157 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | labels: 5 | kueue.x-k8s.io/queue-name: local-queue-default 6 | name: unit-test-cluster-kueue 7 | namespace: ns 8 | spec: 9 | components: 10 | - template: 11 | apiVersion: ray.io/v1 12 | kind: RayCluster 13 | metadata: 14 | labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: unit-test-cluster-kueue 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | enableIngress: false 32 | rayStartParams: 33 | block: 'true' 34 | dashboard-host: 0.0.0.0 35 | num-gpus: '0' 36 | resources: 
'"{}"' 37 | serviceType: ClusterIP 38 | template: 39 | spec: 40 | containers: 41 | - image: "${image}" 42 | imagePullPolicy: Always 43 | lifecycle: 44 | preStop: 45 | exec: 46 | command: 47 | - /bin/sh 48 | - -c 49 | - ray stop 50 | name: ray-head 51 | ports: 52 | - containerPort: 6379 53 | name: gcs 54 | - containerPort: 8265 55 | name: dashboard 56 | - containerPort: 10001 57 | name: client 58 | resources: 59 | limits: 60 | cpu: 2 61 | memory: 8G 62 | requests: 63 | cpu: 2 64 | memory: 8G 65 | volumeMounts: 66 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 67 | name: odh-trusted-ca-cert 68 | subPath: odh-trusted-ca-bundle.crt 69 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 70 | name: odh-trusted-ca-cert 71 | subPath: odh-trusted-ca-bundle.crt 72 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 73 | name: odh-ca-cert 74 | subPath: odh-ca-bundle.crt 75 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 76 | name: odh-ca-cert 77 | subPath: odh-ca-bundle.crt 78 | env: 79 | - name: RAY_USAGE_STATS_ENABLED 80 | value: '0' 81 | volumes: 82 | - configMap: 83 | items: 84 | - key: ca-bundle.crt 85 | path: odh-trusted-ca-bundle.crt 86 | name: odh-trusted-ca-bundle 87 | optional: true 88 | name: odh-trusted-ca-cert 89 | - configMap: 90 | items: 91 | - key: odh-ca-bundle.crt 92 | path: odh-ca-bundle.crt 93 | name: odh-trusted-ca-bundle 94 | optional: true 95 | name: odh-ca-cert 96 | rayVersion: 2.47.1 97 | workerGroupSpecs: 98 | - groupName: small-group-unit-test-cluster-kueue 99 | maxReplicas: 2 100 | minReplicas: 2 101 | rayStartParams: 102 | block: 'true' 103 | num-gpus: '0' 104 | resources: '"{}"' 105 | replicas: 2 106 | template: 107 | spec: 108 | containers: 109 | - image: "${image}" 110 | imagePullPolicy: Always 111 | lifecycle: 112 | preStop: 113 | exec: 114 | command: 115 | - /bin/sh 116 | - -c 117 | - ray stop 118 | name: machine-learning 119 | resources: 120 | limits: 121 | cpu: 4 122 | memory: 6G 123 | requests: 124 | cpu: 3 125 | memory: 
5G 126 | volumeMounts: 127 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 128 | name: odh-trusted-ca-cert 129 | subPath: odh-trusted-ca-bundle.crt 130 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 131 | name: odh-trusted-ca-cert 132 | subPath: odh-trusted-ca-bundle.crt 133 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 134 | name: odh-ca-cert 135 | subPath: odh-ca-bundle.crt 136 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 137 | name: odh-ca-cert 138 | subPath: odh-ca-bundle.crt 139 | env: 140 | - name: RAY_USAGE_STATS_ENABLED 141 | value: '0' 142 | volumes: 143 | - configMap: 144 | items: 145 | - key: ca-bundle.crt 146 | path: odh-trusted-ca-bundle.crt 147 | name: odh-trusted-ca-bundle 148 | optional: true 149 | name: odh-trusted-ca-cert 150 | - configMap: 151 | items: 152 | - key: odh-ca-bundle.crt 153 | path: odh-ca-bundle.crt 154 | name: odh-trusted-ca-bundle 155 | optional: true 156 | name: odh-ca-cert 157 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/ray/default-appwrapper.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: default-appwrapper 5 | namespace: ns 6 | spec: 7 | components: 8 | - template: 9 | apiVersion: ray.io/v1 10 | kind: RayCluster 11 | metadata: 12 | labels: 13 | controller-tools.k8s.io: '1.0' 14 | name: default-appwrapper 15 | namespace: ns 16 | spec: 17 | autoscalerOptions: 18 | idleTimeoutSeconds: 60 19 | resources: 20 | limits: 21 | cpu: 500m 22 | memory: 512Mi 23 | requests: 24 | cpu: 500m 25 | memory: 512Mi 26 | upscalingMode: Default 27 | enableInTreeAutoscaling: false 28 | headGroupSpec: 29 | enableIngress: false 30 | rayStartParams: 31 | block: 'true' 32 | dashboard-host: 0.0.0.0 33 | num-gpus: '0' 34 | resources: '"{}"' 35 | serviceType: ClusterIP 36 | template: 37 | spec: 38 | containers: 39 | - image: "${image}" 
40 | imagePullPolicy: Always 41 | lifecycle: 42 | preStop: 43 | exec: 44 | command: 45 | - /bin/sh 46 | - -c 47 | - ray stop 48 | name: ray-head 49 | ports: 50 | - containerPort: 6379 51 | name: gcs 52 | - containerPort: 8265 53 | name: dashboard 54 | - containerPort: 10001 55 | name: client 56 | env: 57 | - name: RAY_USAGE_STATS_ENABLED 58 | value: '0' 59 | resources: 60 | limits: 61 | cpu: 2 62 | memory: 8G 63 | requests: 64 | cpu: 2 65 | memory: 8G 66 | volumeMounts: 67 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 68 | name: odh-trusted-ca-cert 69 | subPath: odh-trusted-ca-bundle.crt 70 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 71 | name: odh-trusted-ca-cert 72 | subPath: odh-trusted-ca-bundle.crt 73 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 74 | name: odh-ca-cert 75 | subPath: odh-ca-bundle.crt 76 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 77 | name: odh-ca-cert 78 | subPath: odh-ca-bundle.crt 79 | volumes: 80 | - configMap: 81 | items: 82 | - key: ca-bundle.crt 83 | path: odh-trusted-ca-bundle.crt 84 | name: odh-trusted-ca-bundle 85 | optional: true 86 | name: odh-trusted-ca-cert 87 | - configMap: 88 | items: 89 | - key: odh-ca-bundle.crt 90 | path: odh-ca-bundle.crt 91 | name: odh-trusted-ca-bundle 92 | optional: true 93 | name: odh-ca-cert 94 | rayVersion: 2.47.1 95 | workerGroupSpecs: 96 | - groupName: small-group-default-appwrapper 97 | maxReplicas: 1 98 | minReplicas: 1 99 | rayStartParams: 100 | block: 'true' 101 | num-gpus: '0' 102 | resources: '"{}"' 103 | replicas: 1 104 | template: 105 | spec: 106 | containers: 107 | - image: "${image}" 108 | imagePullPolicy: Always 109 | lifecycle: 110 | preStop: 111 | exec: 112 | command: 113 | - /bin/sh 114 | - -c 115 | - ray stop 116 | name: machine-learning 117 | env: 118 | - name: RAY_USAGE_STATS_ENABLED 119 | value: '0' 120 | resources: 121 | limits: 122 | cpu: 1 123 | memory: 2G 124 | requests: 125 | cpu: 1 126 | memory: 2G 127 | volumeMounts: 128 | - mountPath: 
/etc/pki/tls/certs/odh-trusted-ca-bundle.crt 129 | name: odh-trusted-ca-cert 130 | subPath: odh-trusted-ca-bundle.crt 131 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 132 | name: odh-trusted-ca-cert 133 | subPath: odh-trusted-ca-bundle.crt 134 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 135 | name: odh-ca-cert 136 | subPath: odh-ca-bundle.crt 137 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 138 | name: odh-ca-cert 139 | subPath: odh-ca-bundle.crt 140 | volumes: 141 | - configMap: 142 | items: 143 | - key: ca-bundle.crt 144 | path: odh-trusted-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-trusted-ca-cert 148 | - configMap: 149 | items: 150 | - key: odh-ca-bundle.crt 151 | path: odh-ca-bundle.crt 152 | name: odh-trusted-ca-bundle 153 | optional: true 154 | name: odh-ca-cert 155 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/ray/default-ray-cluster.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | name: default-cluster 7 | namespace: ns 8 | spec: 9 | autoscalerOptions: 10 | idleTimeoutSeconds: 60 11 | resources: 12 | limits: 13 | cpu: 500m 14 | memory: 512Mi 15 | requests: 16 | cpu: 500m 17 | memory: 512Mi 18 | upscalingMode: Default 19 | enableInTreeAutoscaling: false 20 | headGroupSpec: 21 | enableIngress: false 22 | rayStartParams: 23 | block: 'true' 24 | dashboard-host: 0.0.0.0 25 | num-gpus: '0' 26 | resources: '"{}"' 27 | serviceType: ClusterIP 28 | template: 29 | spec: 30 | containers: 31 | - image: "${image}" 32 | imagePullPolicy: Always 33 | lifecycle: 34 | preStop: 35 | exec: 36 | command: 37 | - /bin/sh 38 | - -c 39 | - ray stop 40 | name: ray-head 41 | ports: 42 | - containerPort: 6379 43 | name: gcs 44 | - containerPort: 8265 45 | name: dashboard 46 | - containerPort: 10001 47 | name: 
client 48 | env: 49 | - name: RAY_USAGE_STATS_ENABLED 50 | value: '0' 51 | resources: 52 | limits: 53 | cpu: 2 54 | memory: 8G 55 | requests: 56 | cpu: 2 57 | memory: 8G 58 | volumeMounts: 59 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 60 | name: odh-trusted-ca-cert 61 | subPath: odh-trusted-ca-bundle.crt 62 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 63 | name: odh-trusted-ca-cert 64 | subPath: odh-trusted-ca-bundle.crt 65 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 66 | name: odh-ca-cert 67 | subPath: odh-ca-bundle.crt 68 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 69 | name: odh-ca-cert 70 | subPath: odh-ca-bundle.crt 71 | volumes: 72 | - configMap: 73 | items: 74 | - key: ca-bundle.crt 75 | path: odh-trusted-ca-bundle.crt 76 | name: odh-trusted-ca-bundle 77 | optional: true 78 | name: odh-trusted-ca-cert 79 | - configMap: 80 | items: 81 | - key: odh-ca-bundle.crt 82 | path: odh-ca-bundle.crt 83 | name: odh-trusted-ca-bundle 84 | optional: true 85 | name: odh-ca-cert 86 | rayVersion: 2.47.1 87 | workerGroupSpecs: 88 | - groupName: small-group-default-cluster 89 | maxReplicas: 1 90 | minReplicas: 1 91 | rayStartParams: 92 | block: 'true' 93 | num-gpus: '0' 94 | resources: '"{}"' 95 | replicas: 1 96 | template: 97 | spec: 98 | containers: 99 | - image: "${image}" 100 | imagePullPolicy: Always 101 | lifecycle: 102 | preStop: 103 | exec: 104 | command: 105 | - /bin/sh 106 | - -c 107 | - ray stop 108 | name: machine-learning 109 | resources: 110 | limits: 111 | cpu: 1 112 | memory: 2G 113 | requests: 114 | cpu: 1 115 | memory: 2G 116 | env: 117 | - name: RAY_USAGE_STATS_ENABLED 118 | value: '0' 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: 
/etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | volumes: 133 | - configMap: 134 | items: 135 | - key: ca-bundle.crt 136 | path: odh-trusted-ca-bundle.crt 137 | name: odh-trusted-ca-bundle 138 | optional: true 139 | name: odh-trusted-ca-cert 140 | - configMap: 141 | items: 142 | - key: odh-ca-bundle.crt 143 | path: odh-ca-bundle.crt 144 | name: odh-trusted-ca-bundle 145 | optional: true 146 | name: odh-ca-cert 147 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-aw-a.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | labels: 5 | kueue.x-k8s.io/queue-name: local_default_queue 6 | name: test-cluster-a 7 | namespace: ns 8 | spec: 9 | components: 10 | - template: 11 | apiVersion: ray.io/v1 12 | kind: RayCluster 13 | metadata: 14 | labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: test-cluster-a 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | enableIngress: false 32 | rayStartParams: 33 | block: 'true' 34 | dashboard-host: 0.0.0.0 35 | num-gpus: '0' 36 | resources: '"{}"' 37 | serviceType: ClusterIP 38 | template: 39 | spec: 40 | containers: 41 | - image: "${image}" 42 | imagePullPolicy: IfNotPresent 43 | lifecycle: 44 | preStop: 45 | exec: 46 | command: 47 | - /bin/sh 48 | - -c 49 | - ray stop 50 | name: ray-head 51 | ports: 52 | - containerPort: 6379 53 | name: gcs 54 | - containerPort: 8265 55 | name: dashboard 56 | - containerPort: 10001 57 | name: client 58 | resources: 59 
| limits: 60 | cpu: 2 61 | memory: 8G 62 | requests: 63 | cpu: 2 64 | memory: 8G 65 | volumeMounts: 66 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 67 | name: odh-trusted-ca-cert 68 | subPath: odh-trusted-ca-bundle.crt 69 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 70 | name: odh-trusted-ca-cert 71 | subPath: odh-trusted-ca-bundle.crt 72 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 73 | name: odh-ca-cert 74 | subPath: odh-ca-bundle.crt 75 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 76 | name: odh-ca-cert 77 | subPath: odh-ca-bundle.crt 78 | imagePullSecrets: [] 79 | volumes: 80 | - configMap: 81 | items: 82 | - key: ca-bundle.crt 83 | path: odh-trusted-ca-bundle.crt 84 | name: odh-trusted-ca-bundle 85 | optional: true 86 | name: odh-trusted-ca-cert 87 | - configMap: 88 | items: 89 | - key: odh-ca-bundle.crt 90 | path: odh-ca-bundle.crt 91 | name: odh-trusted-ca-bundle 92 | optional: true 93 | name: odh-ca-cert 94 | rayVersion: 2.47.1 95 | workerGroupSpecs: 96 | - groupName: small-group-test-cluster-a 97 | maxReplicas: 1 98 | minReplicas: 1 99 | rayStartParams: 100 | block: 'true' 101 | num-gpus: '0' 102 | resources: '"{}"' 103 | replicas: 1 104 | template: 105 | metadata: 106 | annotations: 107 | key: value 108 | labels: 109 | key: value 110 | spec: 111 | containers: 112 | - image: "${image}" 113 | lifecycle: 114 | preStop: 115 | exec: 116 | command: 117 | - /bin/sh 118 | - -c 119 | - ray stop 120 | name: machine-learning 121 | resources: 122 | limits: 123 | cpu: 1 124 | memory: 2G 125 | requests: 126 | cpu: 1 127 | memory: 2G 128 | volumeMounts: 129 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 130 | name: odh-trusted-ca-cert 131 | subPath: odh-trusted-ca-bundle.crt 132 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 133 | name: odh-trusted-ca-cert 134 | subPath: odh-trusted-ca-bundle.crt 135 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 136 | name: odh-ca-cert 137 | subPath: odh-ca-bundle.crt 138 | 
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt 139 | name: odh-ca-cert 140 | subPath: odh-ca-bundle.crt 141 | imagePullSecrets: [] 142 | volumes: 143 | - configMap: 144 | items: 145 | - key: ca-bundle.crt 146 | path: odh-trusted-ca-bundle.crt 147 | name: odh-trusted-ca-bundle 148 | optional: true 149 | name: odh-trusted-ca-cert 150 | - configMap: 151 | items: 152 | - key: odh-ca-bundle.crt 153 | path: odh-ca-bundle.crt 154 | name: odh-trusted-ca-bundle 155 | optional: true 156 | name: odh-ca-cert 157 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-aw-b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | labels: 5 | kueue.x-k8s.io/queue-name: local_default_queue 6 | name: test-cluster-b 7 | namespace: ns 8 | spec: 9 | components: 10 | - template: 11 | apiVersion: ray.io/v1 12 | kind: RayCluster 13 | metadata: 14 | labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: test-cluster-b 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | enableIngress: false 32 | rayStartParams: 33 | block: 'true' 34 | dashboard-host: 0.0.0.0 35 | num-gpus: '0' 36 | resources: '"{}"' 37 | serviceType: ClusterIP 38 | template: 39 | spec: 40 | containers: 41 | - image: "${image}" 42 | imagePullPolicy: IfNotPresent 43 | lifecycle: 44 | preStop: 45 | exec: 46 | command: 47 | - /bin/sh 48 | - -c 49 | - ray stop 50 | name: ray-head 51 | ports: 52 | - containerPort: 6379 53 | name: gcs 54 | - containerPort: 8265 55 | name: dashboard 56 | - containerPort: 10001 57 | name: client 58 | resources: 59 | limits: 60 | cpu: 2 61 | memory: 8G 62 | requests: 63 | cpu: 2 64 | 
memory: 8G 65 | volumeMounts: 66 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 67 | name: odh-trusted-ca-cert 68 | subPath: odh-trusted-ca-bundle.crt 69 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 70 | name: odh-trusted-ca-cert 71 | subPath: odh-trusted-ca-bundle.crt 72 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 73 | name: odh-ca-cert 74 | subPath: odh-ca-bundle.crt 75 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 76 | name: odh-ca-cert 77 | subPath: odh-ca-bundle.crt 78 | imagePullSecrets: [] 79 | volumes: 80 | - configMap: 81 | items: 82 | - key: ca-bundle.crt 83 | path: odh-trusted-ca-bundle.crt 84 | name: odh-trusted-ca-bundle 85 | optional: true 86 | name: odh-trusted-ca-cert 87 | - configMap: 88 | items: 89 | - key: odh-ca-bundle.crt 90 | path: odh-ca-bundle.crt 91 | name: odh-trusted-ca-bundle 92 | optional: true 93 | name: odh-ca-cert 94 | rayVersion: 2.47.1 95 | workerGroupSpecs: 96 | - groupName: small-group-test-cluster-b 97 | maxReplicas: 1 98 | minReplicas: 1 99 | rayStartParams: 100 | block: 'true' 101 | num-gpus: '0' 102 | resources: '"{}"' 103 | replicas: 1 104 | template: 105 | metadata: 106 | annotations: 107 | key: value 108 | labels: 109 | key: value 110 | spec: 111 | containers: 112 | - image: "${image}" 113 | lifecycle: 114 | preStop: 115 | exec: 116 | command: 117 | - /bin/sh 118 | - -c 119 | - ray stop 120 | name: machine-learning 121 | resources: 122 | limits: 123 | cpu: 1 124 | memory: 2G 125 | requests: 126 | cpu: 1 127 | memory: 2G 128 | volumeMounts: 129 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 130 | name: odh-trusted-ca-cert 131 | subPath: odh-trusted-ca-bundle.crt 132 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 133 | name: odh-trusted-ca-cert 134 | subPath: odh-trusted-ca-bundle.crt 135 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 136 | name: odh-ca-cert 137 | subPath: odh-ca-bundle.crt 138 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 139 | name: odh-ca-cert 
140 | subPath: odh-ca-bundle.crt 141 | imagePullSecrets: [] 142 | volumes: 143 | - configMap: 144 | items: 145 | - key: ca-bundle.crt 146 | path: odh-trusted-ca-bundle.crt 147 | name: odh-trusted-ca-bundle 148 | optional: true 149 | name: odh-trusted-ca-cert 150 | - configMap: 151 | items: 152 | - key: odh-ca-bundle.crt 153 | path: odh-ca-bundle.crt 154 | name: odh-trusted-ca-bundle 155 | optional: true 156 | name: odh-ca-cert 157 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-rc-a.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-cluster-a 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | 
subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: [] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.47.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-cluster-a 88 | maxReplicas: 1 89 | minReplicas: 1 90 | rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - 
configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-rc-b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-rc-b 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: 
[] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.47.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-rc-b 88 | maxReplicas: 1 89 | minReplicas: 1 90 | rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | -------------------------------------------------------------------------------- /tests/upgrade/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/tests/upgrade/__init__.py -------------------------------------------------------------------------------- /tests/upgrade/raycluster_sdk_upgrade_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from time import sleep 3 | 4 | from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication 5 | from codeflare_sdk.ray.client import RayJobClient 6 | 7 | from tests.e2e.support import * 8 | from codeflare_sdk.ray.cluster.cluster import get_cluster 9 | 10 | from codeflare_sdk.common import _kube_api_error_handling 11 | 12 | namespace = "test-ns-rayupgrade" 13 | # Global variables for kueue resources 14 | cluster_queue = "cluster-queue-mnist" 15 | flavor = "default-flavor-mnist" 16 | local_queue = "local-queue-mnist" 17 | 18 | 19 | # Creates a Ray cluster 20 | class TestMNISTRayClusterUp: 21 | def setup_method(self): 22 | initialize_kubernetes_client(self) 23 | create_namespace_with_name(self, namespace) 24 | try: 25 | create_cluster_queue(self, cluster_queue, flavor) 26 | create_resource_flavor(self, flavor) 27 | create_local_queue(self, cluster_queue, local_queue) 28 | except Exception as e: 29 | delete_namespace(self) 30 | delete_kueue_resources(self) 31 | return _kube_api_error_handling(e) 32 | 33 | def test_mnist_ray_cluster_sdk_auth(self): 34 | self.run_mnist_raycluster_sdk_oauth() 35 | 36 | def run_mnist_raycluster_sdk_oauth(self): 37 | ray_image = get_ray_image() 38 | 39 | auth = TokenAuthentication( 40 | token=run_oc_command(["whoami", "--show-token=true"]), 41 | server=run_oc_command(["whoami", "--show-server=true"]), 42 | skip_tls=True, 43 | ) 44 | auth.login() 45 | 46 | cluster = Cluster( 47 | ClusterConfiguration( 48 | name="mnist", 49 | namespace=self.namespace, 50 | num_workers=1, 51 | 
head_cpu_requests=1, 52 | head_cpu_limits=1, 53 | head_memory_requests=6, 54 | head_memory_limits=8, 55 | worker_cpu_requests=1, 56 | worker_cpu_limits=1, 57 | worker_memory_requests=6, 58 | worker_memory_limits=8, 59 | image=ray_image, 60 | write_to_file=True, 61 | verify_tls=False, 62 | ) 63 | ) 64 | 65 | try: 66 | cluster.up() 67 | cluster.status() 68 | # wait for raycluster to be Ready 69 | cluster.wait_ready() 70 | cluster.status() 71 | # Check cluster details 72 | cluster.details() 73 | # Assert the cluster status is READY 74 | _, ready = cluster.status() 75 | assert ready 76 | 77 | except Exception as e: 78 | print(f"An unexpected error occurred. Error: ", e) 79 | delete_namespace(self) 80 | assert False, "Cluster is not ready!" 81 | 82 | 83 | class TestMnistJobSubmit: 84 | def setup_method(self): 85 | initialize_kubernetes_client(self) 86 | auth = TokenAuthentication( 87 | token=run_oc_command(["whoami", "--show-token=true"]), 88 | server=run_oc_command(["whoami", "--show-server=true"]), 89 | skip_tls=True, 90 | ) 91 | auth.login() 92 | self.namespace = namespace 93 | self.cluster = get_cluster("mnist", self.namespace) 94 | if not self.cluster: 95 | raise RuntimeError("TestRayClusterUp needs to be run before this test") 96 | 97 | def test_mnist_job_submission(self): 98 | self.assert_jobsubmit_withoutLogin(self.cluster) 99 | self.assert_jobsubmit_withlogin(self.cluster) 100 | 101 | # Assertions 102 | def assert_jobsubmit_withoutLogin(self, cluster): 103 | dashboard_url = cluster.cluster_dashboard_uri() 104 | try: 105 | RayJobClient(address=dashboard_url, verify=False) 106 | assert False 107 | except Exception as e: 108 | if e.response.status_code == 403: 109 | assert True 110 | else: 111 | print(f"An unexpected error occurred. 
Error: {e}") 112 | assert False 113 | 114 | def assert_jobsubmit_withlogin(self, cluster): 115 | auth_token = run_oc_command(["whoami", "--show-token=true"]) 116 | ray_dashboard = cluster.cluster_dashboard_uri() 117 | header = {"Authorization": f"Bearer {auth_token}"} 118 | client = RayJobClient(address=ray_dashboard, headers=header, verify=False) 119 | 120 | # Submit the job 121 | submission_id = client.submit_job( 122 | entrypoint="python mnist.py", 123 | runtime_env={ 124 | "working_dir": "./tests/e2e/", 125 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 126 | "env_vars": get_setup_env_variables(), 127 | }, 128 | ) 129 | print(f"Submitted job with ID: {submission_id}") 130 | done = False 131 | time = 0 132 | timeout = 900 133 | while not done: 134 | status = client.get_job_status(submission_id) 135 | if status.is_terminal(): 136 | break 137 | if not done: 138 | print(status) 139 | if timeout and time >= timeout: 140 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 141 | sleep(5) 142 | time += 5 143 | 144 | logs = client.get_job_logs(submission_id) 145 | print(logs) 146 | 147 | self.assert_job_completion(status) 148 | 149 | client.delete_job(submission_id) 150 | 151 | def assert_job_completion(self, status): 152 | if status == "SUCCEEDED": 153 | print(f"Job has completed: '{status}'") 154 | assert True 155 | else: 156 | print(f"Job has completed: '{status}'") 157 | assert False 158 | -------------------------------------------------------------------------------- /ui-tests/.yarnrc: -------------------------------------------------------------------------------- 1 | disable-self-update-check true 2 | ignore-optional true 3 | network-timeout "300000" 4 | registry "https://registry.npmjs.org/" 5 | -------------------------------------------------------------------------------- /ui-tests/jupyter_server_config.py: -------------------------------------------------------------------------------- 1 | from jupyterlab.galata import 
configure_jupyter_server 2 | 3 | configure_jupyter_server(c) 4 | 5 | # Uncomment to set server log level to debug level 6 | # c.ServerApp.log_level = "DEBUG" 7 | -------------------------------------------------------------------------------- /ui-tests/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@jupyter-widgets/ui-tests", 3 | "private": true, 4 | "version": "0.1.0", 5 | "description": "ipywidgets UI Tests", 6 | "scripts": { 7 | "start": "jupyter lab --config ./jupyter_server_config.py", 8 | "start:detached": "jlpm start&", 9 | "test": "npx playwright test", 10 | "test:debug": "PWDEBUG=1 npx playwright test", 11 | "test:report": "http-server ./playwright-report -a localhost -o", 12 | "test:update": "npx playwright test --update-snapshots", 13 | "deduplicate": "jlpm && yarn-deduplicate -s fewer --fail" 14 | }, 15 | "author": "Project Jupyter", 16 | "license": "BSD-3-Clause", 17 | "devDependencies": { 18 | "@jupyterlab/galata": "^5.3.0", 19 | "@playwright/test": "^1.49.0", 20 | "yarn-deduplicate": "^6.0.1" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ui-tests/playwright.config.js: -------------------------------------------------------------------------------- 1 | const baseConfig = require('@jupyterlab/galata/lib/playwright-config'); 2 | 3 | module.exports = { 4 | ...baseConfig, 5 | timeout: 600000, 6 | webServer: { 7 | command: 'yarn start', 8 | url: 'http://localhost:8888/lab', 9 | timeout: 120 * 1000, 10 | reuseExistingServer: !process.env.CI, 11 | }, 12 | retries: 0, 13 | }; 14 | -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/f9f86719e09e56995efa5ad03facb21846041574/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png 
--------------------------------------------------------------------------------