├── tests ├── __init__.py ├── e2e │ ├── rayjob │ │ └── __init__.py │ ├── mnist_pip_requirements.txt │ ├── install-codeflare-sdk.sh │ ├── start_ray_cluster.py │ ├── mnist_rayjob.py │ ├── mnist_sleep.py │ ├── heterogeneous_clusters_kind_test.py │ ├── local_interactive_sdk_oauth_test.py │ ├── heterogeneous_clusters_oauth_test.py │ ├── minio_deployment.yaml │ ├── mnist_raycluster_sdk_kind_test.py │ ├── mnist_raycluster_sdk_aw_kind_test.py │ ├── cluster_apply_kind_test.py │ └── local_interactive_sdk_kind_test.py ├── upgrade │ ├── __init__.py │ └── conftest.py ├── e2e_v2 │ ├── security │ │ ├── __init__.py │ │ ├── test_mtls.py │ │ └── test_network_policies.py │ ├── upgrade │ │ ├── __init__.py │ │ └── conftest.py │ ├── cluster_management │ │ ├── __init__.py │ │ ├── creation │ │ │ ├── __init__.py │ │ │ ├── test_cluster_kueue.py │ │ │ └── test_cluster_creation.py │ │ ├── configuration │ │ │ ├── __init__.py │ │ │ ├── test_advanced.py │ │ │ ├── test_images.py │ │ │ ├── test_resources.py │ │ │ ├── test_volumes.py │ │ │ └── test_heterogeneous.py │ │ └── interactive │ │ │ ├── __init__.py │ │ │ ├── test_remote.py │ │ │ └── test_in_cluster.py │ ├── job_submission │ │ ├── __init__.py │ │ ├── rayjob_client │ │ │ ├── __init__.py │ │ │ ├── test_remote.py │ │ │ └── test_in_cluster.py │ │ └── rayjob_cr │ │ │ ├── __init__.py │ │ │ └── test_lifecycled_cluster.py │ ├── kueue_integration │ │ ├── __init__.py │ │ ├── test_admission.py │ │ ├── test_queueing.py │ │ └── test_resource_flavors.py │ ├── utils │ │ ├── __init__.py │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ ├── cpu_script.py │ │ │ └── gpu_script.py │ │ └── in_cluster │ │ │ ├── __init__.py │ │ │ └── setup.py │ ├── __init__.py │ └── ui │ │ ├── __init__.py │ │ └── pages │ │ └── __init__.py ├── ui │ ├── __init__.py │ └── pages │ │ └── __init__.py ├── auth-test.crt └── test_cluster_yamls │ ├── appwrapper │ └── test-case-bad.yaml │ └── support_clusters │ ├── test-rc-b.yaml │ └── test-rc-a.yaml ├── .gitattributes ├── src └── codeflare_sdk │ 
├── vendored │ ├── python_client │ │ ├── utils │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── constants.py │ ├── .gitignore │ ├── __init__.py │ ├── python_client_test │ │ ├── README.md │ │ └── test_director.py │ ├── pyproject.toml │ └── examples │ │ ├── use-builder.py │ │ ├── use-director.py │ │ └── use-utils.py │ ├── ray │ ├── client │ │ └── __init__.py │ ├── appwrapper │ │ ├── __init__.py │ │ ├── status.py │ │ ├── test_awload.py │ │ ├── awload.py │ │ └── test_status.py │ ├── rayjobs │ │ ├── __init__.py │ │ ├── test │ │ │ └── conftest.py │ │ ├── status.py │ │ └── pretty_print.py │ ├── cluster │ │ ├── __init__.py │ │ ├── status.py │ │ ├── test_status.py │ │ └── test_build_ray_cluster.py │ └── __init__.py │ ├── common │ ├── widgets │ │ └── __init__.py │ ├── kueue │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── k8s_utils.py │ │ ├── demos.py │ │ ├── utils.py │ │ └── test_demos.py │ ├── __init__.py │ └── kubernetes_cluster │ │ ├── __init__.py │ │ └── kube_api_helpers.py │ └── __init__.py ├── assets └── images │ └── sdk-diagram.png ├── demo-notebooks ├── additional-demos │ ├── batch-inference │ │ ├── requirements.txt │ │ └── simple_batch_inf.py │ ├── requirements.txt │ └── remote_ray_job_client.ipynb └── guided-demos │ ├── requirements.txt │ ├── preview_nbs │ └── requirements.txt │ ├── notebook-ex-outputs │ └── requirements.txt │ ├── download_mnist_datasets.py │ ├── mnist_fashion.py │ ├── 5_submit_rayjob_cr.ipynb │ └── 3_widget_example.ipynb ├── docs ├── images │ ├── codeflare_sdk.png │ └── codeflare_stack_arch.png ├── sphinx │ ├── user-docs │ │ ├── images │ │ │ ├── ui-buttons.png │ │ │ └── ui-view-clusters.png │ │ ├── ui-widgets.rst │ │ ├── authentication.rst │ │ ├── s3-compatible-storage.rst │ │ ├── setup-kueue.rst │ │ └── ray-cluster-interaction.rst │ ├── Makefile │ ├── make.bat │ ├── index.rst │ └── conf.py └── generate-documentation.md ├── ui-tests ├── .yarnrc ├── jupyter_server_config.py ├── tests │ └── 
widget_notebook_example.test.ts-snapshots │ │ ├── widgets-cell-0-linux.png │ │ ├── widgets-cell-2-linux.png │ │ ├── widgets-cell-3-linux.png │ │ ├── widgets-cell-4-linux.png │ │ └── widgets-cell-5-linux.png ├── playwright.config.js └── package.json ├── images └── tests │ └── entrypoint.sh ├── .github ├── build │ ├── README.md │ └── Containerfile ├── workflows │ ├── pre-commit.yaml │ ├── snyk-security.yaml │ ├── dependabot-labeler.yaml │ ├── unit-tests.yml │ ├── publish-documentation.yaml │ ├── build-test-image.yaml │ ├── coverage-badge.yaml │ └── ui_notebooks_test.yaml ├── resources │ ├── wait_for_job_cell.json │ └── minio_remote_config_cell.json └── dependabot.yml ├── codecov.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Makefile ├── OWNERS ├── coverage.svg ├── CONTRIBUTING.md ├── README.md ├── target_users.md └── pyproject.toml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/rayjob/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/upgrade/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/upgrade/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/test_mtls.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_admission.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_queueing.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/security/test_network_policies.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-detectable=false 2 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_cr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/test_remote.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/test_remote.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/kueue_integration/test_resource_flavors.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_advanced.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_images.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_resources.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_volumes.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/test_cluster_kueue.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/interactive/test_in_cluster.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_client/test_in_cluster.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/configuration/test_heterogeneous.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/cluster_management/creation/test_cluster_creation.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e_v2/job_submission/rayjob_cr/test_lifecycled_cluster.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/e2e_v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Utility modules for E2E tests 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray_jobs import RayJobClient 2 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Placeholder scripts for RayJob entrypoints 2 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from .widgets import ( 2 | view_clusters, 3 | ) 4 | -------------------------------------------------------------------------------- /tests/e2e_v2/__init__.py: -------------------------------------------------------------------------------- 1 | # E2E Test Suite v2 2 | # Restructured pytest-based E2E tests for CodeFlare SDK 3 | -------------------------------------------------------------------------------- /assets/images/sdk-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/assets/images/sdk-diagram.png -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/batch-inference/requirements.txt: 
-------------------------------------------------------------------------------- 1 | vllm 2 | transformers 3 | triton>=2.0.0 4 | torch>=2.0.0 5 | -------------------------------------------------------------------------------- /docs/images/codeflare_sdk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_sdk.png -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /docs/images/codeflare_stack_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/images/codeflare_stack_arch.png -------------------------------------------------------------------------------- /ui-tests/.yarnrc: -------------------------------------------------------------------------------- 1 | disable-self-update-check true 2 | ignore-optional true 3 | network-timeout "300000" 4 | registry "https://registry.npmjs.org/" 5 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.9.5 2 | ray_lightning 3 | torchmetrics==0.9.1 4 | torchvision==0.19.0 5 | minio 6 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/preview_nbs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | 
torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-buttons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-buttons.png -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==2.4.0 2 | ray_lightning 3 | torchmetrics==1.8.2 4 | torchvision==0.20.1 5 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .awload import AWManager 2 | 3 | from .status import ( 4 | AppWrapperStatus, 5 | AppWrapper, 6 | ) 7 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/images/ui-view-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/docs/sphinx/user-docs/images/ui-view-clusters.png -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kueue/__init__.py: -------------------------------------------------------------------------------- 1 | from .kueue import ( 2 | get_default_kueue_name, 3 | local_queue_exists, 4 | add_queue_label, 5 | list_local_queues, 6 | ) 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utilities for the CodeFlare SDK. 
3 | """ 4 | 5 | from .k8s_utils import get_current_namespace 6 | 7 | __all__ = ["get_current_namespace"] 8 | -------------------------------------------------------------------------------- /tests/e2e/mnist_pip_requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | torch==2.7.1+cu118 3 | torchvision==0.22.1+cu118 4 | pytorch_lightning==1.9.5 5 | torchmetrics==1.8.2 6 | minio 7 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/__init__.py: -------------------------------------------------------------------------------- 1 | from .rayjob import RayJob, ManagedClusterConfig 2 | from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo 3 | from .config import ManagedClusterConfig 4 | -------------------------------------------------------------------------------- /ui-tests/jupyter_server_config.py: -------------------------------------------------------------------------------- 1 | from jupyterlab.galata import configure_jupyter_server 2 | 3 | configure_jupyter_server(c) 4 | 5 | # Uncomment to set server log level to debug level 6 | # c.ServerApp.log_level = "DEBUG" 7 | -------------------------------------------------------------------------------- /images/tests/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Entrypoint script that handles -- separator in podman commands 3 | # Passes all arguments to run-tests.sh which will forward them to pytest 4 | 5 | exec /codeflare-sdk/run-tests.sh "$@" 6 | -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-0-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-2-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-3-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png -------------------------------------------------------------------------------- /ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-codeflare/codeflare-sdk/HEAD/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png -------------------------------------------------------------------------------- /.github/build/README.md: -------------------------------------------------------------------------------- 1 | # Pre-Commit Build Artifacts 2 | 3 | This directory 
contains the artifacts required to build the codeflare-sdk pre-commit image. 4 | 5 | To build the image run `podman build -f .github/build/Containerfile .` from the root directory. 6 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Importing everything from the kubernetes_cluster module 2 | from .kubernetes_cluster import ( 3 | Authentication, 4 | KubeConfiguration, 5 | TokenAuthentication, 6 | KubeConfigFileAuthentication, 7 | _kube_api_error_handling, 8 | ) 9 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .status import ( 2 | RayClusterStatus, 3 | CodeFlareClusterStatus, 4 | RayCluster, 5 | ) 6 | 7 | from .cluster import ( 8 | Cluster, 9 | ClusterConfiguration, 10 | get_cluster, 11 | list_all_queued, 12 | list_all_clusters, 13 | ) 14 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .auth import ( 2 | Authentication, 3 | KubeConfiguration, 4 | TokenAuthentication, 5 | KubeConfigFileAuthentication, 6 | config_check, 7 | get_api_client, 8 | ) 9 | 10 | from .kube_api_helpers import _kube_api_error_handling 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "**/*.ipynb" 3 | - "demo-notebooks/**" 4 | - "**/__init__.py" 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | status: 10 | project: 11 | default: 12 | target: auto 13 | threshold: 2.5% 14 | patch: 15 | default: 16 | target: 85% 17 | threshold: 2.5% 18 | 
-------------------------------------------------------------------------------- /tests/e2e/install-codeflare-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd .. 4 | 5 | # Install Poetry and configure virtualenvs 6 | pip install poetry 7 | poetry config virtualenvs.create false 8 | 9 | cd codeflare-sdk 10 | 11 | # Lock dependencies and install them 12 | poetry lock 13 | poetry install --with test,docs 14 | 15 | # Return to the workdir 16 | cd .. 17 | cd workdir 18 | -------------------------------------------------------------------------------- /ui-tests/playwright.config.js: -------------------------------------------------------------------------------- 1 | const baseConfig = require('@jupyterlab/galata/lib/playwright-config'); 2 | 3 | module.exports = { 4 | ...baseConfig, 5 | timeout: 600000, 6 | webServer: { 7 | command: 'yarn start', 8 | url: 'http://localhost:8888/lab', 9 | timeout: 120 * 1000, 10 | reuseExistingServer: !process.env.CI, 11 | }, 12 | retries: 0, 13 | }; 14 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client/constants.py: -------------------------------------------------------------------------------- 1 | # Declares the constants that are used by the client 2 | import logging 3 | 4 | # Group, Version, Plural 5 | GROUP = "ray.io" 6 | CLUSTER_VERSION = "v1" 7 | JOB_VERSION = "v1" 8 | CLUSTER_PLURAL = "rayclusters" 9 | JOB_PLURAL = "rayjobs" 10 | CLUSTER_KIND = "RayCluster" 11 | JOB_KIND = "RayJob" 12 | # log level 13 | LOGLEVEL = logging.INFO 14 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | name: Pre-commit 2 | on: 3 | pull_request: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | precommit: 8 | runs-on: ubuntu-latest 9 | container: 10 | image: 
quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.1 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Run pre-commit checks 15 | run: pre-commit run --all-files 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | .python-version 3 | __pycache__/ 4 | .coverage 5 | Pipfile 6 | Pipfile.lock 7 | .venv* 8 | build/ 9 | tls-cluster-namespace 10 | quicktest.yaml 11 | node_modules 12 | .DS_Store 13 | ui-tests/playwright-report 14 | ui-tests/test-results 15 | /src/codeflare_sdk.egg-info/ 16 | docs/sphinx/_build 17 | docs/sphinx/codeflare_sdk.*.rst 18 | docs/sphinx/codeflare_sdk.rst 19 | docs/sphinx/modules.rst 20 | .idea/ 21 | .cursor/plans/ 22 | .cursor/commands/ 23 | /results 24 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | 10 | # Distribution / packaging 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | .tox/ 31 | htmlcov 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - 
id: end-of-file-fixer 9 | - id: check-yaml 10 | args: [--allow-multiple-documents] 11 | - id: check-added-large-files 12 | - repo: https://github.com/psf/black 13 | rev: 23.3.0 14 | hooks: 15 | - id: black 16 | language_version: python3.9 17 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/__init__.py: -------------------------------------------------------------------------------- 1 | from .appwrapper import AppWrapper, AppWrapperStatus, AWManager 2 | 3 | from .client import ( 4 | RayJobClient, 5 | ) 6 | 7 | from .rayjobs import ( 8 | RayJob, 9 | ManagedClusterConfig, 10 | RayJobDeploymentStatus, 11 | CodeflareRayJobStatus, 12 | RayJobInfo, 13 | ) 14 | 15 | from .cluster import ( 16 | Cluster, 17 | ClusterConfiguration, 18 | get_cluster, 19 | list_all_queued, 20 | list_all_clusters, 21 | RayClusterStatus, 22 | CodeFlareClusterStatus, 23 | RayCluster, 24 | ) 25 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vendored third-party dependencies. 3 | 4 | This directory contains code from external projects that are bundled 5 | with codeflare-sdk to avoid PyPI publishing restrictions. 6 | 7 | Contents: 8 | - python_client: KubeRay Python client from ray-project/kuberay 9 | Source: https://github.com/ray-project/kuberay @ b2fd91b58c2bbe22f9b4f730c5a8f3180c05e570 10 | License: Apache 2.0 (see LICENSE file) 11 | 12 | Vendored because the python-client is not published to PyPI and PyPI 13 | does not allow direct git dependencies. 
14 | """ 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Image tag for image containing e2e tests 2 | E2E_TEST_IMAGE_VERSION ?= latest 3 | E2E_TEST_IMAGE ?= quay.io/opendatahub/codeflare-sdk-tests:${E2E_TEST_IMAGE_VERSION} 4 | 5 | # Build the test image 6 | .PHONY: build-test-image 7 | build-test-image: 8 | @echo "Building test image: $(E2E_TEST_IMAGE)" 9 | # Build the Docker image using podman 10 | podman build -f images/tests/Dockerfile -t $(E2E_TEST_IMAGE) . 11 | 12 | # Push the test image 13 | .PHONY: push-test-image 14 | push-test-image: 15 | @echo "Pushing test image: $(E2E_TEST_IMAGE)" 16 | podman push $(E2E_TEST_IMAGE) 17 | -------------------------------------------------------------------------------- /tests/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/e2e_v2/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/ui/pages/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/e2e_v2/ui/pages/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - astefanutti 3 | - Bobbins228 4 | - CathalOConnorRH 5 | - chipspeak 6 | - ChristianZaccaria 7 | - dimakis 8 | - Fiona-Waters 9 | - franciscojavierarceo 10 | - kpostoffice 11 | - kryanbeane 12 | - laurafitzgerald 13 | - pawelpaszki 14 | - pmccarthy 15 | - szaher 16 | - varshaprasad96 17 | reviewers: 18 | - astefanutti 19 | - Bobbins228 20 | - CathalOConnorRH 21 | - chipspeak 22 | - ChristianZaccaria 23 | - dimakis 24 | - Fiona-Waters 25 | - franciscojavierarceo 26 | - kpostoffice 27 | - kryanbeane 28 | - laurafitzgerald 29 | - pawelpaszki 30 | - pmccarthy 31 | - szaher 32 | - varshaprasad96 33 | - Ygnas 34 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/constants.py: -------------------------------------------------------------------------------- 1 | RAY_VERSION = "2.52.1" 2 | """ 3 | The below are used to define the default runtime image for the Ray Cluster. 
4 | * For python 3.11:ray:2.52.1-py311-cu121 5 | * For python 3.12:ray:2.52.1-py312-cu128 6 | """ 7 | CUDA_PY311_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:595b3acd10244e33fca1ed5469dccb08df66f470df55ae196f80e56edf35ad5a" 8 | CUDA_PY312_RUNTIME_IMAGE = "quay.io/modh/ray@sha256:6b135421b6e756593a58b4df6664f82fc4b55237ca81475f2867518f15fe6d84" 9 | 10 | # Centralized image selection 11 | SUPPORTED_PYTHON_VERSIONS = { 12 | "3.11": CUDA_PY311_RUNTIME_IMAGE, 13 | "3.12": CUDA_PY312_RUNTIME_IMAGE, 14 | } 15 | MOUNT_PATH = "/home/ray/files" 16 | -------------------------------------------------------------------------------- /.github/resources/wait_for_job_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "from time import sleep\n", 8 | "\n", 9 | "finished = False\n", 10 | "while not finished:\n", 11 | " sleep(5)\n", 12 | " status = client.get_job_status(submission_id)\n", 13 | " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n", 14 | " print(status)\n", 15 | "print(\"Job status \" + status)\n", 16 | "print(\"Logs: \")\n", 17 | "print(client.get_job_logs(submission_id))\n", 18 | "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\"" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /docs/sphinx/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/build/Containerfile: -------------------------------------------------------------------------------- 1 | FROM registry.redhat.io/ubi9/python-39:latest 2 | 3 | LABEL summary="Toolchain for running pre-commit hooks." \ 4 | description="Toolchain for running pre-commit hooks" \ 5 | io.k8s.display-name="Pre-Commit Toolchain" 6 | 7 | USER root 8 | RUN dnf install nodejs -y && \ 9 | dnf clean all && \ 10 | rm -rf /var/cache/dnf 11 | ADD https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz $TMPDIR/ 12 | RUN tar -C /usr/local/bin -xvf $TMPDIR/oc.tar.gz && \ 13 | chmod +x /usr/local/bin/oc && \ 14 | rm $TMPDIR/oc.tar.gz 15 | USER $USERID 16 | 17 | RUN pip3 install poetry && \ 18 | poetry config virtualenvs.create false 19 | COPY pyproject.toml ./ 20 | RUN poetry install 21 | 22 | CMD bash 23 | -------------------------------------------------------------------------------- /.github/resources/minio_remote_config_cell.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell_type": "code", 3 | "execution_count": null, 4 | "metadata": {}, 5 | "outputs": [], 6 | "source": [ 7 | "@ray.remote\n", 8 | "def get_minio_run_config():\n", 9 | " import s3fs\n", 10 | " import pyarrow\n", 11 | " s3_fs = s3fs.S3FileSystem(\n", 12 | " key = \"minio\",\n", 13 | " secret = \"minio123\",\n", 14 | " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n", 15 | " )\n", 16 | " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n", 17 | " 
run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n", 18 | " return run_config" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client_test/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ## For developers 4 | 5 | 1. `pip install -U pip setuptools` 6 | 1. `cd clients/python-client && pip install -e .` 7 | 8 | Uninstall with `pip uninstall python-client`. 9 | 10 | ## For testing run 11 | 12 | `python -m unittest discover 'clients/python-client/python_client_test/'` 13 | 14 | ### Coverage report 15 | 16 | #### Pre-requisites 17 | 18 | * `sudo apt install libsqlite3-dev` 19 | * `pyenv install 3.6.5` # or your Python version 20 | * `pip install db-sqlite3 coverage` 21 | 22 | __To gather data__ 23 | `python -m coverage run -m unittest` 24 | 25 | __to generate a coverage report__ 26 | `python -m coverage report` 27 | 28 | __to generate the test coverage report in HTML format__ 29 | `python -m coverage html` 30 | -------------------------------------------------------------------------------- /docs/generate-documentation.md: -------------------------------------------------------------------------------- 1 | # Generate CodeFlare Documentation with Sphinx 2 | The following is a short guide on how you can use Sphinx to auto-generate code documentation. Documentation for the latest SDK release can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html). 3 | 4 | 1. Clone the CodeFlare SDK 5 | ``` bash 6 | git clone https://github.com/project-codeflare/codeflare-sdk.git 7 | ``` 8 | 2. [Install Sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html) 9 | 3. 
Run the below command to generate code documentation 10 | ``` bash 11 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generates RST files 12 | make html -C docs/sphinx # Builds HTML files 13 | ``` 14 | 4. You can access the docs locally at `docs/sphinx/_build/html/index.html` 15 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "python-client" 3 | version = "0.0.0-dev" 4 | description = "Python Client for Kuberay" 5 | license = "Apache-2.0" 6 | 7 | readme = "README.md" 8 | repository = "https://github.com/ray-project/kuberay" 9 | homepage = "https://github.com/ray-project/kuberay" 10 | keywords = ["kuberay", "python", "client"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: OS Independent" 15 | ] 16 | packages = [ 17 | { include = "python_client" } 18 | ] 19 | 20 | [tool.poetry.dependencies] 21 | python = "^3.11" 22 | kubernetes = ">=25.0.0" 23 | 24 | [build-system] 25 | requires = ["poetry-core>=1.0.0"] 26 | build-backend = "poetry.core.masonry.api" 27 | -------------------------------------------------------------------------------- /ui-tests/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@jupyter-widgets/ui-tests", 3 | "private": true, 4 | "version": "0.1.0", 5 | "description": "ipywidgets UI Tests", 6 | "scripts": { 7 | "start": "jupyter lab --config ./jupyter_server_config.py", 8 | "start:detached": "jlpm start&", 9 | "test": "npx playwright test", 10 | "test:debug": "PWDEBUG=1 npx playwright test", 11 | "test:report": "http-server ./playwright-report -a localhost -o", 12 | "test:update": "npx playwright test --update-snapshots", 13 | "deduplicate": "jlpm && yarn-deduplicate -s fewer 
--fail" 14 | }, 15 | "author": "Project Jupyter", 16 | "license": "BSD-3-Clause", 17 | "devDependencies": { 18 | "@jupyterlab/galata": "^5.3.0", 19 | "@playwright/test": "^1.57.0", 20 | "yarn-deduplicate": "^6.0.1" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/snyk-security.yaml: -------------------------------------------------------------------------------- 1 | name: Snyk Security 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | snyk-scan: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | - name: Install Snyk CLI 15 | run: npm install -g snyk 16 | 17 | - name: Snyk Monitor and Test multiple projects 18 | env: 19 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 20 | SNYK_ORG: ${{ secrets.SNYK_ORG }} 21 | run: | 22 | echo "Fetching tags" 23 | git fetch origin 'refs/tags/*:refs/tags/*' 24 | 25 | echo "Authenticating with Snyk" 26 | snyk auth ${SNYK_TOKEN} 27 | 28 | echo "Scanning project: codeflare-sdk/main" 29 | snyk monitor --all-projects --exclude=requirements.txt --org=${SNYK_ORG} --target-reference="main" 30 | -------------------------------------------------------------------------------- /tests/e2e/start_ray_cluster.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from time import sleep 5 | 6 | from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration 7 | 8 | namespace = sys.argv[1] 9 | ray_image = os.getenv("RAY_IMAGE") 10 | 11 | cluster = Cluster( 12 | ClusterConfiguration( 13 | name="mnist", 14 | namespace=namespace, 15 | num_workers=1, 16 | head_cpu_requests="500m", 17 | head_cpu_limits="500m", 18 | head_memory_requests=2, 19 | head_memory_limits=2, 20 | worker_cpu_requests="500m", 21 | worker_cpu_limits=1, 22 | worker_memory_requests=1, 23 | worker_memory_limits=2, 24 | image=ray_image, 25 | appwrapper=True, 26 | ) 27 | ) 28 | 29 | 
cluster.apply() 30 | 31 | cluster.status() 32 | 33 | cluster.wait_ready() 34 | 35 | cluster.status() 36 | 37 | cluster.details() 38 | -------------------------------------------------------------------------------- /docs/sphinx/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 94% 19 | 94% 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/codeflare_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray import ( 2 | Cluster, 3 | ClusterConfiguration, 4 | RayClusterStatus, 5 | CodeFlareClusterStatus, 6 | RayCluster, 7 | get_cluster, 8 | list_all_queued, 9 | list_all_clusters, 10 | AWManager, 11 | AppWrapperStatus, 12 | RayJobClient, 13 | RayJob, 14 | 
ManagedClusterConfig, 15 | ) 16 | 17 | from .common.widgets import view_clusters 18 | 19 | from .common import ( 20 | Authentication, 21 | KubeConfiguration, 22 | TokenAuthentication, 23 | KubeConfigFileAuthentication, 24 | ) 25 | 26 | from .common.kueue import ( 27 | list_local_queues, 28 | ) 29 | 30 | from .common.utils import generate_cert 31 | from .common.utils.demos import copy_demo_nbs 32 | 33 | from importlib.metadata import version, PackageNotFoundError 34 | 35 | try: 36 | __version__ = version("codeflare-sdk") # use metadata associated with built package 37 | 38 | except PackageNotFoundError: 39 | __version__ = "v0.0.0" 40 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-labeler.yaml: -------------------------------------------------------------------------------- 1 | # This workflow file adds the 'lgtm' and 'approved' labels to Dependabot PRs 2 | # This is done to ensure that the PRs that pass required status checks are automatically merged by the CodeFlare bot 3 | name: Dependabot Labeler 4 | 5 | on: 6 | pull_request_target: 7 | branches: [ main ] 8 | 9 | jobs: 10 | add-approve-lgtm-label: 11 | if: ${{ github.actor == 'dependabot[bot]' && contains(github.event.pull_request.labels.*.name, 'dependabot') }} 12 | runs-on: ubuntu-latest 13 | 14 | # Permission required to edit a PR 15 | permissions: 16 | pull-requests: write 17 | issues: write 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v4 22 | 23 | - name: Add approve and lgtm labels to Dependabot PR 24 | run: | 25 | gh pr edit ${{ github.event.pull_request.number }} --add-label "lgtm" --add-label "approved" 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GH_CLI_TOKEN }} 28 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/k8s_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubernetes utility functions for 
def get_current_namespace():  # pragma: no cover
    """
    Retrieves the current Kubernetes namespace.

    Resolution order:
      1. The in-cluster service-account namespace file (when running in a pod).
      2. The ``namespace`` field of the active context in the local kubeconfig.

    Returns:
        str:
            The current namespace, or None if not found.
    """
    ns_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
    if os.path.isfile(ns_file):
        try:
            # Use a context manager so the file handle is always closed
            # (the previous implementation leaked the handle on success).
            with open(ns_file, "r") as file:
                return file.readline().strip("\n")
        except Exception:
            print("Unable to find current namespace")
            print("trying to gather from current context")
    # Fall back to the kubeconfig's active context.
    try:
        _, active_context = config.list_kube_config_contexts(config_check())
    except Exception as e:
        return _kube_api_error_handling(e)
    try:
        return active_context["context"]["namespace"]
    except KeyError:
        # Active context has no namespace set.
        return None
7 | """ 8 | 9 | from .rbac import ( 10 | create_test_service_account, 11 | create_rayjob_rbac, 12 | delete_test_service_account, 13 | ) 14 | from .setup import ( 15 | setup_in_cluster_test_environment, 16 | cleanup_in_cluster_test_environment, 17 | ) 18 | from .pod_execution import ( 19 | PodExecutionResult, 20 | create_test_pod, 21 | create_sdk_test_pod, 22 | run_code_in_pod, 23 | wait_for_pod_completion, 24 | get_pod_logs, 25 | delete_test_pod, 26 | cleanup_test_pods, 27 | ) 28 | 29 | __all__ = [ 30 | "create_test_service_account", 31 | "create_rayjob_rbac", 32 | "delete_test_service_account", 33 | "setup_in_cluster_test_environment", 34 | "cleanup_in_cluster_test_environment", 35 | "PodExecutionResult", 36 | "create_test_pod", 37 | "create_sdk_test_pod", 38 | "run_code_in_pod", 39 | "wait_for_pod_completion", 40 | "get_pod_logs", 41 | "delete_test_pod", 42 | "cleanup_test_pods", 43 | ] 44 | -------------------------------------------------------------------------------- /tests/auth-test.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDOTCCAiGgAwIBAgIUENjaZDrvhc5uV3j7GI8deZJwc+YwDQYJKoZIhvcNAQEL 3 | BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM 4 | GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDAeFw0yNDA1MTMxMTE1NDZaFw0yNTA1 5 | MTMxMTE1NDZaMEUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw 6 | HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQwggEiMA0GCSqGSIb3DQEB 7 | AQUAA4IBDwAwggEKAoIBAQDEYYk81jvPijZXXeI9cByf5EIbOVaBTH7I51J9EKG5 8 | Y/KRXI43WgvVEiZ3jP8LJnSD79WhBiL6TgadQZje5ndroRYDM9vyqz1OUZapnOO+ 9 | yzl01y/qSsH8Kn88eLAzkE9HSu4QN9PuJtySyksjDFQJ6kjyE8ZHUSorur0FlLLf 10 | IToFgTuaIPDYjvFRchOCfZ7sV/MF7LxqFfFnaWOYvH41ZdvqJiRcVsMi+mYs9/I/ 11 | I72IMXwVnQDVnK8H84ntEmHNN6NoVuMKla0So4/wKcHJSCgS3axLI2Ka2aaaJo9K 12 | l2cn21NOyodF+DaSFy7qaGRXxoTQ2k9tUrSvxkBJvRmBAgMBAAGjITAfMB0GA1Ud 13 | DgQWBBRTK8mO5XMcmR+Xg/PVNFnvz4eubDANBgkqhkiG9w0BAQsFAAOCAQEAlZva 14 | 
6ws3zRff7u0tWT2JJaE1uPqsuAdHtVvEyAMp2QvYfyrgADTroUTaSU4p6ppX/t7v 15 | ynHhuzR6UOVkuY0/CH1P3UUGrEPNOXT8i2BDwL+j4y2K2aRN8zU0Nu/IVePBhu+4 16 | Jdt+3P7/MuwiCON5JukgxUYlQKhVhzFj7GOd2+Ca+fh8Siq3tkWDSN54+90fgylQ 17 | +74Yfya1NVabpzLqP3Isqu2XQhEVaBFvj8Yu0h83e3D8LeQToC3mVMF4yy5BZ9Ty 18 | K66YGlGQgszWEUFPEdsB8Dj/iJMhkWXuyc3u/w0s3t7rXeMYYgr+xrEeK+g0oyB5 19 | xeZuMjd567Znmu5oMw== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main, ray-jobs-feature ] 6 | push: 7 | branches: [ main, ray-jobs-feature ] 8 | 9 | jobs: 10 | unit-tests: 11 | 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.11' 20 | - name: Install poetry 21 | run: pip install poetry 22 | - name: Install dependencies with poetry 23 | run: | 24 | poetry config virtualenvs.create false 25 | poetry lock 26 | poetry install --with test 27 | - name: Test with pytest and check coverage 28 | run: | 29 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest 30 | coverage=$(coverage report -m | tail -1 | tail -c 4 | head -c 2) 31 | if (( $coverage < 90 )); then echo "Coverage failed at ${coverage}%"; exit 1; else echo "Coverage passed, ${coverage}%"; fi 32 | - name: Upload to Codecov 33 | uses: codecov/codecov-action@v4 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/demos.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | 4 | package_dir = 
def copy_demo_nbs(dir: str = "./demo-notebooks", overwrite: bool = False):
    """
    Copy the demo notebooks bundled with the package into ``dir``.

    With ``overwrite=True``, files in the target directory whose paths collide
    with the bundled notebooks are overwritten; any other files already in the
    directory are left untouched.

    Args:
        dir (str):
            The directory to copy the demo notebooks to. Defaults to
            "./demo-notebooks".
        overwrite (bool):
            Whether to copy into (and overwrite matching files in) a directory
            that already exists. Defaults to False.

    Raises:
        FileExistsError:
            If the target directory already exists and ``overwrite`` is False.
    """
    # Refuse to touch an existing directory unless the caller opted in.
    if not overwrite and pathlib.Path(dir).exists():
        raise FileExistsError(
            f"Directory {dir} already exists. Please remove it or provide a different location."
        )

    # dirs_exist_ok=True allows merging into an existing tree when overwrite=True.
    shutil.copytree(demo_dir, dir, dirs_exist_ok=True)
toctree:: 19 | :maxdepth: 1 20 | :caption: User Documentation: 21 | 22 | user-docs/authentication 23 | user-docs/cluster-configuration 24 | user-docs/ray-cluster-interaction 25 | user-docs/e2e 26 | user-docs/s3-compatible-storage 27 | user-docs/setup-kueue 28 | user-docs/ui-widgets 29 | 30 | Quick Links 31 | =========== 32 | - `PyPi `__ 33 | - `GitHub `__ 34 | - `OpenShift AI Documentation `__ 35 | -------------------------------------------------------------------------------- /docs/sphinx/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert(0, os.path.abspath("..")) 10 | 11 | # -- Project information ----------------------------------------------------- 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 13 | 14 | project = "CodeFlare SDK" 15 | copyright = "2024, Project CodeFlare" 16 | author = "Project CodeFlare" 17 | release = "v0.21.1" 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = [ 23 | "sphinx.ext.autodoc", 24 | "sphinx.ext.todo", 25 | "sphinx.ext.viewcode", 26 | "sphinx.ext.autosummary", 27 | "sphinx_rtd_theme", 28 | ] 29 | 30 | templates_path = ["_templates"] 31 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 32 | 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = "sphinx_rtd_theme" 38 | html_static_path = ["_static"] 39 | 
# E2E driver: submit the MNIST training script as a Ray job against the
# existing "mnist" cluster and fail the process if the job does not succeed.
# Usage: python mnist_rayjob.py <namespace>
import sys

from time import sleep

from support import *

from codeflare_sdk.ray.cluster.cluster import get_cluster
from codeflare_sdk.ray.client import RayJobClient

namespace = sys.argv[1]

cluster = get_cluster("mnist", namespace)

cluster.details()

# Authenticate the dashboard client with the current OpenShift user's token.
auth_token = run_oc_command(["whoami", "--show-token=true"])
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)

# Submit the job
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={"working_dir": "/", "pip": "requirements.txt"},
)
print(f"Submitted job with ID: {submission_id}")

# Poll every 5s until the job reaches a terminal state, up to `timeout` seconds.
# (The previous version carried a `done` flag that was never set — the loop
# only ever exited via `break` — and shadowed readability with `time`.)
elapsed = 0
timeout = 900
while True:
    status = client.get_job_status(submission_id)
    if status.is_terminal():
        break
    print(status)
    if elapsed >= timeout:
        raise TimeoutError(f"job has timed out after waiting {timeout}s")
    sleep(5)
    elapsed += 5

logs = client.get_job_logs(submission_id)
print(logs)

# Clean up the job record and tear the cluster down before reporting.
client.delete_job(submission_id)
cluster.down()

# Non-zero exit code signals failure to the calling test harness.
sys.exit(0 if status == "SUCCEEDED" else 1)
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The status sub-module defines Enums containing information for
AppWrapper states, as well as dataclasses to store information for AppWrappers.
"""

from dataclasses import dataclass
from enum import Enum


class AppWrapperStatus(Enum):
    """
    Defines the possible reportable phases of an AppWrapper.
    """

    # NOTE(review): values appear to be the lower-cased phase strings of the
    # AppWrapper custom resource — confirm against the AppWrapper controller.
    SUSPENDED = "suspended"
    RESUMING = "resuming"
    RUNNING = "running"
    RESETTING = "resetting"
    SUSPENDING = "suspending"
    SUCCEEDED = "succeeded"
    FAILED = "failed"
    TERMINATING = "terminating"


@dataclass
class AppWrapper:
    """
    For storing information about an AppWrapper.
    """

    # Name of the AppWrapper resource.
    name: str
    # Current reported phase of the AppWrapper.
    status: AppWrapperStatus
7 | - package-ecosystem: "pip" 8 | directories: 9 | - "**/demo-notebooks/guided-demos*" 10 | - "/tests/e2e" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "*" 15 | update-types: ["version-update:semver-patch"] 16 | open-pull-requests-limit: 1 17 | labels: 18 | - "dependabot" 19 | - "test-guided-notebooks" 20 | 21 | # pip means poetry in this case, this keeps poetry.lock up to date with constraints in pyproject.toml. 22 | - package-ecosystem: "pip" 23 | directory: "/" 24 | schedule: 25 | interval: "daily" 26 | ignore: 27 | - dependency-name: "*" 28 | update-types: ["version-update:semver-patch"] 29 | open-pull-requests-limit: 1 30 | labels: 31 | - "dependabot" 32 | - "test-guided-notebooks" 33 | 34 | # npm means yarn in this case, this keeps yarn.lock up to date with constraints in package.json. 35 | - package-ecosystem: "npm" 36 | directory: "/ui-tests" 37 | schedule: 38 | interval: "daily" 39 | ignore: 40 | - dependency-name: "*" 41 | update-types: ["version-update:semver-patch"] 42 | open-pull-requests-limit: 1 43 | labels: 44 | - "dependabot" 45 | - "test-ui-notebooks" 46 | -------------------------------------------------------------------------------- /.github/workflows/publish-documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | codeflare_sdk_release_version: 7 | type: string 8 | required: true 9 | description: 'Version number (for example: 0.1.0)' 10 | 11 | permissions: 12 | contents: write 13 | 14 | jobs: 15 | docs: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Install Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: 3.11 23 | - name: Install Sphinx 24 | run: | 25 | sudo apt-get update 26 | sudo apt-get install python3-sphinx 27 | - name: Install Poetry 28 | uses: abatilo/actions-poetry@v2 29 | with: 30 | poetry-version: 1.8.3 31 | - 
name: Create new documentation 32 | run: | 33 | python3 -m venv .venv 34 | source .venv/bin/activate 35 | poetry install --with docs 36 | sed -i 's/release = "v[0-9]\+\.[0-9]\+\.[0-9]\+"/release = "${{ github.event.inputs.codeflare_sdk_release_version }}"/' docs/sphinx/conf.py 37 | sphinx-apidoc -o docs/sphinx src/codeflare_sdk "**/*test_*" --force # Generate docs but ignore test files 38 | make html -C docs/sphinx 39 | - name: Deploy to GitHub Pages 40 | uses: peaceiris/actions-gh-pages@v3 41 | with: 42 | publish_branch: gh-pages 43 | github_token: ${{ secrets.GITHUB_TOKEN }} 44 | publish_dir: docs/sphinx/_build/html 45 | force_orphan: true 46 | -------------------------------------------------------------------------------- /.github/workflows/build-test-image.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Push Test Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | inputs: 9 | E2E_TEST_IMAGE_VERSION: 10 | description: 'Tag for the test image (defaults to latest)' 11 | required: false 12 | default: 'latest' 13 | type: string 14 | BRANCH: 15 | description: 'Branch to build from (defaults to main)' 16 | required: false 17 | default: 'main' 18 | type: string 19 | 20 | jobs: 21 | build-and-push: 22 | runs-on: ubuntu-latest 23 | env: 24 | E2E_TEST_IMAGE_VERSION: ${{ github.event.inputs.E2E_TEST_IMAGE_VERSION || 'latest' }} 25 | steps: 26 | - name: Checkout code 27 | uses: actions/checkout@v5 28 | with: 29 | ref: ${{ github.event.inputs.BRANCH || 'main' }} 30 | submodules: recursive 31 | 32 | - name: Login to Quay.io 33 | id: podman-login-quay 34 | env: 35 | QUAY_USERNAME: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_USERNAME }} 36 | QUAY_PASSWORD: ${{ secrets.QUAY_ODH_CODEFLARE_SDK_TESTS_PASSWORD }} 37 | run: | 38 | set -euo pipefail 39 | printf '%s' "$QUAY_PASSWORD" | podman login --username "$QUAY_USERNAME" --password-stdin quay.io 40 | 41 | - name: Build test image 42 | run: make 
build-test-image 43 | 44 | - name: Push test image 45 | run: make push-test-image 46 | 47 | - name: Logout from Quay.io 48 | if: always() && steps.podman-login-quay.outcome == 'success' 49 | run: podman logout quay.io 50 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/download_mnist_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 
15 | import os
16 | from torchvision.datasets import MNIST
17 | from torchvision import transforms
18 | 
19 | 
20 | def download_mnist_dataset(destination_dir):
21 |     # Ensure the destination directory exists
22 |     if not os.path.exists(destination_dir):
23 |         os.makedirs(destination_dir)
24 | 
25 |     # Define transformations
26 |     transform = transforms.Compose(
27 |         [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
28 |     )
29 | 
30 |     # Download the training data
31 |     train_set = MNIST(
32 |         root=destination_dir, train=True, download=True, transform=transform
33 |     )
34 | 
35 |     # Download the test data
36 |     test_set = MNIST(
37 |         root=destination_dir, train=False, download=True, transform=transform
38 |     )
39 | 
40 |     print(f"MNIST dataset downloaded in {destination_dir}")
41 | 
42 | 
43 | # Specify the directory where you want to download the MNIST dataset
44 | destination_dir = os.path.dirname(os.path.abspath(__file__))
45 | 
46 | download_mnist_dataset(destination_dir)
47 | 
--------------------------------------------------------------------------------
/tests/upgrade/conftest.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 IBM, Red Hat
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | """ 16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests 17 | """ 18 | 19 | import sys 20 | import os 21 | import pytest 22 | 23 | # Add parent test directory to path 24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) 25 | 26 | # Import all fixtures from ui/conftest.py 27 | from ui.conftest import ( 28 | selenium_driver, 29 | dashboard_url, 30 | test_credentials, 31 | login_to_dashboard, 32 | ) 33 | 34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"] 35 | 36 | 37 | # Hook to capture test results for teardown methods 38 | @pytest.hookimpl(tryfirst=True, hookwrapper=True) 39 | def pytest_runtest_makereport(item, call): 40 | """ 41 | Hook to capture test results and make them available to teardown methods. 42 | This allows teardown_method to check if the test failed. 43 | """ 44 | outcome = yield 45 | rep = outcome.get_result() 46 | 47 | # Store the result in the item so teardown can access it 48 | setattr(item, f"rep_{rep.when}", rep) 49 | -------------------------------------------------------------------------------- /tests/e2e_v2/upgrade/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Conftest for upgrade tests - imports UI fixtures for dashboard tests 17 | """ 18 | 19 | import sys 20 | import os 21 | import pytest 22 | 23 | # Add parent test directory to path 24 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) 25 | 26 | # Import all fixtures from ui/conftest.py 27 | from ui.conftest import ( 28 | selenium_driver, 29 | dashboard_url, 30 | test_credentials, 31 | login_to_dashboard, 32 | ) 33 | 34 | __all__ = ["selenium_driver", "dashboard_url", "test_credentials", "login_to_dashboard"] 35 | 36 | 37 | # Hook to capture test results for teardown methods 38 | @pytest.hookimpl(tryfirst=True, hookwrapper=True) 39 | def pytest_runtest_makereport(item, call): 40 | """ 41 | Hook to capture test results and make them available to teardown methods. 42 | This allows teardown_method to check if the test failed. 43 | """ 44 | outcome = yield 45 | rep = outcome.get_result() 46 | 47 | # Store the result in the item so teardown can access it 48 | setattr(item, f"rep_{rep.when}", rep) 49 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/test/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures for rayjobs tests.""" 2 | 3 | import pytest 4 | from unittest.mock import MagicMock 5 | 6 | 7 | # Global test setup that runs automatically for ALL tests 8 | @pytest.fixture(autouse=True) 9 | def auto_mock_setup(mocker): 10 | """Automatically mock common dependencies for all tests.""" 11 | mocker.patch("kubernetes.config.load_kube_config") 12 | 13 | # Always mock get_default_kueue_name to prevent K8s API calls 14 | mocker.patch( 15 | "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name", 16 | return_value="default-queue", 17 | ) 18 | 19 | mock_get_ns = mocker.patch( 20 | "codeflare_sdk.ray.rayjobs.rayjob.get_current_namespace", 21 | return_value="test-namespace", 22 | ) 23 | 24 | mock_rayjob_api = 
mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") 25 | mock_rayjob_instance = MagicMock() 26 | mock_rayjob_api.return_value = mock_rayjob_instance 27 | 28 | mock_cluster_api = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi") 29 | mock_cluster_instance = MagicMock() 30 | mock_cluster_api.return_value = mock_cluster_instance 31 | 32 | mock_k8s_api = mocker.patch("kubernetes.client.CoreV1Api") 33 | mock_k8s_instance = MagicMock() 34 | mock_k8s_api.return_value = mock_k8s_instance 35 | 36 | # Mock get_api_client in runtime_env module where it's actually used 37 | mocker.patch("codeflare_sdk.ray.rayjobs.runtime_env.get_api_client") 38 | 39 | # Return the mocked instances so tests can configure them as needed 40 | return { 41 | "rayjob_api": mock_rayjob_instance, 42 | "cluster_api": mock_cluster_instance, 43 | "k8s_api": mock_k8s_instance, 44 | "get_current_namespace": mock_get_ns, 45 | } 46 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | The status sub-module defines Enums containing information for Ray job 17 | deployment states and CodeFlare job states, as well as 18 | dataclasses to store information for Ray jobs. 
19 | """ 20 | 21 | from dataclasses import dataclass 22 | from enum import Enum 23 | from typing import Optional 24 | 25 | 26 | class RayJobDeploymentStatus(Enum): 27 | """ 28 | Defines the possible deployment states of a Ray job (from the KubeRay RayJob API). 29 | """ 30 | 31 | COMPLETE = "Complete" 32 | RUNNING = "Running" 33 | FAILED = "Failed" 34 | SUSPENDED = "Suspended" 35 | UNKNOWN = "Unknown" 36 | 37 | 38 | class CodeflareRayJobStatus(Enum): 39 | """ 40 | Defines the possible reportable states of a CodeFlare Ray job. 41 | """ 42 | 43 | COMPLETE = 1 44 | RUNNING = 2 45 | FAILED = 3 46 | SUSPENDED = 4 47 | UNKNOWN = 5 48 | 49 | 50 | @dataclass 51 | class RayJobInfo: 52 | """ 53 | For storing information about a Ray job. 54 | """ 55 | 56 | name: str 57 | job_id: str 58 | status: RayJobDeploymentStatus 59 | namespace: str 60 | cluster_name: str 61 | start_time: Optional[str] = None 62 | end_time: Optional[str] = None 63 | failed_attempts: int = 0 64 | succeeded_attempts: int = 0 65 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the CodeFlare SDK 2 | 3 | Thank you for your interest in contributing to the CodeFlare SDK! 4 | 5 | ## Getting Started 6 | 7 | ### Prerequisites 8 | 9 | - Python 3.11 10 | - [Poetry](https://python-poetry.org/) 11 | 12 | ### Setting Up Your Development Environment 13 | 14 | 1. **Clone the repository:** 15 | 16 | ```sh 17 | git clone https://github.com/project-codeflare/codeflare-sdk.git 18 | cd codeflare-sdk 19 | ``` 20 | 21 | 2. Create a Poetry virtual environment: 22 | 23 | ```sh 24 | poetry shell 25 | ``` 26 | 27 | 3. 
Install dependencies: 28 | 29 | ```sh 30 | poetry install 31 | ``` 32 | 33 | - To include test dependencies, run: 34 | 35 | ```sh 36 | poetry install --with test 37 | ``` 38 | 39 | - To include docs dependencies, run: 40 | 41 | ```sh 42 | poetry install --with docs 43 | ``` 44 | 45 | - To include both test and docs dependencies, run: 46 | 47 | ```sh 48 | poetry install --with test,docs 49 | ``` 50 | 51 | ## Development Workflow 52 | 53 | ### Pre-commit 54 | 55 | We use pre-commit to ensure consistent code formatting. To enable pre-commit hooks, run: 56 | 57 | ```sh 58 | pre-commit install 59 | ``` 60 | 61 | ## Testing 62 | 63 | To install CodeFlare SDK in editable mode, run: 64 | 65 | ```sh 66 | pip install -e . 67 | ``` 68 | 69 | ### Unit Testing 70 | 71 | To run the unit tests, execute: 72 | 73 | ```sh 74 | pytest -v src/codeflare_sdk 75 | ``` 76 | 77 | ### Local e2e Testing 78 | 79 | - Please follow the [e2e documentation](https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/sphinx/user-docs/e2e.rst) 80 | 81 | #### Code Coverage 82 | 83 | - Run tests with the following command: `coverage run -m pytest` 84 | - To then view a code coverage report w/ missing lines, run `coverage report -m` 85 | 86 | ### Code Formatting 87 | 88 | - To check file formatting, in top-level dir run `black --check .` 89 | - To auto-reformat all files, remove the `--check` flag 90 | - To reformat an individual file, run `black ` 91 | -------------------------------------------------------------------------------- /.github/workflows/coverage-badge.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will generate and push an updated coverage badge 2 | 3 | name: Coverage Badge 4 | 5 | on: 6 | push: 7 | branches: [ main, ray-jobs-feature ] 8 | 9 | jobs: 10 | report: 11 | 12 | permissions: 13 | contents: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: 
Set up Python 3.11 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: 3.11 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install poetry 27 | poetry config virtualenvs.create false 28 | poetry lock 29 | poetry install --with test 30 | - name: Generate coverage report 31 | run: | 32 | coverage run --omit="src/**/test_*.py,src/codeflare_sdk/common/utils/unit_test_support.py,src/codeflare_sdk/vendored/**" -m pytest 33 | 34 | - name: Coverage Badge 35 | uses: tj-actions/coverage-badge-py@v2 36 | 37 | - name: Verify Changed files 38 | uses: tj-actions/verify-changed-files@v18 39 | id: changed_files 40 | with: 41 | files: coverage.svg 42 | 43 | - name: Commit files 44 | if: steps.changed_files.outputs.files_changed == 'true' 45 | run: | 46 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 47 | git config --local user.name "github-actions[bot]" 48 | git add coverage.svg 49 | git commit -m "Updated coverage.svg" 50 | 51 | - name: Create Pull Request 52 | if: steps.changed_files.outputs.files_changed == 'true' 53 | uses: peter-evans/create-pull-request@v6 54 | with: 55 | token: ${{ secrets.GITHUB_TOKEN }} 56 | title: "[Automatic] Coverage Badge Update" 57 | commit-message: "Updated coverage.svg" 58 | branch: create-pull-request/coverage-badge-update 59 | delete-branch: true 60 | body: | 61 | This is an automated pull request to update the coverage badge. 62 | 63 | - Updated coverage.svg based on latest test results 64 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/cpu_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | CPU-optimized RayJob validation script using Ray Train. 
3 | """ 4 | 5 | import ray 6 | import sys 7 | import traceback 8 | from ray import train 9 | 10 | 11 | def train_func(config): 12 | """Minimal training function for CPU execution.""" 13 | worker_rank = config.get("worker_rank", 0) 14 | result = sum(i * i for i in range(1000)) 15 | 16 | try: 17 | train.report({"loss": result, "worker_rank": worker_rank}) 18 | except RuntimeError: 19 | pass 20 | 21 | print(f"Worker {worker_rank} completed CPU training task. Result: {result}") 22 | 23 | 24 | def main(): 25 | """Run a minimal Ray Train task on CPU.""" 26 | try: 27 | ray.init() 28 | print("Starting CPU training task...") 29 | print(f"Ray initialized. Cluster resources: {ray.cluster_resources()}") 30 | 31 | @ray.remote 32 | def train_worker(worker_id): 33 | try: 34 | train_func({"worker_rank": worker_id}) 35 | result = sum(i * i for i in range(1000)) 36 | return {"loss": result, "worker_rank": worker_id} 37 | except Exception as e: 38 | print(f"Ray Train context not available, using fallback: {e}") 39 | result = sum(i * i for i in range(1000)) 40 | print( 41 | f"Worker {worker_id} completed CPU training task. Result: {result}" 42 | ) 43 | return {"loss": result, "worker_rank": worker_id} 44 | 45 | results = ray.get([train_worker.remote(i) for i in range(1)]) 46 | all_metrics = {} 47 | for result in results: 48 | if isinstance(result, dict): 49 | all_metrics.update(result) 50 | 51 | print(f"Training completed successfully. 
Metrics: {all_metrics}") 52 | print("EXISTING_CLUSTER_JOB_SUCCESS") 53 | return 0 54 | 55 | except Exception as e: 56 | print(f"FAILURE: Exception occurred: {e}") 57 | traceback.print_exc() 58 | return 1 59 | finally: 60 | ray.shutdown() 61 | 62 | 63 | if __name__ == "__main__": 64 | sys.exit(main()) 65 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/batch-inference/simple_batch_inf.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig 3 | 4 | 5 | # 1. Construct a vLLM processor config. 6 | processor_config = vLLMEngineProcessorConfig( 7 | # The base model. 8 | model_source="unsloth/Llama-3.2-1B-Instruct", 9 | # vLLM engine config. 10 | engine_kwargs=dict( 11 | enable_lora=False, 12 | # # Older GPUs (e.g. T4) don't support bfloat16. You should remove 13 | # # this line if you're using later GPUs. 14 | dtype="half", 15 | # Reduce the model length to fit small GPUs. You should remove 16 | # this line if you're using large GPUs. 17 | max_model_len=1024, 18 | ), 19 | # The batch size used in Ray Data. 20 | batch_size=8, 21 | # Use one GPU in this example. 22 | concurrency=1, 23 | # If you save the LoRA adapter in S3, you can set the following path. 24 | # dynamic_lora_loading_path="s3://your-lora-bucket/", 25 | ) 26 | 27 | # 2. Construct a processor using the processor config. 28 | processor = build_llm_processor( 29 | processor_config, 30 | preprocess=lambda row: dict( 31 | # Remove the LoRA model specification 32 | messages=[ 33 | { 34 | "role": "system", 35 | "content": "You are a calculator. 
Please only output the answer "
36 |             "of the given equation.",
37 |         },
38 |         {"role": "user", "content": f"{row['id']} ** 3 = ?"},
39 |     ],
40 |     sampling_params=dict(
41 |         temperature=0.3,
42 |         max_tokens=20,
43 |         detokenize=False,
44 |     ),
45 | ),
46 | postprocess=lambda row: {
47 |     "resp": row["generated_text"],
48 | },
49 | )
50 | 
51 | # 3. Synthesize a dataset with 32 rows.
52 | ds = ray.data.range(32)
53 | # 4. Apply the processor to the dataset. Note that this line won't kick off
54 | # anything because the processor executes lazily.
55 | ds = processor(ds)
56 | # Materialization kicks off the pipeline execution.
57 | ds = ds.materialize()
58 | 
59 | # 5. Print all outputs.
60 | for out in ds.take_all():
61 |     print(out)
62 |     print("==========")
63 | 
--------------------------------------------------------------------------------
/src/codeflare_sdk/ray/cluster/status.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 IBM, Red Hat
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """
16 | The status sub-module defines Enums containing information for Ray cluster
17 | states, and CodeFlare cluster states, as well as
18 | dataclasses to store information for Ray clusters.
19 | """ 20 | 21 | from dataclasses import dataclass, field 22 | from enum import Enum 23 | import typing 24 | from typing import Union 25 | 26 | 27 | class RayClusterStatus(Enum): 28 | """ 29 | Defines the possible reportable states of a Ray cluster. 30 | """ 31 | 32 | # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1/raycluster_types.go#L112-L117 33 | READY = "ready" 34 | UNHEALTHY = "unhealthy" 35 | FAILED = "failed" 36 | UNKNOWN = "unknown" 37 | SUSPENDED = "suspended" 38 | 39 | 40 | class CodeFlareClusterStatus(Enum): 41 | """ 42 | Defines the possible reportable states of a Codeflare cluster. 43 | """ 44 | 45 | READY = 1 46 | STARTING = 2 47 | QUEUED = 3 48 | QUEUEING = 4 49 | FAILED = 5 50 | UNKNOWN = 6 51 | SUSPENDED = 7 52 | 53 | 54 | @dataclass 55 | class RayCluster: 56 | """ 57 | For storing information about a Ray cluster. 58 | """ 59 | 60 | name: str 61 | status: RayClusterStatus 62 | head_cpu_requests: int 63 | head_cpu_limits: int 64 | head_mem_requests: str 65 | head_mem_limits: str 66 | num_workers: int 67 | worker_mem_requests: str 68 | worker_mem_limits: str 69 | worker_cpu_requests: Union[int, str] 70 | worker_cpu_limits: Union[int, str] 71 | namespace: str 72 | dashboard: str 73 | worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) 74 | head_extended_resources: typing.Dict[str, int] = field(default_factory=dict) 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeFlare SDK 2 | 3 | [![Python application](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/unit-tests.yml) 4 | ![coverage badge](./coverage.svg) 5 | 6 | An intuitive, easy-to-use python interface for batch resource requesting, access, job submission, and observation. 
Simplifying the developer's life while enabling access to high-performance compute resources, either in the cloud or on-prem. 7 | 8 | For guided demos and basics walkthroughs, check out the following links: 9 | 10 | - Guided demo notebooks available [here](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos), and copies of the notebooks with [expected output](https://github.com/project-codeflare/codeflare-sdk/tree/main/demo-notebooks/guided-demos/notebook-ex-outputs) also available 11 | - these demos can be copied into your current working directory when using the `codeflare-sdk` by using the `codeflare_sdk.copy_demo_nbs()` function 12 | - Additionally, we have a [video walkthrough](https://www.youtube.com/watch?v=U76iIfd9EmE) of these basic demos from June, 2023 13 | 14 | Full documentation can be found [here](https://project-codeflare.github.io/codeflare-sdk/index.html) 15 | 16 | ## Installation 17 | 18 | Can be installed via `pip`: `pip install codeflare-sdk` 19 | 20 | ## Development 21 | 22 | Please see our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed instructions. 23 | 24 | ## Release Instructions 25 | 26 | ### Automated Releases 27 | 28 | It is possible to use the Release Github workflow to do the release. This is generally the process we follow for releases 29 | 30 | ### Manual Releases 31 | 32 | The following instructions apply when doing release manually. This may be required in instances where the automation is failing. 33 | 34 | - Check and update the version in "pyproject.toml" file. 35 | - Commit all the changes to the repository. 36 | - Create Github release (). 37 | - Build the Python package. `poetry build` 38 | - If not present already, add the API token to Poetry. 39 | `poetry config pypi-token.pypi API_TOKEN` 40 | - Publish the Python package. 
`poetry publish` 41 | - Trigger the [Publish Documentation](https://github.com/project-codeflare/codeflare-sdk/actions/workflows/publish-documentation.yaml) workflow 42 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from codeflare_sdk.common.utils.constants import ( 17 | SUPPORTED_PYTHON_VERSIONS, 18 | CUDA_PY312_RUNTIME_IMAGE, 19 | ) 20 | 21 | 22 | def update_image(image) -> str: 23 | """ 24 | The update_image() function automatically sets the image config parameter to a preset image based on Python version if not specified. 25 | This now points to the centralized function in utils.py. 26 | """ 27 | if not image: 28 | # Pull the image based on the matching Python version (or output a warning if not supported) 29 | image = get_ray_image_for_python_version(warn_on_unsupported=True) 30 | return image 31 | 32 | 33 | def get_ray_image_for_python_version(python_version=None, warn_on_unsupported=True): 34 | """ 35 | Get the appropriate Ray image for a given Python version. 36 | If no version is provided, uses the current runtime Python version. 37 | This prevents us needing to hard code image versions for tests. 
38 | 39 | Args: 40 | python_version: Python version string (e.g. "3.11"). If None, detects current version. 41 | warn_on_unsupported: If True, warns and returns None for unsupported versions. 42 | If False, silently falls back to Python 3.12 image. 43 | """ 44 | if python_version is None: 45 | python_version = f"{sys.version_info.major}.{sys.version_info.minor}" 46 | 47 | if python_version in SUPPORTED_PYTHON_VERSIONS: 48 | return SUPPORTED_PYTHON_VERSIONS[python_version] 49 | elif warn_on_unsupported: 50 | import warnings 51 | 52 | warnings.warn( 53 | f"No default Ray image defined for {python_version}. Please provide your own image or use one of the following python versions: {', '.join(SUPPORTED_PYTHON_VERSIONS.keys())}." 54 | ) 55 | return None 56 | else: 57 | return CUDA_PY312_RUNTIME_IMAGE 58 | -------------------------------------------------------------------------------- /demo-notebooks/guided-demos/mnist_fashion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import ray 4 | from torch.utils.data import DataLoader 5 | from torchvision import datasets 6 | from torchvision.transforms import ToTensor 7 | from ray.train.torch import TorchTrainer 8 | from ray.train import ScalingConfig 9 | 10 | 11 | class NeuralNetwork(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | self.flatten = nn.Flatten() 15 | self.linear_relu_stack = nn.Sequential( 16 | nn.Linear(28 * 28, 512), 17 | nn.ReLU(), 18 | nn.Linear(512, 512), 19 | nn.ReLU(), 20 | nn.Linear(512, 10), 21 | ) 22 | 23 | def forward(self, inputs): 24 | inputs = self.flatten(inputs) 25 | logits = self.linear_relu_stack(inputs) 26 | return logits 27 | 28 | 29 | def get_dataset(): 30 | return datasets.FashionMNIST( 31 | root="/tmp/data", 32 | train=True, 33 | download=True, 34 | transform=ToTensor(), 35 | ) 36 | 37 | 38 | def train_func_distributed(): 39 | num_epochs = 3 40 | batch_size = 64 41 | 42 | dataset = 
get_dataset() 43 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 44 | dataloader = ray.train.torch.prepare_data_loader(dataloader) 45 | 46 | model = NeuralNetwork() 47 | model = ray.train.torch.prepare_model(model) 48 | 49 | criterion = nn.CrossEntropyLoss() 50 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 51 | 52 | for epoch in range(num_epochs): 53 | if ray.train.get_context().get_world_size() > 1: 54 | dataloader.sampler.set_epoch(epoch) 55 | 56 | for inputs, labels in dataloader: 57 | optimizer.zero_grad() 58 | pred = model(inputs) 59 | loss = criterion(pred, labels) 60 | loss.backward() 61 | optimizer.step() 62 | print(f"epoch: {epoch}, loss: {loss.item()}") 63 | 64 | 65 | # For GPU Training, set `use_gpu` to True. 66 | use_gpu = True 67 | 68 | # To learn more about configuring S3 compatible storage check out our docs -> https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/s3-compatible-storage.md 69 | trainer = TorchTrainer( 70 | train_func_distributed, 71 | scaling_config=ScalingConfig( 72 | # num_workers = number of worker nodes with the ray head node included 73 | num_workers=3, 74 | use_gpu=use_gpu, 75 | resources_per_worker={ 76 | "CPU": 1, 77 | }, 78 | ), 79 | ) 80 | 81 | results = trainer.fit() 82 | -------------------------------------------------------------------------------- /tests/e2e/mnist_sleep.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | import torch 17 | import torch.nn as nn 18 | from torch.utils.data import DataLoader 19 | from torchvision import datasets, transforms 20 | 21 | 22 | # Define a simple neural network 23 | class NeuralNetwork(nn.Module): 24 | def __init__(self): 25 | super(NeuralNetwork, self).__init__() 26 | self.flatten = nn.Flatten() 27 | self.linear_relu_stack = nn.Sequential( 28 | nn.Linear(28 * 28, 512), 29 | nn.ReLU(), 30 | nn.Linear(512, 512), 31 | nn.ReLU(), 32 | nn.Linear(512, 10), 33 | ) 34 | 35 | def forward(self, x): 36 | x = self.flatten(x) 37 | logits = self.linear_relu_stack(x) 38 | return logits 39 | 40 | 41 | # Define the training function 42 | def train(): 43 | # Sleeping for 24 hours for upgrade test scenario 44 | print("Sleeping for 24 hours before starting the training for upgrade testing...") 45 | time.sleep(24 * 60 * 60) 46 | 47 | # Load dataset 48 | transform = transforms.Compose([transforms.ToTensor()]) 49 | train_dataset = datasets.FashionMNIST( 50 | root="./data", train=True, download=True, transform=transform 51 | ) 52 | train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) 53 | 54 | # Initialize the neural network, loss function, and optimizer 55 | model = NeuralNetwork() 56 | criterion = nn.CrossEntropyLoss() 57 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 58 | 59 | # Train the model 60 | num_epochs = 3 61 | for epoch in range(num_epochs): 62 | for inputs, labels in train_loader: 63 | optimizer.zero_grad() 64 | outputs = model(inputs) 65 | loss = criterion(outputs, labels) 66 | loss.backward() 67 | optimizer.step() 68 | print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") 69 | 70 | 71 | if __name__ == "__main__": 72 | train() 73 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/utils/test_demos.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Tests for demos module. 17 | """ 18 | 19 | import pytest 20 | import tempfile 21 | from pathlib import Path 22 | from unittest.mock import patch, MagicMock 23 | from codeflare_sdk.common.utils.demos import copy_demo_nbs 24 | 25 | 26 | class TestCopyDemoNbs: 27 | """Test cases for copy_demo_nbs function.""" 28 | 29 | def test_copy_demo_nbs_directory_exists_error(self): 30 | """Test that FileExistsError is raised when directory exists and overwrite=False.""" 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | # Create a subdirectory that will conflict 33 | conflict_dir = Path(temp_dir) / "demo-notebooks" 34 | conflict_dir.mkdir() 35 | 36 | with pytest.raises(FileExistsError, match="Directory.*already exists"): 37 | copy_demo_nbs(dir=str(conflict_dir), overwrite=False) 38 | 39 | def test_copy_demo_nbs_overwrite_true(self): 40 | """Test that overwrite=True allows copying to existing directory.""" 41 | with tempfile.TemporaryDirectory() as temp_dir: 42 | # Create a subdirectory that will conflict 43 | conflict_dir = Path(temp_dir) / "demo-notebooks" 44 | conflict_dir.mkdir() 45 | 46 | # Mock the demo_dir to point to a real directory 47 | with patch("codeflare_sdk.common.utils.demos.demo_dir", temp_dir): 48 | # Should not raise an error with 
overwrite=True 49 | copy_demo_nbs(dir=str(conflict_dir), overwrite=True) 50 | 51 | def test_copy_demo_nbs_default_parameters(self): 52 | """Test copy_demo_nbs with default parameters.""" 53 | with tempfile.TemporaryDirectory() as temp_dir: 54 | # Mock the demo_dir to point to a real directory 55 | with patch("codeflare_sdk.common.utils.demos.demo_dir", temp_dir): 56 | # Should work with default parameters 57 | copy_demo_nbs(dir=temp_dir, overwrite=True) 58 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/ui-widgets.rst: -------------------------------------------------------------------------------- 1 | Jupyter UI Widgets 2 | ================== 3 | 4 | Below are some examples of the Jupyter UI Widgets that are included in 5 | the CodeFlare SDK. 6 | 7 | .. note:: 8 | To use the widgets functionality you must be using the CodeFlare SDK in a Jupyter Notebook environment. 9 | 10 | Cluster Up/Down Buttons 11 | ----------------------- 12 | 13 | The Cluster Up/Down buttons appear after successfully initialising your 14 | `ClusterConfiguration `__. 15 | There are two buttons and a checkbox ``Cluster Up``, ``Cluster Down`` 16 | and ``Wait for Cluster?`` which mimic the 17 | `cluster.apply() `__, 18 | `cluster.down() `__ and 19 | `cluster.wait_ready() `__ 20 | functionality. 21 | 22 | After initialising their ``ClusterConfiguration`` a user can select the 23 | ``Wait for Cluster?`` checkbox then click the ``Cluster Up`` button to 24 | create their Ray Cluster and wait until it is ready. The cluster can be 25 | deleted by clicking the ``Cluster Down`` button. 26 | 27 | .. image:: images/ui-buttons.png 28 | :alt: An image of the up/down ui buttons 29 | 30 | View Clusters UI Table 31 | ---------------------- 32 | 33 | The View Clusters UI Table allows a user to see a list of Ray Clusters 34 | with information on their configuration including number of workers, CPU 35 | requests and limits along with the clusters status. 
36 | 37 | .. image:: images/ui-view-clusters.png 38 | :alt: An image of the view clusters ui table 39 | 40 | Above is a list of two Ray Clusters ``raytest`` and ``raytest2`` each of 41 | those headings is clickable and will update the table to view the 42 | selected Cluster's information. There are four buttons under the table 43 | ``Cluster Down``, ``View Jobs``, ``Open Ray Dashboard``, and ``Refresh Data``. \* The 44 | ``Cluster Down`` button will delete the selected Cluster. \* The 45 | ``View Jobs`` button will try to open the Ray Dashboard's Jobs view in a 46 | Web Browser. The link will also be printed to the console. \* The 47 | ``Open Ray Dashboard`` button will try to open the Ray Dashboard view in 48 | a Web Browser. The link will also be printed to the console. \* The 49 | ``Refresh Data`` button will refresh the list of RayClusters, the spec, and 50 | the status of the Ray Cluster. 51 | 52 | The UI Table can be viewed by calling the following function. 53 | 54 | .. code:: python 55 | 56 | from codeflare_sdk import view_clusters 57 | view_clusters() # Accepts namespace parameter but will try to gather the namespace from the current context 58 | -------------------------------------------------------------------------------- /docs/sphinx/user-docs/authentication.rst: -------------------------------------------------------------------------------- 1 | Authentication via the CodeFlare SDK 2 | ==================================== 3 | 4 | Currently there are four ways of authenticating to your cluster via the 5 | SDK. Authenticating with your cluster allows you to perform actions such 6 | as creating Ray Clusters and Job Submission. 7 | 8 | Method 1 Token Authentication 9 | ----------------------------- 10 | 11 | This is how a typical user would authenticate to their cluster using 12 | ``TokenAuthentication``. 
13 | 14 | :: 15 | 16 | from codeflare_sdk import TokenAuthentication 17 | 18 | auth = TokenAuthentication( 19 | token = "XXXXX", 20 | server = "XXXXX", 21 | skip_tls=False, 22 | # ca_cert_path="/path/to/cert" 23 | ) 24 | auth.login() 25 | # log out with auth.logout() 26 | 27 | Setting ``skip_tls=True`` allows interaction with an HTTPS server 28 | bypassing the server certificate checks although this is not secure. You 29 | can pass a custom certificate to ``TokenAuthentication`` by using 30 | ``ca_cert_path="/path/to/cert"`` when authenticating provided 31 | ``skip_tls=False``. Alternatively you can set the environment variable 32 | ``CF_SDK_CA_CERT_PATH`` to the path of your custom certificate. 33 | 34 | Method 2 Kubernetes Config File Authentication (Default location) 35 | ----------------------------------------------------------------- 36 | 37 | If a user has authenticated to their cluster by alternate means e.g. run 38 | a login command like ``oc login --token= --server=`` 39 | their kubernetes config file should have updated. If the user has not 40 | specifically authenticated through the SDK by other means such as 41 | ``TokenAuthentication`` then the SDK will try to use their default 42 | Kubernetes config file located at ``"$HOME/.kube/config"``. 43 | 44 | Method 3 Specifying a Kubernetes Config File 45 | -------------------------------------------- 46 | 47 | A user can specify a config file via a different authentication class 48 | ``KubeConfigFileAuthentication`` for authenticating with the SDK. This 49 | is what loading a custom config file would typically look like. 
50 | 51 | :: 52 | 53 | from codeflare_sdk import KubeConfigFileAuthentication 54 | 55 | auth = KubeConfigFileAuthentication( 56 | kube_config_path="/path/to/config", 57 | ) 58 | auth.load_kube_config() 59 | # log out with auth.logout() 60 | 61 | Method 4 In-Cluster Authentication 62 | ---------------------------------- 63 | 64 | If a user does not authenticate by any of the means detailed above and 65 | does not have a config file at ``"$HOME/.kube/config"`` the SDK will try 66 | to authenticate with the in-cluster configuration file. 67 | -------------------------------------------------------------------------------- /src/codeflare_sdk/common/kubernetes_cluster/kube_api_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | This sub-module exists primarily to be used internally for any Kubernetes 17 | API error handling or wrapping. 
18 | """ 19 | 20 | import executing 21 | from kubernetes import client, config 22 | 23 | ERROR_MESSAGES = { 24 | "Not Found": "The requested resource could not be located.\n" 25 | "Please verify the resource name and namespace.", 26 | "Unauthorized": "Access to the API is unauthorized.\n" 27 | "Check your credentials or permissions.", 28 | "Forbidden": "Access denied to the Kubernetes resource.\n" 29 | "Ensure your role has sufficient permissions for this operation.", 30 | "Conflict": "A conflict occurred with the RayCluster resource.\n" 31 | "Only one RayCluster with the same name is allowed. " 32 | "Please delete or rename the existing RayCluster before creating a new one with the desired name.", 33 | } 34 | 35 | 36 | # private methods 37 | def _kube_api_error_handling( 38 | e: Exception, print_error: bool = True 39 | ): # pragma: no cover 40 | def print_message(message: str): 41 | if print_error: 42 | print(message) 43 | 44 | if isinstance(e, client.ApiException): 45 | # Retrieve message based on reason, defaulting if reason is not known 46 | message = ERROR_MESSAGES.get( 47 | e.reason, f"Unexpected API error encountered (Reason: {e.reason})" 48 | ) 49 | full_message = f"{message}\nResponse: {e.body}" 50 | print_message(full_message) 51 | 52 | elif isinstance(e, config.ConfigException): 53 | message = "Configuration error: Unable to load Kubernetes configuration. Verify the config file path and format." 54 | print_message(message) 55 | 56 | elif isinstance(e, executing.executing.NotOneValueFound): 57 | message = "Execution error: Expected exactly one value in the operation but found none or multiple." 
58 | print_message(message) 59 | 60 | else: 61 | message = f"Unexpected error:\n{str(e)}" 62 | print_message(message) 63 | raise e 64 | -------------------------------------------------------------------------------- /target_users.md: -------------------------------------------------------------------------------- 1 | # CodeFlare Stack Target Users 2 | 3 | [Cluster Admin](#cluster-administrator) 4 | 5 | [Data Scientist I](#data-scientist-i) 6 | 7 | [Data Scientist II](#data-scientist-ii) 8 | 9 | 10 | 11 | ## Cluster Administrator 12 | 13 | * Quota Management 14 | * Gang-Scheduling for Distributed Compute 15 | * Job/Infrastructure Queuing 16 | 17 | I want to enable a team of data scientists to have self-serve, but limited, access to a shared pool of distributed compute resources such as GPUs for large scale machine learning model training jobs. If the existing pool of resources is insufficient, I want my cluster to scale up (to a defined quota) to meet my users’ needs and scale back down automatically when their jobs have completed. I want these features to be made available through simple installation of generic modules via a user-friendly interface. I also want the ability to monitor current queue of pending tasks, the utilization of active resources, and the progress of all current jobs visualized in a simple dashboard. 18 | 19 | ## Data Scientist I 20 | 21 | * Training Mid-Size Models (less than 1,000 nodes) 22 | * Fine-Tuning Existing Models 23 | * Distributed Compute Framework 24 | 25 | I need temporary access to a reasonably large set of GPU enabled nodes on my team’s shared cluster for short term experimentation, parallelizing my existing ML workflow, or fine-tuning existing large scale models. I’d prefer to work from a notebook environment with access to a python sdk that I can use to request the creation of Framework Clusters that I can distribute my workloads across. 
In addition to interactive experimentation work, I also want the ability to “fire-and-forget” longer running ML jobs onto temporarily deployed Framework Clusters with the ability to monitor these jobs while they are running and access to all of their artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard. 26 | 27 | ## Data Scientist II 28 | * Training Foundation Models (1,000+ nodes) 29 | * Distributed Compute Framework 30 | 31 | I need temporary (but long term) access to a massive amount of GPU enabled infrastructure to train a foundation model. I want to be able to “fire-and-forget” my ML Job into this environment. Due to the size and cost associated with this job, it has already been well tested and validated, so access to jupyter notebooks is unnecessary. I would prefer to write my job as a bash script leveraging a CLI, or as a python script leveraging an SDK. I need the ability to monitor the job while it is running, as well as access to all of its artifacts once complete. I also want to see where my jobs are in the current queue and the progress of all my current jobs visualized in a simple dashboard. 32 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/examples/use-builder.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from os import path 4 | import json 5 | 6 | 7 | """ 8 | in case you are working directly with the source, and don't wish to 9 | install the module with pip install, you can directly import the packages by uncommenting the following code. 
10 | """ 11 | 12 | """ 13 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 14 | 15 | current_dir = os.path.dirname(os.path.abspath(__file__)) 16 | parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir)) 17 | sibling_dirs = [ 18 | d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d)) 19 | ] 20 | for sibling_dir in sibling_dirs: 21 | sys.path.append(os.path.join(parent_dir, sibling_dir)) 22 | """ 23 | 24 | from codeflare_sdk.vendored.python_client import kuberay_cluster_api 25 | 26 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder 27 | 28 | 29 | def main(): 30 | print("starting cluster handler...") 31 | my_kuberay_api = kuberay_cluster_api.RayClusterApi() 32 | 33 | my_cluster_builder = kuberay_cluster_builder.ClusterBuilder() 34 | 35 | cluster1 = ( 36 | my_cluster_builder.build_meta( 37 | name="new-cluster1", labels={"demo-cluster": "yes"} 38 | ) 39 | .build_head() 40 | .build_worker(group_name="workers") 41 | .get_cluster() 42 | ) 43 | 44 | if not my_cluster_builder.succeeded: 45 | print("error building the cluster, aborting...") 46 | return 47 | 48 | print("creating raycluster = {}".format(cluster1["metadata"]["name"])) 49 | my_kuberay_api.create_ray_cluster(body=cluster1) 50 | 51 | # the rest of the code is simply to list and cleanup the created cluster 52 | kube_ray_list = my_kuberay_api.list_ray_clusters( 53 | k8s_namespace="default", label_selector="demo-cluster=yes" 54 | ) 55 | if "items" in kube_ray_list: 56 | line = "-" * 72 57 | print(line) 58 | print("{:<63s}{:>2s}".format("Name", "Namespace")) 59 | print(line) 60 | for cluster in kube_ray_list["items"]: 61 | print( 62 | "{:<63s}{:>2s}".format( 63 | cluster["metadata"]["name"], 64 | cluster["metadata"]["namespace"], 65 | ) 66 | ) 67 | print(line) 68 | 69 | if "items" in kube_ray_list: 70 | for cluster in kube_ray_list["items"]: 71 | print("deleting raycluster = {}".format(cluster["metadata"]["name"])) 72 | 
my_kuberay_api.delete_ray_cluster( 73 | name=cluster["metadata"]["name"], 74 | k8s_namespace=cluster["metadata"]["namespace"], 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/in_cluster/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | High-level setup and cleanup functions for in-cluster test execution. 3 | 4 | This module provides convenient functions that combine service account creation 5 | and RBAC setup for easy use in test setup/teardown methods. 6 | """ 7 | 8 | from kubernetes import client 9 | from .rbac import ( 10 | create_test_service_account, 11 | create_rayjob_rbac, 12 | delete_test_service_account, 13 | ) 14 | 15 | 16 | def setup_in_cluster_test_environment( 17 | api_instance: client.CoreV1Api, 18 | custom_api: client.CustomObjectsApi, 19 | namespace: str, 20 | name_prefix: str = "test-pod", 21 | ) -> str: 22 | """ 23 | Set up a complete in-cluster test environment with service account and RBAC. 24 | 25 | This function: 26 | 1. Creates a ServiceAccount 27 | 2. Creates a Role with permissions for RayJob operations 28 | 3. Creates a RoleBinding linking the Role to the ServiceAccount 29 | 30 | Args: 31 | api_instance: Kubernetes CoreV1Api instance. 32 | custom_api: CustomObjectsApi instance. 33 | namespace: Namespace to create resources in. 34 | name_prefix: Prefix for resource names. 35 | 36 | Returns: 37 | The service account name to use in pod creation. 
38 | """ 39 | service_account_name = create_test_service_account( 40 | api_instance=api_instance, 41 | namespace=namespace, 42 | name_prefix=name_prefix, 43 | ) 44 | 45 | try: 46 | create_rayjob_rbac( 47 | api_instance=api_instance, 48 | custom_api=custom_api, 49 | namespace=namespace, 50 | service_account_name=service_account_name, 51 | ) 52 | except Exception: 53 | try: 54 | api_instance.delete_namespaced_service_account( 55 | service_account_name, namespace 56 | ) 57 | except Exception: 58 | pass 59 | raise 60 | 61 | return service_account_name 62 | 63 | 64 | def cleanup_in_cluster_test_environment( 65 | api_instance: client.CoreV1Api, 66 | custom_api: client.CustomObjectsApi, 67 | namespace: str, 68 | service_account_name: str, 69 | ) -> None: 70 | """ 71 | Clean up in-cluster test environment (ServiceAccount, Role, RoleBinding). 72 | 73 | Args: 74 | api_instance: Kubernetes CoreV1Api instance. 75 | custom_api: CustomObjectsApi instance. 76 | namespace: Namespace where resources exist. 77 | service_account_name: Name of the service account to clean up. 
78 | """ 79 | delete_test_service_account( 80 | api_instance=api_instance, 81 | custom_api=custom_api, 82 | namespace=namespace, 83 | service_account_name=service_account_name, 84 | ) 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "codeflare-sdk" 3 | version = "0.33.0" 4 | 5 | [tool.poetry] 6 | name = "codeflare-sdk" 7 | version = "0.33.0" 8 | description = "Python SDK for codeflare client" 9 | 10 | license = "Apache-2.0" 11 | 12 | # Exclude vendored tests, examples, and build files from the package 13 | exclude = [ 14 | "src/codeflare_sdk/vendored/python_client_test", 15 | "src/codeflare_sdk/vendored/examples", 16 | "src/codeflare_sdk/vendored/pyproject.toml", 17 | "src/codeflare_sdk/vendored/poetry.lock", 18 | "src/codeflare_sdk/vendored/README.md" 19 | ] 20 | 21 | authors = [ 22 | "Michael Clifford ", 23 | "Mustafa Eyceoz ", 24 | "Abhishek Malvankar ", 25 | "Atin Sood ", 26 | ] 27 | 28 | readme = 'README.md' 29 | 30 | repository = "https://github.com/project-codeflare/codeflare-sdk" 31 | homepage = "https://github.com/project-codeflare/codeflare-sdk" 32 | 33 | keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale'] 34 | 35 | [tool.poetry.dependencies] 36 | python = "^3.11" 37 | openshift-client = "1.0.18" 38 | rich = ">=12.5,<14.0" 39 | ray = {version = "2.52.1", extras = ["data", "default"]} 40 | kubernetes = ">= 27.2.0" 41 | cryptography = "43.0.3" 42 | executing = "1.2.0" 43 | pydantic = ">= 2.10.6" 44 | ipywidgets = "8.1.2" 45 | 46 | [[tool.poetry.source]] 47 | name = "pypi" 48 | 49 | [[tool.poetry.source]] 50 | name = "testpypi" 51 | url = "https://test.pypi.org/simple/" 52 | 53 | [tool.poetry.group.docs] 54 | optional = true 55 | 56 | [tool.poetry.group.docs.dependencies] 57 | sphinx = "7.4.7" 58 | sphinx-rtd-theme = "3.0.1" 59 | 60 | [tool.poetry.group.test] 61 | optional = true 62 
@pytest.mark.skip(reason="Skipping heterogenous cluster kind test")
@pytest.mark.kind
class TestHeterogeneousClustersKind:
    """E2E test: RayClusters bound to Kueue resource flavors land on the nodes
    those flavors select.

    Relies on helpers star-imported from ``support`` (namespace/Kueue setup,
    node lookups). Currently skipped.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Remove the test namespace and every Kueue object the test created.
        delete_namespace(self)
        delete_kueue_resources(self)

    @pytest.mark.nvidia_gpu
    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Two resource flavors with node labels + tolerations so the scheduler
        # can steer clusters onto distinct node sets.
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each flavor: create a cluster on that flavor's local queue and
        assert its pod is scheduled on a node matching the flavor's labels.

        :param gpu_resource_name: extended resource key requested on workers.
        :param number_of_gpus: GPUs per worker (0 keeps the request inert).
        """
        for flavor in self.resource_flavors:
            # The flavor's nodeLabels determine which nodes are eligible.
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            # Pick the local queue backed by this flavor (None falls back to
            # the SDK's default queue resolution).
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    name=cluster_name,
                    namespace=self.namespace,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=2,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=1,
                    worker_memory_limits=4,
                    worker_extended_resource_requests={
                        gpu_resource_name: number_of_gpus
                    },
                    write_to_file=True,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # NOTE(review): fixed 5s sleeps assume the pod is scheduled quickly;
            # a readiness poll would be more robust — confirm before relying on
            # this in slower environments.
            sleep(5)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            sleep(5)
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
@pytest.mark.skip(reason="Remote ray.init() is temporarily unsupported")
@pytest.mark.openshift
@pytest.mark.tier1
class TestRayLocalInteractiveOauth:
    """E2E test: connect to an OAuth-protected RayCluster through the local
    interactive Ray client and run a distributed computation.

    Relies on helpers star-imported from ``support``. Currently skipped
    because remote ``ray.init()`` is unsupported.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        # Fix: pytest already invokes setup_method() before every test, so the
        # previous explicit self.setup_method() call here was redundant and
        # initialized the Kubernetes client twice. It has been removed to
        # match the sibling e2e test classes.
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    def run_local_interactives(self):
        """Create a TLS-enabled cluster, connect via the Ray client and verify
        a deterministic distributed computation end to end."""
        ray_image = get_ray_image()

        # Authenticate using the token/server of the currently logged-in
        # ``oc`` user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        cluster_name = "test-ray-cluster-li"

        cluster = Cluster(
            ClusterConfiguration(
                namespace=self.namespace,
                name=cluster_name,
                num_workers=1,
                head_memory_requests=6,
                head_memory_limits=8,
                head_cpu_requests=1,
                head_cpu_limits=1,
                worker_cpu_requests=1,
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                image=ray_image,
                verify_tls=False,
            )
        )
        cluster.apply()
        cluster.wait_ready()

        # Generate and export TLS material so the local client can reach the
        # cluster's interactive endpoint.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        # Reset any prior Ray state before connecting to the remote cluster.
        ray.shutdown()
        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

        @ray.remote
        def heavy_calculation_part(num_iterations):
            # Deterministic triple-nested trig workload; the result depends
            # only on num_iterations, enabling an exact assertion below.
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote
        def heavy_calculation(num_iterations):
            # Fan the work out over 30 tasks and sum the partial results.
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        ref = heavy_calculation.remote(3000)
        result = ray.get(ref)
        # Exact float comparison is intentional: the workload is deterministic
        # for a fixed iteration count on IEEE-754 doubles.
        assert result == 1789.4644387076714
        ray.cancel(ref)
        ray.shutdown()

        cluster.down()
code:: python 24 | 25 | submission_id = client.submit_job( 26 | entrypoint=..., 27 | runtime_env={ 28 | "env_vars": { 29 | "AWS_ACCESS_KEY_ID": os.environ.get('AWS_ACCESS_KEY_ID'), 30 | "AWS_SECRET_ACCESS_KEY": os.environ.get('AWS_SECRET_ACCESS_KEY'), 31 | "AWS_DEFAULT_REGION": os.environ.get('AWS_DEFAULT_REGION') 32 | }, 33 | } 34 | ) 35 | 36 | In your Trainer configuration you can specify a ``run_config`` which 37 | will utilise your external storage. 38 | 39 | .. code:: python 40 | 41 | trainer = TorchTrainer( 42 | train_func_distributed, 43 | scaling_config=scaling_config, 44 | run_config = ray.train.RunConfig(storage_path="s3://BUCKET_NAME/SUB_PATH/", name="unique_run_name") 45 | ) 46 | 47 | To learn more about Amazon S3 Storage you can find information 48 | `here `__. 49 | 50 | Minio Bucket 51 | ------------ 52 | 53 | In your Python Script add the following function for configuring your 54 | run_config: 55 | 56 | .. code:: python 57 | 58 | import s3fs 59 | import pyarrow 60 | 61 | def get_minio_run_config(): 62 | s3_fs = s3fs.S3FileSystem( 63 | key = os.getenv('MINIO_ACCESS_KEY', "XXXXX"), 64 | secret = os.getenv('MINIO_SECRET_ACCESS_KEY', "XXXXX"), 65 | endpoint_url = os.getenv('MINIO_URL', "XXXXX") 66 | ) 67 | custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs)) 68 | run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs) 69 | return run_config 70 | 71 | You can update the ``run_config`` to further suit your needs above. 72 | Lastly the new ``run_config`` must be added to the Trainer: 73 | 74 | .. code:: python 75 | 76 | trainer = TorchTrainer( 77 | train_func_distributed, 78 | scaling_config=scaling_config, 79 | run_config = get_minio_run_config() 80 | ) 81 | 82 | To find more information on creating a Minio Bucket compatible with 83 | RHOAI you can refer to this 84 | `documentation `__. 85 | Note: You must have ``s3fs`` and ``pyarrow`` installed in your 86 | environment for this method. 
87 | -------------------------------------------------------------------------------- /demo-notebooks/additional-demos/remote_ray_job_client.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Submit a training job remotely to Ray Dashboard protected by oAuth.\n", 8 | "This notebook will demonstrate how to submit Ray jobs to an existing Raycluster, using the CodeFlare SDK.\n", 9 | "\n", 10 | "### Requirements\n", 11 | "* Ray Cluster running in OpenShift protected by oAuth.\n", 12 | "* The Ray Dashboard URL for the Ray Cluster.\n", 13 | "* An OpenShift authorization token with permissions to access the Route.\n", 14 | "* A training job, defined in python, within the working directory.\n", 15 | "* A requirements.txt or equivalent file containing any additional packages to install onto the Ray images." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Import dependencies from codeflare-sdk\n", 25 | "from codeflare_sdk import RayJobClient" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Setup Authentication Configuration \n", 35 | "auth_token = \"XXXX\" # Replace with the actual token\n", 36 | "header = {\n", 37 | " 'Authorization': f'Bearer {auth_token}'\n", 38 | "}" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Gather the dashboard URL (provided by the creator of the RayCluster)\n", 48 | "ray_dashboard = \"XXXX\" # Replace with the Ray dashboard URL" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "#Initialize the RayJobClient\n", 58 | "client = 
@pytest.mark.skip(reason="Temporarily skipped due to needed investigation")
@pytest.mark.openshift
@pytest.mark.tier1
class TestHeterogeneousClustersOauth:
    """E2E test (OpenShift/OAuth variant): RayClusters bound to Kueue resource
    flavors land on the nodes those flavors select.

    Relies on helpers star-imported from ``support``. Currently skipped.
    """

    def setup_method(self):
        # Attaches Kubernetes API clients to ``self`` (helper from support).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_heterogeneous_clusters(self):
        create_namespace(self)
        # Two resource flavors with node labels + tolerations so the scheduler
        # can steer clusters onto distinct node sets.
        create_kueue_resources(self, 2, with_labels=True, with_tolerations=True)
        self.run_heterogeneous_clusters()

    def run_heterogeneous_clusters(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """For each flavor: create a cluster on that flavor's local queue and
        assert its pod is scheduled on a node matching the flavor's labels.

        :param gpu_resource_name: extended resource key (unused here beyond
            the default signature shared with the kind variant).
        :param number_of_gpus: GPUs per worker (0 keeps the request inert).
        """
        ray_image = get_ray_image()

        # Authenticate using the token/server of the currently logged-in
        # ``oc`` user.
        auth = TokenAuthentication(
            token=run_oc_command(["whoami", "--show-token=true"]),
            server=run_oc_command(["whoami", "--show-server=true"]),
            skip_tls=True,
        )
        auth.login()

        for flavor in self.resource_flavors:
            # The flavor's nodeLabels determine which nodes are eligible.
            node_labels = (
                get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
            )
            expected_nodes = get_nodes_by_label(self, node_labels)

            print(f"Expected nodes: {expected_nodes}")
            cluster_name = f"test-ray-cluster-li-{flavor[-5:]}"
            # Pick the local queue backed by this flavor (None falls back to
            # the SDK's default queue resolution).
            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
            queue_name = queues[0]["name"] if queues else None
            print(f"Using flavor: {flavor}, Queue: {queue_name}")
            cluster = Cluster(
                ClusterConfiguration(
                    namespace=self.namespace,
                    name=cluster_name,
                    num_workers=1,
                    head_cpu_requests="500m",
                    head_cpu_limits="500m",
                    head_memory_requests=2,
                    head_memory_limits=4,
                    worker_cpu_requests="500m",
                    worker_cpu_limits=1,
                    worker_memory_requests=2,
                    worker_memory_limits=4,
                    image=ray_image,
                    verify_tls=False,
                    local_queue=queue_name,
                )
            )
            cluster.apply()
            # Wait for the cluster to be scheduled and ready, we don't need the dashboard for this check
            cluster.wait_ready(dashboard_check=False)
            node_name = get_pod_node(self, self.namespace, cluster_name)
            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
            assert (
                node_name in expected_nodes
            ), f"Node {node_name} is not in the expected nodes for flavor {flavor}."
            cluster.down()
parent = Path(__file__).resolve().parents[4]  # project directory
aw_dir = os.path.expanduser("~/.codeflare/resources/")  # where generated AppWrapper yamls are written


def test_AWManager_creation(mocker):
    """AWManager must load a generated AppWrapper yaml and reject bad input."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")
    # Create test.yaml
    Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            write_to_file=True,
            appwrapper=True,
        )
    )

    testaw = AWManager(f"{aw_dir}test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted == False
    # A non-existent path must surface as FileNotFoundError.
    # NOTE(review): if no exception is raised these try/except checks pass
    # silently — a pytest.raises block would be stricter; confirm intent.
    try:
        testaw = AWManager("fake")
    except Exception as e:
        assert type(e) == FileNotFoundError
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    # A structurally invalid AppWrapper yaml must raise ValueError.
    try:
        testaw = apply_template(
            AWManager(
                f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml"
            ),
            get_template_variables(),
        )
    except Exception as e:
        assert type(e) == ValueError
        assert (
            str(e)
            == f"{parent}/tests/test_cluster_yamls/appwrapper/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )


def test_AWManager_submit_remove(mocker, capsys):
    """submit()/remove() must toggle ``submitted`` and call the Kubernetes API
    with the expected arguments (verified by the arg_check_* side effects)."""
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    testaw = AWManager(f"{aw_dir}test.yaml")
    # remove() before submit() is a no-op that prints a friendly message.
    testaw.remove()
    captured = capsys.readouterr()
    assert (
        captured.out
        == "AppWrapper not submitted by this manager yet, nothing to remove\n"
    )
    assert testaw.submitted == False
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object",
        side_effect=arg_check_aw_apply_effect,
    )
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
        side_effect=arg_check_aw_del_effect,
    )
    testaw.submit()
    assert testaw.submitted == True
    testaw.remove()
    assert testaw.submitted == False


# Make sure to always keep this function last
def test_cleanup():
    # Removes the yaml written by test_AWManager_creation; relies on test
    # execution order, hence the "keep last" requirement above.
    os.remove(f"{aw_dir}test.yaml")
def wait(duration: int = 5, step_name: str = "next"):
    """Countdown for ``duration`` seconds before the named step, one tick per second."""
    print("waiting for {} seconds before {} step".format(duration, step_name))
    for i in range(duration, 0, -1):
        sys.stdout.write(str(i) + " ")
        sys.stdout.flush()
        time.sleep(1)
    print()


def main():
    """Create a small RayCluster, print it, wait for it to run, then delete it."""
    print("starting cluster handler...")

    my_kube_ray_api = kuberay_cluster_api.RayClusterApi()

    my_cluster_director = kuberay_cluster_builder.Director()

    # building the raycluster representation
    cluster_body = my_cluster_director.build_small_cluster(
        name="new-small-cluster", k8s_namespace="default"
    )

    # The director returns a falsy body when the spec could not be built;
    # without this guard the code below would crash on cluster_body["metadata"].
    if not cluster_body:
        print("failed to build the raycluster representation, aborting...")
        return

    # creating the raycluster in k8s
    print("creating the cluster...")
    my_kube_ray_api.create_ray_cluster(body=cluster_body)

    # now the cluster should be created.
    # the rest of the code is simply to fetch, print and cleanup the created cluster

    print("fetching the cluster...")
    # fetching the raycluster from k8s api-server
    kube_ray_cluster = my_kube_ray_api.get_ray_cluster(
        name=cluster_body["metadata"]["name"], k8s_namespace="default"
    )

    # Guard against a failed fetch: the original continued and would have
    # raised a TypeError on kube_ray_cluster["metadata"] below.
    if not kube_ray_cluster:
        print("failed to fetch the created raycluster, aborting...")
        return

    print(
        "try: kubectl -n {} get raycluster {} -o yaml".format(
            kube_ray_cluster["metadata"]["namespace"],
            kube_ray_cluster["metadata"]["name"],
        )
    )
    wait(step_name="print created cluster in JSON")
    print("printing the raycluster JSON representation...")
    json_formatted_str = json.dumps(kube_ray_cluster, indent=2)
    print(json_formatted_str)

    # waiting until the cluster is running, and has its status updated
    is_running = my_kube_ray_api.wait_until_ray_cluster_running(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )

    print(
        "raycluster {} status is {}".format(
            kube_ray_cluster["metadata"]["name"], "Running" if is_running else "unknown"
        )
    )

    wait(step_name="cleaning up")
    print("deleting raycluster {}.".format(kube_ray_cluster["metadata"]["name"]))

    my_kube_ray_api.delete_ray_cluster(
        name=kube_ray_cluster["metadata"]["name"],
        k8s_namespace=kube_ray_cluster["metadata"]["namespace"],
    )


if __name__ == "__main__":
    main()
| labels: 15 | controller-tools.k8s.io: '1.0' 16 | name: unit-test-cluster 17 | namespace: ns 18 | spec: 19 | autoscalerOptions: 20 | idleTimeoutSeconds: 60 21 | resources: 22 | limits: 23 | cpu: 500m 24 | memory: 512Mi 25 | requests: 26 | cpu: 500m 27 | memory: 512Mi 28 | upscalingMode: Default 29 | enableInTreeAutoscaling: false 30 | headGroupSpec: 31 | rayStartParams: 32 | block: 'true' 33 | dashboard-host: 0.0.0.0 34 | num-gpus: '0' 35 | resources: '"{}"' 36 | serviceType: ClusterIP 37 | template: 38 | spec: 39 | containers: 40 | - env: 41 | - name: MY_POD_IP 42 | valueFrom: 43 | fieldRef: 44 | fieldPath: status.podIP 45 | image: "${image}" 46 | imagePullPolicy: IfNotPresent 47 | lifecycle: 48 | preStop: 49 | exec: 50 | command: 51 | - /bin/sh 52 | - -c 53 | - ray stop 54 | name: ray-head 55 | ports: 56 | - containerPort: 6379 57 | name: gcs 58 | - containerPort: 8265 59 | name: dashboard 60 | - containerPort: 10001 61 | name: client 62 | resources: 63 | limits: 64 | cpu: 2 65 | memory: 8G 66 | requests: 67 | cpu: 2 68 | memory: 8G 69 | rayVersion: 2.52.1 70 | workerGroupSpecs: 71 | - groupName: small-group-unit-test-cluster 72 | maxReplicas: 2 73 | minReplicas: 2 74 | rayStartParams: 75 | block: 'true' 76 | num-gpus: '7' 77 | resources: '"{}"' 78 | replicas: 2 79 | template: 80 | metadata: 81 | annotations: 82 | key: value 83 | labels: 84 | key: value 85 | spec: 86 | containers: 87 | - env: 88 | - name: MY_POD_IP 89 | valueFrom: 90 | fieldRef: 91 | fieldPath: status.podIP 92 | image: "${image}" 93 | lifecycle: 94 | preStop: 95 | exec: 96 | command: 97 | - /bin/sh 98 | - -c 99 | - ray stop 100 | name: machine-learning 101 | resources: 102 | limits: 103 | cpu: 4 104 | memory: 6G 105 | nvidia.com/gpu: 7 106 | requests: 107 | cpu: 3 108 | memory: 5G 109 | nvidia.com/gpu: 7 110 | -------------------------------------------------------------------------------- /tests/e2e/minio_deployment.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: minio-pvc 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 20Gi 12 | volumeMode: Filesystem 13 | --- 14 | kind: Secret 15 | apiVersion: v1 16 | metadata: 17 | name: minio-secret 18 | stringData: 19 | # change the username and password to your own values. 20 | # ensure that the user is at least 3 characters long and the password at least 8 21 | minio_root_user: minio 22 | minio_root_password: minio123 23 | --- 24 | kind: Deployment 25 | apiVersion: apps/v1 26 | metadata: 27 | name: minio 28 | spec: 29 | replicas: 1 30 | selector: 31 | matchLabels: 32 | app: minio 33 | template: 34 | metadata: 35 | creationTimestamp: null 36 | labels: 37 | app: minio 38 | spec: 39 | volumes: 40 | - name: data 41 | persistentVolumeClaim: 42 | claimName: minio-pvc 43 | containers: 44 | - resources: 45 | limits: 46 | cpu: 250m 47 | memory: 1Gi 48 | requests: 49 | cpu: 20m 50 | memory: 100Mi 51 | readinessProbe: 52 | tcpSocket: 53 | port: 9000 54 | initialDelaySeconds: 5 55 | timeoutSeconds: 1 56 | periodSeconds: 5 57 | successThreshold: 1 58 | failureThreshold: 3 59 | terminationMessagePath: /dev/termination-log 60 | name: minio 61 | livenessProbe: 62 | tcpSocket: 63 | port: 9000 64 | initialDelaySeconds: 30 65 | timeoutSeconds: 1 66 | periodSeconds: 5 67 | successThreshold: 1 68 | failureThreshold: 3 69 | env: 70 | - name: MINIO_ROOT_USER 71 | valueFrom: 72 | secretKeyRef: 73 | name: minio-secret 74 | key: minio_root_user 75 | - name: MINIO_ROOT_PASSWORD 76 | valueFrom: 77 | secretKeyRef: 78 | name: minio-secret 79 | key: minio_root_password 80 | ports: 81 | - containerPort: 9000 82 | protocol: TCP 83 | - containerPort: 9090 84 | protocol: TCP 85 | imagePullPolicy: IfNotPresent 86 | volumeMounts: 87 | - name: data 88 | mountPath: /data 89 | subPath: minio 90 | 
terminationMessagePolicy: File 91 | image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z 92 | args: 93 | - server 94 | - /data 95 | - --console-address 96 | - :9090 97 | restartPolicy: Always 98 | terminationGracePeriodSeconds: 30 99 | dnsPolicy: ClusterFirst 100 | securityContext: {} 101 | schedulerName: default-scheduler 102 | strategy: 103 | type: Recreate 104 | revisionHistoryLimit: 10 105 | progressDeadlineSeconds: 600 106 | --- 107 | kind: Service 108 | apiVersion: v1 109 | metadata: 110 | name: minio-service 111 | spec: 112 | ipFamilies: 113 | - IPv4 114 | ports: 115 | - name: api 116 | protocol: TCP 117 | port: 9000 118 | targetPort: 9000 119 | - name: ui 120 | protocol: TCP 121 | port: 9090 122 | targetPort: 9090 123 | internalTrafficPolicy: Cluster 124 | type: ClusterIP 125 | ipFamilyPolicy: SingleStack 126 | sessionAffinity: None 127 | selector: 128 | app: minio 129 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/appwrapper/awload.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | The awload sub-module contains the definition of the AWManager object, which handles 17 | submission and deletion of existing AppWrappers from a user's file system. 
class AWManager:
    """
    An object for submitting and removing existing AppWrapper yamls
    to be added to the Kueue localqueue.
    """

    def __init__(self, filename: str) -> None:
        """
        Create the AppWrapper Manager object by passing in an
        AppWrapper yaml file.

        Raises:
            FileNotFoundError: if ``filename`` does not exist.
            ValueError: if the file is not a well-formed AppWrapper yaml.
        """
        if not isfile(filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
        self.filename = filename
        try:
            with open(self.filename) as f:
                # safe_load avoids arbitrary Python object construction from yaml.
                self.awyaml = yaml.safe_load(f)
            # Explicit check instead of `assert`, which is stripped under -O.
            if self.awyaml["kind"] != "AppWrapper":
                raise TypeError("yaml 'kind' is not AppWrapper")
            self.name = self.awyaml["metadata"]["name"]
            self.namespace = self.awyaml["metadata"]["namespace"]
        except Exception as e:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); chain the cause for debugging.
            raise ValueError(
                f"{filename} is not a correctly formatted AppWrapper yaml"
            ) from e
        self.submitted = False

    def submit(self) -> None:
        """
        Attempts to create the AppWrapper custom resource using the yaml file.

        Kubernetes API errors are routed through _kube_api_error_handling; on
        success the manager records that it owns the submission.
        """
        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.create_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                body=self.awyaml,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = True
        print(f"AppWrapper {self.filename} submitted!")

    def remove(self) -> None:
        """
        Attempts to delete the AppWrapper custom resource matching the name in
        the yaml, if submitted by this manager.

        A no-op (with a message) when submit() has not been called first.
        """
        if not self.submitted:
            print("AppWrapper not submitted by this manager yet, nothing to remove")
            return

        try:
            config_check()
            api_instance = client.CustomObjectsApi(get_api_client())
            api_instance.delete_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta2",
                namespace=self.namespace,
                plural="appwrappers",
                name=self.name,
            )
        except Exception as e:
            return _kube_api_error_handling(e)

        self.submitted = False
        print(f"AppWrapper {self.name} removed!")
14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | _app_wrapper_status, 17 | Cluster, 18 | ClusterConfiguration, 19 | ) 20 | from codeflare_sdk.ray.appwrapper import AppWrapper, AppWrapperStatus 21 | from codeflare_sdk.ray.cluster.status import CodeFlareClusterStatus 22 | from codeflare_sdk.common.utils.unit_test_support import get_local_queue 23 | import os 24 | 25 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 26 | 27 | 28 | def test_cluster_status(mocker): 29 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 30 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 31 | mocker.patch( 32 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 33 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 34 | ) 35 | fake_aw = AppWrapper("test", AppWrapperStatus.FAILED) 36 | 37 | cf = Cluster( 38 | ClusterConfiguration( 39 | name="test", 40 | namespace="ns", 41 | write_to_file=True, 42 | appwrapper=True, 43 | local_queue="local-queue-default", 44 | ) 45 | ) 46 | mocker.patch( 47 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=None 48 | ) 49 | mocker.patch( 50 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 51 | ) 52 | status, ready = cf.status() 53 | assert status == CodeFlareClusterStatus.UNKNOWN 54 | assert ready == False 55 | 56 | mocker.patch( 57 | "codeflare_sdk.ray.cluster.cluster._app_wrapper_status", return_value=fake_aw 58 | ) 59 | status, ready = cf.status() 60 | assert status == CodeFlareClusterStatus.FAILED 61 | assert ready == False 62 | 63 | fake_aw.status = AppWrapperStatus.SUSPENDED 64 | status, ready = cf.status() 65 | assert status == CodeFlareClusterStatus.QUEUED 66 | assert ready == False 67 | 68 | fake_aw.status = AppWrapperStatus.RESUMING 69 | status, ready = cf.status() 70 | assert status == CodeFlareClusterStatus.STARTING 71 | assert ready == False 72 | 73 | fake_aw.status = 
AppWrapperStatus.RESETTING 74 | status, ready = cf.status() 75 | assert status == CodeFlareClusterStatus.STARTING 76 | assert ready == False 77 | 78 | fake_aw.status = AppWrapperStatus.RUNNING 79 | status, ready = cf.status() 80 | assert status == CodeFlareClusterStatus.UNKNOWN 81 | assert ready == False 82 | 83 | 84 | def aw_status_fields(group, version, namespace, plural, *args): 85 | assert group == "workload.codeflare.dev" 86 | assert version == "v1beta2" 87 | assert namespace == "test-ns" 88 | assert plural == "appwrappers" 89 | assert args == tuple() 90 | return {"items": []} 91 | 92 | 93 | def test_aw_status(mocker): 94 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 95 | mocker.patch( 96 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 97 | side_effect=aw_status_fields, 98 | ) 99 | aw = _app_wrapper_status("test-aw", "test-ns") 100 | assert aw == None 101 | 102 | 103 | # Make sure to always keep this function last 104 | def test_cleanup(): 105 | os.remove(f"{aw_dir}test.yaml") 106 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/cluster/test_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from codeflare_sdk.ray.cluster.cluster import ( 16 | Cluster, 17 | ClusterConfiguration, 18 | _ray_cluster_status, 19 | ) 20 | from codeflare_sdk.ray.cluster.status import ( 21 | CodeFlareClusterStatus, 22 | RayClusterStatus, 23 | RayCluster, 24 | ) 25 | import os 26 | from ...common.utils.unit_test_support import get_local_queue 27 | 28 | aw_dir = os.path.expanduser("~/.codeflare/resources/") 29 | 30 | 31 | def test_cluster_status(mocker): 32 | mocker.patch("kubernetes.client.ApisApi.get_api_versions") 33 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 34 | 35 | fake_ray = RayCluster( 36 | name="test", 37 | status=RayClusterStatus.UNKNOWN, 38 | num_workers=1, 39 | worker_mem_requests=2, 40 | worker_mem_limits=2, 41 | worker_cpu_requests=1, 42 | worker_cpu_limits=1, 43 | namespace="ns", 44 | dashboard="fake-uri", 45 | head_cpu_requests=2, 46 | head_cpu_limits=2, 47 | head_mem_requests=8, 48 | head_mem_limits=8, 49 | ) 50 | 51 | mocker.patch( 52 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 53 | return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), 54 | ) 55 | 56 | cf = Cluster( 57 | ClusterConfiguration( 58 | name="test", 59 | namespace="ns", 60 | write_to_file=True, 61 | appwrapper=False, 62 | local_queue="local-queue-default", 63 | ) 64 | ) 65 | mocker.patch( 66 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=None 67 | ) 68 | status, ready = cf.status() 69 | assert status == CodeFlareClusterStatus.UNKNOWN 70 | assert ready == False 71 | 72 | mocker.patch( 73 | "codeflare_sdk.ray.cluster.cluster._ray_cluster_status", return_value=fake_ray 74 | ) 75 | 76 | status, ready = cf.status() 77 | assert status == CodeFlareClusterStatus.STARTING 78 | assert ready == False 79 | 80 | fake_ray.status = RayClusterStatus.FAILED 81 | status, ready = cf.status() 82 | assert status == CodeFlareClusterStatus.FAILED 83 | assert ready == False 84 | 85 | fake_ray.status = 
RayClusterStatus.UNHEALTHY 86 | status, ready = cf.status() 87 | assert status == CodeFlareClusterStatus.FAILED 88 | assert ready == False 89 | 90 | fake_ray.status = RayClusterStatus.READY 91 | status, ready = cf.status() 92 | assert status == CodeFlareClusterStatus.READY 93 | assert ready == True 94 | 95 | 96 | def rc_status_fields(group, version, namespace, plural, *args): 97 | assert group == "ray.io" 98 | assert version == "v1" 99 | assert namespace == "test-ns" 100 | assert plural == "rayclusters" 101 | assert args == tuple() 102 | return {"items": []} 103 | 104 | 105 | def test_rc_status(mocker): 106 | mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") 107 | mocker.patch( 108 | "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", 109 | side_effect=rc_status_fields, 110 | ) 111 | rc = _ray_cluster_status("test-rc", "test-ns") 112 | assert rc == None 113 | 114 | 115 | # Make sure to always keep this function last 116 | def test_cleanup(): 117 | os.remove(f"{aw_dir}test.yaml") 118 | -------------------------------------------------------------------------------- /tests/e2e/mnist_raycluster_sdk_kind_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from time import sleep 4 | 5 | from codeflare_sdk import Cluster, ClusterConfiguration 6 | from codeflare_sdk.ray.client import RayJobClient 7 | 8 | import pytest 9 | 10 | from support import * 11 | 12 | # This test creates a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster 13 | 14 | 15 | @pytest.mark.kind 16 | class TestRayClusterSDKKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | 
self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | ) 55 | ) 56 | 57 | cluster.apply() 58 | 59 | cluster.status() 60 | 61 | cluster.wait_ready() 62 | 63 | cluster.status() 64 | 65 | cluster.details() 66 | 67 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 68 | 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 84 | "env_vars": get_setup_env_variables(ACCELERATOR=accelerator), 85 | }, 86 | entrypoint_num_gpus=number_of_gpus, 87 | ) 88 | print(f"Submitted job with ID: {submission_id}") 89 | done = False 90 | time = 0 91 | timeout = 900 92 | while not done: 93 | status = client.get_job_status(submission_id) 94 | if status.is_terminal(): 95 | break 96 | if not done: 97 | 
print(status) 98 | if timeout and time >= timeout: 99 | raise TimeoutError(f"job has timed out after waiting {timeout}s") 100 | sleep(5) 101 | time += 5 102 | 103 | logs = client.get_job_logs(submission_id) 104 | print(logs) 105 | 106 | self.assert_job_completion(status) 107 | 108 | client.delete_job(submission_id) 109 | 110 | def assert_job_completion(self, status): 111 | if status == "SUCCEEDED": 112 | print(f"Job has completed: '{status}'") 113 | assert True 114 | else: 115 | print(f"Job has completed: '{status}'") 116 | assert False 117 | -------------------------------------------------------------------------------- /tests/e2e_v2/utils/scripts/gpu_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPU-optimized RayJob validation script using Ray Train. 3 | 4 | This script performs a minimal Ray Train task suitable for GPU execution 5 | to validate that a RayJob can successfully connect to and use an existing Ray cluster 6 | with GPU resources. 7 | 8 | Usage as RayJob entrypoint: 9 | python gpu_script.py 10 | """ 11 | 12 | import ray 13 | import sys 14 | from ray import train 15 | from ray.train import ScalingConfig 16 | from ray.train.torch import TorchTrainer 17 | 18 | 19 | def train_func(config): 20 | """ 21 | Minimal training function for GPU execution. 22 | 23 | This performs a simple computation task that validates: 24 | 1. Ray Train can initialize with GPU 25 | 2. GPU workers can execute tasks 26 | 3. 
def main():
    """
    Run a minimal Ray Train task on GPU.

    This validates that:
    1. Ray can be initialized (auto-connects to cluster when run as RayJob)
    2. Ray Train can execute a distributed task with GPU
    3. The job can complete successfully

    Returns:
        0 on success, 1 on failure
    """
    try:
        # When run as a RayJob entrypoint, init() attaches to the cluster.
        ray.init()

        print("Starting GPU training task...")

        # Report what the cluster exposes before requesting a GPU worker.
        resources = ray.cluster_resources()
        print(f"Cluster resources: {resources}")

        gpu_available = "GPU" in resources and resources.get("GPU", 0) > 0
        print(f"GPU available in cluster: {gpu_available}")

        # Single-worker TorchTrainer with a GPU request keeps the
        # validation minimal while still exercising the GPU path.
        trainer = TorchTrainer(
            train_func,
            scaling_config=ScalingConfig(
                num_workers=1,
                use_gpu=True,
            ),
        )

        result = trainer.fit()
        print(f"Training completed successfully. Metrics: {result.metrics}")

        # Success marker the e2e tests grep the job logs for.
        print("EXISTING_CLUSTER_JOB_SUCCESS")

        return 0

    except Exception as e:
        print(f"FAILURE: Exception occurred: {e}")
        import traceback

        traceback.print_exc()
        return 1
    finally:
        # Always detach from the cluster, even on failure.
        ray.shutdown()


if __name__ == "__main__":
    sys.exit(main())
TestRayClusterSDKAppWrapperKind: 17 | def setup_method(self): 18 | initialize_kubernetes_client(self) 19 | 20 | def teardown_method(self): 21 | delete_namespace(self) 22 | delete_kueue_resources(self) 23 | 24 | def test_mnist_ray_cluster_sdk_kind(self): 25 | self.setup_method() 26 | create_namespace(self) 27 | create_kueue_resources(self) 28 | self.run_mnist_raycluster_sdk_kind(accelerator="cpu") 29 | 30 | @pytest.mark.nvidia_gpu 31 | def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): 32 | self.setup_method() 33 | create_namespace(self) 34 | create_kueue_resources(self) 35 | self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) 36 | 37 | def run_mnist_raycluster_sdk_kind( 38 | self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 39 | ): 40 | cluster = Cluster( 41 | ClusterConfiguration( 42 | name="mnist", 43 | namespace=self.namespace, 44 | num_workers=1, 45 | head_cpu_requests="500m", 46 | head_cpu_limits="500m", 47 | worker_cpu_requests="500m", 48 | worker_cpu_limits=1, 49 | worker_memory_requests=1, 50 | worker_memory_limits=4, 51 | worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, 52 | write_to_file=True, 53 | verify_tls=False, 54 | appwrapper=True, 55 | ) 56 | ) 57 | 58 | cluster.apply() 59 | 60 | cluster.status() 61 | 62 | cluster.wait_ready() 63 | 64 | cluster.status() 65 | 66 | cluster.details() 67 | 68 | self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) 69 | assert_get_cluster_and_jobsubmit( 70 | self, "mnist", accelerator="gpu", number_of_gpus=1 71 | ) 72 | 73 | # Assertions 74 | 75 | def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): 76 | ray_dashboard = cluster.cluster_dashboard_uri() 77 | client = RayJobClient(address=ray_dashboard, verify=False) 78 | 79 | submission_id = client.submit_job( 80 | entrypoint="python mnist.py", 81 | runtime_env={ 82 | "working_dir": "./tests/e2e/", 83 | "pip": "./tests/e2e/mnist_pip_requirements.txt", 
Resource Flavors allow the cluster admin to reflect differing resource capabilities
of nodes within a cluster, such as CPU, memory, GPU, etc. These can then be assigned
to workloads to ensure they are executed on nodes with appropriate resources.
21 | 22 | .. code:: yaml 23 | 24 | apiVersion: kueue.x-k8s.io/v1beta1 25 | kind: ResourceFlavor 26 | metadata: 27 | name: default-flavor 28 | 29 | For more detailed information on Resource Flavor configuration options, 30 | refer to the Kueue documentation: `Resource Flavor 31 | Configuration `__ 32 | 33 | 2. Cluster Queue: 34 | ----------------- 35 | 36 | A Cluster Queue represents a shared queue across the entire cluster. It 37 | allows the cluster admin to define global settings for workload 38 | prioritization and resource allocation. 39 | 40 | When setting up a Cluster Queue in Kueue, it’s crucial that the resource 41 | specifications match the actual capacities and operational requirements 42 | of your cluster. The example provided outlines a basic setup; however, 43 | each cluster may have different resource availabilities and needs. 44 | 45 | .. code:: yaml 46 | 47 | apiVersion: kueue.x-k8s.io/v1beta1 48 | kind: ClusterQueue 49 | metadata: 50 | name: "cluster-queue" 51 | spec: 52 | namespaceSelector: {} # match all. 53 | resourceGroups: 54 | - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"] 55 | flavors: 56 | - name: "default-flavor" 57 | resources: 58 | - name: "cpu" 59 | nominalQuota: 9 60 | - name: "memory" 61 | nominalQuota: 36Gi 62 | - name: "pods" 63 | nominalQuota: 5 64 | - name: "nvidia.com/gpu" 65 | nominalQuota: '0' 66 | 67 | For more detailed information on Cluster Queue configuration options, 68 | refer to the Kueue documentation: `Cluster Queue 69 | Configuration `__ 70 | 71 | 3. Local Queue (With Default Annotation): 72 | ----------------------------------------- 73 | 74 | A Local Queue represents a queue associated with a specific namespace 75 | within the cluster. It allows namespace-level control over workload 76 | prioritization and resource allocation. 77 | 78 | .. 
code:: yaml 79 | 80 | apiVersion: kueue.x-k8s.io/v1beta1 81 | kind: LocalQueue 82 | metadata: 83 | namespace: team-a 84 | name: team-a-queue 85 | annotations: 86 | kueue.x-k8s.io/default-queue: "true" 87 | spec: 88 | clusterQueue: cluster-queue 89 | 90 | In the LocalQueue configuration provided above, the annotations field 91 | specifies ``kueue.x-k8s.io/default-queue: "true"``. This annotation 92 | indicates that the team-a-queue is designated as the default queue for 93 | the team-a namespace. When this is set, any workloads submitted to the 94 | team-a namespace without explicitly specifying a queue will 95 | automatically be routed to the team-a-queue. 96 | 97 | For more detailed information on Local Queue configuration options, 98 | refer to the Kueue documentation: `Local Queue 99 | Configuration `__ 100 | 101 | Conclusion: 102 | ----------- 103 | 104 | By following the steps outlined in this document, the cluster admin can 105 | successfully create the basic Kueue resources necessary for workload 106 | management in the cluster. For more advanced configurations and 107 | features, please refer to the comprehensive `Kueue 108 | documentation `__. 
from codeflare_sdk import Cluster, ClusterConfiguration
import pytest
import time
from kubernetes import client
from codeflare_sdk.common.utils import constants

from support import (
    initialize_kubernetes_client,
    create_namespace,
    delete_namespace,
    get_ray_cluster,
)


@pytest.mark.kind
class TestRayClusterApply:
    """E2E test: Cluster.apply() must create, then in-place update, a RayCluster."""

    def setup_method(self):
        # Bind a Kubernetes client to this test instance (sets self.api etc.).
        initialize_kubernetes_client(self)

    def teardown_method(self):
        # Remove the per-test namespace created in test_cluster_apply.
        delete_namespace(self)

    def test_cluster_apply(self):
        """Apply a 1-worker cluster, re-apply with 2 workers, then delete it."""
        self.setup_method()
        create_namespace(self)

        cluster_name = "test-cluster-apply"
        namespace = self.namespace

        # Initial configuration with 1 worker
        initial_config = ClusterConfiguration(
            name=cluster_name,
            namespace=namespace,
            num_workers=1,
            head_cpu_requests="500m",
            head_cpu_limits="1",
            head_memory_requests="1Gi",
            head_memory_limits="2Gi",
            worker_cpu_requests="500m",
            worker_cpu_limits="1",
            worker_memory_requests="1Gi",
            worker_memory_limits="2Gi",
            image=f"rayproject/ray:{constants.RAY_VERSION}",
            write_to_file=True,
            verify_tls=False,
        )

        # Create the cluster
        cluster = Cluster(initial_config)
        cluster.apply()

        # Wait for the cluster to be ready (dashboard is not exposed on KinD,
        # so skip the dashboard reachability check).
        cluster.wait_ready(dashboard_check=False)
        status, ready = cluster.status()
        assert ready, f"Cluster {cluster_name} is not ready: {status}"

        # Verify the cluster is created by reading the RayCluster CR back.
        ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert ray_cluster is not None, "Cluster was not created successfully"
        assert (
            ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1
        ), "Initial worker count does not match"

        # Update configuration with 2 workers; everything else is unchanged so
        # apply() should patch the existing CR rather than recreate it.
        updated_config = ClusterConfiguration(
            name=cluster_name,
            namespace=namespace,
            num_workers=2,
            head_cpu_requests="500m",
            head_cpu_limits="1",
            head_memory_requests="1Gi",
            head_memory_limits="2Gi",
            worker_cpu_requests="500m",
            worker_cpu_limits="1",
            worker_memory_requests="1Gi",
            worker_memory_limits="2Gi",
            image=f"rayproject/ray:{constants.RAY_VERSION}",
            write_to_file=True,
            verify_tls=False,
        )

        # Apply the updated configuration
        cluster.config = updated_config
        cluster.apply()

        # Give Kubernetes a moment to process the update
        time.sleep(5)

        # Wait for the updated cluster to be ready
        cluster.wait_ready(dashboard_check=False)
        updated_status, updated_ready = cluster.status()
        assert (
            updated_ready
        ), f"Cluster {cluster_name} is not ready after update: {updated_status}"

        # Verify the cluster is updated (replica count bumped to 2).
        updated_ray_cluster = get_ray_cluster(cluster_name, namespace)
        assert (
            updated_ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 2
        ), "Worker count was not updated"

        # Clean up
        cluster.down()

        # Wait for deletion to complete (finalizers may delay deletion)
        max_wait = 30  # seconds
        wait_interval = 2
        elapsed = 0

        while elapsed < max_wait:
            ray_cluster = get_ray_cluster(cluster_name, namespace)
            if ray_cluster is None:
                break
            time.sleep(wait_interval)
            elapsed += wait_interval

        assert (
            ray_cluster is None
        ), f"Cluster was not deleted successfully after {max_wait}s"
def test_gen_names_with_name(mocker):
    """When a name is supplied, gen_names must echo it for both resources."""
    # Pin uuid4 so name generation is deterministic even if a suffix is used.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    requested = "myname"
    appwrapper_name, cluster_name = gen_names(requested)
    assert (appwrapper_name, cluster_name) == (requested, requested)


def test_gen_names_without_name(mocker):
    """When no name is supplied, gen_names must synthesize prefixed names."""
    # Pin uuid4 so the generated suffix is deterministic.
    mocker.patch.object(
        uuid, "uuid4", return_value=uuid.UUID("00000000-0000-0000-0000-000000000001")
    )
    appwrapper_name, cluster_name = gen_names(None)
    for generated, expected_prefix in (
        (appwrapper_name, "appwrapper-"),
        (cluster_name, "cluster-"),
    ):
        assert generated.startswith(expected_prefix)
def test_build_ray_cluster_with_gcs_ft(mocker):
    """Enabling GCS fault tolerance must emit spec.gcsFaultToleranceOptions
    with the configured Redis address, password secret ref, and namespace."""
    # Stub kube config loading / CR listing so no real cluster is contacted.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    cluster = Cluster(
        ClusterConfiguration(
            name="test",
            namespace="ns",
            enable_gcs_ft=True,
            redis_address="redis:6379",
            redis_password_secret={"name": "redis-password-secret", "key": "password"},
            external_storage_namespace="new-ns",
        )
    )

    # Neutralize environment-dependent steps inside build_ray_cluster so the
    # generated resource reflects only the configuration above.
    mocker.patch("codeflare_sdk.ray.cluster.build_ray_cluster.config_check")
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.get_api_client", return_value=None
    )
    mocker.patch(
        "codeflare_sdk.ray.cluster.build_ray_cluster.update_image", return_value=None
    )

    resource = build_ray_cluster(cluster)

    assert "spec" in resource
    assert "gcsFaultToleranceOptions" in resource["spec"]

    gcs_ft_options = resource["spec"]["gcsFaultToleranceOptions"]

    # The password must be wired as a secretKeyRef, never as a literal value.
    assert gcs_ft_options["redisAddress"] == "redis:6379"
    assert gcs_ft_options["externalStorageNamespace"] == "new-ns"
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["name"]
        == "redis-password-secret"
    )
    assert (
        gcs_ft_options["redisPassword"]["valueFrom"]["secretKeyRef"]["key"]
        == "password"
    )
@pytest.mark.kind
class TestRayLocalInteractiveKind:
    """E2E test: interactive Ray client access to a KinD-hosted cluster via
    a kubectl port-forward of the Ray client port."""

    def setup_method(self):
        """Bind a Kubernetes client and reset port-forward bookkeeping."""
        initialize_kubernetes_client(self)
        self.port_forward_process = None

    def cleanup_port_forward(self):
        """Terminate the kubectl port-forward subprocess, if one is running.

        Bug fix: `wait(timeout=10)` can raise subprocess.TimeoutExpired; the
        original let that propagate, leaking the kubectl process and aborting
        teardown_method before the namespace was deleted. Fall back to kill().
        """
        if self.port_forward_process:
            self.port_forward_process.terminate()
            try:
                self.port_forward_process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # terminate() was ignored; force-kill and reap the process so
                # teardown can continue.
                self.port_forward_process.kill()
                self.port_forward_process.wait()
            self.port_forward_process = None

    def teardown_method(self):
        self.cleanup_port_forward()
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_local_interactives(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives()

    @pytest.mark.nvidia_gpu
    def test_local_interactives_nvidia_gpu(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
        self.run_local_interactives(number_of_gpus=1)

    def run_local_interactives(
        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
    ):
        """Spin up a Ray cluster, port-forward its client port, and run a
        distributed calculation through ray.init over the forwarded socket.

        Args:
            gpu_resource_name: extended-resource key used for GPU requests.
            number_of_gpus: GPUs requested per worker; split across the two
                remote task definitions below.
        """
        cluster_name = "test-ray-cluster-li"

        # Make sure no stale Ray session from a previous test is attached.
        ray.shutdown()

        cluster = Cluster(
            ClusterConfiguration(
                name=cluster_name,
                namespace=self.namespace,
                num_workers=1,
                head_cpu_requests="500m",
                head_cpu_limits="500m",
                worker_cpu_requests="500m",
                worker_cpu_limits=1,
                worker_memory_requests=1,
                worker_memory_limits=4,
                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                verify_tls=False,
            )
        )

        cluster.apply()

        cluster.wait_ready()
        cluster.status()

        # mTLS material for the interactive client connection.
        generate_cert.generate_tls_cert(cluster_name, self.namespace)
        generate_cert.export_env(cluster_name, self.namespace)

        print(cluster.local_client_url())

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation_part(num_iterations):
            result = 0.0
            for i in range(num_iterations):
                for j in range(num_iterations):
                    for k in range(num_iterations):
                        result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation(num_iterations):
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
            )
            return sum(results)

        # Attempt to port forward the Ray client port to localhost.
        try:
            local_port = "20001"
            ray_client_port = "10001"

            port_forward_cmd = [
                "kubectl",
                "port-forward",
                "-n",
                self.namespace,
                f"svc/{cluster_name}-head-svc",
                f"{local_port}:{ray_client_port}",
            ]
            self.port_forward_process = subprocess.Popen(
                port_forward_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )

            client_url = f"ray://localhost:{local_port}"
            cluster.status()

            ray.init(address=client_url, logging_level="INFO")

            ref = heavy_calculation.remote(3000)
            result = ray.get(ref)
            assert (
                result == 1789.4644387076728
            )  # Updated result after moving to Python 3.12 (0.0000000000008% difference to old assertion)
            ray.cancel(ref)
            ray.shutdown()

            cluster.down()
        finally:
            # Always reap the port-forward even if the Ray work above failed.
            self.cleanup_port_forward()
9 | "\n", 10 | "In this notebook, we will go through the basics of using the SDK to:\n", 11 | " * Define a RayCluster configuration\n", 12 | " * Use this configuration alongside a RayJob definition\n", 13 | " * Submit the RayJob, and allow Kuberay Operator to lifecycle the RayCluster for the RayJob" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "18136ea7", 19 | "metadata": {}, 20 | "source": [ 21 | "## Defining and Submitting the RayJob\n", 22 | "First, we'll need to import the relevant CodeFlare SDK packages. You can do this by executing the below cell." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "51e18292", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from codeflare_sdk import RayJob, ManagedClusterConfig" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "649c5911", 38 | "metadata": {}, 39 | "source": [ 40 | "Run the below `oc login` command using your Token and Server URL. Ensure the command is prepended by `!` and not `%`. This will work when running both locally and within RHOAI." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "dc364888", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "!oc login --token= --server=" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "5581eca9", 56 | "metadata": {}, 57 | "source": [ 58 | "Next we'll need to define the ManagedClusterConfig. 
Kuberay will use this to spin up a short-lived RayCluster that will only exist as long as the job" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "3094c60a", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "cluster_config = ManagedClusterConfig(\n", 69 | " head_memory_requests=6,\n", 70 | " head_memory_limits=8,\n", 71 | " num_workers=2,\n", 72 | " worker_cpu_requests=1,\n", 73 | " worker_cpu_limits=1,\n", 74 | " worker_memory_requests=4,\n", 75 | " worker_memory_limits=6,\n", 76 | " head_accelerators={'nvidia.com/gpu': 0},\n", 77 | " worker_accelerators={'nvidia.com/gpu': 0},\n", 78 | ")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "02a2b32b", 84 | "metadata": {}, 85 | "source": [ 86 | "Lastly we can pass the ManagedClusterConfig into the RayJob and submit it. You do not need to worry about tearing down the cluster when the job has completed, that is handled for you!" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "e905ccea", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "job = RayJob(\n", 97 | " job_name=\"demo-rayjob\",\n", 98 | " entrypoint=\"python -c 'print(\\\"Hello from RayJob!\\\")'\",\n", 99 | " cluster_config=cluster_config,\n", 100 | " namespace=\"your-namespace\"\n", 101 | ")\n", 102 | "\n", 103 | "job.submit()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "f3612de2", 109 | "metadata": {}, 110 | "source": [ 111 | "We can check the status of our job by executing the below cell. The status may appear as `unknown` for a time while the RayCluster spins up." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "96d92f93", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "job.status()" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.11.11" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 5 146 | } 147 | -------------------------------------------------------------------------------- /src/codeflare_sdk/ray/rayjobs/pretty_print.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 IBM, Red Hat 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | This sub-module exists primarily to be used internally by the RayJob object 17 | (in the rayjob sub-module) for pretty-printing job status and details. 
def print_job_status(job_info: RayJobInfo):
    """
    Pretty print the job status in a format similar to cluster status.

    Args:
        job_info: Snapshot of the RayJob's name, id, status, cluster and
            namespace, plus optional timing/attempt information.
    """
    # Map the deployment status onto a display string and a header colour.
    status_display, header_color = _get_status_display(job_info.status)

    # Create main info table
    table = _create_info_table(header_color, job_info.name, status_display)
    table.add_row(f"[bold]Job ID:[/bold] {job_info.job_id}")
    table.add_row(f"[bold]Status:[/bold] {job_info.status.value}")
    table.add_row(f"[bold]RayCluster:[/bold] {job_info.cluster_name}")
    table.add_row(f"[bold]Namespace:[/bold] {job_info.namespace}")

    # Add timing information if available
    if job_info.start_time:
        table.add_row()
        table.add_row(f"[bold]Started:[/bold] {job_info.start_time}")

    # Add attempt counts if there are failures
    if job_info.failed_attempts > 0:
        table.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(table)


def print_no_job_found(job_name: str, namespace: str):
    """
    Print a message when no job is found.

    Args:
        job_name: Name of the RayJob that was looked up.
        namespace: Namespace that was searched.
    """
    # Create table with error message styled like the regular status output.
    table = _create_info_table(
        "[white on red][bold]Name", job_name, "[bold red]No RayJob found"
    )
    table.add_row()
    table.add_row("Please run rayjob.submit() to submit a job.")
    table.add_row()
    table.add_row(f"[bold]Namespace:[/bold] {namespace}")

    _print_table_in_panel(table)
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Build the standardized info table used by the status printers.

    Returns:
        Table containing the coloured header row, a row pairing the job name
        with its status display, and an empty spacer row.
    """
    info_table = Table(box=None, show_header=False)
    info_table.add_row(header_color)
    info_table.add_row(f"[bold underline]{name}", status_display)
    info_table.add_row()  # spacer between the header block and detail rows
    return info_table


def _print_table_in_panel(table: Table):
    """
    Render the given table inside the shared CodeFlare RayJob status panel.
    """
    outer = Table(
        box=None, title="[bold] :package: CodeFlare RayJob Status :package:"
    )
    outer.add_row(Panel.fit(table))
    Console().print(outer)
6 | 7 | get_cluster() 8 | ------------- 9 | 10 | The ``get_cluster()`` function is used to initialise a ``Cluster`` 11 | object from a pre-existing Ray Cluster/AppWrapper. Below is an example 12 | of it's usage: 13 | 14 | :: 15 | 16 | from codeflare_sdk import get_cluster 17 | cluster = get_cluster(cluster_name="raytest", namespace="example", is_appwrapper=False, write_to_file=False) 18 | -> output: Yaml resources loaded for raytest 19 | cluster.status() 20 | -> output: 21 | 🚀 CodeFlare Cluster Status 🚀 22 | ╭─────────────────────────────────────────────────────────────────╮ 23 | │ Name │ 24 | │ raytest Active ✅ │ 25 | │ │ 26 | │ URI: ray://raytest-head-svc.example.svc:10001 │ 27 | │ │ 28 | │ Dashboard🔗 │ 29 | │ │ 30 | ╰─────────────────────────────────────────────────────────────────╯ 31 | (, True) 32 | cluster.down() 33 | cluster.apply() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted. 34 | 35 | | These are the parameters the ``get_cluster()`` function accepts: 36 | | ``cluster_name: str # Required`` -> The name of the Ray Cluster. 37 | | ``namespace: str # Default: "default"`` -> The namespace of the Ray Cluster. 38 | | ``is_appwrapper: bool # Default: False`` -> When set to 39 | | ``True`` the function will attempt to retrieve an AppWrapper instead of a Ray Cluster. 40 | | ``write_to_file: bool # Default: False`` -> When set to ``True`` the Ray Cluster/AppWrapper will be written to a file similar to how it is done in ``ClusterConfiguration``. 41 | 42 | list_all_queued() 43 | ----------------- 44 | 45 | | The ``list_all_queued()`` function returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. 46 | | It accepts the following parameters: 47 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 48 | | ``print_to_console: bool # Default: True`` -> Allows the user to print the list to their console. 
49 | | ``appwrapper: bool # Default: False`` -> When set to ``True`` allows the user to list queued AppWrappers. 50 | 51 | list_all_clusters() 52 | ------------------- 53 | 54 | | The ``list_all_clusters()`` function will return a list of detailed descriptions of Ray Clusters to the console by default. 55 | | It accepts the following parameters: 56 | | ``namespace: str # Required`` -> The namespace you want to retrieve the list from. 57 | | ``print_to_console: bool # Default: True`` -> A boolean that allows the user to print the list to their console. 58 | 59 | .. note:: 60 | 61 | The following methods require a ``Cluster`` object to be 62 | initialized. See :doc:`./cluster-configuration` 63 | 64 | cluster.apply() 65 | ------------ 66 | 67 | | The ``cluster.apply()`` function applies a Ray Cluster in the given namespace. If the cluster already exists, it is updated. 68 | | If it does not exist it is created. 69 | 70 | cluster.down() 71 | -------------- 72 | 73 | | The ``cluster.down()`` function deletes the Ray Cluster in the given namespace. 74 | 75 | cluster.status() 76 | ---------------- 77 | 78 | | The ``cluster.status()`` function prints out the status of the Ray Cluster's state with a link to the Ray Dashboard. 79 | 80 | cluster.details() 81 | ----------------- 82 | 83 | | The ``cluster.details()`` function prints out a detailed description of the Ray Cluster's status, worker resources and a link to the Ray Dashboard. 84 | 85 | cluster.wait_ready() 86 | -------------------- 87 | 88 | | The ``cluster.wait_ready()`` function waits for the requested cluster to be ready, up to an optional timeout and checks every 5 seconds. 89 | | It accepts the following parameters: 90 | | ``timeout: Optional[int] # Default: None`` -> Allows the user to define a timeout for the ``wait_ready()`` function. 91 | | ``dashboard_check: bool # Default: True`` -> If enabled the ``wait_ready()`` function will wait until the Ray Dashboard is ready too. 
92 | -------------------------------------------------------------------------------- /.github/workflows/ui_notebooks_test.yaml: -------------------------------------------------------------------------------- 1 | name: UI notebooks tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | types: [ labeled ] 7 | 8 | concurrency: 9 | group: ${{ github.head_ref }}-${{ github.workflow }} 10 | cancel-in-progress: true 11 | 12 | env: 13 | CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" 14 | 15 | jobs: 16 | verify-3_widget_example: 17 | if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }} 18 | runs-on: ubuntu-latest-4core 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | with: 24 | submodules: recursive 25 | 26 | - name: Checkout common repo code 27 | uses: actions/checkout@v4 28 | with: 29 | repository: "project-codeflare/codeflare-common" 30 | ref: "main" 31 | path: "common" 32 | 33 | - name: Checkout CodeFlare operator repository 34 | uses: actions/checkout@v4 35 | with: 36 | repository: project-codeflare/codeflare-operator 37 | path: codeflare-operator 38 | 39 | - name: Set Go 40 | uses: actions/setup-go@v5 41 | with: 42 | go-version-file: "./codeflare-operator/go.mod" 43 | cache-dependency-path: "./codeflare-operator/go.sum" 44 | 45 | - name: Set up gotestfmt 46 | uses: gotesttools/gotestfmt-action@v2 47 | with: 48 | token: ${{ secrets.GITHUB_TOKEN }} 49 | 50 | - name: Set up specific Python version 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: "3.11" 54 | cache: "pip" # caching pip dependencies 55 | 56 | - name: Setup and start KinD cluster 57 | uses: ./common/github-actions/kind 58 | 59 | - name: Deploy CodeFlare stack 60 | id: deploy 61 | run: | 62 | cd codeflare-operator 63 | echo Setting up CodeFlare stack 64 | make setup-e2e 65 | echo Deploying CodeFlare operator 66 | make 
deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" 67 | kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager 68 | cd .. 69 | 70 | - name: Setup Guided notebooks execution 71 | run: | 72 | echo "Installing papermill and dependencies..." 73 | pip install poetry ipython ipykernel 74 | poetry config virtualenvs.create false 75 | echo "Installing SDK..." 76 | poetry install --with test,docs 77 | 78 | - name: Install Yarn dependencies 79 | run: | 80 | poetry run yarn install 81 | poetry run yarn playwright install chromium 82 | working-directory: ui-tests 83 | 84 | - name: Fix 3_widget_example.ipynb notebook for test 85 | run: | 86 | # Remove login/logout cells, as KinD doesn't support authentication using token 87 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 88 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb 89 | # Set explicit namespace as SDK need it (currently) to resolve local queues 90 | sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default',|" 3_widget_example.ipynb 91 | sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb 92 | working-directory: demo-notebooks/guided-demos 93 | 94 | - name: Run UI notebook tests 95 | run: | 96 | set -euo pipefail 97 | 98 | poetry run yarn test 99 | working-directory: ui-tests 100 | 101 | - name: Upload Playwright Test assets 102 | if: always() 103 | uses: actions/upload-artifact@v4 104 | with: 105 | name: ipywidgets-test-assets 106 | path: | 107 | ui-tests/test-results 108 | 109 | - name: Upload Playwright Test report 110 | if: always() 111 | uses: actions/upload-artifact@v4 112 | with: 113 | name: ipywidgets-test-report 
def main():
    """Demo driver: build, create, patch, list, and delete a RayCluster
    using the vendored python-client API, builder, and utils."""
    print("starting cluster handler...")

    # Main API object for talking to the RayCluster custom resource.
    api = kuberay_cluster_api.RayClusterApi()

    # Builder object, to create a cluster spec with more granular control.
    builder = kuberay_cluster_builder.ClusterBuilder()

    # Utils object, to perform read-modify operations on a cluster dict.
    cluster_utils = kuberay_cluster_utils.ClusterUtils()

    # The cluster object is a plain dict describing the RayCluster.
    demo_cluster = (
        builder.build_meta(name="new-cluster1", labels={"demo-cluster": "yes"})
        .build_head()
        .build_worker(group_name="workers")
        .get_cluster()
    )

    if not builder.succeeded:
        print("error building the cluster, aborting...")
        return

    print(f"creating raycluster = {demo_cluster['metadata']['name']}")
    # API call to create the cluster in Kubernetes.
    api.create_ray_cluster(body=demo_cluster)

    # Scale the "workers" group, then patch the change into the live CR.
    patched_cluster, succeeded = cluster_utils.update_worker_group_replicas(
        demo_cluster, group_name="workers", max_replicas=4, min_replicas=1, replicas=2
    )
    if succeeded:
        print(f"trying to patch raycluster = {patched_cluster['metadata']['name']}")
        api.patch_ray_cluster(
            name=patched_cluster["metadata"]["name"], ray_patch=patched_cluster
        )

    # Duplicate the worker group, then patch that change in as well.
    patched_cluster, succeeded = cluster_utils.duplicate_worker_group(
        demo_cluster, group_name="workers", new_group_name="duplicate-workers"
    )
    if succeeded:
        print(f"trying to patch raycluster = {patched_cluster['metadata']['name']}")
        api.patch_ray_cluster(
            name=patched_cluster["metadata"]["name"], ray_patch=patched_cluster
        )

    # The rest simply lists and cleans up the clusters created by this demo.
    kube_ray_list = api.list_ray_clusters(
        k8s_namespace="default", label_selector="demo-cluster=yes"
    )
    if "items" in kube_ray_list:
        divider = "-" * 72
        print(divider)
        print("{:<63s}{:>2s}".format("Name", "Namespace"))
        print(divider)
        for listed in kube_ray_list["items"]:
            print(
                "{:<63s}{:>2s}".format(
                    listed["metadata"]["name"],
                    listed["metadata"]["namespace"],
                )
            )
        print(divider)

    if "items" in kube_ray_list:
        for listed in kube_ray_list["items"]:
            print(f"deleting raycluster = {listed['metadata']['name']}")
            api.delete_ray_cluster(
                name=listed["metadata"]["name"],
                k8s_namespace=listed["metadata"]["namespace"],
            )


if __name__ == "__main__":
    main()
Below, we define our cluster object (which generates a corresponding RayCluster).\n", 49 | "\n", 50 | "NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:\n", 51 | "\n", 52 | "- For Python 3.11: 'quay.io/modh/ray:2.52.1-py311-cu121'\n", 53 | "- For Python 3.12: 'quay.io/modh/ray:2.52.1-py312-cu128'\n", 54 | "\n", 55 | "If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "0f4bc870-091f-4e11-9642-cba145710159", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Create and configure our cluster object\n", 66 | "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", 67 | "cluster = Cluster(ClusterConfiguration(\n", 68 | " name='widgettest',\n", 69 | " head_cpu_requests='500m',\n", 70 | " head_cpu_limits='500m',\n", 71 | " head_memory_requests=6,\n", 72 | " head_memory_limits=8,\n", 73 | " head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", 74 | " worker_extended_resource_requests={'nvidia.com/gpu':0},\n", 75 | " num_workers=2,\n", 76 | " worker_cpu_requests='250m',\n", 77 | " worker_cpu_limits=1,\n", 78 | " worker_memory_requests=4,\n", 79 | " worker_memory_limits=6,\n", 80 | " # image=\"\", # Optional Field\n", 81 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n", 82 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", 83 | "))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "3de6403c", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | 
"view_clusters()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "2d8e6ce3", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "cluster.status()" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.9.18" 124 | }, 125 | "vscode": { 126 | "interpreter": { 127 | "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" 128 | } 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 5 133 | } 134 | -------------------------------------------------------------------------------- /src/codeflare_sdk/vendored/python_client_test/test_director.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from codeflare_sdk.vendored.python_client.utils import kuberay_cluster_builder 3 | 4 | 5 | class TestDirector(unittest.TestCase): 6 | def __init__(self, methodName: str = ...) 
-> None: 7 | super().__init__(methodName) 8 | self.director = kuberay_cluster_builder.Director() 9 | 10 | def test_build_basic_cluster(self): 11 | cluster = self.director.build_basic_cluster(name="basic-cluster") 12 | # testing meta 13 | actual = cluster["metadata"]["name"] 14 | expected = "basic-cluster" 15 | self.assertEqual(actual, expected) 16 | 17 | actual = cluster["metadata"]["namespace"] 18 | expected = "default" 19 | self.assertEqual(actual, expected) 20 | 21 | # testing the head pod 22 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 23 | "resources" 24 | ]["requests"]["cpu"] 25 | expected = "2" 26 | self.assertEqual(actual, expected) 27 | 28 | def test_build_small_cluster(self): 29 | cluster = self.director.build_small_cluster(name="small-cluster") 30 | # testing meta 31 | actual = cluster["metadata"]["name"] 32 | expected = "small-cluster" 33 | self.assertEqual(actual, expected) 34 | 35 | actual = cluster["metadata"]["namespace"] 36 | expected = "default" 37 | self.assertEqual(actual, expected) 38 | 39 | # testing the head pod 40 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 41 | "resources" 42 | ]["requests"]["cpu"] 43 | expected = "2" 44 | self.assertEqual(actual, expected) 45 | 46 | # testing the workergroup 47 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 48 | expected = 1 49 | self.assertEqual(actual, expected) 50 | 51 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 52 | "containers" 53 | ][0]["resources"]["requests"]["cpu"] 54 | expected = "1" 55 | self.assertEqual(actual, expected) 56 | 57 | def test_build_medium_cluster(self): 58 | cluster = self.director.build_medium_cluster(name="medium-cluster") 59 | # testing meta 60 | actual = cluster["metadata"]["name"] 61 | expected = "medium-cluster" 62 | self.assertEqual(actual, expected) 63 | 64 | actual = cluster["metadata"]["namespace"] 65 | expected = "default" 66 | self.assertEqual(actual, 
expected) 67 | 68 | # testing the head pod 69 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 70 | "resources" 71 | ]["requests"]["cpu"] 72 | expected = "2" 73 | self.assertEqual(actual, expected) 74 | 75 | # testing the workergroup 76 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 77 | expected = 3 78 | self.assertEqual(actual, expected) 79 | 80 | actual = cluster["spec"]["workerGroupSpecs"][0]["groupName"] 81 | expected = "medium-cluster-workers" 82 | self.assertEqual(actual, expected) 83 | 84 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 85 | "containers" 86 | ][0]["resources"]["requests"]["cpu"] 87 | expected = "2" 88 | self.assertEqual(actual, expected) 89 | 90 | def test_build_large_cluster(self): 91 | cluster = self.director.build_large_cluster(name="large-cluster") 92 | # testing meta 93 | actual = cluster["metadata"]["name"] 94 | expected = "large-cluster" 95 | self.assertEqual(actual, expected) 96 | 97 | actual = cluster["metadata"]["namespace"] 98 | expected = "default" 99 | self.assertEqual(actual, expected) 100 | 101 | # testing the head pod 102 | actual = cluster["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ 103 | "resources" 104 | ]["requests"]["cpu"] 105 | expected = "2" 106 | self.assertEqual(actual, expected) 107 | 108 | # testing the workergroup 109 | actual = cluster["spec"]["workerGroupSpecs"][0]["replicas"] 110 | expected = 6 111 | self.assertEqual(actual, expected) 112 | 113 | actual = cluster["spec"]["workerGroupSpecs"][0]["groupName"] 114 | expected = "large-cluster-workers" 115 | self.assertEqual(actual, expected) 116 | 117 | actual = cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"][ 118 | "containers" 119 | ][0]["resources"]["requests"]["cpu"] 120 | expected = "3" 121 | self.assertEqual(actual, expected) 122 | -------------------------------------------------------------------------------- 
/tests/test_cluster_yamls/support_clusters/test-rc-b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-rc-b 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: [] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: 
odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.52.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-rc-b 88 | maxReplicas: 1 89 | minReplicas: 1 90 | rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | -------------------------------------------------------------------------------- /tests/test_cluster_yamls/support_clusters/test-rc-a.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: ray.io/v1 2 | kind: RayCluster 3 | metadata: 4 | labels: 5 | controller-tools.k8s.io: '1.0' 6 | 
kueue.x-k8s.io/queue-name: local_default_queue 7 | name: test-cluster-a 8 | namespace: ns 9 | spec: 10 | autoscalerOptions: 11 | idleTimeoutSeconds: 60 12 | resources: 13 | limits: 14 | cpu: 500m 15 | memory: 512Mi 16 | requests: 17 | cpu: 500m 18 | memory: 512Mi 19 | upscalingMode: Default 20 | enableInTreeAutoscaling: false 21 | headGroupSpec: 22 | enableIngress: false 23 | rayStartParams: 24 | block: 'true' 25 | dashboard-host: 0.0.0.0 26 | num-gpus: '0' 27 | resources: '"{}"' 28 | serviceType: ClusterIP 29 | template: 30 | spec: 31 | containers: 32 | - image: "${image}" 33 | imagePullPolicy: IfNotPresent 34 | lifecycle: 35 | preStop: 36 | exec: 37 | command: 38 | - /bin/sh 39 | - -c 40 | - ray stop 41 | name: ray-head 42 | ports: 43 | - containerPort: 6379 44 | name: gcs 45 | - containerPort: 8265 46 | name: dashboard 47 | - containerPort: 10001 48 | name: client 49 | resources: 50 | limits: 51 | cpu: 2 52 | memory: 8G 53 | requests: 54 | cpu: 2 55 | memory: 8G 56 | volumeMounts: 57 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 58 | name: odh-trusted-ca-cert 59 | subPath: odh-trusted-ca-bundle.crt 60 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 61 | name: odh-trusted-ca-cert 62 | subPath: odh-trusted-ca-bundle.crt 63 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 64 | name: odh-ca-cert 65 | subPath: odh-ca-bundle.crt 66 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 67 | name: odh-ca-cert 68 | subPath: odh-ca-bundle.crt 69 | imagePullSecrets: [] 70 | volumes: 71 | - configMap: 72 | items: 73 | - key: ca-bundle.crt 74 | path: odh-trusted-ca-bundle.crt 75 | name: odh-trusted-ca-bundle 76 | optional: true 77 | name: odh-trusted-ca-cert 78 | - configMap: 79 | items: 80 | - key: odh-ca-bundle.crt 81 | path: odh-ca-bundle.crt 82 | name: odh-trusted-ca-bundle 83 | optional: true 84 | name: odh-ca-cert 85 | rayVersion: 2.52.1 86 | workerGroupSpecs: 87 | - groupName: small-group-test-cluster-a 88 | maxReplicas: 1 89 | minReplicas: 1 90 | 
rayStartParams: 91 | block: 'true' 92 | num-gpus: '0' 93 | resources: '"{}"' 94 | replicas: 1 95 | template: 96 | metadata: 97 | annotations: 98 | key: value 99 | labels: 100 | key: value 101 | spec: 102 | containers: 103 | - image: "${image}" 104 | lifecycle: 105 | preStop: 106 | exec: 107 | command: 108 | - /bin/sh 109 | - -c 110 | - ray stop 111 | name: machine-learning 112 | resources: 113 | limits: 114 | cpu: 1 115 | memory: 2G 116 | requests: 117 | cpu: 1 118 | memory: 2G 119 | volumeMounts: 120 | - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt 121 | name: odh-trusted-ca-cert 122 | subPath: odh-trusted-ca-bundle.crt 123 | - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt 124 | name: odh-trusted-ca-cert 125 | subPath: odh-trusted-ca-bundle.crt 126 | - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt 127 | name: odh-ca-cert 128 | subPath: odh-ca-bundle.crt 129 | - mountPath: /etc/ssl/certs/odh-ca-bundle.crt 130 | name: odh-ca-cert 131 | subPath: odh-ca-bundle.crt 132 | imagePullSecrets: [] 133 | volumes: 134 | - configMap: 135 | items: 136 | - key: ca-bundle.crt 137 | path: odh-trusted-ca-bundle.crt 138 | name: odh-trusted-ca-bundle 139 | optional: true 140 | name: odh-trusted-ca-cert 141 | - configMap: 142 | items: 143 | - key: odh-ca-bundle.crt 144 | path: odh-ca-bundle.crt 145 | name: odh-trusted-ca-bundle 146 | optional: true 147 | name: odh-ca-cert 148 | --------------------------------------------------------------------------------