├── .dockerignore ├── .github ├── release.yaml └── workflows │ ├── CI-devmode.yaml │ ├── CI-standalone.yaml │ ├── CI.yaml │ ├── jekyll-gh-pages.yaml │ └── release.yaml ├── .gitignore ├── .golangci.yml ├── .pre-commit-config.yaml ├── .yamlfmt ├── Dockerfile ├── LICENSE ├── Makefile ├── PROJECT ├── README.md ├── api └── v1beta2 │ ├── appwrapper_types.go │ ├── doc.go │ ├── groupversion_info.go │ └── zz_generated.deepcopy.go ├── cmd └── main.go ├── config ├── certmanager │ ├── certificate.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── crd │ ├── bases │ │ └── workload.codeflare.dev_appwrappers.yaml │ └── kustomization.yaml ├── default │ ├── config.yaml │ ├── editor_role_patch.yaml │ ├── kustomization.yaml │ ├── manager_config_patch.yaml │ ├── manager_webhook_patch.yaml │ ├── metrics_service.yaml │ └── viewer_role_patch.yaml ├── dev │ ├── config.yaml │ ├── kustomization.yaml │ └── namespace.yaml ├── internalcert │ ├── kustomization.yaml │ └── secret.yaml ├── manager │ ├── kustomization.yaml │ └── manager.yaml ├── prometheus │ ├── kustomization.yaml │ └── monitor.yaml ├── rbac │ ├── editor_role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ ├── leader_election_role_binding.yaml │ ├── metrics_auth_role.yaml │ ├── metrics_auth_role_binding.yaml │ ├── metrics_reader_role.yaml │ ├── role.yaml │ ├── role_binding.yaml │ ├── service_account.yaml │ ├── user_role.yaml │ └── viewer_role.yaml └── webhook │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ ├── manifests.yaml │ └── service.yaml ├── docs ├── release_instructions.md └── website_instructions.md ├── go.mod ├── go.sum ├── hack ├── boilerplate.go.txt ├── create-test-cluster.sh ├── default-queues.yaml ├── deploy-kueue.sh ├── e2e-util.sh ├── kind-config.yaml ├── kueue-config │ └── kustomization.yaml ├── run-dev-mode-tests.sh └── run-tests-on-cluster.sh ├── internal ├── controller │ └── appwrapper │ │ ├── appwrapper_controller.go │ │ ├── appwrapper_controller_test.go │ │ ├── fixtures_test.go │ 
│ ├── node_health_monitor.go │ │ ├── node_health_monitor_test.go │ │ ├── resource_management.go │ │ └── suite_test.go ├── metrics │ └── metrics.go ├── tools │ └── pinversion.go ├── util │ └── maps.go └── webhook │ ├── appwrapper_fixtures_test.go │ ├── appwrapper_webhook.go │ ├── appwrapper_webhook_test.go │ └── suite_test.go ├── kube-state-metrics ├── README.md ├── appwrapper-ksm-cm.yaml └── appwrapper-ksm-rbac.yaml ├── pkg ├── config │ ├── config.go │ └── config_test.go ├── controller │ └── setup.go ├── logger │ └── logger.go └── utils │ └── utils.go ├── samples ├── README.md ├── wrapped-deployment.yaml ├── wrapped-failing-job.yaml ├── wrapped-failing-pod.yaml ├── wrapped-failing-pytorch-job.yaml ├── wrapped-gpu-job.yaml ├── wrapped-job.yaml ├── wrapped-jobset.yaml ├── wrapped-leader-worker-set.yaml ├── wrapped-pod.yaml └── wrapped-pytorch-job.yaml ├── site ├── .gitignore ├── Gemfile ├── _config.yml ├── _data │ └── navigation.yml ├── _pages │ ├── 404.md │ ├── appwrapper.v1beta2.md │ ├── arch-controller.md │ ├── arch-crd.md │ ├── arch-fault-tolerance.md │ ├── arch-node-monitoring.md │ ├── category-archive.md │ ├── dev-setup.md │ ├── overview.md │ ├── quick-start.md │ ├── sample-batch-job.md │ ├── sample-pytorch.md │ ├── samples.md │ ├── tag-archive.md │ └── year-archive.md ├── assets │ └── js │ │ └── clipboardrouge.js └── genref │ ├── config.yaml │ └── markdown │ ├── members.tpl │ ├── pkg.tpl │ └── type.tpl └── test ├── README.md └── e2e ├── appwrapper_test.go ├── e2e_test.go ├── fixtures_test.go ├── metrics_test.go └── util_test.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 
3 | bin/ 4 | -------------------------------------------------------------------------------- /.github/release.yaml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | labels: 4 | - housekeeping 5 | categories: 6 | - title: Features 7 | labels: 8 | - enhancement 9 | - title: Bugs and Regressions 10 | labels: 11 | - bug 12 | - title: Other Changes 13 | labels: 14 | - "*" 15 | -------------------------------------------------------------------------------- /.github/workflows/CI-devmode.yaml: -------------------------------------------------------------------------------- 1 | name: CI-devmode 2 | on: 3 | push: 4 | branches: [main, rhoai-2.10] 5 | paths-ignore: 6 | - 'site/**' 7 | pull_request: 8 | branches: [main, rhoai-2.10] 9 | 10 | jobs: 11 | CI: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: checkout code 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Set latest tag and branch name 21 | run: | 22 | echo "GIT_BRANCH=gha-ci" >> $GITHUB_ENV 23 | echo "TAG=$GITHUB_RUN_ID" >> $GITHUB_ENV 24 | 25 | - name: Set up Go 26 | uses: actions/setup-go@v5 27 | with: 28 | go-version-file: './go.mod' 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: '3.11' 34 | 35 | - name: Run pre-commit checks 36 | run: | 37 | pip install pre-commit 38 | pre-commit run --show-diff-on-failure --color=always --all-files 39 | 40 | - name: Verify that generated manifests are up-to-date 41 | run: make manifests && git diff --exit-code 42 | 43 | - name: Build 44 | run: make build 45 | 46 | - name: Run Unit Tests 47 | run: make test 48 | 49 | - name: Create and configure cluster 50 | run: ./hack/create-test-cluster.sh 51 | 52 | - name: Install CRDs 53 | run: | 54 | make install -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} 55 | 56 | - name: Run E2E tests using dev mode controller 57 | run: ./hack/run-dev-mode-tests.sh 
58 | -------------------------------------------------------------------------------- /.github/workflows/CI-standalone.yaml: -------------------------------------------------------------------------------- 1 | name: CI-standalone 2 | on: 3 | push: 4 | branches: [main, rhoai-2.10] 5 | paths-ignore: 6 | - 'site/**' 7 | pull_request: 8 | branches: [main, rhoai-2.10] 9 | 10 | jobs: 11 | CI: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: checkout code 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Set latest tag and branch name 21 | run: | 22 | echo "GIT_BRANCH=gha-ci" >> $GITHUB_ENV 23 | echo "TAG=$GITHUB_RUN_ID" >> $GITHUB_ENV 24 | 25 | - name: Set up Go 26 | uses: actions/setup-go@v5 27 | with: 28 | go-version-file: './go.mod' 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: '3.11' 34 | 35 | - name: Run pre-commit checks 36 | run: | 37 | pip install pre-commit 38 | pre-commit run --show-diff-on-failure --color=always --all-files 39 | 40 | - name: Verify that generated manifests are up-to-date 41 | run: make manifests && git diff --exit-code 42 | 43 | - name: Build 44 | run: make build 45 | 46 | - name: Build Image 47 | run: | 48 | make docker-build -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} 49 | 50 | - name: Run Unit Tests 51 | run: make test 52 | 53 | - name: Create and configure cluster 54 | run: ./hack/create-test-cluster.sh 55 | 56 | - name: Deploy AppWrapper controller 57 | run: | 58 | make kind-push -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} 59 | make deploy -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} ENV=default 60 | 61 | - name: Run E2E tests 62 | run: LABEL_FILTER="Metrics,Standalone,Webhook" ./hack/run-tests-on-cluster.sh 63 | -------------------------------------------------------------------------------- /.github/workflows/CI.yaml: 
-------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [main, rhoai-2.10, codeflare-releases] 5 | paths-ignore: 6 | - 'site/**' 7 | pull_request: 8 | branches: [main, rhoai-2.10, codeflare-releases] 9 | 10 | jobs: 11 | CI: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: checkout code 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Set latest tag and branch name 21 | run: | 22 | echo "GIT_BRANCH=gha-ci" >> $GITHUB_ENV 23 | echo "TAG=$GITHUB_RUN_ID" >> $GITHUB_ENV 24 | 25 | - name: Set up Go 26 | uses: actions/setup-go@v5 27 | with: 28 | go-version-file: './go.mod' 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: '3.11' 34 | 35 | - name: Run pre-commit checks 36 | run: | 37 | pip install pre-commit 38 | pre-commit run --show-diff-on-failure --color=always --all-files 39 | 40 | - name: Verify that generated manifests are up-to-date 41 | run: make manifests && git diff --exit-code 42 | 43 | - name: Build 44 | run: make build 45 | 46 | - name: Build Image 47 | run: | 48 | make docker-build -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} 49 | 50 | - name: Run Unit Tests 51 | run: make test 52 | 53 | - name: Create and configure cluster 54 | run: ./hack/create-test-cluster.sh 55 | 56 | - name: Deploy Kueue 57 | run: ./hack/deploy-kueue.sh 58 | 59 | - name: Deploy AppWrapper controller 60 | run: | 61 | make kind-push -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} 62 | make deploy -e GIT_BRANCH=${{ env.GIT_BRANCH }} TAG=${{ env.GIT_BRANCH }}-${{ env.TAG }} ENV=default 63 | 64 | - name: Run E2E tests 65 | run: ./hack/run-tests-on-cluster.sh 66 | -------------------------------------------------------------------------------- /.github/workflows/jekyll-gh-pages.yaml: -------------------------------------------------------------------------------- 1 | name: 
Build GitHub Pages 2 | 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: ["main"] 7 | paths: 8 | - 'site/**' 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | # Build job 27 | build: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: Checkout 31 | uses: actions/checkout@v4 32 | 33 | - name: Setup Ruby 34 | uses: ruby/setup-ruby@v1 35 | with: 36 | ruby-version: '3.1' 37 | working-directory: site 38 | bundler-cache: false 39 | 40 | - name: Install Gems 41 | working-directory: site 42 | run: bundle install 43 | 44 | - name: Setup Pages 45 | id: pages 46 | uses: actions/configure-pages@v5 47 | 48 | - name: Build with Jekyll 49 | working-directory: site 50 | run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" 51 | env: 52 | JEKYLL_ENV: production 53 | 54 | - name: Upload artifact 55 | uses: actions/upload-pages-artifact@v3 56 | with: 57 | path: "site/_site/" 58 | 59 | # Deployment job 60 | deploy: 61 | environment: 62 | name: github-pages 63 | url: ${{ steps.deployment.outputs.page_url }} 64 | runs-on: ubuntu-latest 65 | needs: build 66 | steps: 67 | - name: Deploy to GitHub Pages 68 | id: deployment 69 | uses: actions/deploy-pages@v4 70 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | # Actions to take when a release is tagged 2 | 3 | name: 
release 4 | 5 | on: 6 | push: 7 | # Sequence of patterns matched against refs/tags 8 | tags: 9 | - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10 10 | 11 | jobs: 12 | release: 13 | runs-on: ubuntu-latest 14 | if: github.repository == 'project-codeflare/appwrapper' 15 | steps: 16 | - name: checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: validate tag format 22 | run: | 23 | if [[ ${GITHUB_REF_NAME} =~ ^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$ ]]; then 24 | echo "Tag format is valid." 25 | else 26 | echo "Invalid tag format: ${GITHUB_REF_NAME}" 27 | exit 1 28 | fi 29 | echo "RELEASE_VERSION=${GITHUB_REF_NAME}" >> "$GITHUB_ENV" 30 | 31 | - name: Set up Go 32 | uses: actions/setup-go@v5 33 | with: 34 | go-version-file: './go.mod' 35 | 36 | - name: docker login 37 | uses: docker/login-action@v3 38 | with: 39 | registry: quay.io 40 | username: ${{ secrets.QUAY_USERNAME }} 41 | password: ${{ secrets.QUAY_ROBOT_TOKEN }} 42 | 43 | - name: Build and Push Images 44 | run: | 45 | make docker-buildx -e TAG=${RELEASE_VERSION} -e quay_repository=quay.io/ibm 46 | 47 | - name: Create Install YAML 48 | run: | 49 | make build-installer -e TAG=${RELEASE_VERSION} -e quay_repository=quay.io/ibm 50 | 51 | - name: Create GitHub Release 52 | uses: softprops/action-gh-release@v2 53 | with: 54 | name: Release ${{ env.RELEASE_VERSION }} 55 | generate_release_notes: true 56 | fail_on_unmatched_files: true 57 | files: | 58 | ./dist/install.yaml 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin/* 9 | Dockerfile.cross 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Go workspace file 18 | go.work 19 
| 20 | # Kubernetes Generated files - skip generated files, except for vendored files 21 | !vendor/**/zz_generated.* 22 | 23 | # editor and IDE paraphernalia 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | *~ 29 | 30 | # CRDs for unit tests 31 | dep-crds 32 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | deadline: 5m 3 | allow-parallel-runners: true 4 | 5 | # Settings of specific linters 6 | linters-settings: 7 | gci: 8 | sections: 9 | - standard 10 | - default 11 | - prefix(github.com/project-codeflare/appwrapper) 12 | - blank 13 | - dot 14 | skip-generated: true # Skip generated files. 15 | 16 | linters: 17 | disable-all: true 18 | enable: 19 | - copyloopvar 20 | - dupl 21 | - dupword 22 | - errcheck 23 | - gci 24 | - ginkgolinter 25 | - goconst 26 | - gocyclo 27 | - gofmt 28 | - goimports 29 | - gosimple 30 | - govet 31 | - ineffassign 32 | - lll 33 | - misspell 34 | - nakedret 35 | - prealloc 36 | - staticcheck 37 | - typecheck 38 | - unconvert 39 | - unparam 40 | - unused 41 | 42 | issues: 43 | # don't skip warning about doc comments 44 | # don't exclude the default set of lint 45 | exclude-use-default: false 46 | # restore some of the defaults 47 | # (fill in the rest as needed) 48 | exclude-rules: 49 | - path: "test/*" 50 | linters: 51 | - goconst 52 | - lll 53 | - prealloc 54 | - staticcheck 55 | - unparam 56 | - unused 57 | - ineffassign 58 | - path: "api/*" 59 | linters: 60 | - lll 61 | - path: "internal/*" 62 | linters: 63 | - dupl 64 | - lll 65 | - path: "pkg/*" 66 | linters: 67 | - lll 68 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/dnephin/pre-commit-golang 3 | rev: v0.5.1 4 | hooks: 5 | - id: go-fmt 6 | - id: go-mod-tidy 
7 | - repo: local 8 | hooks: 9 | - id: golangci-lint 10 | name: golangci-lint 11 | entry: make lint-fix 12 | language: system 13 | pass_filenames: false 14 | - id: generate-apiref 15 | name: generate-apiref 16 | entry: make generate-apiref 17 | language: system 18 | pass_filenames: false 19 | - repo: https://github.com/google/yamlfmt 20 | rev: v0.10.0 21 | hooks: 22 | - id: yamlfmt 23 | exclude: ^config/crd/bases/.*\.yaml|config/rbac/role\.yaml|config/webhook/manifests\.yaml 24 | - repo: https://github.com/pre-commit/pre-commit-hooks 25 | rev: v4.5.0 26 | hooks: 27 | - id: trailing-whitespace 28 | exclude: site/_pages/appwrapper.v1beta2.md 29 | - id: end-of-file-fixer 30 | exclude: hack/boilerplate.go.txt|site/_pages/appwrapper.v1beta2.md 31 | - id: mixed-line-ending 32 | args: ["--fix=lf"] 33 | -------------------------------------------------------------------------------- /.yamlfmt: -------------------------------------------------------------------------------- 1 | formatter: 2 | type: basic 3 | indentless_arrays: true 4 | retain_line_breaks: true 5 | max_line_length: 80 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM golang:1.23 AS builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | ARG BUILD_VERSION=Unknown 6 | ARG BUILD_DATE=Unknown 7 | 8 | WORKDIR /workspace 9 | # Copy the Go Modules manifests 10 | COPY go.mod go.mod 11 | COPY go.sum go.sum 12 | # cache deps before building and copying source so that we don't need to re-download as much 13 | # and so that source changes don't invalidate our downloaded layer 14 | RUN go mod download 15 | 16 | # Copy the go source 17 | COPY api/ api/ 18 | COPY cmd/ cmd/ 19 | COPY internal/ internal/ 20 | COPY pkg/ pkg/ 21 | 22 | # Build 23 | # the GOARCH has not a default value to allow the binary be built according to the host where the command 24 | # was 
called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 OS 25 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, 26 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 27 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a \ 28 | -ldflags "-X 'main.BuildVersion=${BUILD_VERSION}' -X 'main.BuildDate=${BUILD_DATE}'" \ 29 | -o manager cmd/main.go 30 | 31 | # Use distroless as minimal base image to package the manager binary 32 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 33 | FROM gcr.io/distroless/static:nonroot 34 | WORKDIR / 35 | COPY --from=builder /workspace/manager . 36 | USER 65532:65532 37 | 38 | ENTRYPOINT ["/manager"] 39 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 
4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: codeflare.dev 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: appwrapper 9 | repo: github.com/project-codeflare/appwrapper 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | controller: true 15 | domain: codeflare.dev 16 | group: workload 17 | kind: AppWrapper 18 | path: github.com/project-codeflare/appwrapper/api/v1beta2 19 | version: v1beta2 20 | webhooks: 21 | defaulting: true 22 | validation: true 23 | webhookVersion: v1 24 | version: "3" 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AppWrapper 2 | 3 | [![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0) 4 | [![Continuous Integration](https://github.com/project-codeflare/appwrapper/actions/workflows/CI.yaml/badge.svg)](https://github.com/project-codeflare/appwrapper/actions/workflows/CI.yaml) 5 | 6 | An AppWrapper contains a collection of Kubernetes resources that a 7 | user desires to manage as a single logical workload. AppWrapper is 8 | designed to smoothly interoperate with 9 | [Kueue](https://kueue.sigs.k8s.io). AppWrapper provides a flexible and 10 | workload-agnostic mechanism for enabling Kueue to manage a group of 11 | Kubernetes resources as a single logical unit without requiring any 12 | Kueue-specific support by the controllers of those resources. 13 | Beginning in Kueue 0.11 (and AppWrapper v1.1), AppWrapper is a 14 | *built-in Kueue integration* and is enabled by default. In older versions 15 | AppWrapper was supported by Kueue as an *external framework* and needed to 16 | be explicitly enabled via a custom Kueue configuration. 17 | 18 | An AppWrapper can be used to harden workloads by providing an 19 | additional level of automatic fault detection and recovery. 
The AppWrapper 20 | controller monitors the health of the workload and if corrective actions 21 | are not taken by the primary resource controllers within specified deadlines, 22 | the AppWrapper controller will orchestrate workload-level retries and 23 | resource deletion to ensure that either the workload returns to a 24 | healthy state or is cleanly removed from the cluster and its quota 25 | freed for use by other workloads. If [Autopilot](https://github.com/ibm/autopilot) 26 | is also being used on the cluster, the AppWrapper controller can be configured 27 | to automatically inject Node anti-affinities into Pods and to trigger 28 | retries when Pods in already running workloads are using resources 29 | that Autopilot has tagged as unhealthy. For details on customizing and 30 | configuring these fault tolerance capabilities, please see the 31 | [Fault Tolerance](https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/) 32 | section of our website. 33 | 34 | AppWrapper is designed to be used as part of a fully open source software stack 35 | to run production batch workloads on Kubernetes and OpenShift. The [MLBatch](https://github.com/project-codeflare/mlbatch) 36 | project leverages [Kueue](https://kueue.sigs.k8s.io), the [Kubeflow Training 37 | Operator](https://www.kubeflow.org/docs/components/training/), 38 | [KubeRay](https://docs.ray.io/en/latest/cluster/kubernetes/index.html), and the 39 | [Codeflare Operator](https://github.com/project-codeflare/codeflare-operator) 40 | from [Red Hat OpenShift 41 | AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). 42 | MLBatch enables [AppWrapper](https://project-codeflare.github.io/appwrapper/) 43 | and adds 44 | [Coscheduler](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/README.md). 45 | MLBatch includes a number of configuration steps to help these components work 46 | in harmony and support large workloads on large clusters. 
47 | 48 | ## Installation 49 | 50 | To install the latest release of AppWrapper in a Kubernetes cluster with Kueue already installed 51 | and configured, simply run the command: 52 | 53 | ```sh 54 | kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v1.1.2/install.yaml 55 | ``` 56 | 57 | The controller runs in the `appwrapper-system` namespace. 58 | 59 | Read the [Quick Start Guide](https://project-codeflare.github.io/appwrapper/quick-start/) to learn more. 60 | 61 | ## Usage 62 | 63 | For examples of AppWrapper usage, browse our [Samples](./samples) directory or 64 | see the [Samples](https://project-codeflare.github.io/appwrapper/samples/) section 65 | of the project website. 66 | 67 | ## Development 68 | 69 | To contribute to the AppWrapper project and for detailed instructions on how to 70 | build and deploy the project from source, see the 71 | [Development Setup](https://project-codeflare.github.io/appwrapper/dev-setup/) section 72 | of the project website. 73 | 74 | ## License 75 | 76 | Copyright 2024 IBM Corporation. 77 | 78 | Licensed under the Apache License, Version 2.0 (the "License"); 79 | you may not use this file except in compliance with the License. 80 | You may obtain a copy of the License at 81 | 82 | http://www.apache.org/licenses/LICENSE-2.0 83 | 84 | Unless required by applicable law or agreed to in writing, software 85 | distributed under the License is distributed on an "AS IS" BASIS, 86 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 87 | See the License for the specific language governing permissions and 88 | limitations under the License. 89 | -------------------------------------------------------------------------------- /api/v1beta2/appwrapper_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package v1beta2 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | runtime "k8s.io/apimachinery/pkg/runtime" 23 | ) 24 | 25 | // AppWrapperSpec defines the desired state of the AppWrapper 26 | type AppWrapperSpec struct { 27 | // Components lists the components contained in the AppWrapper 28 | Components []AppWrapperComponent `json:"components"` 29 | 30 | // Suspend suspends the AppWrapper when set to true 31 | //+optional 32 | Suspend bool `json:"suspend,omitempty"` 33 | 34 | // ManagedBy is used to indicate the controller or entity that manages the AppWrapper. 35 | ManagedBy *string `json:"managedBy,omitempty"` 36 | } 37 | 38 | // AppWrapperComponent describes a single wrapped Kubernetes resource 39 | type AppWrapperComponent struct { 40 | // Annotations is an unstructured key value map that may be used to store and retrieve 41 | // arbitrary metadata about the Component to customize its treatment by the AppWrapper controller. 
42 | //+optional 43 | Annotations map[string]string `json:"annotations,omitempty"` 44 | 45 | // DeclaredPodSets for the Component (optional for known GVKs whose PodSets can be automatically inferred) 46 | //+optional 47 | DeclaredPodSets []AppWrapperPodSet `json:"podSets,omitempty"` 48 | 49 | // PodSetInfos assigned to the Component's PodSets by Kueue 50 | //+optional 51 | PodSetInfos []AppWrapperPodSetInfo `json:"podSetInfos,omitempty"` 52 | 53 | // Template defines the Kubernetes resource for the Component 54 | // +kubebuilder:pruning:PreserveUnknownFields 55 | // +kubebuilder:validation:EmbeddedResource 56 | Template runtime.RawExtension `json:"template"` 57 | } 58 | 59 | // AppWrapperPodSet describes a homogeneous set of pods 60 | type AppWrapperPodSet struct { 61 | // Replicas is the number of pods in this PodSet 62 | //+optional 63 | Replicas *int32 `json:"replicas,omitempty"` 64 | 65 | // Path is the path within Component.Template to the PodTemplateSpec for this PodSet 66 | Path string `json:"path"` 67 | 68 | // Annotations is an unstructured key value map that may be used to store and retrieve 69 | // arbitrary metadata about the PodSet to customize its treatment by the AppWrapper controller. 
70 | //+optional 71 | Annotations map[string]string `json:"annotations,omitempty"` 72 | } 73 | 74 | // AppWrapperPodSetInfo contains the data that Kueue wants to inject into an admitted PodSpecTemplate 75 | type AppWrapperPodSetInfo struct { 76 | // Annotations to be added to the PodSpecTemplate 77 | //+optional 78 | Annotations map[string]string `json:"annotations,omitempty"` 79 | // Labels to be added to the PodSpecTemplate 80 | //+optional 81 | Labels map[string]string `json:"labels,omitempty"` 82 | // NodeSelectors to be added to the PodSpecTemplate 83 | //+optional 84 | NodeSelector map[string]string `json:"nodeSelector,omitempty"` 85 | // Tolerations to be added to the PodSpecTemplate 86 | //+optional 87 | Tolerations []corev1.Toleration `json:"tolerations,omitempty"` 88 | // SchedulingGates to be added to the PodSpecTemplate 89 | //+optional 90 | SchedulingGates []corev1.PodSchedulingGate `json:"schedulingGates,omitempty"` 91 | } 92 | 93 | // AppWrapperStatus defines the observed state of the AppWrapper 94 | type AppWrapperStatus struct { 95 | // Phase of the AppWrapper object 96 | //+optional 97 | Phase AppWrapperPhase `json:"phase,omitempty"` 98 | 99 | // Retries counts the number of times the AppWrapper has entered the Resetting Phase 100 | //+optional 101 | Retries int32 `json:"resettingCount,omitempty"` 102 | 103 | // Conditions hold the latest available observations of the AppWrapper current state. 
104 | // 105 | // The type of the condition could be: 106 | // 107 | // - QuotaReserved: The AppWrapper was admitted by Kueue and has quota allocated to it 108 | // - ResourcesDeployed: The contained resources are deployed (or being deployed) on the cluster 109 | // - PodsReady: All pods of the contained resources are in the Ready or Succeeded state 110 | // - Unhealthy: One or more of the contained resources is unhealthy 111 | // - DeletingResources: The contained resources are in the process of being deleted from the cluster 112 | // 113 | //+optional 114 | //+patchMergeKey=type 115 | //+patchStrategy=merge 116 | //+listType=map 117 | //+listMapKey=type 118 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` 119 | 120 | // ComponentStatus parallels the Components array in the Spec and tracks the actually deployed resources 121 | ComponentStatus []AppWrapperComponentStatus `json:"componentStatus,omitempty"` 122 | } 123 | 124 | // AppWrapperComponentStatus tracks the status of a single managed Component 125 | type AppWrapperComponentStatus struct { 126 | // Name is the name of the Component 127 | Name string `json:"name"` 128 | 129 | // Kind is the Kind of the Component 130 | Kind string `json:"kind"` 131 | 132 | // APIVersion is the APIVersion of the Component 133 | APIVersion string `json:"apiVersion"` 134 | 135 | // PodSets is the validated PodSets for the Component (either from AppWrapperComponent.DeclaredPodSets or inferred by the controller) 136 | PodSets []AppWrapperPodSet `json:"podSets"` 137 | 138 | // Conditions hold the latest available observations of the Component's current state. 
139 | // 140 | // The type of the condition could be: 141 | // 142 | // - ResourcesDeployed: The component is deployed on the cluster 143 | // 144 | //+optional 145 | //+patchMergeKey=type 146 | //+patchStrategy=merge 147 | //+listType=map 148 | //+listMapKey=type 149 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` 150 | } 151 | 152 | // AppWrapperPhase enumerates the valid Phases of an AppWrapper 153 | type AppWrapperPhase string 154 | 155 | const ( 156 | AppWrapperEmpty AppWrapperPhase = "" 157 | AppWrapperSuspended AppWrapperPhase = "Suspended" 158 | AppWrapperResuming AppWrapperPhase = "Resuming" 159 | AppWrapperRunning AppWrapperPhase = "Running" 160 | AppWrapperResetting AppWrapperPhase = "Resetting" 161 | AppWrapperSuspending AppWrapperPhase = "Suspending" 162 | AppWrapperSucceeded AppWrapperPhase = "Succeeded" 163 | AppWrapperFailed AppWrapperPhase = "Failed" 164 | AppWrapperTerminating AppWrapperPhase = "Terminating" 165 | ) 166 | 167 | // AppWrapperCondition enumerates the Condition Types that may appear in AppWrapper status 168 | type AppWrapperCondition string 169 | 170 | const ( 171 | QuotaReserved AppWrapperCondition = "QuotaReserved" 172 | ResourcesDeployed AppWrapperCondition = "ResourcesDeployed" 173 | PodsReady AppWrapperCondition = "PodsReady" 174 | Unhealthy AppWrapperCondition = "Unhealthy" 175 | DeletingResources AppWrapperCondition = "DeletingResources" 176 | ) 177 | 178 | const ( 179 | AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration" 180 | WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration" 181 | FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration" 182 | RetryPausePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/retryPausePeriodDuration" 183 | RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit" 184 | 
ForcefulDeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration" 185 | DeletionOnFailureGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration" 186 | SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration" 187 | TerminalExitCodesAnnotation = "workload.codeflare.dev.appwrapper/terminalExitCodes" 188 | RetryableExitCodesAnnotation = "workload.codeflare.dev.appwrapper/retryableExitCodes" 189 | ) 190 | 191 | const ( 192 | AppWrapperControllerName = "workload.codeflare.dev/appwrapper-controller" 193 | AppWrapperLabel = "workload.codeflare.dev/appwrapper" 194 | ) 195 | 196 | //+kubebuilder:object:root=true 197 | //+kubebuilder:subresource:status 198 | //+kubebuilder:resource:shortName={aw} 199 | //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=`.status.phase` 200 | //+kubebuilder:printcolumn:name="Quota Reserved",type="string",JSONPath=".status.conditions[?(@.type==\"QuotaReserved\")].status" 201 | //+kubebuilder:printcolumn:name="Resources Deployed",type="string",JSONPath=".status.conditions[?(@.type==\"ResourcesDeployed\")].status" 202 | //+kubebuilder:printcolumn:name="Unhealthy",type="string",JSONPath=".status.conditions[?(@.type==\"Unhealthy\")].status" 203 | //+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" 204 | 205 | // AppWrapper is the Schema for the appwrappers API 206 | type AppWrapper struct { 207 | metav1.TypeMeta `json:",inline"` 208 | metav1.ObjectMeta `json:"metadata,omitempty"` 209 | 210 | Spec AppWrapperSpec `json:"spec,omitempty"` 211 | Status AppWrapperStatus `json:"status,omitempty"` 212 | } 213 | 214 | //+kubebuilder:object:root=true 215 | 216 | // AppWrapperList contains a list of appwrappers 217 | type AppWrapperList struct { 218 | metav1.TypeMeta `json:",inline"` 219 | metav1.ListMeta `json:"metadata,omitempty"` 220 | Items []AppWrapper `json:"items"` 221 | } 222 | 223 | func init() { 
224 | SchemeBuilder.Register(&AppWrapper{}, &AppWrapperList{}) 225 | } 226 | -------------------------------------------------------------------------------- /api/v1beta2/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // +kubebuilder:object:generate=true 18 | // +groupName=workload.codeflare.dev 19 | package v1beta2 20 | -------------------------------------------------------------------------------- /api/v1beta2/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | // Package v1beta2 contains API Schema definitions for the workload v1beta2 API group 18 | // +kubebuilder:object:generate=true 19 | // +groupName=workload.codeflare.dev 20 | package v1beta2 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects 29 | GroupVersion = schema.GroupVersion{Group: "workload.codeflare.dev", Version: "v1beta2"} 30 | 31 | // AppWrapperKind is the kind name 32 | AppWrapperKind = "AppWrapper" 33 | 34 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 35 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 36 | 37 | // AddToScheme adds the types in this group-version to the given scheme. 38 | AddToScheme = SchemeBuilder.AddToScheme 39 | ) 40 | -------------------------------------------------------------------------------- /api/v1beta2/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | 3 | /* 4 | Copyright 2024 IBM Corporation. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // Code generated by controller-gen. DO NOT EDIT. 
20 | 21 | package v1beta2 22 | 23 | import ( 24 | "k8s.io/api/core/v1" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | "k8s.io/apimachinery/pkg/runtime" 27 | ) 28 | 29 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 30 | func (in *AppWrapper) DeepCopyInto(out *AppWrapper) { 31 | *out = *in 32 | out.TypeMeta = in.TypeMeta 33 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 34 | in.Spec.DeepCopyInto(&out.Spec) 35 | in.Status.DeepCopyInto(&out.Status) 36 | } 37 | 38 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapper. 39 | func (in *AppWrapper) DeepCopy() *AppWrapper { 40 | if in == nil { 41 | return nil 42 | } 43 | out := new(AppWrapper) 44 | in.DeepCopyInto(out) 45 | return out 46 | } 47 | 48 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 49 | func (in *AppWrapper) DeepCopyObject() runtime.Object { 50 | if c := in.DeepCopy(); c != nil { 51 | return c 52 | } 53 | return nil 54 | } 55 | 56 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
57 | func (in *AppWrapperComponent) DeepCopyInto(out *AppWrapperComponent) { 58 | *out = *in 59 | if in.Annotations != nil { 60 | in, out := &in.Annotations, &out.Annotations 61 | *out = make(map[string]string, len(*in)) 62 | for key, val := range *in { 63 | (*out)[key] = val 64 | } 65 | } 66 | if in.DeclaredPodSets != nil { 67 | in, out := &in.DeclaredPodSets, &out.DeclaredPodSets 68 | *out = make([]AppWrapperPodSet, len(*in)) 69 | for i := range *in { 70 | (*in)[i].DeepCopyInto(&(*out)[i]) 71 | } 72 | } 73 | if in.PodSetInfos != nil { 74 | in, out := &in.PodSetInfos, &out.PodSetInfos 75 | *out = make([]AppWrapperPodSetInfo, len(*in)) 76 | for i := range *in { 77 | (*in)[i].DeepCopyInto(&(*out)[i]) 78 | } 79 | } 80 | in.Template.DeepCopyInto(&out.Template) 81 | } 82 | 83 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperComponent. 84 | func (in *AppWrapperComponent) DeepCopy() *AppWrapperComponent { 85 | if in == nil { 86 | return nil 87 | } 88 | out := new(AppWrapperComponent) 89 | in.DeepCopyInto(out) 90 | return out 91 | } 92 | 93 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 94 | func (in *AppWrapperComponentStatus) DeepCopyInto(out *AppWrapperComponentStatus) { 95 | *out = *in 96 | if in.PodSets != nil { 97 | in, out := &in.PodSets, &out.PodSets 98 | *out = make([]AppWrapperPodSet, len(*in)) 99 | for i := range *in { 100 | (*in)[i].DeepCopyInto(&(*out)[i]) 101 | } 102 | } 103 | if in.Conditions != nil { 104 | in, out := &in.Conditions, &out.Conditions 105 | *out = make([]metav1.Condition, len(*in)) 106 | for i := range *in { 107 | (*in)[i].DeepCopyInto(&(*out)[i]) 108 | } 109 | } 110 | } 111 | 112 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperComponentStatus. 
113 | func (in *AppWrapperComponentStatus) DeepCopy() *AppWrapperComponentStatus { 114 | if in == nil { 115 | return nil 116 | } 117 | out := new(AppWrapperComponentStatus) 118 | in.DeepCopyInto(out) 119 | return out 120 | } 121 | 122 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 123 | func (in *AppWrapperList) DeepCopyInto(out *AppWrapperList) { 124 | *out = *in 125 | out.TypeMeta = in.TypeMeta 126 | in.ListMeta.DeepCopyInto(&out.ListMeta) 127 | if in.Items != nil { 128 | in, out := &in.Items, &out.Items 129 | *out = make([]AppWrapper, len(*in)) 130 | for i := range *in { 131 | (*in)[i].DeepCopyInto(&(*out)[i]) 132 | } 133 | } 134 | } 135 | 136 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperList. 137 | func (in *AppWrapperList) DeepCopy() *AppWrapperList { 138 | if in == nil { 139 | return nil 140 | } 141 | out := new(AppWrapperList) 142 | in.DeepCopyInto(out) 143 | return out 144 | } 145 | 146 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 147 | func (in *AppWrapperList) DeepCopyObject() runtime.Object { 148 | if c := in.DeepCopy(); c != nil { 149 | return c 150 | } 151 | return nil 152 | } 153 | 154 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 155 | func (in *AppWrapperPodSet) DeepCopyInto(out *AppWrapperPodSet) { 156 | *out = *in 157 | if in.Replicas != nil { 158 | in, out := &in.Replicas, &out.Replicas 159 | *out = new(int32) 160 | **out = **in 161 | } 162 | if in.Annotations != nil { 163 | in, out := &in.Annotations, &out.Annotations 164 | *out = make(map[string]string, len(*in)) 165 | for key, val := range *in { 166 | (*out)[key] = val 167 | } 168 | } 169 | } 170 | 171 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperPodSet. 
172 | func (in *AppWrapperPodSet) DeepCopy() *AppWrapperPodSet { 173 | if in == nil { 174 | return nil 175 | } 176 | out := new(AppWrapperPodSet) 177 | in.DeepCopyInto(out) 178 | return out 179 | } 180 | 181 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 182 | func (in *AppWrapperPodSetInfo) DeepCopyInto(out *AppWrapperPodSetInfo) { 183 | *out = *in 184 | if in.Annotations != nil { 185 | in, out := &in.Annotations, &out.Annotations 186 | *out = make(map[string]string, len(*in)) 187 | for key, val := range *in { 188 | (*out)[key] = val 189 | } 190 | } 191 | if in.Labels != nil { 192 | in, out := &in.Labels, &out.Labels 193 | *out = make(map[string]string, len(*in)) 194 | for key, val := range *in { 195 | (*out)[key] = val 196 | } 197 | } 198 | if in.NodeSelector != nil { 199 | in, out := &in.NodeSelector, &out.NodeSelector 200 | *out = make(map[string]string, len(*in)) 201 | for key, val := range *in { 202 | (*out)[key] = val 203 | } 204 | } 205 | if in.Tolerations != nil { 206 | in, out := &in.Tolerations, &out.Tolerations 207 | *out = make([]v1.Toleration, len(*in)) 208 | for i := range *in { 209 | (*in)[i].DeepCopyInto(&(*out)[i]) 210 | } 211 | } 212 | if in.SchedulingGates != nil { 213 | in, out := &in.SchedulingGates, &out.SchedulingGates 214 | *out = make([]v1.PodSchedulingGate, len(*in)) 215 | copy(*out, *in) 216 | } 217 | } 218 | 219 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperPodSetInfo. 220 | func (in *AppWrapperPodSetInfo) DeepCopy() *AppWrapperPodSetInfo { 221 | if in == nil { 222 | return nil 223 | } 224 | out := new(AppWrapperPodSetInfo) 225 | in.DeepCopyInto(out) 226 | return out 227 | } 228 | 229 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
230 | func (in *AppWrapperSpec) DeepCopyInto(out *AppWrapperSpec) { 231 | *out = *in 232 | if in.Components != nil { 233 | in, out := &in.Components, &out.Components 234 | *out = make([]AppWrapperComponent, len(*in)) 235 | for i := range *in { 236 | (*in)[i].DeepCopyInto(&(*out)[i]) 237 | } 238 | } 239 | if in.ManagedBy != nil { 240 | in, out := &in.ManagedBy, &out.ManagedBy 241 | *out = new(string) 242 | **out = **in 243 | } 244 | } 245 | 246 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperSpec. 247 | func (in *AppWrapperSpec) DeepCopy() *AppWrapperSpec { 248 | if in == nil { 249 | return nil 250 | } 251 | out := new(AppWrapperSpec) 252 | in.DeepCopyInto(out) 253 | return out 254 | } 255 | 256 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 257 | func (in *AppWrapperStatus) DeepCopyInto(out *AppWrapperStatus) { 258 | *out = *in 259 | if in.Conditions != nil { 260 | in, out := &in.Conditions, &out.Conditions 261 | *out = make([]metav1.Condition, len(*in)) 262 | for i := range *in { 263 | (*in)[i].DeepCopyInto(&(*out)[i]) 264 | } 265 | } 266 | if in.ComponentStatus != nil { 267 | in, out := &in.ComponentStatus, &out.ComponentStatus 268 | *out = make([]AppWrapperComponentStatus, len(*in)) 269 | for i := range *in { 270 | (*in)[i].DeepCopyInto(&(*out)[i]) 271 | } 272 | } 273 | } 274 | 275 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperStatus. 276 | func (in *AppWrapperStatus) DeepCopy() *AppWrapperStatus { 277 | if in == nil { 278 | return nil 279 | } 280 | out := new(AppWrapperStatus) 281 | in.DeepCopyInto(out) 282 | return out 283 | } 284 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "context" 21 | "crypto/tls" 22 | "flag" 23 | "fmt" 24 | "os" 25 | "strings" 26 | 27 | zaplog "go.uber.org/zap" 28 | "go.uber.org/zap/zapcore" 29 | corev1 "k8s.io/api/core/v1" 30 | apierrors "k8s.io/apimachinery/pkg/api/errors" 31 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 | "k8s.io/apimachinery/pkg/runtime" 33 | "k8s.io/apimachinery/pkg/types" 34 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 35 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 36 | "k8s.io/utils/ptr" 37 | ctrl "sigs.k8s.io/controller-runtime" 38 | "sigs.k8s.io/controller-runtime/pkg/client" 39 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 40 | "sigs.k8s.io/controller-runtime/pkg/metrics/filters" 41 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 42 | "sigs.k8s.io/controller-runtime/pkg/webhook" 43 | "sigs.k8s.io/yaml" 44 | 45 | awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" 46 | "github.com/project-codeflare/appwrapper/internal/metrics" 47 | "github.com/project-codeflare/appwrapper/pkg/config" 48 | "github.com/project-codeflare/appwrapper/pkg/controller" 49 | "github.com/project-codeflare/appwrapper/pkg/logger" 50 | 51 | // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 52 | // to ensure that exec-entrypoint and run can make use of them. 
53 | _ "k8s.io/client-go/plugin/pkg/client/auth" 54 | ) 55 | 56 | var ( 57 | scheme = runtime.NewScheme() 58 | setupLog = ctrl.Log.WithName("setup") 59 | BuildVersion = "UNKNOWN" 60 | BuildDate = "UNKNOWN" 61 | ) 62 | 63 | func init() { 64 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 65 | utilruntime.Must(awv1beta2.AddToScheme(scheme)) 66 | //+kubebuilder:scaffold:scheme 67 | } 68 | 69 | func main() { 70 | var configMapName string 71 | flag.StringVar(&configMapName, "config", "appwrapper-operator-config", 72 | "The name of the ConfigMap to load the operator configuration from. "+ 73 | "If it does not exist, the operator will create and initialise it.") 74 | 75 | opts := zap.Options{ 76 | Development: true, 77 | TimeEncoder: zapcore.RFC3339NanoTimeEncoder, 78 | ZapOpts: []zaplog.Option{zaplog.AddCaller()}, 79 | } 80 | opts.BindFlags(flag.CommandLine) 81 | flag.Parse() 82 | 83 | ctrl.SetLogger(logger.FilteredLogger(zap.New(zap.UseFlagOptions(&opts)))) 84 | setupLog.Info("Build info", "version", BuildVersion, "date", BuildDate) 85 | 86 | namespace, err := getNamespace() 87 | exitOnError(err, "unable to get operator namespace") 88 | 89 | cfg := &config.OperatorConfig{ 90 | AppWrapper: config.NewAppWrapperConfig(), 91 | CertManagement: config.NewCertManagementConfig(namespace), 92 | ControllerManager: config.NewControllerManagerConfig(), 93 | WebhooksEnabled: ptr.To(true), 94 | } 95 | 96 | k8sConfig, err := ctrl.GetConfig() 97 | exitOnError(err, "unable to get client config") 98 | k8sClient, err := client.New(k8sConfig, client.Options{Scheme: scheme}) 99 | exitOnError(err, "unable to create Kubernetes client") 100 | ctx := ctrl.SetupSignalHandler() 101 | 102 | cmName := types.NamespacedName{Namespace: namespace, Name: configMapName} 103 | exitOnError(loadIntoOrCreate(ctx, k8sClient, cmName, cfg), "unable to initialise configuration") 104 | 105 | setupLog.Info("Configuration", "config", cfg) 106 | exitOnError(config.ValidateAppWrapperConfig(cfg.AppWrapper), 
"invalid appwrapper config") 107 | 108 | tlsOpts := []func(*tls.Config){} 109 | if !cfg.ControllerManager.EnableHTTP2 { 110 | // Unless EnableHTTP2 was set to True, http/2 should be disabled 111 | // due to its vulnerabilities. More specifically, disabling http/2 will 112 | // prevent from being vulnerable to the HTTP/2 Stream Cancelation and 113 | // Rapid Reset CVEs. For more information see: 114 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 115 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 116 | disableHTTP2 := func(c *tls.Config) { 117 | setupLog.Info("disabling http/2") 118 | c.NextProtos = []string{"http/1.1"} 119 | } 120 | tlsOpts = append(tlsOpts, disableHTTP2) 121 | } 122 | 123 | metrics.Register() 124 | 125 | mgr, err := ctrl.NewManager(k8sConfig, ctrl.Options{ 126 | Scheme: scheme, 127 | Metrics: metricsserver.Options{ 128 | BindAddress: cfg.ControllerManager.Metrics.BindAddress, 129 | FilterProvider: filters.WithAuthenticationAndAuthorization, 130 | SecureServing: true, 131 | TLSOpts: tlsOpts, 132 | }, 133 | WebhookServer: webhook.NewServer(webhook.Options{ 134 | TLSOpts: tlsOpts, 135 | Port: 9443, 136 | }), 137 | HealthProbeBindAddress: cfg.ControllerManager.Health.BindAddress, 138 | LeaderElection: cfg.ControllerManager.LeaderElection, 139 | LeaderElectionID: "f134c674.codeflare.dev", 140 | }) 141 | exitOnError(err, "unable to start manager") 142 | 143 | certsReady := make(chan struct{}) 144 | 145 | if ptr.Deref(cfg.WebhooksEnabled, false) { 146 | exitOnError(controller.SetupCertManagement(mgr, cfg.CertManagement, certsReady), "Unable to set up cert rotation") 147 | } else { 148 | close(certsReady) 149 | } 150 | 151 | go func() { 152 | setupLog.Info("Waiting for certificates to be generated") 153 | <-certsReady 154 | setupLog.Info("Certs ready") 155 | if ptr.Deref(cfg.WebhooksEnabled, false) { 156 | exitOnError(controller.SetupWebhooks(mgr, cfg.AppWrapper), "unable to configure webhook") 157 | } 158 | 
// getNamespace returns the namespace the operator is running in.
//
// Resolution order:
//  1. The NAMESPACE environment variable, which is expected to be set either
//     manually (when running the operator standalone) or through the downward
//     API (when running the operator in-cluster).
//  2. The namespace file mounted next to the service account token.
//
// Surrounding whitespace is stripped from both sources, so a padded or
// whitespace-only value is never returned as a namespace. An error is returned
// when neither source yields a non-empty value; if the namespace file could not
// be read, the underlying error is wrapped so the cause of the failed fallback
// is visible to the caller.
func getNamespace() (string, error) {
	if ns := strings.TrimSpace(os.Getenv("NAMESPACE")); ns != "" {
		return ns, nil
	}

	// Fall back to the namespace associated with the service account token, if available.
	const namespaceFile = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
	data, err := os.ReadFile(namespaceFile)
	if err != nil {
		// The *fs.PathError returned by os.ReadFile already names the file.
		return "", fmt.Errorf("unable to determine current namespace: %w", err)
	}
	if ns := strings.TrimSpace(string(data)); ns != "" {
		return ns, nil
	}

	return "", fmt.Errorf("unable to determine current namespace: %s is empty", namespaceFile)
}
range configMap.Data { 208 | return yaml.Unmarshal([]byte(data), cfg) 209 | } 210 | 211 | return nil 212 | } 213 | 214 | func exitOnError(err error, msg string) { 215 | if err != nil { 216 | setupLog.Error(err, msg) 217 | os.Exit(1) 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /config/certmanager/certificate.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a certificate CR. 2 | # More document can be found at https://docs.cert-manager.io 3 | # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. 4 | apiVersion: cert-manager.io/v1 5 | kind: Issuer 6 | metadata: 7 | name: selfsigned-issuer 8 | namespace: system 9 | spec: 10 | selfSigned: {} 11 | --- 12 | apiVersion: cert-manager.io/v1 13 | kind: Certificate 14 | metadata: 15 | name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml 16 | namespace: system 17 | spec: 18 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 19 | dnsNames: 20 | - SERVICE_NAME.SERVICE_NAMESPACE.svc 21 | - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local 22 | issuerRef: 23 | kind: Issuer 24 | name: selfsigned-issuer 25 | secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize 26 | -------------------------------------------------------------------------------- /config/certmanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - certificate.yaml 3 | 4 | configurations: 5 | - kustomizeconfig.yaml 6 | -------------------------------------------------------------------------------- /config/certmanager/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is for teaching kustomize how 
to update name ref substitution 2 | nameReference: 3 | - kind: Issuer 4 | group: cert-manager.io 5 | fieldSpecs: 6 | - kind: Certificate 7 | group: cert-manager.io 8 | path: spec/issuerRef/name 9 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/workload.codeflare.dev_appwrappers.yaml 6 | #+kubebuilder:scaffold:crdkustomizeresource 7 | -------------------------------------------------------------------------------- /config/default/config.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: operator-config 5 | data: 6 | config.yaml: | 7 | controllerManager: 8 | health: 9 | bindAddress: ":8081" 10 | metrics: 11 | bindAddress: ":8443" 12 | leaderElection: true 13 | -------------------------------------------------------------------------------- /config/default/editor_role_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: editor-role 5 | labels: 6 | rbac.authorization.k8s.io/aggregate-to-edit: "true" 7 | rbac.authorization.k8s.io/aggregate-to-admin: "true" 8 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: appwrapper-system 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 
7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: appwrapper- 10 | 11 | labels: 12 | - pairs: 13 | app.kubernetes.io/name: appwrapper 14 | app.kubernetes.io/component: controller 15 | includeTemplates: true 16 | - pairs: 17 | control-plane: controller-manager 18 | includeSelectors: true 19 | 20 | resources: 21 | - config.yaml 22 | - ../crd 23 | - ../rbac 24 | - ../manager 25 | - ../internalcert 26 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 27 | # crd/kustomization.yaml 28 | - ../webhook 29 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 30 | #- ../prometheus 31 | # [METRICS] Expose the controller manager metrics service. 32 | - metrics_service.yaml 33 | 34 | patches: 35 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 36 | # crd/kustomization.yaml 37 | - path: manager_webhook_patch.yaml 38 | 39 | # Add aggregate labels to rbacs 40 | - path: editor_role_patch.yaml 41 | - path: viewer_role_patch.yaml 42 | -------------------------------------------------------------------------------- /config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | -------------------------------------------------------------------------------- /config/default/manager_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | ports: 12 | - containerPort: 9443 13 | name: webhook-server 14 | 
protocol: TCP 15 | volumeMounts: 16 | - mountPath: /tmp/k8s-webhook-server/serving-certs 17 | name: cert 18 | readOnly: true 19 | volumes: 20 | - name: cert 21 | secret: 22 | defaultMode: 420 23 | secretName: webhook-server-cert 24 | -------------------------------------------------------------------------------- /config/default/metrics_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: appwrapper 7 | name: controller-manager-metrics-service 8 | namespace: system 9 | spec: 10 | ports: 11 | - name: https 12 | port: 8443 13 | protocol: TCP 14 | targetPort: 8443 15 | selector: 16 | control-plane: controller-manager 17 | -------------------------------------------------------------------------------- /config/default/viewer_role_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: viewer-role 5 | labels: 6 | rbac.authorization.k8s.io/aggregate-to-view: "true" 7 | -------------------------------------------------------------------------------- /config/dev/config.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: operator-config 5 | data: 6 | config.yaml: | 7 | controllerManager: 8 | health: 9 | bindAddress: "localhost:0" 10 | metrics: 11 | bindAddress: "localhost:0" 12 | leaderElection: false 13 | webhooksEnabled: false 14 | -------------------------------------------------------------------------------- /config/dev/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: dev 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. 
a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: appwrapper- 10 | 11 | resources: 12 | - config.yaml 13 | - namespace.yaml 14 | - ../crd 15 | -------------------------------------------------------------------------------- /config/dev/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: system 5 | -------------------------------------------------------------------------------- /config/internalcert/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - secret.yaml 3 | -------------------------------------------------------------------------------- /config/internalcert/secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: webhook-server-cert 5 | namespace: system 6 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: controller 7 | newName: quay.io/ibm/appwrapper 8 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: system 5 | --- 6 | apiVersion: apps/v1 7 | kind: Deployment 8 | metadata: 9 | name: controller-manager 10 | namespace: system 11 | spec: 12 | replicas: 1 13 | template: 14 | metadata: 15 | annotations: 16 | kubectl.kubernetes.io/default-container: manager 17 | spec: 18 | 
affinity: 19 | nodeAffinity: 20 | requiredDuringSchedulingIgnoredDuringExecution: 21 | nodeSelectorTerms: 22 | - matchExpressions: 23 | - key: kubernetes.io/arch 24 | operator: In 25 | values: 26 | - amd64 27 | - arm64 28 | - ppc64le 29 | - s390x 30 | - key: kubernetes.io/os 31 | operator: In 32 | values: 33 | - linux 34 | securityContext: 35 | runAsNonRoot: true 36 | seccompProfile: 37 | type: RuntimeDefault 38 | containers: 39 | - command: 40 | - /manager 41 | args: 42 | - "--zap-log-level=2" 43 | image: controller:latest 44 | name: manager 45 | securityContext: 46 | allowPrivilegeEscalation: false 47 | capabilities: 48 | drop: 49 | - "ALL" 50 | livenessProbe: 51 | httpGet: 52 | path: /healthz 53 | port: 8081 54 | initialDelaySeconds: 15 55 | periodSeconds: 20 56 | readinessProbe: 57 | httpGet: 58 | path: /readyz 59 | port: 8081 60 | initialDelaySeconds: 5 61 | periodSeconds: 10 62 | resources: 63 | limits: 64 | cpu: "2" 65 | memory: 128Mi 66 | requests: 67 | cpu: 100m 68 | memory: 64Mi 69 | serviceAccountName: controller-manager 70 | terminationGracePeriodSeconds: 10 71 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: controller-manager-metrics-monitor 6 | namespace: system 7 | spec: 8 | endpoints: 9 | - path: /metrics 10 | port: https 11 | scheme: https 12 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 13 | tlsConfig: 14 | insecureSkipVerify: true 15 | selector: 16 | matchLabels: 17 | control-plane: controller-manager 18 | 
-------------------------------------------------------------------------------- /config/rbac/editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit appwrappers. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: editor-role 6 | rules: 7 | - apiGroups: 8 | - workload.codeflare.dev 9 | resources: 10 | - appwrappers 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - workload.codeflare.dev 21 | resources: 22 | - appwrappers/status 23 | verbs: 24 | - get 25 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - leader_election_role.yaml 11 | - leader_election_role_binding.yaml 12 | - user_role.yaml 13 | - editor_role.yaml 14 | - viewer_role.yaml 15 | 16 | # The following RBAC configurations are used to protect 17 | # the metrics endpoint with authn/authz. These configurations 18 | # ensure that only authorized users and service accounts 19 | # can access the metrics endpoint. Comment the following 20 | # permissions if you want to disable this protection. 
21 | # More info: https://book.kubebuilder.io/reference/metrics.html 22 | - metrics_auth_role.yaml 23 | - metrics_auth_role_binding.yaml 24 | - metrics_reader_role.yaml 25 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: leader-election-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - configmaps 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - create 16 | - update 17 | - patch 18 | - delete 19 | - apiGroups: 20 | - coordination.k8s.io 21 | resources: 22 | - leases 23 | verbs: 24 | - get 25 | - list 26 | - watch 27 | - create 28 | - update 29 | - patch 30 | - delete 31 | - apiGroups: 32 | - "" 33 | resources: 34 | - events 35 | verbs: 36 | - create 37 | - patch 38 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: leader-election-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: leader-election-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: controller-manager 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-auth-role 5 | rules: 6 | - apiGroups: 7 | - authentication.k8s.io 8 | resources: 9 | - tokenreviews 10 | verbs: 11 | - create 12 | - apiGroups: 13 | - authorization.k8s.io 14 | resources: 15 | - 
subjectaccessreviews 16 | verbs: 17 | - create 18 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: metrics-auth-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: metrics-auth-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: controller-manager 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/metrics_reader_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-reader 5 | rules: 6 | - nonResourceURLs: 7 | - "/metrics" 8 | verbs: 9 | - get 10 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - events 11 | verbs: 12 | - create 13 | - patch 14 | - update 15 | - watch 16 | - apiGroups: 17 | - "" 18 | resources: 19 | - nodes 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - "" 26 | resources: 27 | - pods 28 | - services 29 | verbs: 30 | - create 31 | - delete 32 | - get 33 | - list 34 | - patch 35 | - update 36 | - watch 37 | - apiGroups: 38 | - "" 39 | resources: 40 | - secrets 41 | verbs: 42 | - get 43 | - list 44 | - update 45 | - watch 46 | - apiGroups: 47 | - admissionregistration.k8s.io 48 | resources: 49 | - mutatingwebhookconfigurations 50 | - validatingwebhookconfigurations 51 | verbs: 52 | - get 53 | - list 54 | - update 55 | 
- watch 56 | - apiGroups: 57 | - apiextensions.k8s.io 58 | resources: 59 | - customresourcedefinitions 60 | verbs: 61 | - list 62 | - apiGroups: 63 | - apps 64 | resources: 65 | - deployments 66 | - statefulsets 67 | verbs: 68 | - create 69 | - delete 70 | - get 71 | - list 72 | - patch 73 | - update 74 | - watch 75 | - apiGroups: 76 | - authorization.k8s.io 77 | resources: 78 | - subjectaccessreviews 79 | verbs: 80 | - create 81 | - apiGroups: 82 | - batch 83 | resources: 84 | - jobs 85 | verbs: 86 | - create 87 | - delete 88 | - get 89 | - list 90 | - patch 91 | - update 92 | - watch 93 | - apiGroups: 94 | - jobset.x-k8s.io 95 | resources: 96 | - jobsets 97 | verbs: 98 | - create 99 | - delete 100 | - get 101 | - list 102 | - patch 103 | - update 104 | - watch 105 | - apiGroups: 106 | - kubeflow.org 107 | resources: 108 | - pytorchjobs 109 | verbs: 110 | - create 111 | - delete 112 | - get 113 | - list 114 | - patch 115 | - update 116 | - watch 117 | - apiGroups: 118 | - ray.io 119 | resources: 120 | - rayclusters 121 | - rayjobs 122 | verbs: 123 | - create 124 | - delete 125 | - get 126 | - list 127 | - patch 128 | - update 129 | - watch 130 | - apiGroups: 131 | - scheduling.sigs.k8s.io 132 | - scheduling.x-k8s.io 133 | resources: 134 | - podgroups 135 | verbs: 136 | - create 137 | - delete 138 | - get 139 | - list 140 | - patch 141 | - update 142 | - watch 143 | - apiGroups: 144 | - workload.codeflare.dev 145 | resources: 146 | - appwrappers 147 | verbs: 148 | - create 149 | - delete 150 | - get 151 | - list 152 | - patch 153 | - update 154 | - watch 155 | - apiGroups: 156 | - workload.codeflare.dev 157 | resources: 158 | - appwrappers/finalizers 159 | verbs: 160 | - update 161 | - apiGroups: 162 | - workload.codeflare.dev 163 | resources: 164 | - appwrappers/status 165 | verbs: 166 | - get 167 | - patch 168 | - update 169 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: 
# permissions for end users of appwrappers.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: user-role
rules:
# Users may create, delete, and read AppWrappers, but not modify them in place.
- apiGroups:
  - workload.codeflare.dev
  resources:
  - appwrappers
  verbs:
  - create
  - delete
  - get
  - list
  - watch
# Read-only access to the status subresource.
- apiGroups:
  - workload.codeflare.dev
  resources:
  - appwrappers/status
  verbs:
  - get
# The subresource name is "finalizers" (plural), matching the rule in
# config/rbac/role.yaml; a rule on "appwrappers/finalizer" matches nothing.
- apiGroups:
  - workload.codeflare.dev
  resources:
  - appwrappers/finalizers
  verbs:
  - update
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: viewer-role 6 | rules: 7 | - apiGroups: 8 | - workload.codeflare.dev 9 | resources: 10 | - appwrappers 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - workload.codeflare.dev 17 | resources: 18 | - appwrappers/status 19 | verbs: 20 | - get 21 | -------------------------------------------------------------------------------- /config/webhook/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manifests.yaml 3 | - service.yaml 4 | 5 | configurations: 6 | - kustomizeconfig.yaml 7 | -------------------------------------------------------------------------------- /config/webhook/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # the following config is for teaching kustomize where to look at when substituting nameReference. 2 | # It requires kustomize v2.1.0 or newer to work properly. 
3 | nameReference: 4 | - kind: Service 5 | version: v1 6 | fieldSpecs: 7 | - kind: MutatingWebhookConfiguration 8 | group: admissionregistration.k8s.io 9 | path: webhooks/clientConfig/service/name 10 | - kind: ValidatingWebhookConfiguration 11 | group: admissionregistration.k8s.io 12 | path: webhooks/clientConfig/service/name 13 | 14 | namespace: 15 | - kind: MutatingWebhookConfiguration 16 | group: admissionregistration.k8s.io 17 | path: webhooks/clientConfig/service/namespace 18 | create: true 19 | - kind: ValidatingWebhookConfiguration 20 | group: admissionregistration.k8s.io 21 | path: webhooks/clientConfig/service/namespace 22 | create: true 23 | -------------------------------------------------------------------------------- /config/webhook/manifests.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: admissionregistration.k8s.io/v1 3 | kind: MutatingWebhookConfiguration 4 | metadata: 5 | name: mutating-webhook-configuration 6 | webhooks: 7 | - admissionReviewVersions: 8 | - v1 9 | clientConfig: 10 | service: 11 | name: webhook-service 12 | namespace: system 13 | path: /mutate-workload-codeflare-dev-v1beta2-appwrapper 14 | failurePolicy: Fail 15 | name: mappwrapper.kb.io 16 | rules: 17 | - apiGroups: 18 | - workload.codeflare.dev 19 | apiVersions: 20 | - v1beta2 21 | operations: 22 | - CREATE 23 | resources: 24 | - appwrappers 25 | sideEffects: None 26 | --- 27 | apiVersion: admissionregistration.k8s.io/v1 28 | kind: ValidatingWebhookConfiguration 29 | metadata: 30 | name: validating-webhook-configuration 31 | webhooks: 32 | - admissionReviewVersions: 33 | - v1 34 | clientConfig: 35 | service: 36 | name: webhook-service 37 | namespace: system 38 | path: /validate-workload-codeflare-dev-v1beta2-appwrapper 39 | failurePolicy: Fail 40 | name: vappwrapper.kb.io 41 | rules: 42 | - apiGroups: 43 | - workload.codeflare.dev 44 | apiVersions: 45 | - v1beta2 46 | operations: 47 | - CREATE 48 | - UPDATE 49 | 
resources: 50 | - appwrappers 51 | sideEffects: None 52 | -------------------------------------------------------------------------------- /config/webhook/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: webhook-service 5 | namespace: system 6 | spec: 7 | ports: 8 | - port: 443 9 | protocol: TCP 10 | targetPort: 9443 11 | selector: 12 | control-plane: controller-manager 13 | -------------------------------------------------------------------------------- /docs/release_instructions.md: -------------------------------------------------------------------------------- 1 | ## Release Instructions 2 | 3 | 1. Submit a housekeeping PR that does the following: 4 | + Update the AppWrapper version number in the installation section of [README.md](../README.md#Installation). 5 | + Update the `appwrapper_version` variable in [_config.yaml](../site/_config.yaml). 6 | 7 | 2. Review all closed PRs since the last release and make sure they are labeled 8 | correctly (enhancement, bug, housekeeping). The next step will use these labels 9 | to generate the release notes. 10 | 11 | 3. After merging the PR, create a new release tag (vX.Y.Z) and push the 12 | tag to the main branch. This will trigger the `release` workflow which 13 | will: 14 | + build, tag, and push images to [quay.io/ibm/appwrapper](https://quay.io/repository/ibm/appwrapper) 15 | + generate the install.yaml for the release 16 | + create a [GitHub release](https://github.com/project-codeflare/appwrapper/releases) that contains the install.yaml 17 | 18 | 4. Update the kustomization.yaml files in MLBatch to refer to the new release: 19 | + setup.k8s/appwrapper/kustomization.yaml 20 | 21 | 5. To work around back-level Go versions in ODH, we also maintain a 22 | codeflare-releases branch.
After making a release, merge main 23 | into the codeflare-releases branch, creating a merge commit, and 24 | push to the upstream codeflare-releases branch. After CI passes, 25 | tag the branch using a `cf` prefix instead of a `v` (e.g., v0.21.2 ==> cf0.21.2). 26 | 27 | 6. You can update the codeflare-operator, using the vX.Y.Z tag in the Makefile 28 | and optionally the cfX.Y.Z in the replace clause in codeflare's go.mod if there 29 | is a difference in go levels between Kueue/AppWrapper and the codeflare-operator. 30 | -------------------------------------------------------------------------------- /docs/website_instructions.md: -------------------------------------------------------------------------------- 1 | We use Jekyll to generate static html that can be served as a GitHub 2 | page for the project. 3 | 4 | The GitHub action 5 | [jekyll-gh-pages](../.github/workflows/jekyll-gh-pages.yaml) runs whenever 6 | a change to the `_site` directory is pushed to the main branch. 7 | 8 | To host the website locally, you need a Ruby 3.1 environment. Then in 9 | the [site](../site) directory do `bundle install` followed by 10 | `bundle exec jekyll serve`.
11 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/project-codeflare/appwrapper 2 | 3 | go 1.23.0 4 | 5 | require ( 6 | github.com/distribution/reference v0.6.0 7 | github.com/go-logr/logr v1.4.2 8 | github.com/golangci/golangci-lint v1.64.7 9 | github.com/kubeflow/training-operator v1.9.0 10 | github.com/onsi/ginkgo/v2 v2.23.0 11 | github.com/onsi/gomega v1.36.2 12 | github.com/open-policy-agent/cert-controller v0.12.0 13 | github.com/prometheus/client_golang v1.21.1 14 | go.uber.org/zap v1.27.0 15 | k8s.io/api v0.32.3 16 | k8s.io/apimachinery v0.32.3 17 | k8s.io/client-go v0.32.3 18 | k8s.io/utils v0.0.0-20241210054802-24370beab758 19 | sigs.k8s.io/controller-runtime v0.20.3 20 | sigs.k8s.io/controller-tools v0.16.5 21 | sigs.k8s.io/jobset v0.8.0 22 | sigs.k8s.io/kustomize/kustomize/v5 v5.5.0 23 | sigs.k8s.io/yaml v1.4.0 24 | ) 25 | 26 | require ( 27 | cel.dev/expr v0.19.1 // indirect 28 | github.com/antlr4-go/antlr/v4 v4.13.1 // indirect 29 | github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect 30 | github.com/beorn7/perks v1.0.1 // indirect 31 | github.com/blang/semver/v4 v4.0.0 // indirect 32 | github.com/cenkalti/backoff/v4 v4.3.0 // indirect 33 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 34 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 35 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 36 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 37 | github.com/fatih/color v1.18.0 // indirect 38 | github.com/felixge/httpsnoop v1.0.4 // indirect 39 | github.com/fsnotify/fsnotify v1.8.0 // indirect 40 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 41 | github.com/go-errors/errors v1.4.2 // indirect 42 | github.com/go-logr/stdr v1.2.2 // indirect 43 | github.com/go-logr/zapr v1.3.0 // indirect 44 | github.com/go-openapi/jsonpointer v0.21.1 // 
indirect 45 | github.com/go-openapi/jsonreference v0.21.0 // indirect 46 | github.com/go-openapi/swag v0.23.1 // indirect 47 | github.com/go-task/slim-sprig/v3 v3.0.0 // indirect 48 | github.com/gobuffalo/flect v1.0.3 // indirect 49 | github.com/gogo/protobuf v1.3.2 // indirect 50 | github.com/golang/protobuf v1.5.4 // indirect 51 | github.com/google/btree v1.1.3 // indirect 52 | github.com/google/cel-go v0.22.1 // indirect 53 | github.com/google/gnostic-models v0.6.9 // indirect 54 | github.com/google/go-cmp v0.7.0 // indirect 55 | github.com/google/gofuzz v1.2.0 // indirect 56 | github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect 57 | github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect 58 | github.com/google/uuid v1.6.0 // indirect 59 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect 60 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 61 | github.com/josharian/intern v1.0.0 // indirect 62 | github.com/json-iterator/go v1.1.12 // indirect 63 | github.com/klauspost/compress v1.18.0 // indirect 64 | github.com/mailru/easyjson v0.9.0 // indirect 65 | github.com/mattn/go-colorable v0.1.14 // indirect 66 | github.com/mattn/go-isatty v0.0.20 // indirect 67 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 68 | github.com/modern-go/reflect2 v1.0.2 // indirect 69 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect 70 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 71 | github.com/opencontainers/go-digest v1.0.0 // indirect 72 | github.com/pkg/errors v0.9.1 // indirect 73 | github.com/prometheus/client_model v0.6.1 // indirect 74 | github.com/prometheus/common v0.63.0 // indirect 75 | github.com/prometheus/procfs v0.16.0 // indirect 76 | github.com/sirupsen/logrus v1.9.3 // indirect 77 | github.com/spf13/cobra v1.9.1 // indirect 78 | github.com/spf13/pflag v1.0.6 // indirect 79 | github.com/stoewer/go-strcase v1.3.0 
// indirect 80 | github.com/x448/float16 v0.8.4 // indirect 81 | github.com/xlab/treeprint v1.2.0 // indirect 82 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 83 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 84 | go.opentelemetry.io/otel v1.35.0 // indirect 85 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect 86 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect 87 | go.opentelemetry.io/otel/metric v1.35.0 // indirect 88 | go.opentelemetry.io/otel/sdk v1.33.0 // indirect 89 | go.opentelemetry.io/otel/trace v1.35.0 // indirect 90 | go.opentelemetry.io/proto/otlp v1.4.0 // indirect 91 | go.uber.org/atomic v1.11.0 // indirect 92 | go.uber.org/multierr v1.11.0 // indirect 93 | golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 // indirect 94 | golang.org/x/mod v0.24.0 // indirect 95 | golang.org/x/net v0.38.0 // indirect 96 | golang.org/x/oauth2 v0.28.0 // indirect 97 | golang.org/x/sync v0.12.0 // indirect 98 | golang.org/x/sys v0.31.0 // indirect 99 | golang.org/x/term v0.30.0 // indirect 100 | golang.org/x/text v0.23.0 // indirect 101 | golang.org/x/time v0.11.0 // indirect 102 | golang.org/x/tools v0.31.0 // indirect 103 | gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect 104 | google.golang.org/genproto/googleapis/api v0.0.0-20241219192143-6b3ec007d9bb // indirect 105 | google.golang.org/genproto/googleapis/rpc v0.0.0-20241219192143-6b3ec007d9bb // indirect 106 | google.golang.org/grpc v1.69.2 // indirect 107 | google.golang.org/protobuf v1.36.5 // indirect 108 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 109 | gopkg.in/inf.v0 v0.9.1 // indirect 110 | gopkg.in/yaml.v2 v2.4.0 // indirect 111 | gopkg.in/yaml.v3 v3.0.1 // indirect 112 | k8s.io/apiextensions-apiserver v0.32.3 // indirect 113 | k8s.io/apiserver v0.32.3 // indirect 114 | k8s.io/component-base v0.32.3 // indirect 115 | k8s.io/klog/v2 v2.130.1 // indirect 116 | k8s.io/kube-openapi 
v0.0.0-20250304201544-e5f78fe3ede9 // indirect 117 | sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.1 // indirect 118 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 119 | sigs.k8s.io/kustomize/api v0.18.0 // indirect 120 | sigs.k8s.io/kustomize/kyaml v0.18.1 // indirect 121 | sigs.k8s.io/randfill v1.0.0 // indirect 122 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 123 | ) 124 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ -------------------------------------------------------------------------------- /hack/create-test-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Create and optionally configure a kind cluster for running the e2e tests 16 | 17 | export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" 18 | CLUSTER_STARTED="false" 19 | CONFIGURE_CLUSTER=${CONFIGURE_CLUSTER:-"true"} 20 | 21 | source ${ROOT_DIR}/hack/e2e-util.sh 22 | 23 | if [[ "$CONFIGURE_CLUSTER" == "true" ]] 24 | then 25 | update_test_host 26 | check_prerequisites 27 | pull_images 28 | fi 29 | 30 | kind_up_cluster 31 | add_virtual_GPUs 32 | 33 | if [[ "$CONFIGURE_CLUSTER" == "true" ]] 34 | then 35 | kind_load_images 36 | configure_cluster 37 | fi 38 | -------------------------------------------------------------------------------- /hack/default-queues.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kueue.x-k8s.io/v1beta1 2 | kind: ResourceFlavor 3 | metadata: 4 | name: "default-flavor" 5 | --- 6 | apiVersion: kueue.x-k8s.io/v1beta1 7 | kind: ClusterQueue 8 | metadata: 9 | name: "cluster-queue" 10 | spec: 11 | namespaceSelector: {} # match all. 12 | resourceGroups: 13 | - coveredResources: ["cpu", "nvidia.com/gpu"] 14 | flavors: 15 | - name: "default-flavor" 16 | resources: 17 | - name: "cpu" 18 | nominalQuota: 8 19 | - name: "nvidia.com/gpu" 20 | nominalQuota: 16 21 | --- 22 | apiVersion: kueue.x-k8s.io/v1beta1 23 | kind: LocalQueue 24 | metadata: 25 | namespace: "default" 26 | name: "default-queue" 27 | spec: 28 | clusterQueue: "cluster-queue" 29 | -------------------------------------------------------------------------------- /hack/deploy-kueue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Installs a kueue release onto an existing cluster 16 | 17 | export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" 18 | 19 | echo "Deploying Kueue" 20 | kubectl apply --server-side -k $ROOT_DIR/hack/kueue-config 21 | 22 | # Sleep until the kueue manager is running 23 | echo "Waiting for pods in the kueue-system namespace to become ready" 24 | while [[ $(kubectl get pods -n kueue-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]] 25 | do 26 | echo -n "." && sleep 1; 27 | done 28 | echo "" 29 | 30 | # Define a default local queue in the default namespace 31 | echo "Attempting to define default local queue" 32 | 33 | # This won't work until kueue's webhooks are actually configured and working, 34 | # so first sleep for five seconds, then try it in a loop 35 | sleep 5 36 | until kubectl apply -f $ROOT_DIR/hack/default-queues.yaml 37 | do 38 | echo -n "." 
&& sleep 1; 39 | done 40 | -------------------------------------------------------------------------------- /hack/kind-config.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | # 1 control plane node and 2 worker nodes 4 | nodes: 5 | - role: control-plane 6 | - role: worker 7 | - role: worker 8 | -------------------------------------------------------------------------------- /hack/kueue-config/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.12.2" 6 | 7 | images: 8 | - name: us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue 9 | newName: registry.k8s.io/kueue/kueue 10 | newTag: v0.12.2 11 | 12 | patches: 13 | - target: 14 | kind: Deployment 15 | name: controller-manager 16 | patch: | 17 | - op: add 18 | path: /spec/template/spec/containers/0/args/- 19 | value: "--zap-log-level=2" 20 | -------------------------------------------------------------------------------- /hack/run-dev-mode-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# run background_services_gpid test_command
#
# Runs test_command (every argument after the first) in the foreground, then
# sends a signal to the process group named by the first argument and returns
# test_command's exit status.
run () {
  PID=$1
  shift
  CODE=0
  # Record a non-zero exit status instead of stopping, so the kill below always runs.
  "$@" || CODE=$?
  # A negative pid addresses a process group; a failed kill is deliberately ignored.
  # NOTE(review): this only reaches the background job if $1 is actually a
  # process-group id (e.g. job control is enabled when the job is started) — confirm.
  kill -- -$PID || true
  # NOTE(review): presumably a grace period for the signalled processes to exit — confirm.
  sleep 1
  return $CODE
}
14 | 15 | # Run the e2e tests on an existing cluster with kueue and AppWrapper already installed 16 | 17 | export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" 18 | export GORACE=1 19 | export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"false"} 20 | export CLUSTER_STARTED="true" 21 | export KUTTL_TEST_SUITES=("") 22 | export LABEL_FILTER=${LABEL_FILTER:-"Kueue,Webhook,Metrics"} 23 | 24 | source ${ROOT_DIR}/hack/e2e-util.sh 25 | 26 | trap cleanup EXIT 27 | 28 | wait_for_appwrapper_controller 29 | 30 | run_kuttl_test_suite 31 | go run github.com/onsi/ginkgo/v2/ginkgo -v -fail-fast --procs 1 -timeout 130m --label-filter=${LABEL_FILTER} ./test/e2e 32 | 33 | RC=$? 34 | if [ ${RC} -eq 0 ] 35 | then 36 | DUMP_LOGS="false" 37 | fi 38 | echo "End to end test script return code set to ${RC}" 39 | exit ${RC} 40 | -------------------------------------------------------------------------------- /internal/controller/appwrapper/fixtures_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package appwrapper 18 | 19 | import ( 20 | "fmt" 21 | "math/rand" 22 | "time" 23 | 24 | v1 "k8s.io/api/core/v1" 25 | "k8s.io/apimachinery/pkg/api/resource" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | "k8s.io/apimachinery/pkg/runtime" 28 | "k8s.io/apimachinery/pkg/types" 29 | "k8s.io/utils/ptr" 30 | "sigs.k8s.io/controller-runtime/pkg/client" 31 | "sigs.k8s.io/yaml" 32 | 33 | awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" 34 | 35 | . "github.com/onsi/gomega" 36 | ) 37 | 38 | const charset = "abcdefghijklmnopqrstuvwxyz0123456789" 39 | 40 | func randName(baseName string) string { 41 | seededRand := rand.New(rand.NewSource(time.Now().UnixNano())) 42 | b := make([]byte, 6) 43 | for i := range b { 44 | b[i] = charset[seededRand.Intn(len(charset))] 45 | } 46 | return fmt.Sprintf("%s-%s", baseName, string(b)) 47 | } 48 | 49 | func toAppWrapper(components ...awv1beta2.AppWrapperComponent) *awv1beta2.AppWrapper { 50 | return &awv1beta2.AppWrapper{ 51 | TypeMeta: metav1.TypeMeta{APIVersion: awv1beta2.GroupVersion.String(), Kind: awv1beta2.AppWrapperKind}, 52 | ObjectMeta: metav1.ObjectMeta{Name: randName("aw"), Namespace: "default"}, 53 | Spec: awv1beta2.AppWrapperSpec{Components: components}, 54 | } 55 | } 56 | 57 | func getAppWrapper(typeNamespacedName types.NamespacedName) *awv1beta2.AppWrapper { 58 | aw := &awv1beta2.AppWrapper{} 59 | err := k8sClient.Get(ctx, typeNamespacedName, aw) 60 | Expect(err).NotTo(HaveOccurred()) 61 | return aw 62 | } 63 | 64 | func getNode(name string) *v1.Node { 65 | node := &v1.Node{} 66 | err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, node) 67 | Expect(err).NotTo(HaveOccurred()) 68 | return node 69 | } 70 | 71 | func getPods(aw *awv1beta2.AppWrapper) []v1.Pod { 72 | result := []v1.Pod{} 73 | podList := &v1.PodList{} 74 | err := k8sClient.List(ctx, podList, &client.ListOptions{Namespace: aw.Namespace}) 75 | Expect(err).NotTo(HaveOccurred()) 76 | for _, pod := range podList.Items { 77 
| if awn, found := pod.Labels[awv1beta2.AppWrapperLabel]; found && awn == aw.Name { 78 | result = append(result, pod) 79 | } 80 | } 81 | return result 82 | } 83 | 84 | // envTest doesn't have a Pod controller; so simulate it 85 | func setPodStatus(aw *awv1beta2.AppWrapper, phase v1.PodPhase, numToChange int32) error { 86 | podList := &v1.PodList{} 87 | err := k8sClient.List(ctx, podList, &client.ListOptions{Namespace: aw.Namespace}) 88 | if err != nil { 89 | return err 90 | } 91 | for _, pod := range podList.Items { 92 | if numToChange <= 0 { 93 | return nil 94 | } 95 | if awn, found := pod.Labels[awv1beta2.AppWrapperLabel]; found && awn == aw.Name { 96 | pod.Status.Phase = phase 97 | err = k8sClient.Status().Update(ctx, &pod) 98 | if err != nil { 99 | return err 100 | } 101 | numToChange -= 1 102 | } 103 | } 104 | return nil 105 | } 106 | 107 | const podYAML = ` 108 | apiVersion: v1 109 | kind: Pod 110 | metadata: 111 | name: %v 112 | spec: 113 | restartPolicy: Never 114 | containers: 115 | - name: busybox 116 | image: quay.io/project-codeflare/busybox:1.36 117 | command: ["sh", "-c", "sleep 10"] 118 | resources: 119 | requests: 120 | cpu: %v 121 | nvidia.com/gpu: %v 122 | limits: 123 | nvidia.com/gpu: %v` 124 | 125 | func pod(milliCPU int64, numGPU int64, declarePodSets bool) awv1beta2.AppWrapperComponent { 126 | yamlString := fmt.Sprintf(podYAML, 127 | randName("pod"), 128 | resource.NewMilliQuantity(milliCPU, resource.DecimalSI), 129 | resource.NewQuantity(numGPU, resource.DecimalSI), 130 | resource.NewQuantity(numGPU, resource.DecimalSI)) 131 | 132 | jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString)) 133 | Expect(err).NotTo(HaveOccurred()) 134 | awc := &awv1beta2.AppWrapperComponent{ 135 | Template: runtime.RawExtension{Raw: jsonBytes}, 136 | } 137 | if declarePodSets { 138 | awc.DeclaredPodSets = []awv1beta2.AppWrapperPodSet{{Replicas: ptr.To(int32(1)), Path: "template"}} 139 | } 140 | return *awc 141 | } 142 | 143 | const complexPodYAML = ` 144 | 
apiVersion: v1 145 | kind: Pod 146 | metadata: 147 | name: %v 148 | labels: 149 | myComplexLabel: myComplexValue 150 | annotations: 151 | myComplexAnnotation: myComplexValue 152 | spec: 153 | restartPolicy: Never 154 | nodeSelector: 155 | myComplexSelector: myComplexValue 156 | affinity: 157 | nodeAffinity: 158 | requiredDuringSchedulingIgnoredDuringExecution: 159 | nodeSelectorTerms: 160 | - matchExpressions: 161 | - key: kubernetes.io/hostname 162 | operator: NotIn 163 | values: 164 | - badHost1 165 | schedulingGates: 166 | - name: myComplexGate 167 | tolerations: 168 | - key: myComplexKey 169 | value: myComplexValue 170 | operator: Equal 171 | effect: NoSchedule 172 | containers: 173 | - name: busybox 174 | image: quay.io/project-codeflare/busybox:1.36 175 | command: ["sh", "-c", "sleep 10"] 176 | resources: 177 | requests: 178 | cpu: 100m 179 | nvidia.com/gpu: 1 180 | limits: 181 | nvidia.com/gpu: 1` 182 | 183 | func complexPodYaml() awv1beta2.AppWrapperComponent { 184 | yamlString := fmt.Sprintf(complexPodYAML, randName("pod")) 185 | jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString)) 186 | Expect(err).NotTo(HaveOccurred()) 187 | awc := &awv1beta2.AppWrapperComponent{ 188 | Template: runtime.RawExtension{Raw: jsonBytes}, 189 | } 190 | return *awc 191 | } 192 | 193 | const malformedPodYAML = ` 194 | apiVersion: v1 195 | kind: Pod 196 | metadata: 197 | name: %v 198 | spec: 199 | restartPolicy: Never 200 | containers: 201 | - name: busybox 202 | command: ["sh", "-c", "sleep 10"] 203 | resources: 204 | requests: 205 | cpu: %v` 206 | 207 | func malformedPod(milliCPU int64) awv1beta2.AppWrapperComponent { 208 | yamlString := fmt.Sprintf(malformedPodYAML, 209 | randName("pod"), 210 | resource.NewMilliQuantity(milliCPU, resource.DecimalSI)) 211 | 212 | jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString)) 213 | Expect(err).NotTo(HaveOccurred()) 214 | return awv1beta2.AppWrapperComponent{ 215 | DeclaredPodSets: []awv1beta2.AppWrapperPodSet{{Replicas: 
ptr.To(int32(1)), Path: "template"}}, 216 | Template: runtime.RawExtension{Raw: jsonBytes}, 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /internal/controller/appwrapper/node_health_monitor.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package appwrapper 18 | 19 | import ( 20 | "context" 21 | "maps" 22 | "sync" 23 | 24 | v1 "k8s.io/api/core/v1" 25 | "k8s.io/apimachinery/pkg/api/errors" 26 | "k8s.io/apimachinery/pkg/api/resource" 27 | "k8s.io/apimachinery/pkg/util/sets" 28 | ctrl "sigs.k8s.io/controller-runtime" 29 | "sigs.k8s.io/controller-runtime/pkg/client" 30 | "sigs.k8s.io/controller-runtime/pkg/handler" 31 | "sigs.k8s.io/controller-runtime/pkg/log" 32 | 33 | "github.com/project-codeflare/appwrapper/pkg/config" 34 | ) 35 | 36 | // NodeHealthMonitor watches Nodes and maintains mappings of Nodes that have either 37 | // been marked as Unschedulable or that have been labeled to indicate that 38 | // they have resources that Autopilot has tainted as NoSchedule or NoExecute. 39 | // This information is used to automate the maintenance of the lendingLimit of 40 | // a designated slack ClusterQueue and to migrate running workloads away from NoExecute resources. 
41 | type NodeHealthMonitor struct { 42 | client.Client 43 | Config *config.AppWrapperConfig 44 | } 45 | 46 | var ( 47 | // noExecuteNodes is a mapping from Node names to resources with an Autopilot NoExecute taint 48 | noExecuteNodes = make(map[string]sets.Set[string]) 49 | // noExecuteNodesMutex synchronizes access to noExecuteNodes 50 | noExecuteNodesMutex sync.RWMutex 51 | 52 | // noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources. 53 | // A resource may be unschedulable either because: 54 | // (a) the Node is cordoned (node.Spec.Unschedulable is true) or 55 | // (b) the Node has been marked as NotReady by Kubernetes or 56 | // (c) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource. 57 | noScheduleNodes = make(map[string]v1.ResourceList) 58 | // noScheduleNodesMutex synchronizes access to noScheduleNodes 59 | noScheduleNodesMutex sync.RWMutex 60 | ) 61 | 62 | // permission to watch nodes 63 | //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch 64 | 65 | func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 66 | node := &v1.Node{} 67 | if err := r.Get(ctx, req.NamespacedName, node); err != nil { 68 | if errors.IsNotFound(err) { 69 | r.updateForNodeDeletion(ctx, req.Name) 70 | return ctrl.Result{}, nil 71 | } 72 | return ctrl.Result{}, err 73 | } 74 | 75 | if node.DeletionTimestamp.IsZero() { 76 | r.updateNoExecuteNodes(ctx, node) 77 | r.updateNoScheduleNodes(ctx, node) 78 | } else { 79 | r.updateForNodeDeletion(ctx, req.Name) 80 | } 81 | 82 | return ctrl.Result{}, nil 83 | } 84 | 85 | // update noExecuteNodes and noScheduleNodes for the deletion of nodeName 86 | func (r *NodeHealthMonitor) updateForNodeDeletion(ctx context.Context, nodeName string) { 87 | if _, ok := noExecuteNodes[nodeName]; ok { 88 | noExecuteNodesMutex.Lock() // BEGIN CRITICAL SECTION 89 | delete(noExecuteNodes, nodeName) 90 | noExecuteNodesMutex.Unlock() // END 
CRITICAL SECTION 91 | log.FromContext(ctx).Info("Updated NoExecute information due to Node deletion", 92 | "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes) 93 | } 94 | if _, ok := noScheduleNodes[nodeName]; ok { 95 | noScheduleNodesMutex.Lock() // BEGIN CRITICAL SECTION 96 | delete(noScheduleNodes, nodeName) 97 | noScheduleNodesMutex.Unlock() // END CRITICAL SECTION 98 | log.FromContext(ctx).Info("Updated NoSchedule information due to Node deletion", 99 | "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes) 100 | } 101 | } 102 | 103 | // update noExecuteNodes entry for node 104 | func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.Node) { 105 | noExecuteResources := make(sets.Set[string]) 106 | for key, value := range node.GetLabels() { 107 | for resourceName, taints := range r.Config.Autopilot.ResourceTaints { 108 | for _, taint := range taints { 109 | if key == taint.Key && value == taint.Value && taint.Effect == v1.TaintEffectNoExecute { 110 | noExecuteResources.Insert(resourceName) 111 | } 112 | } 113 | } 114 | } 115 | 116 | noExecuteNodesChanged := false 117 | noExecuteNodesMutex.Lock() // BEGIN CRITICAL SECTION 118 | if priorEntry, ok := noExecuteNodes[node.GetName()]; ok { 119 | if len(noExecuteResources) == 0 { 120 | delete(noExecuteNodes, node.GetName()) 121 | noExecuteNodesChanged = true 122 | } else if !priorEntry.Equal(noExecuteResources) { 123 | noExecuteNodes[node.GetName()] = noExecuteResources 124 | noExecuteNodesChanged = true 125 | } 126 | } else if len(noExecuteResources) > 0 { 127 | noExecuteNodes[node.GetName()] = noExecuteResources 128 | noExecuteNodesChanged = true 129 | } 130 | noExecuteNodesMutex.Unlock() // END CRITICAL SECTION 131 | 132 | if noExecuteNodesChanged { 133 | log.FromContext(ctx).Info("Updated NoExecute information", "Number NoExecute Nodes", len(noExecuteNodes), "NoExecute Resource Details", noExecuteNodes) 134 | 
} 135 | } 136 | 137 | // update noScheduleNodes entry for node 138 | func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) { 139 | var noScheduleResources v1.ResourceList 140 | if r.nodeIsUnscheduable(node) { 141 | noScheduleResources = node.Status.Capacity.DeepCopy() 142 | delete(noScheduleResources, v1.ResourcePods) 143 | } else { 144 | noScheduleResources = make(v1.ResourceList) 145 | for key, value := range node.GetLabels() { 146 | for resourceName, taints := range r.Config.Autopilot.ResourceTaints { 147 | for _, taint := range taints { 148 | if taint.Effect == v1.TaintEffectNoExecute || taint.Effect == v1.TaintEffectNoSchedule { 149 | if key == taint.Key && value == taint.Value { 150 | quantity := node.Status.Capacity.Name(v1.ResourceName(resourceName), resource.DecimalSI) 151 | if !quantity.IsZero() { 152 | noScheduleResources[v1.ResourceName(resourceName)] = *quantity 153 | } 154 | } 155 | } 156 | } 157 | } 158 | } 159 | } 160 | 161 | noScheduleNodesChanged := false 162 | noScheduleNodesMutex.Lock() // BEGIN CRITICAL SECTION 163 | if priorEntry, ok := noScheduleNodes[node.GetName()]; ok { 164 | if len(noScheduleResources) == 0 { 165 | delete(noScheduleNodes, node.GetName()) 166 | noScheduleNodesChanged = true 167 | } else if !maps.Equal(priorEntry, noScheduleResources) { 168 | noScheduleNodes[node.GetName()] = noScheduleResources 169 | noScheduleNodesChanged = true 170 | } 171 | } else if len(noScheduleResources) > 0 { 172 | noScheduleNodes[node.GetName()] = noScheduleResources 173 | noScheduleNodesChanged = true 174 | } 175 | noScheduleNodesMutex.Unlock() // END CRITICAL SECTION 176 | 177 | if noScheduleNodesChanged { 178 | log.FromContext(ctx).Info("Updated NoSchedule information", "Number NoSchedule Nodes", len(noScheduleNodes), "NoSchedule Resource Details", noScheduleNodes) 179 | } 180 | } 181 | 182 | func (r *NodeHealthMonitor) nodeIsUnscheduable(node *v1.Node) bool { 183 | if node.Spec.Unschedulable { 184 | return true 
185 | } 186 | for _, taint := range node.Spec.Taints { 187 | if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" { 188 | return true 189 | } 190 | } 191 | return false 192 | } 193 | 194 | // SetupWithManager sets up the controller with the Manager. 195 | func (r *NodeHealthMonitor) SetupWithManager(mgr ctrl.Manager) error { 196 | return ctrl.NewControllerManagedBy(mgr). 197 | Watches(&v1.Node{}, &handler.EnqueueRequestForObject{}). 198 | Named("NodeMonitor"). 199 | Complete(r) 200 | } 201 | -------------------------------------------------------------------------------- /internal/controller/appwrapper/node_health_monitor_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package appwrapper 18 | 19 | import ( 20 | v1 "k8s.io/api/core/v1" 21 | "k8s.io/apimachinery/pkg/api/resource" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | "k8s.io/apimachinery/pkg/types" 24 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 25 | 26 | "github.com/project-codeflare/appwrapper/pkg/config" 27 | 28 | . "github.com/onsi/ginkgo/v2" 29 | . 
"github.com/onsi/gomega" 30 | ) 31 | 32 | var _ = Describe("NodeMonitor Controller", func() { 33 | var node1Name = types.NamespacedName{Name: "fake-node-1"} 34 | var node2Name = types.NamespacedName{Name: "fake-node-2"} 35 | var nodeMonitor *NodeHealthMonitor 36 | nodeGPUs := v1.ResourceList{v1.ResourceName("nvidia.com/gpu"): resource.MustParse("4")} 37 | 38 | createNode := func(nodeName string) { 39 | node := &v1.Node{ 40 | TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Node"}, 41 | ObjectMeta: metav1.ObjectMeta{Name: nodeName, Labels: map[string]string{"key1": "value1"}}, 42 | } 43 | Expect(k8sClient.Create(ctx, node)).To(Succeed()) 44 | node = getNode(nodeName) 45 | node.Status.Capacity = nodeGPUs 46 | node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ 47 | Type: v1.NodeReady, 48 | Status: v1.ConditionTrue, 49 | }) 50 | Expect(k8sClient.Status().Update(ctx, node)).To(Succeed()) 51 | node.Spec.Taints = []v1.Taint{} 52 | Expect(k8sClient.Update(ctx, node)).To(Succeed()) 53 | } 54 | 55 | deleteNode := func(nodeName string) { 56 | Expect(k8sClient.Delete(ctx, &v1.Node{ 57 | TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Node"}, 58 | ObjectMeta: metav1.ObjectMeta{Name: nodeName}, 59 | })).To(Succeed()) 60 | } 61 | 62 | BeforeEach(func() { 63 | // Create reconcillers 64 | awConfig := config.NewAppWrapperConfig() 65 | nodeMonitor = &NodeHealthMonitor{ 66 | Client: k8sClient, 67 | Config: awConfig, 68 | } 69 | }) 70 | 71 | AfterEach(func() { 72 | nodeMonitor = nil 73 | }) 74 | 75 | It("Autopilot Monitoring", func() { 76 | createNode(node1Name.Name) 77 | createNode(node2Name.Name) 78 | 79 | _, err := nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 80 | Expect(err).NotTo(HaveOccurred()) 81 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 82 | Expect(err).NotTo(HaveOccurred()) 83 | 84 | By("Healthy cluster has no unhealthy nodes") 85 | Expect(noExecuteNodes).Should(BeEmpty()) 86 | 87 
| By("A node labeled EVICT is detected as unhealthy") 88 | node := getNode(node1Name.Name) 89 | node.Labels["autopilot.ibm.com/gpuhealth"] = "EVICT" 90 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 91 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 92 | Expect(err).NotTo(HaveOccurred()) 93 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 94 | Expect(err).NotTo(HaveOccurred()) 95 | Expect(noExecuteNodes).Should(HaveLen(1)) 96 | Expect(noExecuteNodes).Should(HaveKey(node1Name.Name)) 97 | Expect(noExecuteNodes[node1Name.Name]).Should(HaveKey("nvidia.com/gpu")) 98 | 99 | By("Repeated reconcile does not change map") 100 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 101 | Expect(err).NotTo(HaveOccurred()) 102 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 103 | Expect(err).NotTo(HaveOccurred()) 104 | Expect(noExecuteNodes).Should(HaveLen(1)) 105 | Expect(noExecuteNodes).Should(HaveKey(node1Name.Name)) 106 | Expect(noExecuteNodes[node1Name.Name]).Should(HaveKey("nvidia.com/gpu")) 107 | 108 | By("Removing the EVICT label updates unhealthyNodes") 109 | node.Labels["autopilot.ibm.com/gpuhealth"] = "WARN" 110 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 111 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 112 | Expect(err).NotTo(HaveOccurred()) 113 | Expect(noExecuteNodes).Should(BeEmpty()) 114 | 115 | By("A Node tainted as unreachable is detected as unscheduable") 116 | node = getNode(node1Name.Name) 117 | node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/unreachable", Effect: v1.TaintEffectNoExecute}) 118 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 119 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 120 | Expect(err).NotTo(HaveOccurred()) 121 | _, err = nodeMonitor.Reconcile(ctx, 
reconcile.Request{NamespacedName: node2Name}) 122 | Expect(err).NotTo(HaveOccurred()) 123 | Expect(noScheduleNodes).Should(HaveLen(1)) 124 | Expect(noScheduleNodes).Should(HaveKey(node1Name.Name)) 125 | Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu"))) 126 | 127 | By("Repeated reconcile does not change map") 128 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 129 | Expect(err).NotTo(HaveOccurred()) 130 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 131 | Expect(err).NotTo(HaveOccurred()) 132 | Expect(noScheduleNodes).Should(HaveLen(1)) 133 | Expect(noScheduleNodes).Should(HaveKey(node1Name.Name)) 134 | Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu"))) 135 | 136 | By("Removing the taint updates unhealthyNodes") 137 | node.Spec.Taints = []v1.Taint{} 138 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 139 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 140 | Expect(err).NotTo(HaveOccurred()) 141 | Expect(noScheduleNodes).Should(BeEmpty()) 142 | 143 | By("A Node tainted as not-read is detected as unscheduable") 144 | node = getNode(node1Name.Name) 145 | node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/not-ready", Effect: v1.TaintEffectNoExecute}) 146 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 147 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 148 | Expect(err).NotTo(HaveOccurred()) 149 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 150 | Expect(err).NotTo(HaveOccurred()) 151 | Expect(noScheduleNodes).Should(HaveLen(1)) 152 | Expect(noScheduleNodes).Should(HaveKey(node1Name.Name)) 153 | Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu"))) 154 | 155 | By("Repeated reconcile does not change map") 156 | _, err = 
nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 157 | Expect(err).NotTo(HaveOccurred()) 158 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) 159 | Expect(err).NotTo(HaveOccurred()) 160 | Expect(noScheduleNodes).Should(HaveLen(1)) 161 | Expect(noScheduleNodes).Should(HaveKey(node1Name.Name)) 162 | Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu"))) 163 | 164 | By("Removing the taint updates unhealthyNodes") 165 | node.Spec.Taints = []v1.Taint{} 166 | Expect(k8sClient.Update(ctx, node)).Should(Succeed()) 167 | _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) 168 | Expect(err).NotTo(HaveOccurred()) 169 | Expect(noScheduleNodes).Should(BeEmpty()) 170 | 171 | deleteNode(node1Name.Name) 172 | deleteNode(node2Name.Name) 173 | }) 174 | }) 175 | -------------------------------------------------------------------------------- /internal/controller/appwrapper/suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package appwrapper 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "path/filepath" 23 | "runtime" 24 | "testing" 25 | 26 | admissionv1 "k8s.io/api/admission/v1" 27 | rbacv1 "k8s.io/api/rbac/v1" 28 | apimachineryruntime "k8s.io/apimachinery/pkg/runtime" 29 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 30 | "k8s.io/client-go/rest" 31 | "sigs.k8s.io/controller-runtime/pkg/client" 32 | "sigs.k8s.io/controller-runtime/pkg/envtest" 33 | logf "sigs.k8s.io/controller-runtime/pkg/log" 34 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 35 | 36 | awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" 37 | 38 | . "github.com/onsi/ginkgo/v2" 39 | . "github.com/onsi/gomega" 40 | ) 41 | 42 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 43 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 44 | 45 | var cfg *rest.Config 46 | var k8sClient client.Client 47 | var testEnv *envtest.Environment 48 | var ctx context.Context 49 | var cancel context.CancelFunc 50 | 51 | func TestControllers(t *testing.T) { 52 | RegisterFailHandler(Fail) 53 | 54 | RunSpecs(t, "Controller Unit Tests") 55 | } 56 | 57 | var _ = BeforeSuite(func() { 58 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 59 | 60 | ctx, cancel = context.WithCancel(context.Background()) 61 | 62 | By("bootstrapping test environment") 63 | testEnv = &envtest.Environment{ 64 | CRDDirectoryPaths: []string{ 65 | filepath.Join("..", "..", "..", "config", "crd", "bases"), 66 | }, 67 | ErrorIfCRDPathMissing: true, 68 | 69 | // The BinaryAssetsDirectory is only required if you want to run the tests directly 70 | // without calling the makefile target test. If it is not set, envtest will look for the 71 | // default path defined in controller-runtime which is /usr/local/kubebuilder/. 72 | // Note that you must have the required binaries set up under the bin directory to perform 73 | // the tests directly.
When we run make test it will be setup and used automatically. 74 | BinaryAssetsDirectory: filepath.Join("..", "..", "..", "bin", "k8s", 75 | fmt.Sprintf("1.29.0-%s-%s", runtime.GOOS, runtime.GOARCH)), 76 | } 77 | 78 | var err error 79 | // cfg is defined in this file globally. 80 | cfg, err = testEnv.Start() 81 | Expect(err).NotTo(HaveOccurred()) 82 | Expect(cfg).NotTo(BeNil()) 83 | 84 | scheme := apimachineryruntime.NewScheme() 85 | err = awv1beta2.AddToScheme(scheme) 86 | Expect(err).NotTo(HaveOccurred()) 87 | 88 | err = admissionv1.AddToScheme(scheme) 89 | Expect(err).NotTo(HaveOccurred()) 90 | err = rbacv1.AddToScheme(scheme) 91 | Expect(err).NotTo(HaveOccurred()) 92 | err = clientgoscheme.AddToScheme(scheme) 93 | Expect(err).NotTo(HaveOccurred()) 94 | 95 | //+kubebuilder:scaffold:scheme 96 | 97 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) 98 | Expect(err).NotTo(HaveOccurred()) 99 | Expect(k8sClient).NotTo(BeNil()) 100 | }) 101 | 102 | var _ = AfterSuite(func() { 103 | cancel() 104 | By("tearing down the test environment") 105 | err := testEnv.Stop() 106 | Expect(err).NotTo(HaveOccurred()) 107 | }) 108 | -------------------------------------------------------------------------------- /internal/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package metrics 18 | 19 | import ( 20 | "github.com/prometheus/client_golang/prometheus" 21 | "sigs.k8s.io/controller-runtime/pkg/metrics" 22 | ) 23 | 24 | var ( 25 | AppWrapperPhaseCounter = prometheus.NewCounterVec( 26 | prometheus.CounterOpts{ 27 | Name: "appwrapper_phase_total", 28 | Help: `The total number of times an appwrapper transitioned to a given phase per namespace.`, 29 | }, []string{"namespace", "phase"}, 30 | ) 31 | ) 32 | 33 | func Register() { 34 | metrics.Registry.MustRegister(AppWrapperPhaseCounter) 35 | } 36 | -------------------------------------------------------------------------------- /internal/tools/pinversion.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package tools 18 | 19 | // Keep a reference to the code generators so they are not removed by go mod tidy 20 | import ( 21 | // since verify will error when referencing a cmd package 22 | // we need to reference individual dependencies used by it 23 | _ "github.com/golangci/golangci-lint/pkg/exitcodes" 24 | _ "sigs.k8s.io/controller-tools/pkg/crd" 25 | _ "sigs.k8s.io/controller-tools/pkg/genall/help/pretty" 26 | _ "sigs.k8s.io/kustomize/kustomize/v5/commands/edit/listbuiltin" 27 | ) 28 | -------------------------------------------------------------------------------- /internal/util/maps.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | /* 18 | This file contains a copy of the two map utility functions from 19 | https://github.com/kubernetes-sigs/kueue/blob/main/pkg/util/maps/maps.go 20 | that are used by the AppWrapper controller. 21 | 22 | We "vendor" the used functions to eliminate our Go dependency on Kueue. 23 | This simplifies bundling AppWrapper in the codeflare-operator in Red Hat OpenShift AI.
24 | */ 25 | 26 | package maps 27 | 28 | import ( 29 | "fmt" 30 | "maps" 31 | ) 32 | 33 | // merge merges a and b while resolving the conflicts by calling commonKeyValue 34 | func merge[K comparable, V any, S ~map[K]V](a, b S, commonKeyValue func(a, b V) V) S { 35 | if a == nil { 36 | return maps.Clone(b) 37 | } 38 | 39 | ret := maps.Clone(a) 40 | 41 | for k, v := range b { 42 | if _, found := a[k]; found { 43 | ret[k] = commonKeyValue(a[k], v) 44 | } else { 45 | ret[k] = v 46 | } 47 | } 48 | return ret 49 | } 50 | 51 | // MergeKeepFirst merges a and b keeping the values in a in case of conflict 52 | func MergeKeepFirst[K comparable, V any, S ~map[K]V](a, b S) S { 53 | return merge(a, b, func(v, _ V) V { return v }) 54 | } 55 | 56 | // HaveConflict checks if a and b have the same key, but different value 57 | func HaveConflict[K comparable, V comparable, S ~map[K]V](a, b S) error { 58 | for k, av := range a { 59 | if bv, found := b[k]; found && av != bv { 60 | return fmt.Errorf("conflict for key=%v, value1=%v, value2=%v", k, av, bv) 61 | } 62 | } 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /internal/webhook/appwrapper_webhook_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// Specs for the AppWrapper admission webhooks. Every spec builds an AppWrapper
// with the toAppWrapper helper and submits it through k8sClient or
// k8sLimitedClient (the impersonating client), then asserts on whether the
// request succeeds and on the labels of the returned object.
var _ = Describe("AppWrapper Webhook Tests", func() {

	// Label defaulting performed on create.
	Context("Defaulting Webhook", func() {
		It("Default queue name is set", func() {
			aw := toAppWrapper(pod(100))

			Expect(k8sClient.Create(ctx, aw)).To(Succeed())
			Expect(aw.Labels[QueueNameLabel]).Should(BeIdenticalTo(defaultQueueName), "aw should be labeled with the default queue name")
			Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
		})

		It("Provided queue name is not overridden by default queue name", func() {
			aw := toAppWrapper(pod(100))
			// Pre-populate the queue label; MergeKeepFirst keeps this value over any existing one.
			aw.Labels = utilmaps.MergeKeepFirst(map[string]string{QueueNameLabel: userProvidedQueueName}, aw.Labels)

			Expect(k8sClient.Create(ctx, aw)).To(Succeed())
			Expect(aw.Labels[QueueNameLabel]).Should(BeIdenticalTo(userProvidedQueueName), "queue name should not be overridden")
			Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
		})

		It("User name and ID are set", func() {
			aw := toAppWrapper(pod(100))
			// Submit bogus user labels; after create they must hold the limited user's name and ID instead.
			aw.Labels = utilmaps.MergeKeepFirst(map[string]string{AppWrapperUsernameLabel: "bad", AppWrapperUserIDLabel: "bad"}, aw.Labels)

			Expect(k8sLimitedClient.Create(ctx, aw)).To(Succeed())
			Expect(aw.Labels[AppWrapperUsernameLabel]).Should(BeIdenticalTo(limitedUserName))
			Expect(aw.Labels[AppWrapperUserIDLabel]).Should(BeIdenticalTo(limitedUserID))
			Expect(k8sLimitedClient.Delete(ctx, aw)).To(Succeed())
		})
	})

	// Rejection and acceptance rules enforced on create and update.
	Context("Validating Webhook", func() {
		Context("Structural Invariants", func() {
			It("There must be at least one podspec (a)", func() {
				// No components at all.
				aw := toAppWrapper()
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})

			It("There must be at least one podspec (b)", func() {
				// A single component built by service(); presumably it declares no pod sets.
				aw := toAppWrapper(service())
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})

			It("There must be no more than 8 podspecs", func() {
				// Nine pod components: one more than the limit named in the spec title.
				aw := toAppWrapper(pod(100), pod(100), pod(100), pod(100), pod(100), pod(100), pod(100), pod(100), pod(100))
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})

			It("Non-existent PodSpec paths are rejected", func() {
				comp := deployment(4, 100)
				comp.DeclaredPodSets[0].Path = "template.spec.missing"
				aw := toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				// An empty path must be rejected as well.
				comp.DeclaredPodSets[0].Path = ""
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})

			It("PodSpec paths must refer to a PodSpecTemplate", func() {
				comp := deployment(4, 100)
				comp.DeclaredPodSets[0].Path = "template.spec.template.metadata"
				aw := toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})

			It("Validation of Array and Map path elements", func() {
				// Each path variant below is applied to the same jobSet component
				// and must cause the create request to fail.
				comp := jobSet(2, 100)
				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs.template.spec.template"
				aw := toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[0].template[0].spec.template"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[10].template.spec.template"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[-1].template.spec.template"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[a10].template.spec.template"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[1"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())

				comp.DeclaredPodSets[0].Path = "template.spec.replicatedJobs[1]].template.spec.template"
				aw = toAppWrapper(comp)
				Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
			})
		})

		It("Components in other namespaces are rejected", func() {
			aw := toAppWrapper(namespacedPod("test", 100))
			Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
		})

		It("Nested AppWrappers are rejected", func() {
			// Serialize one AppWrapper and embed it as the raw template of a component of another.
			child := toAppWrapper(pod(100))
			childBytes, err := json.Marshal(child)
			Expect(err).ShouldNot(HaveOccurred())
			aw := toAppWrapper(pod(100), awv1beta2.AppWrapperComponent{
				DeclaredPodSets: []awv1beta2.AppWrapperPodSet{},
				Template:        runtime.RawExtension{Raw: childBytes},
			})
			Expect(k8sClient.Create(ctx, aw)).ShouldNot(Succeed())
		})

		It("User name and ID are immutable", func() {
			aw := toAppWrapper(pod(100))
			awName := types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace}
			Expect(k8sClient.Create(ctx, aw)).Should(Succeed())

			// Re-fetch before each attempt so only one label differs per update.
			aw = getAppWrapper(awName)
			aw.Labels[AppWrapperUsernameLabel] = "bad"
			Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

			aw = getAppWrapper(awName)
			aw.Labels[AppWrapperUserIDLabel] = "bad"
			Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

			Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
		})

		It("User name and ID should be preserved on updates", func() {
			aw := toAppWrapper(pod(100))
			awName := types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace}
			// Created by the limited user ...
			Expect(k8sLimitedClient.Create(ctx, aw)).Should(Succeed())

			// ... then updated, unchanged, by the unrestricted client.
			aw = getAppWrapper(awName)
			Expect(k8sClient.Update(ctx, aw)).Should(Succeed())

			// The labels must still identify the creating (limited) user.
			aw = getAppWrapper(awName)
			Expect(aw.Labels[AppWrapperUsernameLabel]).Should(BeIdenticalTo(limitedUserName))
			Expect(aw.Labels[AppWrapperUserIDLabel]).Should(BeIdenticalTo(limitedUserID))
			Expect(k8sLimitedClient.Delete(ctx, aw)).To(Succeed())
		})

		Context("aw.Spec.Components is immutable", func() {
			It("Updates to non-sensitive fields are allowed", func() {
				aw := toAppWrapper(pod(100), deployment(4, 100))
				awName := types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace}
				Expect(k8sClient.Create(ctx, aw)).Should(Succeed())

				aw = getAppWrapper(awName)
				aw.Spec.Suspend = true
				Expect(k8sClient.Update(ctx, aw)).Should(Succeed())

				aw = getAppWrapper(awName)
				aw.Spec.Components[1].PodSetInfos = make([]awv1beta2.AppWrapperPodSetInfo, 1)
				Expect(k8sClient.Update(ctx, aw)).Should(Succeed())
			})

			It("Updates to sensitive fields are rejected", func() {
				aw := toAppWrapper(pod(100), deployment(4, 100))
				awName := types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace}
				Expect(k8sClient.Create(ctx, aw)).Should(Succeed())

				// Replacing a component's template.
				aw = getAppWrapper(awName)
				aw.Spec.Components[0].Template = aw.Spec.Components[1].Template
				Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

				// Appending a component.
				aw = getAppWrapper(awName)
				aw.Spec.Components = append(aw.Spec.Components, aw.Spec.Components[0])
				Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

				// Appending declared pod sets.
				aw = getAppWrapper(awName)
				aw.Spec.Components[0].DeclaredPodSets = append(aw.Spec.Components[0].DeclaredPodSets, aw.Spec.Components[0].DeclaredPodSets...)
				Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

				// Changing a declared pod set's path.
				aw = getAppWrapper(awName)
				aw.Spec.Components[0].DeclaredPodSets[0].Path = "bad"
				Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())

				// Changing a declared pod set's replica count.
				aw = getAppWrapper(awName)
				aw.Spec.Components[0].DeclaredPodSets[0].Replicas = ptr.To(int32(12))
				Expect(k8sClient.Update(ctx, aw)).ShouldNot(Succeed())
			})
		})

		Context("RBAC is enforced for wrapped resouces", func() {
			It("AppWrapper containing permitted resources can be created", func() {
				aw := toAppWrapper(pod(100))
				Expect(k8sLimitedClient.Create(ctx, aw)).To(Succeed(), "Limited user should be allowed to create AppWrapper containing Pods")
				Expect(k8sLimitedClient.Delete(ctx, aw)).To(Succeed())
			})

			It("AppWrapper containing unpermitted resources cannot be created", func() {
				aw := toAppWrapper(deployment(4, 100))
				Expect(k8sLimitedClient.Create(ctx, aw)).NotTo(Succeed(), "Limited user should not be allowed to create AppWrapper containing Deployments")
			})
		})

		It("Well-formed AppWrappers are accepted", func() {
			aw := toAppWrapper(pod(100), deployment(1, 100), namespacedPod("default", 100), rayCluster(1, 100), jobSet(1, 100))

			Expect(k8sClient.Create(ctx, aw)).To(Succeed(), "Legal AppWrappers should be accepted")
			Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
		})

		// Components built by the *ForInference helpers; presumably they omit
		// DeclaredPodSets so the webhook has to infer them — TODO confirm against fixtures.
		Context("PodSets are inferred for known GVKs", func() {
			It("PodSets are inferred for common kinds", func() {
				aw := toAppWrapper(pod(100), deploymentForInference(1, 100), podForInference(100),
					jobForInference(2, 4, 100), jobForInference(8, 4, 100))

				Expect(k8sClient.Create(ctx, aw)).To(Succeed(), "PodSets should be inferred")
				Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
			})

			It("PodSets are inferred for PyTorchJobs, RayClusters, and RayJobs", func() {
				aw := toAppWrapper(pytorchJobForInference(100, 4, 100), rayClusterForInference(7, 100), rayJobForInference(7, 100))

				Expect(k8sClient.Create(ctx, aw)).To(Succeed(), "PodSets should be inferred")
				Expect(k8sClient.Delete(ctx, aw)).To(Succeed())
			})
		})
	})

})
15 | */ 16 | 17 | package webhook 18 | 19 | import ( 20 | "context" 21 | "crypto/tls" 22 | "fmt" 23 | "net" 24 | "path/filepath" 25 | "runtime" 26 | "testing" 27 | "time" 28 | 29 | admissionv1 "k8s.io/api/admission/v1" 30 | rbacv1 "k8s.io/api/rbac/v1" 31 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 | apimachineryruntime "k8s.io/apimachinery/pkg/runtime" 33 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 34 | "k8s.io/client-go/rest" 35 | ctrl "sigs.k8s.io/controller-runtime" 36 | "sigs.k8s.io/controller-runtime/pkg/client" 37 | "sigs.k8s.io/controller-runtime/pkg/envtest" 38 | logf "sigs.k8s.io/controller-runtime/pkg/log" 39 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 40 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 41 | "sigs.k8s.io/controller-runtime/pkg/webhook" 42 | 43 | awv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" 44 | "github.com/project-codeflare/appwrapper/pkg/config" 45 | 46 | . "github.com/onsi/ginkgo/v2" 47 | . "github.com/onsi/gomega" 48 | ) 49 | 50 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 51 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 
52 | 53 | var cfg *rest.Config 54 | var k8sClient client.Client 55 | var k8sLimitedClient client.Client 56 | var testEnv *envtest.Environment 57 | var ctx context.Context 58 | var cancel context.CancelFunc 59 | 60 | const limitedUserName = "limited-user" 61 | const limitedUserID = "8da0fcfe-6d7f-4f44-b433-d91d22cc1b8c" 62 | const defaultQueueName = "system-default-queue" 63 | const userProvidedQueueName = "user-provided-queue" 64 | 65 | func TestWebhooks(t *testing.T) { 66 | RegisterFailHandler(Fail) 67 | 68 | RunSpecs(t, "Webhook Unit Tests") 69 | } 70 | 71 | var _ = BeforeSuite(func() { 72 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 73 | 74 | ctx, cancel = context.WithCancel(context.Background()) 75 | 76 | By("bootstrapping test environment") 77 | testEnv = &envtest.Environment{ 78 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, 79 | ErrorIfCRDPathMissing: false, 80 | 81 | // The BinaryAssetsDirectory is only required if you want to run the tests directly 82 | // without call the makefile target test. If not informed it will look for the 83 | // default path defined in controller-runtime which is /usr/local/kubebuilder/. 84 | // Note that you must have the required binaries setup under the bin directory to perform 85 | // the tests directly. When we run make test it will be setup and used automatically. 86 | BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", 87 | fmt.Sprintf("1.29.0-%s-%s", runtime.GOOS, runtime.GOARCH)), 88 | 89 | WebhookInstallOptions: envtest.WebhookInstallOptions{ 90 | Paths: []string{filepath.Join("..", "..", "config", "webhook")}, 91 | }, 92 | } 93 | 94 | var err error 95 | // cfg is defined in this file globally. 
96 | cfg, err = testEnv.Start() 97 | Expect(err).NotTo(HaveOccurred()) 98 | Expect(cfg).NotTo(BeNil()) 99 | 100 | scheme := apimachineryruntime.NewScheme() 101 | err = awv1beta2.AddToScheme(scheme) 102 | Expect(err).NotTo(HaveOccurred()) 103 | 104 | err = admissionv1.AddToScheme(scheme) 105 | Expect(err).NotTo(HaveOccurred()) 106 | err = rbacv1.AddToScheme(scheme) 107 | Expect(err).NotTo(HaveOccurred()) 108 | err = clientgoscheme.AddToScheme(scheme) 109 | Expect(err).NotTo(HaveOccurred()) 110 | 111 | //+kubebuilder:scaffold:scheme 112 | 113 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) 114 | Expect(err).NotTo(HaveOccurred()) 115 | Expect(k8sClient).NotTo(BeNil()) 116 | 117 | // configure a restricted rbac user who can create AppWrappers and Pods but not Deployments 118 | limitedCfg := *cfg 119 | limitedCfg.Impersonate = rest.ImpersonationConfig{UserName: limitedUserName, UID: string(limitedUserID), Extra: map[string][]string{"xyzzy": {"plugh"}}} 120 | _, err = testEnv.AddUser(envtest.User{Name: limitedUserName, Groups: []string{}}, &limitedCfg) 121 | Expect(err).NotTo(HaveOccurred()) 122 | clusterRole := &rbacv1.ClusterRole{ 123 | ObjectMeta: metav1.ObjectMeta{Name: "limited-role"}, 124 | Rules: []rbacv1.PolicyRule{ 125 | {Verbs: []string{"*"}, APIGroups: []string{"workload.codeflare.dev"}, Resources: []string{"appwrappers"}}, 126 | {Verbs: []string{"*"}, APIGroups: []string{""}, Resources: []string{"pods"}}, 127 | {Verbs: []string{"get"}, APIGroups: []string{"apps"}, Resources: []string{"deployments"}}, 128 | }, 129 | } 130 | err = k8sClient.Create(ctx, clusterRole) 131 | Expect(err).NotTo(HaveOccurred()) 132 | clusterRoleBinding := &rbacv1.ClusterRoleBinding{ 133 | ObjectMeta: metav1.ObjectMeta{Name: "limited-role-binding"}, 134 | Subjects: []rbacv1.Subject{{Kind: rbacv1.UserKind, Name: limitedUserName}}, 135 | RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "ClusterRole", Name: clusterRole.Name}, 136 | } 137 | err = 
k8sClient.Create(ctx, clusterRoleBinding) 138 | Expect(err).NotTo(HaveOccurred()) 139 | 140 | k8sLimitedClient, err = client.New(&limitedCfg, client.Options{Scheme: scheme}) 141 | Expect(err).NotTo(HaveOccurred()) 142 | 143 | // start webhook server using Manager 144 | webhookInstallOptions := &testEnv.WebhookInstallOptions 145 | mgr, err := ctrl.NewManager(cfg, ctrl.Options{ 146 | Scheme: scheme, 147 | WebhookServer: webhook.NewServer(webhook.Options{ 148 | Host: webhookInstallOptions.LocalServingHost, 149 | Port: webhookInstallOptions.LocalServingPort, 150 | CertDir: webhookInstallOptions.LocalServingCertDir, 151 | }), 152 | LeaderElection: false, 153 | Metrics: metricsserver.Options{BindAddress: "0"}, 154 | }) 155 | Expect(err).NotTo(HaveOccurred()) 156 | 157 | conf := config.NewAppWrapperConfig() 158 | conf.DefaultQueueName = defaultQueueName // add default queue name 159 | err = SetupAppWrapperWebhook(mgr, conf) 160 | Expect(err).NotTo(HaveOccurred()) 161 | 162 | //+kubebuilder:scaffold:webhook 163 | 164 | go func() { 165 | defer GinkgoRecover() 166 | err = mgr.Start(ctx) 167 | Expect(err).NotTo(HaveOccurred()) 168 | }() 169 | 170 | // wait for the webhook server to get ready 171 | dialer := &net.Dialer{Timeout: time.Second} 172 | addrPort := fmt.Sprintf("%s:%d", webhookInstallOptions.LocalServingHost, webhookInstallOptions.LocalServingPort) 173 | Eventually(func() error { 174 | conn, err := tls.DialWithDialer(dialer, "tcp", addrPort, &tls.Config{InsecureSkipVerify: true}) 175 | if err != nil { 176 | return err 177 | } 178 | return conn.Close() 179 | }).Should(Succeed()) 180 | }) 181 | 182 | var _ = AfterSuite(func() { 183 | cancel() 184 | By("tearing down the test environment") 185 | err := testEnv.Stop() 186 | Expect(err).NotTo(HaveOccurred()) 187 | }) 188 | -------------------------------------------------------------------------------- /kube-state-metrics/README.md: -------------------------------------------------------------------------------- 1 | This 
directory contains configuration files for enabling 2 | [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/) 3 | to report metrics for AppWrapper. 4 | 5 | The file [appwrapper-ksm-cm.yaml](./appwrapper-ksm-cm.yaml) defines 6 | a configuration map that can be volume-mounted into the 7 | kube-state-metrics pod and passed via the `--custom-resource-state-config-file` 8 | command line argument. For development of the AppWrapper metrics, 9 | you may want to add `--custom-resource-state-only=true` to the command 10 | line arguments to suppress generation of metrics for built-in types. 11 | 12 | The file [appwrapper-ksm-rbac.yaml](./appwrapper-ksm-rbac.yaml) defines 13 | a clusterrole and clusterrolebinding that add the RBACs 14 | needed to collect AppWrapper metrics to the `kube-state-metrics` service account. 15 | Alternatively, you could edit the existing kube-state-metrics clusterrole to 16 | add these permissions. 17 | 18 | The changes to the kube-state-metrics deployment are roughly as shown below: 19 | ```yaml 20 | ... 21 | spec: 22 | containers: 23 | - args: 24 | - --custom-resource-state-config-file=/appwrapper_ksm.yaml 25 | ... 26 | volumeMounts: 27 | - mountPath: /appwrapper_ksm.yaml 28 | name: appwrapper-ksm 29 | readOnly: true 30 | subPath: appwrapper_ksm.yaml 31 | ... 
32 | volumes: 33 | - configMap: 34 | defaultMode: 420 35 | name: appwrapper-ksm 36 | name: appwrapper-ksm 37 | ``` 38 | -------------------------------------------------------------------------------- /kube-state-metrics/appwrapper-ksm-cm.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: appwrapper-ksm 5 | namespace: kube-system 6 | data: 7 | appwrapper_ksm.yaml: | 8 | kind: CustomResourceStateMetrics 9 | spec: 10 | resources: 11 | - groupVersionKind: 12 | group: workload.codeflare.dev 13 | kind: "AppWrapper" 14 | version: "v1beta2" 15 | commonLabels: 16 | crd_type: appwrapper 17 | labelsFromPath: 18 | namespace: [metadata, namespace] 19 | metrics: 20 | - name: "status_phase" 21 | help: "AppWrapper status_phase" 22 | each: 23 | type: StateSet 24 | stateSet: 25 | labelName: phase 26 | path: [status, phase] 27 | list: [Suspended, Resuming, Running, Resetting, Suspending, Succeeded, Failed, Terminating] 28 | - name: "retry_count" 29 | help: "AppWrapper status_retries" 30 | each: 31 | type: Gauge 32 | gauge: 33 | path: [status, retries] 34 | - name: "status_conditions" 35 | help: "AppWrapper status_conditions" 36 | each: 37 | type: Gauge 38 | gauge: 39 | path: [status, conditions] 40 | labelsFromPath: 41 | type: ["type"] 42 | valueFrom: ["status"] 43 | -------------------------------------------------------------------------------- /kube-state-metrics/appwrapper-ksm-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: kube-state-metrics-appwrapper 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: kube-state-metrics-appwrapper 9 | subjects: 10 | - kind: ServiceAccount 11 | name: kube-state-metrics 12 | namespace: kube-system 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRole 
16 | metadata: 17 | name: kube-state-metrics-appwrapper 18 | rules: 19 | - apiGroups: 20 | - apiextensions.k8s.io 21 | resources: 22 | - customresourcedefinitions 23 | verbs: 24 | - list 25 | - watch 26 | - apiGroups: 27 | - workload.codeflare.dev 28 | resources: 29 | - appwrappers 30 | verbs: 31 | - list 32 | - watch 33 | -------------------------------------------------------------------------------- /pkg/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package config 18 | 19 | import ( 20 | "fmt" 21 | "time" 22 | 23 | v1 "k8s.io/api/core/v1" 24 | "k8s.io/utils/ptr" 25 | ) 26 | 27 | type OperatorConfig struct { 28 | AppWrapper *AppWrapperConfig `json:"appwrapper,omitempty"` 29 | CertManagement *CertManagementConfig `json:"certManagement,omitempty"` 30 | ControllerManager *ControllerManagerConfig `json:"controllerManager,omitempty"` 31 | WebhooksEnabled *bool `json:"webhooksEnabled,omitempty"` 32 | } 33 | 34 | type AppWrapperConfig struct { 35 | Autopilot *AutopilotConfig `json:"autopilot,omitempty"` 36 | UserRBACAdmissionCheck bool `json:"userRBACAdmissionCheck,omitempty"` 37 | FaultTolerance *FaultToleranceConfig `json:"faultTolerance,omitempty"` 38 | SchedulerName string `json:"schedulerName,omitempty"` 39 | DefaultQueueName string `json:"defaultQueueName,omitempty"` 40 | } 41 | 42 | type AutopilotConfig struct { 43 | InjectAntiAffinities bool `json:"injectAntiAffinities,omitempty"` 44 | MonitorNodes bool `json:"monitorNodes,omitempty"` 45 | ResourceTaints map[string][]v1.Taint `json:"resourceTaints,omitempty"` 46 | PreferNoScheduleWeight *int32 `json:"preferNoScheduleWeight,omitempty"` 47 | } 48 | 49 | type FaultToleranceConfig struct { 50 | AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"` 51 | WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"` 52 | FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"` 53 | RetryPausePeriod time.Duration `json:"resetPause,omitempty"` 54 | RetryLimit int32 `json:"retryLimit,omitempty"` 55 | ForcefulDeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"` 56 | GracePeriodMaximum time.Duration `json:"gracePeriodCeiling,omitempty"` 57 | SuccessTTL time.Duration `json:"successTTLCeiling,omitempty"` 58 | } 59 | 60 | type CertManagementConfig struct { 61 | Namespace string `json:"namespace,omitempty"` 62 | CertificateDir string `json:"certificateDir,omitempty"` 63 | CertificateName string 
`json:"certificateName,omitempty"` 64 | CertificateOrg string `json:"certificateOrg,omitempty"` 65 | MutatingWebhookConfigName string `json:"mutatingWebhookConfigName,omitempty"` 66 | ValidatingWebhookConfigName string `json:"validatingWebhookConfigName,omitempty"` 67 | WebhookServiceName string `json:"webhookServiceName,omitempty"` 68 | WebhookSecretName string `json:"webhookSecretName,omitempty"` 69 | } 70 | 71 | type ControllerManagerConfig struct { 72 | Metrics MetricsConfiguration `json:"metrics,omitempty"` 73 | Health HealthConfiguration `json:"health,omitempty"` 74 | LeaderElection bool `json:"leaderElection,omitempty"` 75 | EnableHTTP2 bool `json:"enableHTTP2,omitempty"` 76 | } 77 | 78 | type MetricsConfiguration struct { 79 | BindAddress string `json:"bindAddress,omitempty"` 80 | } 81 | 82 | type HealthConfiguration struct { 83 | BindAddress string `json:"bindAddress,omitempty"` 84 | } 85 | 86 | // NewAppWrapperConfig constructs an AppWrapperConfig and fills in default values 87 | func NewAppWrapperConfig() *AppWrapperConfig { 88 | return &AppWrapperConfig{ 89 | Autopilot: &AutopilotConfig{ 90 | InjectAntiAffinities: true, 91 | MonitorNodes: true, 92 | ResourceTaints: map[string][]v1.Taint{ 93 | "nvidia.com/gpu": { 94 | {Key: "autopilot.ibm.com/gpuhealth", Value: "WARN", Effect: v1.TaintEffectPreferNoSchedule}, 95 | {Key: "autopilot.ibm.com/gpuhealth", Value: "TESTING", Effect: v1.TaintEffectNoSchedule}, 96 | {Key: "autopilot.ibm.com/gpuhealth", Value: "EVICT", Effect: v1.TaintEffectNoExecute}}, 97 | }, 98 | PreferNoScheduleWeight: ptr.To(int32(50)), 99 | }, 100 | UserRBACAdmissionCheck: true, 101 | FaultTolerance: &FaultToleranceConfig{ 102 | AdmissionGracePeriod: 1 * time.Minute, 103 | WarmupGracePeriod: 5 * time.Minute, 104 | FailureGracePeriod: 1 * time.Minute, 105 | RetryPausePeriod: 90 * time.Second, 106 | RetryLimit: 3, 107 | ForcefulDeletionGracePeriod: 10 * time.Minute, 108 | GracePeriodMaximum: 24 * time.Hour, 109 | SuccessTTL: 7 * 24 * 
time.Hour, 110 | }, 111 | } 112 | } 113 | 114 | func ValidateAppWrapperConfig(config *AppWrapperConfig) error { 115 | if config.FaultTolerance.ForcefulDeletionGracePeriod > config.FaultTolerance.GracePeriodMaximum { 116 | return fmt.Errorf("ForcefulDelectionGracePeriod %v exceeds GracePeriodCeiling %v", 117 | config.FaultTolerance.ForcefulDeletionGracePeriod, config.FaultTolerance.GracePeriodMaximum) 118 | } 119 | if config.FaultTolerance.RetryPausePeriod > config.FaultTolerance.GracePeriodMaximum { 120 | return fmt.Errorf("RetryPausePeriod %v exceeds GracePeriodCeiling %v", 121 | config.FaultTolerance.RetryPausePeriod, config.FaultTolerance.GracePeriodMaximum) 122 | } 123 | if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodMaximum { 124 | return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v", 125 | config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodMaximum) 126 | } 127 | if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodMaximum { 128 | return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v", 129 | config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum) 130 | } 131 | if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum { 132 | return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v", 133 | config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) 134 | } 135 | if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.WarmupGracePeriod { 136 | return fmt.Errorf("AdmissionGracePeriod %v exceeds AdmissionGracePeriod %v", 137 | config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) 138 | } 139 | if config.FaultTolerance.SuccessTTL <= 0 { 140 | return fmt.Errorf("SuccessTTL %v is not a positive duration", config.FaultTolerance.SuccessTTL) 141 | } 142 | 143 | return nil 144 | } 145 | 146 | // 
NewCertManagermentConfig constructs a CertManagementConfig and fills in default values 147 | func NewCertManagementConfig(namespace string) *CertManagementConfig { 148 | return &CertManagementConfig{ 149 | Namespace: namespace, 150 | CertificateDir: "/tmp/k8s-webhook-server/serving-certs", 151 | CertificateName: "appwrapper-ca", 152 | CertificateOrg: "appwrapper", 153 | MutatingWebhookConfigName: "appwrapper-mutating-webhook-configuration", 154 | ValidatingWebhookConfigName: "appwrapper-validating-webhook-configuration", 155 | WebhookServiceName: "appwrapper-webhook-service", 156 | WebhookSecretName: "appwrapper-webhook-server-cert", 157 | } 158 | } 159 | 160 | // NewControllerRuntimeConfig constructs a ControllerRuntimeConfig and fills in default values 161 | func NewControllerManagerConfig() *ControllerManagerConfig { 162 | return &ControllerManagerConfig{ 163 | Metrics: MetricsConfiguration{ 164 | BindAddress: ":8443", 165 | }, 166 | Health: HealthConfiguration{ 167 | BindAddress: ":8081", 168 | }, 169 | LeaderElection: false, 170 | EnableHTTP2: false, 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /pkg/config/config_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package config 18 | 19 | import ( 20 | "testing" 21 | "time" 22 | 23 | . 
"github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | ) 26 | 27 | func TestConfig(t *testing.T) { 28 | RegisterFailHandler(Fail) 29 | 30 | RunSpecs(t, "AppWrapperConfig Unit Tests") 31 | } 32 | 33 | var _ = Describe("AppWrapper Config", func() { 34 | It("Config Constructors", func() { 35 | Expect(NewAppWrapperConfig()).ShouldNot(BeNil()) 36 | Expect(NewCertManagementConfig("testing")).ShouldNot(BeNil()) 37 | Expect(NewControllerManagerConfig()).ShouldNot(BeNil()) 38 | }) 39 | 40 | It("Config Validation", func() { 41 | awc := NewAppWrapperConfig() 42 | Expect(ValidateAppWrapperConfig(awc)).Should(Succeed()) 43 | 44 | bad := &FaultToleranceConfig{ForcefulDeletionGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} 45 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 46 | 47 | bad = &FaultToleranceConfig{RetryPausePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} 48 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 49 | 50 | bad = &FaultToleranceConfig{FailureGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} 51 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 52 | 53 | bad = &FaultToleranceConfig{AdmissionGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} 54 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 55 | 56 | bad = &FaultToleranceConfig{WarmupGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} 57 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 58 | 59 | bad = &FaultToleranceConfig{AdmissionGracePeriod: 10 * time.Second, WarmupGracePeriod: 1 * time.Second} 60 | Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 61 | 62 | bad = &FaultToleranceConfig{SuccessTTL: -1 * time.Second} 63 | 
Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) 64 | }) 65 | }) 66 | -------------------------------------------------------------------------------- /pkg/controller/setup.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "errors" 22 | "fmt" 23 | "net/http" 24 | 25 | cert "github.com/open-policy-agent/cert-controller/pkg/rotator" 26 | "k8s.io/apimachinery/pkg/types" 27 | ctrl "sigs.k8s.io/controller-runtime" 28 | "sigs.k8s.io/controller-runtime/pkg/healthz" 29 | 30 | "github.com/project-codeflare/appwrapper/internal/controller/appwrapper" 31 | "github.com/project-codeflare/appwrapper/internal/webhook" 32 | "github.com/project-codeflare/appwrapper/pkg/config" 33 | ) 34 | 35 | // SetupControllers creates and configures all components of the AppWrapper controller 36 | func SetupControllers(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error { 37 | if awConfig.Autopilot != nil && awConfig.Autopilot.MonitorNodes { 38 | if err := (&appwrapper.NodeHealthMonitor{ 39 | Client: mgr.GetClient(), 40 | Config: awConfig, 41 | }).SetupWithManager(mgr); err != nil { 42 | return fmt.Errorf("node health monitor: %w", err) 43 | } 44 | } 45 | 46 | if err := (&appwrapper.AppWrapperReconciler{ 47 | Client: mgr.GetClient(), 
48 | Recorder: mgr.GetEventRecorderFor("appwrappers"), 49 | Scheme: mgr.GetScheme(), 50 | Config: awConfig, 51 | }).SetupWithManager(mgr); err != nil { 52 | return fmt.Errorf("appwrapper controller: %w", err) 53 | } 54 | 55 | return nil 56 | } 57 | 58 | // SetupWebhooks creates and configures the AppWrapper controller's Webhooks 59 | func SetupWebhooks(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error { 60 | if err := webhook.SetupAppWrapperWebhook(mgr, awConfig); err != nil { 61 | return fmt.Errorf("webhook: %w", err) 62 | } 63 | return nil 64 | } 65 | 66 | func SetupIndexers(ctx context.Context, mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error { 67 | return nil 68 | } 69 | 70 | func SetupProbeEndpoints(mgr ctrl.Manager, certsReady chan struct{}) error { 71 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 72 | return fmt.Errorf("health check: %w", err) 73 | } 74 | 75 | if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error { 76 | select { 77 | case <-certsReady: 78 | return mgr.GetWebhookServer().StartedChecker()(req) 79 | default: 80 | return errors.New("certificates are not ready") 81 | } 82 | }); err != nil { 83 | return fmt.Errorf("readiness check: %w", err) 84 | } 85 | return nil 86 | } 87 | 88 | // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;update 89 | // +kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=mutatingwebhookconfigurations,verbs=get;list;watch;update 90 | // +kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=validatingwebhookconfigurations,verbs=get;list;watch;update 91 | 92 | func SetupCertManagement(mgr ctrl.Manager, config *config.CertManagementConfig, certsReady chan struct{}) error { 93 | // DNSName is <webhook service name>.<namespace>.svc 94 | var dnsName = fmt.Sprintf("%s.%s.svc", config.WebhookServiceName, config.Namespace) 95 | 96 | return cert.AddRotator(mgr, &cert.CertRotator{ 97 | SecretKey: types.NamespacedName{Namespace: config.Namespace, Name:
config.WebhookSecretName}, 98 | CertDir: config.CertificateDir, 99 | CAName: config.CertificateName, 100 | CAOrganization: config.CertificateOrg, 101 | DNSName: dnsName, 102 | IsReady: certsReady, 103 | Webhooks: []cert.WebhookInfo{ 104 | {Type: cert.Validating, Name: config.ValidatingWebhookConfigName}, 105 | {Type: cert.Mutating, Name: config.MutatingWebhookConfigName}, 106 | }, 107 | // When the controller is running in the leader election mode, 108 | // we expect webhook server will run in primary and secondary instance 109 | RequireLeaderElection: false, 110 | }) 111 | } 112 | -------------------------------------------------------------------------------- /pkg/logger/logger.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package logger 18 | 19 | import ( 20 | "github.com/go-logr/logr" 21 | "k8s.io/apimachinery/pkg/api/errors" 22 | ) 23 | 24 | // logSink implements a filtered log sink 25 | type logSink struct { 26 | sink logr.LogSink 27 | } 28 | 29 | func (l logSink) Init(info logr.RuntimeInfo) { 30 | l.sink.Init(info) 31 | } 32 | 33 | func (l logSink) Enabled(level int) bool { 34 | return l.sink.Enabled(level) 35 | } 36 | func (l logSink) Info(level int, msg string, keysAndValues ...any) { 37 | l.sink.Info(level, msg, keysAndValues...) 
38 | } 39 | 40 | func (l logSink) Error(err error, msg string, keysAndValues ...any) { 41 | // replace StatusReasonConflict errors with debug messages 42 | if errors.IsConflict(err) { 43 | l.sink.Info(1, msg, append(keysAndValues, "error", err.Error())...) 44 | } else { 45 | l.sink.Error(err, msg, keysAndValues...) 46 | } 47 | } 48 | 49 | func (l logSink) WithValues(keysAndValues ...any) logr.LogSink { 50 | return logSink{l.sink.WithValues(keysAndValues...)} 51 | } 52 | 53 | func (l logSink) WithName(name string) logr.LogSink { 54 | return logSink{l.sink.WithName(name)} 55 | } 56 | 57 | // FilteredLogger returns a copy of the logger with a filtered sink 58 | func FilteredLogger(logger logr.Logger) logr.Logger { 59 | return logger.WithSink(logSink{logger.GetSink()}) 60 | } 61 | -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | # Sample AppWrappers 2 | 3 | This directory contains a number of example yamls showing how to wrap 4 | different Pod-creating Kubernetes resources in an AppWrapper. 5 | An AppWrapper can be used to wrap one or more instances of 6 | any Kubernetes Kind that uses `PodSpecTemplate` to define its Pods. 7 | An AppWrapper must contain at least one such Pod-creating resource in addition 8 | to zero or more non-Pod-creating resources. 9 | 10 | An AppWrapper contains a `components` array containing the wrapped resources. 11 | Each component has two main pieces: a `template` that defines the wrapped resource 12 | and a `podSets` array that gives the `replicas` and `path` within the template 13 | for each `PodSpecTemplate`. For correct operation of the AppWrapper, it is 14 | required that the provided `path` and `replicas` information correctly represent 15 | the Pod creating behavior of the wrapped resource.
For resources that do not 16 | create Pods (e.g. `Services` or `Secrets`) `podSets` should be empty and thus omitted. 17 | 18 | To simplify the user experience, for a selection of commonly-used Kubernetes 19 | resource Kinds, the AppWrapper controller can automatically infer the `podSets` 20 | array if it is not provided. For these same Kinds, the AppWrapper controller 21 | will validate that any explicitly provided `podSet` entries do in fact match the 22 | definitions in `template`. 23 | The current set of automatically inferred Kinds is: 24 | + v1 Pod 25 | + apps/v1 Deployment 26 | + apps/v1 StatefulSet 27 | + batch/v1 Job 28 | + kubeflow.org/v1 PyTorchJob 29 | + ray.io/v1 RayCluster 30 | + ray.io/v1 RayJob 31 | + jobset.x-k8s.io/v1alpha2 JobSet 32 | 33 | In all of the examples, if `podSets` inference is supported for the wrapped Kind, 34 | then `podSets` is omitted from the sample yaml. 35 | -------------------------------------------------------------------------------- /samples/wrapped-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-deployment 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - template: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | metadata: 13 | name: sample-deployment 14 | labels: 15 | app: test 16 | spec: 17 | replicas: 2 18 | selector: 19 | matchLabels: 20 | app: test 21 | template: 22 | metadata: 23 | labels: 24 | app: test 25 | spec: 26 | terminationGracePeriodSeconds: 0 27 | containers: 28 | - name: busybox 29 | image: quay.io/project-codeflare/busybox:1.36 30 | command: ["sh", "-c", "sleep 10000"] 31 | resources: 32 | requests: 33 | cpu: 1 34 | -------------------------------------------------------------------------------- /samples/wrapped-failing-job.yaml: -------------------------------------------------------------------------------- 1 |
apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-failing-job 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | annotations: 8 | workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s 9 | workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s 10 | workload.codeflare.dev.appwrapper/retryLimit: "3" 11 | spec: 12 | components: 13 | - template: 14 | apiVersion: batch/v1 15 | kind: Job 16 | metadata: 17 | name: sample-failing-job 18 | spec: 19 | backoffLimit: 1 20 | completions: 1 21 | template: 22 | spec: 23 | restartPolicy: Never 24 | containers: 25 | - name: busybox 26 | image: quay.io/project-codeflare/busybox:1.36 27 | command: ["sh", "-c", "sleep 15; exit 1"] 28 | resources: 29 | requests: 30 | cpu: 1 31 | -------------------------------------------------------------------------------- /samples/wrapped-failing-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-failing-pod 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | annotations: 8 | workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s 9 | workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s 10 | workload.codeflare.dev.appwrapper/retryLimit: "1" 11 | workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "5m" 12 | spec: 13 | components: 14 | - template: 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: sample-failing-pod 19 | spec: 20 | restartPolicy: Never 21 | initContainers: 22 | - name: stall 23 | image: quay.io/project-codeflare/busybox:1.36 24 | command: ["sh", "-c", "sleep 10"] 25 | containers: 26 | - name: busybox 27 | image: quay.io/project-codeflare/busybox:1.36 28 | command: ["sh", "-c", "sleep 5; exit 1"] 29 | resources: 30 | requests: 31 | cpu: 1 32 |
-------------------------------------------------------------------------------- /samples/wrapped-failing-pytorch-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-failing-pytorch-job 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - template: 10 | apiVersion: "kubeflow.org/v1" 11 | kind: PyTorchJob 12 | metadata: 13 | name: pytorch-simple 14 | spec: 15 | pytorchReplicaSpecs: 16 | Master: 17 | replicas: 1 18 | restartPolicy: Never 19 | template: 20 | spec: 21 | containers: 22 | - name: pytorch 23 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 24 | command: 25 | - "python3" 26 | - "/opt/pytorch-mnist/mnist.py" 27 | - "--epochs=1" 28 | resources: 29 | requests: 30 | cpu: 1 31 | Worker: 32 | replicas: 1 33 | restartPolicy: Never 34 | template: 35 | spec: 36 | containers: 37 | - name: pytorch 38 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 39 | command: 40 | - sleep 10; exit 1 41 | resources: 42 | requests: 43 | cpu: 1 44 | -------------------------------------------------------------------------------- /samples/wrapped-gpu-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-gpu-job 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | annotations: 8 | workload.codeflare.dev.appwrapper/successTTLDuration: "1m" 9 | spec: 10 | components: 11 | - template: 12 | apiVersion: batch/v1 13 | kind: Job 14 | metadata: 15 | name: sample-gpu-job 16 | spec: 17 | template: 18 | spec: 19 | restartPolicy: Never 20 | containers: 21 | - name: busybox 22 | image: quay.io/project-codeflare/busybox:1.36 23 | command: ["sh", "-c", "sleep 600"] 24 | resources: 25 | requests: 26 | cpu: 1 27 | nvidia.com/gpu: 4 28 | limits: 29 | 
nvidia.com/gpu: 4 30 | -------------------------------------------------------------------------------- /samples/wrapped-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-job 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | annotations: 8 | workload.codeflare.dev.appwrapper/successTTLDuration: "1m" 9 | spec: 10 | components: 11 | - template: 12 | apiVersion: batch/v1 13 | kind: Job 14 | metadata: 15 | name: sample-job 16 | spec: 17 | template: 18 | spec: 19 | restartPolicy: Never 20 | containers: 21 | - name: busybox 22 | image: quay.io/project-codeflare/busybox:1.36 23 | command: ["sh", "-c", "sleep 600"] 24 | resources: 25 | requests: 26 | cpu: 1 27 | -------------------------------------------------------------------------------- /samples/wrapped-jobset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-jobset 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - template: 10 | apiVersion: jobset.x-k8s.io/v1alpha2 11 | kind: JobSet 12 | metadata: 13 | name: sample-jobset 14 | spec: 15 | replicatedJobs: 16 | - name: workers 17 | replicas: 2 18 | template: 19 | spec: 20 | parallelism: 2 21 | completions: 2 22 | backoffLimit: 0 23 | template: 24 | spec: 25 | restartPolicy: Never 26 | containers: 27 | - name: sleep 28 | image: quay.io/project-codeflare/busybox:1.36 29 | command: ["sh", "-c", "sleep 100"] 30 | resources: 31 | requests: 32 | cpu: 100m 33 | - name: driver 34 | template: 35 | spec: 36 | parallelism: 1 37 | completions: 1 38 | backoffLimit: 0 39 | template: 40 | spec: 41 | restartPolicy: Never 42 | containers: 43 | - name: sleep 44 | image: quay.io/project-codeflare/busybox:1.36 45 | command: ["sh", "-c", "sleep 100"] 46 | resources: 47 | 
requests: 48 | cpu: 100m 49 | -------------------------------------------------------------------------------- /samples/wrapped-leader-worker-set.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-lws 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - podSets: 10 | - path: "template.spec.leaderWorkerTemplate.leaderTemplate" 11 | replicas: 2 12 | - path: "template.spec.leaderWorkerTemplate.workerTemplate" 13 | replicas: 3 14 | template: 15 | apiVersion: leaderworkerset.x-k8s.io/v1 16 | kind: LeaderWorkerSet 17 | metadata: 18 | name: nginx-leaderworkerset 19 | labels: 20 | app: nginx 21 | spec: 22 | replicas: 2 23 | leaderWorkerTemplate: 24 | leaderTemplate: 25 | spec: 26 | containers: 27 | - name: nginx-leader 28 | image: registry.k8s.io/nginx-slim:0.27 29 | resources: 30 | requests: 31 | cpu: "100m" 32 | ports: 33 | - containerPort: 80 34 | size: 3 35 | workerTemplate: 36 | spec: 37 | containers: 38 | - name: nginx-worker 39 | image: nginx:1.14.2 40 | resources: 41 | requests: 42 | cpu: "200m" 43 | ports: 44 | - containerPort: 80 45 | -------------------------------------------------------------------------------- /samples/wrapped-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-pod 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - template: 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: sample-pod 14 | spec: 15 | restartPolicy: Never 16 | initContainers: 17 | - name: stall 18 | image: quay.io/project-codeflare/busybox:1.36 19 | command: ["sh", "-c", "sleep 10"] 20 | containers: 21 | - name: busybox 22 | image: quay.io/project-codeflare/busybox:1.36 23 | command: ["sh", "-c", "sleep 600"] 24 | resources: 
25 | requests: 26 | cpu: 1 27 | -------------------------------------------------------------------------------- /samples/wrapped-pytorch-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: workload.codeflare.dev/v1beta2 2 | kind: AppWrapper 3 | metadata: 4 | name: sample-pytorch-job 5 | labels: 6 | kueue.x-k8s.io/queue-name: default-queue 7 | spec: 8 | components: 9 | - template: 10 | apiVersion: "kubeflow.org/v1" 11 | kind: PyTorchJob 12 | metadata: 13 | name: pytorch-simple 14 | spec: 15 | pytorchReplicaSpecs: 16 | Master: 17 | replicas: 1 18 | restartPolicy: OnFailure 19 | template: 20 | spec: 21 | containers: 22 | - name: pytorch 23 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 24 | command: 25 | - "python3" 26 | - "/opt/pytorch-mnist/mnist.py" 27 | - "--epochs=1" 28 | resources: 29 | requests: 30 | cpu: 1 31 | Worker: 32 | replicas: 2 33 | restartPolicy: OnFailure 34 | template: 35 | spec: 36 | containers: 37 | - name: pytorch 38 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 39 | command: 40 | - "python3" 41 | - "/opt/pytorch-mnist/mnist.py" 42 | - "--epochs=1" 43 | resources: 44 | requests: 45 | cpu: 1 46 | -------------------------------------------------------------------------------- /site/.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | .sass-cache 3 | .jekyll-metadata 4 | Gemfile.lock 5 | -------------------------------------------------------------------------------- /site/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem "github-pages", group: :jekyll_plugins 4 | 5 | gem "tzinfo-data" 6 | gem "wdm", "~> 0.1.0" if Gem.win_platform? 7 | 8 | # If you have any plugins, put them here! 
9 | group :jekyll_plugins do 10 | gem "jekyll-paginate" 11 | gem "jekyll-sitemap" 12 | gem "jekyll-gist" 13 | gem "jekyll-feed" 14 | gem "jemoji" 15 | gem "jekyll-include-cache" 16 | gem "jekyll-spaceship" 17 | end 18 | 19 | gem "webrick", "~> 1.7" 20 | -------------------------------------------------------------------------------- /site/_config.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Jekyll! 2 | # 3 | # This config file is meant for settings that affect your whole blog, values 4 | # which you are expected to set up once and rarely edit after that. If you find 5 | # yourself editing this file very often, consider using Jekyll's data files 6 | # feature for the data you need to update frequently. 7 | # 8 | # For technical reasons, this file is *NOT* reloaded automatically when you use 9 | # 'bundle exec jekyll serve'. If you change this file, please restart the server process. 10 | 11 | # Site settings 12 | # These are used to personalize your new site. If you look in the HTML files, 13 | # you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. 14 | # You can create any custom variable you would like, and they will be accessible 15 | # in the templates via {{ site.myvariable }}. 
16 | title: "Project CodeFlare: AppWrapper" 17 | description: >- # 18 | Project CodeFlare: AppWrapper 19 | minimal_mistakes_skin: contrast 20 | 21 | # Build settings 22 | markdown: kramdown 23 | remote_theme: mmistakes/minimal-mistakes@4.24.0 24 | repository: project-codeflare/appwrapper 25 | 26 | # Variables for use in pages 27 | gh_main_url: https://github.com/project-codeflare/appwrapper/blob/main 28 | appwrapper_version: v1.1.2 29 | 30 | # Outputting 31 | permalink: /:categories/:title/ 32 | timezone: America/New_York 33 | 34 | exclude: 35 | - README.md 36 | 37 | include: 38 | - _pages 39 | 40 | # Plugins 41 | plugins: 42 | - jekyll-paginate 43 | - jekyll-sitemap 44 | - jekyll-gist 45 | - jekyll-feed 46 | - jemoji 47 | - jekyll-include-cache 48 | - jekyll-spaceship 49 | 50 | defaults: 51 | # _posts 52 | - scope: 53 | path: "" 54 | type: posts 55 | values: 56 | layout: single 57 | read_time: false 58 | comments: false 59 | share: false 60 | related: false 61 | # _pages 62 | - scope: 63 | path: "_pages" 64 | type: pages 65 | values: 66 | layout: single 67 | sidebar: 68 | nav: "side" 69 | 70 | footer: 71 | 72 | atom_feed: 73 | hide: true 74 | 75 | category_archive: 76 | type: liquid 77 | path: /categories/ 78 | tag_archive: 79 | type: liquid 80 | path: /tags/ 81 | 82 | after_footer_scripts: 83 | - https://cdn.jsdelivr.net/npm/clipboard@2/dist/clipboard.min.js 84 | - assets/js/clipboardrouge.js 85 | -------------------------------------------------------------------------------- /site/_data/navigation.yml: -------------------------------------------------------------------------------- 1 | main: 2 | - title: "Overview" 3 | url: / 4 | - title: "Quick Start Guide" 5 | url: /quick-start/ 6 | - title: "Samples" 7 | url: /samples/ 8 | - title: "GitHub" 9 | url: https://github.com/project-codeflare/appwrapper 10 | - title: "API Reference" 11 | url: /api/workload.codeflare.dev/v1beta2/ 12 | 13 | 14 | side: 15 | - title: "Installation" 16 | children: 17 | - title:
"Quick-Start Guide" 18 | url: /quick-start/ 19 | - title: "Development Setup" 20 | url: /dev-setup/ 21 | 22 | - title: "Samples" 23 | children: 24 | - title: "PyTorch Job" 25 | url: "/samples/pytorch/" 26 | - title: "Batch Job" 27 | url: "/samples/batch-job/" 28 | 29 | - title: "Architecture" 30 | children: 31 | - title: API Reference 32 | url: /api/workload.codeflare.dev/v1beta2/ 33 | - title: Controllers 34 | url: /arch-controller/ 35 | - title: Fault Tolerance 36 | url: /arch-fault-tolerance/ 37 | - title: Node Monitoring 38 | url: /arch-node-monitoring/ 39 | -------------------------------------------------------------------------------- /site/_pages/404.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Page Not Found" 3 | excerpt: "Page not found. Your pixels are in another canvas." 4 | sitemap: false 5 | permalink: /404.html 6 | --- 7 | 8 | Sorry, but the page you were trying to view does not exist. 9 | -------------------------------------------------------------------------------- /site/_pages/arch-controller.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /arch-controller/ 3 | title: "AppWrapper Controllers" 4 | classes: wide 5 | --- 6 | 7 | Kueue has a [well-developed pattern](https://kueue.sigs.k8s.io/docs/tasks/dev/integrate_a_custom_job/) 8 | for Kueue-enabling a Custom Resource Definition and its associated operator. 9 | 10 | AppWrapper version 1.0.6 and earlier was an *external Kueue integration* 11 | and therefore versions of the AppWrapper controller were closely tied to a matching 12 | Kueue version (Kueue 0.7 through Kueue 0.10). 13 | 14 | Starting with Kueue 0.11 and AppWrapper version 1.1, AppWrapper became a first class 15 | *built-in Kueue integration*. This allows a looser coupling between the 16 | two controllers and a significant simplification in their joint deployment and configuration.
17 | 18 | #### AppWrapper Controller 19 | 20 | The AppWrapper Controller is a standard reconciliation loop that watches AppWrapper instances and 21 | is responsible for all AppWrapper-specific operations including 22 | creating, monitoring, and deleting the wrapped resources in response 23 | to the modifications of the AppWrapper instance’s specification and 24 | status. 25 | 26 | ```mermaid! 27 | --- 28 | title: AppWrapper Phase Transitions 29 | --- 30 | stateDiagram-v2 31 | e : Empty 32 | 33 | sd : Suspended 34 | rs : Resuming 35 | rn : Running 36 | rt : Resetting 37 | sg : Suspending 38 | s : Succeeded 39 | f : Failed 40 | 41 | %% Happy Path 42 | e --> sd 43 | sd --> rs : Suspend == false 44 | rs --> rn 45 | rn --> s 46 | 47 | %% Requeuing 48 | rs --> sg : Suspend == true 49 | rn --> sg : Suspend == true 50 | rt --> sg : Suspend == true 51 | sg --> sd 52 | 53 | %% Failures 54 | rs --> f 55 | rn --> f 56 | rn --> rt : Workload Unhealthy 57 | rt --> rs 58 | 59 | classDef quota fill:lightblue 60 | class rs quota 61 | class rn quota 62 | class rt quota 63 | class sg quota 64 | 65 | classDef failed fill:pink 66 | class f failed 67 | 68 | classDef succeeded fill:lightgreen 69 | class s succeeded 70 | ``` 71 | 72 | The state diagram above depicts the transitions between the Phases of 73 | an AppWrapper. A label on an edge indicates the state change that will 74 | trigger that transition. For example, if an AppWrapper is in the 75 | Suspended Phase and Kueue sets `spec.suspend` to `false` then the AppWrapper Controller 76 | will transition the AppWrapper to the Resuming Phase. 77 | 78 | These states are augmented by two orthogonal Conditions: 79 | + **QuotaReserved** indicates whether the AppWrapper is considered Active by Kueue. 80 | + **ResourcesDeployed** indicates whether wrapped resources may exist on the cluster. 81 | 82 | QuotaReserved and ResourcesDeployed are both true in states colored blue above.
83 | 84 | QuotaReserved and ResourcesDeployed will initially be true in the Failed state (pink), 85 | but will become false when the AppWrapper Controller succeeds at deleting all resources created 86 | in the Resuming phase. 87 | 88 | ResourcesDeployed will be true in the Succeeded state (green), but QuotaReserved will be false. 89 | After a configurable delay, the AppWrapper controller will eventually delete the resources of 90 | Succeeded AppWrappers and ResourcesDeployed will become false. 91 | 92 | Any phase may transition to the Terminating phase (not shown) when the AppWrapper is deleted. 93 | During the Terminating phase, QuotaReserved and ResourcesDeployed may initially be true 94 | but will become false once the AppWrapper Controller succeeds at deleting all associated resources. 95 | 96 | See [appwrapper_controller.go]({{ site.gh_main_url }}/internal/controller/appwrapper/appwrapper_controller.go) 97 | for the implementation. 98 | -------------------------------------------------------------------------------- /site/_pages/arch-crd.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /arch-crd/ 3 | title: "AppWrapper Custom Resource Definition" 4 | classes: wide 5 | --- 6 | 7 | TODO: Document CRD here. Pull in generated OpenAPI spec. 8 | -------------------------------------------------------------------------------- /site/_pages/arch-fault-tolerance.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /arch-fault-tolerance/ 3 | title: "Fault Tolerance" 4 | classes: wide 5 | --- 6 | 7 | ### Overview of Capabilities 8 | 9 | The AppWrapper controller is designed to enhance and extend the fault 10 | tolerance capabilities provided by the controllers of its wrapped 11 | resources. 
If [Autopilot](https://github.com/ibm/autopilot) is deployed on the 12 | cluster, the AppWrapper controller can automate both the injection of 13 | Node anti-affinities to avoid scheduling workloads on unhealthy Nodes 14 | and the migration of running workloads away from unhealthy Nodes. 15 | Throughout the execution of a workload, the AppWrapper 16 | controller monitors both the status of the contained top-level 17 | resources and the status of all Pods created by the workload. If a 18 | workload is determined to be *unhealthy*, the AppWrapper controller 19 | first waits for a bounded time period to allow the underlying 20 | controllers to correct the problem. If they fail to do so, then the 21 | AppWrapper controller will *reset* the workload by removing all 22 | created resources, and then, if the maximum number of retries has not 23 | been exceeded, recreating the workload. This reset process is carefully 24 | engineered to ensure that it will always make progress and eventually 25 | succeed in completely removing all Pods and other resources created by 26 | a failed workload. 27 | 28 | ```mermaid! 29 | --- 30 | title: Overview of AppWrapper Fault Tolerance Phase Transitions 31 | --- 32 | stateDiagram-v2 33 | 34 | rn : Running 35 | s : Succeeded 36 | f : Failed 37 | rt : Resetting 38 | rs : Resuming 39 | 40 | %% Happy Path 41 | rn --> s 42 | 43 | %% Requeuing 44 | rn --> f : Retries Exceeded 45 | rn --> rt : Workload Unhealthy 46 | rt --> rs : All Resources Removed 47 | rs --> rn : All Resources Recreated 48 | 49 | classDef quota fill:lightblue 50 | class rs quota 51 | class rn quota 52 | class rt quota 53 | 54 | classDef failed fill:pink 55 | class f failed 56 | 57 | classDef succeeded fill:lightgreen 58 | class s succeeded 59 | ``` 60 | 61 | ### Progress Guarantees 62 | 63 | When the AppWrapper controller decides to delete the resources for a 64 | workload, it proceeds through several phases.
First it does a normal 65 | delete of the top-level resources, allowing the primary resource 66 | controllers time to cascade the deletion through all child resources. 67 | If they are not able to successfully delete all of the workload's Pods 68 | and resources within a `ForcefulDeletionGracePeriod`, the AppWrapper 69 | controller then initiates a *forceful* deletion of all remaining Pods 70 | and resources by deleting them with a `GracePeriod` of `0`. An 71 | AppWrapper will continue to have its `ResourcesDeployed` condition set 72 | to `True` until all resources and Pods are successfully deleted. 73 | 74 | This process ensures that when `ResourcesDeployed` becomes `False`, 75 | which indicates to Kueue that the quota has been released, all 76 | resources created by a failed workload will have been totally removed 77 | from the cluster. 78 | 79 | ### Detailed Description 80 | 81 | The `podSets` contained in the AppWrapper specification enable the 82 | AppWrapper controller to inject labels into every Pod that is created 83 | by the workload during its execution. Throughout the execution of the 84 | workload, the AppWrapper controller monitors the number and health of 85 | all labeled Pods. It also watches the top-level created resources and 86 | for selected resource types understands how to interpret their status 87 | information. This information is combined to determine if a workload 88 | is unhealthy. A workload can be deemed *unhealthy* if any of the 89 | following conditions are true: 90 | + There are a non-zero number of `Failed` Pods. 91 | + It takes longer than `AdmissionGracePeriod` for the expected 92 | number of Pods to reach the `Pending` state. 93 | + It takes longer than the `WarmupGracePeriod` for the expected 94 | number of Pods to reach the `Running` state. 95 | + If a non-zero number of `Running` Pods are using resources 96 | that Autopilot has tagged as `NoExecute`.
97 | + The status information of a batch/v1 Job or PyTorchJob indicates 98 | that it has failed. 99 | + A top-level wrapped resource is externally deleted. 100 | 101 | If a workload is determined to be unhealthy by one of the first three 102 | Pod-level conditions above, the AppWrapper controller first waits for 103 | a `FailureGracePeriod` to allow the primary resource controller an 104 | opportunity to react and return the workload to a healthy state. The 105 | `FailureGracePeriod` is elided by the remaining conditions because the 106 | primary resource controller is not expected to take any further 107 | action. If the `FailureGracePeriod` passes and the workload is still 108 | unhealthy, the AppWrapper controller will *reset* the workload by 109 | deleting its resources, waiting for a `RetryPausePeriod`, and then 110 | creating new instances of the resources. 111 | 112 | During this retry pause, the AppWrapper **does not** release the workload's 113 | quota; this ensures that when the resources are recreated they will still 114 | have sufficient quota to execute. The number of times an AppWrapper is reset 115 | is tracked as part of its status; if the number of resets exceeds the `RetryLimit`, 116 | then the AppWrapper moves into a `Failed` state and its resources are deleted 117 | (thus finally releasing its quota). If at any time during this retry loop, 118 | an AppWrapper is suspended (ie, Kueue decides to preempt the AppWrapper), 119 | the AppWrapper controller will respect this request by proceeding to delete 120 | the resources. Workload resets that are initiated in response to Autopilot 121 | are subject to the `RetryLimit` but do not increment the `retryCount`. 122 | External deletion of a top-level wrapped resource will cause the AppWrapper to 123 | directly enter the `Failed` state independent of the `RetryLimit`. 
124 | 125 | To support debugging `Failed` workloads, an annotation can be added to an 126 | AppWrapper that adds a `DeletionOnFailureGracePeriod` between the time the 127 | AppWrapper enters the `Failed` state and when the process of deleting its resources 128 | begins. Since the AppWrapper continues to consume quota during this delayed deletion period, 129 | this annotation should be used sparingly and only when interactive debugging of 130 | the failed workload is being actively pursued. 131 | 132 | All child resources for an AppWrapper that successfully completed will be automatically 133 | deleted after a `SuccessTTL` after the AppWrapper entered the `Succeeded` state. 134 | 135 | ### Configuration Details 136 | 137 | The parameters of the retry loop described above are configured at the operator level 138 | and can be customized on a per-AppWrapper basis by adding annotations. 139 | The table below lists the parameters, gives their default, and the annotation that 140 | can be used to customize them. 
141 | 142 | | Parameter | Default Value | Annotation | 143 | |------------------------------|---------------|------------------------------------------------------------------------| 144 | | AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration | 145 | | WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration | 146 | | FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration | 147 | | RetryPausePeriod | 90 Seconds | workload.codeflare.dev.appwrapper/retryPausePeriodDuration | 148 | | RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit | 149 | | DeletionOnFailureGracePeriod | 0 Seconds | workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration | 150 | | ForcefulDeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration | 151 | | SuccessTTL | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration | 152 | | GracePeriodMaximum | 24 Hours | Not Applicable | 153 | 154 | The `GracePeriodMaximum` imposes a system-wide upper limit on all other grace periods to 155 | limit the potential impact of user-added annotations on overall system utilization. 156 | 157 | The set of resources monitored by Autopilot and the associated labels that identify unhealthy 158 | resources can be customized as part of the AppWrapper operator's configuration. The default 159 | Autopilot configuration used by the controller is: 160 | ```yaml 161 | autopilot: 162 | injectAntiAffinities: true 163 | monitorNodes: true 164 | resourceTaints: 165 | nvidia.com/gpu: 166 | - key: autopilot.ibm.com/gpuhealth 167 | value: ERR 168 | effect: NoSchedule 169 | - key: autopilot.ibm.com/gpuhealth 170 | value: EVICT 171 | effect: NoExecute 172 | ``` 173 | 174 | The `resourceTaints` is a map from resource names to taints. 
For this example 175 | configuration, for exactly those Pods that have a non-zero resource request for 176 | `nvidia.com/gpu`, the AppWrapper controller will automatically inject the stanza below 177 | into the `affinity` portion of their Spec. 178 | ```yaml 179 | nodeAffinity: 180 | requiredDuringSchedulingIgnoredDuringExecution: 181 | nodeSelectorTerms: 182 | - matchExpressions: 183 | - key: autopilot.ibm.com/gpuhealth 184 | operator: NotIn 185 | values: 186 | - ERR 187 | - EVICT 188 | ``` 189 | -------------------------------------------------------------------------------- /site/_pages/arch-node-monitoring.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /arch-node-monitoring/ 3 | title: "Node Monitoring" 4 | classes: wide 5 | --- 6 | 7 | The AppWrapper controller can optionally monitor Kubernetes Nodes and 8 | dynamically adjust the `lendingLimits` on a designated `ClusterQueue` 9 | to account for dynamically unavailable resources. This capability is 10 | designed to enable cluster admins of an 11 | [MLBatch cluster](https://github.com/project-codeflare/mlbatch) to fully 12 | automate the small scale quota adjustments required to maintain full cluster 13 | utilization in the presence of isolated node failures and/or 14 | minor maintenance activities. The monitoring detects both Nodes that 15 | are marked as `Unschedulable` via standard Kubernetes mechanisms and Nodes 16 | that have resources that Autopilot has flagged as unhealthy (see [Fault Tolerance](/arch-fault-tolerance)). 17 | The `lendingLimit` of a designated slack capacity `ClusterQueue` is 18 | automatically adjusted to reflect the current dynamically unavailable resources. 
19 | 20 | Node monitoring is enabled by the following additional configuration: 21 | ```yaml 22 | slackQueueName: "slack-queue" 23 | autopilot: 24 | monitorNodes: true 25 | ``` 26 | 27 | See [node_health_monitor.go]({{ site.gh_main_url }}/internal/controller/appwrapper/node_health_monitor.go) 28 | for the implementation. 29 | -------------------------------------------------------------------------------- /site/_pages/category-archive.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Posts by Category" 3 | layout: categories 4 | permalink: /categories/ 5 | author_profile: true 6 | --- 7 | -------------------------------------------------------------------------------- /site/_pages/dev-setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /dev-setup/ 3 | title: "Development Setup" 4 | classes: wide 5 | --- 6 | 7 | ### Prerequisites 8 | 9 | You'll need `go` v1.22.4+ installed on your development machine. 10 | 11 | You'll need a container runtime and cli (eg `docker` or `rancher-desktop`). 12 | 13 | You’ll need a Kubernetes cluster to run against. 14 | 15 | You can use [kind](https://sigs.k8s.io/kind) to get a local cluster 16 | for testing, or run against a remote cluster. For the purposes of 17 | simplifying the rest of these instructions, we proceed assuming you 18 | will create a local `kind` cluster. 19 | 20 | ### Pre-commit hooks 21 | 22 | This repository includes pre-configured pre-commit hooks. Make sure to install 23 | the hooks immediately after cloning the repository: 24 | ```sh 25 | pre-commit install 26 | ``` 27 | See [https://pre-commit.com](https://pre-commit.com) for prerequisites. 
28 | 29 | ### Create your cluster and deploy Kueue 30 | 31 | 32 | Create the cluster with: 33 | ```sh 34 | ./hack/create-test-cluster.sh 35 | ``` 36 | 37 | Deploy Kueue on the cluster and configure it to have queues in your default namespace 38 | with a nominal quota of 4 CPUs with: 39 | ```sh 40 | ./hack/deploy-kueue.sh 41 | ``` 42 | 43 | You can verify Kueue is configured as expected with: 44 | ```sh 45 | % kubectl get localqueues,clusterqueues -o wide 46 | NAME CLUSTERQUEUE PENDING WORKLOADS ADMITTED WORKLOADS 47 | localqueue.kueue.x-k8s.io/default-queue cluster-queue 0 0 48 | 49 | NAME COHORT STRATEGY PENDING WORKLOADS ADMITTED WORKLOADS 50 | clusterqueue.kueue.x-k8s.io/cluster-queue BestEffortFIFO 0 0 51 | ``` 52 | 53 | ### Deploy on the cluster 54 | 55 | Build your image and push it to the cluster with: 56 | ```sh 57 | make docker-build kind-push 58 | ``` 59 | 60 | Deploy the CRDs and controller to the cluster: 61 | ```sh 62 | make deploy 63 | ``` 64 | 65 | Within a few seconds, the controller pod in the `appwrapper-system` 66 | namespace should be Ready. Verify this with: 67 | ```sh 68 | kubectl get pods -n appwrapper-system 69 | ``` 70 | 71 | You can now try deploying a sample `AppWrapper`: 72 | ```sh 73 | kubectl apply -f samples/wrapped-pod.yaml 74 | ``` 75 | 76 | You should quickly see an AppWrapper with the `Running` Status. 77 | The sample contains a single Pod with an `init` container that runs for 10 seconds, 78 | followed by a main container that runs for 5 seconds. After the main container completes, 79 | the Status of the AppWrapper will be `Succeeded`. 
We show some kubectl commands and 80 | their expected outputs below: 81 | ```sh 82 | % kubectl get appwrappers 83 | NAME STATUS 84 | sample-pod Running 85 | 86 | % kubectl get pods 87 | NAME READY STATUS RESTARTS AGE 88 | sample-pod 0/1 Init:0/1 0 14s 89 | 90 | % kubectl get pods 91 | NAME READY STATUS RESTARTS AGE 92 | sample-pod 1/1 Running 0 18s 93 | 94 | % kubectl get pods 95 | NAME READY STATUS RESTARTS AGE 96 | sample-pod 0/1 Completed 0 30s 97 | 98 | % kubectl get appwrappers 99 | NAME STATUS 100 | sample-pod Succeeded 101 | ``` 102 | 103 | You can now delete the sample AppWrapper. 104 | ```sh 105 | kubectl delete -f samples/wrapped-pod.yaml 106 | ``` 107 | 108 | To undeploy the CRDs and controller from the cluster: 109 | ```sh 110 | make undeploy 111 | ``` 112 | 113 | ### Run the controller as a local process against the cluster 114 | 115 | For faster development and debugging, you can run the controller 116 | directly on your development machine as a local process that will 117 | automatically be connected to the cluster. Note that in this 118 | configuration, the webhooks that implement the Admission Controllers 119 | are not operational. Therefore your CRDs will not be validated and 120 | you must explicitly set the `suspended` field to `true` in your 121 | AppWrapper YAML files. 122 | 123 | Install the CRDs into the cluster: 124 | 125 | ```sh 126 | make install 127 | ``` 128 | 129 | Run your controller (this will run in the foreground, so switch to a new terminal if you want to leave it running): 130 | ```sh 131 | make run 132 | ``` 133 | 134 | **NOTE:** You can also run this in one step by running: `make install run` 135 | 136 | You can now deploy a sample with `kubectl apply -f 137 | samples/wrapped-pod.yaml` and observe its execution as described 138 | above. 
139 | 140 | After deleting all AppWrapper CR instances, you can uninstall the CRDs 141 | with: 142 | ```sh 143 | make uninstall 144 | ``` 145 | 146 | ### Running unit tests 147 | 148 | Unit tests can be run at any time by doing `make test`. 149 | No additional setup is required. 150 | 151 | ### Running end-to-end tests 152 | 153 | A suite of end-to-end tests is run as part of the project's 154 | [continuous integration workflow](./.github/workflows/CI.yaml). 155 | These tests can also be run locally against a deployed version of Kueue 156 | and the AppWrapper controller. 157 | 158 | To create and initialize your cluster, perform the following steps: 159 | ```shell 160 | ./hack/create-test-cluster.sh 161 | ./hack/deploy-kueue.sh 162 | ``` 163 | 164 | Next build and deploy the AppWrapper operator: 165 | ```shell 166 | make docker-build kind-push 167 | make deploy 168 | ``` 169 | 170 | Finally, run the test suite: 171 | ```shell 172 | ./hack/run-tests-on-cluster.sh 173 | ``` 174 | -------------------------------------------------------------------------------- /site/_pages/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: single 3 | permalink: / 4 | excerpt: "Project Overview" 5 | redirect_from: 6 | - /about/ 7 | - /about.html 8 | classes: wide 9 | --- 10 | 11 | An AppWrapper contains a collection of Kubernetes resources that a 12 | user desires to manage as a single logical workload. AppWrapper is 13 | designed to smoothly interoperate with 14 | [Kueue](https://kueue.sigs.k8s.io). They provide a flexible and 15 | workload-agnostic mechanism for enabling Kueue to manage a group of 16 | Kubernetes resources as a single logical unit without requiring any 17 | Kueue-specific support by the controllers of those resources. 18 | 19 | An AppWrapper can be used to harden workloads by providing an 20 | additional level of automatic fault detection and recovery. 
The AppWrapper 21 | controller monitors the health of the workload and if corrective actions 22 | are not taken by the primary resource controllers within specified deadlines, 23 | the AppWrapper controller will orchestrate workload-level retries and 24 | resource deletion to ensure that either the workload returns to a 25 | healthy state or is cleanly removed from the cluster and its quota 26 | freed for use by other workloads. If [Autopilot](https://github.com/ibm/autopilot) 27 | is also being used on the cluster, the AppWrapper controller can be configured 28 | to automatically inject Node anti-affinities into Pods and to trigger 29 | retries when Pods in already running workloads are using resources 30 | that Autopilot has tagged as unhealthy. For details on customizing and 31 | configuring these fault tolerance capabilities, please see the 32 | [Fault Tolerance](https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/) 33 | section of our website. 34 | 35 | AppWrapper is designed to be used as part of a fully open source software stack 36 | to run production batch workloads on Kubernetes and OpenShift. The [MLBatch](https://github.com/project-codeflare/mlbatch) 37 | project leverages [Kueue](https://kueue.sigs.k8s.io), the [Kubeflow Training 38 | Operator](https://www.kubeflow.org/docs/components/training/), 39 | [KubeRay](https://docs.ray.io/en/latest/cluster/kubernetes/index.html), and the 40 | [Codeflare Operator](https://github.com/project-codeflare/codeflare-operator) 41 | from [Red Hat OpenShift 42 | AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). 43 | MLBatch enables [AppWrapper](https://project-codeflare.github.io/appwrapper/) 44 | and adds 45 | [Coscheduler](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/README.md). 46 | MLBatch includes a number of configuration steps to help these components work 47 | in harmony and support large workloads on large clusters. 
48 | -------------------------------------------------------------------------------- /site/_pages/quick-start.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /quick-start/ 3 | title: "Quick Start Guide" 4 | classes: wide 5 | --- 6 | 7 | ## Installing the latest Release 8 | 9 | These quick start instructions assume you have a Kubernetes cluster 10 | available to you and `kubectl` is properly configured. 11 | 12 | ### Install Kueue 13 | 14 | Install and configure a compatible version of Kueue by executing this command: 15 | ```sh 16 | kubectl apply --server-side -k "https://github.com/project-codeflare/appwrapper/hack/kueue-config?ref={{ site.appwrapper_version }}" 17 | ``` 18 | 19 | Before continuing, ensure Kueue is ready by executing this command: 20 | ```sh 21 | kubectl -n kueue-system wait --timeout=300s --for=condition=Available deployments --all 22 | ``` 23 | 24 | Finally, you need to create a default `ClusterQueue` and `LocalQueue` 25 | with a quota of 4 CPUs to enable Kueue to schedule workloads on your cluster. 26 | The yaml shown below accomplishes this: 27 | ```yaml 28 | apiVersion: kueue.x-k8s.io/v1beta1 29 | kind: ResourceFlavor 30 | metadata: 31 | name: "default-flavor" 32 | --- 33 | apiVersion: kueue.x-k8s.io/v1beta1 34 | kind: ClusterQueue 35 | metadata: 36 | name: "cluster-queue" 37 | spec: 38 | namespaceSelector: {} # match all. 
39 | resourceGroups: 40 | - coveredResources: ["cpu"] 41 | flavors: 42 | - name: "default-flavor" 43 | resources: 44 | - name: "cpu" 45 | nominalQuota: 4 46 | --- 47 | apiVersion: kueue.x-k8s.io/v1beta1 48 | kind: LocalQueue 49 | metadata: 50 | namespace: "default" 51 | name: "default-queue" 52 | spec: 53 | clusterQueue: "cluster-queue" 54 | ``` 55 | 56 | You can either copy this yaml to your local file system and do a `kubectl apply -f ` 57 | or apply it remotely by doing: 58 | ```sh 59 | kubectl apply -f https://raw.githubusercontent.com/project-codeflare/appwrapper/main/hack/default-queues.yaml 60 | ``` 61 | 62 | ### Install AppWrapper 63 | 64 | Install the most recent AppWrapper release by doing: 65 | ```sh 66 | kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/{{ site.appwrapper_version }}/install.yaml 67 | ``` 68 | 69 | Before continuing, ensure AppWrapper is ready by executing this command: 70 | ```sh 71 | kubectl -n appwrapper-system wait --timeout=300s --for=condition=Available deployments --all 72 | ``` 73 | 74 | ### Validate the Install 75 | 76 | Finally, validate the installation by creating a simple AppWrapper and verifying that it runs 77 | as expected. 78 | 79 | Create an AppWrapper by executing: 80 | ```sh 81 | kubectl apply -f https://raw.githubusercontent.com/project-codeflare/appwrapper/{{ site.appwrapper_version }}/samples/wrapped-pod.yaml 82 | ``` 83 | 84 | You should quickly see an AppWrapper with the `Running` Status. 85 | The sample contains a single Pod with an `init` container that runs for 10 seconds, 86 | followed by a main container that runs for 5 seconds. After the main container completes, 87 | the Status of the AppWrapper will be `Succeeded`. 
We show some kubectl commands and 88 | their expected outputs below: 89 | ```sh 90 | % kubectl get appwrappers 91 | NAME STATUS 92 | sample-pod Running 93 | 94 | % kubectl get pods 95 | NAME READY STATUS RESTARTS AGE 96 | sample-pod 0/1 Init:0/1 0 14s 97 | 98 | % kubectl get pods 99 | NAME READY STATUS RESTARTS AGE 100 | sample-pod 1/1 Running 0 18s 101 | 102 | % kubectl get pods 103 | NAME READY STATUS RESTARTS AGE 104 | sample-pod 0/1 Completed 0 30s 105 | 106 | % kubectl get appwrappers 107 | NAME STATUS 108 | sample-pod Succeeded 109 | ``` 110 | 111 | You can delete the AppWrapper with: 112 | ```sh 113 | kubectl delete -f https://raw.githubusercontent.com/project-codeflare/appwrapper/{{ site.appwrapper_version }}/samples/wrapped-pod.yaml 114 | ``` 115 | -------------------------------------------------------------------------------- /site/_pages/sample-batch-job.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /samples/batch-job/ 3 | title: "AppWrapper Containing a Batch Job" 4 | classes: wide 5 | --- 6 | 7 | 8 | ```yaml 9 | apiVersion: workload.codeflare.dev/v1beta2 10 | kind: AppWrapper 11 | metadata: 12 | name: sample-job 13 | labels: 14 | kueue.x-k8s.io/queue-name: default-queue 15 | spec: 16 | components: 17 | - template: 18 | apiVersion: batch/v1 19 | kind: Job 20 | metadata: 21 | name: sample-job 22 | spec: 23 | template: 24 | spec: 25 | restartPolicy: Never 26 | containers: 27 | - name: busybox 28 | image: quay.io/project-codeflare/busybox:1.36 29 | command: ["sh", "-c", "sleep 30"] 30 | resources: 31 | requests: 32 | cpu: 1 33 | 34 | ``` 35 | -------------------------------------------------------------------------------- /site/_pages/sample-pytorch.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /samples/pytorch/ 3 | title: "AppWrapper Containing PyTorch Job" 4 | classes: wide 5 | --- 6 | 7 | 8 | ```yaml 9 | apiVersion: 
workload.codeflare.dev/v1beta2 10 | kind: AppWrapper 11 | metadata: 12 | name: sample-pytorch-job 13 | labels: 14 | kueue.x-k8s.io/queue-name: default-queue 15 | spec: 16 | components: 17 | - template: 18 | apiVersion: "kubeflow.org/v1" 19 | kind: PyTorchJob 20 | metadata: 21 | name: pytorch-simple 22 | spec: 23 | pytorchReplicaSpecs: 24 | Master: 25 | replicas: 1 26 | restartPolicy: OnFailure 27 | template: 28 | spec: 29 | containers: 30 | - name: pytorch 31 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 32 | command: 33 | - "python3" 34 | - "/opt/pytorch-mnist/mnist.py" 35 | - "--epochs=1" 36 | resources: 37 | requests: 38 | cpu: 1 39 | Worker: 40 | replicas: 1 41 | restartPolicy: OnFailure 42 | template: 43 | spec: 44 | containers: 45 | - name: pytorch 46 | image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 47 | command: 48 | - "python3" 49 | - "/opt/pytorch-mnist/mnist.py" 50 | - "--epochs=1" 51 | resources: 52 | requests: 53 | cpu: 1 54 | ``` 55 | -------------------------------------------------------------------------------- /site/_pages/samples.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /samples/ 3 | title: "AppWrapper Samples" 4 | classes: wide 5 | --- 6 | 7 | Available AppWrapper Samples 8 | + [Kubernetes Batch Job](/samples/batch-job) 9 | + [PyTorch Job](/samples/pytorch) 10 | -------------------------------------------------------------------------------- /site/_pages/tag-archive.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Posts by Tag" 3 | permalink: /tags/ 4 | layout: tags 5 | author_profile: true 6 | --- 7 | -------------------------------------------------------------------------------- /site/_pages/year-archive.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Posts by Year" 3 | permalink: /posts/ 4 | layout: posts 5 | author_profile: 
true 6 | --- 7 | -------------------------------------------------------------------------------- /site/assets/js/clipboardrouge.js: -------------------------------------------------------------------------------- 1 | // Clipboard 2 | // This makes the button blink for 250 milliseconds 3 | 4 | function sleep(ms) { 5 | return new Promise(resolve => setTimeout(resolve, ms)); 6 | } 7 | 8 | async function buttonBlink(btn, style) { 9 | btn.classList.remove("btn-light"); 10 | btn.classList.add(style); 11 | await sleep(250); //Blink ms 12 | btn.classList.remove(style); 13 | btn.classList.add("btn-light"); 14 | } 15 | // End 16 | 17 | 18 | // Select highlighted code blocks 19 | var codeChunk = document.querySelectorAll("pre.highlight"); 20 | 21 | // Loop to add buttons 22 | for (var i = 0; i < codeChunk.length; i++) { 23 | 24 | var pre = codeChunk.item(i); 25 | var btn = document.createElement("button"); 26 | // Prepare button 27 | // btn.innerHTML = ""; // Icon to be displayed on the button 28 | btn.innerHTML = "Copy"; // Text to be displayed on the button 29 | 30 | // Inline styling - may be a new css class, to be added in the next section 31 | btn.style.position = "absolute"; 32 | btn.style.right = "1em"; 33 | 34 | // Button: CSS - Add new classes 35 | btn.classList.add("btn", "btn--primary"); 36 | 37 | // Identifier for ClipboardJS 38 | btn.setAttribute("data-clipboard-copy", ""); 39 | 40 | // btn.setAttribute("aria-label", "Copy to clipboard"); 41 | // etc. 
42 | 43 | // Insert button 44 | pre.insertBefore(btn, pre.firstChild); 45 | 46 | } 47 | // End 48 | 49 | // Copy to clipboard 50 | var clipboard = new ClipboardJS("[data-clipboard-copy]", { 51 | target: function (trigger) { 52 | return trigger.nextElementSibling; 53 | } 54 | }); 55 | 56 | // Messages and make the button blink 57 | clipboard.on("success", function (e) { 58 | e.clearSelection(); 59 | buttonBlink(e.trigger, "btn--success"); 60 | console.info("Action:", e.action); 61 | console.info("Text:", e.text); 62 | console.info("Trigger:", e.trigger); 63 | }); 64 | 65 | clipboard.on("error", function (e) { 66 | e.clearSelection(); 67 | buttonBlink(e.trigger, "btn--danger"); 68 | console.info("Action:", e.action); 69 | console.info("Trigger:", e.trigger); 70 | }); 71 | // Finish 72 | -------------------------------------------------------------------------------- /site/genref/config.yaml: -------------------------------------------------------------------------------- 1 | hiddenMemberFields: 2 | - "TypeMeta" 3 | - "ObjectMeta" 4 | 5 | apis: 6 | - name: appwrapper 7 | title: AppWrapper API 8 | package: github.com/project-codeflare/appwrapper 9 | path: api/v1beta2 10 | 11 | externalPackages: 12 | - match: ^k8s\.io/(api|apimachinery/pkg/apis)/ 13 | target: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#{{- 14 | lower .TypeIdentifier -}}-{{- arrIndex .PackageSegments -1 -}}-{{- arrIndex .PackageSegments 15 | -2 -}} 16 | - match: ^k8s\.io/apimachinery/pkg/runtime\.RawExtension$ 17 | target: https://pkg.go.dev/k8s.io/apimachinery/pkg/runtime#RawExtension 18 | - match: ^time\.Duration$ 19 | target: https://pkg.go.dev/time#Duration 20 | -------------------------------------------------------------------------------- /site/genref/markdown/members.tpl: -------------------------------------------------------------------------------- 1 | {{ define "members" }} 2 | {{/* . is a apiType */}} 3 | {{- range .GetMembers -}} 4 | {{/* . 
is a apiMember */}} 5 | {{- if not .Hidden }} 6 | {{ .FieldName }} 7 | {{- if not .IsOptional }} [Required]{{- end -}} 8 |
9 | {{/* Link for type reference */}} 10 | {{- with .GetType -}} 11 | {{- if .Link -}} 12 | {{ .DisplayName }} 13 | {{- else -}} 14 | {{ .DisplayName }} 15 | {{- end -}} 16 | {{- end }} 17 | 18 | 19 | {{- if .IsInline -}} 20 | (Members of {{ .FieldName }} are embedded into this type.) 21 | {{- end }} 22 | {{ if .GetComment -}} 23 | {{ .GetComment }} 24 | {{- else -}} 25 | No description provided. 26 | {{- end }} 27 | {{- if and (eq (.GetType.Name.Name) "ObjectMeta") -}} 28 | Refer to the Kubernetes API documentation for the fields of the metadata field. 29 | {{- end -}} 30 | 31 | 32 | {{- end }} 33 | {{- end }} 34 | {{ end }} 35 | -------------------------------------------------------------------------------- /site/genref/markdown/pkg.tpl: -------------------------------------------------------------------------------- 1 | {{ define "packages" -}} 2 | 3 | {{- range $idx, $val := .packages -}} 4 | {{- if .IsMain -}} 5 | --- 6 | permalink: /api/{{ .DisplayName }}/ 7 | title: {{ .Title }} 8 | classes: wide 9 | description: Generated API reference documentation for {{ .DisplayName }}. 10 | --- 11 | {{ .GetComment -}} 12 | {{- end -}} 13 | {{- end }} 14 | 15 | ## Resource Types 16 | 17 | {{ range .packages -}} 18 | {{- range .VisibleTypes -}} 19 | - [{{ .DisplayName }}]({{ .Link }}) 20 | {{ end -}} 21 | {{- end -}} 22 | 23 | {{ range .packages -}} 24 | {{ if ne .GroupName "" -}} 25 | {{/* For package with a group name, list all type definitions in it. */}} 26 | {{- range .VisibleTypes }} 27 | {{- if or .Referenced .IsExported -}} 28 | {{ template "type" . }} 29 | {{- end -}} 30 | {{ end }} 31 | {{ else }} 32 | {{/* For package w/o group name, list only types referenced. */}} 33 | {{- range .VisibleTypes -}} 34 | {{- if .Referenced -}} 35 | {{ template "type" . 
}} 36 | {{- end -}} 37 | {{- end }} 38 | {{- end }} 39 | {{- end }} 40 | {{- end }} 41 | -------------------------------------------------------------------------------- /site/genref/markdown/type.tpl: -------------------------------------------------------------------------------- 1 | {{ define "type" }} 2 | 3 | ## `{{ .Name.Name }}` {#{{ .Anchor }}} 4 | 5 | {{ if eq .Kind "Alias" -}} 6 | (Alias of `{{ .Underlying }}`) 7 | {{ end }} 8 | 9 | {{- with .References }} 10 | **Appears in:** 11 | {{ range . }} 12 | {{ if or .Referenced .IsExported -}} 13 | - [{{ .DisplayName }}]({{ .Link }}) 14 | {{ end -}} 15 | {{- end -}} 16 | {{- end }} 17 | 18 | {{ if .GetComment -}} 19 | {{ .GetComment }} 20 | {{ end }} 21 | {{ if .GetMembers -}} 22 | 23 | 24 | 25 | {{/* . is a apiType */}} 26 | {{- if .IsExported -}} 27 | {{/* Add apiVersion and kind rows if deemed necessary */}} 28 | 29 | 30 | {{- end -}} 31 | {{/* The actual list of members is in the following template */}} 32 | {{- template "members" . -}} 33 | 34 |
FieldDescription
apiVersion
string
{{- .APIGroup -}}
kind
string
{{- .Name.Name -}}
35 | {{- end -}} 36 | {{- end -}} 37 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # AppWrapper End-to-End Tests 2 | 3 | This directory contains both go and kuttl tests suites that are 4 | designed to be run against an AppWrapper operator deployed on a 5 | Kubernetes cluster with Kueue and the Kubeflow operator installed. 6 | 7 | The [../hack/](../hack) directory contains scripts that can be used to 8 | create an appropriately configured test cluster using `kind` and to run 9 | the tests. 10 | -------------------------------------------------------------------------------- /test/e2e/e2e_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | "sigs.k8s.io/controller-runtime/pkg/log" 24 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 25 | 26 | . "github.com/onsi/ginkgo/v2" 27 | . 
"github.com/onsi/gomega" 28 | ) 29 | 30 | var ctx context.Context 31 | 32 | var _ = BeforeSuite(func() { 33 | log.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 34 | ctx = extendContextWithClient(context.Background()) 35 | ensureNamespaceExists(ctx) 36 | ctx = extendContextWithLimitedClient(ctx) 37 | if Label("Kueue").MatchesLabelFilter(GinkgoLabelFilter()) { 38 | ensureTestQueuesExist(ctx) 39 | } 40 | }) 41 | 42 | func TestE2E(t *testing.T) { 43 | RegisterFailHandler(Fail) 44 | RunSpecs(t, "AppWrapper Test Suite") 45 | } 46 | -------------------------------------------------------------------------------- /test/e2e/metrics_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 IBM Corporation. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "fmt" 21 | "os/exec" 22 | "time" 23 | 24 | corev1 "k8s.io/api/core/v1" 25 | rbacv1 "k8s.io/api/rbac/v1" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | "sigs.k8s.io/controller-runtime/pkg/client" 28 | 29 | . "github.com/onsi/ginkgo/v2" 30 | . 
"github.com/onsi/gomega"
31 | )
32 |
33 | // Namespace and object names used by the Metrics spec below.
34 | // NOTE(review): presumably these match the deployed operator manifests — confirm.
35 | const (
36 | 	managerNamespace             = "appwrapper-system"
37 | 	serviceAccountName           = "appwrapper-controller-manager"
38 | 	metricsReaderClusterRoleName = "appwrapper-metrics-reader"
39 | 	metricsServiceName           = "appwrapper-controller-manager-metrics-service"
40 | )
41 |
42 | // Metrics spec: binds the metrics-reader ClusterRole to the manager's service
43 | // account, starts a curl pod that runs as that service account, and polls the
44 | // metrics service over HTTPS until every expected metric name shows up in the
45 | // response. Both the ClusterRoleBinding and the pod are removed via DeferCleanup.
46 | var _ = Describe("Metrics", Label("Metrics"), func() {
47 | 	// Upper bounds for the two polling loops. The curl pod only sleeps for
48 | 	// 3600s, so waiting anywhere near that long for a scrape can never help;
49 | 	// a short bound makes a broken endpoint fail the suite quickly instead.
50 | 	const (
51 | 		podRunningTimeout    = 60 * time.Second
52 | 		metricsScrapeTimeout = 2 * time.Minute
53 | 	)
54 |
55 | 	It("should ensure the metrics endpoint is serving metrics", Label("Metrics"), func() {
56 | 		By("Creating a ClusterRoleBinding for the service account to allow access to metrics")
57 | 		metricsReaderClusterRoleBinding := &rbacv1.ClusterRoleBinding{
58 | 			ObjectMeta: metav1.ObjectMeta{Name: "e2e-test-aw-metrics-reader-crb"},
59 | 			Subjects: []rbacv1.Subject{
60 | 				{
61 | 					Kind:      "ServiceAccount",
62 | 					Name:      serviceAccountName,
63 | 					Namespace: managerNamespace,
64 | 				},
65 | 			},
66 | 			RoleRef: rbacv1.RoleRef{
67 | 				APIGroup: rbacv1.GroupName,
68 | 				Kind:     "ClusterRole",
69 | 				Name:     metricsReaderClusterRoleName,
70 | 			},
71 | 		}
72 | 		Expect(getClient(ctx).Create(ctx, metricsReaderClusterRoleBinding)).Should(Succeed())
73 | 		DeferCleanup(func() {
74 | 			By("Deleting the ClusterRoleBinding", func() {
75 | 				Expect(getClient(ctx).Delete(ctx, metricsReaderClusterRoleBinding)).To(Succeed())
76 | 			})
77 | 		})
78 |
79 | 		By("Creating the curl-metrics pod using a service account that can access the metrics endpoint")
80 | 		pod := &corev1.Pod{
81 | 			ObjectMeta: metav1.ObjectMeta{Name: "curl-metrics", Namespace: managerNamespace},
82 | 			Spec: corev1.PodSpec{
83 | 				ServiceAccountName: serviceAccountName,
84 | 				Containers: []corev1.Container{{
85 | 					Name:  "curl",
86 | 					Image: "quay.io/curl/curl:8.11.1",
87 | 					// Keep the container alive so the test can `kubectl exec` into it.
88 | 					Command: []string{"sleep", "3600"},
89 | 				}},
90 | 			},
91 | 		}
92 | 		Expect(getClient(ctx).Create(ctx, pod)).Should(Succeed())
93 | 		DeferCleanup(func() {
94 | 			By("Deleting the pod", func() {
95 | 				Expect(getClient(ctx).Delete(ctx, pod)).Should(Succeed())
96 | 			})
97 | 		})
98 |
99 | 		By("Waiting for the curl-metrics pod to be running.", func() {
100 | 			Eventually(func(g Gomega) {
101 | 				createdPod := &corev1.Pod{}
102 | 				g.Expect(getClient(ctx).Get(ctx, client.ObjectKeyFromObject(pod), createdPod)).To(Succeed())
103 | 				g.Expect(createdPod.Status.Phase).To(Equal(corev1.PodRunning))
104 | 			}, podRunningTimeout).Should(Succeed())
105 | 		})
106 |
107 | 		// Metric names that must appear in the scraped output.
108 | 		metrics := []string{
109 | 			"controller_runtime_reconcile_total",
110 | 		}
111 |
112 | 		By("Getting the metrics via curl", func() {
113 | 			Eventually(func(g Gomega) {
114 | 				// Runs curl inside the pod, authenticating with the pod's mounted
115 | 				// service account token; -k skips TLS certificate verification.
116 | 				cmd := exec.Command("kubectl", "exec", "-n", managerNamespace, pod.Name, "--", "/bin/sh", "-c",
117 | 					fmt.Sprintf(
118 | 						"curl -v -k -H \"Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)\" https://%s.%s.svc.cluster.local:8443/metrics ",
119 | 						metricsServiceName, managerNamespace,
120 | 					),
121 | 				)
122 | 				metricsOutput, err := cmd.CombinedOutput()
123 | 				// Attach the combined stdout/stderr so a failing exec is diagnosable.
124 | 				g.Expect(err).NotTo(HaveOccurred(), "kubectl exec output: %s", metricsOutput)
125 | 				for _, metric := range metrics {
126 | 					g.Expect(string(metricsOutput)).To(ContainSubstring(metric))
127 | 				}
128 | 			}, metricsScrapeTimeout).Should(Succeed())
129 | 		})
130 | 	})
131 | })
132 |
--------------------------------------------------------------------------------