├── .dockerignore ├── .gitignore ├── .golangci.yml ├── Dockerfile ├── Makefile ├── PROJECT ├── README.md ├── api └── v1beta1 │ ├── daemonsetpool_types.go │ ├── deploymentpool_types.go │ ├── groupversion_info.go │ ├── jobpool_types.go │ ├── nodepool_types.go │ ├── podpool_types.go │ ├── statefulsetpool_types.go │ └── zz_generated.deepcopy.go ├── cmd └── main.go ├── config ├── crd │ ├── bases │ │ ├── kwok.sigs.run-ai.com_daemonsetpools.yaml │ │ ├── kwok.sigs.run-ai.com_deploymentpools.yaml │ │ ├── kwok.sigs.run-ai.com_jobpools.yaml │ │ ├── kwok.sigs.run-ai.com_nodepools.yaml │ │ ├── kwok.sigs.run-ai.com_podpools.yaml │ │ └── kwok.sigs.run-ai.com_statefulsetpools.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── default │ ├── kustomization.yaml │ └── manager_auth_proxy_patch.yaml ├── manager │ ├── kustomization.yaml │ └── manager.yaml ├── prometheus │ ├── kustomization.yaml │ └── monitor.yaml ├── rbac │ ├── auth_proxy_client_clusterrole.yaml │ ├── auth_proxy_role.yaml │ ├── auth_proxy_role_binding.yaml │ ├── auth_proxy_service.yaml │ ├── daemonsetpool_editor_role.yaml │ ├── daemonsetpool_viewer_role.yaml │ ├── deploymentpool_editor_role.yaml │ ├── deploymentpool_viewer_role.yaml │ ├── jobpool_editor_role.yaml │ ├── jobpool_viewer_role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ ├── leader_election_role_binding.yaml │ ├── nodepool_editor_role.yaml │ ├── nodepool_viewer_role.yaml │ ├── podpool_editor_role.yaml │ ├── podpool_viewer_role.yaml │ ├── role.yaml │ ├── role_binding.yaml │ ├── service_account.yaml │ ├── statefulsetpool_editor_role.yaml │ └── statefulsetpool_viewer_role.yaml ├── samples │ ├── kustomization.yaml │ ├── kwok.sigs_v1beta1_daemonsetpool.yaml │ ├── kwok.sigs_v1beta1_deploymentpool.yaml │ ├── kwok.sigs_v1beta1_jobpool.yaml │ ├── kwok.sigs_v1beta1_nodepool.yaml │ ├── kwok.sigs_v1beta1_podpool.yaml │ └── kwok.sigs_v1beta1_statefulsetpool.yaml └── scorecard │ ├── bases │ └── config.yaml │ ├── kustomization.yaml │ └── patches │ ├── basic.config.yaml │ └── olm.config.yaml ├── go.mod ├── go.sum ├── hack └── boilerplate.go.txt ├── install_kwok.sh ├── internal └── controller │ ├── daemonsetpool_controller.go │ ├── daemonsetpool_controller_test.go │ ├── deploymentpool_controller.go │ ├── deploymentpool_controller_test.go │ ├── global.go │ ├── jobpool_controller.go │ ├── jobpool_controller_test.go │ ├── nodepool_controller.go │ ├── nodepool_controller_test.go │ ├── podpool_controller.go │ ├── podpool_controller_test.go │ ├── statefulsetpool_controller.go │ ├── statefulsetpool_controller_test.go │ └── suite_test.go └── test ├── e2e ├── e2e_suite_test.go └── e2e_test.go └── utils └── utils.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 
3 | bin/ 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin/* 9 | Dockerfile.cross 10 | kwok-operator.yaml 11 | 12 | # Test binary, built with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # Go workspace file 19 | go.work 20 | 21 | # Kubernetes Generated files - skip generated files, except for vendored files 22 | !vendor/**/zz_generated.* 23 | 24 | # editor and IDE paraphernalia 25 | .idea 26 | .vscode 27 | *.swp 28 | *.swo 29 | *~ 30 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | deadline: 5m 3 | allow-parallel-runners: true 4 | 5 | issues: 6 | # don't skip warning about doc comments 7 | # don't exclude the default set of lint 8 | exclude-use-default: false 9 | # restore some of the defaults 10 | # (fill in the rest as needed) 11 | exclude-rules: 12 | - path: "api/*" 13 | linters: 14 | - lll 15 | - path: "internal/*" 16 | linters: 17 | - dupl 18 | - lll 19 | linters: 20 | disable-all: true 21 | enable: 22 | - dupl 23 | - errcheck 24 | - exportloopref 25 | - goconst 26 | - gocyclo 27 | - gofmt 28 | - goimports 29 | - gosimple 30 | - govet 31 | - ineffassign 32 | - lll 33 | - misspell 34 | - nakedret 35 | - prealloc 36 | - staticcheck 37 | - typecheck 38 | - unconvert 39 | - unparam 40 | - unused 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM golang:1.22 AS builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | WORKDIR /workspace 7 | # Copy the Go Modules manifests 8 | COPY go.mod go.mod 9 | COPY go.sum go.sum 10 | # cache deps before building and copying source so that we don't need to re-download as much 11 | # and so that source changes don't invalidate our downloaded layer 12 | RUN go mod download 13 | 14 | # Copy the go source 15 | COPY cmd/main.go cmd/main.go 16 | COPY api/ api/ 17 | COPY internal/controller/ internal/controller/ 18 | 19 | # Build 20 | # the GOARCH has not a default value to allow the binary be built according to the host where the command 21 | # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO 22 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, 23 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 24 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go 25 | 26 | # Use distroless as minimal base image to package the manager binary 27 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 28 | FROM gcr.io/distroless/static:nonroot 29 | WORKDIR / 30 | COPY --from=builder /workspace/manager . 31 | USER 65532:65532 32 | 33 | ENTRYPOINT ["/manager"] 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # VERSION defines the project version for the bundle. 
2 | # Update this value when you upgrade the version of your project. 3 | # To re-generate a bundle for another specific version without changing the standard setup, you can: 4 | # - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2) 5 | # - use environment variables to overwrite this value (e.g export VERSION=0.0.2) 6 | VERSION ?= 1.0.1 7 | 8 | # CHANNELS define the bundle channels used in the bundle. 9 | # Add a new line here if you would like to change its default config. (E.g CHANNELS = "candidate,fast,stable") 10 | # To re-generate a bundle for other specific channels without changing the standard setup, you can: 11 | # - use the CHANNELS as arg of the bundle target (e.g make bundle CHANNELS=candidate,fast,stable) 12 | # - use environment variables to overwrite this value (e.g export CHANNELS="candidate,fast,stable") 13 | ifneq ($(origin CHANNELS), undefined) 14 | BUNDLE_CHANNELS := --channels=$(CHANNELS) 15 | endif 16 | 17 | # DEFAULT_CHANNEL defines the default channel used in the bundle. 18 | # Add a new line here if you would like to change its default config. (E.g DEFAULT_CHANNEL = "stable") 19 | # To re-generate a bundle for any other default channel without changing the default setup, you can: 20 | # - use the DEFAULT_CHANNEL as arg of the bundle target (e.g make bundle DEFAULT_CHANNEL=stable) 21 | # - use environment variables to overwrite this value (e.g export DEFAULT_CHANNEL="stable") 22 | ifneq ($(origin DEFAULT_CHANNEL), undefined) 23 | BUNDLE_DEFAULT_CHANNEL := --default-channel=$(DEFAULT_CHANNEL) 24 | endif 25 | BUNDLE_METADATA_OPTS ?= $(BUNDLE_CHANNELS) $(BUNDLE_DEFAULT_CHANNEL) 26 | 27 | # IMAGE_TAG_BASE defines the docker.io namespace and part of the image name for remote images. 28 | # This variable is used to construct full image tags for bundle and catalog images. 29 | # 30 | # For example, running 'make bundle-build bundle-push catalog-build catalog-push' will build and push both 31 | # run-ai.com/kwok-operator-bundle:$VERSION and run-ai.com/kwok-operator-catalog:$VERSION. 32 | IMAGE_TAG_BASE ?= run-ai.com/kwok-operator 33 | 34 | # BUNDLE_IMG defines the image:tag used for the bundle. 35 | # You can use it as an arg. (E.g make bundle-build BUNDLE_IMG=/:) 36 | BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:v$(VERSION) 37 | 38 | # BUNDLE_GEN_FLAGS are the flags passed to the operator-sdk generate bundle command 39 | BUNDLE_GEN_FLAGS ?= -q --overwrite --version $(VERSION) $(BUNDLE_METADATA_OPTS) 40 | 41 | # USE_IMAGE_DIGESTS defines if images are resolved via tags or digests 42 | # You can enable this value if you would like to use SHA Based Digests 43 | # To enable set flag to true 44 | USE_IMAGE_DIGESTS ?= false 45 | ifeq ($(USE_IMAGE_DIGESTS), true) 46 | BUNDLE_GEN_FLAGS += --use-image-digests 47 | endif 48 | 49 | # Set the Operator SDK version to use. By default, what is installed on the system is used. 50 | # This is useful for CI or a project to utilize a specific version of the operator-sdk toolkit. 51 | OPERATOR_SDK_VERSION ?= v1.34.2 52 | 53 | # Image URL to use all building/pushing image targets 54 | IMG ?= docker.io/runaidevops/kwok-operator:${VERSION} 55 | # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 
56 | ENVTEST_K8S_VERSION = 1.28.3 57 | 58 | # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) 59 | ifeq (,$(shell go env GOBIN)) 60 | GOBIN=$(shell go env GOPATH)/bin 61 | else 62 | GOBIN=$(shell go env GOBIN) 63 | endif 64 | 65 | # CONTAINER_TOOL defines the container tool to be used for building images. 66 | # Be aware that the target commands are only tested with Docker which is 67 | # scaffolded by default. However, you might want to replace it to use other 68 | # tools. (i.e. podman) 69 | CONTAINER_TOOL ?= docker 70 | 71 | # Setting SHELL to bash allows bash commands to be executed by recipes. 72 | # Options are set to exit when a recipe line exits non-zero or a piped command fails. 73 | SHELL = /usr/bin/env bash -o pipefail 74 | .SHELLFLAGS = -ec 75 | 76 | .PHONY: all 77 | all: build 78 | 79 | ##@ General 80 | 81 | # The help target prints out all targets with their descriptions organized 82 | # beneath their categories. The categories are represented by '##@' and the 83 | # target descriptions by '##'. The awk command is responsible for reading the 84 | # entire set of makefiles included in this invocation, looking for lines of the 85 | # file as xyz: ## something, and then pretty-format the target and help. Then, 86 | # if there's a line with ##@ something, that gets pretty-printed as a category. 87 | # More info on the usage of ANSI control characters for terminal formatting: 88 | # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters 89 | # More info on the awk command: 90 | # http://linuxcommand.org/lc3_adv_awk.php 91 | 92 | .PHONY: help 93 | help: ## Display this help. 94 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 95 | 96 | ##@ Development 97 | 98 | .PHONY: manifests 99 | manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. 100 | $(CONTROLLER_GEN) crd:generateEmbeddedObjectMeta=true webhook paths="./..." output:crd:artifacts:config=config/crd/bases 101 | 102 | .PHONY: generate 103 | generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. 104 | $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 105 | 106 | .PHONY: fmt 107 | fmt: ## Run go fmt against code. 108 | go fmt ./... 109 | 110 | .PHONY: vet 111 | vet: ## Run go vet against code. 112 | go vet ./... 113 | 114 | .PHONY: test 115 | test: manifests generate fmt vet envtest ## Run tests. 116 | KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out 117 | 118 | # Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors. 119 | .PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up. 
120 | test-e2e: 121 | go test ./test/e2e/ -v -ginkgo.v 122 | 123 | GOLANGCI_LINT = $(shell pwd)/bin/golangci-lint 124 | GOLANGCI_LINT_VERSION ?= v1.54.2 125 | golangci-lint: 126 | @[ -f $(GOLANGCI_LINT) ] || { \ 127 | set -e ;\ 128 | curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell dirname $(GOLANGCI_LINT)) $(GOLANGCI_LINT_VERSION) ;\ 129 | } 130 | 131 | .PHONY: lint 132 | lint: golangci-lint ## Run golangci-lint linter & yamllint 133 | $(GOLANGCI_LINT) run 134 | 135 | .PHONY: lint-fix 136 | lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes 137 | $(GOLANGCI_LINT) run --fix 138 | 139 | ##@ Build 140 | 141 | .PHONY: build 142 | build: manifests generate fmt vet ## Build manager binary. 143 | go build -o bin/manager cmd/main.go 144 | 145 | .PHONY: run 146 | run: manifests generate fmt vet ## Run a controller from your host. 147 | go run ./cmd/main.go 148 | 149 | # If you wish to build the manager image targeting other platforms you can use the --platform flag. 150 | # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. 151 | # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ 152 | .PHONY: docker-build 153 | docker-build: ## Build docker image with the manager. 154 | $(CONTAINER_TOOL) build -t ${IMG} . 155 | 156 | .PHONY: docker-push 157 | docker-push: ## Push docker image with the manager. 158 | $(CONTAINER_TOOL) push ${IMG} 159 | 160 | # PLATFORMS defines the target platforms for the manager image be built to provide support to multiple 161 | # architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to: 162 | # - be able to use docker buildx. More info: https://docs.docker.com/build/buildx/ 163 | # - have enabled BuildKit. More info: https://docs.docker.com/develop/develop-images/build_enhancements/ 164 | # - be able to push the image to your registry (i.e. if you do not set a valid value via IMG=> then the export will fail) 165 | # To adequately provide solutions that are compatible with multiple platforms, you should consider using this option. 166 | PLATFORMS ?= linux/arm64,linux/amd64,linux/s390x,linux/ppc64le 167 | .PHONY: docker-buildx 168 | docker-buildx: ## Build and push docker image for the manager for cross-platform support 169 | # copy existing Dockerfile and insert --platform=${BUILDPLATFORM} into Dockerfile.cross, and preserve the original Dockerfile 170 | sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross 171 | - $(CONTAINER_TOOL) buildx create --name project-v3-builder 172 | $(CONTAINER_TOOL) buildx use project-v3-builder 173 | - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . 174 | - $(CONTAINER_TOOL) buildx rm project-v3-builder 175 | rm Dockerfile.cross 176 | 177 | ##@ Deployment 178 | 179 | ifndef ignore-not-found 180 | ignore-not-found = false 181 | endif 182 | 183 | .PHONY: install 184 | install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. 185 | $(KUSTOMIZE) build config/crd | $(KUBECTL) apply --server-side -f - 186 | 187 | .PHONY: uninstall 188 | uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 
189 | $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - 190 | 191 | .PHONY: deploy 192 | deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 193 | cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} 194 | $(KUSTOMIZE) build config/default | $(KUBECTL) create -f - 195 | 196 | .PHONY: undeploy 197 | undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 198 | $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - 199 | 200 | ##@ Build Dependencies 201 | 202 | ## Location to install dependencies to 203 | LOCALBIN ?= $(shell pwd)/bin 204 | $(LOCALBIN): 205 | mkdir -p $(LOCALBIN) 206 | 207 | ## Tool Binaries 208 | KUBECTL ?= kubectl 209 | KUSTOMIZE ?= $(LOCALBIN)/kustomize 210 | CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen 211 | ENVTEST ?= $(LOCALBIN)/setup-envtest 212 | 213 | ## Tool Versions 214 | KUSTOMIZE_VERSION ?= v5.2.1 215 | CONTROLLER_TOOLS_VERSION ?= v0.15.0 216 | 217 | .PHONY: kustomize 218 | kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. If wrong version is installed, it will be removed before downloading. 219 | $(KUSTOMIZE): $(LOCALBIN) 220 | @if test -x $(LOCALBIN)/kustomize && ! $(LOCALBIN)/kustomize version | grep -q $(KUSTOMIZE_VERSION); then \ 221 | echo "$(LOCALBIN)/kustomize version is not expected $(KUSTOMIZE_VERSION). Removing it before installing."; \ 222 | rm -rf $(LOCALBIN)/kustomize; \ 223 | fi 224 | test -s $(LOCALBIN)/kustomize || GOBIN=$(LOCALBIN) GO111MODULE=on go install sigs.k8s.io/kustomize/kustomize/v5@$(KUSTOMIZE_VERSION) 225 | 226 | .PHONY: controller-gen 227 | controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. If wrong version is installed, it will be overwritten. 228 | $(CONTROLLER_GEN): $(LOCALBIN) 229 | test -s $(LOCALBIN)/controller-gen && $(LOCALBIN)/controller-gen --version | grep -q $(CONTROLLER_TOOLS_VERSION) || \ 230 | GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION) 231 | 232 | .PHONY: envtest 233 | envtest: $(ENVTEST) ## Download envtest-setup locally if necessary. 234 | $(ENVTEST): $(LOCALBIN) 235 | test -s $(LOCALBIN)/setup-envtest || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest 236 | 237 | .PHONY: operator-sdk 238 | OPERATOR_SDK ?= $(LOCALBIN)/operator-sdk 239 | operator-sdk: ## Download operator-sdk locally if necessary. 240 | ifeq (,$(wildcard $(OPERATOR_SDK))) 241 | ifeq (, $(shell which operator-sdk 2>/dev/null)) 242 | @{ \ 243 | set -e ;\ 244 | mkdir -p $(dir $(OPERATOR_SDK)) ;\ 245 | OS=$(shell go env GOOS) && ARCH=$(shell go env GOARCH) && \ 246 | curl -sSLo $(OPERATOR_SDK) https://github.com/operator-framework/operator-sdk/releases/download/$(OPERATOR_SDK_VERSION)/operator-sdk_$${OS}_$${ARCH} ;\ 247 | chmod +x $(OPERATOR_SDK) ;\ 248 | } 249 | else 250 | OPERATOR_SDK = $(shell which operator-sdk) 251 | endif 252 | endif 253 | 254 | .PHONY: bundle 255 | bundle: manifests kustomize operator-sdk ## Generate bundle manifests and metadata, then validate generated files. 
256 | $(OPERATOR_SDK) generate kustomize manifests -q 257 | cd config/manager && $(KUSTOMIZE) edit set image controller=$(IMG) 258 | $(KUSTOMIZE) build config/manifests | $(OPERATOR_SDK) generate bundle $(BUNDLE_GEN_FLAGS) 259 | $(OPERATOR_SDK) bundle validate ./bundle 260 | 261 | .PHONY: bundle-build 262 | bundle-build: ## Build the bundle image. 263 | docker build -f bundle.Dockerfile -t $(BUNDLE_IMG) . 264 | 265 | .PHONY: bundle-push 266 | bundle-push: ## Push the bundle image. 267 | $(MAKE) docker-push IMG=$(BUNDLE_IMG) 268 | 269 | .PHONY: opm 270 | OPM = $(LOCALBIN)/opm 271 | opm: ## Download opm locally if necessary. 272 | ifeq (,$(wildcard $(OPM))) 273 | ifeq (,$(shell which opm 2>/dev/null)) 274 | @{ \ 275 | set -e ;\ 276 | mkdir -p $(dir $(OPM)) ;\ 277 | OS=$(shell go env GOOS) && ARCH=$(shell go env GOARCH) && \ 278 | curl -sSLo $(OPM) https://github.com/operator-framework/operator-registry/releases/download/v1.23.0/$${OS}-$${ARCH}-opm ;\ 279 | chmod +x $(OPM) ;\ 280 | } 281 | else 282 | OPM = $(shell which opm) 283 | endif 284 | endif 285 | 286 | # A comma-separated list of bundle images (e.g. make catalog-build BUNDLE_IMGS=example.com/operator-bundle:v0.1.0,example.com/operator-bundle:v0.2.0). 287 | # These images MUST exist in a registry and be pull-able. 288 | BUNDLE_IMGS ?= $(BUNDLE_IMG) 289 | 290 | # The image tag given to the resulting catalog image (e.g. make catalog-build CATALOG_IMG=example.com/operator-catalog:v0.2.0). 291 | CATALOG_IMG ?= $(IMAGE_TAG_BASE)-catalog:v$(VERSION) 292 | 293 | # Set CATALOG_BASE_IMG to an existing catalog image tag to add $BUNDLE_IMGS to that image. 294 | ifneq ($(origin CATALOG_BASE_IMG), undefined) 295 | FROM_INDEX_OPT := --from-index $(CATALOG_BASE_IMG) 296 | endif 297 | 298 | # Build a catalog image by adding bundle images to an empty catalog using the operator package manager tool, 'opm'. 299 | # This recipe invokes 'opm' in 'semver' bundle add mode. For more information on add modes, see: 300 | # https://github.com/operator-framework/community-operators/blob/7f1438c/docs/packaging-operator.md#updating-your-existing-operator 301 | .PHONY: catalog-build 302 | catalog-build: opm ## Build a catalog image. 303 | $(OPM) index add --container-tool docker --mode semver --tag $(CATALOG_IMG) --bundles $(BUNDLE_IMGS) $(FROM_INDEX_OPT) 304 | 305 | # Push the catalog image. 306 | .PHONY: catalog-push 307 | catalog-push: ## Push a catalog image. 308 | $(MAKE) docker-push IMG=$(CATALOG_IMG) 309 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 
4 | # More info: https://book.kubebuilder.io/reference/project-config.html
5 | domain: run-ai.com
6 | layout:
7 | - go.kubebuilder.io/v4
8 | plugins:
9 |   manifests.sdk.operatorframework.io/v2: {}
10 |   scorecard.sdk.operatorframework.io/v2: {}
11 | projectName: kwok-operator
12 | repo: github.com/run-ai/kwok-operator
13 | resources:
14 | - api:
15 |     crdVersion: v1
16 |     namespaced: true
17 |   controller: true
18 |   domain: run-ai.com
19 |   group: kwok.sigs
20 |   kind: NodePool
21 |   path: github.com/run-ai/kwok-operator/api/v1beta1
22 |   version: v1beta1
23 | - api:
24 |     crdVersion: v1
25 |     namespaced: true
26 |   controller: true
27 |   domain: run-ai.com
28 |   group: kwok.sigs
29 |   kind: DeploymentPool
30 |   path: github.com/run-ai/kwok-operator/api/v1beta1
31 |   version: v1beta1
32 | - api:
33 |     crdVersion: v1
34 |     namespaced: true
35 |   controller: true
36 |   domain: run-ai.com
37 |   group: kwok.sigs
38 |   kind: PodPool
39 |   path: github.com/run-ai/kwok-operator/api/v1beta1
40 |   version: v1beta1
41 | - api:
42 |     crdVersion: v1
43 |     namespaced: true
44 |   controller: true
45 |   domain: run-ai.com
46 |   group: kwok.sigs
47 |   kind: JobPool
48 |   path: github.com/run-ai/kwok-operator/api/v1beta1
49 |   version: v1beta1
50 | - api:
51 |     crdVersion: v1
52 |     namespaced: true
53 |   controller: true
54 |   domain: run-ai.com
55 |   group: kwok.sigs
56 |   kind: DaemonsetPool
57 |   path: github.com/run-ai/kwok-operator/api/v1beta1
58 |   version: v1beta1
59 | - api:
60 |     crdVersion: v1
61 |     namespaced: true
62 |   controller: true
63 |   domain: run-ai.com
64 |   group: kwok.sigs
65 |   kind: StatefulsetPool
66 |   path: github.com/run-ai/kwok-operator/api/v1beta1
67 |   version: v1beta1
68 | version: "3"
69 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kwok Operator
2 |
3 | ## Overview
4 |
5 | The Kwok Operator is a Kubernetes operator designed to create virtual nodes within a Kubernetes cluster using Kwok, by applying custom resource definitions (CRDs) for node pools.
6 |
7 | ## Tested
8 | The Kwok Operator has been tested on top of the following Kubernetes flavors:
9 | - Vanilla
10 | - EKS (Elastic Kubernetes Service)
11 | - GKE (Google Kubernetes Engine)
12 | - AKS (Azure Kubernetes Service)
13 | - RKE1
14 | - RKE2
15 | - OpenShift
16 | - Kind
17 |
18 | ## Features
19 |
20 | - Automatically creates virtual nodes on Kwok infrastructure.
21 | - Utilizes Kubernetes Custom Resource Definitions (CRDs) for easy configuration.
22 | - Provides seamless integration with Kubernetes clusters.
23 |
24 | ## Prerequisites
25 |
26 | Before using the Kwok Operator, ensure you have the following prerequisites:
27 |
28 | - A Kubernetes cluster (tested on version 1.24 or later)
29 | - Kwok infrastructure set up and accessible from the cluster
30 | - kubectl CLI installed and configured to access the Kubernetes cluster
31 |
32 | ## Installation
33 |
34 | To install the Kwok CRDs and the Kwok Operator, follow these steps:
35 |
36 | 1. Clone the Kwok Operator repository:
37 |
38 | ```shell
39 | git clone git@github.com:run-ai/kwok-operator.git
40 | ```
41 | 2. Enter the kwok-operator directory:
42 | ```shell
43 | cd kwok-operator
44 | ```
45 | 3. Make sure Kwok is installed in your cluster, as described at https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/,
46 | or install it by running the install_kwok.sh script:
47 | ```shell
48 | ./install_kwok.sh
49 | ```
50 |
51 | 4.
Apply the kwok-operator Kubernetes manifests:
52 | ```shell
53 | kubectl apply --server-side -k config/default
54 | ```
55 | or
56 | ```shell
57 | kubectl apply --server-side -f https://github.com/run-ai/kwok-operator/releases/download/1.0.1/kwok-operator.yaml
58 | ```
59 | ## Usage
60 |
61 | To use the Kwok Operator to provision nodes, follow these steps:
62 |
63 | 1. Define a NodePool custom resource (CR) with your desired configuration. Example:
64 |
65 | ```yaml
66 | apiVersion: kwok.sigs.run-ai.com/v1beta1
67 | kind: NodePool
68 | metadata:
69 |   labels:
70 |     app.kubernetes.io/name: nodepool
71 |     app.kubernetes.io/instance: nodepool-sample
72 |     app.kubernetes.io/part-of: kwok-operator
73 |     app.kubernetes.io/managed-by: kustomize
74 |     app.kubernetes.io/created-by: kwok-operator
75 |   name: nodepool-sample
76 | spec:
77 |   nodeCount: 15
78 |   nodeTemplate:
79 |     apiVersion: v1
80 |     metadata:
81 |       annotations:
82 |         node.alpha.kubernetes.io/ttl: "0"
83 |       labels:
84 |         kubernetes.io/hostname: kwok-node
85 |         kubernetes.io/role: agent
86 |         type: kwok
87 |     spec: {}
88 |     status:
89 |       allocatable:
90 |         cpu: 32
91 |         memory: 256Gi
92 |         pods: 110
93 |       capacity:
94 |         cpu: 32
95 |         memory: 256Gi
96 |         pods: 110
97 |       nodeInfo:
98 |         architecture: amd64
99 |         bootID: ""
100 |         containerRuntimeVersion: ""
101 |         kernelVersion: ""
102 |         kubeProxyVersion: fake
103 |         kubeletVersion: fake
104 |         machineID: ""
105 |         operatingSystem: linux
106 |         osImage: ""
107 |         systemUUID: ""
108 |       phase: Running
109 | ```
110 |
111 | 2. Apply the NodePool CR to your Kubernetes cluster:
112 |
113 | ```shell
114 | kubectl apply -f path/to/your/nodepool.yaml
115 | ```
116 |
117 | 3. Monitor the status of the created virtual nodes using:
118 | ```shell
119 | kubectl get nodes
120 | ```
121 |
122 | ## Configuration
123 |
124 | The Kwok Operator can be configured via the NodePool CR.
125 | ```shell
126 | kubectl edit nodepool nodepool-sample
127 | ```
128 |
129 | ----
130 | To use the Kwok Operator to manage deployments and run pods on top of the nodes you provisioned above, follow these steps:
131 | 1. Ensure the namespace exists.
132 | 2. Define a DeploymentPool custom resource (CR) with your desired configuration.
Example:
133 | ```yaml
134 | apiVersion: kwok.sigs.run-ai.com/v1beta1
135 | kind: DeploymentPool
136 | metadata:
137 |   labels:
138 |     app.kubernetes.io/name: deploymentpool
139 |     app.kubernetes.io/instance: deploymentpool-sample
140 |     app.kubernetes.io/part-of: kwok-operator
141 |     app.kubernetes.io/managed-by: kustomize
142 |     app.kubernetes.io/created-by: kwok-operator
143 |   name: deploymentpool-sample
144 |   namespace: default
145 | spec:
146 |   deploymentCount: 5
147 |   deploymentTemplate:
148 |     apiVersion: apps/v1
149 |     metadata:
150 |       name: kwok-operator
151 |       labels:
152 |         app.kubernetes.io/name: deployment
153 |         app.kubernetes.io/instance: deployment-sample
154 |         app.kubernetes.io/part-of: kwok-operator
155 |         app.kubernetes.io/managed-by: kustomize
156 |         app.kubernetes.io/created-by: kwok-operator
157 |     spec:
158 |       replicas: 3
159 |       selector:
160 |         matchLabels:
161 |           app.kubernetes.io/name: deployment
162 |           app.kubernetes.io/instance: deployment-sample
163 |           app.kubernetes.io/part-of: kwok-operator
164 |           app.kubernetes.io/managed-by: kustomize
165 |           app.kubernetes.io/created-by: kwok-operator
166 |       template:
167 |         metadata:
168 |           labels:
169 |             app.kubernetes.io/name: deployment
170 |             app.kubernetes.io/instance: deployment-sample
171 |             app.kubernetes.io/part-of: kwok-operator
172 |             app.kubernetes.io/managed-by: kustomize
173 |             app.kubernetes.io/created-by: kwok-operator
174 |         spec:
175 |           containers:
176 |             - image: nginx
177 |               name: nginx
178 |           restartPolicy: Always
179 | ```
180 | ---
181 | To use the Kwok Operator to manage pods on top of the nodes you provisioned above, follow these steps:
182 | 1. Ensure the namespace exists.
183 | 2. Define a PodPool custom resource (CR) with your desired configuration. Example:
184 | ```yaml
185 | apiVersion: kwok.sigs.run-ai.com/v1beta1
186 | kind: PodPool
187 | metadata:
188 |   labels:
189 |     app.kubernetes.io/name: podpool
190 |     app.kubernetes.io/instance: podpool-sample
191 |     app.kubernetes.io/part-of: kwok-operator
192 |     app.kubernetes.io/created-by: kwok-operator
193 |   name: podpool-sample
194 |   namespace: default
195 | spec:
196 |   podCount: 5
197 |   podTemplate:
198 |     metadata:
199 |       name: kwok-operator
200 |       labels:
201 |         app.kubernetes.io/name: pod
202 |         app.kubernetes.io/instance: pod-sample
203 |         app.kubernetes.io/part-of: kwok-operator
204 |         app.kubernetes.io/managed-by: kustomize
205 |         app.kubernetes.io/created-by: kwok-operator
206 |     spec:
207 |       containers:
208 |         - image: nginx
209 |           name: nginx
210 |       restartPolicy: Always
211 | ```
212 | Added in version 0.0.5:
213 | To use the Kwok Operator to manage jobs on top of the nodes you provisioned above, follow these steps:
214 | 1. Ensure the namespace exists.
215 | 2. Define a JobPool custom resource (CR) with your desired configuration.
Example:
216 | ```yaml
217 | apiVersion: kwok.sigs.run-ai.com/v1beta1
218 | kind: JobPool
219 | metadata:
220 |   labels:
221 |     app.kubernetes.io/name: jobpool
222 |     app.kubernetes.io/instance: jobpool-sample
223 |     app.kubernetes.io/part-of: kwok-operator
224 |     app.kubernetes.io/managed-by: kustomize
225 |     app.kubernetes.io/created-by: kwok-operator
226 |   name: jobpool-sample
227 | spec:
228 |   jobCount: 5
229 |   jobTemplate:
230 |     metadata:
231 |       name: kwok-operator
232 |       labels:
233 |         app.kubernetes.io/name: job
234 |         app.kubernetes.io/instance: job-sample
235 |         app.kubernetes.io/part-of: kwok-operator
236 |         app.kubernetes.io/managed-by: kustomize
237 |         app.kubernetes.io/created-by: kwok-operator
238 |     spec:
239 |       template:
240 |         metadata:
241 |           labels:
242 |             app.kubernetes.io/name: job
243 |             app.kubernetes.io/instance: job-sample
244 |             app.kubernetes.io/part-of: kwok-operator
245 |             app.kubernetes.io/managed-by: kustomize
246 |             app.kubernetes.io/created-by: kwok-operator
247 |         spec:
248 |           containers:
249 |             - name: job
250 |               image: busybox
251 |               command: ["sh", "-c", "echo Hello, Kubernetes! && sleep 3600"]
252 |           restartPolicy: Never
253 | ```
254 | Added in version 0.0.7:
255 | To use the Kwok Operator to manage DaemonSets on top of the nodes you provisioned above, follow these steps:
256 | 1. Ensure the namespace exists.
257 | 2. Define a DaemonsetPool custom resource (CR) with your desired configuration. Example:
258 | ```yaml
259 | apiVersion: kwok.sigs.run-ai.com/v1beta1
260 | kind: DaemonsetPool
261 | metadata:
262 |   labels:
263 |     app.kubernetes.io/name: daemonsetpool
264 |     app.kubernetes.io/instance: daemonsetpool-sample
265 |     app.kubernetes.io/part-of: kwok-operator
266 |     app.kubernetes.io/managed-by: kustomize
267 |     app.kubernetes.io/created-by: kwok-operator
268 |   name: daemonsetpool-sample
269 |   namespace: default
270 | spec:
271 |   daemonsetCount: 10
272 |   daemonsetTemplate:
273 |     metadata:
274 |       name: kwok-operator
275 |       labels:
276 |         app.kubernetes.io/name: daemonset
277 |         app.kubernetes.io/instance: daemonset-sample
278 |         app.kubernetes.io/part-of: kwok-operator
279 |         app.kubernetes.io/managed-by: kustomize
280 |         app.kubernetes.io/created-by: kwok-operator
281 |     spec:
282 |       selector:
283 |         matchLabels:
284 |           app.kubernetes.io/name: daemonset
285 |           app.kubernetes.io/instance: daemonset-sample
286 |           app.kubernetes.io/part-of: kwok-operator
287 |           app.kubernetes.io/managed-by: kustomize
288 |           app.kubernetes.io/created-by: kwok-operator
289 |       template:
290 |         metadata:
291 |           labels:
292 |             app.kubernetes.io/name: daemonset
293 |             app.kubernetes.io/instance: daemonset-sample
294 |             app.kubernetes.io/part-of: kwok-operator
295 |             app.kubernetes.io/managed-by: kustomize
296 |             app.kubernetes.io/created-by: kwok-operator
297 |         spec:
298 |           containers:
299 |             - image: nginx
300 |               name: nginx
301 |           restartPolicy: Always
302 | ```
303 | ## Troubleshooting
304 |
305 | If you encounter any issues with the Kwok Operator, please check the following:
306 |
307 | - Ensure that the Kwok infrastructure is properly configured and accessible from the Kubernetes cluster.
308 | https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/
309 | - Check the logs of the Kwok Operator pod for any error messages under the kwok-operator namespace.
310 |
311 | From version 1.0.0 the Kwok Operator is able to manage StatefulSets. To include PVCs on top of the nodes you have provisioned above, follow these steps:
312 | 1. Ensure the namespace exists.
313 | 2.
Ensure that a storage class is installed and working as expected in the cluster.
314 | 3. Define a StatefulsetPool custom resource (CR) with your desired configuration. Example:
315 | ```yaml
316 | apiVersion: kwok.sigs.run-ai.com/v1beta1
317 | kind: StatefulsetPool
318 | metadata:
319 |   labels:
320 |     app.kubernetes.io/name: statefulsetpool
321 |     app.kubernetes.io/instance: statefulsetpool-sample
322 |     app.kubernetes.io/part-of: kwok-operator
323 |     app.kubernetes.io/managed-by: kustomize
324 |     app.kubernetes.io/created-by: kwok-operator
325 |   name: statefulsetpool-sample
326 | spec:
327 |   createPV: true # optional, default is false
328 |   statefulsetCount: 2
329 |   StatefulsetTemplate:
330 |     metadata:
331 |       labels:
332 |         app.kubernetes.io/name: statefulsetpool
333 |         app.kubernetes.io/instance: statefulsetpool-sample
334 |         app.kubernetes.io/part-of: kwok-operator
335 |         app.kubernetes.io/managed-by: kustomize
336 |         app.kubernetes.io/created-by: kwok-operator
337 |     spec:
338 |       serviceName: "nginx"
339 |       replicas: 45
340 |       selector:
341 |         matchLabels:
342 |           app: nginx
343 |       template:
344 |         metadata:
345 |           labels:
346 |             app: nginx
347 |         spec:
348 |           containers:
349 |             - name: nginx
350 |               image: registry.k8s.io/nginx-slim:0.21
351 |               ports:
352 |                 - containerPort: 80
353 |                   name: web
354 |               volumeMounts:
355 |                 - name: www
356 |                   mountPath: /usr/share/nginx/html
357 |       volumeClaimTemplates:
358 |         - metadata:
359 |             name: www
360 |           spec:
361 |             accessModes: [ "ReadWriteOnce" ]
362 |             resources:
363 |               requests:
364 |                 storage: 1Gi
365 | ```
366 | ## Contributing
367 |
368 | Contributions to the Kwok Operator are welcome! To contribute, please follow the guidelines outlined in [CONTRIBUTING.md](./CONTRIBUTING.md).
369 |
370 | ---
371 |
372 | Feel free to customize and expand upon this template to suit your specific needs and preferences!
--------------------------------------------------------------------------------
/api/v1beta1/daemonsetpool_types.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2024.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | appsv1 "k8s.io/api/apps/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | // DaemonsetPoolSpec defines the desired state of DaemonsetPool 25 | 26 | type DaemonsetPoolSpec struct { 27 | DaemonsetCount int32 `json:"daemonsetCount"` 28 | DaemonsetTemplate appsv1.DaemonSet `json:"daemonsetTemplate"` 29 | } 30 | 31 | // DaemonsetPoolStatus defines the observed state of DaemonsetPool 32 | type DaemonsetPoolStatus struct { 33 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 34 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 35 | } 36 | 37 | //+kubebuilder:object:root=true 38 | //+kubebuilder:subresource:status 39 | 40 | // DaemonsetPool is the Schema for the daemonsetpools API 41 | type DaemonsetPool struct { 42 | metav1.TypeMeta `json:",inline"` 43 | metav1.ObjectMeta `json:"metadata,omitempty"` 44 | 45 | Spec DaemonsetPoolSpec `json:"spec,omitempty"` 46 | Status DaemonsetPoolStatus `json:"status,omitempty"` 47 | } 48 | 49 | //+kubebuilder:object:root=true 50 | 51 | // DaemonsetPoolList contains a list of DaemonsetPool 52 | type DaemonsetPoolList struct { 53 | metav1.TypeMeta `json:",inline"` 54 | metav1.ListMeta `json:"metadata,omitempty"` 55 | Items []DaemonsetPool `json:"items"` 56 | } 57 | 58 | func init() { 59 | SchemeBuilder.Register(&DaemonsetPool{}, &DaemonsetPoolList{}) 60 | } 61 | -------------------------------------------------------------------------------- /api/v1beta1/deploymentpool_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | appsv1 "k8s.io/api/apps/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | // DeploymentPoolSpec defines the desired state of DeploymentPool 25 | type DeploymentPoolSpec struct { 26 | DeploymentCount int32 `json:"deploymentCount"` 27 | DeploymentTemplate appsv1.Deployment `json:"deploymentTemplate"` 28 | } 29 | 30 | // DeploymentPoolStatus defines the observed state of DeploymentPool 31 | type DeploymentPoolStatus struct { 32 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 33 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 34 | } 35 | 36 | //+kubebuilder:object:root=true 37 | //+kubebuilder:subresource:status 38 | 39 | // DeploymentPool is the Schema for the deploymentpools API 40 | type DeploymentPool struct { 41 | metav1.TypeMeta `json:",inline"` 42 | metav1.ObjectMeta `json:"metadata,omitempty"` 43 | 44 | Spec DeploymentPoolSpec `json:"spec,omitempty"` 45 | Status DeploymentPoolStatus `json:"status,omitempty"` 46 | } 47 | 48 | //+kubebuilder:object:root=true 49 | 50 | // DeploymentPoolList contains a list of DeploymentPool 51 | type DeploymentPoolList struct { 52 | metav1.TypeMeta `json:",inline"` 53 | metav1.ListMeta `json:"metadata,omitempty"` 54 | Items []DeploymentPool `json:"items"` 55 | } 56 | 57 | func init() { 58 | SchemeBuilder.Register(&DeploymentPool{}, &DeploymentPoolList{}) 59 | } 60 | -------------------------------------------------------------------------------- /api/v1beta1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1beta1 contains API Schema definitions for the kwok.sigs v1beta1 API group 18 | // +kubebuilder:object:generate=true 19 | // +groupName=kwok.sigs.run-ai.com 20 | package v1beta1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects 29 | GroupVersion = schema.GroupVersion{Group: "kwok.sigs.run-ai.com", Version: "v1beta1"} 30 | 31 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 32 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 33 | 34 | // AddToScheme adds the types in this group-version to the given scheme. 35 | AddToScheme = SchemeBuilder.AddToScheme 36 | ) 37 | -------------------------------------------------------------------------------- /api/v1beta1/jobpool_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | batchv1 "k8s.io/api/batch/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | type JobPoolSpec struct { 25 | JobCount int32 `json:"jobCount"` 26 | JobTemplate batchv1.Job `json:"jobTemplate"` 27 | } 28 | 29 | // JobPoolStatus defines the observed state of JobPool 30 | type JobPoolStatus struct { 31 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 32 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 33 | } 34 | 35 | //+kubebuilder:object:root=true 36 | //+kubebuilder:subresource:status 37 | 38 | // JobPool is the Schema for the jobpools API 39 | type JobPool struct { 40 | metav1.TypeMeta `json:",inline"` 41 | metav1.ObjectMeta `json:"metadata,omitempty"` 42 | 43 | Spec JobPoolSpec `json:"spec,omitempty"` 44 | Status JobPoolStatus `json:"status,omitempty"` 45 | } 46 | 47 | //+kubebuilder:object:root=true 48 | 49 | // JobPoolList contains a list of JobPool 50 | type JobPoolList struct { 51 | metav1.TypeMeta `json:",inline"` 52 | metav1.ListMeta `json:"metadata,omitempty"` 53 | Items []JobPool `json:"items"` 54 | } 55 | 56 | func init() { 57 | SchemeBuilder.Register(&JobPool{}, &JobPoolList{}) 58 | } 59 | -------------------------------------------------------------------------------- /api/v1beta1/nodepool_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | // NodePoolSpec defines the desired state of NodePool 25 | type NodePoolSpec struct { 26 | NodeCount int32 `json:"nodeCount"` 27 | NodeTemplate corev1.Node `json:"nodeTemplate"` 28 | } 29 | 30 | // NodePoolStatus defines the observed state of NodePool 31 | type NodePoolStatus struct { 32 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 33 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 34 | } 35 | 36 | //+kubebuilder:object:root=true 37 | //+kubebuilder:subresource:status 38 | //+kubebuilder:resource:scope=Cluster 39 | 40 | // NodePool is the Schema for the nodepools API 41 | type NodePool struct { 42 | metav1.TypeMeta `json:",inline"` 43 | metav1.ObjectMeta `json:"metadata,omitempty"` 44 | 45 | Spec NodePoolSpec `json:"spec,omitempty"` 46 | Status NodePoolStatus `json:"status,omitempty"` 47 | } 48 | 49 | //+kubebuilder:object:root=true 50 | 51 | // NodePoolList contains a list of NodePool 52 | type NodePoolList struct { 53 | metav1.TypeMeta `json:",inline"` 54 | metav1.ListMeta `json:"metadata,omitempty"` 55 | Items []NodePool `json:"items"` 56 | } 57 | 58 | func init() { 59 | SchemeBuilder.Register(&NodePool{}, &NodePoolList{}) 60 | } 61 | -------------------------------------------------------------------------------- /api/v1beta1/podpool_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | // PodPoolSpec defines the desired state of PodPool 25 | type PodPoolSpec struct { 26 | PodCount int32 `json:"podCount"` 27 | PodTemplate corev1.Pod `json:"podTemplate"` 28 | } 29 | 30 | // PodPoolStatus defines the observed state of PodPool 31 | type PodPoolStatus struct { 32 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 33 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 34 | } 35 | 36 | //+kubebuilder:object:root=true 37 | //+kubebuilder:subresource:status 38 | 39 | // PodPool is the Schema for the podpools API 40 | type PodPool struct { 41 | metav1.TypeMeta `json:",inline"` 42 | metav1.ObjectMeta `json:"metadata,omitempty"` 43 | 44 | Spec PodPoolSpec `json:"spec,omitempty"` 45 | Status PodPoolStatus `json:"status,omitempty"` 46 | } 47 | 48 | //+kubebuilder:object:root=true 49 | 50 | // PodPoolList contains a list of PodPool 51 | type PodPoolList struct { 52 | metav1.TypeMeta `json:",inline"` 53 | metav1.ListMeta `json:"metadata,omitempty"` 54 | Items []PodPool `json:"items"` 55 | } 56 | 57 | func init() { 58 | SchemeBuilder.Register(&PodPool{}, &PodPoolList{}) 59 | } 60 | -------------------------------------------------------------------------------- /api/v1beta1/statefulsetpool_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package v1beta1 18 | 19 | import ( 20 | appsv1 "k8s.io/api/apps/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | ) 23 | 24 | // StatefulsetPoolSpec defines the desired state of StatefulsetPool 25 | type StatefulsetPoolSpec struct { 26 | CreatePV bool `json:"createPV,omitempty"` 27 | StatefulsetCount int32 `json:"statefulsetCount"` 28 | StatefulsetTemplate appsv1.StatefulSet `json:"StatefulsetTemplate"` 29 | } 30 | 31 | // StatefulsetPoolStatus defines the observed state of StatefulsetPool 32 | type StatefulsetPoolStatus struct { 33 | Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` 34 | ObservedGeneration int64 `json:"observedGeneration,omitempty" protobuf:"varint,2,opt,name=observedGeneration"` 35 | } 36 | 37 | //+kubebuilder:object:root=true 38 | //+kubebuilder:subresource:status 39 | 40 | // StatefulsetPool is the Schema for the statefulsetpools API 41 | type StatefulsetPool struct { 42 | metav1.TypeMeta `json:",inline"` 43 | metav1.ObjectMeta `json:"metadata,omitempty"` 44 | 45 | Spec StatefulsetPoolSpec `json:"spec,omitempty"` 46 | Status StatefulsetPoolStatus `json:"status,omitempty"` 47 | } 48 | 49 | //+kubebuilder:object:root=true 50 | 51 | // StatefulsetPoolList contains a list of StatefulsetPool 52 | type StatefulsetPoolList struct { 53 | metav1.TypeMeta `json:",inline"` 54 | metav1.ListMeta `json:"metadata,omitempty"` 55 | Items []StatefulsetPool `json:"items"` 56 | } 57 | 58 | func init() { 59 | SchemeBuilder.Register(&StatefulsetPool{}, &StatefulsetPoolList{}) 60 | } 61 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "crypto/tls" 21 | "flag" 22 | "os" 23 | 24 | // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 25 | // to ensure that exec-entrypoint and run can make use of them. 
26 | _ "k8s.io/client-go/plugin/pkg/client/auth" 27 | 28 | "k8s.io/apimachinery/pkg/runtime" 29 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 30 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 31 | ctrl "sigs.k8s.io/controller-runtime" 32 | "sigs.k8s.io/controller-runtime/pkg/healthz" 33 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 34 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 35 | "sigs.k8s.io/controller-runtime/pkg/webhook" 36 | 37 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 38 | "github.com/run-ai/kwok-operator/internal/controller" 39 | //+kubebuilder:scaffold:imports 40 | ) 41 | 42 | var ( 43 | scheme = runtime.NewScheme() 44 | setupLog = ctrl.Log.WithName("setup") 45 | ) 46 | 47 | func init() { 48 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 49 | 50 | utilruntime.Must(kwoksigsv1beta1.AddToScheme(scheme)) 51 | //+kubebuilder:scaffold:scheme 52 | } 53 | 54 | func main() { 55 | var metricsAddr string 56 | var enableLeaderElection bool 57 | var probeAddr string 58 | var secureMetrics bool 59 | var enableHTTP2 bool 60 | flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") 61 | flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") 62 | flag.BoolVar(&enableLeaderElection, "leader-elect", false, 63 | "Enable leader election for controller manager. "+ 64 | "Enabling this will ensure there is only one active controller manager.") 65 | flag.BoolVar(&secureMetrics, "metrics-secure", false, 66 | "If set the metrics endpoint is served securely") 67 | flag.BoolVar(&enableHTTP2, "enable-http2", false, 68 | "If set, HTTP/2 will be enabled for the metrics and webhook servers") 69 | opts := zap.Options{ 70 | Development: true, 71 | } 72 | opts.BindFlags(flag.CommandLine) 73 | flag.Parse() 74 | 75 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 76 | 77 | // if the enable-http2 flag is false (the default), http/2 should be disabled 78 | // due to its vulnerabilities. More specifically, disabling http/2 will 79 | // prevent from being vulnerable to the HTTP/2 Stream Cancelation and 80 | // Rapid Reset CVEs. For more information see: 81 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 82 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 83 | disableHTTP2 := func(c *tls.Config) { 84 | setupLog.Info("disabling http/2") 85 | c.NextProtos = []string{"http/1.1"} 86 | } 87 | 88 | tlsOpts := []func(*tls.Config){} 89 | if !enableHTTP2 { 90 | tlsOpts = append(tlsOpts, disableHTTP2) 91 | } 92 | 93 | webhookServer := webhook.NewServer(webhook.Options{ 94 | TLSOpts: tlsOpts, 95 | }) 96 | 97 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ 98 | Scheme: scheme, 99 | Metrics: metricsserver.Options{ 100 | BindAddress: metricsAddr, 101 | SecureServing: secureMetrics, 102 | TLSOpts: tlsOpts, 103 | }, 104 | WebhookServer: webhookServer, 105 | HealthProbeBindAddress: probeAddr, 106 | LeaderElection: enableLeaderElection, 107 | LeaderElectionID: "2f4ccc1c.run-ai.com", 108 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily 109 | // when the Manager ends. This requires the binary to immediately end when the 110 | // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly 111 | // speeds up voluntary leader transitions as the new leader don't have to wait 112 | // LeaseDuration time first. 
113 | // 114 | // In the default scaffold provided, the program ends immediately after 115 | // the manager stops, so would be fine to enable this option. However, 116 | // if you are doing or is intended to do any operation such as perform cleanups 117 | // after the manager stops then its usage might be unsafe. 118 | // LeaderElectionReleaseOnCancel: true, 119 | }) 120 | if err != nil { 121 | setupLog.Error(err, "unable to start manager") 122 | os.Exit(1) 123 | } 124 | 125 | if err = (&controller.NodePoolReconciler{ 126 | Client: mgr.GetClient(), 127 | Scheme: mgr.GetScheme(), 128 | }).SetupWithManager(mgr); err != nil { 129 | setupLog.Error(err, "unable to create controller", "controller", "NodePool") 130 | os.Exit(1) 131 | } 132 | if err = (&controller.DeploymentPoolReconciler{ 133 | Client: mgr.GetClient(), 134 | Scheme: mgr.GetScheme(), 135 | }).SetupWithManager(mgr); err != nil { 136 | setupLog.Error(err, "unable to create controller", "controller", "DeploymentPool") 137 | os.Exit(1) 138 | } 139 | if err = (&controller.PodPoolReconciler{ 140 | Client: mgr.GetClient(), 141 | Scheme: mgr.GetScheme(), 142 | }).SetupWithManager(mgr); err != nil { 143 | setupLog.Error(err, "unable to create controller", "controller", "PodPool") 144 | os.Exit(1) 145 | } 146 | if err = (&controller.JobPoolReconciler{ 147 | Client: mgr.GetClient(), 148 | Scheme: mgr.GetScheme(), 149 | }).SetupWithManager(mgr); err != nil { 150 | setupLog.Error(err, "unable to create controller", "controller", "JobPool") 151 | os.Exit(1) 152 | } 153 | if err = (&controller.DaemonsetPoolReconciler{ 154 | Client: mgr.GetClient(), 155 | Scheme: mgr.GetScheme(), 156 | }).SetupWithManager(mgr); err != nil { 157 | setupLog.Error(err, "unable to create controller", "controller", "DaemonsetPool") 158 | os.Exit(1) 159 | } 160 | if err = (&controller.StatefulsetPoolReconciler{ 161 | Client: mgr.GetClient(), 162 | Scheme: mgr.GetScheme(), 163 | }).SetupWithManager(mgr); err != nil { 164 | setupLog.Error(err, "unable to create controller", "controller", "StatefulsetPool") 165 | os.Exit(1) 166 | } 167 | //+kubebuilder:scaffold:builder 168 | 169 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 170 | setupLog.Error(err, "unable to set up health check") 171 | os.Exit(1) 172 | } 173 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 174 | setupLog.Error(err, "unable to set up ready check") 175 | os.Exit(1) 176 | } 177 | 178 | setupLog.Info("starting manager") 179 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 180 | setupLog.Error(err, "problem running manager") 181 | os.Exit(1) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/kwok.sigs.run-ai.com_nodepools.yaml 6 | - bases/kwok.sigs.run-ai.com_deploymentpools.yaml 7 | - bases/kwok.sigs.run-ai.com_podpools.yaml 8 | - bases/kwok.sigs.run-ai.com_jobpools.yaml 9 | - bases/kwok.sigs.run-ai.com_daemonsetpools.yaml 10 | - bases/kwok.sigs.run-ai.com_statefulsetpools.yaml 11 | #+kubebuilder:scaffold:crdkustomizeresource 12 | 13 | patches: 14 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 
15 | # patches here are for enabling the conversion webhook for each CRD 16 | #- path: patches/webhook_in_nodepools.yaml 17 | #- path: patches/webhook_in_deploymentpools.yaml 18 | #- path: patches/webhook_in_podpools.yaml 19 | #- path: patches/webhook_in_jobpools.yaml 20 | #- path: patches/webhook_in_daemonsetpools.yaml 21 | #- path: patches/webhook_in_statefulsetpools.yaml 22 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 23 | 24 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. 25 | # patches here are for enabling the CA injection for each CRD 26 | #- path: patches/cainjection_in_nodepools.yaml 27 | #- path: patches/cainjection_in_deploymentpools.yaml 28 | #- path: patches/cainjection_in_podpools.yaml 29 | #- path: patches/cainjection_in_jobpools.yaml 30 | #- path: patches/cainjection_in_daemonsetpools.yaml 31 | #- path: patches/cainjection_in_statefulsetpools.yaml 32 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 33 | 34 | # [WEBHOOK] To enable webhook, uncomment the following section 35 | # the following config is for teaching kustomize how to do kustomization for CRDs. 36 | 37 | #configurations: 38 | #- kustomizeconfig.yaml 39 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: kwok-operator 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: kwok-operator- 10 | 11 | # Labels to add to all resources and selectors. 12 | #labels: 13 | #- includeSelectors: true 14 | # pairs: 15 | # someName: someValue 16 | 17 | resources: 18 | - ../crd 19 | - ../rbac 20 | - ../manager 21 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 22 | # crd/kustomization.yaml 23 | #- ../webhook 24 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 25 | #- ../certmanager 26 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 27 | #- ../prometheus 28 | 29 | patches: 30 | # Protect the /metrics endpoint by putting it behind auth. 31 | # If you want your kwok-operator to expose the /metrics 32 | # endpoint w/o any authn/z, please comment the following line. 
33 | - path: manager_auth_proxy_patch.yaml 34 | 35 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 36 | # crd/kustomization.yaml 37 | #- path: manager_webhook_patch.yaml 38 | 39 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 40 | # Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. 41 | # 'CERTMANAGER' needs to be enabled to use ca injection 42 | #- path: webhookcainjection_patch.yaml 43 | 44 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 45 | # Uncomment the following replacements to add the cert-manager CA injection annotations 46 | #replacements: 47 | # - source: # Add cert-manager annotation to ValidatingWebhookConfiguration, MutatingWebhookConfiguration and CRDs 48 | # kind: Certificate 49 | # group: cert-manager.io 50 | # version: v1 51 | # name: serving-cert # this name should match the one in certificate.yaml 52 | # fieldPath: .metadata.namespace # namespace of the certificate CR 53 | # targets: 54 | # - select: 55 | # kind: ValidatingWebhookConfiguration 56 | # fieldPaths: 57 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 58 | # options: 59 | # delimiter: '/' 60 | # index: 0 61 | # create: true 62 | # - select: 63 | # kind: MutatingWebhookConfiguration 64 | # fieldPaths: 65 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 66 | # options: 67 | # delimiter: '/' 68 | # index: 0 69 | # create: true 70 | # - select: 71 | # kind: CustomResourceDefinition 72 | # fieldPaths: 73 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 74 | # options: 75 | # delimiter: '/' 76 | # index: 0 77 | # create: true 78 | # - source: 79 | # kind: Certificate 80 | # group: cert-manager.io 81 | # version: v1 82 | # name: serving-cert # this name should match the one in certificate.yaml 83 | # fieldPath: .metadata.name 84 | # targets: 85 | # - select: 86 | # kind: ValidatingWebhookConfiguration 87 | # fieldPaths: 88 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 89 | # options: 90 | # delimiter: '/' 91 | # index: 1 92 | # create: true 93 | # - select: 94 | # kind: MutatingWebhookConfiguration 95 | # fieldPaths: 96 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 97 | # options: 98 | # delimiter: '/' 99 | # index: 1 100 | # create: true 101 | # - select: 102 | # kind: CustomResourceDefinition 103 | # fieldPaths: 104 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 105 | # options: 106 | # delimiter: '/' 107 | # index: 1 108 | # create: true 109 | # - source: # Add cert-manager annotation to the webhook Service 110 | # kind: Service 111 | # version: v1 112 | # name: webhook-service 113 | # fieldPath: .metadata.name # namespace of the service 114 | # targets: 115 | # - select: 116 | # kind: Certificate 117 | # group: cert-manager.io 118 | # version: v1 119 | # fieldPaths: 120 | # - .spec.dnsNames.0 121 | # - .spec.dnsNames.1 122 | # options: 123 | # delimiter: '.' 124 | # index: 0 125 | # create: true 126 | # - source: 127 | # kind: Service 128 | # version: v1 129 | # name: webhook-service 130 | # fieldPath: .metadata.namespace # namespace of the service 131 | # targets: 132 | # - select: 133 | # kind: Certificate 134 | # group: cert-manager.io 135 | # version: v1 136 | # fieldPaths: 137 | # - .spec.dnsNames.0 138 | # - .spec.dnsNames.1 139 | # options: 140 | # delimiter: '.' 
141 | # index: 1 142 | # create: true 143 | -------------------------------------------------------------------------------- /config/default/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch inject a sidecar container which is a HTTP proxy for the 2 | # controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: kwok-operator 7 | namespace: kwok-operator 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: kube-rbac-proxy 13 | securityContext: 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | drop: 17 | - "ALL" 18 | image: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0 19 | args: 20 | - "--secure-listen-address=0.0.0.0:8443" 21 | - "--upstream=http://127.0.0.1:8080/" 22 | - "--logtostderr=true" 23 | - "--v=0" 24 | ports: 25 | - containerPort: 8443 26 | protocol: TCP 27 | name: https 28 | resources: 29 | limits: 30 | cpu: 500m 31 | memory: 128Mi 32 | requests: 33 | cpu: 5m 34 | memory: 64Mi 35 | - name: manager 36 | args: 37 | - "--health-probe-bind-address=:8081" 38 | - "--metrics-bind-address=127.0.0.1:8080" 39 | - "--leader-elect" 40 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: controller 7 | newName: docker.io/runaidevops/kwok-operator 8 | newTag: 1.0.1 9 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | control-plane: kwok-operator 6 | app.kubernetes.io/name: namespace 7 | app.kubernetes.io/instance: system 8 | app.kubernetes.io/component: manager 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: kwok-operator 13 | --- 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: kwok-operator 18 | namespace: kwok-operator 19 | labels: 20 | control-plane: kwok-operator 21 | app.kubernetes.io/name: deployment 22 | app.kubernetes.io/instance: kwok-operator 23 | app.kubernetes.io/component: manager 24 | app.kubernetes.io/created-by: kwok-operator 25 | app.kubernetes.io/part-of: kwok-operator 26 | app.kubernetes.io/managed-by: kustomize 27 | spec: 28 | selector: 29 | matchLabels: 30 | control-plane: kwok-operator 31 | replicas: 1 32 | template: 33 | metadata: 34 | annotations: 35 | kubectl.kubernetes.io/default-container: manager 36 | labels: 37 | control-plane: kwok-operator 38 | spec: 39 | securityContext: 40 | runAsNonRoot: true 41 | containers: 42 | - command: 43 | - /manager 44 | args: 45 | - --leader-elect 46 | image: controller:latest 47 | name: manager 48 | securityContext: 49 | allowPrivilegeEscalation: false 50 | capabilities: 51 | drop: 52 | - "ALL" 53 | livenessProbe: 54 | httpGet: 55 | path: /healthz 56 | port: 8081 57 | initialDelaySeconds: 15 58 | periodSeconds: 20 59 | readinessProbe: 60 | httpGet: 61 | path: /readyz 62 | port: 8081 63 | initialDelaySeconds: 5 64 | periodSeconds: 10 65 | resources: 66 | limits: 67 | cpu: 500m 68 | memory: 128Mi 69 | requests: 70 | 
cpu: 10m 71 | memory: 64Mi 72 | serviceAccountName: kwok-operator-sa 73 | terminationGracePeriodSeconds: 10 74 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | labels: 6 | control-plane: kwok-operator 7 | app.kubernetes.io/name: servicemonitor 8 | app.kubernetes.io/instance: kwok-operator-metrics-monitor 9 | app.kubernetes.io/component: metrics 10 | app.kubernetes.io/created-by: kwok-operator 11 | app.kubernetes.io/part-of: kwok-operator 12 | app.kubernetes.io/managed-by: kustomize 13 | name: kwok-operator-metrics-monitor 14 | namespace: kwok-operator 15 | spec: 16 | endpoints: 17 | - path: /metrics 18 | port: https 19 | scheme: https 20 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 21 | tlsConfig: 22 | insecureSkipVerify: true 23 | selector: 24 | matchLabels: 25 | control-plane: kwok-operator 26 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: metrics-reader 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: metrics-reader 12 | rules: 13 | - nonResourceURLs: 14 | - "/metrics" 15 | verbs: 16 | - get 17 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: proxy-role 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-role 12 | rules: 13 | - apiGroups: 14 | - authentication.k8s.io 15 | resources: 16 | - tokenreviews 17 | verbs: 18 | - create 19 | - apiGroups: 20 | - authorization.k8s.io 21 | resources: 22 | - subjectaccessreviews 23 | verbs: 24 | - create 25 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: proxy-rolebinding 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 
15 | name: proxy-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: kwok-operator-sa 19 | namespace: kwok-operator 20 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: kwok-operator 6 | app.kubernetes.io/name: service 7 | app.kubernetes.io/instance: kwok-operator-metrics-service 8 | app.kubernetes.io/component: kube-rbac-proxy 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: kwok-operator-metrics-service 13 | namespace: kwok-operator 14 | spec: 15 | ports: 16 | - name: https 17 | port: 8443 18 | protocol: TCP 19 | targetPort: https 20 | selector: 21 | control-plane: kwok-operator 22 | -------------------------------------------------------------------------------- /config/rbac/daemonsetpool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit daemonsetpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: daemonsetpool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: daemonsetpool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - daemonsetpools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - daemonsetpools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/daemonsetpool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view daemonsetpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: daemonsetpool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: daemonsetpool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - daemonsetpools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - daemonsetpools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/deploymentpool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit deploymentpools. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: deploymentpool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: deploymentpool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - deploymentpools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - deploymentpools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/deploymentpool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view deploymentpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: deploymentpool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: deploymentpool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - deploymentpools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - deploymentpools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/jobpool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit jobpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: jobpool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: jobpool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - jobpools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - jobpools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/jobpool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view jobpools. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: jobpool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: jobpool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - jobpools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - jobpools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - leader_election_role.yaml 11 | - leader_election_role_binding.yaml 12 | # Comment the following 4 lines if you want to disable 13 | # the auth proxy (https://github.com/brancz/kube-rbac-proxy) 14 | # which protects your /metrics endpoint. 15 | - auth_proxy_service.yaml 16 | - auth_proxy_role.yaml 17 | - auth_proxy_role_binding.yaml 18 | - auth_proxy_client_clusterrole.yaml 19 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: role 7 | app.kubernetes.io/instance: leader-election-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: leader-election-role 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - configmaps 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - create 23 | - update 24 | - patch 25 | - delete 26 | - apiGroups: 27 | - coordination.k8s.io 28 | resources: 29 | - leases 30 | verbs: 31 | - get 32 | - list 33 | - watch 34 | - create 35 | - update 36 | - patch 37 | - delete 38 | - apiGroups: 39 | - "" 40 | resources: 41 | - events 42 | verbs: 43 | - create 44 | - patch 45 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: rolebinding 6 | app.kubernetes.io/instance: leader-election-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: leader-election-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: Role 15 | name: leader-election-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: kwok-operator-sa 19 | namespace: kwok-operator 20 | -------------------------------------------------------------------------------- /config/rbac/nodepool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit nodepools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: nodepool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: nodepool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - nodepools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - nodepools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/nodepool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view nodepools.
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: nodepool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: nodepool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - nodepools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - nodepools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/podpool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit podpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: podpool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: podpool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - podpools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - podpools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/podpool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view podpools. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: podpool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: podpool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - podpools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - podpools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: kwok-operator-cr 6 | rules: 7 | - apiGroups: 8 | - batch 9 | resources: 10 | - jobs 11 | verbs: 12 | - get 13 | - list 14 | - patch 15 | - update 16 | - watch 17 | - create 18 | - delete 19 | - apiGroups: 20 | - apps 21 | resources: 22 | - deployments 23 | - statefulsets 24 | - daemonsets 25 | verbs: 26 | - get 27 | - list 28 | - patch 29 | - update 30 | - watch 31 | - create 32 | - delete 33 | - apiGroups: 34 | - "" 35 | resources: 36 | - persistentvolumes 37 | - persistentvolumeclaims 38 | - nodes 39 | - pods 40 | verbs: 41 | - create 42 | - delete 43 | - get 44 | - list 45 | - patch 46 | - update 47 | - watch 48 | - apiGroups: 49 | - kwok.sigs.run-ai.com 50 | - kwok.sigs.run-ai.com/v1beta1 51 | resources: 52 | - nodepools 53 | - deploymentpools 54 | - daemonsetpools 55 | - podpools 56 | - jobpools 57 | - statefulsetpools 58 | verbs: 59 | - create 60 | - delete 61 | - get 62 | - list 63 | - patch 64 | - update 65 | - watch 66 | - apiGroups: 67 | - kwok.sigs.run-ai.com 68 | resources: 69 | - nodepools/finalizers 70 | - deploymentpools/finalizers 71 | - podpools/finalizers 72 | - jobpools/finalizers 73 | - daemonsetpools/finalizers 74 | - statefulsetpools/finalizers 75 | verbs: 76 | - get 77 | - patch 78 | - update 79 | - apiGroups: 80 | - kwok.sigs.run-ai.com 81 | resources: 82 | - nodepools/status 83 | - deploymentpools/status 84 | - podpools/status 85 | - jobpools/status 86 | - daemonsetpools/status 87 | - statefulsetpools/status 88 | verbs: 89 | - get 90 | - patch 91 | - update 92 | - apiGroups: 93 | - "storage.k8s.io" 94 | resources: 95 | - storageclasses 96 | verbs: 97 | - create 98 | - delete 99 | - get 100 | - list 101 | - patch 102 | - update 103 | - watch -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: kwok-operator-crb 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: kwok-operator-crb 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: kwok-operator-cr 16 | subjects: 17 | - kind: ServiceAccount 18 | name: kwok-operator-sa 19 | namespace: kwok-operator 20 | -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: serviceaccount 6 | app.kubernetes.io/instance: kwok-operator-sa 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: kwok-operator 9 | app.kubernetes.io/part-of: kwok-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: kwok-operator-sa 12 | namespace: kwok-operator 13 | -------------------------------------------------------------------------------- /config/rbac/statefulsetpool_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit statefulsetpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: statefulsetpool-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: statefulsetpool-editor-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - statefulsetpools 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - kwok.sigs.run-ai.com 28 | resources: 29 | - statefulsetpools/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/statefulsetpool_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view statefulsetpools. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: statefulsetpool-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: kwok-operator 10 | app.kubernetes.io/part-of: kwok-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: statefulsetpool-viewer-role 13 | rules: 14 | - apiGroups: 15 | - kwok.sigs.run-ai.com 16 | resources: 17 | - statefulsetpools 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - kwok.sigs.run-ai.com 24 | resources: 25 | - statefulsetpools/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/samples/kustomization.yaml: -------------------------------------------------------------------------------- 1 | ## Append samples of your project ## 2 | resources: 3 | - kwok.sigs_v1beta1_nodepool.yaml 4 | - kwok.sigs_v1beta1_deploymentpool.yaml 5 | - kwok.sigs_v1beta1_podpool.yaml 6 | - kwok.sigs_v1beta1_jobpool.yaml 7 | - kwok.sigs_v1beta1_daemonsetpool.yaml 8 | - kwok.sigs_v1beta1_statefulsetpool.yaml 9 | #+kubebuilder:scaffold:manifestskustomizesamples 10 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_daemonsetpool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: DaemonsetPool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: daemonsetpool 6 | app.kubernetes.io/instance: daemonsetpool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | app.kubernetes.io/created-by: kwok-operator 10 | name: daemonsetpool-sample 
11 | namespace: default 12 | spec: 13 | daemonsetCount: 10 14 | daemonsetTemplate: 15 | metadata: 16 | name: kwok-operator 17 | labels: 18 | app.kubernetes.io/name: daemonset 19 | app.kubernetes.io/instance: daemonset-sample 20 | app.kubernetes.io/part-of: kwok-operator 21 | app.kubernetes.io/managed-by: kustomize 22 | app.kubernetes.io/created-by: kwok-operator 23 | spec: 24 | selector: 25 | matchLabels: 26 | app.kubernetes.io/name: deployment 27 | app.kubernetes.io/instance: deployment-sample 28 | app.kubernetes.io/part-of: kwok-operator 29 | app.kubernetes.io/managed-by: kustomize 30 | app.kubernetes.io/created-by: kwok-operator 31 | template: 32 | metadata: 33 | labels: 34 | app.kubernetes.io/name: daemonset 35 | app.kubernetes.io/instance: daemonset-sample 36 | app.kubernetes.io/part-of: kwok-operator 37 | app.kubernetes.io/managed-by: kustomize 38 | app.kubernetes.io/created-by: kwok-operator 39 | spec: 40 | containers: 41 | - image: nginx 42 | name: nginx 43 | restartPolicy: Always 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_deploymentpool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: DeploymentPool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: deploymentpool 6 | app.kubernetes.io/instance: deploymentpool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | app.kubernetes.io/created-by: kwok-operator 10 | name: deploymentpool-sample 11 | namespace: default 12 | spec: 13 | deploymentCount: 5 14 | deploymentTemplate: 15 | metadata: 16 | name: kwok-operator 17 | labels: 18 | app.kubernetes.io/name: deployment 19 | app.kubernetes.io/instance: deployment-sample 20 | app.kubernetes.io/part-of: kwok-operator 21 | app.kubernetes.io/managed-by: kustomize 22 | app.kubernetes.io/created-by: kwok-operator 23 | spec: 24 | replicas: 3 25 | selector: 26 | matchLabels: 27 | app.kubernetes.io/name: deployment 28 | app.kubernetes.io/instance: deployment-sample 29 | app.kubernetes.io/part-of: kwok-operator 30 | app.kubernetes.io/managed-by: kustomize 31 | app.kubernetes.io/created-by: kwok-operator 32 | template: 33 | metadata: 34 | labels: 35 | app.kubernetes.io/name: deployment 36 | app.kubernetes.io/instance: deployment-sample 37 | app.kubernetes.io/part-of: kwok-operator 38 | app.kubernetes.io/managed-by: kustomize 39 | app.kubernetes.io/created-by: kwok-operator 40 | spec: 41 | containers: 42 | - image: nginx 43 | name: nginx 44 | restartPolicy: Always 45 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_jobpool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: JobPool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: jobpool 6 | app.kubernetes.io/instance: jobpool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | app.kubernetes.io/created-by: kwok-operator 10 | name: jobpool-sample 11 | spec: 12 | jobCount: 5 13 | jobTemplate: 14 | metadata: 15 | name: kwok-operator 16 | labels: 17 | app.kubernetes.io/name: job 18 | app.kubernetes.io/instance: job-sample 19 | app.kubernetes.io/part-of: kwok-operator 20 | app.kubernetes.io/managed-by: kustomize 21 | app.kubernetes.io/created-by: kwok-operator 22 | spec: 23 | template: 24 | metadata: 25 | labels: 26 | 
app.kubernetes.io/name: job 27 | app.kubernetes.io/instance: job-sample 28 | app.kubernetes.io/part-of: kwok-operator 29 | app.kubernetes.io/managed-by: kustomize 30 | app.kubernetes.io/created-by: kwok-operator 31 | spec: 32 | containers: 33 | - name: job 34 | image: busybox 35 | command: ["sh", "-c", "echo Hello, Kubernetes! && sleep 3600"] 36 | restartPolicy: Never 37 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_nodepool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: NodePool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: nodepool 6 | app.kubernetes.io/instance: nodepool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | app.kubernetes.io/created-by: kwok-operator 10 | name: nodepool-sample 11 | spec: 12 | nodeCount: 15 13 | nodeTemplate: 14 | apiVersion: v1 15 | metadata: 16 | annotations: 17 | node.alpha.kubernetes.io/ttl: "0" 18 | labels: 19 | kubernetes.io/hostname: kwok-node 20 | kubernetes.io/role: agent 21 | type: kwok 22 | spec: {} 23 | status: 24 | allocatable: 25 | cpu: 32 26 | memory: 256Gi 27 | pods: 110 28 | capacity: 29 | cpu: 32 30 | memory: 256Gi 31 | pods: 110 32 | nodeInfo: 33 | architecture: amd64 34 | bootID: "" 35 | containerRuntimeVersion: "" 36 | kernelVersion: "" 37 | kubeProxyVersion: fake 38 | kubeletVersion: fake 39 | machineID: "" 40 | operatingSystem: linux 41 | osImage: "" 42 | systemUUID: "" 43 | phase: Running 44 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_podpool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: PodPool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: podpool 6 | app.kubernetes.io/instance: podpool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/created-by: kwok-operator 9 | name: podpool-sample 10 | namespace: default 11 | spec: 12 | podCount: 5 13 | podTemplate: 14 | metadata: 15 | name: kwok-operator 16 | labels: 17 | app.kubernetes.io/name: pod 18 | app.kubernetes.io/instance: pod-sample 19 | app.kubernetes.io/part-of: kwok-operator 20 | app.kubernetes.io/managed-by: kustomize 21 | app.kubernetes.io/created-by: kwok-operator 22 | spec: 23 | containers: 24 | - image: nginx 25 | name: nginx 26 | restartPolicy: Always 27 | -------------------------------------------------------------------------------- /config/samples/kwok.sigs_v1beta1_statefulsetpool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kwok.sigs.run-ai.com/v1beta1 2 | kind: StatefulsetPool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: statefulsetpool 6 | app.kubernetes.io/instance: statefulsetpool-sample 7 | app.kubernetes.io/part-of: kwok-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | app.kubernetes.io/created-by: kwok-operator 10 | name: statefulsetpool-sample 11 | spec: 12 | createPV: true # optional, default is false 13 | statefulsetCount: 2 14 | StatefulsetTemplate: 15 | metadata: 16 | labels: 17 | app.kubernetes.io/name: statefulsetpool 18 | app.kubernetes.io/instance: statefulsetpool-sample 19 | app.kubernetes.io/part-of: kwok-operator 20 | app.kubernetes.io/managed-by: kustomize 21 | app.kubernetes.io/created-by: kwok-operator 22 | spec: 23 | serviceName: "nginx" 24 | replicas: 15 25 
| selector: 26 | matchLabels: 27 | app: nginx 28 | template: 29 | metadata: 30 | labels: 31 | app: nginx 32 | spec: 33 | containers: 34 | - name: nginx 35 | image: registry.k8s.io/nginx-slim:0.21 36 | ports: 37 | - containerPort: 80 38 | name: web 39 | volumeMounts: 40 | - name: www 41 | mountPath: /usr/share/nginx/html 42 | volumeClaimTemplates: 43 | - metadata: 44 | name: www 45 | spec: 46 | accessModes: [ "ReadWriteOnce" ] 47 | resources: 48 | requests: 49 | storage: 1Gi -------------------------------------------------------------------------------- /config/scorecard/bases/config.yaml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /config/scorecard/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - bases/config.yaml 3 | patchesJson6902: 4 | - path: patches/basic.config.yaml 5 | target: 6 | group: scorecard.operatorframework.io 7 | version: v1alpha3 8 | kind: Configuration 9 | name: config 10 | - path: patches/olm.config.yaml 11 | target: 12 | group: scorecard.operatorframework.io 13 | version: v1alpha3 14 | kind: Configuration 15 | name: config 16 | #+kubebuilder:scaffold:patchesJson6902 17 | -------------------------------------------------------------------------------- /config/scorecard/patches/basic.config.yaml: -------------------------------------------------------------------------------- 1 | - op: add 2 | path: /stages/0/tests/- 3 | value: 4 | entrypoint: 5 | - scorecard-test 6 | - basic-check-spec 7 | image: quay.io/operator-framework/scorecard-test:v1.34.2 8 | labels: 9 | suite: basic 10 | test: basic-check-spec-test 11 | -------------------------------------------------------------------------------- /config/scorecard/patches/olm.config.yaml: -------------------------------------------------------------------------------- 1 | - op: add 2 | path: /stages/0/tests/- 3 | value: 4 | entrypoint: 5 | - scorecard-test 6 | - olm-bundle-validation 7 | image: quay.io/operator-framework/scorecard-test:v1.34.2 8 | labels: 9 | suite: olm 10 | test: olm-bundle-validation-test 11 | - op: add 12 | path: /stages/0/tests/- 13 | value: 14 | entrypoint: 15 | - scorecard-test 16 | - olm-crds-have-validation 17 | image: quay.io/operator-framework/scorecard-test:v1.34.2 18 | labels: 19 | suite: olm 20 | test: olm-crds-have-validation-test 21 | - op: add 22 | path: /stages/0/tests/- 23 | value: 24 | entrypoint: 25 | - scorecard-test 26 | - olm-crds-have-resources 27 | image: quay.io/operator-framework/scorecard-test:v1.34.2 28 | labels: 29 | suite: olm 30 | test: olm-crds-have-resources-test 31 | - op: add 32 | path: /stages/0/tests/- 33 | value: 34 | entrypoint: 35 | - scorecard-test 36 | - olm-spec-descriptors 37 | image: quay.io/operator-framework/scorecard-test:v1.34.2 38 | labels: 39 | suite: olm 40 | test: olm-spec-descriptors-test 41 | - op: add 42 | path: /stages/0/tests/- 43 | value: 44 | entrypoint: 45 | - scorecard-test 46 | - olm-status-descriptors 47 | image: quay.io/operator-framework/scorecard-test:v1.34.2 48 | labels: 49 | suite: olm 50 | test: olm-status-descriptors-test 51 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/run-ai/kwok-operator 2 | 3 | go 1.22.3 4 | 5 | require ( 6 | github.com/onsi/ginkgo/v2 v2.11.0 7 | github.com/onsi/gomega 
v1.27.10 8 | github.com/stretchr/testify v1.8.2 9 | k8s.io/api v0.28.3 10 | k8s.io/apimachinery v0.28.3 11 | k8s.io/client-go v0.28.3 12 | sigs.k8s.io/controller-runtime v0.16.3 13 | ) 14 | 15 | require ( 16 | github.com/beorn7/perks v1.0.1 // indirect 17 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 18 | github.com/davecgh/go-spew v1.1.1 // indirect 19 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 20 | github.com/evanphx/json-patch v5.6.0+incompatible // indirect 21 | github.com/evanphx/json-patch/v5 v5.6.0 // indirect 22 | github.com/fsnotify/fsnotify v1.6.0 // indirect 23 | github.com/go-logr/logr v1.2.4 // indirect 24 | github.com/go-logr/zapr v1.2.4 // indirect 25 | github.com/go-openapi/jsonpointer v0.19.6 // indirect 26 | github.com/go-openapi/jsonreference v0.20.2 // indirect 27 | github.com/go-openapi/swag v0.22.3 // indirect 28 | github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect 29 | github.com/gogo/protobuf v1.3.2 // indirect 30 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 31 | github.com/golang/protobuf v1.5.3 // indirect 32 | github.com/google/gnostic-models v0.6.8 // indirect 33 | github.com/google/go-cmp v0.5.9 // indirect 34 | github.com/google/gofuzz v1.2.0 // indirect 35 | github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect 36 | github.com/google/uuid v1.3.0 // indirect 37 | github.com/imdario/mergo v0.3.6 // indirect 38 | github.com/josharian/intern v1.0.0 // indirect 39 | github.com/json-iterator/go v1.1.12 // indirect 40 | github.com/mailru/easyjson v0.7.7 // indirect 41 | github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect 42 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 43 | github.com/modern-go/reflect2 v1.0.2 // indirect 44 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 45 | github.com/pkg/errors v0.9.1 // indirect 46 | github.com/pmezard/go-difflib v1.0.0 // indirect 47 | github.com/prometheus/client_golang v1.16.0 // indirect 48 | github.com/prometheus/client_model v0.4.0 // indirect 49 | github.com/prometheus/common v0.44.0 // indirect 50 | github.com/prometheus/procfs v0.10.1 // indirect 51 | github.com/spf13/pflag v1.0.5 // indirect 52 | go.uber.org/multierr v1.11.0 // indirect 53 | go.uber.org/zap v1.25.0 // indirect 54 | golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect 55 | golang.org/x/net v0.25.0 // indirect 56 | golang.org/x/oauth2 v0.8.0 // indirect 57 | golang.org/x/sys v0.20.0 // indirect 58 | golang.org/x/term v0.20.0 // indirect 59 | golang.org/x/text v0.15.0 // indirect 60 | golang.org/x/time v0.3.0 // indirect 61 | golang.org/x/tools v0.9.3 // indirect 62 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 63 | google.golang.org/appengine v1.6.7 // indirect 64 | google.golang.org/protobuf v1.33.0 // indirect 65 | gopkg.in/inf.v0 v0.9.1 // indirect 66 | gopkg.in/yaml.v2 v2.4.0 // indirect 67 | gopkg.in/yaml.v3 v3.0.1 // indirect 68 | k8s.io/apiextensions-apiserver v0.28.3 // indirect 69 | k8s.io/component-base v0.28.3 // indirect 70 | k8s.io/klog/v2 v2.100.1 // indirect 71 | k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 // indirect 72 | k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 // indirect 73 | sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect 74 | sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect 75 | sigs.k8s.io/yaml v1.3.0 // indirect 76 | ) 77 | -------------------------------------------------------------------------------- 
/hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ -------------------------------------------------------------------------------- /install_kwok.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | if [ -z "$1" ]; then 3 | echo "There is an option to install metrics-server." 4 | echo "run 'bash install_kwok.sh true' to install metrics-server. by default, it is false." 5 | INSTALL_METRICS_SERVER=false 6 | else 7 | INSTALL_METRICS_SERVER=$1 8 | fi 9 | 10 | 11 | KWOK_REPO=kubernetes-sigs/kwok 12 | KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name') 13 | 14 | echo "Apply Kwok version ${KWOK_LATEST_RELEASE}..." 15 | kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml" 16 | echo "check Kwok status..." 17 | kubectl wait --for=condition=Ready -n kube-system pod -l app=kwok-controller --timeout=300s 18 | if [ $? -ne 0 ]; then 19 | echo "Kwok is not ready, please check the logs." 20 | exit 1 21 | fi 22 | echo "Kwok is ready!" 23 | echo "install CRDs for Kwok..." 24 | kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml" 25 | 26 | if [ "$INSTALL_METRICS_SERVER" = true ]; then 27 | echo "install metrics-server..." 28 | kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/metrics-usage.yaml" 29 | fi 30 | 31 | echo "Kwok is installed successfully! - Happy Kwoking!" -------------------------------------------------------------------------------- /internal/controller/daemonsetpool_controller.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "time" 22 | 23 | appsv1 "k8s.io/api/apps/v1" 24 | corev1 "k8s.io/api/core/v1" 25 | "k8s.io/apimachinery/pkg/api/errors" 26 | apierrors "k8s.io/apimachinery/pkg/api/errors" 27 | "k8s.io/apimachinery/pkg/api/meta" 28 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | "k8s.io/apimachinery/pkg/runtime" 30 | 31 | //"k8s.io/client-go/tools/clientcmd/api" 32 | ctrl "sigs.k8s.io/controller-runtime" 33 | "sigs.k8s.io/controller-runtime/pkg/client" 34 | "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 35 | "sigs.k8s.io/controller-runtime/pkg/log" 36 | 37 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 38 | ) 39 | 40 | // DaemonsetPoolReconciler reconciles a DaemonsetPool object 41 | type DaemonsetPoolReconciler struct { 42 | client.Client 43 | Scheme *runtime.Scheme 44 | } 45 | 46 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=daemonsetpools,verbs=get;list;watch;create;update;patch;delete 47 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=daemonsetpools/status,verbs=get;update;patch 48 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=daemonsetpools/finalizers,verbs=update 49 | 50 | func (r *DaemonsetPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 51 | log := log.FromContext(ctx) 52 | log.Info("Reconciling DaemonsetPool") 53 | daemonsetPool := &kwoksigsv1beta1.DaemonsetPool{} 54 | err := r.Get(ctx, req.NamespacedName, daemonsetPool) 55 | if err != nil { 56 | if apierrors.IsNotFound(err) { 57 | log.Info("DaemonsetPool resource not found. Ignoring since object must be deleted") 58 | return ctrl.Result{}, nil 59 | } 60 | log.Error(err, "unable to fetch DaemonsetPool") 61 | return ctrl.Result{}, err 62 | } 63 | log.Info("DaemonsetPool resource found") 64 | if daemonsetPool.Status.Conditions == nil || len(daemonsetPool.Status.Conditions) == 0 { 65 | err = r.statusConditionController(ctx, daemonsetPool, metav1.Condition{ 66 | Type: "Available", 67 | Status: metav1.ConditionUnknown, 68 | Reason: "DaemonsetPoolCreated", 69 | Message: "Starting to reconcile DaemonsetPool", 70 | }) 71 | if err != nil { 72 | log.Error(err, "unable to update DaemonsetPool status") 73 | return ctrl.Result{}, err 74 | } 75 | err = r.Get(ctx, req.NamespacedName, daemonsetPool) 76 | if err != nil { 77 | log.Error(err, "unable to fetch DaemonsetPool") 78 | return ctrl.Result{}, err 79 | } 80 | return ctrl.Result{}, nil 81 | } 82 | // add finalizer to the daemonset pool 83 | if !controllerutil.ContainsFinalizer(daemonsetPool, controllerFinalizer) { 84 | log.Info("Adding Finalizer for the DaemonsetPool") 85 | err = r.addFinalizer(ctx, daemonsetPool) 86 | if err != nil { 87 | log.Error(err, "unable to update DaemonsetPool") 88 | return ctrl.Result{}, err 89 | } 90 | return ctrl.Result{}, nil 91 | } 92 | // Get Daemonset in the cluster with owner reference of the DaemonsetPool 93 | daemonsets, err := r.getDaemonsets(ctx, daemonsetPool) 94 | if err != nil { 95 | log.Error(err, "unable to get Daemonset") 96 | return ctrl.Result{}, err 97 | } 98 | if int32(len(daemonsets)) == 0 { 99 | log.Info("Daemonset resource not found.
Creating a new one") 100 | for i := int32(len(daemonsets)); i < daemonsetPool.Spec.DaemonsetCount; i++ { 101 | err = r.createDaemonset(ctx, daemonsetPool) 102 | if err != nil { 103 | return ctrl.Result{}, err 104 | } 105 | } 106 | return ctrl.Result{Requeue: true}, nil 107 | } 108 | err = r.statusConditionController(ctx, daemonsetPool, metav1.Condition{ 109 | Type: "Available", 110 | Status: metav1.ConditionTrue, 111 | Reason: "DaemonsetPoolReconciled", 112 | Message: "DaemonsetPool reconciled successfully", 113 | }) 114 | if err != nil { 115 | log.Error(err, "unable to update DaemonsetPool status") 116 | return ctrl.Result{}, err 117 | } 118 | // update Daemonset if the DaemonsetPool is updated 119 | if daemonsetPool.Status.ObservedGeneration != daemonsetPool.Generation { 120 | log.Info("DaemonsetPool updated. Updating Daemonset") 121 | err = r.Get(ctx, req.NamespacedName, daemonsetPool) 122 | if err != nil { 123 | log.Error(err, "unable to fetch DaemonsetPool") 124 | return ctrl.Result{}, err 125 | } 126 | err = r.statusConditionController(ctx, daemonsetPool, metav1.Condition{ 127 | Type: "Available", 128 | Status: metav1.ConditionFalse, 129 | Reason: "DaemonsetPoolReconciling", 130 | Message: "Updating DaemonsetPool", 131 | }) 132 | if err != nil { 133 | log.Error(err, "unable to update DaemonsetPool status") 134 | return ctrl.Result{}, err 135 | } 136 | log.Info("updating Daemonset") 137 | err = r.updateDaemonset(ctx, daemonsetPool) 138 | if err != nil { 139 | log.Error(err, "unable to update Daemonset") 140 | return ctrl.Result{}, nil 141 | } 142 | return ctrl.Result{}, nil 143 | } 144 | // delete the Daemonset if the DaemonsetPool is being deleted 145 | if !daemonsetPool.DeletionTimestamp.IsZero() { 146 | log.Info("DaemonsetPool is being deleted.
Deleting Daemonset") 147 | err = r.statusConditionController(ctx, daemonsetPool, metav1.Condition{ 148 | Type: "Available", 149 | Status: metav1.ConditionFalse, 150 | Reason: "Deleting", 151 | Message: "Deleting the daemonsetPool", 152 | }) 153 | if err != nil { 154 | log.Error(err, "unable to update DaemonsetPool status") 155 | return ctrl.Result{}, err 156 | } 157 | // delete the Daemonsets in the cluster 158 | for _, daemonset := range daemonsets { 159 | err = r.Delete(ctx, &daemonset) 160 | if err != nil { 161 | log.Error(err, "unable to delete daemonset") 162 | } 163 | } 164 | err = r.deleteFinalizer(ctx, daemonsetPool) 165 | if err != nil { 166 | log.Error(err, "unable to delete Finalizer") 167 | return ctrl.Result{}, nil 168 | } 169 | return ctrl.Result{}, nil 170 | } 171 | log.Info("Reconciliation completed successfully") 172 | return ctrl.Result{RequeueAfter: time.Duration(60 * time.Second)}, nil 173 | } 174 | 175 | // delete finalizer from the DaemonsetPool 176 | func (r *DaemonsetPoolReconciler) deleteFinalizer(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) error { 177 | controllerutil.RemoveFinalizer(daemonsetPool, controllerFinalizer) 178 | return r.Update(ctx, daemonsetPool) 179 | } 180 | 181 | // update the Daemonset in the cluster with owner reference to the DaemonsetPool 182 | func (r *DaemonsetPoolReconciler) updateDaemonset(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) error { 183 | // get the Daemonset spec from the cluster 184 | daemonsets, err := r.getDaemonsets(ctx, daemonsetPool) 185 | log.Log.Info("Updating daemonset", "daemonset", daemonsets) 186 | if err != nil { 187 | return err 188 | } 189 | // loop through the Daemonset and update the Daemonset with the DaemonsetPool spec 190 | if len(daemonsets) < int(daemonsetPool.Spec.DaemonsetCount) { 191 | log.Log.Info("the len of the current daemonset is: ", "len", len(daemonsets)) 192 | for i := int32(len(daemonsets)); i < daemonsetPool.Spec.DaemonsetCount; i++ { 193 | err = r.createDaemonset(ctx, daemonsetPool) 194 | if err != nil { 195 | log.Log.Error(err, "unable to create Daemonset") 196 | return err 197 | } 198 | } 199 | } else if len(daemonsets) > int(daemonsetPool.Spec.DaemonsetCount) { 200 | for i := int32(len(daemonsets)); i > daemonsetPool.Spec.DaemonsetCount; i-- { 201 | err = r.Delete(ctx, &daemonsets[i-1]) 202 | if err != nil { 203 | return err 204 | } 205 | } 206 | } else { 207 | for i := 0; i < len(daemonsets); i++ { 208 | daemonset := daemonsets[i] 209 | daemonset.Spec.Template.Spec.Containers = daemonsetPool.Spec.DaemonsetTemplate.Spec.Template.Spec.Containers 210 | err = r.Update(ctx, &daemonset) 211 | if err != nil { 212 | return err 213 | } 214 | } 215 | } 216 | 217 | err = r.updateObservedGeneration(ctx, daemonsetPool) 218 | if err != nil { 219 | return err 220 | } 221 | return nil 222 | } 223 | 224 | // create the Daemonset in the cluster with owner reference to the DaemonsetPool 225 | func (r *DaemonsetPoolReconciler) createDaemonset(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) error { 226 | daemonsetLabels := daemonsetPool.Spec.DaemonsetTemplate.Labels 227 | if daemonsetLabels == nil { 228 | daemonsetLabels = make(map[string]string) 229 | } 230 | daemonsetLabels[controllerLabel] = daemonsetPool.Name 231 | daemonsetToleration := daemonsetPool.Spec.DaemonsetTemplate.Spec.Template.Spec.Tolerations 232 | if daemonsetToleration == nil { 233 | daemonsetToleration = make([]corev1.Toleration, 0) 234 | } 235 | daemonsetToleration = 
append(daemonsetToleration, corev1.Toleration{ 236 | Key: controllerAnnotation, 237 | Operator: corev1.TolerationOpExists, 238 | Effect: corev1.TaintEffectNoSchedule, 239 | }) 240 | daemonset := &appsv1.DaemonSet{ 241 | ObjectMeta: metav1.ObjectMeta{ 242 | OwnerReferences: []metav1.OwnerReference{ 243 | *metav1.NewControllerRef(daemonsetPool, kwoksigsv1beta1.GroupVersion.WithKind("DaemonsetPool")), 244 | }, 245 | GenerateName: daemonsetPool.Name + "-", 246 | Namespace: daemonsetPool.Namespace, 247 | Labels: daemonsetLabels, 248 | }, 249 | Spec: daemonsetPool.Spec.DaemonsetTemplate.Spec, 250 | } 251 | daemonset.Spec.Template.ObjectMeta.Labels = map[string]string{ 252 | controllerLabel: daemonsetPool.Name, 253 | } 254 | daemonset.Spec.Template.Spec.Tolerations = daemonsetToleration 255 | daemonset.Spec.Selector = &metav1.LabelSelector{ 256 | MatchLabels: map[string]string{ 257 | controllerLabel: daemonsetPool.Name, 258 | }, 259 | } 260 | 261 | err := r.Create(ctx, daemonset) 262 | if err != nil { 263 | return err 264 | } 265 | err = r.updateObservedGeneration(ctx, daemonsetPool) 266 | if err != nil { 267 | return err 268 | } 269 | return nil 270 | } 271 | 272 | // update the observed generation of the DaemonsetPool 273 | func (r *DaemonsetPoolReconciler) updateObservedGeneration(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) error { 274 | daemonsetPool.Status.ObservedGeneration = daemonsetPool.Generation 275 | return r.Status().Update(ctx, daemonsetPool) 276 | } 277 | 278 | // get the Daemonset in the cluster with owner reference to the DaemonsetPool 279 | func (r *DaemonsetPoolReconciler) getDaemonsets(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) ([]appsv1.DaemonSet, error) { 280 | daemonset := &appsv1.DaemonSetList{} 281 | err := r.List(ctx, daemonset, client.InNamespace(daemonsetPool.Namespace), client.MatchingLabels{controllerLabel: daemonsetPool.Name}) 282 | if err != nil && errors.IsNotFound(err) { 283 | return []appsv1.DaemonSet{}, nil 284 | } else if err != nil { 285 | return nil, err 286 | } 287 | return daemonset.Items, nil 288 | } 289 | 290 | // adding finalizer to the DaemonsetPool 291 | func (r *DaemonsetPoolReconciler) addFinalizer(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool) error { 292 | controllerutil.AddFinalizer(daemonsetPool, controllerFinalizer) 293 | return r.Update(ctx, daemonsetPool) 294 | } 295 | 296 | // update the status of the DaemonsetPool 297 | func (r *DaemonsetPoolReconciler) statusConditionController(ctx context.Context, daemonsetPool *kwoksigsv1beta1.DaemonsetPool, condition metav1.Condition) error { 298 | meta.SetStatusCondition(&daemonsetPool.Status.Conditions, condition) 299 | return r.Status().Update(ctx, daemonsetPool) 300 | } 301 | 302 | // SetupWithManager sets up the controller with the Manager. 303 | func (r *DaemonsetPoolReconciler) SetupWithManager(mgr ctrl.Manager) error { 304 | return ctrl.NewControllerManagedBy(mgr). 305 | For(&kwoksigsv1beta1.DaemonsetPool{}). 306 | Complete(r) 307 | } 308 | -------------------------------------------------------------------------------- /internal/controller/daemonsetpool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | appsv1 "k8s.io/api/apps/v1" 26 | corev1 "k8s.io/api/core/v1" 27 | "k8s.io/apimachinery/pkg/api/errors" 28 | "k8s.io/apimachinery/pkg/types" 29 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 30 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 31 | 32 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 33 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 | ) 35 | 36 | var _ = Describe("DaemonsetPool Controller", func() { 37 | Context("When reconciling a resource", func() { 38 | const resourceName = "test-resource" 39 | 40 | ctx := context.Background() 41 | 42 | typeNamespacedName := types.NamespacedName{ 43 | Name: resourceName, 44 | Namespace: "default", 45 | } 46 | daemonsetpool := &kwoksigsv1beta1.DaemonsetPool{} 47 | 48 | BeforeEach(func() { 49 | By("creating the custom resource for the Kind DaemonsetPool") 50 | err := k8sClient.Get(ctx, typeNamespacedName, daemonsetpool) 51 | if err != nil && errors.IsNotFound(err) { 52 | resource := &kwoksigsv1beta1.DaemonsetPool{ 53 | ObjectMeta: metav1.ObjectMeta{ 54 | Name: resourceName, 55 | Namespace: "default", 56 | }, 57 | Spec: kwoksigsv1beta1.DaemonsetPoolSpec{ 58 | DaemonsetTemplate: appsv1.DaemonSet{ 59 | ObjectMeta: metav1.ObjectMeta{ 60 | Name: resourceName, 61 | Namespace: "default", 62 | }, 63 | Spec: appsv1.DaemonSetSpec{ 64 | Selector: &metav1.LabelSelector{ 65 | MatchLabels: map[string]string{ 66 | "app": resourceName, 67 | }, 68 | }, 69 | Template: corev1.PodTemplateSpec{ 70 | ObjectMeta: metav1.ObjectMeta{ 71 | Labels: map[string]string{ 72 | "app": resourceName, 73 | }, 74 | }, 75 | Spec: corev1.PodSpec{ 76 | Containers: []corev1.Container{ 77 | { 78 | Name: "test-container", 79 | Image: "nginx", 80 | }, 81 | }, 82 | }, 83 | }, 84 | }, 85 | }, 86 | }, 87 | } 88 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 89 | } 90 | }) 91 | 92 | AfterEach(func() { 93 | resource := &kwoksigsv1beta1.DaemonsetPool{} 94 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 95 | Expect(err).NotTo(HaveOccurred()) 96 | 97 | By("Cleanup the specific resource instance DaemonsetPool") 98 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 99 | }) 100 | It("should successfully reconcile the resource", func() { 101 | By("Reconciling the created resource") 102 | controllerReconciler := &DaemonsetPoolReconciler{ 103 | Client: k8sClient, 104 | Scheme: k8sClient.Scheme(), 105 | } 106 | 107 | _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ 108 | NamespacedName: typeNamespacedName, 109 | }) 110 | Expect(err).NotTo(HaveOccurred()) 111 | }) 112 | }) 113 | }) 114 | 115 | func TestReconcileDaemonsetPool(t *testing.T) { 116 | //create a fake client to mock API calls 117 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&kwoksigsv1beta1.DaemonsetPool{}).Build() 118 | 119 | // Create a ReconcileDaemonsetPool object with the scheme and fake client. 
120 | r := &DaemonsetPoolReconciler{ 121 | Client: fakeClient, 122 | Scheme: setupScheme(), 123 | } 124 | 125 | // Create a DaemonsetPool object with the scheme and fake client. 126 | daemonsetpool := &kwoksigsv1beta1.DaemonsetPool{ 127 | ObjectMeta: metav1.ObjectMeta{ 128 | Name: "test-daemonsetpool", 129 | Namespace: "default", 130 | }, 131 | Spec: kwoksigsv1beta1.DaemonsetPoolSpec{ 132 | DaemonsetTemplate: appsv1.DaemonSet{ 133 | ObjectMeta: metav1.ObjectMeta{ 134 | Name: "test-daemonset", 135 | Namespace: "default", 136 | }, 137 | Spec: appsv1.DaemonSetSpec{ 138 | Selector: &metav1.LabelSelector{ 139 | MatchLabels: map[string]string{ 140 | "app": "test-daemonset", 141 | }, 142 | }, 143 | Template: corev1.PodTemplateSpec{ 144 | ObjectMeta: metav1.ObjectMeta{ 145 | Labels: map[string]string{ 146 | "app": "test-daemonset", 147 | }, 148 | }, 149 | Spec: corev1.PodSpec{ 150 | Containers: []corev1.Container{ 151 | { 152 | Name: "test-container", 153 | Image: "nginx", 154 | }, 155 | }, 156 | }, 157 | }, 158 | }, 159 | }, 160 | }, 161 | } 162 | err := fakeClient.Create(ctx, daemonsetpool) 163 | if err != nil { 164 | t.Fatalf("create DaemonsetPool: (%v)", err) 165 | } 166 | // Reconcile an object to get back a result 167 | res, err := r.Reconcile(ctx, reconcile.Request{ 168 | NamespacedName: types.NamespacedName{ 169 | Name: "test-daemonsetpool", 170 | Namespace: "default", 171 | }, 172 | }) 173 | if res != (reconcile.Result{}) { 174 | t.Fatalf("Reconcile did not return an empty result") 175 | } 176 | // Check to make sure the reconcile was successful and that it should requeue the request. 177 | if err != nil { 178 | t.Fatalf("reconcile: (%v)", err) 179 | } 180 | // get latest DaemonsetPool object 181 | err = fakeClient.Get(ctx, types.NamespacedName{ 182 | Name: "test-daemonsetpool", 183 | Namespace: "default", 184 | }, daemonsetpool) 185 | if err != nil { 186 | t.Fatalf("get DaemonsetPool: (%v)", err) 187 | } 188 | // change image of container 189 | daemonsetpool.Spec.DaemonsetTemplate.Spec.Template.Spec.Containers[0].Image = "busybox" 190 | err = fakeClient.Update(ctx, daemonsetpool) 191 | if err != nil { 192 | t.Fatalf("update DaemonsetPool: (%v)", err) 193 | } 194 | // Reconcile an object to get back a result 195 | res, err = r.Reconcile(ctx, reconcile.Request{ 196 | NamespacedName: types.NamespacedName{ 197 | Name: "test-daemonsetpool", 198 | Namespace: "default", 199 | }, 200 | }) 201 | if res != (reconcile.Result{}) { 202 | t.Fatalf("Reconcile did not return an empty result") 203 | } 204 | // Check to make sure the reconcile was successful and that it should requeue the request. 205 | if err != nil { 206 | t.Fatalf("reconcile: (%v)", err) 207 | } 208 | // get latest DaemonsetPool object 209 | err = fakeClient.Get(ctx, types.NamespacedName{ 210 | Name: "test-daemonsetpool", 211 | Namespace: "default", 212 | }, daemonsetpool) 213 | if err != nil { 214 | t.Fatalf("get DaemonsetPool: (%v)", err) 215 | } 216 | //delete DaemonsetPool object 217 | err = fakeClient.Delete(ctx, daemonsetpool) 218 | if err != nil { 219 | t.Fatalf("delete DaemonsetPool: (%v)", err) 220 | } 221 | // Reconcile an object to get back a result 222 | res, err = r.Reconcile(ctx, reconcile.Request{ 223 | NamespacedName: types.NamespacedName{ 224 | Name: "test-daemonsetpool", 225 | Namespace: "default", 226 | }, 227 | }) 228 | // Check to make sure the reconcile was successful and that it should requeue the request. 
229 | if err != nil { 230 | t.Fatalf("reconcile: (%v)", err) 231 | } 232 | // validate the daemosetpool deleted 233 | err = fakeClient.Get(ctx, types.NamespacedName{ 234 | Name: "test-daemonsetpool", 235 | Namespace: "default", 236 | }, daemonsetpool) 237 | 238 | // list DaemonsetPool and check if the DaemonsetPool has been created 239 | daemonsetpoolList := &kwoksigsv1beta1.DaemonsetPoolList{ 240 | Items: []kwoksigsv1beta1.DaemonsetPool{}, 241 | } 242 | err = fakeClient.List(ctx, daemonsetpoolList) 243 | if err != nil { 244 | t.Fatalf("list DaemonsetPool: (%v)", err) 245 | } 246 | 247 | } 248 | -------------------------------------------------------------------------------- /internal/controller/deploymentpool_controller.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "time" 22 | 23 | "k8s.io/apimachinery/pkg/api/errors" 24 | 25 | "k8s.io/apimachinery/pkg/runtime" 26 | 27 | appsv1 "k8s.io/api/apps/v1" 28 | corev1 "k8s.io/api/core/v1" 29 | apierrors "k8s.io/apimachinery/pkg/api/errors" 30 | "k8s.io/apimachinery/pkg/api/meta" 31 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 | ctrl "sigs.k8s.io/controller-runtime" 33 | "sigs.k8s.io/controller-runtime/pkg/client" 34 | "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 35 | "sigs.k8s.io/controller-runtime/pkg/log" 36 | 37 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 38 | ) 39 | 40 | // DeploymentPoolReconciler reconciles a DeploymentPool object 41 | type DeploymentPoolReconciler struct { 42 | client.Client 43 | Scheme *runtime.Scheme 44 | } 45 | 46 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=deploymentpools,verbs=get;list;watch;create;update;patch;delete 47 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=deploymentpools/status,verbs=get;update;patch 48 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=deploymentpools/finalizers,verbs=update 49 | 50 | func (r *DeploymentPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 51 | log := log.FromContext(ctx) 52 | log.Info("Reconciling DeploymentPool") 53 | deploymentPool := &kwoksigsv1beta1.DeploymentPool{} 54 | err := r.Get(ctx, req.NamespacedName, deploymentPool) 55 | if err != nil { 56 | if apierrors.IsNotFound(err) { 57 | log.Info("DeploymentPool resource not found. 
Ignoring since object must be deleted") 58 | return ctrl.Result{}, nil 59 | } 60 | log.Error(err, "unable to fetch DeploymentPool") 61 | return ctrl.Result{}, err 62 | } 63 | log.Info("DeploymentPool resource found") 64 | 65 | if deploymentPool.Status.Conditions == nil || len(deploymentPool.Status.Conditions) == 0 { 66 | err = r.statusConditionController(ctx, deploymentPool, metav1.Condition{ 67 | Type: "Available", 68 | Status: metav1.ConditionUnknown, 69 | Reason: "DeploymentPoolCreated", 70 | Message: "Starting to reconcile DeploymentPool", 71 | }) 72 | if err != nil { 73 | log.Error(err, "unable to update DeploymentPool status") 74 | return ctrl.Result{}, err 75 | } 76 | err = r.Get(ctx, req.NamespacedName, deploymentPool) 77 | if err != nil { 78 | log.Error(err, "unable to fetch DeploymentPool") 79 | return ctrl.Result{}, err 80 | } 81 | 82 | return ctrl.Result{}, nil 83 | } 84 | // add finalizer to the deployment pool 85 | if !controllerutil.ContainsFinalizer(deploymentPool, controllerFinalizer) { 86 | log.Info("Adding Finalizer for the DeploymentPool") 87 | err = r.addFinalizer(ctx, deploymentPool) 88 | if err != nil { 89 | log.Error(err, "unable to add Finalizer for the DeploymentPool") 90 | return ctrl.Result{}, err 91 | } 92 | return ctrl.Result{}, nil 93 | } 94 | 95 | // Get Deployment in the cluster with owner reference to the DeploymentPool 96 | deployments, err := r.getDeployments(ctx, deploymentPool) 97 | if err != nil { 98 | //log.Error(err, "unable to get Deployment", deploymentPool) 99 | return ctrl.Result{}, err 100 | } 101 | // Create Deployment if it does not exist 102 | if len(deployments) == 0 { 103 | //log.Info("Creating %v Deployments", deploymentPool.Spec.DeploymentCount) 104 | for i := 0; i < int(deploymentPool.Spec.DeploymentCount); i++ { 105 | err = r.createDeployment(ctx, deploymentPool) 106 | if err != nil { 107 | return ctrl.Result{}, err 108 | } 109 | } 110 | err = r.updateObservedGeneration(ctx, deploymentPool) 111 | if err != nil { 112 | return ctrl.Result{}, err 113 | } 114 | return ctrl.Result{Requeue: true}, nil 115 | } 116 | // update DeploymentPool status to condition true 117 | err = r.statusConditionController(ctx, deploymentPool, metav1.Condition{ 118 | Type: "Available", 119 | Status: metav1.ConditionTrue, 120 | Reason: "DeploymentPoolReconciled", 121 | Message: "DeploymentPool reconciled successfully", 122 | }) 123 | if err != nil { 124 | log.Error(err, "unable to update DeploymentPool status") 125 | return ctrl.Result{Requeue: true}, nil 126 | } 127 | // update status of the deployment pool 128 | 129 | if deploymentPool.Status.ObservedGeneration != deploymentPool.Generation { 130 | log.Info("DeploymentPool generation has changed, requeuing") 131 | err = r.Get(ctx, req.NamespacedName, deploymentPool) 132 | if err != nil { 133 | log.Error(err, "unable to fetch DeploymentPool") 134 | return ctrl.Result{}, err 135 | } 136 | err = r.statusConditionController(ctx, deploymentPool, metav1.Condition{ 137 | Type: "Available", 138 | Status: metav1.ConditionFalse, 139 | Reason: "DeploymentPoolReconciling", 140 | Message: "Updating DeploymentPool", 141 | }) 142 | log.Info("Updating DeploymentPool") 143 | if err != nil { 144 | log.Error(err, "unable to update DeploymentPool status") 145 | return ctrl.Result{}, err 146 | } 147 | forceRequeue := false 148 | forceRequeue, err = r.updateDeployment(ctx, deploymentPool) 149 | if err != nil { 150 | log.Error(err, "unable to update Deployment") 151 | return ctrl.Result{}, err 152 | } 153 | if forceRequeue { 154 | 
println("Requeueing the deployment") 155 | return ctrl.Result{Requeue: true}, nil 156 | } 157 | return ctrl.Result{}, nil 158 | } 159 | if !deploymentPool.DeletionTimestamp.IsZero() { 160 | log.Info("Deleting DeploymentPool") 161 | err = r.statusConditionController(ctx, deploymentPool, metav1.Condition{ 162 | Type: "Available", 163 | Status: metav1.ConditionFalse, 164 | Reason: "Deleting", 165 | Message: "Deleting the deploymentPool", 166 | }) 167 | if err != nil { 168 | log.Error(err, "unable to update DeploymentPool status") 169 | return ctrl.Result{}, nil 170 | } 171 | for _, deployment := range deployments { 172 | err = r.Delete(ctx, &deployment) 173 | if err != nil { 174 | log.Error(err, "unable to delete Deployment") 175 | return ctrl.Result{}, nil 176 | } 177 | } 178 | err = r.deleteFinalizer(ctx, deploymentPool) 179 | if err != nil { 180 | log.Error(err, "unable to delete Finalizer") 181 | return ctrl.Result{}, nil 182 | } 183 | return ctrl.Result{}, nil 184 | } 185 | log.Info("Reconciliation completed successfully") 186 | return ctrl.Result{RequeueAfter: time.Duration(60 * time.Second)}, nil 187 | } 188 | 189 | func (r *DeploymentPoolReconciler) statusConditionController(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool, condition metav1.Condition) error { 190 | meta.SetStatusCondition(&deploymentPool.Status.Conditions, condition) 191 | return r.Status().Update(ctx, deploymentPool) 192 | } 193 | 194 | func (r *DeploymentPoolReconciler) addFinalizer(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) error { 195 | controllerutil.AddFinalizer(deploymentPool, controllerFinalizer) 196 | return r.Update(ctx, deploymentPool) 197 | } 198 | 199 | func (r *DeploymentPoolReconciler) deleteFinalizer(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) error { 200 | controllerutil.RemoveFinalizer(deploymentPool, controllerFinalizer) 201 | return r.Update(ctx, deploymentPool) 202 | } 203 | 204 | func (r *DeploymentPoolReconciler) getDeployments(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) ([]appsv1.Deployment, error) { 205 | 206 | deployment := &appsv1.DeploymentList{} 207 | err := r.List(ctx, deployment, client.InNamespace(deploymentPool.Namespace), client.MatchingLabels{controllerLabel: deploymentPool.Name}) 208 | if err != nil && errors.IsNotFound(err) { 209 | return []appsv1.Deployment{}, nil 210 | } else if err != nil { 211 | return nil, err 212 | } 213 | return deployment.Items, nil 214 | } 215 | 216 | // update deployment 217 | func (r *DeploymentPoolReconciler) updateDeployment(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) (bool, error) { 218 | // get the deployment spec from the cluster 219 | forceRequeue := false 220 | deployments, err := r.getDeployments(ctx, deploymentPool) 221 | if err != nil { 222 | return forceRequeue, err 223 | } 224 | if len(deployments) < int(deploymentPool.Spec.DeploymentCount) { 225 | for i := int32(len(deployments)); i < deploymentPool.Spec.DeploymentCount; i++ { 226 | err = r.createDeployment(ctx, deploymentPool) 227 | if err != nil { 228 | log.Log.Error(err, "unable to create Deployment") 229 | return forceRequeue, err 230 | } 231 | } 232 | forceRequeue = true 233 | return forceRequeue, nil 234 | } else if len(deployments) > int(deploymentPool.Spec.DeploymentCount) { 235 | for i := int32(len(deployments)); i > deploymentPool.Spec.DeploymentCount; i-- { 236 | err = r.Delete(ctx, &deployments[i-1]) 237 | if err != nil { 238 | log.Log.Error(err, "unable to 
delete Deployment") 239 | return forceRequeue, err 240 | } 241 | } 242 | forceRequeue = true 243 | return forceRequeue, nil 244 | } else { 245 | for i := 0; i < len(deployments); i++ { 246 | deployment := &deployments[i] 247 | deployment.Spec.Replicas = deploymentPool.Spec.DeploymentTemplate.Spec.Replicas 248 | deployment.Spec.Template.Spec.Containers = deploymentPool.Spec.DeploymentTemplate.Spec.Template.Spec.Containers 249 | err = r.Update(ctx, deployment) 250 | if err != nil { 251 | log.Log.Error(err, "unable to update Deployment") 252 | return forceRequeue, err 253 | } 254 | } 255 | } 256 | err = r.updateObservedGeneration(ctx, deploymentPool) 257 | if err != nil { 258 | log.Log.Error(err, "unable to update DeploymentPool") 259 | return forceRequeue, err 260 | } 261 | return forceRequeue, nil 262 | } 263 | 264 | // create deployment 265 | func (r *DeploymentPoolReconciler) createDeployment(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) error { 266 | appendSelector := deploymentPool.Spec.DeploymentTemplate.Spec.Selector 267 | appendSelector.MatchLabels[controllerLabel] = deploymentPool.Name 268 | 269 | overrideLabels := deploymentPool.Spec.DeploymentTemplate.Spec.Template.Labels 270 | if overrideLabels == nil { 271 | overrideLabels = map[string]string{ 272 | controllerLabel: deploymentPool.Name, 273 | } 274 | } else { 275 | overrideLabels[controllerLabel] = deploymentPool.Name 276 | } 277 | 278 | deploymentToleration := deploymentPool.Spec.DeploymentTemplate.Spec.Template.Spec.Tolerations 279 | if deploymentToleration == nil { 280 | deploymentToleration = make([]corev1.Toleration, 0) 281 | } 282 | 283 | deploymentToleration = append(deploymentToleration, corev1.Toleration{ 284 | Key: controllerAnnotation, 285 | Operator: corev1.TolerationOpExists, 286 | Effect: corev1.TaintEffectNoSchedule, 287 | }) 288 | 289 | deployment := &appsv1.Deployment{ 290 | ObjectMeta: metav1.ObjectMeta{ 291 | OwnerReferences: []metav1.OwnerReference{ 292 | *metav1.NewControllerRef(deploymentPool, kwoksigsv1beta1.GroupVersion.WithKind("DeploymentPool")), 293 | }, 294 | GenerateName: deploymentPool.Name + "-", 295 | Namespace: deploymentPool.Namespace, 296 | Labels: overrideLabels, 297 | }, 298 | Spec: deploymentPool.Spec.DeploymentTemplate.Spec, 299 | } 300 | deployment.Spec.Template.ObjectMeta.Labels = overrideLabels 301 | deployment.Spec.Selector = appendSelector 302 | deployment.Spec.Template.Spec.Tolerations = deploymentToleration 303 | 304 | err := r.Create(ctx, deployment) 305 | if err != nil { 306 | return err 307 | } 308 | return nil 309 | } 310 | 311 | // updateObservedGeneration updates the observed generation of the NodePool 312 | func (r *DeploymentPoolReconciler) updateObservedGeneration(ctx context.Context, deploymentPool *kwoksigsv1beta1.DeploymentPool) error { 313 | deploymentPool.Status.ObservedGeneration = deploymentPool.Generation 314 | return r.Status().Update(ctx, deploymentPool) 315 | } 316 | 317 | // SetupWithManager sets up the controller with the Manager. 318 | func (r *DeploymentPoolReconciler) SetupWithManager(mgr ctrl.Manager) error { 319 | return ctrl.NewControllerManagedBy(mgr). 320 | For(&kwoksigsv1beta1.DeploymentPool{}). 321 | Complete(r) 322 | } 323 | -------------------------------------------------------------------------------- /internal/controller/deploymentpool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | 26 | appsv1 "k8s.io/api/apps/v1" 27 | corev1 "k8s.io/api/core/v1" 28 | "k8s.io/apimachinery/pkg/api/errors" 29 | 30 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 | "k8s.io/apimachinery/pkg/types" 32 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 33 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 34 | 35 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 36 | ) 37 | 38 | const ( 39 | deploymentpoolName = "test-deploymentpool" 40 | deploymentpoolNamespace = "default" 41 | deploymentName = "test-deployment" 42 | ) 43 | 44 | var ( 45 | typeNamespacedName = types.NamespacedName{ 46 | Name: deploymentpoolName, 47 | Namespace: deploymentpoolNamespace, 48 | } 49 | ctx = context.Background() 50 | ) 51 | 52 | var _ = Describe("DeploymentPool Controller", func() { 53 | Context("When reconciling a resource", func() { 54 | 55 | deploymentpool := &kwoksigsv1beta1.DeploymentPool{} 56 | 57 | BeforeEach(func() { 58 | By("creating the custom resource for the Kind DeploymentPool") 59 | err := k8sClient.Get(ctx, typeNamespacedName, deploymentpool) 60 | if err != nil && errors.IsNotFound(err) { 61 | resource := &kwoksigsv1beta1.DeploymentPool{ 62 | ObjectMeta: metav1.ObjectMeta{ 63 | Name: deploymentpoolName, 64 | Namespace: deploymentpoolNamespace, 65 | }, 66 | Spec: kwoksigsv1beta1.DeploymentPoolSpec{ 67 | DeploymentTemplate: appsv1.Deployment{ 68 | ObjectMeta: metav1.ObjectMeta{ 69 | Name: deploymentName, 70 | Namespace: deploymentpoolNamespace, 71 | }, 72 | Spec: appsv1.DeploymentSpec{ 73 | Replicas: func() *int32 { i := int32(1); return &i }(), 74 | Selector: &metav1.LabelSelector{ 75 | MatchLabels: map[string]string{ 76 | "app": deploymentName, 77 | }, 78 | }, 79 | Template: corev1.PodTemplateSpec{ 80 | ObjectMeta: metav1.ObjectMeta{ 81 | Labels: map[string]string{ 82 | "app": deploymentName, 83 | }, 84 | }, 85 | Spec: corev1.PodSpec{ 86 | Containers: []corev1.Container{ 87 | { 88 | Name: "test-container", 89 | Image: "nginx", 90 | }, 91 | }, 92 | }, 93 | }, 94 | }, 95 | }, 96 | }, 97 | } 98 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 99 | } 100 | }) 101 | 102 | AfterEach(func() { 103 | resource := &kwoksigsv1beta1.DeploymentPool{} 104 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 105 | Expect(err).NotTo(HaveOccurred()) 106 | 107 | By("Cleanup the specific resource instance DeploymentPool") 108 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 109 | }) 110 | It("should successfully reconcile the resource", func() { 111 | By("Reconciling the created resource") 112 | controllerReconciler := &DeploymentPoolReconciler{ 113 | Client: k8sClient, 114 | Scheme: k8sClient.Scheme(), 115 | } 116 | 117 | _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ 118 | NamespacedName: typeNamespacedName, 119 | }) 120 | 
Expect(err).NotTo(HaveOccurred()) 121 | }) 122 | }) 123 | }) 124 | 125 | func TestReconcileDeploymentPool(t *testing.T) { 126 | // Create a fake client with the scheme and status subresource with all v1 objects. 127 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&kwoksigsv1beta1.DeploymentPool{}).Build() 128 | 129 | // Create a ReconcileDeploymentPool object with the scheme and fake client. 130 | r := &DeploymentPoolReconciler{ 131 | Client: fakeClient, 132 | Scheme: setupScheme(), 133 | } 134 | // Create a DeploymentPool object and Deployment object to use in test 135 | dp := &kwoksigsv1beta1.DeploymentPool{ 136 | ObjectMeta: metav1.ObjectMeta{ 137 | Name: deploymentpoolName, 138 | Namespace: deploymentpoolNamespace, 139 | }, 140 | Spec: kwoksigsv1beta1.DeploymentPoolSpec{ 141 | DeploymentTemplate: appsv1.Deployment{ 142 | ObjectMeta: metav1.ObjectMeta{ 143 | Name: deploymentName, 144 | Namespace: deploymentpoolNamespace, 145 | }, 146 | Spec: appsv1.DeploymentSpec{ 147 | Replicas: func() *int32 { i := int32(1); return &i }(), 148 | Selector: &metav1.LabelSelector{ 149 | MatchLabels: map[string]string{ 150 | "app": deploymentName, 151 | }, 152 | }, 153 | Template: corev1.PodTemplateSpec{ 154 | ObjectMeta: metav1.ObjectMeta{ 155 | Labels: map[string]string{ 156 | "app": deploymentName, 157 | }, 158 | }, 159 | Spec: corev1.PodSpec{ 160 | Containers: []corev1.Container{ 161 | { 162 | Name: "test-container", 163 | Image: "nginx", 164 | }, 165 | }, 166 | }, 167 | }, 168 | }, 169 | }, 170 | }, 171 | } 172 | // Create the DeploymentPool object in the fake client. 173 | err := fakeClient.Create(ctx, dp) 174 | if err != nil { 175 | t.Fatalf("create DeploymentPool: (%v)", err) 176 | } 177 | // Reconcile an object to get back a result. 178 | res, err := r.Reconcile(ctx, reconcile.Request{ 179 | NamespacedName: typeNamespacedName, 180 | }) 181 | 182 | if res != (reconcile.Result{}) { 183 | t.Fatalf("reconcile did not return an empty result") 184 | } 185 | // Check to make sure the reconcile was successful and that it should requeue the request. 186 | if err != nil { 187 | t.Fatalf("reconcile: (%v)", err) 188 | } 189 | // list deployment and check if the deployment has been created 190 | depList := &appsv1.DeploymentList{} 191 | err = fakeClient.List(ctx, depList) 192 | if err != nil { 193 | t.Fatalf("list Deployment: (%v)", err) 194 | } 195 | // Check to make sure the DeploymentPool has been reconciled. 196 | err = fakeClient.Get(ctx, typeNamespacedName, dp) 197 | if err != nil { 198 | t.Fatalf("get DeploymentPool: (%v)", err) 199 | } 200 | if dp.Status.ObservedGeneration != dp.Generation { 201 | t.Fatalf("observedGeneration not updated") 202 | } 203 | // update replicas to 8 204 | replicas := int32(8) 205 | dp.Spec.DeploymentTemplate.Spec.Replicas = &replicas 206 | err = fakeClient.Update(ctx, dp) 207 | if err != nil { 208 | t.Fatalf("update DeploymentPool: (%v)", err) 209 | } 210 | // Reconcile the updated object. 211 | res, err = r.Reconcile(ctx, reconcile.Request{ 212 | NamespacedName: typeNamespacedName, 213 | }) 214 | if err != nil { 215 | t.Fatalf("reconcile: (%v)", err) 216 | } 217 | 218 | // delete deploymentPool and check if the deployment has been deleted 219 | err = fakeClient.Delete(ctx, dp) 220 | if err != nil { 221 | t.Fatalf("delete DeploymentPool: (%v)", err) 222 | } 223 | // Reconcile the deleted object. 
224 | res, err = r.Reconcile(ctx, reconcile.Request{ 225 | NamespacedName: typeNamespacedName, 226 | }) 227 | if err != nil { 228 | t.Fatalf("reconcile: (%v)", err) 229 | } 230 | // Check to make sure the DeploymentPool has been deleted. 231 | dpList := &kwoksigsv1beta1.DeploymentPoolList{} 232 | err = fakeClient.List(ctx, dpList) 233 | if err != nil { 234 | t.Fatalf("list DeploymentPool: (%v)", err) 235 | } 236 | err = fakeClient.List(ctx, depList) 237 | if err != nil { 238 | t.Fatalf("list Deployment: (%v)", err) 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /internal/controller/global.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | v1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 5 | appsv1 "k8s.io/api/apps/v1" 6 | corev1 "k8s.io/api/core/v1" 7 | "k8s.io/apimachinery/pkg/runtime" 8 | ) 9 | 10 | const ( 11 | controllerFinalizer = "kwok.sigs.run-ai.com/finalizer" 12 | controllerLabel = "kwok.x-k8s.io/controller" 13 | controllerAnnotation = "kwok.x-k8s.io/node" 14 | fakeString = "fake" 15 | ) 16 | 17 | // setupScheme sets up the scheme for the tests 18 | func setupScheme() *runtime.Scheme { 19 | scheme := runtime.NewScheme() 20 | _ = v1beta1.AddToScheme(scheme) 21 | _ = corev1.AddToScheme(scheme) 22 | _ = appsv1.AddToScheme(scheme) 23 | return scheme 24 | } 25 | -------------------------------------------------------------------------------- /internal/controller/jobpool_controller.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "k8s.io/apimachinery/pkg/api/errors" 22 | "time" 23 | 24 | batchv1 "k8s.io/api/batch/v1" 25 | corev1 "k8s.io/api/core/v1" 26 | apierrors "k8s.io/apimachinery/pkg/api/errors" 27 | "k8s.io/apimachinery/pkg/api/meta" 28 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | "k8s.io/apimachinery/pkg/runtime" 30 | ctrl "sigs.k8s.io/controller-runtime" 31 | "sigs.k8s.io/controller-runtime/pkg/client" 32 | "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 33 | "sigs.k8s.io/controller-runtime/pkg/log" 34 | 35 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 36 | ) 37 | 38 | type JobPoolReconciler struct { 39 | client.Client 40 | Scheme *runtime.Scheme 41 | } 42 | 43 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=jobpools,verbs=get;list;watch;create;update;patch;delete 44 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=jobpools/status,verbs=get;update;patch 45 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=jobpools/finalizers,verbs=update 46 | 47 | func (r *JobPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 48 | log := log.FromContext(ctx) 49 | log.Info("Reconciling JobPool") 50 | jobPool := &kwoksigsv1beta1.JobPool{} 51 | err := r.Get(ctx, req.NamespacedName, jobPool) 52 | if err != nil { 53 | if apierrors.IsNotFound(err) { 54 | log.Info("JobPool resource not found. Ignoring since object must be deleted") 55 | return ctrl.Result{}, nil 56 | } 57 | log.Error(err, "unable to fetch JobPool") 58 | return ctrl.Result{}, err 59 | } 60 | if jobPool.Status.Conditions == nil || len(jobPool.Status.Conditions) == 0 { 61 | err := r.statusConditionController(ctx, jobPool, metav1.Condition{ 62 | Type: "Available", 63 | Status: metav1.ConditionUnknown, 64 | Reason: "Reconciling", 65 | Message: "Starting to reconcile the JobPool", 66 | }) 67 | if err != nil { 68 | log.Error(err, "Failed to update JobPool status") 69 | return ctrl.Result{}, err 70 | } 71 | err = r.Get(ctx, req.NamespacedName, jobPool) 72 | if err != nil { 73 | log.Error(err, "Failed to get jobPool") 74 | return ctrl.Result{}, err 75 | } 76 | } 77 | log.Info("JobPool resource found") 78 | // Set the finalizer for the JobPool 79 | if !controllerutil.ContainsFinalizer(jobPool, controllerFinalizer) { 80 | log.Info("Adding Finalizer for the JobPool") 81 | err = r.addFinalizer(ctx, jobPool) 82 | if err != nil { 83 | log.Error(err, "Failed to add finalizer for the JobPool") 84 | return ctrl.Result{}, err 85 | } 86 | } 87 | // Get the jobs for the JobPool 88 | jobs, err := r.getJobs(ctx, jobPool) 89 | if err != nil { 90 | log.Error(err, "Failed to get jobs") 91 | return ctrl.Result{}, err 92 | } 93 | // Check if the JobPool is in the desired state 94 | log.Info("Checking if the JobPool is in the desired state") 95 | log.Info("Comparing current jobs with desired JobCount", "jobs", len(jobs), "jobCount", jobPool.Spec.JobCount) 96 | if int32(len(jobs)) != jobPool.Spec.JobCount { 97 | if int32(len(jobs)) < jobPool.Spec.JobCount { 98 | log.Info("Creating jobs") 99 | err := r.statusConditionController(ctx, jobPool, metav1.Condition{ 100 | Type: "Available", 101 | Status: metav1.ConditionFalse, 102 | Reason: "ScalingUp", 103 | Message: "Scaling up the jobPool", 104 | }) 105 | if err != nil { 106 | log.Error(err, "Failed to update jobPool status") 107 | return ctrl.Result{}, err 108 | } 109 | log.Info("Scaling up the jobPool... 
creating jobs!") 110 | err = r.createJobs(ctx, jobPool, jobs) 111 | if err != nil { 112 | log.Error(err, "Failed to create jobs") 113 | return ctrl.Result{}, err 114 | } 115 | } else { 116 | log.Info("Deleting jobs") 117 | err := r.statusConditionController(ctx, jobPool, metav1.Condition{ 118 | Type: "Available", 119 | Status: metav1.ConditionFalse, 120 | Reason: "ScalingDown", 121 | Message: "Scaling down the jobPool", 122 | }) 123 | if err != nil { 124 | log.Error(err, "Failed to update jobPool status") 125 | return ctrl.Result{}, err 126 | } 127 | log.Info("Scaling down the jobPool... deleting jobs!") 128 | err = r.deleteJobs(ctx, jobPool, jobs) 129 | if err != nil { 130 | log.Error(err, "Failed to delete jobs") 131 | return ctrl.Result{}, err 132 | } 133 | } 134 | } 135 | // Update the status of the jobPool 136 | err = r.statusConditionController(ctx, jobPool, metav1.Condition{ 137 | Type: "Available", 138 | Status: metav1.ConditionTrue, 139 | Reason: "jobPoolReconciled", 140 | Message: "jobPool reconciled successfully", 141 | }) 142 | if err != nil { 143 | log.Error(err, "Failed to update jobPool status") 144 | return ctrl.Result{}, err 145 | } 146 | // Update the observed generation of the jobPool 147 | if jobPool.Status.ObservedGeneration != jobPool.Generation { 148 | log.Info("jobTemplate has changed") 149 | err := r.statusConditionController(ctx, jobPool, metav1.Condition{ 150 | Type: "Available", 151 | Status: metav1.ConditionFalse, 152 | Reason: "Updating", 153 | Message: "Updating the jobPool", 154 | }) 155 | if err != nil { 156 | log.Error(err, "Failed to update jobPool status") 157 | return ctrl.Result{}, err 158 | } 159 | emptyJobPool := &kwoksigsv1beta1.JobPool{ 160 | ObjectMeta: metav1.ObjectMeta{ 161 | Name: jobPool.Name, 162 | }, 163 | } 164 | err = r.deleteJobs(ctx, emptyJobPool, jobs) 165 | if err != nil { 166 | log.Error(err, "Failed to delete jobs") 167 | return ctrl.Result{}, err 168 | } 169 | err = r.createJobs(ctx, jobPool, jobs) 170 | if err != nil { 171 | log.Error(err, "Failed to create jobs") 172 | return ctrl.Result{}, err 173 | } 174 | err = r.statusConditionController(ctx, jobPool, metav1.Condition{ 175 | Type: "Available", 176 | Status: metav1.ConditionTrue, 177 | Reason: "Ready", 178 | Message: "jobPool is ready", 179 | }) 180 | if err != nil { 181 | log.Error(err, "Failed to update jobPool status") 182 | return ctrl.Result{}, err 183 | } 184 | return ctrl.Result{Requeue: true}, nil 185 | } 186 | if !jobPool.DeletionTimestamp.IsZero() { 187 | log.Info("Deleting jobPool") 188 | err = r.statusConditionController(ctx, jobPool, metav1.Condition{ 189 | Type: "Available", 190 | Status: metav1.ConditionFalse, 191 | Reason: "Deleting", 192 | Message: "Deleting the jobPool", 193 | }) 194 | if err != nil { 195 | log.Error(err, "Failed to update jobPool status") 196 | return ctrl.Result{}, nil 197 | } 198 | err := r.deleteJobs(ctx, jobPool, jobs) 199 | if err != nil { 200 | log.Error(err, "Failed to delete jobs") 201 | return ctrl.Result{}, err 202 | } 203 | err = r.deleteFinalizer(ctx, jobPool) 204 | if err != nil { 205 | log.Error(err, "Failed to delete finalizer from jobPool") 206 | return ctrl.Result{}, err 207 | } 208 | return ctrl.Result{}, nil 209 | } 210 | log.Info("Reconciliation finished") 211 | return ctrl.Result{RequeueAfter: time.Duration(60 * time.Second)}, nil 212 | } 213 | 214 | // delete the finalizer from the jobPool 215 | func (r *JobPoolReconciler) deleteFinalizer(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool) error { 216 | 
controllerutil.RemoveFinalizer(jobPool, controllerFinalizer) 217 | return r.Update(ctx, jobPool) 218 | } 219 | 220 | // delete jobs for the JobPool 221 | func (r *JobPoolReconciler) deleteJobs(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool, jobs []batchv1.Job) error { 222 | for i := int32(len(jobs)); i > jobPool.Spec.JobCount; i-- { 223 | // Delete a job 224 | err := r.Delete(ctx, &jobs[i-1]) 225 | if err != nil { 226 | return err 227 | } 228 | } 229 | err := r.updateObservedGeneration(ctx, jobPool) 230 | if err != nil { 231 | return err 232 | } 233 | return nil 234 | } 235 | 236 | // updateObservedGeneration updates the observedGeneration of the JobPool 237 | func (r *JobPoolReconciler) updateObservedGeneration(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool) error { 238 | jobPool.Status.ObservedGeneration = jobPool.Generation 239 | return r.Status().Update(ctx, jobPool) 240 | } 241 | 242 | // create jobs for the JobPool 243 | func (r *JobPoolReconciler) createJobs(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool, jobs []batchv1.Job) error { 244 | jobLabels := jobPool.Spec.JobTemplate.Spec.Template.Labels 245 | if jobLabels == nil { 246 | jobLabels = make(map[string]string) 247 | } 248 | jobLabels[controllerLabel] = jobPool.Name 249 | jobToleration := jobPool.Spec.JobTemplate.Spec.Template.Spec.Tolerations 250 | if jobToleration == nil { 251 | jobToleration = make([]corev1.Toleration, 0) 252 | } 253 | jobToleration = append(jobToleration, corev1.Toleration{ 254 | Key: controllerAnnotation, 255 | Operator: corev1.TolerationOpExists, 256 | Effect: corev1.TaintEffectNoSchedule, 257 | }) 258 | 259 | jobAnnotation := jobPool.Spec.JobTemplate.Spec.Template.Annotations 260 | if jobAnnotation == nil { 261 | jobAnnotation = make(map[string]string) 262 | } 263 | jobAnnotation[controllerAnnotation] = fakeString 264 | for i := int32(len(jobs)); i < jobPool.Spec.JobCount; i++ { 265 | // Create a new job 266 | job := &batchv1.Job{ 267 | ObjectMeta: metav1.ObjectMeta{ 268 | GenerateName: jobPool.Name + "-", 269 | Namespace: jobPool.Namespace, 270 | Labels: jobLabels, 271 | Annotations: jobAnnotation, 272 | OwnerReferences: []metav1.OwnerReference{ 273 | *metav1.NewControllerRef(jobPool, kwoksigsv1beta1.GroupVersion.WithKind("JobPool")), 274 | }, 275 | }, 276 | Spec: jobPool.Spec.JobTemplate.Spec, 277 | } 278 | job.Spec.Template.Spec.Tolerations = jobToleration 279 | job.Spec.Template.ObjectMeta.Labels = jobLabels 280 | err := r.Create(ctx, job) 281 | if err != nil { 282 | return err 283 | } 284 | err = r.updateObservedGeneration(ctx, jobPool) 285 | if err != nil { 286 | return err 287 | } 288 | } 289 | return nil 290 | } 291 | 292 | // get the jobs for the JobPool 293 | func (r *JobPoolReconciler) getJobs(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool) ([]batchv1.Job, error) { 294 | jobs := &batchv1.JobList{} 295 | err := r.List(ctx, jobs, client.InNamespace(jobPool.Namespace), client.MatchingLabels{controllerLabel: jobPool.Name}) 296 | if err != nil && errors.IsNotFound(err) { 297 | return []batchv1.Job{}, nil 298 | } else if err != nil { 299 | return nil, err 300 | } 301 | return jobs.Items, nil 302 | } 303 | 304 | // addFinalizer adds the finalizer to the JobPool 305 | func (r *JobPoolReconciler) addFinalizer(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool) error { 306 | controllerutil.AddFinalizer(jobPool, controllerFinalizer) 307 | return r.Update(ctx, jobPool) 308 | } 309 | 310 | // statusConditionController updates the status of the jobPool 311 | func (r 
*JobPoolReconciler) statusConditionController(ctx context.Context, jobPool *kwoksigsv1beta1.JobPool, condition metav1.Condition) error { 312 | meta.SetStatusCondition(&jobPool.Status.Conditions, condition) 313 | return r.Status().Update(ctx, jobPool) 314 | } 315 | 316 | // SetupWithManager sets up the controller with the Manager. 317 | func (r *JobPoolReconciler) SetupWithManager(mgr ctrl.Manager) error { 318 | return ctrl.NewControllerManagedBy(mgr). 319 | For(&kwoksigsv1beta1.JobPool{}). 320 | Complete(r) 321 | } 322 | -------------------------------------------------------------------------------- /internal/controller/jobpool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | batchv1 "k8s.io/api/batch/v1" 26 | "k8s.io/apimachinery/pkg/api/errors" 27 | "k8s.io/apimachinery/pkg/types" 28 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 29 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 30 | 31 | corev1 "k8s.io/api/core/v1" 32 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 | 34 | "github.com/run-ai/kwok-operator/api/v1beta1" 35 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 36 | ) 37 | 38 | const ( 39 | jobPoolName = "default" 40 | ) 41 | 42 | var _ = Describe("JobPool Controller", func() { 43 | Context("When reconciling a resource", func() { 44 | const resourceName = "test-resource" 45 | 46 | ctx := context.Background() 47 | 48 | typeNamespacedName := types.NamespacedName{ 49 | Name: resourceName, 50 | Namespace: jobPoolName, 51 | } 52 | jobpool := &kwoksigsv1beta1.JobPool{} 53 | 54 | BeforeEach(func() { 55 | By("creating the custom resource for the Kind JobPool") 56 | err := k8sClient.Get(ctx, typeNamespacedName, jobpool) 57 | if err != nil && errors.IsNotFound(err) { 58 | resource := &kwoksigsv1beta1.JobPool{ 59 | 60 | ObjectMeta: metav1.ObjectMeta{ 61 | Name: resourceName, 62 | Namespace: jobPoolName, 63 | Labels: map[string]string{ 64 | controllerLabel: resourceName, 65 | }, 66 | }, 67 | Spec: v1beta1.JobPoolSpec{ 68 | JobCount: 5, 69 | JobTemplate: batchv1.Job{ 70 | Spec: batchv1.JobSpec{ 71 | Template: corev1.PodTemplateSpec{ 72 | Spec: corev1.PodSpec{ 73 | RestartPolicy: corev1.RestartPolicyNever, 74 | Containers: []corev1.Container{ 75 | { 76 | Name: "test-container", 77 | Image: "busybox", 78 | }, 79 | }, 80 | }, 81 | }, 82 | }, 83 | }, 84 | }, 85 | } 86 | 87 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 88 | } 89 | }) 90 | 91 | AfterEach(func() { 92 | resource := &kwoksigsv1beta1.JobPool{} 93 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 94 | Expect(err).NotTo(HaveOccurred()) 95 | 96 | By("Cleanup the specific resource instance JobPool") 97 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 98 | }) 99 | It("should successfully 
reconcile the resource", func() { 100 | By("Reconciling the created resource") 101 | controllerReconciler := &JobPoolReconciler{ 102 | Client: k8sClient, 103 | Scheme: k8sClient.Scheme(), 104 | } 105 | 106 | _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ 107 | NamespacedName: typeNamespacedName, 108 | }) 109 | Expect(err).NotTo(HaveOccurred()) 110 | }) 111 | }) 112 | }) 113 | 114 | func TestJobPoolController(t *testing.T) { 115 | // create a fake client to mock API calls 116 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&v1beta1.JobPool{}).Build() 117 | // create jobpool 118 | jobpool := &kwoksigsv1beta1.JobPool{ 119 | ObjectMeta: metav1.ObjectMeta{ 120 | Name: "test-jobpool", 121 | Namespace: "default", 122 | }, 123 | Spec: v1beta1.JobPoolSpec{ 124 | JobCount: 3, 125 | JobTemplate: batchv1.Job{ 126 | Spec: batchv1.JobSpec{ 127 | Template: corev1.PodTemplateSpec{ 128 | Spec: corev1.PodSpec{ 129 | Containers: []corev1.Container{ 130 | { 131 | Name: "test-container", 132 | Image: "busybox", 133 | }, 134 | }, 135 | }, 136 | }, 137 | }, 138 | }, 139 | }, 140 | } 141 | // Create a Reconcile JobPool object with the scheme and fake client. 142 | r := &JobPoolReconciler{ 143 | Client: fakeClient, 144 | Scheme: setupScheme(), 145 | } 146 | 147 | // create jobpool object for testing 148 | err := fakeClient.Create(context.Background(), jobpool) 149 | if err != nil { 150 | t.Fatalf("Failed to create jobpool: %v", err) 151 | } 152 | 153 | // reconcile an object to get back a result 154 | res, err := r.Reconcile(context.Background(), reconcile.Request{ 155 | NamespacedName: types.NamespacedName{ 156 | Name: "test-jobpool", 157 | Namespace: "default", 158 | }, 159 | }) 160 | if res != (reconcile.Result{}) { 161 | t.Fatalf("Reconcile did not return an empty result") 162 | } 163 | 164 | // reconcile an object to get back a result 165 | res, err = r.Reconcile(context.Background(), reconcile.Request{ 166 | NamespacedName: types.NamespacedName{ 167 | Name: "test-jobpool", 168 | Namespace: "default", 169 | }, 170 | }) 171 | if res != (reconcile.Result{}) { 172 | t.Fatalf("Reconcile did not return an empty result") 173 | } 174 | // get latest jobpool object 175 | err = fakeClient.Get(context.Background(), types.NamespacedName{ 176 | Name: "test-jobpool", 177 | Namespace: "default", 178 | }, jobpool) 179 | if err != nil { 180 | t.Fatalf("Failed to get jobpool: %v", err) 181 | } 182 | // scale jobpool jobCount to 5 and update the jobpool object in the fake client 183 | jobpool.Spec.JobCount = 5 184 | err = fakeClient.Update(context.Background(), jobpool) 185 | if err != nil { 186 | t.Fatalf("Failed to update jobpool: %v", err) 187 | } 188 | // reconcile an object to get back a result 189 | res, err = r.Reconcile(context.Background(), reconcile.Request{ 190 | NamespacedName: types.NamespacedName{ 191 | Name: "test-jobpool", 192 | Namespace: "default", 193 | }, 194 | }) 195 | if res != (reconcile.Result{}) { 196 | t.Fatalf("Reconcile did not return an empty result") 197 | } 198 | // delete the jobpool object in the fake client 199 | err = fakeClient.Delete(context.Background(), jobpool) 200 | if err != nil { 201 | t.Fatalf("Failed to delete jobpool: %v", err) 202 | } 203 | // reconcile an object to get back a result 204 | res, err = r.Reconcile(context.Background(), reconcile.Request{ 205 | NamespacedName: types.NamespacedName{ 206 | Name: "test-jobpool", 207 | Namespace: "default", 208 | }, 209 | }) 210 | if res != (reconcile.Result{}) { 211 | 
t.Fatalf("Reconcile did not return an empty result") 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /internal/controller/nodepool_controller.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "strings" 22 | "time" 23 | 24 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 25 | corev1 "k8s.io/api/core/v1" 26 | apierrors "k8s.io/apimachinery/pkg/api/errors" 27 | "k8s.io/apimachinery/pkg/api/meta" 28 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | "k8s.io/apimachinery/pkg/runtime" 30 | ctrl "sigs.k8s.io/controller-runtime" 31 | "sigs.k8s.io/controller-runtime/pkg/client" 32 | "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 33 | "sigs.k8s.io/controller-runtime/pkg/log" 34 | ) 35 | 36 | // NodePoolReconciler reconciles a NodePool object 37 | type NodePoolReconciler struct { 38 | client.Client 39 | Scheme *runtime.Scheme 40 | } 41 | 42 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=nodepools,verbs=get;list;watch;create;update;patch;delete 43 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=nodepools/status,verbs=get;update;patch 44 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=nodepools/finalizers,verbs=update 45 | 46 | func (r *NodePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 47 | log := log.FromContext(ctx) 48 | log.Info("Reconciling NodePool") 49 | nodePool := &kwoksigsv1beta1.NodePool{} 50 | err := r.Get(ctx, req.NamespacedName, nodePool) 51 | if err != nil { 52 | if apierrors.IsNotFound(err) { 53 | log.Info("nodePool resource not found. Ignoring since object must be deleted") 54 | return ctrl.Result{}, nil 55 | } 56 | // Error reading the object - requeue the request CR. 
57 | log.Error(err, "Failed to get NodePool") 58 | return ctrl.Result{}, err 59 | } 60 | // Set reconciling status condition in the NodePool 61 | if nodePool.Status.Conditions == nil || len(nodePool.Status.Conditions) == 0 { 62 | err := r.statusConditionController(ctx, nodePool, metav1.Condition{ 63 | Type: "Available", 64 | Status: metav1.ConditionUnknown, 65 | Reason: "Reconciling", 66 | Message: "Starting to reconcile the NodePool", 67 | }) 68 | if err != nil { 69 | log.Error(err, "Failed to update NodePool status") 70 | return ctrl.Result{}, err 71 | } 72 | err = r.Get(ctx, req.NamespacedName, nodePool) 73 | if err != nil { 74 | log.Error(err, "Failed to get NodePool") 75 | return ctrl.Result{}, err 76 | } 77 | } 78 | // Add finalizer to the NodePool 79 | if !controllerutil.ContainsFinalizer(nodePool, controllerFinalizer) { 80 | log.Info("Adding Finalizer for the NodePool") 81 | err := r.addFinalizer(ctx, nodePool) 82 | if err != nil { 83 | log.Error(err, "Failed to add finalizer to NodePool") 84 | return ctrl.Result{}, err 85 | } 86 | } 87 | // Get nodes in the cluster with owner reference to the nodePool 88 | nodes, err := r.getNodes(ctx, nodePool) 89 | if err != nil { 90 | log.Error(err, "Failed to get nodes") 91 | return ctrl.Result{}, err 92 | } 93 | // Check if the number of nodes in the cluster is equal to the desired number of nodes 94 | if int32(len(nodes)) != nodePool.Spec.NodeCount { 95 | // Create or delete nodes in the cluster 96 | if int32(len(nodes)) < nodePool.Spec.NodeCount { 97 | err := r.statusConditionController(ctx, nodePool, metav1.Condition{ 98 | Type: "Available", 99 | Status: metav1.ConditionFalse, 100 | Reason: "ScalingUp", 101 | Message: "Scaling up the NodePool", 102 | }) 103 | if err != nil { 104 | log.Error(err, "Failed to update NodePool status") 105 | return ctrl.Result{}, err 106 | } 107 | log.Info("Scaling up the NodePool... creating nodes!") 108 | err = r.createNodes(ctx, nodePool, nodes) 109 | if err != nil { 110 | log.Error(err, "Failed to create nodes") 111 | return ctrl.Result{}, err 112 | } 113 | } else { 114 | err := r.statusConditionController(ctx, nodePool, metav1.Condition{ 115 | Type: "Available", 116 | Status: metav1.ConditionFalse, 117 | Reason: "ScalingDown", 118 | Message: "Scaling down the NodePool", 119 | }) 120 | if err != nil { 121 | log.Error(err, "Failed to update NodePool status") 122 | return ctrl.Result{}, err 123 | } 124 | log.Info("Too many nodes... deleting! 
") 125 | err = r.deleteNodes(ctx, nodePool, nodes) 126 | if err != nil { 127 | log.Error(err, "Failed to delete nodes") 128 | return ctrl.Result{}, err 129 | } 130 | } 131 | err := r.statusConditionController(ctx, nodePool, metav1.Condition{ 132 | Type: "Available", 133 | Status: metav1.ConditionTrue, 134 | Reason: "Ready", 135 | Message: "NodePool is ready", 136 | }) 137 | if err != nil { 138 | log.Error(err, "Failed to update NodePool status") 139 | return ctrl.Result{}, err 140 | } 141 | return ctrl.Result{Requeue: true}, nil 142 | } 143 | 144 | // Check if observed generation is different from the generation 145 | if nodePool.Status.ObservedGeneration != nodePool.Generation { 146 | log.Info("NodeTemplate has changed") 147 | err := r.statusConditionController(ctx, nodePool, metav1.Condition{ 148 | Type: "Available", 149 | Status: metav1.ConditionFalse, 150 | Reason: "Updating", 151 | Message: "Updating the NodePool", 152 | }) 153 | if err != nil { 154 | log.Error(err, "Failed to update NodePool status") 155 | return ctrl.Result{}, err 156 | } 157 | emptyNodePool := &kwoksigsv1beta1.NodePool{ 158 | ObjectMeta: metav1.ObjectMeta{ 159 | Name: nodePool.Name, 160 | }, 161 | } 162 | err = r.deleteNodes(ctx, emptyNodePool, nodes) 163 | if err != nil { 164 | log.Error(err, "Failed to delete nodes") 165 | return ctrl.Result{}, err 166 | } 167 | err = r.createNodes(ctx, nodePool, nodes) 168 | if err != nil { 169 | log.Error(err, "Failed to create nodes") 170 | return ctrl.Result{}, err 171 | } 172 | err = r.statusConditionController(ctx, nodePool, metav1.Condition{ 173 | Type: "Available", 174 | Status: metav1.ConditionTrue, 175 | Reason: "Ready", 176 | Message: "NodePool is ready", 177 | }) 178 | if err != nil { 179 | log.Error(err, "Failed to update NodePool status") 180 | return ctrl.Result{}, err 181 | } 182 | return ctrl.Result{Requeue: true}, nil 183 | } 184 | 185 | if !nodePool.DeletionTimestamp.IsZero() { 186 | // Remove finalizer from the NodePool 187 | log.Info("Deleting the NodePool") 188 | err = r.statusConditionController(ctx, nodePool, metav1.Condition{ 189 | Type: "Available", 190 | Status: metav1.ConditionFalse, 191 | Reason: "Deleting", 192 | Message: "Deleting the NodePool", 193 | }) 194 | if err != nil { 195 | log.Error(err, "Failed to update NodePool status") 196 | return ctrl.Result{}, err 197 | } 198 | err := r.deleteNodes(ctx, nodePool, nodes) 199 | if err != nil { 200 | log.Error(err, "Failed to delete nodes") 201 | return ctrl.Result{}, err 202 | } 203 | err = r.deleteFinalizer(ctx, nodePool) 204 | if err != nil { 205 | log.Error(err, "Failed to delete finalizer from NodePool") 206 | return ctrl.Result{}, err 207 | } 208 | return ctrl.Result{}, nil 209 | } 210 | log.Info("Reconciliation completed successfully") 211 | return ctrl.Result{RequeueAfter: time.Duration(60 * time.Second)}, nil 212 | } 213 | 214 | func (r *NodePoolReconciler) getNodes(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool) ([]corev1.Node, error) { 215 | nodes := &corev1.NodeList{} 216 | err := r.List(ctx, nodes, client.InNamespace(nodePool.Namespace), client.MatchingLabels{controllerLabel: nodePool.Name}) 217 | if err != nil && strings.Contains(err.Error(), "does not exist") { 218 | return []corev1.Node{}, nil 219 | } else if err != nil { 220 | return nil, err 221 | } 222 | return nodes.Items, nil 223 | } 224 | 225 | // Create nodes in the cluster 226 | func (r *NodePoolReconciler) createNodes(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool, nodes []corev1.Node) error { 227 | nodeLabels 
:= nodePool.Spec.NodeTemplate.Labels 228 | if nodeLabels == nil { 229 | nodeLabels = make(map[string]string) 230 | } 231 | nodeLabels[controllerLabel] = nodePool.Name 232 | nodeTaint := nodePool.Spec.NodeTemplate.Spec.Taints 233 | if nodeTaint == nil { 234 | nodeTaint = make([]corev1.Taint, 0) 235 | } 236 | nodeTaint = append(nodeTaint, corev1.Taint{ 237 | Key: controllerAnnotation, 238 | Value: fakeString, 239 | Effect: corev1.TaintEffectNoSchedule, 240 | }) 241 | nodeAnnotation := nodePool.Spec.NodeTemplate.Annotations 242 | if nodeAnnotation == nil { 243 | nodeAnnotation = make(map[string]string) 244 | } 245 | nodeAnnotation[controllerAnnotation] = fakeString 246 | for i := int32(len(nodes)); i < nodePool.Spec.NodeCount; i++ { 247 | // Create a new node 248 | node := &corev1.Node{ 249 | ObjectMeta: metav1.ObjectMeta{ 250 | GenerateName: nodePool.Name + "-", 251 | Labels: nodeLabels, 252 | Annotations: nodeAnnotation, 253 | OwnerReferences: []metav1.OwnerReference{ 254 | *metav1.NewControllerRef(nodePool, kwoksigsv1beta1.GroupVersion.WithKind("NodePool")), 255 | }, 256 | }, 257 | Spec: nodePool.Spec.NodeTemplate.Spec, 258 | Status: nodePool.Spec.NodeTemplate.Status, 259 | } 260 | node.Spec.Taints = nodeTaint 261 | //node.ObjectMeta.Annotations = nodeAnnotation 262 | 263 | err := r.Create(ctx, node) 264 | if err != nil { 265 | return err 266 | } 267 | } 268 | 269 | err := r.updateObservedGeneration(ctx, nodePool) 270 | if err != nil { 271 | return err 272 | } 273 | return nil 274 | } 275 | 276 | // Delete nodes in the cluster 277 | func (r *NodePoolReconciler) deleteNodes(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool, nodes []corev1.Node) error { 278 | for i := int32(len(nodes)); i > nodePool.Spec.NodeCount; i-- { 279 | // Delete a node 280 | err := r.Delete(ctx, &nodes[i-1]) 281 | if err != nil { 282 | return err 283 | } 284 | } 285 | err := r.updateObservedGeneration(ctx, nodePool) 286 | if err != nil { 287 | return err 288 | } 289 | return nil 290 | } 291 | 292 | // Add the finalizer to the NodePool 293 | func (r *NodePoolReconciler) addFinalizer(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool) error { 294 | controllerutil.AddFinalizer(nodePool, controllerFinalizer) 295 | return r.Update(ctx, nodePool) 296 | } 297 | 298 | // Delete the finalizer from the NodePool 299 | func (r *NodePoolReconciler) deleteFinalizer(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool) error { 300 | controllerutil.RemoveFinalizer(nodePool, controllerFinalizer) 301 | return r.Update(ctx, nodePool) 302 | } 303 | 304 | // statusConditionController updates the status of the NodePool 305 | func (r *NodePoolReconciler) statusConditionController(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool, condition metav1.Condition) error { 306 | meta.SetStatusCondition(&nodePool.Status.Conditions, condition) 307 | return r.Status().Update(ctx, nodePool) 308 | } 309 | 310 | // updateObservedGeneration updates the observed generation of the NodePool 311 | func (r *NodePoolReconciler) updateObservedGeneration(ctx context.Context, nodePool *kwoksigsv1beta1.NodePool) error { 312 | nodePool.Status.ObservedGeneration = nodePool.Generation 313 | return r.Status().Update(ctx, nodePool) 314 | } 315 | 316 | // SetupWithManager sets up the controller with the Manager. 317 | func (r *NodePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { 318 | return ctrl.NewControllerManagedBy(mgr). 319 | For(&kwoksigsv1beta1.NodePool{}). 
320 | Complete(r) 321 | } 322 | -------------------------------------------------------------------------------- /internal/controller/nodepool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "log" 22 | "testing" 23 | 24 | . "github.com/onsi/ginkgo/v2" 25 | . "github.com/onsi/gomega" 26 | "k8s.io/apimachinery/pkg/api/errors" 27 | "k8s.io/apimachinery/pkg/types" 28 | 29 | "github.com/run-ai/kwok-operator/api/v1beta1" 30 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 31 | "github.com/stretchr/testify/assert" 32 | corev1 "k8s.io/api/core/v1" 33 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 | 35 | "sigs.k8s.io/controller-runtime/pkg/client" 36 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 37 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 38 | ) 39 | 40 | const ( 41 | kubernetesRoleLabel = "kubernetes.io/role" 42 | ) 43 | 44 | var _ = Describe("NodePool Controller", func() { 45 | Context("When reconciling a resource", func() { 46 | const resourceName = "test-resource" 47 | 48 | ctx := context.Background() 49 | 50 | typeNamespacedName := types.NamespacedName{ 51 | Name: resourceName, 52 | } 53 | nodepool := &kwoksigsv1beta1.NodePool{} 54 | 55 | BeforeEach(func() { 56 | By("creating the custom resource for the Kind NodePool") 57 | err := k8sClient.Get(ctx, typeNamespacedName, nodepool) 58 | if err != nil && errors.IsNotFound(err) { 59 | resource := &kwoksigsv1beta1.NodePool{ 60 | ObjectMeta: metav1.ObjectMeta{ 61 | Name: resourceName, 62 | }, 63 | Spec: kwoksigsv1beta1.NodePoolSpec{ 64 | NodeCount: 2, 65 | }, 66 | } 67 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 68 | } 69 | }) 70 | 71 | AfterEach(func() { 72 | resource := &kwoksigsv1beta1.NodePool{} 73 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 74 | Expect(err).NotTo(HaveOccurred()) 75 | 76 | By("Cleanup the specific resource instance NodePool") 77 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 78 | }) 79 | It("should successfully reconcile the resource", func() { 80 | By("Reconciling the created resource") 81 | controllerReconciler := &NodePoolReconciler{ 82 | Client: k8sClient, 83 | Scheme: k8sClient.Scheme(), 84 | } 85 | 86 | _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ 87 | NamespacedName: typeNamespacedName, 88 | }) 89 | Expect(err).NotTo(HaveOccurred()) 90 | }) 91 | }) 92 | }) 93 | 94 | func TestReconcileNodePool(t *testing.T) { 95 | // Create a fake client 96 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&v1beta1.NodePool{}).Build() 97 | // Create a NodePool object for testing 98 | nodePool := &v1beta1.NodePool{ 99 | ObjectMeta: metav1.ObjectMeta{Name: "single-nodepool"}, 100 | Spec: v1beta1.NodePoolSpec{ 101 | NodeCount: 5, // Set the desired number of nodes 102 | NodeTemplate: corev1.Node{ 103 | Spec: 
corev1.NodeSpec{ 104 | // Set node spec fields as needed for testing 105 | }, 106 | }, 107 | }, 108 | Status: v1beta1.NodePoolStatus{}, 109 | } 110 | 111 | // Create a Reconciler instance 112 | reconciler := &NodePoolReconciler{ 113 | Client: fakeClient, 114 | Scheme: setupScheme(), 115 | } 116 | 117 | // Create a context 118 | ctx := context.Background() 119 | 120 | // Create the NodePool object in the fake client 121 | err := fakeClient.Create(ctx, nodePool) 122 | assert.NoError(t, err, "failed to create NodePool object") 123 | 124 | // Reconcile the NodePool 125 | req := reconcile.Request{NamespacedName: types.NamespacedName{Name: "single-nodepool"}} 126 | _, err = reconciler.Reconcile(ctx, req) 127 | assert.NoError(t, err, "reconciliation failed") 128 | 129 | // Verify that the number of nodes matches the desired count 130 | nodes := &corev1.NodeList{} 131 | err = fakeClient.List(ctx, nodes) 132 | assert.NoError(t, err, "failed to list nodes") 133 | assert.Equal(t, int(nodePool.Spec.NodeCount), len(nodes.Items), "unexpected number of nodes") 134 | 135 | // Scale the NodePool object down to two nodes 136 | 137 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "single-nodepool"}, nodePool) 138 | assert.NoError(t, err, "failed to get NodePool object") 139 | nodePool.Spec.NodeCount = 2 140 | err = fakeClient.Update(ctx, nodePool) 141 | assert.NoError(t, err, "failed to update NodePool object") 142 | req = reconcile.Request{NamespacedName: types.NamespacedName{Name: "single-nodepool"}} 143 | _, err = reconciler.Reconcile(ctx, req) 144 | assert.NoError(t, err, "reconciliation failed") 145 | err = fakeClient.List(ctx, nodes) 146 | assert.NoError(t, err, "failed to list nodes") 147 | assert.Equal(t, int(nodePool.Spec.NodeCount), len(nodes.Items), "expected 2 nodes, got %d", len(nodes.Items)) 148 | 149 | // delete the NodePool object and check if the nodes are deleted 150 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "single-nodepool"}, nodePool) 151 | if err != nil { 152 | log.Println("failed to get NodePool object") 153 | } 154 | err = fakeClient.Delete(ctx, nodePool) 155 | if err != nil { 156 | assert.NoError(t, err, "failed to delete NodePool object") 157 | } 158 | // Reconcile the NodePool 159 | req = reconcile.Request{NamespacedName: types.NamespacedName{Name: "single-nodepool"}} 160 | _, err = reconciler.Reconcile(ctx, req) 161 | assert.NoError(t, err, "reconciliation failed") 162 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "single-nodepool"}, nodePool) 163 | assert.Error(t, err, "single-nodepool not found") 164 | 165 | // validate that the nodes are deleted 166 | //fakeClient.List(ctx, nodes) 167 | //assert.Equal(t, 0, len(nodes.Items), "unexpected number of nodes") 168 | } 169 | 170 | func TestMultipleNodePools(t *testing.T) { 171 | // Create a fake client 172 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&v1beta1.NodePool{}).Build() 173 | 174 | // Initial node count for first NodePool 175 | initialNodeCount1 := int32(2) 176 | 177 | // Create the first NodePool object for testing 178 | nodePool1 := &v1beta1.NodePool{ 179 | ObjectMeta: metav1.ObjectMeta{Name: "test-nodepool-1"}, 180 | Spec: v1beta1.NodePoolSpec{ 181 | NodeCount: initialNodeCount1, 182 | NodeTemplate: corev1.Node{ 183 | Spec: corev1.NodeSpec{ 184 | // Set node spec fields as needed for testing 185 | }, 186 | }, 187 | }, 188 | } 189 | 190 | // Initial node count for second NodePool 191 | initialNodeCount2 := int32(3) 192 | 193 | // Create the second NodePool
object for testing 194 | nodePool2 := &v1beta1.NodePool{ 195 | ObjectMeta: metav1.ObjectMeta{Name: "test-nodepool-2"}, 196 | Spec: v1beta1.NodePoolSpec{ 197 | NodeCount: initialNodeCount2, 198 | NodeTemplate: corev1.Node{ 199 | Spec: corev1.NodeSpec{ 200 | // Set node spec fields as needed for testing 201 | }, 202 | }, 203 | }, 204 | } 205 | 206 | // Create a Reconciler instance 207 | reconciler := &NodePoolReconciler{ 208 | Client: fakeClient, 209 | Scheme: setupScheme(), 210 | } 211 | 212 | // Create a context 213 | ctx := context.Background() 214 | 215 | // Create the first NodePool object in the fake client 216 | err := fakeClient.Create(ctx, nodePool1) 217 | assert.NoError(t, err, "failed to create NodePool 1") 218 | 219 | // Create the second NodePool object in the fake client 220 | err = fakeClient.Create(ctx, nodePool2) 221 | assert.NoError(t, err, "failed to create NodePool 2") 222 | 223 | // Reconcile the first NodePool 224 | req1 := reconcile.Request{NamespacedName: types.NamespacedName{Name: "test-nodepool-1"}} 225 | _, err = reconciler.Reconcile(ctx, req1) 226 | assert.NoError(t, err, "reconciliation for NodePool 1 failed") 227 | 228 | // Reconcile the second NodePool 229 | req2 := reconcile.Request{NamespacedName: types.NamespacedName{Name: "test-nodepool-2"}} 230 | _, err = reconciler.Reconcile(ctx, req2) 231 | assert.NoError(t, err, "reconciliation for NodePool 2 failed") 232 | 233 | // Verify the desired node count for each NodePool 234 | assertNodeCount(t, fakeClient, "test-nodepool-1", initialNodeCount1) 235 | assertNodeCount(t, fakeClient, "test-nodepool-2", initialNodeCount2) 236 | } 237 | 238 | func assertNodeCount(t *testing.T, c client.Client, nodeName string, expectedNodeCount int32) { 239 | nodes := &corev1.NodeList{} 240 | err := c.List(context.Background(), nodes) 241 | assert.NoError(t, err, "failed to list nodes") 242 | count := 0 243 | for _, node := range nodes.Items { 244 | if node.Labels[controllerLabel] == nodeName { 245 | count++ 246 | } 247 | } 248 | assert.Equal(t, int(expectedNodeCount), count, "unexpected node count for NodePool: "+nodeName) 249 | } 250 | 251 | // Test that changing nodeTemplate in NodePool spec updates the nodes 252 | func TestNodeTemplateChange(t *testing.T) { 253 | // Create a fake client 254 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&v1beta1.NodePool{}).Build() 255 | 256 | // Initial node count for NodePool 257 | initialNodeCount := int32(2) 258 | 259 | // Create the NodePool object for testing 260 | nodePool := &v1beta1.NodePool{ 261 | ObjectMeta: metav1.ObjectMeta{Name: "test-nodepool"}, 262 | Spec: v1beta1.NodePoolSpec{ 263 | NodeCount: initialNodeCount, 264 | NodeTemplate: corev1.Node{ 265 | ObjectMeta: metav1.ObjectMeta{ 266 | Labels: map[string]string{ 267 | kubernetesRoleLabel: "test-nodepool", 268 | }, 269 | }, 270 | }, 271 | }, 272 | } 273 | 274 | // Create a Reconciler instance 275 | reconciler := &NodePoolReconciler{ 276 | Client: fakeClient, 277 | Scheme: setupScheme(), 278 | } 279 | 280 | // Create a context 281 | ctx := context.Background() 282 | 283 | // Create the NodePool object in the fake client 284 | err := fakeClient.Create(ctx, nodePool) 285 | assert.NoError(t, err, "failed to create NodePool object") 286 | 287 | // Reconcile the NodePool 288 | req := reconcile.Request{NamespacedName: types.NamespacedName{Name: "test-nodepool"}} 289 | _, err = reconciler.Reconcile(ctx, req) 290 | assert.NoError(t, err, "reconciliation failed") 291 | 292 | // Verify that the 
number of nodes matches the desired count 293 | nodes := &corev1.NodeList{} 294 | err = fakeClient.List(ctx, nodes) 295 | assert.NoError(t, err, "failed to list nodes") 296 | assert.Equal(t, int(nodePool.Spec.NodeCount), len(nodes.Items), "unexpected number of nodes") 297 | 298 | // Verify that the nodes have the correct labels 299 | for _, node := range nodes.Items { 300 | if node.Labels[controllerLabel] == nodePool.Name { 301 | assert.Equal(t, nodePool.Spec.NodeTemplate.Labels[kubernetesRoleLabel], "test-nodepool", "unexpected node labels") 302 | } 303 | } 304 | 305 | // update the NodePool object 306 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "test-nodepool"}, nodePool) 307 | assert.NoError(t, err, "failed to get NodePool object") 308 | 309 | // Update the nodeTemplate in the NodePool spec 310 | newNodeTemplate := corev1.Node{ 311 | ObjectMeta: metav1.ObjectMeta{ 312 | Labels: map[string]string{ 313 | kubernetesRoleLabel: "test-nodepool2", 314 | }, 315 | }, 316 | } 317 | 318 | nodePool.Spec.NodeTemplate = newNodeTemplate 319 | err = fakeClient.Update(ctx, nodePool) 320 | assert.NoError(t, err, "failed to update NodePool object") 321 | 322 | // Reconcile the NodePool 323 | _, err = reconciler.Reconcile(ctx, req) 324 | assert.NoError(t, err, "reconciliation failed") 325 | 326 | // Verify that the nodes have been updated 327 | nodes = &corev1.NodeList{} 328 | err = fakeClient.List(ctx, nodes) 329 | assert.NoError(t, err, "failed to list nodes") 330 | 331 | // update the NodePool object 332 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "test-nodepool"}, nodePool) 333 | assert.NoError(t, err, "failed to get NodePool object") 334 | 335 | for _, node := range nodes.Items { 336 | if node.Labels[controllerLabel] == nodePool.Name { 337 | assert.Equal(t, nodePool.Spec.NodeTemplate.Labels[kubernetesRoleLabel], "test-nodepool2", "unexpected node labels") 338 | } 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /internal/controller/podpool_controller.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "strings" 22 | "time" 23 | 24 | corev1 "k8s.io/api/core/v1" 25 | apierrors "k8s.io/apimachinery/pkg/api/errors" 26 | "k8s.io/apimachinery/pkg/api/meta" 27 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 | "k8s.io/apimachinery/pkg/runtime" 29 | ctrl "sigs.k8s.io/controller-runtime" 30 | "sigs.k8s.io/controller-runtime/pkg/client" 31 | "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 32 | "sigs.k8s.io/controller-runtime/pkg/log" 33 | 34 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 35 | ) 36 | 37 | // PodPoolReconciler reconciles a PodPool object 38 | type PodPoolReconciler struct { 39 | client.Client 40 | Scheme *runtime.Scheme 41 | } 42 | 43 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=podpools,verbs=get;list;watch;create;update;patch;delete 44 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=podpools/status,verbs=get;update;patch 45 | //+kubebuilder:rbac:groups=kwok.sigs.run-ai.com,resources=podpools/finalizers,verbs=update 46 | 47 | func (r *PodPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 48 | log := log.FromContext(ctx) 49 | log.Info("Reconciling PodPool") 50 | podPool := &kwoksigsv1beta1.PodPool{} 51 | err := r.Get(ctx, req.NamespacedName, podPool) 52 | if err != nil { 53 | if apierrors.IsNotFound(err) { 54 | // If the custom resource is not found, it usually means that it was deleted or not created. 55 | // In this way, we will stop the reconciliation 56 | log.Info("podPool resource not found. Ignoring since object must be deleted") 57 | return ctrl.Result{}, nil 58 | } 59 | // Error reading the object - requeue the request CR. 60 | log.Error(err, "Failed to get podPool") 61 | return ctrl.Result{}, err 62 | } 63 | // Set reconciling status condition in the podPool status 64 | if podPool.Status.Conditions == nil || len(podPool.Status.Conditions) == 0 { 65 | err := r.statusConditionController(ctx, podPool, metav1.Condition{ 66 | Type: "Available", 67 | Status: metav1.ConditionUnknown, 68 | Reason: "Reconciling", 69 | Message: "Starting to reconcile the PodPool", 70 | }) 71 | if err != nil { 72 | log.Error(err, "Failed to update PodPool status") 73 | return ctrl.Result{}, err 74 | } 75 | err = r.Get(ctx, req.NamespacedName, podPool) 76 | if err != nil { 77 | log.Error(err, "Failed to get PodPool") 78 | return ctrl.Result{}, err 79 | } 80 | } 81 | // Set the finalizer for the podPool 82 | if !controllerutil.ContainsFinalizer(podPool, controllerFinalizer) { 83 | log.Info("Adding Finalizer for the PodPool") 84 | err := r.addFinalizer(ctx, podPool) 85 | if err != nil { 86 | log.Error(err, "Failed to add finalizer to PodPool") 87 | return ctrl.Result{}, err 88 | } 89 | } 90 | // Get pods in the cluster with owner reference to the podPool 91 | pods, err := r.getPods(ctx, podPool) 92 | if err != nil { 93 | log.Error(err, "Failed to get pods") 94 | return ctrl.Result{}, err 95 | } 96 | // Check if the number of pods in the cluster is equal to the desired number of pods in the podPool 97 | if int32(len(pods)) != podPool.Spec.PodCount { 98 | if int32(len(pods)) < podPool.Spec.PodCount { 99 | log.Info("Creating pods") 100 | err := r.statusConditionController(ctx, podPool, metav1.Condition{ 101 | Type: "Available", 102 | Status: metav1.ConditionFalse, 103 | Reason: "ScalingUp", 104 | Message: "Scaling up the podPool", 105 | }) 106 | if err != nil { 107 | log.Error(err, "Failed to update podPool status") 108 | return
ctrl.Result{}, err 109 | } 110 | log.Info("Scaling up the podPool... creating pods!") 111 | err = r.createPods(ctx, podPool, pods) 112 | if err != nil { 113 | log.Error(err, "Failed to create pods") 114 | return ctrl.Result{}, err 115 | } 116 | } else { 117 | log.Info("Deleting pods") 118 | err := r.statusConditionController(ctx, podPool, metav1.Condition{ 119 | Type: "Available", 120 | Status: metav1.ConditionFalse, 121 | Reason: "ScalingDown", 122 | Message: "Scaling down the podPool", 123 | }) 124 | if err != nil { 125 | log.Error(err, "Failed to update podPool status") 126 | return ctrl.Result{}, err 127 | } 128 | log.Info("Scaling down the podPool... deleting pods!") 129 | err = r.deletePods(ctx, podPool, pods) 130 | if err != nil { 131 | log.Error(err, "Failed to delete pods") 132 | return ctrl.Result{}, err 133 | } 134 | } 135 | } 136 | // Update the status of the podPool 137 | err = r.statusConditionController(ctx, podPool, metav1.Condition{ 138 | Type: "Available", 139 | Status: metav1.ConditionTrue, 140 | Reason: "PodPoolReconciled", 141 | Message: "PodPool reconciled successfully", 142 | }) 143 | if err != nil { 144 | log.Error(err, "Failed to update podPool status") 145 | return ctrl.Result{}, err 146 | } 147 | // Update the observed generation of the podPool 148 | if podPool.Status.ObservedGeneration != podPool.Generation { 149 | log.Info("podTemplate has changed") 150 | err := r.statusConditionController(ctx, podPool, metav1.Condition{ 151 | Type: "Available", 152 | Status: metav1.ConditionFalse, 153 | Reason: "Updating", 154 | Message: "Updating the podPool", 155 | }) 156 | if err != nil { 157 | log.Error(err, "Failed to update podPool status") 158 | return ctrl.Result{}, err 159 | } 160 | emptyPodPool := &kwoksigsv1beta1.PodPool{ 161 | ObjectMeta: metav1.ObjectMeta{ 162 | Name: podPool.Name, 163 | }, 164 | } 165 | err = r.deletePods(ctx, emptyPodPool, pods) 166 | if err != nil { 167 | log.Error(err, "Failed to delete pods") 168 | return ctrl.Result{}, err 169 | } 170 | err = r.createPods(ctx, podPool, pods) 171 | if err != nil { 172 | log.Error(err, "Failed to create pods") 173 | return ctrl.Result{}, err 174 | } 175 | err = r.statusConditionController(ctx, podPool, metav1.Condition{ 176 | Type: "Available", 177 | Status: metav1.ConditionTrue, 178 | Reason: "Ready", 179 | Message: "podPool is ready", 180 | }) 181 | if err != nil { 182 | log.Error(err, "Failed to update PodPool status") 183 | return ctrl.Result{}, err 184 | } 185 | return ctrl.Result{Requeue: true}, nil 186 | } 187 | 188 | if !podPool.DeletionTimestamp.IsZero() { 189 | log.Info("Deleting PodPool") 190 | err = r.statusConditionController(ctx, podPool, metav1.Condition{ 191 | Type: "Available", 192 | Status: metav1.ConditionFalse, 193 | Reason: "Deleting", 194 | Message: "Deleting the podPool", 195 | }) 196 | if err != nil { 197 | log.Error(err, "Failed to update podPool status") 198 | return ctrl.Result{}, err 199 | } 200 | err := r.deletePods(ctx, podPool, pods) 201 | if err != nil { 202 | log.Error(err, "Failed to delete pods") 203 | return ctrl.Result{}, err 204 | } 205 | err = r.deleteFinalizer(ctx, podPool) 206 | if err != nil { 207 | log.Error(err, "Failed to delete finalizer from PodPool") 208 | return ctrl.Result{}, err 209 | } 210 | return ctrl.Result{}, nil 211 | } 212 | log.Info("Reconciliation finished") 213 | return ctrl.Result{RequeueAfter: time.Duration(60 * time.Second)}, nil 214 | } 215 | 216 | // SetupWithManager sets up the controller with the Manager.
217 | func (r *PodPoolReconciler) SetupWithManager(mgr ctrl.Manager) error { 218 | return ctrl.NewControllerManagedBy(mgr). 219 | For(&kwoksigsv1beta1.PodPool{}). 220 | Complete(r) 221 | } 222 | 223 | // deleteFinalizer deletes the finalizer from the podPool 224 | func (r *PodPoolReconciler) deleteFinalizer(ctx context.Context, podPool *kwoksigsv1beta1.PodPool) error { 225 | controllerutil.RemoveFinalizer(podPool, controllerFinalizer) 226 | return r.Update(ctx, podPool) 227 | } 228 | 229 | // statusConditionController updates the status of the podPool 230 | func (r *PodPoolReconciler) statusConditionController(ctx context.Context, podPool *kwoksigsv1beta1.PodPool, condition metav1.Condition) error { 231 | meta.SetStatusCondition(&podPool.Status.Conditions, condition) 232 | return r.Status().Update(ctx, podPool) 233 | } 234 | 235 | // addFinalizer adds the finalizer to the podPool 236 | func (r *PodPoolReconciler) addFinalizer(ctx context.Context, podPool *kwoksigsv1beta1.PodPool) error { 237 | controllerutil.AddFinalizer(podPool, controllerFinalizer) 238 | return r.Update(ctx, podPool) 239 | } 240 | 241 | // get pods in the cluster with owner reference to the podPool 242 | func (r *PodPoolReconciler) getPods(ctx context.Context, podPool *kwoksigsv1beta1.PodPool) ([]corev1.Pod, error) { 243 | pods := &corev1.PodList{} 244 | err := r.List(ctx, pods, client.InNamespace(podPool.Namespace), client.MatchingLabels{controllerLabel: podPool.Name}) 245 | if err != nil && strings.Contains(err.Error(), "does not exist") { 246 | return []corev1.Pod{}, nil 247 | } else if err != nil { 248 | return nil, err 249 | } 250 | return pods.Items, nil 251 | } 252 | 253 | // deletePods deletes the pods in the cluster 254 | func (r *PodPoolReconciler) deletePods(ctx context.Context, podPool *kwoksigsv1beta1.PodPool, pods []corev1.Pod) error { 255 | for i := int32(len(pods)); i > podPool.Spec.PodCount; i-- { 256 | // Delete a pod 257 | err := r.Delete(ctx, &pods[i-1]) 258 | if err != nil { 259 | return err 260 | } 261 | } 262 | err := r.updateObservedGeneration(ctx, podPool) 263 | if err != nil { 264 | return err 265 | } 266 | return nil 267 | } 268 | 269 | // updateObservedGeneration updates the observed generation of the podPool 270 | func (r *PodPoolReconciler) updateObservedGeneration(ctx context.Context, podPool *kwoksigsv1beta1.PodPool) error { 271 | podPool.Status.ObservedGeneration = podPool.Generation 272 | return r.Status().Update(ctx, podPool) 273 | } 274 | 275 | func (r *PodPoolReconciler) createPods(ctx context.Context, podPool *kwoksigsv1beta1.PodPool, pods []corev1.Pod) error { 276 | podLabels := podPool.Spec.PodTemplate.Labels 277 | if podLabels == nil { 278 | podLabels = make(map[string]string) 279 | } 280 | podLabels[controllerLabel] = podPool.Name 281 | podToleration := podPool.Spec.PodTemplate.Spec.Tolerations 282 | if podToleration == nil { 283 | podToleration = make([]corev1.Toleration, 0) 284 | } 285 | podToleration = append(podToleration, corev1.Toleration{ 286 | Key: controllerAnnotation, 287 | Operator: corev1.TolerationOpExists, 288 | Effect: corev1.TaintEffectNoSchedule, 289 | }) 290 | 291 | podAnnotation := podPool.Spec.PodTemplate.Annotations 292 | if podAnnotation == nil { 293 | podAnnotation = make(map[string]string) 294 | } 295 | podAnnotation[controllerAnnotation] = fakeString 296 | for i := int32(len(pods)); i < podPool.Spec.PodCount; i++ { 297 | // Create a new pod 298 | pod := &corev1.Pod{ 299 | ObjectMeta: metav1.ObjectMeta{ 300 | GenerateName: podPool.Name + "-", 301 | 
Namespace: podPool.Namespace, 302 | Labels: podLabels, 303 | Annotations: podAnnotation, 304 | OwnerReferences: []metav1.OwnerReference{ 305 | *metav1.NewControllerRef(podPool, kwoksigsv1beta1.GroupVersion.WithKind("PodPool")), 306 | }, 307 | }, 308 | Spec: podPool.Spec.PodTemplate.Spec, 309 | } 310 | pod.Spec.Tolerations = podToleration 311 | 312 | err := r.Create(ctx, pod) 313 | if err != nil { 314 | return err 315 | } 316 | } 317 | 318 | err := r.updateObservedGeneration(ctx, podPool) 319 | if err != nil { 320 | return err 321 | } 322 | return nil 323 | } 324 | -------------------------------------------------------------------------------- /internal/controller/podpool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | "k8s.io/apimachinery/pkg/api/errors" 26 | "k8s.io/apimachinery/pkg/types" 27 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 28 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 29 | 30 | corev1 "k8s.io/api/core/v1" 31 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 | 33 | "github.com/run-ai/kwok-operator/api/v1beta1" 34 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 35 | ) 36 | 37 | var _ = Describe("PodPool Controller", func() { 38 | Context("When reconciling a resource", func() { 39 | const resourceName = "test-resource" 40 | 41 | ctx := context.Background() 42 | 43 | typeNamespacedName := types.NamespacedName{ 44 | Name: resourceName, 45 | Namespace: "default", 46 | } 47 | podpool := &kwoksigsv1beta1.PodPool{} 48 | 49 | BeforeEach(func() { 50 | By("creating the custom resource for the Kind PodPool") 51 | err := k8sClient.Get(ctx, typeNamespacedName, podpool) 52 | if err != nil && errors.IsNotFound(err) { 53 | resource := &kwoksigsv1beta1.PodPool{ 54 | ObjectMeta: metav1.ObjectMeta{ 55 | Name: resourceName, 56 | Namespace: "default", 57 | }, 58 | Spec: v1beta1.PodPoolSpec{ 59 | PodCount: 1, 60 | PodTemplate: corev1.Pod{ 61 | ObjectMeta: metav1.ObjectMeta{ 62 | Name: "test-pod", 63 | }, 64 | Spec: corev1.PodSpec{ 65 | Containers: []corev1.Container{ 66 | { 67 | Name: "test-container", 68 | Image: "nginx", 69 | }, 70 | }, 71 | }, 72 | }, 73 | }, 74 | } 75 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 76 | } 77 | }) 78 | 79 | AfterEach(func() { 80 | resource := &kwoksigsv1beta1.PodPool{} 81 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 82 | Expect(err).NotTo(HaveOccurred()) 83 | 84 | By("Cleanup the specific resource instance PodPool") 85 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 86 | }) 87 | It("should successfully reconcile the resource", func() { 88 | By("Reconciling the created resource") 89 | controllerReconciler := &PodPoolReconciler{ 90 | Client: k8sClient, 91 | Scheme: k8sClient.Scheme(), 92 | } 93 | 94 | _, err 
:= controllerReconciler.Reconcile(ctx, reconcile.Request{ 95 | NamespacedName: typeNamespacedName, 96 | }) 97 | Expect(err).NotTo(HaveOccurred()) 98 | }) 99 | }) 100 | }) 101 | 102 | // test for the controller of PodPool resource provision and deletion with fake client 103 | func TestReconcilepodPool(t *testing.T) { 104 | // Create a fake client 105 | fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithStatusSubresource(&v1beta1.PodPool{}).Build() 106 | // Create a podPool object for testing 107 | podPool := &v1beta1.PodPool{ 108 | ObjectMeta: metav1.ObjectMeta{ 109 | Name: "test-podpool", 110 | Namespace: "default", 111 | }, 112 | Spec: v1beta1.PodPoolSpec{ 113 | PodCount: 1, 114 | PodTemplate: corev1.Pod{ 115 | ObjectMeta: metav1.ObjectMeta{ 116 | Name: "test-pod", 117 | }, 118 | Spec: corev1.PodSpec{ 119 | Containers: []corev1.Container{ 120 | { 121 | Name: "test-container", 122 | Image: "nginx", 123 | }, 124 | }, 125 | }, 126 | }, 127 | }, 128 | } 129 | // Create a ReconcilepodPool object with the scheme and fake client. 130 | r := &PodPoolReconciler{ 131 | Client: fakeClient, 132 | Scheme: setupScheme(), 133 | } 134 | // Create the PodPool object in the fake client. 135 | err := fakeClient.Create(ctx, podPool) 136 | if err != nil { 137 | t.Fatalf("create PodPool: (%v)", err) 138 | } 139 | // Reconcile an object to get back a result. 140 | res, err := r.Reconcile(ctx, reconcile.Request{ 141 | NamespacedName: typeNamespacedName, 142 | }) 143 | if res != (reconcile.Result{}) { 144 | t.Fatalf("reconcile did not return an empty result") 145 | } 146 | // Check to make sure the reconcile was successful and that it should requeue the request. 147 | if err != nil { 148 | t.Fatalf("reconcile: (%v)", err) 149 | } 150 | //update podPool podCount to 5 and update the podPool object in the fake client 151 | podPool.Spec.PodCount = 5 152 | err = fakeClient.Update(ctx, podPool) 153 | if err != nil { 154 | t.Fatalf("update PodPool: (%v)", err) 155 | } 156 | // Reconcile an object to get back a result. 157 | res, err = r.Reconcile(ctx, reconcile.Request{ 158 | NamespacedName: typeNamespacedName, 159 | }) 160 | if res != (reconcile.Result{}) { 161 | t.Fatalf("reconcile did not return an empty result") 162 | } 163 | // Check to make sure the reconcile was successful and that it should requeue the request. 164 | if err != nil { 165 | t.Fatalf("reconcile: (%v)", err) 166 | } 167 | //delete the podPool object in the fake client 168 | err = fakeClient.Delete(ctx, podPool) 169 | if err != nil { 170 | t.Fatalf("delete PodPool: (%v)", err) 171 | } 172 | // Reconcile an object to get back a result. 173 | res, err = r.Reconcile(ctx, reconcile.Request{ 174 | NamespacedName: typeNamespacedName, 175 | }) 176 | if res != (reconcile.Result{}) { 177 | t.Fatalf("reconcile did not return an empty result") 178 | } 179 | // Check to make sure the reconcile was successful and that it should requeue the request. 180 | if err != nil { 181 | t.Fatalf("reconcile: (%v)", err) 182 | } 183 | 184 | } 185 | -------------------------------------------------------------------------------- /internal/controller/statefulsetpool_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | 22 | . "github.com/onsi/ginkgo/v2" 23 | . "github.com/onsi/gomega" 24 | "k8s.io/apimachinery/pkg/api/errors" 25 | "k8s.io/apimachinery/pkg/types" 26 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 27 | 28 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | 30 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 31 | ) 32 | 33 | var _ = Describe("StatefulsetPool Controller", func() { 34 | Context("When reconciling a resource", func() { 35 | const resourceName = "test-resource" 36 | 37 | ctx := context.Background() 38 | 39 | typeNamespacedName := types.NamespacedName{ 40 | Name: resourceName, 41 | Namespace: "default", 42 | } 43 | statefulsetpool := &kwoksigsv1beta1.StatefulsetPool{} 44 | 45 | BeforeEach(func() { 46 | By("creating the custom resource for the Kind StatefulsetPool") 47 | err := k8sClient.Get(ctx, typeNamespacedName, statefulsetpool) 48 | if err != nil && errors.IsNotFound(err) { 49 | resource := &kwoksigsv1beta1.StatefulsetPool{ 50 | ObjectMeta: metav1.ObjectMeta{ 51 | Name: resourceName, 52 | Namespace: "default", 53 | }, 54 | } 55 | Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 56 | } 57 | }) 58 | 59 | AfterEach(func() { 60 | resource := &kwoksigsv1beta1.StatefulsetPool{} 61 | err := k8sClient.Get(ctx, typeNamespacedName, resource) 62 | Expect(err).NotTo(HaveOccurred()) 63 | 64 | By("Cleanup the specific resource instance StatefulsetPool") 65 | Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) 66 | }) 67 | It("should successfully reconcile the resource", func() { 68 | By("Reconciling the created resource") 69 | controllerReconciler := &StatefulsetPoolReconciler{ 70 | Client: k8sClient, 71 | Scheme: k8sClient.Scheme(), 72 | } 73 | 74 | _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ 75 | NamespacedName: typeNamespacedName, 76 | }) 77 | Expect(err).NotTo(HaveOccurred()) 78 | }) 79 | }) 80 | }) 81 | -------------------------------------------------------------------------------- /internal/controller/suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "fmt" 21 | "path/filepath" 22 | "runtime" 23 | "testing" 24 | 25 | . "github.com/onsi/ginkgo/v2" 26 | . 
"github.com/onsi/gomega" 27 | 28 | "k8s.io/client-go/kubernetes/scheme" 29 | "k8s.io/client-go/rest" 30 | "sigs.k8s.io/controller-runtime/pkg/client" 31 | "sigs.k8s.io/controller-runtime/pkg/envtest" 32 | logf "sigs.k8s.io/controller-runtime/pkg/log" 33 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 34 | 35 | kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1" 36 | //+kubebuilder:scaffold:imports 37 | ) 38 | 39 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 40 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 41 | 42 | var cfg *rest.Config 43 | var k8sClient client.Client 44 | var testEnv *envtest.Environment 45 | 46 | func TestControllers(t *testing.T) { 47 | RegisterFailHandler(Fail) 48 | 49 | RunSpecs(t, "Controller Suite") 50 | } 51 | 52 | var _ = BeforeSuite(func() { 53 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 54 | 55 | By("bootstrapping test environment") 56 | testEnv = &envtest.Environment{ 57 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, 58 | ErrorIfCRDPathMissing: true, 59 | 60 | // The BinaryAssetsDirectory is only required if you want to run the tests directly 61 | // without call the makefile target test. If not informed it will look for the 62 | // default path defined in controller-runtime which is /usr/local/kubebuilder/. 63 | // Note that you must have the required binaries setup under the bin directory to perform 64 | // the tests directly. When we run make test it will be setup and used automatically. 65 | BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", 66 | fmt.Sprintf("1.28.3-%s-%s", runtime.GOOS, runtime.GOARCH)), 67 | } 68 | 69 | var err error 70 | // cfg is defined in this file globally. 71 | cfg, err = testEnv.Start() 72 | Expect(err).NotTo(HaveOccurred()) 73 | Expect(cfg).NotTo(BeNil()) 74 | 75 | err = kwoksigsv1beta1.AddToScheme(scheme.Scheme) 76 | Expect(err).NotTo(HaveOccurred()) 77 | 78 | //+kubebuilder:scaffold:scheme 79 | 80 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) 81 | Expect(err).NotTo(HaveOccurred()) 82 | Expect(k8sClient).NotTo(BeNil()) 83 | 84 | }) 85 | 86 | var _ = AfterSuite(func() { 87 | By("tearing down the test environment") 88 | err := testEnv.Stop() 89 | Expect(err).NotTo(HaveOccurred()) 90 | }) 91 | -------------------------------------------------------------------------------- /test/e2e/e2e_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "fmt" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | ) 26 | 27 | // Run e2e tests using the Ginkgo runner. 
28 | func TestE2E(t *testing.T) { 29 | RegisterFailHandler(Fail) 30 | fmt.Fprintf(GinkgoWriter, "Starting kwok-operator suite\n") 31 | RunSpecs(t, "e2e suite") 32 | } 33 | -------------------------------------------------------------------------------- /test/e2e/e2e_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "fmt" 21 | "os/exec" 22 | "time" 23 | 24 | . "github.com/onsi/ginkgo/v2" 25 | . "github.com/onsi/gomega" 26 | 27 | "github.com/run-ai/kwok-operator/test/utils" 28 | ) 29 | 30 | const namespace = "kwok-operator" 31 | 32 | var _ = Describe("controller", Ordered, func() { 33 | BeforeAll(func() { 34 | By("installing prometheus operator") 35 | Expect(utils.InstallPrometheusOperator()).To(Succeed()) 36 | 37 | By("installing the cert-manager") 38 | Expect(utils.InstallCertManager()).To(Succeed()) 39 | 40 | By("creating manager namespace") 41 | cmd := exec.Command("kubectl", "create", "ns", namespace) 42 | _, _ = utils.Run(cmd) 43 | }) 44 | 45 | AfterAll(func() { 46 | By("uninstalling the Prometheus manager bundle") 47 | utils.UninstallPrometheusOperator() 48 | 49 | By("uninstalling the cert-manager bundle") 50 | utils.UninstallCertManager() 51 | 52 | By("removing manager namespace") 53 | cmd := exec.Command("kubectl", "delete", "ns", namespace) 54 | _, _ = utils.Run(cmd) 55 | }) 56 | 57 | Context("Operator", func() { 58 | It("should run successfully", func() { 59 | var controllerPodName string 60 | var err error 61 | 62 | // projectimage stores the name of the image used in the example 63 | var projectimage = "example.com/kwok-operator:v0.0.2" 64 | 65 | By("building the manager(Operator) image") 66 | cmd := exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectimage)) 67 | _, err = utils.Run(cmd) 68 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 69 | 70 | By("loading the manager(Operator) image on Kind") 71 | err = utils.LoadImageToKindClusterWithName(projectimage) 72 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 73 | 74 | By("installing CRDs") 75 | cmd = exec.Command("make", "install") 76 | _, err = utils.Run(cmd) 77 | 78 | By("deploying the kwok-operator") 79 | cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectimage)) 80 | _, err = utils.Run(cmd) 81 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 82 | 83 | By("validating that the kwok-operator pod is running as expected") 84 | verifyControllerUp := func() error { 85 | // Get pod name 86 | 87 | cmd = exec.Command("kubectl", "get", 88 | "pods", "-l", "control-plane=kwok-operator", 89 | "-o", "go-template={{ range .items }}"+ 90 | "{{ if not .metadata.deletionTimestamp }}"+ 91 | "{{ .metadata.name }}"+ 92 | "{{ \"\\n\" }}{{ end }}{{ end }}", 93 | "-n", namespace, 94 | ) 95 | 96 | podOutput, err := utils.Run(cmd) 97 | ExpectWithOffset(2, err).NotTo(HaveOccurred()) 98 | podNames := utils.GetNonEmptyLines(string(podOutput))
99 | if len(podNames) != 1 { 100 | return fmt.Errorf("expect 1 controller pods running, but got %d", len(podNames)) 101 | } 102 | controllerPodName = podNames[0] 103 | ExpectWithOffset(2, controllerPodName).Should(ContainSubstring("kwok-operator")) 104 | 105 | // Validate pod status 106 | cmd = exec.Command("kubectl", "get", 107 | "pods", controllerPodName, "-o", "jsonpath={.status.phase}", 108 | "-n", namespace, 109 | ) 110 | status, err := utils.Run(cmd) 111 | ExpectWithOffset(2, err).NotTo(HaveOccurred()) 112 | if string(status) != "Running" { 113 | return fmt.Errorf("controller pod in %s status", status) 114 | } 115 | return nil 116 | } 117 | EventuallyWithOffset(1, verifyControllerUp, time.Minute, time.Second).Should(Succeed()) 118 | 119 | }) 120 | }) 121 | }) 122 | -------------------------------------------------------------------------------- /test/utils/utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package utils 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "os/exec" 23 | "strings" 24 | 25 | . "github.com/onsi/ginkgo/v2" //nolint:golint,revive 26 | ) 27 | 28 | const ( 29 | prometheusOperatorVersion = "v0.68.0" 30 | prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + 31 | "releases/download/%s/bundle.yaml" 32 | 33 | certmanagerVersion = "v1.5.3" 34 | certmanagerURLTmpl = "https://github.com/jetstack/cert-manager/releases/download/%s/cert-manager.yaml" 35 | ) 36 | 37 | func warnError(err error) { 38 | fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) 39 | } 40 | 41 | // InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. 
42 | func InstallPrometheusOperator() error { 43 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 44 | cmd := exec.Command("kubectl", "create", "-f", url) 45 | _, err := Run(cmd) 46 | return err 47 | } 48 | 49 | // Run executes the provided command within this context 50 | func Run(cmd *exec.Cmd) ([]byte, error) { 51 | dir, _ := GetProjectDir() 52 | cmd.Dir = dir 53 | 54 | if err := os.Chdir(cmd.Dir); err != nil { 55 | fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) 56 | } 57 | 58 | cmd.Env = append(os.Environ(), "GO111MODULE=on") 59 | command := strings.Join(cmd.Args, " ") 60 | fmt.Fprintf(GinkgoWriter, "running: %s\n", command) 61 | output, err := cmd.CombinedOutput() 62 | if err != nil { 63 | return output, fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) 64 | } 65 | 66 | return output, nil 67 | } 68 | 69 | // UninstallPrometheusOperator uninstalls the prometheus 70 | func UninstallPrometheusOperator() { 71 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 72 | cmd := exec.Command("kubectl", "delete", "-f", url) 73 | if _, err := Run(cmd); err != nil { 74 | warnError(err) 75 | } 76 | } 77 | 78 | // UninstallCertManager uninstalls the cert manager 79 | func UninstallCertManager() { 80 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 81 | cmd := exec.Command("kubectl", "delete", "-f", url) 82 | if _, err := Run(cmd); err != nil { 83 | warnError(err) 84 | } 85 | } 86 | 87 | // InstallCertManager installs the cert manager bundle. 88 | func InstallCertManager() error { 89 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 90 | cmd := exec.Command("kubectl", "apply", "-f", url) 91 | if _, err := Run(cmd); err != nil { 92 | return err 93 | } 94 | // Wait for cert-manager-webhook to be ready, which can take time if cert-manager 95 | // was re-installed after uninstalling on a cluster. 96 | cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", 97 | "--for", "condition=Available", 98 | "--namespace", "cert-manager", 99 | "--timeout", "5m", 100 | ) 101 | 102 | _, err := Run(cmd) 103 | return err 104 | } 105 | 106 | // LoadImageToKindCluster loads a local docker image to the kind cluster 107 | func LoadImageToKindClusterWithName(name string) error { 108 | cluster := "kind" 109 | if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { 110 | cluster = v 111 | } 112 | kindOptions := []string{"load", "docker-image", name, "--name", cluster} 113 | cmd := exec.Command("kind", kindOptions...) 114 | _, err := Run(cmd) 115 | return err 116 | } 117 | 118 | // GetNonEmptyLines converts given command output string into individual objects 119 | // according to line breakers, and ignores the empty elements in it. 120 | func GetNonEmptyLines(output string) []string { 121 | var res []string 122 | elements := strings.Split(output, "\n") 123 | for _, element := range elements { 124 | if element != "" { 125 | res = append(res, element) 126 | } 127 | } 128 | 129 | return res 130 | } 131 | 132 | // GetProjectDir will return the directory where the project is 133 | func GetProjectDir() (string, error) { 134 | wd, err := os.Getwd() 135 | if err != nil { 136 | return wd, err 137 | } 138 | wd = strings.Replace(wd, "/test/e2e", "", -1) 139 | return wd, nil 140 | } 141 | --------------------------------------------------------------------------------
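
Note: the following is an illustrative sketch and not a file from the repository. It shows how a NodePool of the shape exercised by the controller tests above could be created programmatically through a controller-runtime client; the package name, function name, node count, and label value are assumptions made for this example.

package example // hypothetical consumer package, not part of this repository

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	kwoksigsv1beta1 "github.com/run-ai/kwok-operator/api/v1beta1"
)

// CreateExampleNodePool asks the kwok-operator for three fake nodes by creating
// a cluster-scoped NodePool, mirroring the fixtures used in
// nodepool_controller_test.go. The client c must be built with a scheme that
// registers the kwok.sigs.run-ai.com/v1beta1 types.
func CreateExampleNodePool(ctx context.Context, c client.Client) error {
	np := &kwoksigsv1beta1.NodePool{
		ObjectMeta: metav1.ObjectMeta{Name: "example-nodepool"},
		Spec: kwoksigsv1beta1.NodePoolSpec{
			NodeCount: 3,
			NodeTemplate: corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					// Illustrative label; the controller adds its own controllerLabel on top.
					Labels: map[string]string{"kubernetes.io/role": "worker"},
				},
				Spec: corev1.NodeSpec{},
			},
		},
	}
	return c.Create(ctx, np)
}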