├── .dockerignore ├── .gitignore ├── .golangci.yml ├── Dockerfile.controller ├── Dockerfile.daemonset ├── Makefile ├── PROJECT ├── README.md ├── api └── v1alpha1 │ ├── groupversion_info.go │ ├── instaslice_types.go │ └── zz_generated.deepcopy.go ├── cmd ├── controller │ └── main.go └── daemonset │ └── main.go ├── config ├── crd │ ├── bases │ │ └── inference.codeflare.dev_instaslices.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── default │ ├── kustomization.yaml │ ├── manager_auth_proxy_patch.yaml │ └── manager_config_patch.yaml ├── manager │ ├── kustomization.yaml │ └── manager.yaml ├── prometheus │ ├── kustomization.yaml │ └── monitor.yaml └── rbac │ ├── auth_proxy_client_clusterrole.yaml │ ├── auth_proxy_role.yaml │ ├── auth_proxy_role_binding.yaml │ ├── auth_proxy_service.yaml │ ├── instaslice_editor_role.yaml │ ├── instaslice_viewer_role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ ├── leader_election_role_binding.yaml │ ├── role.yaml │ ├── role_binding.yaml │ └── service_account.yaml ├── deploy ├── custom-configmapwithprofiles.yaml └── setup.sh ├── go.mod ├── go.sum ├── hack └── boilerplate.go.txt ├── internal └── controller │ ├── instaslice_controller.go │ ├── instaslice_controller_test.go │ ├── instaslice_daemonset.go │ ├── instaslice_daemonset_test.go │ └── suite_test.go ├── samples ├── test-pod.yaml ├── tf-notebook.yaml ├── vllm_cache.yaml └── vllm_dep.yaml └── test ├── e2e ├── e2e_suite_test.go └── e2e_test.go └── utils └── utils.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 
3 | bin/ 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin/* 9 | Dockerfile.cross 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # Kubernetes Generated files - skip generated files, except for vendored files 21 | !vendor/**/zz_generated.* 22 | 23 | # editor and IDE paraphernalia 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | *~ 29 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | deadline: 5m 3 | allow-parallel-runners: true 4 | 5 | issues: 6 | # don't skip warning about doc comments 7 | # don't exclude the default set of lint 8 | exclude-use-default: false 9 | # restore some of the defaults 10 | # (fill in the rest as needed) 11 | exclude-rules: 12 | - path: "api/*" 13 | linters: 14 | - lll 15 | - path: "internal/*" 16 | linters: 17 | - dupl 18 | - lll 19 | linters: 20 | disable-all: true 21 | enable: 22 | - dupl 23 | - errcheck 24 | - exportloopref 25 | - goconst 26 | - gocyclo 27 | - gofmt 28 | - goimports 29 | - gosimple 30 | - govet 31 | - ineffassign 32 | - lll 33 | - misspell 34 | - nakedret 35 | - prealloc 36 | - staticcheck 37 | - typecheck 38 | - unconvert 39 | - unparam 40 | - unused 41 | -------------------------------------------------------------------------------- /Dockerfile.controller: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.4.1 2 | ARG BASE_DIST=ubi8 3 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} AS build 4 | 5 | ARG GOLANG_VERSION=1.22.2 6 
| RUN yum install -y wget make git gcc 7 | 8 | RUN set -eux; \ 9 | \ 10 | arch="$(uname -m)"; \ 11 | case "${arch##*-}" in \ 12 | x86_64 | amd64) ARCH='amd64' ;; \ 13 | ppc64el | ppc64le) ARCH='ppc64le' ;; \ 14 | aarch64) ARCH='arm64' ;; \ 15 | *) echo "unsupported architecture" ; exit 1 ;; \ 16 | esac; \ 17 | wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \ 18 | | tar -C /usr/local -xz 19 | 20 | ENV GOPATH /go 21 | ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH 22 | 23 | WORKDIR /workspace 24 | # Copy the Go Modules manifests 25 | COPY go.mod go.mod 26 | COPY go.sum go.sum 27 | # cache deps before building and copying source so that we don't need to re-download as much 28 | # and so that source changes don't invalidate our downloaded layer 29 | RUN go mod download 30 | 31 | # Copy the go source 32 | COPY cmd/controller/main.go cmd/controller/main.go 33 | COPY api/ api/ 34 | COPY internal/controller/instaslice_controller.go internal/controller/instaslice_controller.go 35 | 36 | # Build 37 | # the GOARCH has not a default value to allow the binary be built according to the host where the command 38 | # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO 39 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, 40 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 41 | RUN go build -o bin/manager cmd/controller/main.go 42 | 43 | ARG CUDA_VERSION=12.4.1 44 | ARG BASE_DIST=ubi8 45 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} 46 | 47 | # Remove CUDA libs(compat etc) in favor of libs installed by the NVIDIA driver 48 | RUN dnf remove -y cuda-* 49 | 50 | ENV NVIDIA_DISABLE_REQUIRE="true" 51 | ENV NVIDIA_VISIBLE_DEVICES=all 52 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 53 | 54 | WORKDIR / 55 | 56 | COPY --from=build /workspace/bin/manager . 
57 | 58 | # Install / upgrade packages here that are required to resolve CVEs 59 | ARG CVE_UPDATES 60 | RUN if [ -n "${CVE_UPDATES}" ]; then \ 61 | yum update -y ${CVE_UPDATES} && \ 62 | rm -rf /var/cache/yum/*; \ 63 | fi 64 | 65 | ENTRYPOINT ["/manager"] -------------------------------------------------------------------------------- /Dockerfile.daemonset: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.4.1 2 | ARG BASE_DIST=ubi8 3 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} AS build 4 | 5 | ARG GOLANG_VERSION=1.22.2 6 | RUN yum install -y wget make git gcc 7 | 8 | RUN set -eux; \ 9 | \ 10 | arch="$(uname -m)"; \ 11 | case "${arch##*-}" in \ 12 | x86_64 | amd64) ARCH='amd64' ;; \ 13 | ppc64el | ppc64le) ARCH='ppc64le' ;; \ 14 | aarch64) ARCH='arm64' ;; \ 15 | *) echo "unsupported architecture" ; exit 1 ;; \ 16 | esac; \ 17 | wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \ 18 | | tar -C /usr/local -xz 19 | 20 | ENV GOPATH /go 21 | ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH 22 | 23 | WORKDIR /workspace 24 | # Copy the Go Modules manifests 25 | COPY go.mod go.mod 26 | COPY go.sum go.sum 27 | # cache deps before building and copying source so that we don't need to re-download as much 28 | # and so that source changes don't invalidate our downloaded layer 29 | RUN go mod download 30 | 31 | # Copy the go source 32 | COPY cmd/daemonset/main.go cmd/daemonset/main.go 33 | COPY api/ api/ 34 | COPY internal/controller/instaslice_daemonset.go internal/controller/instaslice_daemonset.go 35 | 36 | # Build 37 | # the GOARCH has not a default value to allow the binary be built according to the host where the command 38 | # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO 39 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. 
Therefore, 40 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 41 | RUN go build -o bin/daemonset cmd/daemonset/main.go 42 | 43 | ARG CUDA_VERSION=12.4.1 44 | ARG BASE_DIST=ubi8 45 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} 46 | 47 | # Remove CUDA libs(compat etc) in favor of libs installed by the NVIDIA driver 48 | RUN dnf remove -y cuda-* 49 | 50 | ENV NVIDIA_DISABLE_REQUIRE="true" 51 | ENV NVIDIA_VISIBLE_DEVICES=all 52 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 53 | 54 | WORKDIR / 55 | 56 | COPY --from=build /workspace/bin/daemonset . 57 | 58 | # Install / upgrade packages here that are required to resolve CVEs 59 | ARG CVE_UPDATES 60 | RUN if [ -n "${CVE_UPDATES}" ]; then \ 61 | yum update -y ${CVE_UPDATES} && \ 62 | rm -rf /var/cache/yum/*; \ 63 | fi 64 | 65 | ENTRYPOINT ["/daemonset"] -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Image URL to use all building/pushing image targets 2 | IMG ?= asm582/instaslicev2-controller:latest 3 | IMG_DMST ?= asm582/instaslicev2-daemonset:latest 4 | 5 | # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 6 | ENVTEST_K8S_VERSION = 1.29.0 7 | 8 | # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) 9 | ifeq (,$(shell go env GOBIN)) 10 | GOBIN=$(shell go env GOPATH)/bin 11 | else 12 | GOBIN=$(shell go env GOBIN) 13 | endif 14 | 15 | # CONTAINER_TOOL defines the container tool to be used for building images. 16 | # Be aware that the target commands are only tested with Docker which is 17 | # scaffolded by default. However, you might want to replace it to use other 18 | # tools. (i.e. 
podman) 19 | CONTAINER_TOOL ?= docker 20 | 21 | ifeq ($(CONTAINER_TOOL),podman) 22 | MULTI_ARCH_OPTION=--manifest 23 | else 24 | MULTI_ARCH_OPTION=--push --provenance=false --tag 25 | endif 26 | 27 | 28 | # Setting SHELL to bash allows bash commands to be executed by recipes. 29 | # Options are set to exit when a recipe line exits non-zero or a piped command fails. 30 | SHELL = /usr/bin/env bash -o pipefail 31 | .SHELLFLAGS = -ec 32 | 33 | # GOOS?=linux 34 | # GOARCH?=arm64 35 | # CGO_ENABLED?=0 36 | # CLI_VERSION_PACKAGE := main 37 | # COMMIT ?= $(shell git describe --dirty --long --always --abbrev=15) 38 | # CGO_LDFLAGS_ALLOW := "-Wl,--unresolved-symbols=ignore-in-object-files" 39 | # LDFLAGS_COMMON := "-s -w -X $(CLI_VERSION_PACKAGE).commitSha=$(COMMIT) -X $(CLI_VERSION_PACKAGE).version=$(VERSION) 40 | 41 | .PHONY: all 42 | all: build 43 | 44 | ##@ General 45 | 46 | # The help target prints out all targets with their descriptions organized 47 | # beneath their categories. The categories are represented by '##@' and the 48 | # target descriptions by '##'. The awk command is responsible for reading the 49 | # entire set of makefiles included in this invocation, looking for lines of the 50 | # file as xyz: ## something, and then pretty-format the target and help. Then, 51 | # if there's a line with ##@ something, that gets pretty-printed as a category. 52 | # More info on the usage of ANSI control characters for terminal formatting: 53 | # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters 54 | # More info on the awk command: 55 | # http://linuxcommand.org/lc3_adv_awk.php 56 | 57 | .PHONY: help 58 | help: ## Display this help. 
59 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 60 | 61 | ##@ Development 62 | 63 | .PHONY: manifests 64 | manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. 65 | $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases 66 | 67 | .PHONY: generate 68 | generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. 69 | $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 70 | 71 | .PHONY: fmt 72 | fmt: ## Run go fmt against code. 73 | go fmt ./... 74 | 75 | .PHONY: vet 76 | vet: ## Run go vet against code. 77 | go vet ./... 78 | 79 | .PHONY: test 80 | test: manifests generate fmt vet envtest ## Run tests. 81 | KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out 82 | 83 | # Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors. 84 | .PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up. 85 | test-e2e: 86 | go test ./test/e2e/ -v -ginkgo.v 87 | 88 | .PHONY: lint 89 | lint: golangci-lint ## Run golangci-lint linter & yamllint 90 | $(GOLANGCI_LINT) run 91 | 92 | .PHONY: lint-fix 93 | lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes 94 | $(GOLANGCI_LINT) run --fix 95 | 96 | ##@ Build 97 | 98 | .PHONY: build 99 | build: manifests generate fmt vet ## Build manager binary. 100 | go build -o bin/manager cmd/controller/main.go 101 | go build -o bin/daemonset cmd/daemonset/main.go 102 | .PHONY: run-controller 103 | run-controller: manifests generate fmt vet ## Run a controller from your host. 
104 | sudo -E go run ./cmd/controller/main.go 105 | 106 | .PHONY: run-daemonset 107 | run-daemonset: manifests generate fmt vet ## Run a controller from your host. 108 | sudo -E go run ./cmd/daemonset/main.go 109 | 110 | # If you wish to build the manager image targeting other platforms you can use the --platform flag. 111 | # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. 112 | # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ 113 | .PHONY: docker-build 114 | docker-build: ## Build docker image with the manager. 115 | $(CONTAINER_TOOL) build -t ${IMG} -f Dockerfile.controller . 116 | $(CONTAINER_TOOL) build -t ${IMG_DMST} -f Dockerfile.daemonset . 117 | 118 | .PHONY: docker-push 119 | docker-push: ## Push docker image with the manager. 120 | $(CONTAINER_TOOL) push ${IMG} 121 | $(CONTAINER_TOOL) push ${IMG_DMST} 122 | 123 | # PLATFORMS defines the target platforms for the manager image be built to provide support to multiple 124 | # architectures. Make sure that base image in the Dockerfile/Containerfile is itself multi-platform, and includes 125 | # the requested plaforms. Unlike "docker buildx", for multi-platform images podman requires creating a manifest. 126 | PLATFORMS ?= linux/arm64,linux/amd64 127 | .PHONY: docker-buildx 128 | docker-buildx: ## Build and push docker images with multi-platform support 129 | if [ "$(CONTAINER_TOOL)" == "podman" ]; then \ 130 | $(CONTAINER_TOOL) manifest rm ${IMG} || true; \ 131 | $(CONTAINER_TOOL) manifest create ${IMG}; \ 132 | $(CONTAINER_TOOL) manifest rm ${IMG_DMST} || true; \ 133 | $(CONTAINER_TOOL) manifest create ${IMG_DMST}; \ 134 | fi 135 | DOCKER_BUILDKIT=1 $(CONTAINER_TOOL) buildx build --platform=$(PLATFORMS) $(MULTI_ARCH_OPTION) ${IMG} -f Dockerfile.controller . 136 | DOCKER_BUILDKIT=1 $(CONTAINER_TOOL) buildx build --platform=$(PLATFORMS) $(MULTI_ARCH_OPTION) ${IMG_DMST} -f Dockerfile.daemonset . 
137 | if [ "$(CONTAINER_TOOL)" == "podman" ]; then \ 138 | $(CONTAINER_TOOL) manifest push ${IMG}; \ 139 | $(CONTAINER_TOOL) manifest push ${IMG_DMST}; \ 140 | fi 141 | 142 | .PHONY: build-installer 143 | build-installer: manifests generate kustomize ## Generate a consolidated YAML with CRDs and deployment. 144 | mkdir -p dist 145 | cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} 146 | $(KUSTOMIZE) build config/default > dist/install.yaml 147 | 148 | ##@ Deployment 149 | 150 | ifndef ignore-not-found 151 | ignore-not-found = false 152 | endif 153 | 154 | .PHONY: install 155 | install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. 156 | $(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f - 157 | 158 | .PHONY: uninstall 159 | uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 160 | $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - 161 | 162 | .PHONY: deploy 163 | deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 164 | cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} 165 | $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - 166 | 167 | # .PHONY: deploy-daemonset 168 | # deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 169 | # cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG_DMST} 170 | # $(KUSTOMIZE) build config/daemonset | $(KUBECTL) apply -f - 171 | 172 | .PHONY: undeploy 173 | undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 
174 | $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - 175 | 176 | ##@ Dependencies 177 | 178 | ## Location to install dependencies to 179 | LOCALBIN ?= $(shell pwd)/bin 180 | $(LOCALBIN): 181 | mkdir -p $(LOCALBIN) 182 | 183 | ## Tool Binaries 184 | KUBECTL ?= kubectl 185 | KUSTOMIZE ?= $(LOCALBIN)/kustomize-$(KUSTOMIZE_VERSION) 186 | CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen-$(CONTROLLER_TOOLS_VERSION) 187 | ENVTEST ?= $(LOCALBIN)/setup-envtest-$(ENVTEST_VERSION) 188 | GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION) 189 | 190 | ## Tool Versions 191 | KUSTOMIZE_VERSION ?= v5.3.0 192 | CONTROLLER_TOOLS_VERSION ?= v0.14.0 193 | ENVTEST_VERSION ?= release-0.17 194 | GOLANGCI_LINT_VERSION ?= v1.54.2 195 | 196 | .PHONY: kustomize 197 | kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. 198 | $(KUSTOMIZE): $(LOCALBIN) 199 | $(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION)) 200 | 201 | .PHONY: controller-gen 202 | controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. 203 | $(CONTROLLER_GEN): $(LOCALBIN) 204 | $(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION)) 205 | 206 | .PHONY: envtest 207 | envtest: $(ENVTEST) ## Download setup-envtest locally if necessary. 208 | $(ENVTEST): $(LOCALBIN) 209 | $(call go-install-tool,$(ENVTEST),sigs.k8s.io/controller-runtime/tools/setup-envtest,$(ENVTEST_VERSION)) 210 | 211 | .PHONY: golangci-lint 212 | golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. 
213 | $(GOLANGCI_LINT): $(LOCALBIN) 214 | $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,${GOLANGCI_LINT_VERSION}) 215 | 216 | # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist 217 | # $1 - target path with name of binary (ideally with version) 218 | # $2 - package url which can be installed 219 | # $3 - specific version of package 220 | define go-install-tool 221 | @[ -f $(1) ] || { \ 222 | set -e; \ 223 | package=$(2)@$(3) ;\ 224 | echo "Downloading $${package}" ;\ 225 | GOBIN=$(LOCALBIN) go install $${package} ;\ 226 | mv "$$(echo "$(1)" | sed "s/-$(3)$$//")" $(1) ;\ 227 | } 228 | endef 229 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: codeflare.dev 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: instaslice 9 | repo: codeflare.dev/instaslice 10 | resources: 11 | - api: 12 | crdVersion: v1alpha1 13 | namespaced: true 14 | controller: true 15 | domain: codeflare.dev 16 | group: inference 17 | kind: Instaslice 18 | path: codeflare.dev/instaslice/api/v1alpha1 19 | version: v1alpha1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Note - we have moved to https://github.com/openshift/instaslice-operator 2 | 3 | # Note - Kubecon EU 2024 code (DRA code) is now available in the legacy branch 4 | 5 | # InstaSlice 6 | 7 | Experimental InstaSlice works with GPU operator to create mig slices on demand. 
8 | 9 | ## Getting Started 10 | 11 | ### Prerequisites 12 | - [Go](https://go.dev/doc/install) v1.22.0+ 13 | - [Docker](https://docs.docker.com/get-docker/) v17.03+ 14 | - [Docker buildx plugin](https://github.com/docker/buildx) for building cross-platform images. 15 | - [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) v1.11.3+. 16 | - Access to a [KinD](https://kind.sigs.k8s.io/docs/user/quick-start/) cluster. 17 | 18 | ### Install KinD cluster with GPU operator 19 | 20 | - Make sure the GPUs on the host have MIG enabled 21 | 22 | ```sh 23 | +-----------------------------------------------------------------------------------------+ 24 | | NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 | 25 | |-----------------------------------------+------------------------+----------------------+ 26 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 27 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 28 | | | | MIG M. | 29 | |=========================================+========================+======================| 30 | | 0 NVIDIA A100-PCIE-40GB Off | 00000000:0E:00.0 Off | On | 31 | | N/A 36C P0 33W / 250W | 0MiB / 40960MiB | N/A Default | 32 | | | | Enabled | 33 | +-----------------------------------------+------------------------+----------------------+ 34 | | 1 NVIDIA A100-PCIE-40GB Off | 00000000:0F:00.0 Off | On | 35 | | N/A 40C P0 32W / 250W | 0MiB / 40960MiB | N/A Default | 36 | | | | Enabled | 37 | +-----------------------------------------+------------------------+----------------------+ 38 | 39 | +-----------------------------------------------------------------------------------------+ 40 | | MIG devices: | 41 | +------------------+----------------------------------+-----------+-----------------------+ 42 | | GPU GI CI MIG | Memory-Usage | Vol| Shared | 43 | | ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | 44 | | | | ECC| | 45 | 
|==================+==================================+===========+=======================| 46 | | No MIG devices found | 47 | +-----------------------------------------------------------------------------------------+ 48 | 49 | +-----------------------------------------------------------------------------------------+ 50 | | Processes: | 51 | | GPU GI CI PID Type Process name GPU Memory | 52 | | ID ID Usage | 53 | |=========================================================================================| 54 | | No running processes found | 55 | ``` 56 | 57 | - Run the below script 58 | ```sh 59 | sh ./deploy/setup.sh 60 | ``` 61 | NOTE: Please check if all the pods in GPU operator are completed or Running before moving to the next step. 62 | 63 | ```sh 64 | (base) openstack@netsres62:~/asmalvan/instaslice2$ kubectl get pods -n gpu-operator 65 | NAME READY STATUS RESTARTS AGE 66 | gpu-feature-discovery-578q8 1/1 Running 0 102s 67 | gpu-operator-1714053627-node-feature-discovery-gc-9b857c99phlnn 1/1 Running 0 7m21s 68 | gpu-operator-1714053627-node-feature-discovery-master-6df78zgsz 1/1 Running 0 7m21s 69 | gpu-operator-1714053627-node-feature-discovery-worker-47tpx 1/1 Running 0 7m19s 70 | gpu-operator-54b8bfbfd8-rmzbd 1/1 Running 0 7m21s 71 | nvidia-container-toolkit-daemonset-wkc5h 1/1 Running 0 6m21s 72 | nvidia-cuda-validator-cn8lg 0/1 Completed 0 88s 73 | nvidia-dcgm-exporter-h75xg 1/1 Running 0 102s 74 | nvidia-device-plugin-daemonset-452dk 1/1 Running 0 101s 75 | nvidia-mig-manager-htt7z 1/1 Running 0 2m21s 76 | nvidia-operator-validator-kh6jf 1/1 Running 0 102s 77 | ``` 78 | 79 | - After all the pods are Running/Completed, run nvidia-smi on the host and check if MIG slices appear on the all the GPUs of the host. 
80 | 81 | ```sh 82 | (base) openstack@netsres62:~/asmalvan/instaslice2$ nvidia-smi 83 | Thu Apr 25 10:08:24 2024 84 | +-----------------------------------------------------------------------------------------+ 85 | | NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 | 86 | |-----------------------------------------+------------------------+----------------------+ 87 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 88 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 89 | | | | MIG M. | 90 | |=========================================+========================+======================| 91 | | 0 NVIDIA A100-PCIE-40GB Off | 00000000:0E:00.0 Off | On | 92 | | N/A 45C P0 71W / 250W | 87MiB / 40960MiB | N/A Default | 93 | | | | Enabled | 94 | +-----------------------------------------+------------------------+----------------------+ 95 | | 1 NVIDIA A100-PCIE-40GB Off | 00000000:0F:00.0 Off | On | 96 | | N/A 49C P0 69W / 250W | 87MiB / 40960MiB | N/A Default | 97 | | | | Enabled | 98 | +-----------------------------------------+------------------------+----------------------+ 99 | 100 | +-----------------------------------------------------------------------------------------+ 101 | | MIG devices: | 102 | +------------------+----------------------------------+-----------+-----------------------+ 103 | | GPU GI CI MIG | Memory-Usage | Vol| Shared | 104 | | ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | 105 | | | | ECC| | 106 | |==================+==================================+===========+=======================| 107 | | 0 2 0 0 | 37MiB / 19968MiB | 42 0 | 3 0 2 0 0 | 108 | | | 0MiB / 32767MiB | | | 109 | +------------------+----------------------------------+-----------+-----------------------+ 110 | | 0 3 0 1 | 25MiB / 9856MiB | 28 0 | 2 0 1 0 0 | 111 | | | 0MiB / 16383MiB | | | 112 | +------------------+----------------------------------+-----------+-----------------------+ 113 | | 0 9 0 2 | 12MiB / 4864MiB 
| 14 0 | 1 0 0 0 0 | 114 | | | 0MiB / 8191MiB | | | 115 | +------------------+----------------------------------+-----------+-----------------------+ 116 | | 0 10 0 3 | 12MiB / 4864MiB | 14 0 | 1 0 0 0 0 | 117 | | | 0MiB / 8191MiB | | | 118 | +------------------+----------------------------------+-----------+-----------------------+ 119 | | 1 2 0 0 | 37MiB / 19968MiB | 42 0 | 3 0 2 0 0 | 120 | | | 0MiB / 32767MiB | | | 121 | +------------------+----------------------------------+-----------+-----------------------+ 122 | | 1 3 0 1 | 25MiB / 9856MiB | 28 0 | 2 0 1 0 0 | 123 | | | 0MiB / 16383MiB | | | 124 | +------------------+----------------------------------+-----------+-----------------------+ 125 | | 1 9 0 2 | 12MiB / 4864MiB | 14 0 | 1 0 0 0 0 | 126 | | | 0MiB / 8191MiB | | | 127 | +------------------+----------------------------------+-----------+-----------------------+ 128 | | 1 10 0 3 | 12MiB / 4864MiB | 14 0 | 1 0 0 0 0 | 129 | | | 0MiB / 8191MiB | | | 130 | +------------------+----------------------------------+-----------+-----------------------+ 131 | 132 | +-----------------------------------------------------------------------------------------+ 133 | | Processes: | 134 | | GPU GI CI PID Type Process name GPU Memory | 135 | | ID ID Usage | 136 | |=========================================================================================| 137 | | No running processes found | 138 | +-----------------------------------------------------------------------------------------+ 139 | (base) openstack@netsres62:~/asmalvan/instaslice2$ 140 | ``` 141 | 142 | 143 | - Delete mig slices using the commmand 144 | 145 | ```sh 146 | sudo nvidia-smi mig -dci && sudo nvidia-smi mig -dgi 147 | 148 | uccessfully destroyed compute instance ID 0 from GPU 0 GPU instance ID 9 149 | Successfully destroyed compute instance ID 0 from GPU 0 GPU instance ID 10 150 | Successfully destroyed compute instance ID 0 from GPU 0 GPU instance ID 3 151 | Successfully destroyed compute 
instance ID 0 from GPU 0 GPU instance ID 2 152 | Successfully destroyed compute instance ID 0 from GPU 1 GPU instance ID 9 153 | Successfully destroyed compute instance ID 0 from GPU 1 GPU instance ID 10 154 | Successfully destroyed compute instance ID 0 from GPU 1 GPU instance ID 3 155 | Successfully destroyed compute instance ID 0 from GPU 1 GPU instance ID 2 156 | Successfully destroyed GPU instance ID 9 from GPU 0 157 | Successfully destroyed GPU instance ID 10 from GPU 0 158 | Successfully destroyed GPU instance ID 3 from GPU 0 159 | Successfully destroyed GPU instance ID 2 from GPU 0 160 | Successfully destroyed GPU instance ID 9 from GPU 1 161 | Successfully destroyed GPU instance ID 10 from GPU 1 162 | Successfully destroyed GPU instance ID 3 from GPU 1 163 | Successfully destroyed GPU instance ID 2 from GPU 1 164 | ``` 165 | 166 | - Create placeholder slice to make k8s-device-plugin happy using the command 167 | 168 | ```sh 169 | sudo nvidia-smi mig -cgi 3g.20gb -C 170 | Successfully created GPU instance ID 2 on GPU 0 using profile MIG 3g.20gb (ID 9) 171 | Successfully created compute instance ID 0 on GPU 0 GPU instance ID 2 using profile MIG 3g.20gb (ID 2) 172 | Successfully created GPU instance ID 2 on GPU 1 using profile MIG 3g.20gb (ID 9) 173 | Successfully created compute instance ID 0 on GPU 1 GPU instance ID 2 using profile MIG 3g.20gb (ID 2) 174 | ``` 175 | 176 | - Run the below command to patch device plugin with configmap created by the setup script. For OpenShift replace clusterpolicies.nvidia.com/cluster-policy to clusterpolicies.nvidia.com/gpu-cluster-policy and namespace to nvidia-gpu-operator 177 | 178 | ```sh 179 | (base) openstack@netsres62:~/asmalvan/instaslice2$ kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "test"}}}}' 180 | ``` 181 | 182 | You are now all set to dynamically create slices on the cluster using InstaSlice. 
183 | 184 | ### Running the controller 185 | 186 | - Refer to section `To Deploy on the cluster` 187 | 188 | ### Submitting the workload 189 | 190 | - Submit a sample workload using the command 191 | 192 | ```sh 193 | kubectl apply -f ./samples/test-pod.yaml 194 | pod/cuda-vectoradd-5 created 195 | ``` 196 | 197 | - check the status of the workload using commands 198 | 199 | ```sh 200 | kubectl get pods 201 | NAME READY STATUS RESTARTS AGE 202 | cuda-vectoradd-5 1/1 Running 0 15s 203 | kubectl logs cuda-vectoradd-5 204 | GPU 0: NVIDIA A100-PCIE-40GB (UUID: GPU-31cfe05c-ed13-cd17-d7aa-c63db5108c24) 205 | MIG 1g.5gb Device 0: (UUID: MIG-c5720b34-e550-5278-90e6-d99a979aafd1) 206 | [Vector addition of 50000 elements] 207 | Copy input data from the host memory to the CUDA device 208 | CUDA kernel launch with 196 blocks of 256 threads 209 | Copy output data from the CUDA device to the host memory 210 | Test PASSED 211 | Done 212 | 213 | +-----------------------------------------------------------------------------------------+ 214 | | NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 | 215 | |-----------------------------------------+------------------------+----------------------+ 216 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 217 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 218 | | | | MIG M. 
| 219 | |=========================================+========================+======================| 220 | | 0 NVIDIA A100-PCIE-40GB Off | 00000000:0E:00.0 Off | On | 221 | | N/A 52C P0 75W / 250W | 50MiB / 40960MiB | N/A Default | 222 | | | | Enabled | 223 | +-----------------------------------------+------------------------+----------------------+ 224 | | 1 NVIDIA A100-PCIE-40GB Off | 00000000:0F:00.0 Off | On | 225 | | N/A 60C P0 75W / 250W | 37MiB / 40960MiB | N/A Default | 226 | | | | Enabled | 227 | +-----------------------------------------+------------------------+----------------------+ 228 | 229 | +-----------------------------------------------------------------------------------------+ 230 | | MIG devices: | 231 | +------------------+----------------------------------+-----------+-----------------------+ 232 | | GPU GI CI MIG | Memory-Usage | Vol| Shared | 233 | | ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | 234 | | | | ECC| | 235 | |==================+==================================+===========+=======================| 236 | | 0 2 0 0 | 37MiB / 19968MiB | 42 0 | 3 0 2 0 0 | 237 | | | 0MiB / 32767MiB | | | 238 | +------------------+----------------------------------+-----------+-----------------------+ 239 | | 0 10 0 1 | 12MiB / 4864MiB | 14 0 | 1 0 0 0 0 | 240 | | | 0MiB / 8191MiB | | | 241 | +------------------+----------------------------------+-----------+-----------------------+ 242 | | 1 2 0 0 | 37MiB / 19968MiB | 42 0 | 3 0 2 0 0 | 243 | | | 0MiB / 32767MiB | | | 244 | +------------------+----------------------------------+-----------+-----------------------+ 245 | 246 | +-----------------------------------------------------------------------------------------+ 247 | | Processes: | 248 | | GPU GI CI PID Type Process name GPU Memory | 249 | | ID ID Usage | 250 | |=========================================================================================| 251 | | No running processes found | 252 | 
+-----------------------------------------------------------------------------------------+ 253 | 254 | ``` 255 | ### Deleting the workload 256 | 257 | - Delete the pod and see the newly created MIG slice deleted 258 | 259 | ```sh 260 | kubectl delete pod cuda-vectoradd-5 261 | 262 | +-----------------------------------------------------------------------------------------+ 263 | | NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 | 264 | |-----------------------------------------+------------------------+----------------------+ 265 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 266 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 267 | | | | MIG M. | 268 | |=========================================+========================+======================| 269 | | 0 NVIDIA A100-PCIE-40GB Off | 00000000:0E:00.0 Off | On | 270 | | N/A 53C P0 75W / 250W | 37MiB / 40960MiB | N/A Default | 271 | | | | Enabled | 272 | +-----------------------------------------+------------------------+----------------------+ 273 | | 1 NVIDIA A100-PCIE-40GB Off | 00000000:0F:00.0 Off | On | 274 | | N/A 60C P0 75W / 250W | 37MiB / 40960MiB | N/A Default | 275 | | | | Enabled | 276 | +-----------------------------------------+------------------------+----------------------+ 277 | 278 | +-----------------------------------------------------------------------------------------+ 279 | | MIG devices: | 280 | +------------------+----------------------------------+-----------+-----------------------+ 281 | | GPU GI CI MIG | Memory-Usage | Vol| Shared | 282 | | ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | 283 | | | | ECC| | 284 | |==================+==================================+===========+=======================| 285 | | 0 2 0 0 | 37MiB / 19968MiB | 42 0 | 3 0 2 0 0 | 286 | | | 0MiB / 32767MiB | | | 287 | +------------------+----------------------------------+-----------+-----------------------+ 288 | | 1 2 0 0 | 37MiB / 19968MiB | 
42 0 | 3 0 2 0 0 | 289 | | | 0MiB / 32767MiB | | | 290 | +------------------+----------------------------------+-----------+-----------------------+ 291 | 292 | +-----------------------------------------------------------------------------------------+ 293 | | Processes: | 294 | | GPU GI CI PID Type Process name GPU Memory | 295 | | ID ID Usage | 296 | |=========================================================================================| 297 | | No running processes found | 298 | +-----------------------------------------------------------------------------------------+ 299 | 300 | ``` 301 | 302 | ### To Deploy on the cluster 303 | 304 | **All in one command** 305 | 306 | make docker-build && make docker-push && make deploy 307 | 308 | Cross-platform or multi-arch images can be built and pushed using 309 | `make docker-buildx`. When using Docker as your container tool, make 310 | sure to create a builder instance. Refer to 311 | [Multi-platform images](https://docs.docker.com/build/building/multi-platform/) 312 | for documentation on building multi-platform images with Docker. 313 | 314 | You can change the destination platform(s) by 315 | setting `PLATFORMS`, e.g. 316 | 317 | ```sh 318 | PLATFORMS=linux/arm64,linux/amd64 make docker-buildx 319 | ``` 320 | 321 | **Build and push your image to the location specified by `IMG`:** 322 | 323 | ```sh 324 | make docker-build docker-push IMG=/instaslice:tag 325 | ``` 326 | 327 | **NOTE:** This image ought to be published in the personal registry you specified. 328 | And it is required to have access to pull the image from the working environment. 329 | Make sure you have the proper permission to the registry if the above commands don’t work.
330 | 331 | **Install the CRDs into the cluster:** 332 | 333 | ```sh 334 | make install 335 | ``` 336 | 337 | **Deploy the Manager to the cluster with the image specified by `IMG`:** 338 | 339 | ```sh 340 | make deploy IMG=/instaslice:tag 341 | ``` 342 | 343 | > **NOTE**: If you encounter RBAC errors, you may need to grant yourself cluster-admin 344 | privileges or be logged in as admin. 345 | 346 | **Create instances of your solution** 347 | You can apply the samples (examples) from the config/sample: 348 | 349 | ```sh 350 | kubectl apply -k config/samples/ 351 | ``` 352 | 353 | >**NOTE**: Ensure that the samples have default values to test it out. 354 | 355 | ### To Uninstall 356 | **Delete the instances (CRs) from the cluster:** 357 | 358 | ```sh 359 | kubectl delete -k config/samples/ 360 | ``` 361 | 362 | **Delete the APIs(CRDs) from the cluster:** 363 | 364 | ```sh 365 | make uninstall 366 | ``` 367 | 368 | **UnDeploy the controller from the cluster:** 369 | 370 | ```sh 371 | make undeploy 372 | ``` 373 | 374 | ## Project Distribution 375 | 376 | Following are the steps to build the installer and distribute this project to users. 377 | 378 | 1. Build the installer for the image built and published in the registry: 379 | 380 | ```sh 381 | make build-installer IMG=/instaslice:tag 382 | ``` 383 | 384 | NOTE: The makefile target mentioned above generates an 'install.yaml' 385 | file in the dist directory. This file contains all the resources built 386 | with Kustomize, which are necessary to install this project without 387 | its dependencies. 388 | 389 | 2.
Using the installer 390 | 391 | Users can just run kubectl apply -f to install the project, i.e.: 392 | 393 | ```sh 394 | kubectl apply -f https://raw.githubusercontent.com//instaslice//dist/install.yaml 395 | ``` 396 | 397 | ## Contributing 398 | // TODO(user): Add detailed information on how you would like others to contribute to this project 399 | 400 | **NOTE:** Run `make help` for more information on all potential `make` targets 401 | 402 | More information can be found via the [Kubebuilder Documentation](https://book.kubebuilder.io/introduction.html) 403 | 404 | ## License 405 | 406 | Copyright 2024. 407 | 408 | Licensed under the Apache License, Version 2.0 (the "License"); 409 | you may not use this file except in compliance with the License. 410 | You may obtain a copy of the License at 411 | 412 | http://www.apache.org/licenses/LICENSE-2.0 413 | 414 | Unless required by applicable law or agreed to in writing, software 415 | distributed under the License is distributed on an "AS IS" BASIS, 416 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 417 | See the License for the specific language governing permissions and 418 | limitations under the License. 419 | 420 | -------------------------------------------------------------------------------- /api/v1alpha1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1alpha1 contains API Schema definitions for the inference v1alpha1 API group 18 | // +kubebuilder:object:generate=true 19 | // +groupName=inference.codeflare.dev 20 | package v1alpha1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects 29 | GroupVersion = schema.GroupVersion{Group: "inference.codeflare.dev", Version: "v1alpha1"} 30 | 31 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 32 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 33 | 34 | // AddToScheme adds the types in this group-version to the given scheme. 35 | AddToScheme = SchemeBuilder.AddToScheme 36 | ) 37 | -------------------------------------------------------------------------------- /api/v1alpha1/instaslice_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package v1alpha1 18 | 19 | import ( 20 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 21 | ) 22 | 23 | type Mig struct { 24 | Placements []Placement `json:"placements,omitempty"` 25 | Profile string `json:"profile,omitempty"` 26 | Giprofileid int `json:"giprofileid"` 27 | CIProfileID int `json:"ciProfileid"` 28 | CIEngProfileID int `json:"ciengprofileid"` 29 | } 30 | 31 | type Placement struct { 32 | Size int `json:"size"` 33 | Start int `json:"start"` 34 | } 35 | 36 | // Define the struct for allocation details 37 | type AllocationDetails struct { 38 | Profile string `json:"profile"` 39 | Start uint32 `json:"start"` 40 | Size uint32 `json:"size"` 41 | PodUUID string `json:"podUUID"` 42 | GPUUUID string `json:"gpuUUID"` 43 | Nodename string `json:"nodename"` 44 | Allocationstatus string `json:"allocationStatus"` 45 | Giprofileid int `json:"giprofileid"` 46 | CIProfileID int `json:"ciProfileid"` 47 | CIEngProfileID int `json:"ciengprofileid"` 48 | Namespace string `json:"namespace"` 49 | PodName string `json:"podName"` 50 | } 51 | 52 | // Define the struct for allocation details 53 | type PreparedDetails struct { 54 | Profile string `json:"profile"` 55 | Start uint32 `json:"start"` 56 | Size uint32 `json:"size"` 57 | Parent string `json:"parent"` 58 | //Do we need POD UID here? 
59 | PodUUID string `json:"podUUID"` 60 | Giinfoid uint32 `json:"giinfo"` 61 | Ciinfoid uint32 `json:"ciinfo"` 62 | } 63 | 64 | // InstasliceSpec defines the desired state of Instaslice 65 | type InstasliceSpec struct { 66 | MigGPUUUID map[string]string `json:"MigGPUUUID,omitempty"` 67 | // GPUID, Profile, start, podUUID 68 | Allocations map[string]AllocationDetails `json:"allocations,omitempty"` 69 | //Prepared : GPUID, Profile, start 70 | Prepared map[string]PreparedDetails `json:"prepared,omitempty"` 71 | Migplacement []Mig `json:"migplacement,omitempty"` 72 | } 73 | 74 | // InstasliceStatus defines the observed state of Instaslice 75 | type InstasliceStatus struct { 76 | Processed string `json:"processed,omitempty"` 77 | } 78 | 79 | //+kubebuilder:object:root=true 80 | //+kubebuilder:subresource:status 81 | 82 | // Instaslice is the Schema for the instaslices API 83 | type Instaslice struct { 84 | metav1.TypeMeta `json:",inline"` 85 | metav1.ObjectMeta `json:"metadata,omitempty"` 86 | 87 | Spec InstasliceSpec `json:"spec,omitempty"` 88 | Status InstasliceStatus `json:"status,omitempty"` 89 | } 90 | 91 | //+kubebuilder:object:root=true 92 | 93 | // InstasliceList contains a list of Instaslice 94 | type InstasliceList struct { 95 | metav1.TypeMeta `json:",inline"` 96 | metav1.ListMeta `json:"metadata,omitempty"` 97 | Items []Instaslice `json:"items"` 98 | } 99 | 100 | func init() { 101 | SchemeBuilder.Register(&Instaslice{}, &InstasliceList{}) 102 | } 103 | -------------------------------------------------------------------------------- /api/v1alpha1/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | 3 | /* 4 | Copyright 2024. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // Code generated by controller-gen. DO NOT EDIT. 20 | 21 | package v1alpha1 22 | 23 | import ( 24 | runtime "k8s.io/apimachinery/pkg/runtime" 25 | ) 26 | 27 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 28 | func (in *AllocationDetails) DeepCopyInto(out *AllocationDetails) { 29 | *out = *in 30 | } 31 | 32 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AllocationDetails. 33 | func (in *AllocationDetails) DeepCopy() *AllocationDetails { 34 | if in == nil { 35 | return nil 36 | } 37 | out := new(AllocationDetails) 38 | in.DeepCopyInto(out) 39 | return out 40 | } 41 | 42 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 43 | func (in *Instaslice) DeepCopyInto(out *Instaslice) { 44 | *out = *in 45 | out.TypeMeta = in.TypeMeta 46 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 47 | in.Spec.DeepCopyInto(&out.Spec) 48 | out.Status = in.Status 49 | } 50 | 51 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Instaslice. 52 | func (in *Instaslice) DeepCopy() *Instaslice { 53 | if in == nil { 54 | return nil 55 | } 56 | out := new(Instaslice) 57 | in.DeepCopyInto(out) 58 | return out 59 | } 60 | 61 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
62 | func (in *Instaslice) DeepCopyObject() runtime.Object { 63 | if c := in.DeepCopy(); c != nil { 64 | return c 65 | } 66 | return nil 67 | } 68 | 69 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 70 | func (in *InstasliceList) DeepCopyInto(out *InstasliceList) { 71 | *out = *in 72 | out.TypeMeta = in.TypeMeta 73 | in.ListMeta.DeepCopyInto(&out.ListMeta) 74 | if in.Items != nil { 75 | in, out := &in.Items, &out.Items 76 | *out = make([]Instaslice, len(*in)) 77 | for i := range *in { 78 | (*in)[i].DeepCopyInto(&(*out)[i]) 79 | } 80 | } 81 | } 82 | 83 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstasliceList. 84 | func (in *InstasliceList) DeepCopy() *InstasliceList { 85 | if in == nil { 86 | return nil 87 | } 88 | out := new(InstasliceList) 89 | in.DeepCopyInto(out) 90 | return out 91 | } 92 | 93 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 94 | func (in *InstasliceList) DeepCopyObject() runtime.Object { 95 | if c := in.DeepCopy(); c != nil { 96 | return c 97 | } 98 | return nil 99 | } 100 | 101 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
102 | func (in *InstasliceSpec) DeepCopyInto(out *InstasliceSpec) { 103 | *out = *in 104 | if in.MigGPUUUID != nil { 105 | in, out := &in.MigGPUUUID, &out.MigGPUUUID 106 | *out = make(map[string]string, len(*in)) 107 | for key, val := range *in { 108 | (*out)[key] = val 109 | } 110 | } 111 | if in.Allocations != nil { 112 | in, out := &in.Allocations, &out.Allocations 113 | *out = make(map[string]AllocationDetails, len(*in)) 114 | for key, val := range *in { 115 | (*out)[key] = val 116 | } 117 | } 118 | if in.Prepared != nil { 119 | in, out := &in.Prepared, &out.Prepared 120 | *out = make(map[string]PreparedDetails, len(*in)) 121 | for key, val := range *in { 122 | (*out)[key] = val 123 | } 124 | } 125 | if in.Migplacement != nil { 126 | in, out := &in.Migplacement, &out.Migplacement 127 | *out = make([]Mig, len(*in)) 128 | for i := range *in { 129 | (*in)[i].DeepCopyInto(&(*out)[i]) 130 | } 131 | } 132 | } 133 | 134 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstasliceSpec. 135 | func (in *InstasliceSpec) DeepCopy() *InstasliceSpec { 136 | if in == nil { 137 | return nil 138 | } 139 | out := new(InstasliceSpec) 140 | in.DeepCopyInto(out) 141 | return out 142 | } 143 | 144 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 145 | func (in *InstasliceStatus) DeepCopyInto(out *InstasliceStatus) { 146 | *out = *in 147 | } 148 | 149 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstasliceStatus. 150 | func (in *InstasliceStatus) DeepCopy() *InstasliceStatus { 151 | if in == nil { 152 | return nil 153 | } 154 | out := new(InstasliceStatus) 155 | in.DeepCopyInto(out) 156 | return out 157 | } 158 | 159 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
160 | func (in *Mig) DeepCopyInto(out *Mig) { 161 | *out = *in 162 | if in.Placements != nil { 163 | in, out := &in.Placements, &out.Placements 164 | *out = make([]Placement, len(*in)) 165 | copy(*out, *in) 166 | } 167 | } 168 | 169 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Mig. 170 | func (in *Mig) DeepCopy() *Mig { 171 | if in == nil { 172 | return nil 173 | } 174 | out := new(Mig) 175 | in.DeepCopyInto(out) 176 | return out 177 | } 178 | 179 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 180 | func (in *Placement) DeepCopyInto(out *Placement) { 181 | *out = *in 182 | } 183 | 184 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Placement. 185 | func (in *Placement) DeepCopy() *Placement { 186 | if in == nil { 187 | return nil 188 | } 189 | out := new(Placement) 190 | in.DeepCopyInto(out) 191 | return out 192 | } 193 | 194 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 195 | func (in *PreparedDetails) DeepCopyInto(out *PreparedDetails) { 196 | *out = *in 197 | } 198 | 199 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PreparedDetails. 200 | func (in *PreparedDetails) DeepCopy() *PreparedDetails { 201 | if in == nil { 202 | return nil 203 | } 204 | out := new(PreparedDetails) 205 | in.DeepCopyInto(out) 206 | return out 207 | } 208 | -------------------------------------------------------------------------------- /cmd/controller/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "crypto/tls" 21 | "flag" 22 | "os" 23 | 24 | // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 25 | // to ensure that exec-entrypoint and run can make use of them. 26 | 27 | _ "k8s.io/client-go/plugin/pkg/client/auth" 28 | 29 | "k8s.io/apimachinery/pkg/runtime" 30 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 31 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 32 | ctrl "sigs.k8s.io/controller-runtime" 33 | "sigs.k8s.io/controller-runtime/pkg/healthz" 34 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 35 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 36 | "sigs.k8s.io/controller-runtime/pkg/webhook" 37 | 38 | inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1" 39 | "codeflare.dev/instaslice/internal/controller" 40 | //+kubebuilder:scaffold:imports 41 | ) 42 | 43 | var ( 44 | scheme = runtime.NewScheme() 45 | setupLog = ctrl.Log.WithName("setup") 46 | ) 47 | 48 | func init() { 49 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 50 | 51 | utilruntime.Must(inferencev1alpha1.AddToScheme(scheme)) 52 | //+kubebuilder:scaffold:scheme 53 | } 54 | 55 | func main() { 56 | var metricsAddr string 57 | var enableLeaderElection bool 58 | var probeAddr string 59 | var secureMetrics bool 60 | var enableHTTP2 bool 61 | flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") 62 | flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") 63 
| flag.BoolVar(&enableLeaderElection, "leader-elect", false, 64 | "Enable leader election for controller manager. "+ 65 | "Enabling this will ensure there is only one active controller manager.") 66 | flag.BoolVar(&secureMetrics, "metrics-secure", false, 67 | "If set the metrics endpoint is served securely") 68 | flag.BoolVar(&enableHTTP2, "enable-http2", false, 69 | "If set, HTTP/2 will be enabled for the metrics and webhook servers") 70 | opts := zap.Options{ 71 | Development: true, 72 | } 73 | opts.BindFlags(flag.CommandLine) 74 | flag.Parse() 75 | 76 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 77 | 78 | // if the enable-http2 flag is false (the default), http/2 should be disabled 79 | // due to its vulnerabilities. More specifically, disabling http/2 will 80 | // prevent from being vulnerable to the HTTP/2 Stream Cancellation and 81 | // Rapid Reset CVEs. For more information see: 82 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 83 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 84 | disableHTTP2 := func(c *tls.Config) { 85 | setupLog.Info("disabling http/2") 86 | c.NextProtos = []string{"http/1.1"} 87 | } 88 | 89 | tlsOpts := []func(*tls.Config){} 90 | if !enableHTTP2 { 91 | tlsOpts = append(tlsOpts, disableHTTP2) 92 | } 93 | 94 | webhookServer := webhook.NewServer(webhook.Options{ 95 | TLSOpts: tlsOpts, 96 | }) 97 | 98 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ 99 | Scheme: scheme, 100 | Metrics: metricsserver.Options{ 101 | BindAddress: metricsAddr, 102 | SecureServing: secureMetrics, 103 | TLSOpts: tlsOpts, 104 | }, 105 | WebhookServer: webhookServer, 106 | HealthProbeBindAddress: probeAddr, 107 | LeaderElection: enableLeaderElection, 108 | LeaderElectionID: "7cbd68d5.codeflare.dev", 109 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily 110 | // when the Manager ends. 
This requires the binary to immediately end when the 111 | // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly 112 | // speeds up voluntary leader transitions as the new leader don't have to wait 113 | // LeaseDuration time first. 114 | // 115 | // In the default scaffold provided, the program ends immediately after 116 | // the manager stops, so would be fine to enable this option. However, 117 | // if you are doing or is intended to do any operation such as perform cleanups 118 | // after the manager stops then its usage might be unsafe. 119 | // LeaderElectionReleaseOnCancel: true, 120 | }) 121 | if err != nil { 122 | setupLog.Error(err, "unable to start manager") 123 | os.Exit(1) 124 | } 125 | 126 | if err = (&controller.InstasliceReconciler{ 127 | Client: mgr.GetClient(), 128 | Scheme: mgr.GetScheme(), 129 | }).SetupWithManager(mgr); err != nil { 130 | setupLog.Error(err, "unable to create controller", "controller", "Instaslice") 131 | os.Exit(1) 132 | } 133 | 134 | // if err = (&controller.InstaSliceDaemonsetReconciler{ 135 | // Client: mgr.GetClient(), 136 | // Scheme: mgr.GetScheme(), 137 | // }).SetupWithManager(mgr); err != nil { 138 | // setupLog.Error(err, "unable to create controller", "controller", "Instaslice") 139 | // os.Exit(1) 140 | // } 141 | //+kubebuilder:scaffold:builder 142 | 143 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 144 | setupLog.Error(err, "unable to set up health check") 145 | os.Exit(1) 146 | } 147 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 148 | setupLog.Error(err, "unable to set up ready check") 149 | os.Exit(1) 150 | } 151 | 152 | // setupLog.Info("starting CLI App") 153 | // os.Args = []string{ 154 | // filepath.Base("set-nas-status"), 155 | // "--status=NotReady", 156 | // "--node-name=kind-control-plane", 157 | // } 158 | // if err := newApp().Run(os.Args); err != nil { 159 | // fmt.Fprintf(os.Stderr, "Error: %v\n", err) 160 | // os.Exit(1) 
161 | // } 162 | 163 | setupLog.Info("starting manager") 164 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 165 | setupLog.Error(err, "problem running manager") 166 | os.Exit(1) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /cmd/daemonset/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "crypto/tls" 21 | "flag" 22 | "os" 23 | 24 | // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 25 | // to ensure that exec-entrypoint and run can make use of them. 
26 | 27 | _ "k8s.io/client-go/plugin/pkg/client/auth" 28 | 29 | "k8s.io/apimachinery/pkg/runtime" 30 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 31 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 32 | ctrl "sigs.k8s.io/controller-runtime" 33 | "sigs.k8s.io/controller-runtime/pkg/healthz" 34 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 35 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 36 | "sigs.k8s.io/controller-runtime/pkg/webhook" 37 | 38 | inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1" 39 | "codeflare.dev/instaslice/internal/controller" 40 | //+kubebuilder:scaffold:imports 41 | ) 42 | 43 | var ( 44 | scheme = runtime.NewScheme() 45 | setupLog = ctrl.Log.WithName("setup") 46 | ) 47 | 48 | func init() { 49 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 50 | 51 | utilruntime.Must(inferencev1alpha1.AddToScheme(scheme)) 52 | //+kubebuilder:scaffold:scheme 53 | } 54 | 55 | func main() { 56 | var metricsAddr string 57 | var enableLeaderElection bool 58 | var probeAddr string 59 | var secureMetrics bool 60 | var enableHTTP2 bool 61 | flag.StringVar(&metricsAddr, "metrics-bind-address", ":8084", "The address the metric endpoint binds to.") 62 | flag.StringVar(&probeAddr, "health-probe-bind-address", ":8085", "The address the probe endpoint binds to.") 63 | flag.BoolVar(&enableLeaderElection, "leader-elect", false, 64 | "Enable leader election for controller manager. 
"+ 65 | "Enabling this will ensure there is only one active controller manager.") 66 | flag.BoolVar(&secureMetrics, "metrics-secure", false, 67 | "If set the metrics endpoint is served securely") 68 | flag.BoolVar(&enableHTTP2, "enable-http2", false, 69 | "If set, HTTP/2 will be enabled for the metrics and webhook servers") 70 | opts := zap.Options{ 71 | Development: true, 72 | } 73 | opts.BindFlags(flag.CommandLine) 74 | flag.Parse() 75 | 76 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 77 | 78 | // if the enable-http2 flag is false (the default), http/2 should be disabled 79 | // due to its vulnerabilities. More specifically, disabling http/2 will 80 | // prevent from being vulnerable to the HTTP/2 Stream Cancellation and 81 | // Rapid Reset CVEs. For more information see: 82 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 83 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 84 | disableHTTP2 := func(c *tls.Config) { 85 | setupLog.Info("disabling http/2") 86 | c.NextProtos = []string{"http/1.1"} 87 | } 88 | 89 | tlsOpts := []func(*tls.Config){} 90 | if !enableHTTP2 { 91 | tlsOpts = append(tlsOpts, disableHTTP2) 92 | } 93 | 94 | webhookServer := webhook.NewServer(webhook.Options{ 95 | TLSOpts: tlsOpts, 96 | }) 97 | 98 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ 99 | Scheme: scheme, 100 | Metrics: metricsserver.Options{ 101 | BindAddress: metricsAddr, 102 | SecureServing: secureMetrics, 103 | TLSOpts: tlsOpts, 104 | }, 105 | WebhookServer: webhookServer, 106 | HealthProbeBindAddress: probeAddr, 107 | LeaderElection: enableLeaderElection, 108 | LeaderElectionID: "7cbd68d6.codeflare.dev", 109 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily 110 | // when the Manager ends. This requires the binary to immediately end when the 111 | // Manager is stopped, otherwise, this setting is unsafe. 
Setting this significantly 112 | // speeds up voluntary leader transitions as the new leader don't have to wait 113 | // LeaseDuration time first. 114 | // 115 | // In the default scaffold provided, the program ends immediately after 116 | // the manager stops, so would be fine to enable this option. However, 117 | // if you are doing or is intended to do any operation such as perform cleanups 118 | // after the manager stops then its usage might be unsafe. 119 | // LeaderElectionReleaseOnCancel: true, 120 | }) 121 | if err != nil { 122 | setupLog.Error(err, "unable to start manager") 123 | os.Exit(1) 124 | } 125 | 126 | // if err = (&controller.InstasliceReconciler{ 127 | // Client: mgr.GetClient(), 128 | // Scheme: mgr.GetScheme(), 129 | // }).SetupWithManager(mgr); err != nil { 130 | // setupLog.Error(err, "unable to create controller", "controller", "Instaslice") 131 | // os.Exit(1) 132 | // } 133 | 134 | if err = (&controller.InstaSliceDaemonsetReconciler{ 135 | Client: mgr.GetClient(), 136 | Scheme: mgr.GetScheme(), 137 | }).SetupWithManager(mgr); err != nil { 138 | setupLog.Error(err, "unable to create controller", "controller", "InstaSliceDaemonsetReconciler") 139 | //os.Exit(1) 140 | } 141 | //+kubebuilder:scaffold:builder 142 | 143 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 144 | setupLog.Error(err, "unable to set up health check") 145 | os.Exit(1) 146 | } 147 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 148 | setupLog.Error(err, "unable to set up ready check") 149 | os.Exit(1) 150 | } 151 | 152 | // setupLog.Info("starting CLI App") 153 | // os.Args = []string{ 154 | // filepath.Base("set-nas-status"), 155 | // "--status=NotReady", 156 | // "--node-name=kind-control-plane", 157 | // } 158 | // if err := newApp().Run(os.Args); err != nil { 159 | // fmt.Fprintf(os.Stderr, "Error: %v\n", err) 160 | // os.Exit(1) 161 | // } 162 | 163 | setupLog.Info("starting daemonset") 164 | if err := 
mgr.Start(ctrl.SetupSignalHandler()); err != nil { 165 | setupLog.Error(err, "problem running manager") 166 | os.Exit(1) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /config/crd/bases/inference.codeflare.dev_instaslices.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | controller-gen.kubebuilder.io/version: v0.14.0 7 | name: instaslices.inference.codeflare.dev 8 | spec: 9 | group: inference.codeflare.dev 10 | names: 11 | kind: Instaslice 12 | listKind: InstasliceList 13 | plural: instaslices 14 | singular: instaslice 15 | scope: Namespaced 16 | versions: 17 | - name: v1alpha1 18 | schema: 19 | openAPIV3Schema: 20 | description: Instaslice is the Schema for the instaslices API 21 | properties: 22 | apiVersion: 23 | description: |- 24 | APIVersion defines the versioned schema of this representation of an object. 25 | Servers should convert recognized schemas to the latest internal value, and 26 | may reject unrecognized values. 27 | More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources 28 | type: string 29 | kind: 30 | description: |- 31 | Kind is a string value representing the REST resource this object represents. 32 | Servers may infer this from the endpoint the client submits requests to. 33 | Cannot be updated. 34 | In CamelCase. 
35 | More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds 36 | type: string 37 | metadata: 38 | type: object 39 | spec: 40 | description: InstasliceSpec defines the desired state of Instaslice 41 | properties: 42 | MigGPUUUID: 43 | additionalProperties: 44 | type: string 45 | type: object 46 | allocations: 47 | additionalProperties: 48 | description: Define the struct for allocation details 49 | properties: 50 | allocationStatus: 51 | type: string 52 | ciProfileid: 53 | type: integer 54 | ciengprofileid: 55 | type: integer 56 | giprofileid: 57 | type: integer 58 | gpuUUID: 59 | type: string 60 | namespace: 61 | type: string 62 | nodename: 63 | type: string 64 | podName: 65 | type: string 66 | podUUID: 67 | type: string 68 | profile: 69 | type: string 70 | size: 71 | format: int32 72 | type: integer 73 | start: 74 | format: int32 75 | type: integer 76 | required: 77 | - allocationStatus 78 | - ciProfileid 79 | - ciengprofileid 80 | - giprofileid 81 | - gpuUUID 82 | - namespace 83 | - nodename 84 | - podName 85 | - podUUID 86 | - profile 87 | - size 88 | - start 89 | type: object 90 | description: GPUID, Profile, start, podUUID 91 | type: object 92 | migplacement: 93 | items: 94 | properties: 95 | ciProfileid: 96 | type: integer 97 | ciengprofileid: 98 | type: integer 99 | giprofileid: 100 | type: integer 101 | placements: 102 | items: 103 | properties: 104 | size: 105 | type: integer 106 | start: 107 | type: integer 108 | required: 109 | - size 110 | - start 111 | type: object 112 | type: array 113 | profile: 114 | type: string 115 | required: 116 | - ciProfileid 117 | - ciengprofileid 118 | - giprofileid 119 | type: object 120 | type: array 121 | prepared: 122 | additionalProperties: 123 | description: Define the struct for allocation details 124 | properties: 125 | ciinfo: 126 | format: int32 127 | type: integer 128 | giinfo: 129 | format: int32 130 | type: integer 131 | parent: 132 | type: string 133 | 
podUUID: 134 | description: UUID of the pod for which this slice was prepared 135 | type: string 136 | profile: 137 | type: string 138 | size: 139 | format: int32 140 | type: integer 141 | start: 142 | format: int32 143 | type: integer 144 | required: 145 | - ciinfo 146 | - giinfo 147 | - parent 148 | - podUUID 149 | - profile 150 | - size 151 | - start 152 | type: object 153 | description: 'Prepared : GPUID, Profile, start' 154 | type: object 155 | type: object 156 | status: 157 | description: InstasliceStatus defines the observed state of Instaslice 158 | properties: 159 | processed: 160 | type: string 161 | type: object 162 | type: object 163 | served: true 164 | storage: true 165 | subresources: 166 | status: {} 167 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/inference.codeflare.dev_instaslices.yaml 6 | #+kubebuilder:scaffold:crdkustomizeresource 7 | 8 | patches: 9 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 10 | # patches here are for enabling the conversion webhook for each CRD 11 | #- path: patches/webhook_in_instaslices.yaml 12 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 13 | 14 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. 15 | # patches here are for enabling the CA injection for each CRD 16 | #- path: patches/cainjection_in_instaslices.yaml 17 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 18 | 19 | # [WEBHOOK] To enable webhook, uncomment the following section 20 | # the following config is for teaching kustomize how to do kustomization for CRDs.
21 | 22 | #configurations: 23 | #- kustomizeconfig.yaml 24 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: instaslicev2-system 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: instaslicev2- 10 | 11 | # Labels to add to all resources and selectors. 12 | #labels: 13 | #- includeSelectors: true 14 | # pairs: 15 | # someName: someValue 16 | 17 | resources: 18 | - ../crd 19 | - ../rbac 20 | - ../manager 21 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 22 | # crd/kustomization.yaml 23 | #- ../webhook 24 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 25 | #- ../certmanager 26 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 
27 | #- ../prometheus 28 | 29 | patches: 30 | # Protect the /metrics endpoint by putting it behind auth. 31 | # If you want your controller-manager to expose the /metrics 32 | # endpoint w/o any authn/z, please comment the following line. 33 | - path: manager_auth_proxy_patch.yaml 34 | 35 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 36 | # crd/kustomization.yaml 37 | #- path: manager_webhook_patch.yaml 38 | 39 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 40 | # Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. 41 | # 'CERTMANAGER' needs to be enabled to use ca injection 42 | #- path: webhookcainjection_patch.yaml 43 | 44 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 45 | # Uncomment the following replacements to add the cert-manager CA injection annotations 46 | #replacements: 47 | # - source: # Add cert-manager annotation to ValidatingWebhookConfiguration, MutatingWebhookConfiguration and CRDs 48 | # kind: Certificate 49 | # group: cert-manager.io 50 | # version: v1 51 | # name: serving-cert # this name should match the one in certificate.yaml 52 | # fieldPath: .metadata.namespace # namespace of the certificate CR 53 | # targets: 54 | # - select: 55 | # kind: ValidatingWebhookConfiguration 56 | # fieldPaths: 57 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 58 | # options: 59 | # delimiter: '/' 60 | # index: 0 61 | # create: true 62 | # - select: 63 | # kind: MutatingWebhookConfiguration 64 | # fieldPaths: 65 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 66 | # options: 67 | # delimiter: '/' 68 | # index: 0 69 | # create: true 70 | # - select: 71 | # kind: CustomResourceDefinition 72 | # fieldPaths: 73 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 74 | # options: 75 | # delimiter: '/' 76 | # index: 0 77 | # create: 
true 78 | # - source: 79 | # kind: Certificate 80 | # group: cert-manager.io 81 | # version: v1 82 | # name: serving-cert # this name should match the one in certificate.yaml 83 | # fieldPath: .metadata.name 84 | # targets: 85 | # - select: 86 | # kind: ValidatingWebhookConfiguration 87 | # fieldPaths: 88 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 89 | # options: 90 | # delimiter: '/' 91 | # index: 1 92 | # create: true 93 | # - select: 94 | # kind: MutatingWebhookConfiguration 95 | # fieldPaths: 96 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 97 | # options: 98 | # delimiter: '/' 99 | # index: 1 100 | # create: true 101 | # - select: 102 | # kind: CustomResourceDefinition 103 | # fieldPaths: 104 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 105 | # options: 106 | # delimiter: '/' 107 | # index: 1 108 | # create: true 109 | # - source: # Add cert-manager annotation to the webhook Service 110 | # kind: Service 111 | # version: v1 112 | # name: webhook-service 113 | # fieldPath: .metadata.name # namespace of the service 114 | # targets: 115 | # - select: 116 | # kind: Certificate 117 | # group: cert-manager.io 118 | # version: v1 119 | # fieldPaths: 120 | # - .spec.dnsNames.0 121 | # - .spec.dnsNames.1 122 | # options: 123 | # delimiter: '.' 124 | # index: 0 125 | # create: true 126 | # - source: 127 | # kind: Service 128 | # version: v1 129 | # name: webhook-service 130 | # fieldPath: .metadata.namespace # namespace of the service 131 | # targets: 132 | # - select: 133 | # kind: Certificate 134 | # group: cert-manager.io 135 | # version: v1 136 | # fieldPaths: 137 | # - .spec.dnsNames.0 138 | # - .spec.dnsNames.1 139 | # options: 140 | # delimiter: '.' 
141 | # index: 1 142 | # create: true 143 | -------------------------------------------------------------------------------- /config/default/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch inject a sidecar container which is a HTTP proxy for the 2 | # controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: controller-manager 7 | namespace: system 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: kube-rbac-proxy 13 | securityContext: 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | drop: 17 | - "ALL" 18 | image: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0 19 | args: 20 | - "--secure-listen-address=0.0.0.0:8443" 21 | - "--upstream=http://127.0.0.1:8080/" 22 | - "--logtostderr=true" 23 | - "--v=0" 24 | ports: 25 | - containerPort: 8443 26 | protocol: TCP 27 | name: https 28 | resources: 29 | limits: 30 | cpu: 500m 31 | memory: 128Mi 32 | requests: 33 | cpu: 5m 34 | memory: 64Mi 35 | - name: manager 36 | args: 37 | - "--health-probe-bind-address=:8081" 38 | - "--metrics-bind-address=127.0.0.1:8080" 39 | - "--leader-elect" 40 | -------------------------------------------------------------------------------- /config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: controller 7 | 
newName: asm582/instaslicev2-controller 8 | newTag: latest 9 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: namespace 7 | app.kubernetes.io/instance: system 8 | app.kubernetes.io/component: manager 9 | app.kubernetes.io/created-by: instaslicev2 10 | app.kubernetes.io/part-of: instaslicev2 11 | app.kubernetes.io/managed-by: kustomize 12 | name: system 13 | --- 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: controller-manager 18 | namespace: system 19 | labels: 20 | control-plane: controller-manager 21 | app.kubernetes.io/name: deployment 22 | app.kubernetes.io/instance: controller-manager 23 | app.kubernetes.io/component: manager 24 | app.kubernetes.io/created-by: instaslicev2 25 | app.kubernetes.io/part-of: instaslicev2 26 | app.kubernetes.io/managed-by: kustomize 27 | spec: 28 | selector: 29 | matchLabels: 30 | control-plane: controller-manager 31 | replicas: 1 32 | template: 33 | metadata: 34 | annotations: 35 | kubectl.kubernetes.io/default-container: manager 36 | labels: 37 | control-plane: controller-manager 38 | spec: 39 | # TODO(user): Uncomment the following code to configure the nodeAffinity expression 40 | # according to the platforms which are supported by your solution. 41 | # It is considered best practice to support multiple architectures. You can 42 | # build your manager image using the makefile target docker-buildx. 
43 | # affinity: 44 | # nodeAffinity: 45 | # requiredDuringSchedulingIgnoredDuringExecution: 46 | # nodeSelectorTerms: 47 | # - matchExpressions: 48 | # - key: kubernetes.io/arch 49 | # operator: In 50 | # values: 51 | # - amd64 52 | # - arm64 53 | # - ppc64le 54 | # - s390x 55 | # - key: kubernetes.io/os 56 | # operator: In 57 | # values: 58 | # - linux 59 | securityContext: 60 | runAsNonRoot: false 61 | # TODO(user): For common cases that do not require escalating privileges 62 | # it is recommended to ensure that all your Pods/Containers are restrictive. 63 | # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted 64 | # Please uncomment the following code if your project does NOT have to work on old Kubernetes 65 | # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 66 | # seccompProfile: 67 | # type: RuntimeDefault 68 | containers: 69 | - command: 70 | - /manager 71 | args: 72 | - --leader-elect 73 | image: asm582/instaslicev2-controller:latest 74 | name: manager 75 | securityContext: 76 | allowPrivilegeEscalation: true 77 | privileged: true 78 | capabilities: 79 | drop: 80 | - "ALL" 81 | livenessProbe: 82 | httpGet: 83 | path: /healthz 84 | port: 8081 85 | initialDelaySeconds: 15 86 | periodSeconds: 20 87 | readinessProbe: 88 | httpGet: 89 | path: /readyz 90 | port: 8081 91 | initialDelaySeconds: 5 92 | periodSeconds: 10 93 | # TODO(user): Configure the resources accordingly based on the project requirements. 
94 | # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 95 | resources: 96 | limits: 97 | cpu: 500m 98 | memory: 128Mi 99 | requests: 100 | cpu: 10m 101 | memory: 64Mi 102 | serviceAccountName: controller-manager 103 | terminationGracePeriodSeconds: 10 104 | --- 105 | apiVersion: apps/v1 106 | kind: DaemonSet 107 | metadata: 108 | name: controller-daemonset 109 | namespace: system 110 | labels: 111 | app: controller-daemonset 112 | spec: 113 | selector: 114 | matchLabels: 115 | app: controller-daemonset 116 | template: 117 | metadata: 118 | annotations: 119 | kubectl.kubernetes.io/default-container: daemonset 120 | labels: 121 | app: controller-daemonset 122 | spec: 123 | securityContext: 124 | runAsNonRoot: false 125 | containers: 126 | - command: 127 | - /daemonset 128 | args: 129 | - --leader-elect 130 | name: daemonset 131 | image: asm582/instaslicev2-daemonset:latest 132 | securityContext: 133 | allowPrivilegeEscalation: true 134 | privileged: true 135 | capabilities: 136 | add: 137 | - "ALL" 138 | resources: 139 | limits: 140 | cpu: 500m 141 | memory: 128Mi 142 | requests: 143 | cpu: 10m 144 | memory: 64Mi 145 | env: 146 | - name: NODE_NAME 147 | valueFrom: 148 | fieldRef: 149 | fieldPath: spec.nodeName 150 | - name: NVIDIA_MIG_CONFIG_DEVICES 151 | value: all 152 | serviceAccountName: controller-manager 153 | terminationGracePeriodSeconds: 10 154 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | labels: 6 | 
control-plane: controller-manager 7 | app.kubernetes.io/name: servicemonitor 8 | app.kubernetes.io/instance: controller-manager-metrics-monitor 9 | app.kubernetes.io/component: metrics 10 | app.kubernetes.io/created-by: instaslicev2 11 | app.kubernetes.io/part-of: instaslicev2 12 | app.kubernetes.io/managed-by: kustomize 13 | name: controller-manager-metrics-monitor 14 | namespace: system 15 | spec: 16 | endpoints: 17 | - path: /metrics 18 | port: https 19 | scheme: https 20 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 21 | tlsConfig: 22 | insecureSkipVerify: true 23 | selector: 24 | matchLabels: 25 | control-plane: controller-manager 26 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: metrics-reader 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: instaslicev2 9 | app.kubernetes.io/part-of: instaslicev2 10 | app.kubernetes.io/managed-by: kustomize 11 | name: metrics-reader 12 | rules: 13 | - nonResourceURLs: 14 | - "/metrics" 15 | verbs: 16 | - get 17 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: proxy-role 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: instaslicev2 9 | app.kubernetes.io/part-of: instaslicev2 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-role 12 | rules: 13 | - apiGroups: 14 | - authentication.k8s.io 15 
| resources: 16 | - tokenreviews 17 | verbs: 18 | - create 19 | - apiGroups: 20 | - authorization.k8s.io 21 | resources: 22 | - subjectaccessreviews 23 | verbs: 24 | - create 25 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: proxy-rolebinding 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: instaslicev2 9 | app.kubernetes.io/part-of: instaslicev2 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: proxy-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: service 7 | app.kubernetes.io/instance: controller-manager-metrics-service 8 | app.kubernetes.io/component: kube-rbac-proxy 9 | app.kubernetes.io/created-by: instaslicev2 10 | app.kubernetes.io/part-of: instaslicev2 11 | app.kubernetes.io/managed-by: kustomize 12 | name: controller-manager-metrics-service 13 | namespace: system 14 | spec: 15 | ports: 16 | - name: https 17 | port: 8443 18 | protocol: TCP 19 | targetPort: https 20 | selector: 21 | control-plane: controller-manager 22 | -------------------------------------------------------------------------------- /config/rbac/instaslice_editor_role.yaml: 
-------------------------------------------------------------------------------- 1 | # permissions for end users to edit instaslices. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: instaslice-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: instaslicev2 10 | app.kubernetes.io/part-of: instaslicev2 11 | app.kubernetes.io/managed-by: kustomize 12 | name: instaslice-editor-role 13 | rules: 14 | - apiGroups: 15 | - inference.codeflare.dev 16 | resources: 17 | - instaslices 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - inference.codeflare.dev 28 | resources: 29 | - instaslices/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/instaslice_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view instaslices. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: instaslice-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: instaslicev2 10 | app.kubernetes.io/part-of: instaslicev2 11 | app.kubernetes.io/managed-by: kustomize 12 | name: instaslice-viewer-role 13 | rules: 14 | - apiGroups: 15 | - inference.codeflare.dev 16 | resources: 17 | - instaslices 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - inference.codeflare.dev 24 | resources: 25 | - instaslices/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - leader_election_role.yaml 11 | - leader_election_role_binding.yaml 12 | # Comment the following 4 lines if you want to disable 13 | # the auth proxy (https://github.com/brancz/kube-rbac-proxy) 14 | # which protects your /metrics endpoint. 15 | - auth_proxy_service.yaml 16 | - auth_proxy_role.yaml 17 | - auth_proxy_role_binding.yaml 18 | - auth_proxy_client_clusterrole.yaml 19 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: role 7 | app.kubernetes.io/instance: leader-election-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: instaslicev2 10 | app.kubernetes.io/part-of: instaslicev2 11 | app.kubernetes.io/managed-by: kustomize 12 | name: leader-election-role 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - configmaps 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - create 23 | - update 24 | - patch 25 | - delete 26 | - apiGroups: 27 | - coordination.k8s.io 28 | resources: 29 | - leases 30 | verbs: 31 | - get 32 | - list 33 | - watch 34 | - create 35 | - update 36 | - patch 37 | - delete 38 | - apiGroups: 39 | - "" 40 | resources: 41 | - events 42 | verbs: 43 | - create 44 | - patch 45 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: rolebinding 6 | app.kubernetes.io/instance: leader-election-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: instaslicev2 9 | app.kubernetes.io/part-of: instaslicev2 10 | app.kubernetes.io/managed-by: kustomize 11 | name: leader-election-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: Role 15 | name: leader-election-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | 
resources: 10 | - configmaps 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - nodes 23 | verbs: 24 | - create 25 | - delete 26 | - get 27 | - list 28 | - patch 29 | - update 30 | - watch 31 | - apiGroups: 32 | - "" 33 | resources: 34 | - nodes/status 35 | verbs: 36 | - get 37 | - patch 38 | - update 39 | - apiGroups: 40 | - "" 41 | resources: 42 | - pods 43 | verbs: 44 | - create 45 | - delete 46 | - get 47 | - list 48 | - patch 49 | - update 50 | - watch 51 | - apiGroups: 52 | - inference.codeflare.dev 53 | resources: 54 | - instaslices 55 | verbs: 56 | - create 57 | - delete 58 | - get 59 | - list 60 | - patch 61 | - update 62 | - watch 63 | - apiGroups: 64 | - inference.codeflare.dev 65 | resources: 66 | - instaslices/finalizers 67 | verbs: 68 | - update 69 | - apiGroups: 70 | - inference.codeflare.dev 71 | resources: 72 | - instaslices/status 73 | verbs: 74 | - get 75 | - patch 76 | - update 77 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: manager-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: instaslicev2 9 | app.kubernetes.io/part-of: instaslicev2 10 | app.kubernetes.io/managed-by: kustomize 11 | name: manager-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: manager-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | # --- 21 | # apiVersion: rbac.authorization.k8s.io/v1 22 | # kind: ClusterRoleBinding 23 | # metadata: 24 | # name: system:openshift:scc:privileged 25 | # roleRef: 26 | # 
apiGroup: rbac.authorization.k8s.io 27 | # kind: ClusterRole 28 | # name: system:openshift:scc:privileged 29 | # subjects: 30 | # - kind: ServiceAccount 31 | # name: controller-manager 32 | # namespace: system -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: instaslicev2 6 | app.kubernetes.io/managed-by: kustomize 7 | name: controller-manager 8 | namespace: system 9 | -------------------------------------------------------------------------------- /deploy/custom-configmapwithprofiles.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: test 5 | namespace: gpu-operator 6 | data: 7 | update-capacity: |- 8 | version: v1 9 | flags: 10 | migStrategy: mixed 11 | update-capacity-1: |- 12 | version: v1 13 | flags: 14 | migStrategy: mixed 15 | 16 | -------------------------------------------------------------------------------- /deploy/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create the Kind cluster 4 | kind create cluster --config - < 30*time.Second { 107 | if controllerutil.RemoveFinalizer(pod, "org.instaslice/accelarator") { 108 | if err := r.Update(ctx, pod); err != nil { 109 | log.FromContext(ctx).Info("unable to update removal of finalizer, retrying") 110 | return ctrl.Result{RequeueAfter: 1 * time.Second}, nil 111 | } 112 | log.FromContext(ctx).Info("finalizer deleted") 113 | allocation.Allocationstatus = "deleted" 114 | var updateInstasliceObject inferencev1alpha1.Instaslice 115 | typeNamespacedName := types.NamespacedName{ 116 | Name: instaslice.Name, 117 | Namespace: "default", // TODO: modify 118 | } 119 | err := r.Get(ctx, typeNamespacedName, 
						&updateInstasliceObject)
					if err != nil {
						log.FromContext(ctx).Error(err, "error getting latest instaslice object")
					}
					// Write the allocation (now marked deleted) onto the freshly fetched
					// object so we do not clobber concurrent daemonset updates.
					updateInstasliceObject.Spec.Allocations[podUuid] = allocation
					errUpdatingInstaslice := r.Update(ctx, &updateInstasliceObject)
					if errUpdatingInstaslice != nil {
						log.FromContext(ctx).Info("unable to set instaslice to state deleted for ", "pod", allocation.PodName)
						// Update likely hit a stale resourceVersion; retry shortly.
						return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
					}
				}

				} else {
					// Grace window not elapsed yet; requeue for the remainder.
					remainingTime := 30*time.Second - elapsed
					return ctrl.Result{RequeueAfter: remainingTime}, nil
				}
			}
		}

		}
	}
	// Exit after handling deletion event for a pod.
	return ctrl.Result{}, nil
	}

	// find allocation in the cluster for the pod
	// set allocationstatus to creating when controller adds the allocation
	// check for allocationstatus as created when daemonset is done realizing the slice on the GPU node.
	// set allocationstatus to ungated and ungate the pod so that the workload can begin execution.
	if isPodGated {
		//Assume pod only has one container with one GPU requests
		if len(pod.Spec.Containers) != 1 {
			return ctrl.Result{}, fmt.Errorf("multiple containers per pod not supported")
		}
		limits := pod.Spec.Containers[0].Resources.Limits
		profileName := r.extractProfileName(limits)
		for _, instaslice := range instasliceList.Items {
			for podUuid, allocations := range instaslice.Spec.Allocations {
				if allocations.Allocationstatus == "created" && allocations.PodUUID == string(pod.UID) {
					// NOTE: shadows the outer pod variable with the ungated copy.
					pod := r.unGatePod(pod)
					errForUngating := r.Update(ctx, pod)
					if errForUngating != nil {
						// Pod updates are retried as the controller is the only entity working on pod updates.
						return ctrl.Result{Requeue: true}, nil
					}
					allocations.Allocationstatus = "ungated"
					instaslice.Spec.Allocations[podUuid] = allocations
					// Re-fetch before update to carry the latest resourceVersion.
					var updateInstasliceObject inferencev1alpha1.Instaslice
					typeNamespacedName := types.NamespacedName{
						Name:      instaslice.Name,
						Namespace: "default", // TODO: modify
					}
					err := r.Get(ctx, typeNamespacedName, &updateInstasliceObject)
					if err != nil {
						log.FromContext(ctx).Error(err, "error getting latest instaslice object")
					}
					if updateInstasliceObject.Spec.Allocations == nil {
						updateInstasliceObject.Spec.Allocations = make(map[string]inferencev1alpha1.AllocationDetails)
					}
					updateInstasliceObject.Spec.Allocations[podUuid] = allocations
					if err := r.Update(ctx, &updateInstasliceObject); err != nil {
						log.FromContext(ctx).Error(err, "Error updating instaslice allocations")
						return ctrl.Result{Requeue: true}, nil
					}
					return ctrl.Result{}, nil
				}
			}
		}
		//pod does not have an allocation yet, make allocation
		//Find the node
		podHasNodeAllocation := false
		for _, instaslice := range instasliceList.Items {
			//Find the GPU on the node and the GPU index where the slice can be created
			allocDetails, err := r.findDeviceForASlice(&instaslice, profileName, policy, pod)
			if err != nil {
				log.FromContext(ctx).Info("sufficient capacity not available to allocate GPU for ", "pod", pod.Name, "node", instaslice.Name)
				continue
			}
			podHasNodeAllocation = true
			// If an identical slice is still awaiting physical deletion on the GPU,
			// hold off so the daemonset can finish cleanup first.
			for _, item := range instaslice.Spec.Prepared {
				if item.Parent == allocDetails.GPUUUID && item.Size == allocDetails.Size && item.Start == allocDetails.Start {
					log.FromContext(ctx).Info("prepared allocation is yet to be deleted, retrying new allocation")
					return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
				}
			}
			if podHasNodeAllocation {
				var updateInstasliceObject inferencev1alpha1.Instaslice
				typeNamespacedName := types.NamespacedName{
					Name:      instaslice.Name,
					Namespace: "default", // TODO: modify
				}
				err := r.Get(ctx, typeNamespacedName, &updateInstasliceObject)
				if err != nil {
					log.FromContext(ctx).Error(err, "error getting latest instaslice object")
				}
				log.FromContext(ctx).Info("allocation obtained for ", "pod", allocDetails.PodName)
				if updateInstasliceObject.Spec.Allocations == nil {
					updateInstasliceObject.Spec.Allocations = make(map[string]inferencev1alpha1.AllocationDetails)
				}
				updateInstasliceObject.Spec.Allocations[string(pod.UID)] = *allocDetails
				if err := r.Update(ctx, &updateInstasliceObject); err != nil {
					log.FromContext(ctx).Error(err, "Error updating instaslice allocations")
					return ctrl.Result{Requeue: true}, nil
				}
			} else {
				log.FromContext(ctx).Info("requeuing, cluster does not have resources for ", "pod", pod.Name)
				return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
			}
		}
		//if the cluster does not have suitable node, requeue request
		if !podHasNodeAllocation {
			log.FromContext(ctx).Info("no suitable node found in cluster for ", "pod", pod.Name)
			return ctrl.Result{RequeueAfter: 2 * time.Second}, nil
		}

	}

	// no gated pod or dangling reference found
	return ctrl.Result{}, nil
}

// findDeviceForASlice walks the GPUs recorded on the given Instaslice node object and
// returns allocation details for the first GPU that has a free contiguous placement
// for the requested profile; an error is returned when no GPU fits.
func (r *InstasliceReconciler) findDeviceForASlice(instaslice *inferencev1alpha1.Instaslice, profileName string, policy AllocationPolicy, pod *v1.Pod) (*inferencev1alpha1.AllocationDetails, error) {
	//TODO: discover this value, this may work for A100 and H100 for now.
242 | for gpuuuid, _ := range instaslice.Spec.MigGPUUUID { 243 | if instaslice.Spec.Allocations == nil { 244 | instaslice.Spec.Allocations = make(map[string]inferencev1alpha1.AllocationDetails) 245 | } 246 | newStart := r.getStartIndexFromPreparedState(instaslice, gpuuuid, profileName) 247 | //size cannot be 9 atleast for A100s 40GB/80GB and H100 variants 248 | notValidIndex := uint32(9) 249 | if newStart == notValidIndex { 250 | //Move to next GPU 251 | continue 252 | } 253 | size, discoveredGiprofile, Ciprofileid, Ciengprofileid := r.extractGpuProfile(instaslice, profileName) 254 | allocDetails := policy.SetAllocationDetails(profileName, uint32(newStart), uint32(size), 255 | string(pod.UID), instaslice.Name, "creating", discoveredGiprofile, 256 | Ciprofileid, Ciengprofileid, pod.Namespace, pod.Name, gpuuuid) 257 | //instaslice.Spec.Allocations[string(pod.UID)] = *allocDetails 258 | return allocDetails, nil 259 | } 260 | 261 | return nil, fmt.Errorf("failed to find allocatable gpu") 262 | } 263 | 264 | // Extract profile name from the container limits spec 265 | func (*InstasliceReconciler) extractProfileName(limits v1.ResourceList) string { 266 | profileName := "" 267 | for k, _ := range limits { 268 | if strings.Contains(k.String(), "nvidia") { 269 | 270 | re := regexp.MustCompile(`(\d+g\.\d+gb)`) 271 | match := re.FindStringSubmatch(k.String()) 272 | if len(match) > 1 { 273 | profileName = match[1] 274 | } else { 275 | log.Log.Info("No match found") 276 | } 277 | } 278 | } 279 | return profileName 280 | } 281 | 282 | // Extract NVML specific attributes for GPUs, this will change for different generations of the GPU. 
283 | func (*InstasliceReconciler) extractGpuProfile(instaslice *inferencev1alpha1.Instaslice, profileName string) (int, int, int, int) { 284 | var size int 285 | var discoveredGiprofile int 286 | var Ciprofileid int 287 | var Ciengprofileid int 288 | for _, item := range instaslice.Spec.Migplacement { 289 | if item.Profile == profileName { 290 | for _, aPlacement := range item.Placements { 291 | size = aPlacement.Size 292 | discoveredGiprofile = item.Giprofileid 293 | Ciprofileid = item.CIProfileID 294 | Ciengprofileid = item.CIEngProfileID 295 | break 296 | } 297 | } 298 | } 299 | return size, discoveredGiprofile, Ciprofileid, Ciengprofileid 300 | } 301 | 302 | // accounting logic that finds the correct GPU and index where a slice could be placed. 303 | func (*InstasliceReconciler) getStartIndexFromPreparedState(instaslice *inferencev1alpha1.Instaslice, gpuUUID string, profileName string) uint32 { 304 | //TODO: generalize, A100 and H100 have 8 indexes for 3g and 7g and 7 for rest, so go with 8 and we are bounded by 305 | //only valid placement indexes for a profile. 
306 | var gpuAllocatedIndex [8]uint32 307 | // clean slate init 308 | for i := range gpuAllocatedIndex { 309 | gpuAllocatedIndex[i] = 0 310 | } 311 | //TODO: remove this once we start using GPU operator with device plugin fix 312 | for _, item := range instaslice.Spec.Prepared { 313 | if item.Parent == gpuUUID && item.PodUUID == "" { 314 | 315 | for i := 0; i < int(item.Size); i++ { 316 | gpuAllocatedIndex[int(item.Start)+i] = 1 317 | } 318 | 319 | } 320 | } 321 | 322 | for _, item := range instaslice.Spec.Allocations { 323 | if item.GPUUUID == gpuUUID { 324 | for i := 0; i < int(item.Size); i++ { 325 | gpuAllocatedIndex[int(item.Start)+i] = 1 326 | } 327 | } 328 | } 329 | 330 | var neededContinousSlot int 331 | var possiblePlacements []int 332 | for _, placement := range instaslice.Spec.Migplacement { 333 | if placement.Profile == profileName { 334 | neededContinousSlot = placement.Placements[0].Size 335 | for _, placement := range placement.Placements { 336 | possiblePlacements = append(possiblePlacements, placement.Start) 337 | } 338 | break 339 | } 340 | } 341 | //TODO: generalize for other hardware models like A30, no slices can be placed on 9th index 342 | //if we return 9 then assume no valid index is found. 
343 | var newStart = uint32(9) 344 | for _, value := range possiblePlacements { 345 | if gpuAllocatedIndex[value] == 0 { 346 | if neededContinousSlot == 1 { 347 | newStart = uint32(value) 348 | break 349 | } 350 | if neededContinousSlot == 2 { 351 | if value+neededContinousSlot < len(gpuAllocatedIndex) { 352 | if gpuAllocatedIndex[value] == 0 && gpuAllocatedIndex[value+1] == 0 { 353 | newStart = uint32(value) 354 | break 355 | } 356 | } 357 | 358 | } 359 | if neededContinousSlot == 4 { 360 | if value+neededContinousSlot < len(gpuAllocatedIndex) { 361 | if gpuAllocatedIndex[value] == 0 && gpuAllocatedIndex[value+1] == 0 && gpuAllocatedIndex[value+2] == 0 && gpuAllocatedIndex[value+3] == 0 { 362 | newStart = uint32(value) 363 | break 364 | } 365 | } 366 | } 367 | 368 | if neededContinousSlot == 8 { 369 | //special case 370 | if value+neededContinousSlot < len(gpuAllocatedIndex) { 371 | if gpuAllocatedIndex[value] == 0 && gpuAllocatedIndex[value+1] == 0 && 372 | gpuAllocatedIndex[value+2] == 0 && gpuAllocatedIndex[value+3] == 0 && 373 | gpuAllocatedIndex[value+4] == 0 && gpuAllocatedIndex[value+5] == 0 && 374 | gpuAllocatedIndex[value+6] == 0 && gpuAllocatedIndex[value+7] == 0 { 375 | newStart = uint32(value) 376 | } 377 | } 378 | } 379 | } 380 | 381 | } 382 | 383 | return newStart 384 | } 385 | 386 | func checkIfPodGated(pod *v1.Pod, isPodGated bool) bool { 387 | for _, gate := range pod.Spec.SchedulingGates { 388 | if gate.Name == "org.instaslice/accelarator" { 389 | if pod.Status.Phase == v1.PodPending && strings.Contains(pod.Status.Conditions[0].Message, "blocked") { 390 | isPodGated = true 391 | } 392 | } 393 | } 394 | return isPodGated 395 | } 396 | 397 | // podMapFunc maps pods to instaslice created allocations 398 | func (r *InstasliceReconciler) podMapFunc(ctx context.Context, obj client.Object) []reconcile.Request { 399 | instaslice := obj.(*inferencev1alpha1.Instaslice) 400 | for _, allocation := range instaslice.Spec.Allocations { 401 | if 
allocation.Allocationstatus == "created" { 402 | return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: allocation.Namespace, Name: allocation.PodName}}} 403 | } 404 | } 405 | 406 | return nil 407 | } 408 | 409 | // SetupWithManager sets up the controller with the Manager. 410 | func (r *InstasliceReconciler) SetupWithManager(mgr ctrl.Manager) error { 411 | 412 | restConfig := mgr.GetConfig() 413 | 414 | var err error 415 | r.kubeClient, err = kubernetes.NewForConfig(restConfig) 416 | if err != nil { 417 | return err 418 | } 419 | 420 | return ctrl.NewControllerManagedBy(mgr). 421 | For(&v1.Pod{}).Named("InstaSlice-controller"). 422 | Watches(&inferencev1alpha1.Instaslice{}, handler.EnqueueRequestsFromMapFunc(r.podMapFunc)). 423 | Complete(r) 424 | } 425 | 426 | func (r *InstasliceReconciler) unGatePod(podUpdate *v1.Pod) *v1.Pod { 427 | for i, gate := range podUpdate.Spec.SchedulingGates { 428 | if gate.Name == "org.instaslice/accelarator" { 429 | podUpdate.Spec.SchedulingGates = append(podUpdate.Spec.SchedulingGates[:i], podUpdate.Spec.SchedulingGates[i+1:]...) 
430 | } 431 | } 432 | return podUpdate 433 | } 434 | 435 | // Policy based allocation - FirstFit 436 | func (r *FirstFitPolicy) SetAllocationDetails(profileName string, newStart, size uint32, podUUID, nodename string, 437 | processed string, discoveredGiprofile int, Ciprofileid int, Ciengprofileid int, 438 | namespace string, podName string, gpuUuid string) *inferencev1alpha1.AllocationDetails { 439 | return &inferencev1alpha1.AllocationDetails{ 440 | Profile: profileName, 441 | Start: uint32(newStart), 442 | Size: uint32(size), 443 | PodUUID: podUUID, 444 | Nodename: nodename, 445 | Allocationstatus: processed, 446 | Giprofileid: discoveredGiprofile, 447 | CIProfileID: Ciprofileid, 448 | CIEngProfileID: Ciengprofileid, 449 | Namespace: namespace, 450 | PodName: podName, 451 | GPUUUID: gpuUuid, 452 | } 453 | } 454 | 455 | // Policy based allocation - LeftToRIght 456 | func (l *LeftToRightPolicy) SetAllocationDetails(profileName string, newStart, size uint32, podUUID, nodename string, 457 | processed string, discoveredGiprofile int, Ciprofileid int, Ciengprofileid int, 458 | namespace string, podName string, gpuUuid string) *inferencev1alpha1.AllocationDetails { 459 | // Implement the left-to-right policy here 460 | return &inferencev1alpha1.AllocationDetails{} 461 | } 462 | 463 | // Policy based allocation - RigghToLeft 464 | func (l *RightToLeftPolicy) SetAllocationDetails(profileName string, newStart, size uint32, podUUID, nodename string, 465 | processed string, discoveredGiprofile int, Ciprofileid int, Ciengprofileid int, 466 | namespace string, podName string, gpuUuid string) *inferencev1alpha1.AllocationDetails { 467 | // Implement the left-to-right policy here 468 | return &inferencev1alpha1.AllocationDetails{} 469 | } 470 | 471 | func isPodDeletionProcessed(str string, arr []string) bool { 472 | for _, v := range arr { 473 | if v == str { 474 | return false 475 | } 476 | } 477 | return true 478 | } 479 | 
--------------------------------------------------------------------------------
/internal/controller/instaslice_controller_test.go:
--------------------------------------------------------------------------------
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1"
)

// Scaffold envtest spec: creates an Instaslice CR, runs one reconcile pass against it,
// and deletes it again. k8sClient is provided by suite_test.go.
var _ = Describe("Instaslice Controller", func() {
	Context("When reconciling a resource", func() {
		const resourceName = "test-resource"

		ctx := context.Background()

		typeNamespacedName := types.NamespacedName{
			Name:      resourceName,
			Namespace: "default", // TODO(user):Modify as needed
		}
		instaslice := &inferencev1alpha1.Instaslice{}

		BeforeEach(func() {
			By("creating the custom resource for the Kind Instaslice")
			// Create the CR only when it does not already exist.
			err := k8sClient.Get(ctx, typeNamespacedName, instaslice)
			if err != nil && errors.IsNotFound(err) {
				resource := &inferencev1alpha1.Instaslice{
					ObjectMeta: metav1.ObjectMeta{
						Name:      resourceName,
						Namespace: "default",
					},
					// TODO(user): Specify other spec details if needed.
				}
				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
			}
		})

		AfterEach(func() {
			// TODO(user): Cleanup logic after each test, like removing the resource instance.
			resource := &inferencev1alpha1.Instaslice{}
			err := k8sClient.Get(ctx, typeNamespacedName, resource)
			Expect(err).NotTo(HaveOccurred())

			By("Cleanup the specific resource instance Instaslice")
			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
		})
		It("should successfully reconcile the resource", func() {
			By("Reconciling the created resource")
			controllerReconciler := &InstasliceReconciler{
				Client: k8sClient,
				Scheme: k8sClient.Scheme(),
			}

			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
				NamespacedName: typeNamespacedName,
			})
			Expect(err).NotTo(HaveOccurred())
			// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
			// Example: If you expect a certain status condition after reconciliation, verify it here.
		})
	})
})
--------------------------------------------------------------------------------
/internal/controller/instaslice_daemonset.go:
--------------------------------------------------------------------------------
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"strings"
	"time"

	inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1"
	"github.com/NVIDIA/go-nvml/pkg/nvml"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/manager"

	nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// InstaSliceDaemonsetReconciler reconciles a InstaSliceDaemonset object. It runs on
// each GPU node and realizes/tears down MIG slices requested via Instaslice objects.
type InstaSliceDaemonsetReconciler struct {
	client.Client
	// Scheme maps Go types to GroupVersionKinds.
	Scheme *runtime.Scheme
	// kubeClient is a typed clientset used for direct (non-cached) API calls.
	kubeClient *kubernetes.Clientset
	// NodeName identifies the node this daemonset instance manages.
	NodeName string
}

//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=nodes/status,verbs=get;update;patch
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;patch;delete

// discoveredGpusOnHost caches GPU UUIDs found on this host.
var discoveredGpusOnHost []string

// Additional handler used for making NVML calls.
type deviceHandler struct {
	nvdevice nvdevice.Interface
	nvml     nvml.Interface
}

// MigProfile describes a MIG profile; fields presumably follow NVIDIA's naming
// (C = compute slices, G = GPU slices, GB = memory) — TODO confirm against go-nvlib.
type MigProfile struct {
	C              int
	G              int
	GB             int
	GIProfileID    int
	CIProfileID    int
	CIEngProfileID int
}

// ResPatchOperation is a single JSON-patch operation used when patching node capacity.
type ResPatchOperation struct {
	Op    string `json:"op"`
	Path  string `json:"path"`
	Value string `json:"value"`
}

const (
	// AttributeMediaExtensions holds the string representation for the media extension MIG profile attribute.
	AttributeMediaExtensions = "me"
)

// preparedMig caches the identifiers of a slice realized on the GPU.
type preparedMig struct {
	gid     uint32 // GPU instance id
	miguuid string // MIG device UUID
	cid     uint32 // compute instance id
}

// cachedPreparedMig maps pod name -> slice details, so a retried reconcile does not
// attempt to create the same GI/CI twice. NOTE(review): package-level mutable state;
// assumes a single reconcile worker.
var cachedPreparedMig = make(map[string]preparedMig)

// Reconcile runs on the GPU node. For "creating" allocations it carves a GPU instance
// and compute instance via NVML, publishes a configmap and node capacity, and marks the
// allocation "created". For "deleted" allocations it tears everything down again.
func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {

	// The Instaslice object for this node is named after the node itself.
	nodeName := os.Getenv("NODE_NAME")
	nsName := types.NamespacedName{
		Name:      nodeName,
		Namespace: "default",
	}
	var instaslice inferencev1alpha1.Instaslice
	if err := r.Get(ctx, nsName, &instaslice); err != nil {
		log.FromContext(ctx).Error(err, "Error listing Instaslice")
	}

	for _, allocations := range instaslice.Spec.Allocations {
		if allocations.Allocationstatus == "creating" {
			//Assume pod only has one container with one GPU request
			log.FromContext(ctx).Info("creating allocation for ", "pod", allocations.PodName)
			var podUUID = allocations.PodUUID
			ret := nvml.Init()
			if ret != nvml.SUCCESS {
				log.FromContext(ctx).Error(ret, "Unable to initialize NVML")
			}

			availableGpus, ret := nvml.DeviceGetCount()
			if ret != nvml.SUCCESS {
				log.FromContext(ctx).Error(ret, "Unable to get device count")
			}

			// Advertise the per-pod extended resource on the node before slicing.
			if errCreatingInstaSliceResource := r.createInstaSliceResource(ctx, nodeName, allocations.PodName); errCreatingInstaSliceResource != nil {
				return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
			}

			deviceForMig, profileName, Giprofileid, Ciprofileid, CiEngProfileid := r.getAllocation(instaslice, allocations.PodUUID)
			placement := nvml.GpuInstancePlacement{}
			for i := 0; i < availableGpus; i++ {
				existingAllocations := instaslice.Spec.Allocations[podUUID]

				device, ret := nvml.DeviceGetHandleByIndex(i)
				if ret != nvml.SUCCESS {
					log.FromContext(ctx).Error(ret, "Unable to get device at index")
				}

				uuid, ret := device.GetUUID()
				if ret != nvml.SUCCESS {
					log.FromContext(ctx).Error(ret, "Unable to get uuid of device at index")
				}
				if deviceForMig != uuid {
					continue
				}

				//Move to next GPU as this is not the selected GPU by the controller.
				if allocations.GPUUUID != uuid {
					continue
				}
				//TODO: any GPU can fail creating CI and GI
				// Only carve a new slice when a previous reconcile has not already done so.
				if _, exists := cachedPreparedMig[allocations.PodName]; !exists {
					var giInfo nvml.GpuInstanceInfo
					log.FromContext(ctx).Info("Slice does not exists on GPU for ", "pod", allocations.PodName)

					device, retCodeForDevice := nvml.DeviceGetHandleByUUID(uuid)

					if retCodeForDevice != nvml.SUCCESS {
						log.FromContext(ctx).Error(ret, "error getting GPU device handle")
					}

					giProfileInfo, retCodeForGi := device.GetGpuInstanceProfileInfo(Giprofileid)
					if retCodeForGi != nvml.SUCCESS {
						log.FromContext(ctx).Error(retCodeForGi, "error getting GPU instance profile info", "giProfileInfo", giProfileInfo, "retCodeForGi", retCodeForGi)
					}

					log.FromContext(ctx).Info("The profile id is", "giProfileInfo", giProfileInfo.Id, "Memory", giProfileInfo.MemorySizeMB, "pod", podUUID)

					updatedPlacement, err := r.getAllocationsToprepare(ctx, placement, instaslice, allocations.PodUUID)
					if err != nil {
						log.FromContext(ctx).Error(err, "prepared already exists for ", "pod", allocations.PodName)
						return ctrl.Result{}, nil
					}

					// Create the GPU instance at the exact placement chosen by the controller.
					gi, retCodeForGiWithPlacement := device.CreateGpuInstanceWithPlacement(&giProfileInfo, &updatedPlacement)
					if retCodeForGiWithPlacement != nvml.SUCCESS {
						log.FromContext(ctx).Error(retCodeForGiWithPlacement, "error creating GPU instance for ", "gi", &gi)
					}
					giInfo, retForGiInfor := gi.GetInfo()
					if retForGiInfor != nvml.SUCCESS {
						log.FromContext(ctx).Error(retForGiInfor, "error getting GPU instance info for ", "giInfo", &giInfo)
						//TODO: clean up GI and then return
					}
					//TODO: figure out the compute slice scenario, I think Kubernetes does not support this use case yet
					ciProfileInfo, retCodeForCiProfile := gi.GetComputeInstanceProfileInfo(Ciprofileid, CiEngProfileid)
					if retCodeForCiProfile != nvml.SUCCESS {
						log.FromContext(ctx).Error(retCodeForCiProfile, "error getting Compute instance profile info for ", "ciProfileInfo", ciProfileInfo)
					}
					ci, retCodeForComputeInstance := gi.CreateComputeInstance(&ciProfileInfo)
					if retCodeForComputeInstance != nvml.SUCCESS {
						log.FromContext(ctx).Error(retCodeForComputeInstance, "error creating Compute instance for ", "ci", ci)
					}

					//get created mig details
					giId, migUUID, ciId := r.getCreatedSliceDetails(ctx, giInfo, ret, device, uuid, profileName)
					// Cache so a requeued reconcile reuses the slice instead of re-creating it.
					cachedPreparedMig[allocations.PodName] = preparedMig{gid: giId, miguuid: migUUID, cid: ciId}
				}

				createdSliceDetails := cachedPreparedMig[allocations.PodName]
				log.FromContext(ctx).Info("The created cache details loaded are for allocation ", "pod name", allocations.PodName, "slice details", createdSliceDetails)

				// Publish the MIG UUID to the workload via a configmap.
				if errCreatingConfigMap := r.createConfigMap(ctx, createdSliceDetails.miguuid, existingAllocations.Namespace, existingAllocations.PodName); errCreatingConfigMap != nil {
					return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
				}

				if errAddingPrepared := r.createPreparedEntry(ctx, profileName, podUUID, allocations.GPUUUID, createdSliceDetails.gid, createdSliceDetails.cid, &instaslice, createdSliceDetails.miguuid); errAddingPrepared != nil {
					return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
				}
				nodeName := os.Getenv("NODE_NAME")
				if errUpdatingNodeCapacity := r.updateNodeCapacity(ctx, nodeName); errUpdatingNodeCapacity != nil {
					return ctrl.Result{Requeue: true}, nil
				}
				// Re-fetch before updating status to avoid conflicts on stale objects.
				var updateInstasliceObject inferencev1alpha1.Instaslice
				typeNamespacedName := types.NamespacedName{
					Name:      instaslice.Name,
					Namespace: "default", // TODO: modify
				}
				err := r.Get(ctx, typeNamespacedName, &updateInstasliceObject)
				if err != nil {
					log.FromContext(ctx).Error(err, "error getting latest instaslice object")
				}
				existingAllocations.Allocationstatus = "created"
				updateInstasliceObject.Spec.Allocations[podUUID] = existingAllocations
				errForUpdate := r.Update(ctx, &updateInstasliceObject)
				if errForUpdate != nil {
					log.FromContext(ctx).Error(errForUpdate, "error adding prepared statement\n")
					return ctrl.Result{Requeue: true}, nil
				}

				return ctrl.Result{}, nil

			}

		}
		//TODO: if cm and instaslice resource does not exists, then slice was never created, can early terminate
		if allocations.Allocationstatus == "deleted" {
			log.FromContext(ctx).Info("Performing cleanup ", "pod", allocations.PodName)
			if errDeletingCm := r.deleteConfigMap(ctx, allocations.PodName, allocations.Namespace); errDeletingCm != nil {
				log.FromContext(ctx).Error(errDeletingCm, "error deleting configmap for ", "pod", allocations.PodName)
				return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
			}

			if errDeletingInstaSliceResource := r.cleanUpInstaSliceResource(ctx, allocations.PodName); errDeletingInstaSliceResource != nil {
				log.FromContext(ctx).Error(errDeletingInstaSliceResource, "Error deleting InstaSlice resource object")
				return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
			}

			nodeName := os.Getenv("NODE_NAME")
			if errUpdatingNodeCapacity := r.updateNodeCapacity(ctx, nodeName); errUpdatingNodeCapacity != nil {
				return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
			}
			// Destroy CI+GI on the GPU; returns the prepared-map key to remove.
			deletePrepared := r.cleanUpCiAndGi(ctx, allocations.PodUUID, instaslice)
			log.FromContext(ctx).Info("Done deleting ci and gi for ", "pod", allocations.PodName)
			delete(cachedPreparedMig, allocations.PodName)
			var updateInstasliceObject inferencev1alpha1.Instaslice
			typeNamespacedName := types.NamespacedName{
				Name:      instaslice.Name,
				Namespace: "default", // TODO: modify
			}
			err := r.Get(ctx, typeNamespacedName, &updateInstasliceObject)
			if err != nil {
				log.FromContext(ctx).Error(err, "error getting latest instaslice object")
			}
			delete(updateInstasliceObject.Spec.Prepared, deletePrepared)
			delete(updateInstasliceObject.Spec.Allocations, allocations.PodUUID)
			errUpdatingAllocation := r.Update(ctx, &updateInstasliceObject)
			if errUpdatingAllocation != nil {
				log.FromContext(ctx).Error(errUpdatingAllocation, "Error updating InstaSlice object for ", "pod", allocations.PodName)
				return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
			}

			return ctrl.Result{}, nil
		}

	}

	return ctrl.Result{}, nil
}

// createInstaSliceResource advertises a per-pod extended resource
// ("org.instaslice/<podName>": 1) on the node status via a JSON patch. It is a no-op
// when the capacity entry already exists.
func (r *InstaSliceDaemonsetReconciler) createInstaSliceResource(ctx context.Context, nodeName string, podName string) error {
	node := &v1.Node{}
	if err := r.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
		log.FromContext(ctx).Error(err, "unable to fetch Node")
		return err
	}
	capacityKey := "org.instaslice/" + podName
	//desiredCapacity := resource.MustParse("1")
	if _, exists := node.Status.Capacity[v1.ResourceName(capacityKey)]; exists {
		log.FromContext(ctx).Info("Node already patched with ", "capacity", capacityKey)
		return nil
	}
	patchData, err := createPatchData("org.instaslice/"+podName, "1")
	if err != nil {
		log.FromContext(ctx).Error(err, "unable to create correct json for patching node")
		return err
	}

	if err := r.Status().Patch(ctx, node, client.RawPatch(types.JSONPatchType, patchData)); err != nil {
		log.FromContext(ctx).Error(err, "unable to patch Node status")
		return err
	}
	return nil
}

// getAllocationsToprepare returns the placement (start/size) for the pod's "creating"
// allocation, or an error when a prepared entry for the pod already exists (meaning the
// slice was realized earlier and the object has not caught up yet).
func (r *InstaSliceDaemonsetReconciler) getAllocationsToprepare(ctx context.Context, placement nvml.GpuInstancePlacement, instaslice inferencev1alpha1.Instaslice, podUuid string) (nvml.GpuInstancePlacement, error) {
	allocationExists := false
	for _, v := range instaslice.Spec.Allocations {
		// A prepared entry for this pod means the slice already exists on the GPU.
		for _, prepared := range instaslice.Spec.Prepared {
			if prepared.PodUUID == podUuid {
				allocationExists = true
			}
		}
		if !allocationExists {
			if v.Allocationstatus == "creating" && v.PodUUID == podUuid {
				placement.Size = v.Size
				placement.Start = v.Start
				return placement, nil
			}
		}
	}
	//TODO: handle empty placement object
	log.FromContext(ctx).Info("placement not found for ", "podUuid", podUuid)
	return placement, fmt.Errorf("got prepared slice wait for object to be updated")
}

// getCreatedSliceDetails walks the MIG devices of the parent GPU and returns the
// (GPU instance id, MIG UUID, compute instance id) of the device matching the given
// profile name and GPU instance info. Returns zero values when no match is found.
func (*InstaSliceDaemonsetReconciler) getCreatedSliceDetails(ctx context.Context, giInfo nvml.GpuInstanceInfo, ret nvml.Return, device nvml.Device, uuid string, profileName string) (uint32, string, uint32) {

	// Use go-nvlib on top of NVML to enumerate MIG devices conveniently.
	h := &deviceHandler{}
	h.nvml = nvml.New()
	h.nvdevice = nvdevice.New(nvdevice.WithNvml(h.nvml))

	ret1 := h.nvml.Init()
	if ret1 != nvml.SUCCESS {
		log.FromContext(ctx).Error(ret, "Unable to initialize NVML")
	}
	nvlibParentDevice, err := h.nvdevice.NewDevice(device)
	if err != nil {
		log.FromContext(ctx).Error(err, "unable to get nvlib GPU parent device for MIG UUID")
	}
	migs, err := nvlibParentDevice.GetMigDevices()
	if err != nil {
		log.FromContext(ctx).Error(err, "unable to get MIG devices on GPU")
	}
	for _, mig := range migs {
		obtainedProfileName, _ := mig.GetProfile()
		giID, ret := mig.GetGpuInstanceId()
		if ret != nvml.SUCCESS {
			log.FromContext(ctx).Error(ret, "error getting GPU instance ID for MIG device")
		}
		gpuInstance, err1 := device.GetGpuInstanceById(giID)
		if err1 != nvml.SUCCESS {
			log.FromContext(ctx).Error(err1, "Unable to get GPU instance")
		}

		// Match on both profile name and the freshly created GPU instance id.
		if profileName == obtainedProfileName.String() && giID == int(giInfo.Id) {
			realizedMig, _ := mig.GetUUID()
			migCid, _ := mig.GetComputeInstanceId()
			ci, _ := gpuInstance.GetComputeInstanceById(migCid)
			ciMigInfo, _ := ci.GetInfo()
			log.FromContext(ctx).Info("device id is", "migUUID", giInfo.Device)
			log.FromContext(ctx).Info("Prepared details", "giId", giInfo.Id, "migUUID", realizedMig, "ciId", ciMigInfo.Id)
			return giInfo.Id, realizedMig, ciMigInfo.Id
		}
	}
	//TODO: handle this error
	return 0, "", 0
}

// getAllocation returns (GPU UUID, profile name, GI profile id, CI profile id,
// CI engineering profile id) of the pod's "creating" allocation, or
// ("", "", -1, -1, -1) when none exists.
func (r *InstaSliceDaemonsetReconciler) getAllocation(instaslice inferencev1alpha1.Instaslice, podUuid string) (string, string, int, int, int) {

	for _, v := range instaslice.Spec.Allocations {
		if v.Allocationstatus == "creating" && v.PodUUID == podUuid {
			return v.GPUUUID, v.Profile, v.Giprofileid, v.CIProfileID, v.CIEngProfileID
		}
	}
	//TODO handle error
	return "", "", -1, -1, -1
}

// cleanUpCiAndGi destroys the compute instance and GPU instance prepared for the pod
// and returns the MIG UUID key of the Prepared entry to delete ("" when none matched).
func (r *InstaSliceDaemonsetReconciler) cleanUpCiAndGi(ctx context.Context, podUuid string, instaslice inferencev1alpha1.Instaslice) string {
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.FromContext(ctx).Error(ret, "Unable to initialize NVML")
	}

	var candidateDel string
	prepared := instaslice.Spec.Prepared
	for migUUID, value := range prepared {
		if value.PodUUID == podUuid {
			parent, errRecievingDeviceHandle := nvml.DeviceGetHandleByUUID(value.Parent)
			if errRecievingDeviceHandle != nvml.SUCCESS {
				log.FromContext(ctx).Error(errRecievingDeviceHandle, "Error obtaining GPU handle")
			}
			gi, errRetrievingGi := parent.GetGpuInstanceById(int(value.Giinfoid))
			if errRetrievingGi != nvml.SUCCESS {
				log.FromContext(ctx).Error(errRetrievingGi, "Error obtaining GPU instance")
			}
			ci, errRetrievingCi := gi.GetComputeInstanceById(int(value.Ciinfoid))
			if errRetrievingCi != nvml.SUCCESS {
				log.FromContext(ctx).Error(errRetrievingCi, "Error obtaining Compute instance")
			}
			// The compute instance must be destroyed before its GPU instance.
			errDestroyingCi := ci.Destroy()
			if errDestroyingCi != nvml.SUCCESS {
				log.FromContext(ctx).Error(errDestroyingCi, "Error deleting Compute instance")
			}
			errDestroyingGi := gi.Destroy()
			if errDestroyingGi != nvml.SUCCESS {
				log.FromContext(ctx).Error(errDestroyingGi, "Error deleting GPU instance")
			}
			candidateDel = migUUID
			log.FromContext(ctx).Info("Done deleting MIG slice for pod", "UUID", value.PodUUID)
		}
	}

	return candidateDel
}

// cleanUpInstaSliceResource removes the per-pod extended resource from the node status
// via a JSON patch, undoing createInstaSliceResource.
func (r *InstaSliceDaemonsetReconciler) cleanUpInstaSliceResource(ctx context.Context, podName string) error {
	nodeName := os.Getenv("NODE_NAME")
	deletePatch, err := deletePatchData(podName)
	if err != nil {
		log.FromContext(ctx).Error(err, "unable to create delete json patch data")
		return err
	}

	// Apply the patch to remove the resource
	node := &v1.Node{}
	if err := r.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
		log.FromContext(ctx).Error(err, "unable to fetch Node")
		return err
	}
	resourceName := v1.ResourceName(fmt.Sprintf("org.instaslice/%s", podName))
| //&& val.String() == "1" 431 | if _, ok := node.Status.Capacity[resourceName]; !ok { 432 | log.FromContext(ctx).Info("skipping non-existent deletion of instaslice resource for ", "pod", podName) 433 | return nil 434 | } 435 | if err := r.Status().Patch(ctx, node, client.RawPatch(types.JSONPatchType, deletePatch)); err != nil { 436 | log.FromContext(ctx).Error(err, "unable to patch Node status") 437 | return err 438 | } 439 | return nil 440 | } 441 | 442 | func (r *InstaSliceDaemonsetReconciler) createPreparedEntry(ctx context.Context, profileName string, podUUID string, deviceUUID string, giId uint32, ciId uint32, instaslice *inferencev1alpha1.Instaslice, migUUID string) error { 443 | existingPreparedDetails := instaslice.Spec.Prepared 444 | checkAPreparedDetails := existingPreparedDetails[migUUID] 445 | if checkAPreparedDetails.Ciinfoid == ciId && checkAPreparedDetails.Giinfoid == giId && checkAPreparedDetails.PodUUID == podUUID { 446 | log.FromContext(ctx).Info("updated prepared details already exists") 447 | return nil 448 | } 449 | updatedAllocation := instaslice.Spec.Allocations[podUUID] 450 | instaslicePrepared := inferencev1alpha1.PreparedDetails{ 451 | Profile: profileName, 452 | Start: updatedAllocation.Start, 453 | Size: updatedAllocation.Size, 454 | Parent: deviceUUID, 455 | PodUUID: podUUID, 456 | Giinfoid: giId, 457 | Ciinfoid: ciId, 458 | } 459 | if instaslice.Spec.Prepared == nil { 460 | instaslice.Spec.Prepared = make(map[string]inferencev1alpha1.PreparedDetails) 461 | } 462 | 463 | instaslice.Spec.Prepared[migUUID] = instaslicePrepared 464 | errForUpdate := r.Update(ctx, instaslice) 465 | if errForUpdate != nil { 466 | log.FromContext(ctx).Error(errForUpdate, "error adding prepared statement") 467 | return errForUpdate 468 | } 469 | return nil 470 | } 471 | 472 | // Reloads the configuration in the device plugin to update node capacity 473 | // there is a possibility of double update, should that happen while we retry? 
474 | func (r *InstaSliceDaemonsetReconciler) updateNodeCapacity(ctx context.Context, nodeName string) error { 475 | node := &v1.Node{} 476 | nodeNameObject := types.NamespacedName{Name: nodeName} 477 | err := r.Get(ctx, nodeNameObject, node) 478 | if err != nil { 479 | log.FromContext(ctx).Error(err, "unable to get node object") 480 | return err 481 | } 482 | // Label value should be maunally added when the cluster is setup. 483 | if value, exists := node.Labels["nvidia.com/device-plugin.config"]; exists && value == "update-capacity-1" { 484 | node.Labels["nvidia.com/device-plugin.config"] = "update-capacity" 485 | } 486 | 487 | if value, exists := node.Labels["nvidia.com/device-plugin.config"]; exists && value == "update-capacity" { 488 | node.Labels["nvidia.com/device-plugin.config"] = "update-capacity-1" 489 | } 490 | 491 | err = r.Update(ctx, node) 492 | if err != nil { 493 | log.FromContext(ctx).Error(err, "unable to update Node") 494 | return err 495 | } 496 | return nil 497 | } 498 | 499 | // SetupWithManager sets up the controller with the Manager. 500 | func (r *InstaSliceDaemonsetReconciler) SetupWithManager(mgr ctrl.Manager) error { 501 | 502 | restConfig := mgr.GetConfig() 503 | 504 | var err error 505 | r.kubeClient, err = kubernetes.NewForConfig(restConfig) 506 | if err != nil { 507 | return err 508 | } 509 | if err := r.setupWithManager(mgr); err != nil { 510 | return err 511 | } 512 | 513 | //make InstaSlice object when it does not exists 514 | //if it got restarted then use the existing state. 515 | nodeName := os.Getenv("NODE_NAME") 516 | 517 | //Init InstaSlice obj as the first thing when cache is loaded. 518 | //RunnableFunc is added to the manager. 519 | //This function waits for the manager to be elected (<-mgr.Elected()) and then runs InstaSlice init code. 
520 | mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { 521 | <-mgr.Elected() // Wait for the manager to be elected 522 | var instaslice inferencev1alpha1.Instaslice 523 | typeNamespacedName := types.NamespacedName{ 524 | Name: nodeName, 525 | //TODO: change namespace 526 | Namespace: "default", 527 | } 528 | errRetrievingInstaSliceForSetup := r.Get(ctx, typeNamespacedName, &instaslice) 529 | if errRetrievingInstaSliceForSetup != nil { 530 | log.FromContext(ctx).Error(errRetrievingInstaSliceForSetup, "unable to fetch InstaSlice resource for node") 531 | //TODO: should we do hard exit? 532 | //os.Exit(1) 533 | } 534 | if instaslice.Status.Processed != "true" || (instaslice.Name == "" && instaslice.Namespace == "") { 535 | _, errForDiscoveringGpus := r.discoverMigEnabledGpuWithSlices() 536 | if errForDiscoveringGpus != nil { 537 | log.FromContext(ctx).Error(errForDiscoveringGpus, "error discovering GPUs") 538 | } 539 | } 540 | return nil 541 | })) 542 | 543 | return nil 544 | } 545 | 546 | // Enable creation of controller caches to talk to the API server in order to perform 547 | // object discovery in SetupWithManager 548 | func (r *InstaSliceDaemonsetReconciler) setupWithManager(mgr ctrl.Manager) error { 549 | return ctrl.NewControllerManagedBy(mgr). 550 | For(&inferencev1alpha1.Instaslice{}).Named("InstaSliceDaemonSet"). 551 | Complete(r) 552 | } 553 | 554 | // This function discovers MIG devices as the plugin comes up. this is run exactly once. 
555 | func (r *InstaSliceDaemonsetReconciler) discoverMigEnabledGpuWithSlices() ([]string, error) { 556 | instaslice, _, gpuModelMap, failed, returnValue, errorDiscoveringProfiles := r.discoverAvailableProfilesOnGpus() 557 | if failed { 558 | return returnValue, errorDiscoveringProfiles 559 | } 560 | 561 | err := r.discoverDanglingSlices(instaslice) 562 | 563 | if err != nil { 564 | return nil, err 565 | } 566 | 567 | nodeName := os.Getenv("NODE_NAME") 568 | instaslice.Name = nodeName 569 | instaslice.Namespace = "default" 570 | instaslice.Spec.MigGPUUUID = gpuModelMap 571 | instaslice.Status.Processed = "true" 572 | //TODO: should we use context.TODO() ? 573 | customCtx := context.TODO() 574 | errToCreate := r.Create(customCtx, instaslice) 575 | if errToCreate != nil { 576 | return nil, errToCreate 577 | } 578 | 579 | // Object exists, update its status 580 | instaslice.Status.Processed = "true" 581 | if errForStatus := r.Status().Update(customCtx, instaslice); errForStatus != nil { 582 | return nil, errForStatus 583 | } 584 | 585 | return discoveredGpusOnHost, nil 586 | } 587 | 588 | func (r *InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*inferencev1alpha1.Instaslice, nvml.Return, map[string]string, bool, []string, error) { 589 | instaslice := &inferencev1alpha1.Instaslice{} 590 | ret := nvml.Init() 591 | if ret != nvml.SUCCESS { 592 | return nil, ret, nil, false, nil, ret 593 | } 594 | 595 | count, ret := nvml.DeviceGetCount() 596 | if ret != nvml.SUCCESS { 597 | return nil, ret, nil, false, nil, ret 598 | } 599 | gpuModelMap := make(map[string]string) 600 | discoverProfilePerNode := true 601 | for i := 0; i < count; i++ { 602 | device, ret := nvml.DeviceGetHandleByIndex(i) 603 | if ret != nvml.SUCCESS { 604 | return nil, ret, nil, false, nil, ret 605 | } 606 | 607 | uuid, _ := device.GetUUID() 608 | gpuName, _ := device.GetName() 609 | gpuModelMap[uuid] = gpuName 610 | discoveredGpusOnHost = append(discoveredGpusOnHost, uuid) 611 | if 
discoverProfilePerNode { 612 | 613 | for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { 614 | giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i) 615 | if ret == nvml.ERROR_NOT_SUPPORTED { 616 | continue 617 | } 618 | if ret == nvml.ERROR_INVALID_ARGUMENT { 619 | continue 620 | } 621 | if ret != nvml.SUCCESS { 622 | return nil, ret, nil, false, nil, ret 623 | } 624 | 625 | memory, ret := device.GetMemoryInfo() 626 | if ret != nvml.SUCCESS { 627 | return nil, ret, nil, false, nil, ret 628 | } 629 | 630 | profile := NewMigProfile(i, i, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, giProfileInfo.SliceCount, giProfileInfo.SliceCount, giProfileInfo.MemorySizeMB, memory.Total) 631 | 632 | giPossiblePlacements, ret := device.GetGpuInstancePossiblePlacements(&giProfileInfo) 633 | if ret == nvml.ERROR_NOT_SUPPORTED { 634 | continue 635 | } 636 | if ret == nvml.ERROR_INVALID_ARGUMENT { 637 | continue 638 | } 639 | if ret != nvml.SUCCESS { 640 | return nil, 0, nil, true, nil, ret 641 | } 642 | placementsForProfile := []inferencev1alpha1.Placement{} 643 | for _, p := range giPossiblePlacements { 644 | placement := inferencev1alpha1.Placement{ 645 | Size: int(p.Size), 646 | Start: int(p.Start), 647 | } 648 | placementsForProfile = append(placementsForProfile, placement) 649 | } 650 | 651 | aggregatedPlacementsForProfile := inferencev1alpha1.Mig{ 652 | Placements: placementsForProfile, 653 | Profile: profile.String(), 654 | Giprofileid: i, 655 | CIProfileID: profile.CIProfileID, 656 | CIEngProfileID: profile.CIEngProfileID, 657 | } 658 | instaslice.Spec.Migplacement = append(instaslice.Spec.Migplacement, aggregatedPlacementsForProfile) 659 | } 660 | discoverProfilePerNode = false 661 | } 662 | } 663 | return instaslice, ret, gpuModelMap, false, nil, nil 664 | } 665 | 666 | func (r *InstaSliceDaemonsetReconciler) discoverDanglingSlices(instaslice *inferencev1alpha1.Instaslice) error { 667 | h := &deviceHandler{} 668 | h.nvml = nvml.New() 669 | h.nvdevice = 
nvdevice.New(nvdevice.WithNvml(h.nvml)) 670 | 671 | errInitNvml := h.nvml.Init() 672 | if errInitNvml != nvml.SUCCESS { 673 | return errInitNvml 674 | } 675 | 676 | availableGpusOnNode, errObtainingDeviceCount := h.nvml.DeviceGetCount() 677 | if errObtainingDeviceCount != nvml.SUCCESS { 678 | return errObtainingDeviceCount 679 | } 680 | 681 | for i := 0; i < availableGpusOnNode; i++ { 682 | device, errObtainingDeviceHandle := h.nvml.DeviceGetHandleByIndex(i) 683 | if errObtainingDeviceHandle != nvml.SUCCESS { 684 | return errObtainingDeviceHandle 685 | } 686 | 687 | uuid, errObtainingDeviceUUID := device.GetUUID() 688 | if errObtainingDeviceUUID != nvml.SUCCESS { 689 | return errObtainingDeviceUUID 690 | } 691 | 692 | nvlibParentDevice, errObtainingParentDevice := h.nvdevice.NewDevice(device) 693 | if errObtainingParentDevice != nil { 694 | return errObtainingParentDevice 695 | } 696 | migs, errRetrievingMigDevices := nvlibParentDevice.GetMigDevices() 697 | if errRetrievingMigDevices != nil { 698 | return errRetrievingMigDevices 699 | } 700 | 701 | for _, mig := range migs { 702 | migUUID, _ := mig.GetUUID() 703 | profile, errForProfile := mig.GetProfile() 704 | if errForProfile != nil { 705 | return errForProfile 706 | } 707 | 708 | giID, errForMigGid := mig.GetGpuInstanceId() 709 | if errForMigGid != nvml.SUCCESS { 710 | return errForMigGid 711 | } 712 | gpuInstance, errRetrievingDeviceGid := device.GetGpuInstanceById(giID) 713 | if errRetrievingDeviceGid != nvml.SUCCESS { 714 | return errRetrievingDeviceGid 715 | } 716 | gpuInstanceInfo, errObtainingInfo := gpuInstance.GetInfo() 717 | if errObtainingInfo != nvml.SUCCESS { 718 | return errObtainingInfo 719 | } 720 | 721 | ciID, ret := mig.GetComputeInstanceId() 722 | if ret != nvml.SUCCESS { 723 | return ret 724 | } 725 | ci, ret := gpuInstance.GetComputeInstanceById(ciID) 726 | if ret != nvml.SUCCESS { 727 | return ret 728 | } 729 | ciInfo, ret := ci.GetInfo() 730 | if ret != nvml.SUCCESS { 731 | return ret 732 
| } 733 | prepared := inferencev1alpha1.PreparedDetails{ 734 | Profile: profile.GetInfo().String(), 735 | Start: gpuInstanceInfo.Placement.Start, 736 | Size: gpuInstanceInfo.Placement.Size, 737 | Parent: uuid, 738 | Giinfoid: gpuInstanceInfo.Id, 739 | Ciinfoid: ciInfo.Id, 740 | } 741 | if instaslice.Spec.Prepared == nil { 742 | instaslice.Spec.Prepared = make(map[string]inferencev1alpha1.PreparedDetails) 743 | } 744 | instaslice.Spec.Prepared[migUUID] = prepared 745 | } 746 | } 747 | return nil 748 | } 749 | 750 | // NewMigProfile constructs a new MigProfile struct using info from the giProfiles and ciProfiles used to create it. 751 | func NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, giSliceCount, ciSliceCount uint32, migMemorySizeMB, totalDeviceMemoryBytes uint64) *MigProfile { 752 | return &MigProfile{ 753 | C: int(ciSliceCount), 754 | G: int(giSliceCount), 755 | GB: int(getMigMemorySizeInGB(totalDeviceMemoryBytes, migMemorySizeMB)), 756 | GIProfileID: giProfileID, 757 | CIProfileID: ciProfileID, 758 | CIEngProfileID: ciEngProfileID, 759 | } 760 | } 761 | 762 | // Helper function to get GPU memory size in GBs. 763 | func getMigMemorySizeInGB(totalDeviceMemory, migMemorySizeMB uint64) uint64 { 764 | const fracDenominator = 8 765 | const oneMB = 1024 * 1024 766 | const oneGB = 1024 * 1024 * 1024 767 | fractionalGpuMem := (float64(migMemorySizeMB) * oneMB) / float64(totalDeviceMemory) 768 | fractionalGpuMem = math.Ceil(fractionalGpuMem*fracDenominator) / fracDenominator 769 | totalMemGB := float64((totalDeviceMemory + oneGB - 1) / oneGB) 770 | return uint64(math.Round(fractionalGpuMem * totalMemGB)) 771 | } 772 | 773 | // String returns the string representation of a MigProfile. 
774 | func (m MigProfile) String() string { 775 | var suffix string 776 | if len(m.Attributes()) > 0 { 777 | suffix = "+" + strings.Join(m.Attributes(), ",") 778 | } 779 | if m.C == m.G { 780 | return fmt.Sprintf("%dg.%dgb%s", m.G, m.GB, suffix) 781 | } 782 | return fmt.Sprintf("%dc.%dg.%dgb%s", m.C, m.G, m.GB, suffix) 783 | } 784 | 785 | // Attributes returns the list of attributes associated with a MigProfile. 786 | func (m MigProfile) Attributes() []string { 787 | var attr []string 788 | switch m.GIProfileID { 789 | case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: 790 | attr = append(attr, AttributeMediaExtensions) 791 | } 792 | return attr 793 | } 794 | 795 | // Create configmap which is used by Pods to consume MIG device 796 | func (r *InstaSliceDaemonsetReconciler) createConfigMap(ctx context.Context, migGPUUUID string, namespace string, podName string) error { 797 | var configMap v1.ConfigMap 798 | err := r.Get(ctx, types.NamespacedName{Name: podName, Namespace: namespace}, &configMap) 799 | if err != nil { 800 | log.FromContext(ctx).Info("ConfigMap not found, creating for ", "pod", podName, "migGPUUUID", migGPUUUID) 801 | configMapToCreate := &v1.ConfigMap{ 802 | ObjectMeta: metav1.ObjectMeta{ 803 | Name: podName, 804 | Namespace: namespace, 805 | }, 806 | Data: map[string]string{ 807 | "NVIDIA_VISIBLE_DEVICES": migGPUUUID, 808 | "CUDA_VISIBLE_DEVICES": migGPUUUID, 809 | }, 810 | } 811 | if err := r.Create(ctx, configMapToCreate); err != nil { 812 | log.FromContext(ctx).Error(err, "failed to create ConfigMap") 813 | return err 814 | } 815 | 816 | } 817 | return nil 818 | } 819 | 820 | // Manage lifecycle of configmap, delete it once the pod is deleted from the system 821 | func (r *InstaSliceDaemonsetReconciler) deleteConfigMap(ctx context.Context, configMapName string, namespace string) error { 822 | // Define the ConfigMap object with the name and namespace 823 | configMap := &v1.ConfigMap{ 824 | ObjectMeta: metav1.ObjectMeta{ 825 | Name: configMapName, 826 | 
Namespace: namespace, 827 | }, 828 | } 829 | 830 | err := r.Delete(ctx, configMap) 831 | if err != nil { 832 | if errors.IsNotFound(err) { 833 | log.FromContext(ctx).Error(err, "configmap not found for ", "pod", configMapName) 834 | return nil 835 | } 836 | return err 837 | } 838 | 839 | log.FromContext(ctx).Info("ConfigMap deleted successfully ", "name", configMapName) 840 | return nil 841 | } 842 | 843 | func createPatchData(resourceName string, resourceValue string) ([]byte, error) { 844 | patch := []ResPatchOperation{ 845 | {Op: "add", 846 | Path: fmt.Sprintf("/status/capacity/%s", strings.ReplaceAll(resourceName, "/", "~1")), 847 | Value: resourceValue, 848 | }, 849 | } 850 | return json.Marshal(patch) 851 | } 852 | 853 | func deletePatchData(resourceName string) ([]byte, error) { 854 | patch := []ResPatchOperation{ 855 | {Op: "remove", 856 | Path: fmt.Sprintf("/status/capacity/%s", strings.ReplaceAll("org.instaslice/"+resourceName, "/", "~1")), 857 | }, 858 | } 859 | return json.Marshal(patch) 860 | } 861 | -------------------------------------------------------------------------------- /internal/controller/instaslice_daemonset_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "context" 21 | "os" 22 | "testing" 23 | 24 | "github.com/NVIDIA/go-nvml/pkg/nvml" 25 | "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" 26 | "github.com/stretchr/testify/assert" 27 | v1 "k8s.io/api/core/v1" 28 | "k8s.io/client-go/kubernetes/scheme" 29 | 30 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 | "k8s.io/apimachinery/pkg/types" 32 | 33 | inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1" 34 | runtimefake "sigs.k8s.io/controller-runtime/pkg/client/fake" 35 | ) 36 | 37 | func TestCleanUp(t *testing.T) { 38 | // Set up the mock server 39 | server := dgxa100.New() 40 | 41 | // Mock the NVML functions 42 | nvml.Init = func() nvml.Return { 43 | return nvml.SUCCESS 44 | } 45 | nvml.Shutdown = func() nvml.Return { 46 | return nvml.SUCCESS 47 | } 48 | nvml.DeviceGetHandleByUUID = func(uuid string) (nvml.Device, nvml.Return) { 49 | for _, dev := range server.Devices { 50 | device := dev.(*dgxa100.Device) 51 | if device.UUID == uuid { 52 | return device, nvml.SUCCESS 53 | } 54 | } 55 | return nil, nvml.ERROR_NOT_FOUND 56 | } 57 | 58 | // Create a fake Kubernetes client 59 | s := scheme.Scheme 60 | _ = inferencev1alpha1.AddToScheme(s) 61 | fakeClient := runtimefake.NewClientBuilder().WithScheme(s).Build() 62 | 63 | // Create a fake kubernetes clientset 64 | 65 | //fakeKubeClient := fake.NewSimpleClientset() 66 | 67 | // Create an InstaSliceDaemonsetReconciler 68 | reconciler := &InstaSliceDaemonsetReconciler{ 69 | Client: fakeClient, 70 | Scheme: s, 71 | } 72 | // Create a fake Instaslice resource 73 | instaslice := &inferencev1alpha1.Instaslice{ 74 | ObjectMeta: metav1.ObjectMeta{ 75 | Name: "node-1", 76 | }, 77 | Spec: inferencev1alpha1.InstasliceSpec{ 78 | Prepared: map[string]inferencev1alpha1.PreparedDetails{ 79 | "mig-uuid-1": { 80 | PodUUID: "pod-uid-1", 81 | Parent: "GPU-1", 82 | Giinfoid: 1, 83 | Ciinfoid: 1, 84 | }, 85 | }, 86 | Allocations: 
map[string]inferencev1alpha1.AllocationDetails{ 87 | "allocation-1": { 88 | PodUUID: "pod-uid-1", 89 | PodName: "pod-name-1", 90 | Namespace: "default", 91 | }, 92 | }, 93 | }, 94 | } 95 | fakeClient.Create(context.Background(), instaslice) 96 | 97 | // Set the NODE_NAME environment variable 98 | os.Setenv("NODE_NAME", "node-1") 99 | defer os.Unsetenv("NODE_NAME") 100 | 101 | // Create a fake Pod resource 102 | pod := &v1.Pod{ 103 | ObjectMeta: metav1.ObjectMeta{ 104 | UID: "pod-uid-1", 105 | Name: "pod-name-1", 106 | Namespace: "default", 107 | }, 108 | } 109 | 110 | // Call the cleanUp function 111 | reconciler.cleanUp(context.Background(), string(pod.UID)) 112 | 113 | // Verify the Instaslice resource was updated 114 | var updatedInstaslice inferencev1alpha1.Instaslice 115 | err := fakeClient.Get(context.Background(), types.NamespacedName{Name: "node-1"}, &updatedInstaslice) 116 | assert.NoError(t, err) 117 | assert.Empty(t, updatedInstaslice.Spec.Prepared) 118 | assert.Empty(t, updatedInstaslice.Spec.Allocations) 119 | } 120 | -------------------------------------------------------------------------------- /internal/controller/suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controller 18 | 19 | import ( 20 | "fmt" 21 | "path/filepath" 22 | "runtime" 23 | "testing" 24 | 25 | . 
"github.com/onsi/ginkgo/v2" 26 | . "github.com/onsi/gomega" 27 | 28 | "k8s.io/client-go/kubernetes/scheme" 29 | "k8s.io/client-go/rest" 30 | "sigs.k8s.io/controller-runtime/pkg/client" 31 | "sigs.k8s.io/controller-runtime/pkg/envtest" 32 | logf "sigs.k8s.io/controller-runtime/pkg/log" 33 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 34 | 35 | inferencev1alpha1 "codeflare.dev/instaslice/api/v1alpha1" 36 | //+kubebuilder:scaffold:imports 37 | ) 38 | 39 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 40 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 41 | 42 | var cfg *rest.Config 43 | var k8sClient client.Client 44 | var testEnv *envtest.Environment 45 | 46 | func TestControllers(t *testing.T) { 47 | RegisterFailHandler(Fail) 48 | 49 | RunSpecs(t, "Controller Suite") 50 | } 51 | 52 | var _ = BeforeSuite(func() { 53 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 54 | 55 | By("bootstrapping test environment") 56 | testEnv = &envtest.Environment{ 57 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, 58 | ErrorIfCRDPathMissing: true, 59 | 60 | // The BinaryAssetsDirectory is only required if you want to run the tests directly 61 | // without call the makefile target test. If not informed it will look for the 62 | // default path defined in controller-runtime which is /usr/local/kubebuilder/. 63 | // Note that you must have the required binaries setup under the bin directory to perform 64 | // the tests directly. When we run make test it will be setup and used automatically. 65 | BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", 66 | fmt.Sprintf("1.29.0-%s-%s", runtime.GOOS, runtime.GOARCH)), 67 | } 68 | 69 | var err error 70 | // cfg is defined in this file globally. 
71 | cfg, err = testEnv.Start() 72 | Expect(err).NotTo(HaveOccurred()) 73 | Expect(cfg).NotTo(BeNil()) 74 | 75 | err = inferencev1alpha1.AddToScheme(scheme.Scheme) 76 | Expect(err).NotTo(HaveOccurred()) 77 | 78 | //+kubebuilder:scaffold:scheme 79 | 80 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) 81 | Expect(err).NotTo(HaveOccurred()) 82 | Expect(k8sClient).NotTo(BeNil()) 83 | 84 | }) 85 | 86 | var _ = AfterSuite(func() { 87 | By("tearing down the test environment") 88 | err := testEnv.Stop() 89 | Expect(err).NotTo(HaveOccurred()) 90 | }) 91 | -------------------------------------------------------------------------------- /samples/test-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cuda-vectoradd-1 5 | finalizers: 6 | - org.instaslice/accelarator 7 | spec: 8 | restartPolicy: OnFailure 9 | schedulingGates: 10 | - name: org.instaslice/accelarator 11 | containers: 12 | - name: cuda-vectoradd-1 13 | image: "quay.io/tardieu/vectoradd:0.1.0" 14 | resources: 15 | limits: 16 | nvidia.com/mig-1g.5gb: 1 17 | org.instaslice/cuda-vectoradd-1: 1 18 | envFrom: 19 | - configMapRef: 20 | name: cuda-vectoradd-1 -------------------------------------------------------------------------------- /samples/tf-notebook.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: tf-notebook 6 | labels: 7 | app: tf-notebook 8 | spec: 9 | type: NodePort 10 | ports: 11 | - port: 80 12 | name: http 13 | targetPort: 8888 14 | nodePort: 30001 15 | selector: 16 | app: tf-notebook 17 | --- 18 | apiVersion: v1 19 | kind: Pod 20 | metadata: 21 | name: tf-notebook 22 | labels: 23 | app: tf-notebook 24 | spec: 25 | runtimeClassName: nvidia-cdi 26 | securityContext: 27 | fsGroup: 0 28 | containers: 29 | - name: tf-notebook 30 | image: tensorflow/tensorflow:latest-gpu-jupyter 31 | 
resources: 32 | limits: 33 | nvidia.com/mig-3g.20gb: 1 34 | ports: 35 | - containerPort: 8888 36 | name: notebook -------------------------------------------------------------------------------- /samples/vllm_cache.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: huggingface-secret 6 | type: Opaque 7 | data: 8 | HF_TOKEN: aGZfYU9wZ010WEtGRFFZZkhBaEdBdmpNeVJmWVNra3hxVHR2WA== # Base64-encoded value of 'your_huggingface_secret_token' 9 | --- 10 | apiVersion: v1 11 | kind: PersistentVolume 12 | metadata: 13 | name: huggingface-cache-pvc 14 | spec: 15 | capacity: 16 | storage: 10Gi 17 | accessModes: 18 | - ReadWriteOnce 19 | storageClassName: manual 20 | hostPath: 21 | path: /data/huggingface-cache 22 | --- 23 | apiVersion: v1 24 | kind: PersistentVolumeClaim 25 | metadata: 26 | name: huggingface-cache-pvc 27 | spec: 28 | accessModes: 29 | - ReadWriteOnce 30 | resources: 31 | requests: 32 | storage: 10Gi 33 | storageClassName: manual -------------------------------------------------------------------------------- /samples/vllm_dep.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm 5 | labels: 6 | app: gpu-test1-vllm 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: vllm 12 | template: 13 | metadata: 14 | labels: 15 | app: vllm 16 | spec: 17 | containers: 18 | - name: vllm-container 19 | image: quay.io/chenw615/vllm_dra:latest 20 | imagePullPolicy: IfNotPresent 21 | ports: 22 | - containerPort: 8000 23 | env: 24 | - name: HUGGING_FACE_HUB_TOKEN 25 | valueFrom: 26 | secretKeyRef: 27 | name: huggingface-secret 28 | key: HF_TOKEN 29 | - name: MODEL_NAME 30 | value: "facebook/opt-125m" 31 | volumeMounts: 32 | - name: cache-volume 33 | mountPath: /root/.cache/huggingface 34 | resources: 35 | limits: 36 | nvidia.com/mig-3g.20gb: 1 37 | volumes: 38 
| - name: cache-volume 39 | persistentVolumeClaim: 40 | claimName: huggingface-cache-pvc 41 | --- 42 | apiVersion: v1 43 | kind: Service 44 | metadata: 45 | name: vllm 46 | spec: 47 | type: ClusterIP 48 | ports: 49 | - port: 8000 50 | targetPort: 8000 51 | name: http 52 | selector: 53 | app: vllm -------------------------------------------------------------------------------- /test/e2e/e2e_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "fmt" 21 | "testing" 22 | 23 | . "github.com/onsi/ginkgo/v2" 24 | . "github.com/onsi/gomega" 25 | ) 26 | 27 | // Run e2e tests using the Ginkgo runner. 28 | func TestE2E(t *testing.T) { 29 | RegisterFailHandler(Fail) 30 | fmt.Fprintf(GinkgoWriter, "Starting instaslicev2 suite\n") 31 | RunSpecs(t, "e2e suite") 32 | } 33 | -------------------------------------------------------------------------------- /test/e2e/e2e_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "fmt" 21 | "os/exec" 22 | "time" 23 | 24 | . "github.com/onsi/ginkgo/v2" 25 | . "github.com/onsi/gomega" 26 | 27 | "codeflare.dev/instaslice/test/utils" 28 | ) 29 | 30 | const namespace = "instaslicev2-system" 31 | 32 | var _ = Describe("controller", Ordered, func() { 33 | BeforeAll(func() { 34 | By("installing prometheus operator") 35 | Expect(utils.InstallPrometheusOperator()).To(Succeed()) 36 | 37 | By("installing the cert-manager") 38 | Expect(utils.InstallCertManager()).To(Succeed()) 39 | 40 | By("creating manager namespace") 41 | cmd := exec.Command("kubectl", "create", "ns", namespace) 42 | _, _ = utils.Run(cmd) 43 | }) 44 | 45 | AfterAll(func() { 46 | By("uninstalling the Prometheus manager bundle") 47 | utils.UninstallPrometheusOperator() 48 | 49 | By("uninstalling the cert-manager bundle") 50 | utils.UninstallCertManager() 51 | 52 | By("removing manager namespace") 53 | cmd := exec.Command("kubectl", "delete", "ns", namespace) 54 | _, _ = utils.Run(cmd) 55 | }) 56 | 57 | Context("Operator", func() { 58 | It("should run successfully", func() { 59 | var controllerPodName string 60 | var err error 61 | 62 | // projectimage stores the name of the image used in the example 63 | var projectimage = "example.com/instaslicev2:v0.0.1" 64 | 65 | By("building the manager(Operator) image") 66 | cmd := exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectimage)) 67 | _, err = utils.Run(cmd) 68 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 69 | 70 | By("loading the the 
manager(Operator) image on Kind") 71 | err = utils.LoadImageToKindClusterWithName(projectimage) 72 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 73 | 74 | By("installing CRDs") 75 | cmd = exec.Command("make", "install") 76 | _, err = utils.Run(cmd) 77 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 78 | 79 | By("deploying the controller-manager") 80 | cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectimage)) 81 | _, err = utils.Run(cmd) 82 | ExpectWithOffset(1, err).NotTo(HaveOccurred()) 83 | 84 | By("validating that the controller-manager pod is running as expected") 85 | verifyControllerUp := func() error { 86 | // Get pod name 87 | 88 | cmd = exec.Command("kubectl", "get", 89 | "pods", "-l", "control-plane=controller-manager", 90 | "-o", "go-template={{ range .items }}"+ 91 | "{{ if not .metadata.deletionTimestamp }}"+ 92 | "{{ .metadata.name }}"+ 93 | "{{ \"\\n\" }}{{ end }}{{ end }}", 94 | "-n", namespace, 95 | ) 96 | 97 | podOutput, err := utils.Run(cmd) 98 | ExpectWithOffset(2, err).NotTo(HaveOccurred()) 99 | podNames := utils.GetNonEmptyLines(string(podOutput)) 100 | if len(podNames) != 1 { 101 | return fmt.Errorf("expect 1 controller pods running, but got %d", len(podNames)) 102 | } 103 | controllerPodName = podNames[0] 104 | ExpectWithOffset(2, controllerPodName).Should(ContainSubstring("controller-manager")) 105 | 106 | // Validate pod status 107 | cmd = exec.Command("kubectl", "get", 108 | "pods", controllerPodName, "-o", "jsonpath={.status.phase}", 109 | "-n", namespace, 110 | ) 111 | status, err := utils.Run(cmd) 112 | ExpectWithOffset(2, err).NotTo(HaveOccurred()) 113 | if string(status) != "Running" { 114 | return fmt.Errorf("controller pod in %s status", status) 115 | } 116 | return nil 117 | } 118 | EventuallyWithOffset(1, verifyControllerUp, time.Minute, time.Second).Should(Succeed()) 119 | 120 | }) 121 | }) 122 | }) 123 | -------------------------------------------------------------------------------- /test/utils/utils.go: 
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package utils provides shell-out helpers shared by the e2e suite:
// installing/uninstalling cluster prerequisites (prometheus-operator,
// cert-manager) and running commands from the project root.
package utils

import (
	"fmt"
	"os"
	"os/exec"
	"strings"

	. "github.com/onsi/ginkgo/v2" //nolint:golint,revive
)

const (
	// prometheusOperatorVersion pins the release installed by
	// InstallPrometheusOperator / removed by UninstallPrometheusOperator.
	prometheusOperatorVersion = "v0.68.0"
	// prometheusOperatorURL is a fmt template: "%s" is filled with
	// prometheusOperatorVersion to form the bundle manifest URL.
	prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" +
		"releases/download/%s/bundle.yaml"

	// certmanagerVersion pins the cert-manager release.
	certmanagerVersion = "v1.5.3"
	// certmanagerURLTmpl is a fmt template: "%s" is filled with
	// certmanagerVersion to form the manifest URL.
	certmanagerURLTmpl = "https://github.com/jetstack/cert-manager/releases/download/%s/cert-manager.yaml"
)

// warnError reports a non-fatal error on the Ginkgo writer so it shows up
// in the spec output without failing the suite.
func warnError(err error) {
	fmt.Fprintf(GinkgoWriter, "warning: %v\n", err)
}

// InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics.
42 | func InstallPrometheusOperator() error { 43 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 44 | cmd := exec.Command("kubectl", "create", "-f", url) 45 | _, err := Run(cmd) 46 | return err 47 | } 48 | 49 | // Run executes the provided command within this context 50 | func Run(cmd *exec.Cmd) ([]byte, error) { 51 | dir, _ := GetProjectDir() 52 | cmd.Dir = dir 53 | 54 | if err := os.Chdir(cmd.Dir); err != nil { 55 | fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) 56 | } 57 | 58 | cmd.Env = append(os.Environ(), "GO111MODULE=on") 59 | command := strings.Join(cmd.Args, " ") 60 | fmt.Fprintf(GinkgoWriter, "running: %s\n", command) 61 | output, err := cmd.CombinedOutput() 62 | if err != nil { 63 | return output, fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) 64 | } 65 | 66 | return output, nil 67 | } 68 | 69 | // UninstallPrometheusOperator uninstalls the prometheus 70 | func UninstallPrometheusOperator() { 71 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 72 | cmd := exec.Command("kubectl", "delete", "-f", url) 73 | if _, err := Run(cmd); err != nil { 74 | warnError(err) 75 | } 76 | } 77 | 78 | // UninstallCertManager uninstalls the cert manager 79 | func UninstallCertManager() { 80 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 81 | cmd := exec.Command("kubectl", "delete", "-f", url) 82 | if _, err := Run(cmd); err != nil { 83 | warnError(err) 84 | } 85 | } 86 | 87 | // InstallCertManager installs the cert manager bundle. 88 | func InstallCertManager() error { 89 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 90 | cmd := exec.Command("kubectl", "apply", "-f", url) 91 | if _, err := Run(cmd); err != nil { 92 | return err 93 | } 94 | // Wait for cert-manager-webhook to be ready, which can take time if cert-manager 95 | // was re-installed after uninstalling on a cluster. 
96 | cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", 97 | "--for", "condition=Available", 98 | "--namespace", "cert-manager", 99 | "--timeout", "5m", 100 | ) 101 | 102 | _, err := Run(cmd) 103 | return err 104 | } 105 | 106 | // LoadImageToKindCluster loads a local docker image to the kind cluster 107 | func LoadImageToKindClusterWithName(name string) error { 108 | cluster := "kind" 109 | if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { 110 | cluster = v 111 | } 112 | kindOptions := []string{"load", "docker-image", name, "--name", cluster} 113 | cmd := exec.Command("kind", kindOptions...) 114 | _, err := Run(cmd) 115 | return err 116 | } 117 | 118 | // GetNonEmptyLines converts given command output string into individual objects 119 | // according to line breakers, and ignores the empty elements in it. 120 | func GetNonEmptyLines(output string) []string { 121 | var res []string 122 | elements := strings.Split(output, "\n") 123 | for _, element := range elements { 124 | if element != "" { 125 | res = append(res, element) 126 | } 127 | } 128 | 129 | return res 130 | } 131 | 132 | // GetProjectDir will return the directory where the project is 133 | func GetProjectDir() (string, error) { 134 | wd, err := os.Getwd() 135 | if err != nil { 136 | return wd, err 137 | } 138 | wd = strings.Replace(wd, "/test/e2e", "", -1) 139 | return wd, nil 140 | } 141 | --------------------------------------------------------------------------------