├── .github └── workflows │ └── main.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── AUTHORS ├── CHANGELOG.md ├── HAMi.jpg ├── LICENSE ├── MAINTAINERS.md ├── Makefile ├── README.md ├── README_cn.md ├── benchmarks └── ai-benchmark │ ├── Dockerfile │ ├── Hami │ └── ai-benchmark.yml │ └── Official-Nvidia-device-plugin │ ├── ai-benchmark.yml │ └── nvidia-device-plugin-official.yml ├── charts └── vgpu │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── device-plugin │ │ ├── configmap.yaml │ │ ├── daemonsethygon.yaml │ │ ├── daemonsetmlu.yaml │ │ ├── daemonsetnvidia.yaml │ │ ├── monitorrole.yaml │ │ ├── monitorrolebinding.yaml │ │ ├── monitorservice.yaml │ │ └── monitorserviceaccount.yaml │ └── scheduler │ │ ├── configmap.yaml │ │ ├── configmapnew.yaml │ │ ├── deployment.yaml │ │ ├── job-patch │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── job-createSecret.yaml │ │ ├── job-patchWebhook.yaml │ │ ├── psp.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml │ │ ├── rolebinding.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── webhook.yaml │ └── values.yaml ├── cmd ├── device-plugin │ ├── hygon │ │ └── main.go │ ├── mlu │ │ └── main.go │ └── nvidia │ │ ├── main.go │ │ ├── plugin-manager.go │ │ ├── vgpucfg.go │ │ └── watchers.go ├── scheduler │ ├── main.go │ └── metrics.go └── vGPUmonitor │ ├── build.sh │ ├── cudevshr.go │ ├── feedback.go │ ├── main.go │ ├── metrics.go │ ├── noderpc │ ├── noderpc.pb.go │ ├── noderpc.proto │ └── noderpc_grpc.pb.go │ ├── pathmonitor.go │ ├── pathmonitor_test.go │ ├── testcollector │ ├── main.go │ └── testcollector │ └── validation.go ├── docker ├── Dockerfile └── entrypoint.sh ├── docs ├── benchmark.md ├── benchmark_cn.md ├── cambricon-mlu-support.md ├── cambricon-mlu-support_cn.md ├── config.md ├── config_cn.md ├── dashboard.md ├── dashboard_cn.md ├── develop │ ├── design.md │ ├── imgs │ │ ├── flowchart.jpeg │ │ ├── offline_validation.png │ │ ├── 
protocol_pod.png │ │ └── protocol_register.png │ ├── protocol.md │ ├── roadmap.md │ └── tasklist.md ├── gpu-dashboard.json ├── hygon-dcu-support.md ├── hygon-dcu-support_cn.md └── offline-install.md ├── example.yaml ├── examples ├── hygon │ ├── default_use.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml ├── mlu │ ├── default_use.yaml │ ├── multi-pods.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml └── nvidia │ ├── default_use.yaml │ ├── default_use_legacy.yaml │ ├── example.yaml │ ├── mig_example.yaml │ ├── specify_card_type_not_use.yaml │ ├── specify_card_type_to_use.yaml │ ├── use_exclusive_card.yaml │ └── use_memory_fraction.yaml ├── go.mod ├── go.sum ├── hack ├── build.sh └── update-generated-api.sh ├── imgs ├── arch.png ├── benchmark.png ├── benchmark_inf.png ├── benchmark_train.png ├── example.png └── hard_limit.jpg ├── lib ├── mlu │ ├── cntopo │ ├── libcndev.so │ └── smlu-containerd └── nvidia │ ├── ld.so.preload │ └── libvgpu.so ├── pkg ├── api │ ├── device_register.go │ └── types.go ├── device-plugin │ ├── hygon │ │ └── dcu │ │ │ ├── amdgpu │ │ │ └── amdgpu.go │ │ │ ├── corealloc.go │ │ │ ├── corealloc_test.go │ │ │ ├── hwloc │ │ │ └── hwloc.go │ │ │ ├── register.go │ │ │ └── server.go │ ├── mlu │ │ ├── allocator │ │ │ ├── allocator.go │ │ │ ├── allocator_suite_test.go │ │ │ ├── board.go │ │ │ ├── board_test.go │ │ │ ├── default.go │ │ │ ├── spider.go │ │ │ └── spider_test.go │ │ ├── cache.go │ │ ├── cambricon.go │ │ ├── cndev │ │ │ ├── bindings.go │ │ │ ├── bindings_test.go │ │ │ ├── cndev.go │ │ │ ├── cndev_dl.go │ │ │ ├── cndev_test.go │ │ │ ├── include │ │ │ │ └── cndev.h │ │ │ └── mock │ │ │ │ ├── cJSON.c │ │ │ │ ├── cJSON.h │ │ │ │ ├── cndev.c │ │ │ │ └── main.c │ │ ├── cntopo │ │ │ ├── cntopo.go │ │ │ └── mock │ │ │ │ └── cntopo.go │ │ ├── const.go │ │ ├── options.go │ │ ├── podutils.go │ │ ├── register.go │ │ └── server.go │ └── nvidiadevice │ │ └── nvinternal │ │ ├── cdi │ │ ├── api.go │ 
│ ├── api_mock.go │ │ ├── cdi.go │ │ ├── factory.go │ │ ├── null.go │ │ └── options.go │ │ ├── info │ │ └── version.go │ │ ├── mig │ │ └── mig.go │ │ ├── plugin │ │ ├── api.go │ │ ├── manager │ │ │ ├── api.go │ │ │ ├── factory.go │ │ │ ├── null.go │ │ │ ├── nvml.go │ │ │ ├── options.go │ │ │ └── tegra.go │ │ ├── register.go │ │ ├── register_test.go │ │ ├── server.go │ │ └── server_test.go │ │ └── rm │ │ ├── allocate.go │ │ ├── device_map.go │ │ ├── device_map_test.go │ │ ├── devices.go │ │ ├── health.go │ │ ├── health_test.go │ │ ├── helper.go │ │ ├── nvml_devices.go │ │ ├── nvml_manager.go │ │ ├── rm.go │ │ ├── tegra_devices.go │ │ ├── tegra_manager.go │ │ └── wsl_devices.go ├── device │ ├── cambricon │ │ └── device.go │ ├── devices.go │ ├── hygon │ │ └── device.go │ ├── iluvatar │ │ └── device.go │ └── nvidia │ │ └── device.go ├── k8sutil │ ├── client.go │ └── pod.go ├── oci │ ├── runtime.go │ ├── runtime_exec.go │ ├── runtime_exec_test.go │ ├── runtime_mock.go │ ├── spec.go │ └── spec_mock.go ├── scheduler │ ├── config │ │ └── config.go │ ├── nodes.go │ ├── pods.go │ ├── routes │ │ └── route.go │ ├── scheduler.go │ ├── scheduler_test.go │ ├── score.go │ └── webhook.go ├── util │ ├── client │ │ └── client.go │ ├── nodelock │ │ └── nodelock.go │ ├── types.go │ ├── util.go │ └── util_test.go └── version │ └── version.go └── version.mk /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | tags: 10 | - v[0-9]+.[0-9]+.[0-9]+.[0-9]+ 11 | - v[0-9]+.[0-9]+.[0-9]+ 12 | - v[0-9]+.[0-9]+ 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | # A workflow run is made up of one or more jobs that can run sequentially or in 
parallel 18 | jobs: 19 | # This workflow contains a single job called "build" 20 | build: 21 | # The type of runner that the job will run on 22 | runs-on: ubuntu-latest 23 | 24 | # Steps represent a sequence of tasks that will be executed as part of the job 25 | steps: 26 | - name: Checkout 27 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 28 | uses: actions/checkout@v2 29 | 30 | 31 | - name: Setup Go environment 32 | uses: actions/setup-go@v5.0.0 33 | with: 34 | go-version: 1.21 35 | 36 | - name: Get branch name 37 | uses: nelonoel/branch-name@v1.0.1 38 | 39 | - name: Docker Login 40 | uses: docker/login-action@v1.10.0 41 | with: 42 | # Server address of Docker registry. If not set then will default to Docker Hub 43 | # registry: 4pdosc 44 | # Username used to log against the Docker registry 45 | username: ${{ secrets.DOCKERHUB_USERNAME }} 46 | # Password or personal access token used to log against the Docker registry 47 | password: ${{ secrets.DOCKERHUB_TOKEN }} 48 | 49 | - name: Set up Docker Buildx 50 | id: buildx 51 | uses: docker/setup-buildx-action@v1 52 | 53 | - run: make tidy 54 | # run: make proto 55 | - run: SHORT_VERSION="${BRANCH_NAME}" bash ./hack/build.sh 56 | 57 | - name: Publish Helm charts 58 | uses: stefanprodan/helm-gh-pages@master 59 | with: 60 | token: ${{ secrets.GITHUB_TOKEN }} 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | run_device_plugin.sh 3 | run_scheduler.sh 4 | device_plugin.sh 5 | libvgpu/build 6 | updateso.sh 7 | .idea 8 | vendor 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - build_image 3 | - deploy 4 | 5 | variables: 6 | IMAGE_NAME: k8s-vgpu 7 | 8 | .build_image: 9 | stage: build_image 
10 | image: '${DIND_IMAGE}' 11 | script: 12 | - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} 13 | - > 14 | docker build -t ${IMAGE_FULL_NAME} 15 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} 16 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} 17 | --build-arg VERSION=${VERSION} 18 | --build-arg GOPROXY=${GOPROXY} -f ./docker/Dockerfile . 19 | - docker push ${IMAGE_FULL_NAME} 20 | 21 | build_dev_image: 22 | extends: .build_image 23 | variables: 24 | IMAGE_TAG: ${CI_COMMIT_SHA} 25 | VERSION: ${CI_COMMIT_SHA} 26 | only: 27 | - master 28 | 29 | build_release_image: 30 | extends: .build_image 31 | variables: 32 | IMAGE_TAG: ${CI_COMMIT_TAG} 33 | VERSION: ${CI_COMMIT_TAG}-${CI_COMMIT_SHA} 34 | only: 35 | - tags 36 | 37 | .deploy: 38 | stage: deploy 39 | image: '${HELM_IMAGE}' 40 | variables: 41 | RELEASE_NAME: vgpu 42 | RELEASE_NAMESPACE: vgpu 43 | EXTRA_ARGS: '' 44 | script: 45 | - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} 46 | - > 47 | helm upgrade --install ${RELEASE_NAME} ./charts/vgpu 48 | -n ${RELEASE_NAMESPACE} 49 | --set scheduler.extender.image=${IMAGE_FULL_NAME} 50 | --set devicePlugin.image=${IMAGE_FULL_NAME} 51 | ${EXTRA_ARGS} 52 | 53 | deploy_develop: 54 | extends: .deploy 55 | variables: 56 | IMAGE_TAG: ${CI_COMMIT_SHA} 57 | environment: 58 | name: vgpu-develop 59 | only: 60 | - master 61 | tags: 62 | - deploy-test 63 | 64 | deploy_pre_product: 65 | extends: .deploy 66 | variables: 67 | IMAGE_TAG: ${CI_COMMIT_TAG} 68 | EXTRA_ARGS: "--wait --timeout=30m" 69 | environment: 70 | name: vgpu-develop 71 | only: 72 | - tags 73 | tags: 74 | - deploy-test 75 | 76 | deploy_product: 77 | extends: .deploy 78 | variables: 79 | IMAGE_TAG: ${CI_COMMIT_TAG} 80 | environment: 81 | name: vgpu-product 82 | only: 83 | - tags 84 | tags: 85 | - deploy-product 86 | when: manual 87 | 88 | -------------------------------------------------------------------------------- /.gitmodules:
-------------------------------------------------------------------------------- 1 | [submodule "libvgpu"] 2 | path = libvgpu 3 | url = https://github.com/Project-HAMi/HAMi-core.git 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | The following people, in alphabetical order, have either authored or signed 2 | off on commits in the HAMi repository: 3 | 4 | archlitchi limengxuan@4paradigm.com 5 | peizhaoyou peizhaoyou@4paradigm.com 6 | chaunceyjiang chaunceyjiang@gmail.com 7 | wawa0210 8 | whybeyoung 9 | gsakun 10 | CoderTH 11 | lengrongfu 12 | chaunceyjiang 13 | atttx123 14 | zhengbingxian 15 | 16 | 17 | -------------------------------------------------------------------------------- /HAMi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/HAMi.jpg -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Maintainers 2 | 3 | Please see the [AUTHORS](./AUTHORS) file for the full list of contributors to the project 4 | 5 | ## HAMi Committers 6 | 7 | | Maintainer | Employer | 8 | |---------------------------------------------------|-----------| 9 | | [Li Mengxuan](https://github.com/archlitchi) | 4Paradigm | 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ##### Global variables ##### 2 | include version.mk 3 | 4 | all: build 5 | 6 | docker: 7 | docker build \ 8 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 9 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 10 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 11 | --build-arg DEST_DIR=${DEST_DIR} \ 12 | .
-f=docker/Dockerfile -t ${IMG_TAG} 13 | 14 | tidy: 15 | $(GO) mod tidy 16 | 17 | proto: 18 | $(GO) get github.com/gogo/protobuf/protoc-gen-gofast@v1.3.2 19 | protoc --gofast_out=plugins=grpc:. ./pkg/api/*.proto 20 | 21 | build: $(CMDS) $(DEVICES) 22 | 23 | $(CMDS): 24 | $(GO) build -ldflags '-s -w -X 4pd.io/k8s-vgpu/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ 25 | 26 | $(DEVICES): 27 | $(GO) build -ldflags '-s -w -X 4pd.io/k8s-vgpu/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@-device-plugin ./cmd/device-plugin/$@ 28 | 29 | clean: 30 | $(GO) clean -r -x ./cmd/... 31 | -rm -rf $(OUTPUT_DIR) 32 | 33 | .PHONY: all build docker clean $(CMDS) 34 | -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.4.1-gpu 2 | 3 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils 4 | 5 | RUN pip install --upgrade pip 6 | 7 | RUN apt-get -y install git 8 | RUN git clone -b feat/transformer https://github.com/shiyoubun/ai-benchmark.git 9 | 10 | WORKDIR ai-benchmark 11 | RUN pip install -e . 
12 | 13 | ENTRYPOINT [ "python", "bin/ai-benchmark.py" ] 14 | -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Hami/ai-benchmark.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark 9 | spec: 10 | containers: 11 | - name: ai-benchmark 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | nvidia.com/gpumem-percentage: 50 17 | limits: 18 | nvidia.com/gpu: 1 19 | nvidia.com/gpumem-percentage: 50 20 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Official-Nvidia-device-plugin/ai-benchmark.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark 9 | spec: 10 | containers: 11 | - name: ai-benchmark 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | limits: 17 | nvidia.com/gpu: 1 18 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Official-Nvidia-device-plugin/nvidia-device-plugin-official.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: apps/v1 16 | kind: DaemonSet 17 | metadata: 18 | name: nvidia-device-plugin-daemonset 19 | namespace: kube-system 20 | spec: 21 | selector: 22 | matchLabels: 23 | name: nvidia-device-plugin-ds 24 | updateStrategy: 25 | type: RollingUpdate 26 | template: 27 | metadata: 28 | # This annotation is deprecated. Kept here for backward compatibility 29 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 30 | annotations: 31 | scheduler.alpha.kubernetes.io/critical-pod: "" 32 | labels: 33 | name: nvidia-device-plugin-ds 34 | spec: 35 | tolerations: 36 | # This toleration is deprecated. Kept here for backward compatibility 37 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 38 | - key: CriticalAddonsOnly 39 | operator: Exists 40 | - key: nvidia.com/gpu 41 | operator: Exists 42 | effect: NoSchedule 43 | # Mark this pod as a critical add-on; when enabled, the critical add-on 44 | # scheduler reserves resources for critical add-on pods so that they can 45 | # be rescheduled after a failure. 
46 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 47 | priorityClassName: "system-node-critical" 48 | containers: 49 | - image: nvcr.io/nvidia/k8s-device-plugin:v0.9.0 50 | name: nvidia-device-plugin-ctr 51 | args: ["--fail-on-init-error=false"] 52 | securityContext: 53 | allowPrivilegeEscalation: false 54 | capabilities: 55 | drop: ["ALL"] 56 | volumeMounts: 57 | - name: device-plugin 58 | mountPath: /var/lib/kubelet/device-plugins 59 | volumes: 60 | - name: device-plugin 61 | hostPath: 62 | path: /var/lib/kubelet/device-plugins 63 | 64 | -------------------------------------------------------------------------------- /charts/vgpu/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: vgpu 3 | version: 2.0.0 4 | kubeVersion: ">= 1.16.0" 5 | description: Heterogeneous AI Computing Virtualization Middleware 6 | keywords: 7 | - vgpu 8 | - gpu 9 | type: application 10 | maintainers: 11 | - name: limengxuan 12 | email: limengxuan@4paradigm.com 13 | appVersion: 0.0.2 -------------------------------------------------------------------------------- /charts/vgpu/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | ** Please be patient while the chart is being deployed ** 2 | Resource name: {{ .Values.resourceName }} 3 | 4 | -------------------------------------------------------------------------------- /charts/vgpu/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "4pd-vgpu.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 6 | {{- end -}} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "4pd-vgpu.fullname" -}} 14 | {{- if .Values.fullnameOverride -}} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride -}} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- else -}} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 22 | {{- end -}} 23 | {{- end -}} 24 | {{- end -}} 25 | 26 | {{/* 27 | The app name for Scheduler 28 | */}} 29 | {{- define "4pd-vgpu.scheduler" -}} 30 | {{- printf "%s-scheduler" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 31 | {{- end -}} 32 | 33 | {{/* 34 | The app name for DevicePlugin 35 | */}} 36 | {{- define "4pd-vgpu.device-plugin" -}} 37 | {{- printf "%s-device-plugin" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 38 | {{- end -}} 39 | 40 | {{/* 41 | The tls secret name for Scheduler 42 | */}} 43 | {{- define "4pd-vgpu.scheduler.tls" -}} 44 | {{- printf "%s-scheduler-tls" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 45 | {{- end -}} 46 | 47 | {{/* 48 | The webhook name 49 | */}} 50 | {{- define "4pd-vgpu.scheduler.webhook" -}} 51 | {{- printf "%s-webhook" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 52 | {{- end -}} 53 | 54 | {{/* 55 | Create chart name and version as used by the chart label. 56 | */}} 57 | {{- define "4pd-vgpu.chart" -}} 58 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 59 | {{- end }} 60 | 61 | {{/* 62 | Common labels 63 | */}} 64 | {{- define "4pd-vgpu.labels" -}} 65 | helm.sh/chart: {{ include "4pd-vgpu.chart" . }} 66 | {{ include "4pd-vgpu.selectorLabels" . 
}} 67 | {{- if .Chart.AppVersion }} 68 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 69 | {{- end }} 70 | app.kubernetes.io/managed-by: {{ .Release.Service }} 71 | {{- end }} 72 | 73 | {{/* 74 | Selector labels 75 | */}} 76 | {{- define "4pd-vgpu.selectorLabels" -}} 77 | app.kubernetes.io/name: {{ include "4pd-vgpu.name" . }} 78 | app.kubernetes.io/instance: {{ .Release.Name }} 79 | {{- end }} 80 | 81 | {{/* 82 | Image registry secret name 83 | */}} 84 | {{- define "4pd-vgpu.imagePullSecrets" -}} 85 | imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 2 }} 86 | {{- end }} 87 | 88 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: 4pd-device-plugin 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.json: | 10 | { 11 | "nodeconfig": [ 12 | { 13 | "name": "m5-cloudinfra-online02", 14 | "devicememoryscaling": 1.8, 15 | "devicesplitcount": 10, 16 | "migstrategy":"none" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/daemonsethygon.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-hygon 5 | labels: 6 | app.kubernetes.io/component: 4pd-device-plugin-hygon 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | {{- with .Values.global.labels }} 9 | {{- toYaml . 
| nindent 4 }} 10 | {{- end }} 11 | {{- if .Values.global.annotations }} 12 | annotations: {{ toYaml .Values.global.annotations | nindent 4}} 13 | {{- end }} 14 | spec: 15 | selector: 16 | matchLabels: 17 | app.kubernetes.io/component: 4pd-device-plugin-hygon 18 | {{- include "4pd-vgpu.selectorLabels" . | nindent 6 }} 19 | template: 20 | metadata: 21 | labels: 22 | app.kubernetes.io/component: 4pd-device-plugin-hygon 23 | 4pd.io/webhook: ignore 24 | {{- include "4pd-vgpu.selectorLabels" . | nindent 8 }} 25 | {{- if .Values.devicePlugin.podAnnotations }} 26 | annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} 27 | {{- end }} 28 | spec: 29 | {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} 30 | serviceAccountName: {{ include "4pd-vgpu.device-plugin" . }} 31 | priorityClassName: system-node-critical 32 | hostPID: true 33 | hostNetwork: true 34 | containers: 35 | - name: dcu-device-plugin-ctr 36 | image: {{ .Values.devicePlugin.hygonimage }} 37 | imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} 38 | command: ["/hygon","-logtostderr=true","-stderrthreshold=INFO","-v=5"] 39 | env: 40 | - name: NodeName 41 | valueFrom: 42 | fieldRef: 43 | fieldPath: spec.nodeName 44 | - name: HOOK_PATH 45 | value: {{ .Values.devicePlugin.libPath }} 46 | - name: HYGONPATH 47 | value: {{ .Values.devicePlugin.hygondriver }} 48 | securityContext: 49 | privileged: true 50 | allowPrivilegeEscalation: true 51 | capabilities: 52 | drop: ["ALL"] 53 | add: ["SYS_ADMIN"] 54 | volumeMounts: 55 | - name: device-plugin 56 | mountPath: /var/lib/kubelet/device-plugins 57 | - name: deviceconfig 58 | mountPath: /config 59 | - name: sysinfo 60 | mountPath: /sys 61 | - name: lib 62 | mountPath: /usr/local/vgpu 63 | - name: hwpath 64 | mountPath: /usr/share/hwdata 65 | - name: hygonloc 66 | mountPath: /opt/hygondriver/ 67 | volumes: 68 | - name: device-plugin 69 | hostPath: 70 | path: {{ .Values.devicePlugin.pluginPath }} 71 | - name: sysinfo 72 | hostPath: 73 
| path: /sys 74 | - name: deviceconfig 75 | configMap: 76 | name: {{ template "4pd-vgpu.device-plugin" . }} 77 | - name: lib 78 | hostPath: 79 | path: {{ .Values.devicePlugin.libPath }} 80 | - name: hwpath 81 | hostPath: 82 | path: /usr/share/hwdata 83 | - name: hygonloc 84 | hostPath: 85 | path: {{ .Values.devicePlugin.hygondriver }} 86 | {{- if .Values.devicePlugin.hygonnodeSelector }} 87 | nodeSelector: {{ toYaml .Values.devicePlugin.hygonnodeSelector | nindent 8 }} 88 | {{- end }} 89 | {{- if .Values.devicePlugin.tolerations }} 90 | tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }} 91 | {{- end }} 92 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-monitor 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | verbs: 11 | - get 12 | - create 13 | - watch 14 | - list 15 | - update 16 | - patch 17 | - apiGroups: 18 | - "" 19 | resources: 20 | - nodes 21 | verbs: 22 | - get 23 | - update 24 | - list 25 | - patch 26 | 27 | 28 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: "4pd-device-plugin" 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | #name: cluster-admin 12 | name: {{ include "4pd-vgpu.device-plugin" . 
}}-monitor 13 | subjects: 14 | - kind: ServiceAccount 15 | name: {{ include "4pd-vgpu.device-plugin" . }} 16 | namespace: {{ .Release.Namespace | quote }} 17 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-monitor 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | {{- if .Values.scheduler.service.labels }} 9 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 10 | {{- end }} 11 | {{- if .Values.scheduler.service.annotations }} 12 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 13 | {{- end }} 14 | spec: 15 | externalTrafficPolicy: Local 16 | selector: 17 | app.kubernetes.io/component: 4pd-device-plugin 18 | type: NodePort 19 | ports: 20 | - name: monitorport 21 | port: {{ .Values.devicePlugin.service.httpPort }} 22 | targetPort: 9394 23 | nodePort: {{ .Values.devicePlugin.service.httpPort }} -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorserviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | app.kubernetes.io/component: "4pd-device-plugin" 8 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . 
}} 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.json: | 10 | { 11 | "kind": "Policy", 12 | "apiVersion": "v1", 13 | "extenders": [ 14 | { 15 | "urlPrefix": "https://127.0.0.1:443", 16 | "filterVerb": "filter", 17 | "bindVerb": "bind", 18 | "enableHttps": true, 19 | "weight": 1, 20 | "nodeCacheCapable": true, 21 | "httpTimeout": 30000000000, 22 | "tlsConfig": { 23 | "insecure": true 24 | }, 25 | "managedResources": [ 26 | { 27 | "name": "{{ .Values.resourceName }}", 28 | "ignoredByScheduler": true 29 | }, 30 | { 31 | "name": "{{ .Values.resourceMem }}", 32 | "ignoredByScheduler": true 33 | }, 34 | { 35 | "name": "{{ .Values.resourceCores }}", 36 | "ignoredByScheduler": true 37 | }, 38 | { 39 | "name": "{{ .Values.resourceMemPercentage }}", 40 | "ignoredByScheduler": true 41 | }, 42 | { 43 | "name": "{{ .Values.resourcePriority }}", 44 | "ignoredByScheduler": true 45 | }, 46 | { 47 | "name": "{{ .Values.mluResourceName }}", 48 | "ignoredByScheduler": true 49 | }, 50 | { 51 | "name": "{{ .Values.mluResourceMem }}", 52 | "ignoredByScheduler": true 53 | }, 54 | { 55 | "name": "{{ .Values.dcuResourceName }}", 56 | "ignoredByScheduler": true 57 | }, 58 | { 59 | "name": "{{ .Values.dcuResourceMem }}", 60 | "ignoredByScheduler": true 61 | }, 62 | { 63 | "name": "{{ .Values.dcuResourceCores }}", 64 | "ignoredByScheduler": true 65 | }, 66 | { 67 | "name": "{{ .Values.iluvatarResourceName }}", 68 | "ignoredByScheduler": true 69 | } 70 | ], 71 | "ignoreable": false 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/configmapnew.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . 
}}-newversion 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.yaml: | 10 | apiVersion: kubescheduler.config.k8s.io/v1beta2 11 | kind: KubeSchedulerConfiguration 12 | leaderElection: 13 | leaderElect: false 14 | profiles: 15 | - schedulerName: {{ .Values.schedulerName }} 16 | extenders: 17 | - urlPrefix: "https://127.0.0.1:443" 18 | filterVerb: filter 19 | bindVerb: bind 20 | nodeCacheCapable: true 21 | weight: 1 22 | httpTimeout: 30s 23 | enableHTTPS: true 24 | tlsConfig: 25 | insecure: true 26 | managedResources: 27 | - name: {{ .Values.resourceName }} 28 | ignoredByScheduler: true 29 | - name: {{ .Values.resourceMem }} 30 | ignoredByScheduler: true 31 | - name: {{ .Values.resourceCores }} 32 | ignoredByScheduler: true 33 | - name: {{ .Values.resourceMemPercentage }} 34 | ignoredByScheduler: true 35 | - name: {{ .Values.resourcePriority }} 36 | ignoredByScheduler: true 37 | - name: {{ .Values.mluResourceName }} 38 | ignoredByScheduler: true 39 | - name: {{ .Values.mluResourceMem }} 40 | ignoredByScheduler: true 41 | - name: {{ .Values.dcuResourceName }} 42 | ignoredByScheduler: true 43 | - name: {{ .Values.dcuResourceMem }} 44 | ignoredByScheduler: true 45 | - name: {{ .Values.dcuResourceCores }} 46 | ignoredByScheduler: true 47 | - name: {{ .Values.iluvatarResourceName }} 48 | ignoredByScheduler: true -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | rules: 12 | - apiGroups: 13 | - admissionregistration.k8s.io 14 | resources: 15 | #- validatingwebhookconfigurations 16 | - mutatingwebhookconfigurations 17 | verbs: 18 | - get 19 | - update 20 | {{- if .Values.podSecurityPolicy.enabled }} 21 | - apiGroups: ['extensions'] 22 | resources: ['podsecuritypolicies'] 23 | verbs: ['use'] 24 | resourceNames: 25 | - {{ include "4pd-vgpu.fullname" . }}-admission 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: {{ include "4pd-vgpu.fullname" . }}-admission 15 | subjects: 16 | - kind: ServiceAccount 17 | name: {{ include "4pd-vgpu.fullname" . }}-admission 18 | namespace: {{ .Release.Namespace | quote }} 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/job-createSecret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission-create 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | spec: 12 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 13 | # Alpha feature since k8s 1.12 14 | ttlSecondsAfterFinished: 0 15 | {{- end }} 16 | template: 17 | metadata: 18 | name: {{ include "4pd-vgpu.fullname" . }}-admission-create 19 | {{- if .Values.scheduler.patch.podAnnotations }} 20 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 21 | {{- end }} 22 | labels: 23 | {{- include "4pd-vgpu.labels" . | nindent 8 }} 24 | app.kubernetes.io/component: admission-webhook 25 | 4pd.io/webhook: ignore 26 | spec: 27 | {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} 28 | {{- if .Values.scheduler.patch.priorityClassName }} 29 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 30 | {{- end }} 31 | containers: 32 | - name: create 33 | {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} 34 | image: {{ .Values.scheduler.patch.imageNew }} 35 | {{- else }} 36 | image: {{ .Values.scheduler.patch.image }} 37 | {{- end }} 38 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 39 | args: 40 | - create 41 | - --cert-name=tls.crt 42 | - --key-name=tls.key 43 | {{- if .Values.scheduler.customWebhook.enabled }} 44 | - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "4pd-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.customWebhook.host}} 45 | {{- else }} 46 | - --host={{ printf "%s.%s.svc,127.0.0.1" (include "4pd-vgpu.scheduler" .) .Release.Namespace }} 47 | {{- end }} 48 | - --namespace={{ .Release.Namespace }} 49 | - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} 50 | restartPolicy: OnFailure 51 | serviceAccountName: {{ include "4pd-vgpu.fullname" . 
}}-admission 52 | {{- if .Values.scheduler.patch.nodeSelector }} 53 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 54 | {{- end }} 55 | {{- if .Values.scheduler.patch.tolerations }} 56 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 57 | {{- end }} 58 | securityContext: 59 | runAsNonRoot: true 60 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 61 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/job-patchWebhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission-patch 5 | annotations: 6 | "helm.sh/hook": post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | spec: 12 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 13 | # Alpha feature since k8s 1.12 14 | ttlSecondsAfterFinished: 0 15 | {{- end }} 16 | template: 17 | metadata: 18 | name: {{ include "4pd-vgpu.fullname" . }}-admission-patch 19 | {{- if .Values.scheduler.patch.podAnnotations }} 20 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 21 | {{- end }} 22 | labels: 23 | {{- include "4pd-vgpu.labels" . | nindent 8 }} 24 | app.kubernetes.io/component: admission-webhook 25 | 4pd.io/webhook: ignore 26 | spec: 27 | {{- include "4pd-vgpu.imagePullSecrets" . 
| nindent 6}} 28 | {{- if .Values.scheduler.patch.priorityClassName }} 29 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 30 | {{- end }} 31 | containers: 32 | - name: patch 33 | {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} 34 | image: {{ .Values.scheduler.patch.imageNew }} 35 | {{- else }} 36 | image: {{ .Values.scheduler.patch.image }} 37 | {{- end }} 38 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 39 | args: 40 | - patch 41 | - --webhook-name={{ include "4pd-vgpu.scheduler.webhook" . }} 42 | - --namespace={{ .Release.Namespace }} 43 | - --patch-validating=false 44 | - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} 45 | - --patch-failure-policy=Fail 46 | restartPolicy: OnFailure 47 | serviceAccountName: {{ include "4pd-vgpu.fullname" . }}-admission 48 | {{- if .Values.scheduler.patch.nodeSelector }} 49 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 50 | {{- end }} 51 | {{- if .Values.scheduler.patch.tolerations }} 52 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 53 | {{- end }} 54 | securityContext: 55 | runAsNonRoot: true 56 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 57 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.podSecurityPolicy.enabled }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | name: {{ include "4pd-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | spec: 13 | allowPrivilegeEscalation: false 14 | fsGroup: 15 | ranges: 16 | - max: 65535 17 | min: 1 18 | rule: MustRunAs 19 | requiredDropCapabilities: 20 | - ALL 21 | runAsUser: 22 | rule: MustRunAsNonRoot 23 | seLinux: 24 | rule: RunAsAny 25 | supplementalGroups: 26 | ranges: 27 | - max: 65535 28 | min: 1 29 | rule: MustRunAs 30 | volumes: 31 | - configMap 32 | - emptyDir 33 | - projected 34 | - secret 35 | - downwardAPI 36 | {{- end }} 37 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | rules: 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - secrets 16 | verbs: 17 | - get 18 | - create 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: Role 14 | name: {{ include "4pd-vgpu.fullname" . 
}}-admission 15 | subjects: 16 | - kind: ServiceAccount 17 | name: {{ include "4pd-vgpu.fullname" . }}-admission 18 | namespace: {{ .Release.Namespace | quote }} 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: "4pd-scheduler" 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: cluster-admin 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "4pd-vgpu.scheduler" . }} 15 | namespace: {{ .Release.Namespace | quote }} 16 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 8 | {{- if .Values.scheduler.service.labels }} 9 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 10 | {{- end }} 11 | {{- if .Values.scheduler.service.annotations }} 12 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 13 | {{- end }} 14 | spec: 15 | type: NodePort 16 | ports: 17 | - name: http 18 | port: {{ .Values.scheduler.service.httpPort }} 19 | targetPort: 443 20 | nodePort: {{ .Values.scheduler.service.schedulerPort }} 21 | protocol: TCP 22 | - name: monitor 23 | port: {{ .Values.scheduler.service.monitorPort }} 24 | targetPort: 9395 25 | nodePort: {{ .Values.scheduler.service.monitorPort }} 26 | protocol: TCP 27 | selector: 28 | app.kubernetes.io/component: 4pd-scheduler 29 | {{- include "4pd-vgpu.selectorLabels" . | nindent 4 }} 30 | 31 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | app.kubernetes.io/component: "4pd-scheduler" 8 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/webhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler.webhook" . 
}} 5 | webhooks: 6 | - admissionReviewVersions: 7 | - v1beta1 8 | clientConfig: 9 | {{- if .Values.scheduler.customWebhook.enabled }} 10 | url: https://{{ .Values.scheduler.customWebhook.host}}:{{.Values.scheduler.customWebhook.port}}{{.Values.scheduler.customWebhook.path}} 11 | {{- else }} 12 | service: 13 | name: {{ include "4pd-vgpu.scheduler" . }} 14 | namespace: {{ .Release.Namespace }} 15 | path: /webhook 16 | port: {{ .Values.scheduler.service.httpPort }} 17 | {{- end }} 18 | failurePolicy: {{ .Values.scheduler.mutatingWebhookConfiguration.failurePolicy }} 19 | matchPolicy: Equivalent 20 | name: vgpu.4pd.io 21 | namespaceSelector: 22 | matchExpressions: 23 | - key: 4pd.io/webhook 24 | operator: NotIn 25 | values: 26 | - ignore 27 | objectSelector: 28 | matchExpressions: 29 | - key: 4pd.io/webhook 30 | operator: NotIn 31 | values: 32 | - ignore 33 | reinvocationPolicy: Never 34 | rules: 35 | - apiGroups: 36 | - "" 37 | apiVersions: 38 | - v1 39 | operations: 40 | - CREATE 41 | resources: 42 | - pods 43 | scope: '*' 44 | sideEffects: None 45 | timeoutSeconds: 10 46 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/plugin-manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 26 | spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" 27 | ) 28 | 29 | // NewPluginManager creates an NVML-based plugin manager 30 | func NewPluginManager(config *util.DeviceConfig) (manager.Interface, error) { 31 | var err error 32 | switch *config.Flags.MigStrategy { 33 | case spec.MigStrategyNone: 34 | case spec.MigStrategySingle: 35 | case spec.MigStrategyMixed: 36 | default: 37 | return nil, fmt.Errorf("unknown strategy: %v", *config.Flags.MigStrategy) 38 | } 39 | 40 | nvmllib := nvml.New() 41 | 42 | deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) 43 | if err != nil { 44 | return nil, fmt.Errorf("invalid device list strategy: %v", err) 45 | } 46 | 47 | cdiEnabled := deviceListStrategies.IsCDIEnabled() 48 | 49 | cdiHandler, err := cdi.New( 50 | cdi.WithEnabled(cdiEnabled), 51 | cdi.WithDriverRoot(*config.Flags.Plugin.ContainerDriverRoot), 52 | cdi.WithTargetDriverRoot(*config.Flags.NvidiaDriverRoot), 53 | cdi.WithNvidiaCTKPath(*config.Flags.Plugin.NvidiaCTKPath), 54 | cdi.WithNvml(nvmllib), 55 | cdi.WithDeviceIDStrategy(*config.Flags.Plugin.DeviceIDStrategy), 56 | cdi.WithVendor("k8s.device-plugin.nvidia.com"), 57 | cdi.WithGdsEnabled(*config.Flags.GDSEnabled), 58 | cdi.WithMofedEnabled(*config.Flags.MOFEDEnabled), 59 | ) 60 | if err != nil { 61 | return nil, fmt.Errorf("unable to create cdi handler: %v", err) 62 | } 63 | 64 | m, err := manager.New( 65 | manager.WithNVML(nvmllib), 66 | manager.WithCDIEnabled(cdiEnabled), 67 | manager.WithCDIHandler(cdiHandler), 68 | manager.WithConfig(config), 69 | manager.WithFailOnInitError(*config.Flags.FailOnInitError), 70 | manager.WithMigStrategy(*config.Flags.MigStrategy), 71 | 
) 72 | if err != nil { 73 | return nil, fmt.Errorf("unable to create plugin manager: %v", err) 74 | } 75 | 76 | if err := m.CreateCDISpecFile(); err != nil { 77 | return nil, fmt.Errorf("unable to create cdi spec file: %v", err) 78 | } 79 | 80 | return m, nil 81 | } 82 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/watchers.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | 23 | "github.com/fsnotify/fsnotify" 24 | ) 25 | 26 | func newFSWatcher(files ...string) (*fsnotify.Watcher, error) { 27 | watcher, err := fsnotify.NewWatcher() 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | for _, f := range files { 33 | err = watcher.Add(f) 34 | if err != nil { 35 | watcher.Close() 36 | return nil, err 37 | } 38 | } 39 | 40 | return watcher, nil 41 | } 42 | 43 | func newOSWatcher(sigs ...os.Signal) chan os.Signal { 44 | sigChan := make(chan os.Signal, 1) 45 | signal.Notify(sigChan, sigs...) 
46 | 47 | return sigChan 48 | } 49 | -------------------------------------------------------------------------------- /cmd/scheduler/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package main 17 | 18 | import ( 19 | "net/http" 20 | 21 | "4pd.io/k8s-vgpu/pkg/device" 22 | "4pd.io/k8s-vgpu/pkg/version" 23 | 24 | "4pd.io/k8s-vgpu/pkg/scheduler" 25 | "4pd.io/k8s-vgpu/pkg/scheduler/config" 26 | "4pd.io/k8s-vgpu/pkg/scheduler/routes" 27 | "4pd.io/k8s-vgpu/pkg/util" 28 | "github.com/julienschmidt/httprouter" 29 | "github.com/spf13/cobra" 30 | klog "k8s.io/klog/v2" 31 | ) 32 | 33 | //var version string 34 | 35 | var ( 36 | sher *scheduler.Scheduler 37 | tlsKeyFile string 38 | tlsCertFile string 39 | rootCmd = &cobra.Command{ 40 | Use: "scheduler", 41 | Short: "kubernetes vgpu scheduler", 42 | Run: func(cmd *cobra.Command, args []string) { 43 | start() 44 | }, 45 | } 46 | ) 47 | 48 | func init() { 49 | rootCmd.Flags().SortFlags = false 50 | rootCmd.PersistentFlags().SortFlags = false 51 | 52 | rootCmd.Flags().StringVar(&config.HttpBind, "http_bind", "127.0.0.1:8080", "http server bind address") 53 | rootCmd.Flags().StringVar(&tlsCertFile, "cert_file", "", "tls cert file") 54 | rootCmd.Flags().StringVar(&tlsKeyFile, "key_file", "", "tls key file") 55 | 
rootCmd.Flags().StringVar(&config.SchedulerName, "scheduler-name", "", "the name to be added to pod.spec.schedulerName if not empty") 56 | rootCmd.Flags().Int32Var(&config.DefaultMem, "default-mem", 0, "default gpu device memory to allocate") 57 | rootCmd.Flags().Int32Var(&config.DefaultCores, "default-cores", 0, "default gpu core percentage to allocate") 58 | rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)") 59 | rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet()) 60 | rootCmd.AddCommand(version.VersionCmd) 61 | rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags()) 62 | } 63 | 64 | func start() { 65 | sher = scheduler.NewScheduler() 66 | sher.Start() 67 | defer sher.Stop() 68 | 69 | // start monitor metrics 70 | go sher.RegisterFromNodeAnnotatons() 71 | go initmetrics(config.MetricsBindAddress) 72 | 73 | // start http server 74 | router := httprouter.New() 75 | router.POST("/filter", routes.PredicateRoute(sher)) 76 | router.POST("/bind", routes.Bind(sher)) 77 | router.POST("/webhook", routes.WebHookRoute()) 78 | klog.Info("listen on ", config.HttpBind) 79 | if len(tlsCertFile) == 0 || len(tlsKeyFile) == 0 { 80 | if err := http.ListenAndServe(config.HttpBind, router); err != nil { 81 | klog.Fatal("Listen and Serve error, ", err) 82 | } 83 | } else { 84 | if err := http.ListenAndServeTLS(config.HttpBind, tlsCertFile, tlsKeyFile, router); err != nil { 85 | klog.Fatal("Listen and Serve error, ", err) 86 | } 87 | } 88 | } 89 | 90 | func main() { 91 | if err := rootCmd.Execute(); err != nil { 92 | klog.Fatal(err) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/build.sh: -------------------------------------------------------------------------------- 1 | protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. 
--go-grpc_opt=paths=source_relative noderpc/noderpc.proto 2 | go build 3 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "k8s.io/klog" 5 | ) 6 | 7 | //var addr = flag.String("listen-address", ":9394", "The address to listen on for HTTP requests.") 8 | 9 | //const shared_directory = "/usr/local/vgpu/shared" 10 | 11 | func main() { 12 | 13 | if err := ValidateEnvVars(); err != nil { 14 | klog.Fatalf("Failed to validate environment variables: %v", err) 15 | } 16 | cgroupDriver = 0 17 | errchannel := make(chan error) 18 | go serveInfo(errchannel) 19 | go initmetrics() 20 | go watchAndFeedback() 21 | for { 22 | err := <-errchannel 23 | klog.Errorf("failed to serve: %v", err) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/noderpc/noderpc.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | syntax = "proto3"; 16 | 17 | option go_package = "gitlab.4pd.io/vGPUmonitor"; 18 | option java_multiple_files = true; 19 | option java_package = "io.grpc.examples.helloworld"; 20 | option java_outer_classname = "HelloWorldProto"; 21 | 22 | package pluginrpc; 23 | 24 | // The greeting service definition. 25 | service NodeVGPUInfo { 26 | // Sends a greeting 27 | rpc GetNodeVGPU (GetNodeVGPURequest) returns (GetNodeVGPUReply) {} 28 | } 29 | 30 | // The sharedProcs contains the sharedRegion 31 | message shrregProcSlotT { 32 | int32 pid = 1; 33 | repeated uint64 used = 2; 34 | int32 status = 3; 35 | } 36 | 37 | // The sharedRegionT struct is the main struct for monitoring vgpu 38 | message sharedRegionT { 39 | int32 initializedFlag = 1; 40 | uint32 ownerPid = 2; 41 | uint32 sem = 3; 42 | repeated uint64 limit = 4; 43 | repeated uint64 sm_limit = 5; 44 | repeated shrregProcSlotT procs = 6; 45 | } 46 | 47 | message podusage { 48 | string poduuid = 1; 49 | sharedRegionT podvgpuinfo = 2; 50 | } 51 | 52 | // The request message containing the user's name. 
53 | message GetNodeVGPURequest { 54 | string ctruuid = 1; 55 | } 56 | 57 | // The response message containing the greetings 58 | message GetNodeVGPUReply { 59 | string nodeid = 1; 60 | repeated podusage nodevgpuinfo = 2; 61 | } 62 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/pathmonitor_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | func TestIsVaildPod(t *testing.T) { 11 | pods := &v1.PodList{ 12 | Items: []v1.Pod{ 13 | { 14 | ObjectMeta: metav1.ObjectMeta{ 15 | UID: "123", 16 | }, 17 | }, 18 | { 19 | ObjectMeta: metav1.ObjectMeta{ 20 | UID: "456", 21 | }, 22 | }, 23 | }, 24 | } 25 | 26 | cases := []struct { 27 | name string 28 | expected bool 29 | }{ 30 | { 31 | name: "123", 32 | expected: true, 33 | }, 34 | { 35 | name: "789", 36 | expected: false, 37 | }, 38 | } 39 | 40 | for _, c := range cases { 41 | if got := isVaildPod(c.name, pods); got != c.expected { 42 | t.Errorf("isVaildPod(%q) == %v, want %v", c.name, got, c.expected) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/testcollector/testcollector: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/cmd/vGPUmonitor/testcollector/testcollector -------------------------------------------------------------------------------- /cmd/vGPUmonitor/validation.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | ) 7 | 8 | var requiredEnvVars = map[string]bool{ 9 | "HOOK_PATH": true, 10 | "OTHER_ENV_VAR": false, 11 | } 12 | 13 | func ValidateEnvVars() error { 14 | for envVar, required := 
range requiredEnvVars { 15 | _, exists := os.LookupEnv(envVar) 16 | if required && !exists { 17 | return fmt.Errorf("required environment variable %s not set", envVar) 18 | } 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG NVIDIA_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | 5 | FROM $GOLANG_IMAGE AS GOBUILD 6 | ADD . /k8s-vgpu 7 | ARG GOPROXY=https://goproxy.cn,direct 8 | RUN apt-get update && apt-get -y install libhwloc-dev libdrm-dev 9 | RUN cd /k8s-vgpu && make all 10 | 11 | FROM nvidia/cuda:12.2.0-base-ubuntu22.04 12 | ENV NVIDIA_DISABLE_REQUIRE="true" 13 | ENV NVIDIA_VISIBLE_DEVICES=all 14 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 15 | 16 | ARG VERSION 17 | LABEL version="$VERSION" 18 | LABEL maintainer="opensource@4paradigm.com" 19 | COPY ./LICENSE /k8s-vgpu/LICENSE 20 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 21 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 22 | COPY ./lib /k8s-vgpu/lib 23 | COPY ./lib/mlu/cntopo /usr/bin/ 24 | COPY ./lib/mlu/libcndev.so /usr/lib/ 25 | 26 | ENV PATH="/k8s-vgpu/bin:${PATH}" 27 | ARG DEST_DIR 28 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 29 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # if [ $1 == "device-plugin" ]; then 19 | # cp -f /k8s-vgpu/lib/* $DEST_DIR/vgpu 20 | # fi 21 | exec "$@" -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks 2 | 3 | Three instances from ai-benchmark have been used to evaluate vGPU-device-plugin performance as follows 4 | 5 | | Test Environment | description | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | Test instance | description | 13 | | ------------- | :---------------------------------------------------------: | 14 | | nvidia-device-plugin | k8s + nvidia k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,without virtual device memory | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,with virtual device memory | 17 | 18 | Test Cases: 19 | 20 | | test id | case | type | params | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | 
training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | Test Result: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | To reproduce: 38 | 39 | 1. install k8s-vGPU-scheduler, and configure it properly 40 | 2. run benchmark job 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3. View the result by using kubectl logs 47 | 48 | ``` 49 | $ kubectl logs [pod id] -------------------------------------------------------------------------------- /docs/benchmark_cn.md: -------------------------------------------------------------------------------- 1 | ## 性能测试 2 | 3 | 在测试报告中,我们一共在下面五种场景都执行了ai-benchmark 测试脚本,并汇总最终结果: 4 | 5 | | 测试环境 | 环境描述 | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | 测试名称 | 测试用例 | 13 | | -------- | :------------------------------------------------: | 14 | | Nvidia-device-plugin | k8s + nvidia官方k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,无虚拟显存 | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,高负载,开启虚拟显存 | 17 | 18 | 测试内容 19 | 20 | | test id | 名称 | 类型 | 参数 | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | 
batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | 测试结果: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | 测试步骤: 38 | 39 | 1. 安装nvidia-device-plugin,并配置相应的参数 40 | 2. 运行benchmark任务 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3. 通过kubectl logs 查看结果 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/cambricon-mlu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support cambricon.com/mlu by implementing most device-sharing features as for NVIDIA GPUs**, including: 4 | 5 | ***MLU sharing***: Each task can allocate a portion of MLU instead of a whole MLU card, thus MLU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: MLUs can be allocated with a certain device memory size on certain types (i.e. 370), and it is guaranteed that usage does not exceed that boundary. 8 | 9 | ***MLU Type Specification***: You can specify which type of MLU to use or to avoid for a certain task, by setting "cambricon.com/use-mlutype" or "cambricon.com/nouse-mlutype" annotations. 10 | 11 | ***Very Easy to use***: You don't need to modify your task yaml to use our scheduler. All your MLU jobs will be automatically supported after installation. The only thing you need to do is tag the MLU node. 
12 | 13 | ## Prerequisites 14 | 15 | * neuware-mlu370-driver > 4.15.10 16 | * cntoolkit > 2.5.3 17 | 18 | ## Enabling MLU-sharing Support 19 | 20 | * Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/4paradigm/k8s-vgpu-scheduler#enabling-vgpu-support-in-kubernetes) 21 | 22 | * Tag MLU node with the following command 23 | ``` 24 | kubectl label node {mlu-node} mlu=on 25 | ``` 26 | 27 | ## Running MLU jobs 28 | 29 | Cambricon MMLUs can now be requested by a container 30 | using the `cambricon.com/mlunum` and `cambricon.com/mlumem` resource type: 31 | 32 | ``` 33 | apiVersion: v1 34 | kind: Pod 35 | metadata: 36 | name: gpu-pod 37 | spec: 38 | containers: 39 | - name: ubuntu-container 40 | image: ubuntu:18.04 41 | command: ["bash", "-c", "sleep 86400"] 42 | resources: 43 | limits: 44 | cambricon.com/mlunum: 1 # requesting 1 MLU 45 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 46 | - name: ubuntu-container1 47 | image: ubuntu:18.04 48 | command: ["bash", "-c", "sleep 86400"] 49 | resources: 50 | limits: 51 | cambricon.com/mlunum: 1 # requesting 1 MLU 52 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 53 | ``` 54 | 55 | ## Notes 56 | 57 | 1. Mlu-sharing in init container is not supported, pods with "combricon.com/mlumem" in init container will never be scheduled. 58 | 59 | 2. Mlu-sharing with containerd is not supported, the container may not start successfully. 60 | 61 | 3. 
Mlu-sharing can only be applied on MLU-370 62 | -------------------------------------------------------------------------------- /docs/cambricon-mlu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值,注意只有MLU-370型号的MLU支持可配显存 8 | 9 | ***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU 10 | 11 | ***方便易用***: 部署本组件后,你只需要给MLU节点打上tag即可使用MLU复用功能 12 | 13 | 14 | ## 节点需求 15 | 16 | * neuware-mlu370-driver > 4.15.10 17 | * cntoolkit > 2.5.3 18 | 19 | ## 开启MLU复用 20 | 21 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md#kubernetes开启vgpu支持) 22 | 23 | * 使用以下指令,为MLU节点打上label 24 | ``` 25 | kubectl label node {mlu-node} mlu=on 26 | ``` 27 | 28 | ## 运行MLU任务 29 | 30 | ``` 31 | apiVersion: v1 32 | kind: Pod 33 | metadata: 34 | name: gpu-pod 35 | spec: 36 | containers: 37 | - name: ubuntu-container 38 | image: ubuntu:18.04 39 | command: ["bash", "-c", "sleep 86400"] 40 | resources: 41 | limits: 42 | cambricon.com/mlunum: 1 # requesting 1 MLU 43 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 44 | - name: ubuntu-container1 45 | image: ubuntu:18.04 46 | command: ["bash", "-c", "sleep 86400"] 47 | resources: 48 | limits: 49 | cambricon.com/mlunum: 1 # requesting 1 MLU 50 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 51 | ``` 52 | 53 | ## 注意事项 54 | 55 | 1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 56 | 57 | 2. MLU复用功能目前不支持containerd,在containerd中使用会导致任务失败 58 | 59 | 3. 
只有MLU-370可以使用MLU复用功能 60 | -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | # Global Config 2 | 3 | you can customize your vGPU support by setting the following parameters using `-set`, for example 4 | 5 | ``` 6 | helm install vgpu-charts/vgpu vgpu --set devicePlugin.deviceMemoryScaling=5 ... 7 | ``` 8 | 9 | * `devicePlugin.service.schedulerPort:` 10 | Integer type, by default: 31998, scheduler webhook service nodePort. 11 | * `devicePlugin.deviceMemoryScaling:` 12 | Float type, by default: 1. The ratio for NVIDIA device memory scaling, can be greater than 1 (enable virtual device memory, experimental feature). For NVIDIA GPU with *M* memory, if we set `devicePlugin.deviceMemoryScaling` argument to *S*, vGPUs splitted by this GPU will totally get `S * M` memory in Kubernetes with our device plugin. 13 | * `devicePlugin.deviceSplitCount:` 14 | Integer type, by default: equals 10. Maximum tasks assigned to a simple GPU device. 15 | * `devicePlugin.migstrategy:` 16 | String type, "none" for ignoring MIG features or "mixed" for allocating MIG device by seperate resources. Default "none" 17 | * `devicePlugin.disablecorelimit:` 18 | String type, "true" for disable core limit, "false" for enable core limit, default: false 19 | * `scheduler.defaultMem:` 20 | Integer type, by default: 5000. The default device memory of the current task, in MB 21 | * `scheduler.defaultCores:` 22 | Integer type, by default: equals 0. Percentage of GPU cores reserved for the current task. If assigned to 0, it may fit in any GPU with enough device memory. If assigned to 100, it will use an entire GPU card exclusively. 
23 | * `resourceName:` 24 | String type, vgpu number resource name, default: "nvidia.com/gpu" 25 | * `resourceMem:` 26 | String type, vgpu memory size resource name, default: "nvidia.com/gpumem" 27 | * `resourceMemPercentage:` 28 | String type, vgpu memory fraction resource name, default: "nvidia.com/gpumem-percentage" 29 | * `resourceCores:` 30 | String type, vgpu cores resource name, default: "nvidia.com/cores" 31 | * `resourcePriority:` 32 | String type, vgpu task priority name, default: "nvidia.com/priority" 33 | 34 | # Container config envs 35 | 36 | * `GPU_CORE_UTILIZATION_POLICY:` 37 | String type, "default", "force", "disable" 38 | "default" means the dafault utilization policy 39 | "force" means the container will always limit the core utilization below "nvidia.com/gpucores" 40 | "disable" means the container will ignore the utilization limitation set by "nvidia.com/gpucores" during task execution 41 | 42 | * `ACTIVE_OOM_KILLER:` 43 | String type, "true","false" 44 | "true" means the task may be killed if exceeds the limitation set by "nvidia.com/gpumem" or "nvidia.com/gpumemory" 45 | "false" means the task will not be killed even it exceeds the limitation. 46 | 47 | 48 | -------------------------------------------------------------------------------- /docs/config_cn.md: -------------------------------------------------------------------------------- 1 | # 全局配置 2 | 3 | 你可以在安装过程中,通过`-set`来修改以下的客制化参数,例如: 4 | 5 | ``` 6 | helm install vgpu vgpu-charts/vgpu --set devicePlugin.deviceMemoryScaling=5 ... 
7 | ``` 8 | 9 | * `devicePlugin.deviceSplitCount:` 10 | 整数类型,预设值是10。GPU的分割数,每一张GPU都不能分配超过其配置数目的任务。若其配置为N的话,每个GPU上最多可以同时存在N个任务。 11 | * `devicePlugin.deviceMemoryScaling:` 12 | 浮点数类型,预设值是1。NVIDIA装置显存使用比例,可以大于1(启用虚拟显存,实验功能)。对于有*M*显存大小的NVIDIA GPU,如果我们配置`devicePlugin.deviceMemoryScaling`参数为*S*,在部署了我们装置插件的Kubenetes集群中,这张GPU分出的vGPU将总共包含 `S * M` 显存。 13 | * `devicePlugin.migStrategy:` 14 | 字符串类型,目前支持"none“与“mixed“两种工作方式,前者忽略MIG设备,后者使用专门的资源名称指定MIG设备,使用详情请参考mix_example.yaml,默认为"none" 15 | * `devicePlugin.disablecorelimit:` 16 | 字符串类型,"true"为关闭算力限制,"false"为启动算力限制,默认为"false" 17 | * `scheduler.defaultMem:` 18 | 整数类型,预设值为5000,表示不配置显存时使用的默认显存大小,单位为MB 19 | * `scheduler.defaultCores:` 20 | 整数类型(0-100),默认为0,表示默认为每个任务预留的百分比算力。若设置为0,则代表任务可能会被分配到任一满足显存需求的GPU中,若设置为100,代表该任务独享整张显卡 21 | * `resourceName:` 22 | 字符串类型, 申请vgpu个数的资源名, 默认: "nvidia.com/gpu" 23 | * `resourceMem:` 24 | 字符串类型, 申请vgpu显存大小资源名, 默认: "nvidia.com/gpumem" 25 | * `resourceMemPercentage:` 26 | 字符串类型,申请vgpu显存比例资源名,默认: "nvidia.com/gpumem-percentage" 27 | * `resourceCores:` 28 | 字符串类型, 申请vgpu算力资源名, 默认: "nvidia.com/cores" 29 | * `resourcePriority:` 30 | 字符串类型,表示申请任务的任务优先级,默认: "nvidia.com/priority" 31 | 32 | # 容器配置(在容器的环境变量中指定) 33 | 34 | * `GPU_CORE_UTILIZATION_POLICY:` 35 | 字符串类型,"default", "force", "disable" 36 | 代表容器算力限制策略, "default"为默认,"force"为强制限制算力,一般用于测试算力限制的功能,"disable"为忽略算力限制 37 | * `ACTIVE_OOM_KILLER:` 38 | 字符串类型,"true", "false" 39 | 代表容器是否会因为超用显存而被终止执行,"true"为会,"false"为不会 -------------------------------------------------------------------------------- /docs/dashboard.md: -------------------------------------------------------------------------------- 1 | ## Grafana Dashboard 2 | 3 | - You can load this dashboard json file [gpu-dashboard.json](./gpu-dashboard.json) 4 | 5 | - This dashboard also includes some NVIDIA DCGM metrics: 6 | 7 | [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) deploy:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` 8 | 9 | - use this 
prometheus custom metric configure: 10 | 11 | ```yaml 12 | - job_name: 'kubernetes-vgpu-exporter' 13 | kubernetes_sd_configs: 14 | - role: endpoints 15 | relabel_configs: 16 | - source_labels: [__meta_kubernetes_endpoints_name] 17 | regex: vgpu-device-plugin-monitor 18 | replacement: $1 19 | action: keep 20 | - source_labels: [__meta_kubernetes_pod_node_name] 21 | regex: (.*) 22 | target_label: node_name 23 | replacement: ${1} 24 | action: replace 25 | - source_labels: [__meta_kubernetes_pod_host_ip] 26 | regex: (.*) 27 | target_label: ip 28 | replacement: $1 29 | action: replace 30 | - job_name: 'kubernetes-dcgm-exporter' 31 | kubernetes_sd_configs: 32 | - role: endpoints 33 | relabel_configs: 34 | - source_labels: [__meta_kubernetes_endpoints_name] 35 | regex: dcgm-exporter 36 | replacement: $1 37 | action: keep 38 | - source_labels: [__meta_kubernetes_pod_node_name] 39 | regex: (.*) 40 | target_label: node_name 41 | replacement: ${1} 42 | action: replace 43 | - source_labels: [__meta_kubernetes_pod_host_ip] 44 | regex: (.*) 45 | target_label: ip 46 | replacement: $1 47 | action: replace 48 | ``` 49 | 50 | - reload promethues: 51 | 52 | ```bash 53 | curl -XPOST http://{promethuesServer}:{port}/-/reload 54 | ``` 55 | -------------------------------------------------------------------------------- /docs/dashboard_cn.md: -------------------------------------------------------------------------------- 1 | ## Grafana Dashboard 2 | 3 | - 你可以在 grafana 中导入此 [gpu-dashboard.json](./gpu-dashboard.json) 4 | - 此 dashboard 还包括一部分 NVIDIA DCGM 监控指标: 5 | 6 | [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter)部署:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` 7 | 8 | - 添加 prometheus 自定义的监控项: 9 | 10 | ```yaml 11 | - job_name: 'kubernetes-vgpu-exporter' 12 | kubernetes_sd_configs: 13 | - role: endpoints 14 | relabel_configs: 15 | - source_labels: [__meta_kubernetes_endpoints_name] 16 | regex: vgpu-device-plugin-monitor 17 | 
replacement: $1 18 | action: keep 19 | - source_labels: [__meta_kubernetes_pod_node_name] 20 | regex: (.*) 21 | target_label: node_name 22 | replacement: ${1} 23 | action: replace 24 | - source_labels: [__meta_kubernetes_pod_host_ip] 25 | regex: (.*) 26 | target_label: ip 27 | replacement: $1 28 | action: replace 29 | - job_name: 'kubernetes-dcgm-exporter' 30 | kubernetes_sd_configs: 31 | - role: endpoints 32 | relabel_configs: 33 | - source_labels: [__meta_kubernetes_endpoints_name] 34 | regex: dcgm-exporter 35 | replacement: $1 36 | action: keep 37 | - source_labels: [__meta_kubernetes_pod_node_name] 38 | regex: (.*) 39 | target_label: node_name 40 | replacement: ${1} 41 | action: replace 42 | - source_labels: [__meta_kubernetes_pod_host_ip] 43 | regex: (.*) 44 | target_label: ip 45 | replacement: $1 46 | action: replace 47 | ``` 48 | 49 | - 加载 promethues 配置: 50 | 51 | ```bash 52 | curl -XPOST http://{promethuesServer}:{port}/-/reload 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/develop/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | 4 | 5 | The architect of HAMi is shown in the figure above, It is organized in the form of "chart". 6 | 7 | - MutatingWebhook 8 | 9 | The MutatingWebhook checks the validity of each task, and set the "schedulerName" to "HAMi scheduler" if the resource requests have been recognized by HAMi 10 | If Not, the MutatingWebhook does nothing and pass this task to default-scheduler. 11 | 12 | - Scheduler 13 | 14 | HAMi support default kube-scheduler and volcano-scheduler, it implements an extender and register 'Filter' and 'Score' methods to deal with sharable devices. 15 | When a pod with sharable device request arrives, 'Filter' searches the cluster and returns a list of 'available' nodes. 'Score' scores each node 'Filter' returned, and pick the highest one to host the pod. 
It patches the schedule decision on corresponding pod annotations, for the detailed protocol, see protocol.md 16 | 17 | - DevicePlugin 18 | 19 | When the schedule decision is made, scheduler calls devicePlugin on that node to generate environment variables and mounts according to pod annotations. 20 | Please note that, the DP used here is a customized version, you need to install according to README document with that device. Most officaial DP will not fit in HAMi, and will result in unexpected behaviour 21 | 22 | - InContainer Control 23 | 24 | The implementation of in-container hard limit is different for diffent devices. For example, HAMi-Core is responsible for NVIDIA devices. libnvidia-control.so is responsible for iluvatar devices, etc. HAMi needs to pass the correct environment variables in order for it to operate. 25 | 26 | 27 | 28 | In summary, The flowchart of pod is descirbed as the figure above. 29 | -------------------------------------------------------------------------------- /docs/develop/imgs/flowchart.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/flowchart.jpeg -------------------------------------------------------------------------------- /docs/develop/imgs/offline_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/offline_validation.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/protocol_pod.png 
-------------------------------------------------------------------------------- /docs/develop/imgs/protocol_register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/protocol_register.png -------------------------------------------------------------------------------- /docs/develop/protocol.md: -------------------------------------------------------------------------------- 1 | # Protocol 2 | 3 | ## Device Register 4 | 5 | 6 | 7 | HAMi needs to know the spec of each AI devices in the cluster in order to schedule properly. During device registration, device-plugin needs to keep patching the spec of each device into node annotations every 30 seconds, in the format of the following: 8 | 9 | ``` 10 | HAMi.sh/node-handshake-{device-type}: Reported_{device_node_current_timestamp} 11 | HAMi.sh/node-register-{deivce-type}: {Device 1}:{Device2}:...:{Device N} 12 | ``` 13 | 14 | The definiation of each device is in the following format: 15 | ``` 16 | {Device UUID},{device split count},{device memory limit},{device core limit},{device type},{device numa},{healthy} 17 | ``` 18 | 19 | An example is shown below: 20 | ``` 21 | HAMi.sh/node-handshake-nvidia: Reported 2024-01-23 04:30:04.434037031 +0000 UTC m=+1104711.777756895 22 | HAMi.sh/node-handshake-mlu: Requesting_2024.01.10 04:06:57 23 | HAMi.sh/node-mlu-register: MLU-45013011-2257-0000-0000-000000000000,10,23308,0,MLU-MLU370-X4,0,false:MLU-54043011-2257-0000-0000-000000000000,10,23308,0, 24 | HAMi.sh/node-nvidia-register: GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true: 25 | 26 | ``` 27 | In this example, this node has two different AI devices, 2 Nvidia-V100 GPUs, and 2 Cambircon 370-X4 MLUs 28 | 29 | Note that a device node may become 
unavailable due to hardware or network failure, if a node hasn't registered in last 5 minutes, scheduler will mark that node as 'unavailable'. 30 | 31 | Since system clock on scheduler node and 'device' node may not align properly, scheduler node will patch the following device node annotations every 30s 32 | 33 | ``` 34 | HAMi.sh/node-handshake-{device-type}: Requesting_{scheduler_node_current_timestamp} 35 | ``` 36 | 37 | If HAMi.sh/node-handshake annotations remains in "Requesting_xxxx" and {scheduler current timestamp} > 5 mins + {scheduler timestamp in annotations}, then this device on that node will be marked "unavailable" in scheduler. 38 | 39 | 40 | ## Schedule Decision 41 | 42 | 43 | 44 | HAMi scheduler needs to patch schedule decisions into pod annotations, in the format of the following: 45 | 46 | ``` 47 | HAMi.sh/devices-to-allocate:{ctr1 request}:{ctr2 request}:...{Last ctr request}: 48 | HAMi.sh/device-node: {schedule decision node} 49 | HAMi.sh/device-schedule-time: {timestamp} 50 | ``` 51 | 52 | each container request is in the following format: 53 | 54 | ``` 55 | {device UUID},{device type keywork},{device memory request}:{device core request} 56 | ``` 57 | 58 | for example: 59 | 60 | A pod with 2 containers, first container requests 1 GPU with 3G device Memory, second container requests 1 GPU with 5G device Memory, then the patched annotations will be like the 61 | 62 | ``` 63 | HAMi.sh/devices-to-allocate: GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,3000,0:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,5000,0: 64 | HAMi.sh/vgpu-node: node67-4v100 65 | HAMi.sh/vgpu-time: 1705054796 66 | ``` 67 | 68 | -------------------------------------------------------------------------------- /docs/develop/roadmap.md: -------------------------------------------------------------------------------- 1 | # roadmap 2 | 3 | | feature | description | release | Example | Example expected behaviour | 4 | 
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------|------------| 5 | | Kubernetes schedule layer | Support Resource Quota for vgpu-memory | v3.2.0 | "requests.nvidia.com/gpu-memory: 30000" in ResourceQuota | Pods in this namespace can allocate up to 30G device memory in this namespace | 6 | | | Support Best-fit, idle-first, Numa-first Schedule Policy | v3.2.0 | add "scheduler policy configmap" | execute schedule policy according to configMap | 7 | | | Support k8s 1.28 version with compatable to v1.16 | v3.1.0 | | | 8 | | Add more Heterogeneous AI computing device | HuaWei Ascend Support | v3.1.0 | | | 9 | | | Iluvatar GPU support | v3.1.0 | | | 10 | | |Teco DPU Support | v3.2.0 | | | 11 | -------------------------------------------------------------------------------- /docs/develop/tasklist.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | 3 | ## Support Moore threads MTT S4000 4 | 5 | ``` 6 | resources: 7 | requests: 8 | mthreads.com/gpu: ${num} 9 | mthreads.com/vcuda-core: ${core} 10 | mthreads.com/vcuda-memory: ${mem} 11 | limits: 12 | mthreads.com/gpu: ${num} 13 | mthreads.com/vcuda-core: ${core} 14 | mthreads.com/vcuda-memory: ${mem} 15 | ``` 16 | 17 | ## Support Birentech Model 110 18 | 19 | ``` 20 | resources: 21 | requests: 22 | birentech.com/gpu: ${num} 23 | birentech.com/vcuda-core: ${core} 24 | birentech.com/vcuda-memory: ${mem} 25 | limits: 26 | birentech.com/gpu: ${num} 27 | birentech.com/vcuda-core: ${core} 28 | birentech.com/vcuda-memory: ${mem} 29 | ``` 30 | 31 | ## Support iluvatar MR-V100 32 | 33 | ``` 34 | resources: 35 | requests: 36 | iluvatar.ai/gpu: ${num} 37 | iluvatar.ai/vcuda-core: ${core} 38 | iluvatar.ai/vcuda-memory: ${mem} 39 | limits: 40 | iluvatar.ai/gpu: ${num} 41 | iluvatar.ai/vcuda-core: ${core} 42 | iluvatar.ai/vcuda-memory: ${mem} 43 | ``` 44 
| 45 | ## Support HuaWei Ascend 910B device 46 | 47 | ``` 48 | resources: 49 | requests: 50 | ascend.com/npu: ${num} 51 | ascend.com/npu-core: ${core} 52 | ascend.com/npu-mem: ${mem} 53 | limits: 54 | ascend.com/npu: ${num} 55 | ascend.com/npu-core: ${core} 56 | ascend.com/npu-mem: ${mem} 57 | ``` 58 | 59 | ## Support resourceQuota for Kubernetes 60 | 61 | Description: ResourceQuota is frequently used in kubernetes namespace. Since the number of virtual devices doesn't mean anything, we need to support the limitation in deviceMemory. 62 | 63 | For example, the following resourceQuota 64 | ``` 65 | cat < compute-resources.yaml 66 | apiVersion: v1 67 | kind: ResourceQuota 68 | metadata: 69 | name: compute-resources 70 | spec: 71 | hard: 72 | requests.cpu: "1" 73 | requests.memory: 1Gi 74 | limits.cpu: "2" 75 | limits.memory: 2Gi 76 | requests.nvidia.com/gpu-memory: 30000 77 | EOF 78 | ``` 79 | 80 | with the following command 81 | ``` 82 | kubectl create -f ./compute-resources.yaml--namespace=myspace 83 | ``` 84 | 85 | will limit the maxinum device memory allocated to namespace 'myspace' to 30G 86 | 87 | ## Support multiple schedule policies 88 | 89 | Description: HAMi needs to support multiple schedule policies, to provide meets the need in complex senarios, a pod can select a schedule policy in annotations field. 
90 | 91 | The effect of each schedule policy is shown in the table below 92 | 93 | | Schedule Policy | Effect | 94 | | -------- | ------- | 95 | | best-fit | the fewer device memory remains, the higher score | 96 | | idle-first | idle GPU has higher score | 97 | | numa-first | for multiple GPU allocations, GPUs on the same numa have higher score | 98 | 99 | 100 | For example, if a pod want to select a 'best-fit' schedule policy, it can specify .metadata.annotations as the code below: 101 | 102 | ``` 103 | apiVersion: v1 104 | kind: Pod 105 | metadata: 106 | name: gpu-pod 107 | annotations: 108 | nvidia.com/schedule-policy: "best-fit" 109 | spec: 110 | containers: 111 | - name: ubuntu-container 112 | image: ubuntu:18.04 113 | command:["bash","-c","sleep 86400"] 114 | resources: 115 | limits: 116 | nvidia.com/gpu: 2 # requesting 2 VGPUs 117 | ``` 118 | 119 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support hygon.com/dcu by implementing most device-sharing features as nvidia-GPU**, including: 4 | 5 | ***DCU sharing***: Each task can allocate a portion of DCU instead of a whole DCU card, thus DCU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: DCUs can be allocated with certain device memory size on certain type(i.e Z100) and have made it that it does not exceed the boundary. 8 | 9 | ***Device compute core limitation***: DCUs can be allocated with certain percentage of device core(i.e hygon.com/dcucores:60 indicate this container uses 60% compute cores of this device) 10 | 11 | ***DCU Type Specification***: You can specify which type of DCU to use or to avoid for a certain task, by setting "hygon.com/use-dcutype" or "hygon.com/nouse-dcutype" annotations. 
12 | 13 | ## Prerequisites 14 | 15 | * dtk driver with virtualization enabled(i.e dtk-22.10.1-vdcu), try the following command to see if your driver has virtualization ability 16 | 17 | ``` 18 | hdmcli -show-device-info 19 | ``` 20 | 21 | If this command can't be found, then you should contact your device provider to aquire a vdcu version of dtk driver. 22 | 23 | * The absolute path of dtk driver on each dcu node must be the same(i.e placed in /root/dtk-driver) 24 | 25 | ## Enabling DCU-sharing Support 26 | 27 | * Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/4paradigm/k8s-vgpu-scheduler#enabling-vgpu-support-in-kubernetes), please be note that, you should set your dtk driver directory using --set devicePlugin.hygondriver={your dtk driver path on each nodes}, for example: 28 | 29 | ``` 30 | helm install vgpu vgpu-charts/vgpu --set devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 31 | ``` 32 | 33 | * Tag DCU node with the following command 34 | ``` 35 | kubectl label node {dcu-node} dcu=on 36 | ``` 37 | 38 | ## Running DCU jobs 39 | 40 | Hygon DCUs can now be requested by a container 41 | using the `hygon.com/dcunum` , `hygon.com/dcumem` and `hygon.com/dcucores` resource type: 42 | 43 | ``` 44 | apiVersion: v1 45 | kind: Pod 46 | metadata: 47 | name: alexnet-tf-gpu-pod-mem 48 | labels: 49 | purpose: demo-tf-amdgpu 50 | spec: 51 | containers: 52 | - name: alexnet-tf-gpu-container 53 | image: pytorch:resnet50 54 | workingDir: /root 55 | command: ["sleep","infinity"] 56 | resources: 57 | limits: 58 | hygon.com/dcunum: 1 # requesting a GPU 59 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 60 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 61 | 62 | ``` 63 | 64 | ## Enable vDCU inside container 65 | 66 | You need to enable vDCU inside container in order to use it. 
67 | ``` 68 | source /opt/hygondriver/env.sh 69 | ``` 70 | 71 | check if you have successfully enabled vDCU by using following command 72 | 73 | ``` 74 | hdmcli -show-device-info 75 | ``` 76 | 77 | If you have an output like this, then you have successfully enabled vDCU inside container. 78 | 79 | ``` 80 | Device 0: 81 | Actual Device: 0 82 | Compute units: 60 83 | Global memory: 2097152000 bytes 84 | ``` 85 | 86 | Launch your DCU tasks like you usually do 87 | 88 | ## Notes 89 | 90 | 1. DCU-sharing in init container is not supported, pods with "hygon.com/dcumem" in init container will never be scheduled. 91 | 92 | 2. Only one vdcu can be aquired per container. If you want to mount multiple dcu devices, then you shouldn't set `hygon.com/dcumem` or `hygon.com/dcucores` 93 | 94 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU 12 | 13 | ## 节点需求 14 | 15 | * 带有虚拟化功能的dtk驱动(例如dtk-22.10.1-vdcu),相关组件可以在海光开发者社区获取,或联系您的设备提供商 16 | 17 | * 在宿主机上执行hdmcli -show-device-info获取设备信息,若能成功获取,则代表配置成功。若找不到指令,说明您安装的驱动不带有虚拟化功能,请联系厂商获取代虚拟化功能的dtk驱动 18 | 19 | * 需要将各个DCU节点上的dtk驱动路径放置在统一的绝对路径上,例如均放置在/root/dtk-driver 20 | 21 | ## 开启DCU复用 22 | 23 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md#kubernetes开启vgpu支持),需要注意的是,必须使用--set devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" 手动指定dtk驱动的绝对路径 24 | 25 | ``` 26 | helm install vgpu vgpu-charts/vgpu --set 
devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 27 | ``` 28 | 29 | * 使用以下指令,为DCU节点打上label 30 | ``` 31 | kubectl label node {dcu-node} dcu=on 32 | ``` 33 | 34 | ## 运行DCU任务 35 | 36 | ``` 37 | apiVersion: v1 38 | kind: Pod 39 | metadata: 40 | name: alexnet-tf-gpu-pod-mem 41 | labels: 42 | purpose: demo-tf-amdgpu 43 | spec: 44 | containers: 45 | - name: alexnet-tf-gpu-container 46 | image: pytorch:resnet50 47 | workingDir: /root 48 | command: ["sleep","infinity"] 49 | resources: 50 | limits: 51 | hygon.com/dcunum: 1 # requesting a GPU 52 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 53 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 54 | 55 | ``` 56 | 57 | ## 容器内开启虚拟DCU功能 58 | 59 | 使用vDCU首先需要激活虚拟环境 60 | ``` 61 | source /opt/hygondriver/env.sh 62 | ``` 63 | 64 | 随后,使用hdmcli指令查看虚拟设备是否已经激活 65 | ``` 66 | hdmcli -show-device-info 67 | ``` 68 | 69 | 若输出如下,则代表虚拟设备已经成功激活 70 | ``` 71 | Device 0: 72 | Actual Device: 0 73 | Compute units: 60 74 | Global memory: 2097152000 bytes 75 | ``` 76 | 77 | 接下来正常启动DCU任务即可 78 | 79 | ## 注意事项 80 | 81 | 1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 82 | 83 | 2. 每个容器最多只能使用一个虚拟DCU设备, 如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 84 | -------------------------------------------------------------------------------- /docs/offline-install.md: -------------------------------------------------------------------------------- 1 | # Offline-install Maunal 2 | 3 | For some cluster that don't have external web access, you can install HAMi by the following step: 4 | 5 | 1. Refer to [README.md](../README.md) until step 'Install and Uninstall' 6 | 7 | 2. copy the source of project into the master node in your cluster, placed in a path like "/root/HAMi" 8 | 9 | 3. 
pull the following images and save them into a '.tar' file, then move it into the master node in your cluster 10 | 11 | Image list: 12 | ``` 13 | 4pdosc/k8s-vdevice:{HAMi version} 14 | docker.io/jettech/kube-webhook-certgen:v1.5.2 15 | liangjw/kube-webhook-certgen:v1.1.1 16 | registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} 17 | ``` 18 | 19 | ``` 20 | docker pull {iamge} && docker save {image_name} -o {image_name}.tar 21 | ``` 22 | 23 | 4. Load these images using docker load, tag these images with your registry, and push them into your registry 24 | 25 | ``` 26 | docker load -i {image_name}.tar 27 | docker tag 4pdosc/k8s-vdevice:{HAMi version} {registry}/k8s-vdevice:{HAMi version} 28 | docker push {registry}/k8s-vdevice:{HAMi version} 29 | ``` 30 | 31 | 5. edit the following field in /root/HAMi/chart/vgpu/values.yaml to your image pushed 32 | 33 | ``` 34 | scheduler.kubeScheduler.image 35 | scheduler.extender.image 36 | scheduler.patch.image 37 | scheduler.patch.imageNew 38 | scheduler.devicePlugin.image 39 | scheduler.devicePlugin.monitorimage 40 | ``` 41 | 42 | 6. Execute the following command in your /root/HAMi/chart folder 43 | 44 | ``` 45 | helm install vgpu vgpu --set scheduler.kubeScheduler.imageTag={你的k8s server版本} -n kube-system 46 | ``` 47 | 48 | 7. 
Verify your installation 49 | 50 | execute the following command 51 | ``` 52 | kubectl get pods -n kube-system 53 | ``` 54 | 55 | If you can see both the 'device-plugin' and 'schduler' running, then HAMi is installed successfully, as the figure shown below: 56 | 57 | 58 | -------------------------------------------------------------------------------- /example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 1 # requesting 2 vGPUs 13 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer) 14 | -------------------------------------------------------------------------------- /examples/hygon/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | labels: 6 | purpose: demo-tf-amdgpu 7 | spec: 8 | containers: 9 | - name: alexnet-tf-gpu-container 10 | image: pytorch:resnet50 11 | workingDir: /root 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | hygon.com/dcunum: 1 # requesting a GPU 16 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 17 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 18 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/nouse-dcutype: "Z100L" # Specify the card type for this job, use comma to seperate, will not launch job on non-specified card 7 | #In this example, we don't want this container to run on 
Z100L 8 | purpose: demo-tf-amdgpu 9 | spec: 10 | containers: 11 | - name: alexnet-tf-gpu-container 12 | image: pytorch:resnet50 13 | workingDir: /root 14 | command: ["sleep","infinity"] 15 | resources: 16 | limits: 17 | hygon.com/dcunum: 1 # requesting a DCU 18 | hygon.com/dcumem: 2000 19 | hygon.com/dcucores: 60 20 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/use-dcutype: "Z100" # Specify the card type for this job, use comma to separate; the job will not launch on non-specified cards 7 | #In this example, we want to run this job on Z100 8 | labels: 9 | purpose: demo-tf-amdgpu 10 | spec: 11 | containers: 12 | - name: alexnet-tf-gpu-container 13 | image: pytorch:resnet50 14 | workingDir: /root 15 | command: ["sleep","infinity"] 16 | resources: 17 | limits: 18 | hygon.com/dcunum: 1 # requesting a DCU 19 | hygon.com/dcumem: 2000 20 | hygon.com/dcucores: 60 21 | -------------------------------------------------------------------------------- /examples/mlu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | cambricon.com/mlunum: 1 # requesting 1 MLU 13 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory -------------------------------------------------------------------------------- /examples/mlu/multi-pods.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 2 
9 | selector: # define how the deployment finds the pods it manages 10 | matchLabels: 11 | app: binpack-1 12 | template: # define the pods specifications 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | lifecycle: # required 23 | #postStart: # required 24 | # exec: # required 25 | # command: ["/bin/sh", "-c", "/usr/bin/smlu-containerd"] # required 26 | resources: 27 | limits: 28 | cambricon.com/mlunum: 1 29 | cambricon.com/mlumem: 10240 # set MLU device memory 30 | - name: c-2 31 | image: ubuntu:18.04 32 | command: ["sleep"] 33 | args: ["100000"] 34 | lifecycle: # required 35 | #postStart: # required 36 | # exec: # required 37 | # command: ["/bin/sh", "-c", "/usr/bin/smlu-containerd"] # required 38 | resources: 39 | limits: 40 | cambricon.com/mlunum: 1 41 | cambricon.com/mlumem: 10240 # set MLU device memory -------------------------------------------------------------------------------- /examples/mlu/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | cambricon.com/nouse-mlutype: "270,370" # Specify the blacklist card types for this job, use comma to separate; the job will not launch on the specified cards 7 | #In this example, we don't want this job to run on 270 or 370 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | cambricon.com/mlunum: 1 16 | cambricon.com/mlumem: 10240 17 | -------------------------------------------------------------------------------- /examples/mlu/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | cambricon.com/use-mlutype: "270" # Specify the card type for this job, use comma to separate, will not launch job on
non-specified card 7 | #In this example, we want to run this job on 270 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | cambricon.com/mlunum: 1 16 | cambricon.com/mlumem: 10240 17 | -------------------------------------------------------------------------------- /examples/nvidia/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory (Optional,Integer) 14 | nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/default_use_legacy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | -------------------------------------------------------------------------------- /examples/nvidia/example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | #nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory 14 | nvidia.com/gpumem-percentage: 50 #Each vGPU contains 50% 
device memory of that GPU. Cannot be used with nvidia.com/gpumem 15 | #nvidia.com/gpucores: 90 # Utilization limit of this vGPU is set to 90% of total GPU utilization 16 | #nvidia.com/priority: 0 # We only have two priority classes, 0(high) and 1(low), default: 1 17 | #The utilization of high priority task won't be limited to resourceCores unless sharing GPU node with other high priority tasks. 18 | #The utilization of low priority task won't be limited to resourceCores if no other tasks sharing its GPU. 19 | - name: ubuntu-container0 20 | image: ubuntu:18.04 21 | command: ["bash", "-c", "sleep 86400"] 22 | - name: ubuntu-container1 23 | image: ubuntu:18.04 24 | command: ["bash", "-c", "sleep 86400"] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 2 # requesting 2 vGPUs 28 | nvidia.com/gpumem: 2000 29 | #nvidia.com/gpucores: 90 30 | 31 | -------------------------------------------------------------------------------- /examples/nvidia/mig_example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/mig-3g.20gb: 1 # requesting one 3g.20gb MIG instance 13 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | nvidia.com/nouse-gputype: "1080,2080" # Specify the blacklist card type for this job, use comma to separate, will not launch job on specified card 7 | # In this job, we don't want our job to run on 1080(include 1080Ti) or 2080(include 2080Ti) type of card. 
8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 2 # requesting 2 vGPUs 16 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | nvidia.com/use-gputype: "A100,V100" # Specify the card type for this job, use comma to separate, will not launch job on non-specified card 7 | #In this example, we want to run this job on A100 or V100 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 2 # requesting 2 vGPUs 16 | -------------------------------------------------------------------------------- /examples/nvidia/use_exclusive_card.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem-percentage: 100 # Each vGPU contains 100% of the entire GPU device memory (Optional,Integer) 14 | nvidia.com/gpucores: 100 # Each vGPU uses 100% of the entire GPU cores(Optional,Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_memory_fraction.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | 
limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem-percentage: 50 # Each vGPU contains 50% device memory of that GPU (Optional,Integer) 14 | nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) 15 | -------------------------------------------------------------------------------- /hack/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | set -e 18 | [[ -z ${SHORT_VERSION} ]] && SHORT_VERSION=$(git rev-parse --abbrev-ref HEAD) 19 | [[ -z ${COMMIT_CODE} ]] && COMMIT_CODE=$(git describe --abbrev=100 --always) 20 | 21 | export SHORT_VERSION 22 | export COMMIT_CODE 23 | export VERSION="${SHORT_VERSION}-${COMMIT_CODE}" 24 | export LATEST_VERSION="latest" 25 | export GOLANG_IMAGE="golang:1.21-bullseye" 26 | export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" 27 | export DEST_DIR="/usr/local" 28 | 29 | IMAGE=${IMAGE-"4pdosc/k8s-vdevice"} 30 | 31 | function go_build() { 32 | [[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}') 33 | make -j$J 34 | } 35 | 36 | function docker_build() { 37 | docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . 
38 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" 39 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" 40 | } 41 | 42 | function docker_push() { 43 | #docker push "${IMAGE}:${VERSION}" 44 | docker push "${IMAGE}:${SHORT_VERSION}" 45 | docker push "${IMAGE}:${LATEST_VERSION}" 46 | } 47 | 48 | go_build 49 | docker_build 50 | docker_push -------------------------------------------------------------------------------- /hack/update-generated-api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | ROOT_DIR=$(dirname "${BASH_SOURCE[0]}")/.. 
19 | protoc -I${ROOT_DIR} --gofast_out=plugins=grpc:${ROOT_DIR} ${ROOT_DIR}/pkg/api/*.proto -------------------------------------------------------------------------------- /imgs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/arch.png -------------------------------------------------------------------------------- /imgs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark.png -------------------------------------------------------------------------------- /imgs/benchmark_inf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark_inf.png -------------------------------------------------------------------------------- /imgs/benchmark_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark_train.png -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/example.png -------------------------------------------------------------------------------- /imgs/hard_limit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/hard_limit.jpg 
-------------------------------------------------------------------------------- /lib/mlu/cntopo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/cntopo -------------------------------------------------------------------------------- /lib/mlu/libcndev.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/libcndev.so -------------------------------------------------------------------------------- /lib/mlu/smlu-containerd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/smlu-containerd -------------------------------------------------------------------------------- /lib/nvidia/ld.so.preload: -------------------------------------------------------------------------------- 1 | /usr/local/vgpu/libvgpu.so -------------------------------------------------------------------------------- /lib/nvidia/libvgpu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/nvidia/libvgpu.so -------------------------------------------------------------------------------- /pkg/api/device_register.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-gogo. DO NOT EDIT. 2 | // source: pkg/api/device_register.proto 3 | 4 | package api 5 | 6 | 7 | // Reference imports to suppress errors if they are not otherwise used. 
8 | 9 | // This is a compile-time assertion to ensure that this generated file 10 | // is compatible with the proto package it is being compiled against. 11 | // A compilation error at this line likely means your copy of the 12 | // proto package needs to be updated. 13 | type DeviceInfo struct { 14 | Index int 15 | Id string 16 | Count int32 17 | Devmem int32 18 | Devcore int32 19 | Type string 20 | Numa int 21 | Health bool 22 | } 23 | -------------------------------------------------------------------------------- /pkg/api/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// initCoreUsage returns an all-free core-usage mask for req cores,
// encoded as a hex string with one digit (covering 4 cores) per character.
func initCoreUsage(req int) string {
	res := ""
	for i := 0; i < req/4; i++ {
		res += "0"
	}
	return res
}

// addCoreUsage merges two hex-encoded core-usage masks by OR-ing them
// digit by digit and returns the merged mask, truncated to the shorter
// of the two inputs.
//
// Fixes over the previous version: the loop guard `tot[i] != 0` was
// always true for the bytes of a hex string (they are never NUL), and
// c[i] was indexed without a bounds check, so a shorter c caused an
// index-out-of-range panic; ParseInt errors were silently discarded.
func addCoreUsage(tot string, c string) (string, error) {
	res := ""
	for i := 0; i < len(tot) && i < len(c); i++ {
		left, err := strconv.ParseInt(string(tot[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, tot, err)
		}
		right, err := strconv.ParseInt(string(c[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, c, err)
		}
		res = fmt.Sprintf("%s%x", res, left|right)
	}
	fmt.Println("tot=", tot, "c=", c, "res=", res)
	return res, nil
}

// byteAlloc allocates up to req cores out of the 4 cores described by
// the hex digit b (a set bit marks a busy core). It returns the
// allocation bitmask for this digit and the number of requested cores
// still unallocated.
func byteAlloc(b int, req int) (int, int) {
	if req == 0 {
		return 0, 0
	}
	remains := req
	// Fixed-width 4-bit rendering of the digit; a '0' bit is a free core.
	bits := fmt.Sprintf("%04b", b)
	res := 0
	for i := 0; i < len(bits); i++ {
		res *= 2
		if bits[i] == '0' && remains > 0 {
			remains--
			res++
		}
	}
	return res, remains
}

// allocCoreUsage picks req free cores from the hex-encoded usage mask
// tot and returns the allocation as a mask of the same length.
//
// NOTE(review): as in the original, a request larger than the number of
// free cores returns a partial allocation without error — callers must
// check capacity beforehand; confirm before tightening this. The
// always-true `tot[i] != 0` guard was removed and parse errors are now
// reported instead of being ignored.
func allocCoreUsage(tot string, req int) (string, error) {
	res := ""
	remains := req
	for i := 0; i < len(tot); i++ {
		left, err := strconv.ParseInt(string(tot[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, tot, err)
		}
		var alloc int
		alloc, remains = byteAlloc(int(left), remains)
		res = fmt.Sprintf("%s%x", res, alloc)
	}
	return res, nil
}
-------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/corealloc_test.go: -------------------------------------------------------------------------------- 1 | package dcu 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "gotest.tools/v3/assert" 8 | ) 9 | 10 | func TestInit(t *testing.T) { 11 | str := initCoreUsage(60) 12 | t.Log("str=", str) 13 | assert.Equal(t, strings.Compare(str, "000000000000000"), 0) 14 | } 15 | 16 | func TestAddCoreUsage(t *testing.T) { 17 | str := initCoreUsage(60) 18 | str1 := "abcde000ad00012" 19 | res, _ := addCoreUsage(str, str1) 20 | t.Log("res1=", res) 21 | assert.Equal(t, strings.Compare(res, str1), 0) 22 | str1 = "50200fff4000000" 23 | res, _ = addCoreUsage(res, str1) 24 | t.Log("res1=", res) 25 | assert.Equal(t, strings.Compare(res, "fbedefffed00012"), 0) 26 | } 27 | 28 | func TestAllocCoreUsage(t *testing.T) { 29 | str1 := "50200fff4000000" 30 | res, _ := allocCoreUsage(str1, 16) 31 | t.Log("res=", res) 32 | assert.Equal(t, strings.Compare(res, "afdfe0000000000"), 0) 33 | str1 = "abcde000ad00012" 34 | res, _ = allocCoreUsage(str1, 32) 35 | t.Log("res=", res) 36 | } 37 | -------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/hwloc/hwloc.go: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | **/ 16 | 17 | // Package hwloc is a collection of utility functions to get NUMA membership 18 | // of AMD GPU via the hwloc library 19 | package hwloc 20 | 21 | // #cgo pkg-config: hwloc 22 | // #include 23 | // #include 24 | import "C" 25 | import ( 26 | "fmt" 27 | "unsafe" 28 | ) 29 | 30 | func GetVersions() string { 31 | return fmt.Sprintf("hwloc: _VERSION: %s, _API_VERSION: %#08x, _COMPONENT_ABI: %d, Runtime: %#08x", 32 | C.HWLOC_VERSION, 33 | C.HWLOC_API_VERSION, 34 | C.HWLOC_COMPONENT_ABI, 35 | uint(C.hwloc_get_api_version())) 36 | } 37 | 38 | type Hwloc struct { 39 | topology C.hwloc_topology_t 40 | } 41 | 42 | func (h *Hwloc) Init() error { 43 | rc := C.hwloc_topology_init(&h.topology) 44 | if rc != 0 { 45 | return fmt.Errorf("Problem initializing hwloc topology rc: %d", rc) 46 | } 47 | 48 | rc = C.hwloc_topology_set_type_filter(h.topology, 49 | C.HWLOC_OBJ_PCI_DEVICE, 50 | C.HWLOC_TYPE_FILTER_KEEP_IMPORTANT) 51 | if rc != 0 { 52 | C.hwloc_topology_destroy(h.topology) 53 | return fmt.Errorf("Problem setting type filter rc: %d", rc) 54 | } 55 | 56 | rc = C.hwloc_topology_load(h.topology) 57 | if rc != 0 { 58 | C.hwloc_topology_destroy(h.topology) 59 | return fmt.Errorf("Problem loading topology rc: %d", rc) 60 | } 61 | 62 | return nil 63 | } 64 | 65 | func (h *Hwloc) Destroy() { 66 | C.hwloc_topology_destroy(h.topology) 67 | } 68 | 69 | func (h *Hwloc) GetNUMANodes(busid string) ([]uint64, error) { 70 | var gpu C.hwloc_obj_t 71 | var ancestor C.hwloc_obj_t 72 | 73 | busidstr := C.CString(busid) 74 | defer C.free(unsafe.Pointer(busidstr)) 75 | 76 | gpu = C.hwloc_get_pcidev_by_busidstring(h.topology, busidstr) 77 | if gpu == nil { 78 | return []uint64{}, 79 | fmt.Errorf("Fail to find GPU with bus ID: %s", busid) 80 | } 81 | ancestor = C.hwloc_get_non_io_ancestor_obj(h.topology, gpu) 82 | 83 | if ancestor == nil || ancestor.memory_arity <= 0 { 
84 | return []uint64{}, 85 | fmt.Errorf("No NUMA node found with bus ID: %s", busid) 86 | } 87 | 88 | var results []uint64 89 | nn := ancestor.memory_first_child 90 | 91 | for nn != nil { 92 | results = append(results, uint64(nn.logical_index)) 93 | nn = nn.next_sibling 94 | } 95 | 96 | return results, nil 97 | } 98 | -------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/register.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcu 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "time" 23 | 24 | "k8s.io/klog/v2" 25 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 26 | 27 | "4pd.io/k8s-vgpu/pkg/api" 28 | "4pd.io/k8s-vgpu/pkg/device/hygon" 29 | "4pd.io/k8s-vgpu/pkg/util" 30 | ) 31 | 32 | type DevListFunc func() []*pluginapi.Device 33 | 34 | func (r *Plugin) apiDevices() *[]*api.DeviceInfo { 35 | res := []*api.DeviceInfo{} 36 | for idx, val := range r.totalmem { 37 | if val > 0 { 38 | res = append(res, &api.DeviceInfo{ 39 | Index: idx, 40 | Id: "DCU-" + fmt.Sprint(idx), 41 | Count: 30, 42 | Devmem: int32(val), 43 | Devcore: 100, 44 | Numa: 0, 45 | Type: r.cardtype[idx], 46 | Health: true, 47 | }) 48 | } 49 | } 50 | return &res 51 | } 52 | 53 | func (r *Plugin) RegistrInAnnotation() error { 54 | devices := r.apiDevices() 55 | annos := make(map[string]string) 56 | if len(util.NodeName) == 0 { 57 | util.NodeName = os.Getenv("NodeName") 58 | } 59 | node, err := util.GetNode(util.NodeName) 60 | if err != nil { 61 | klog.Errorln("get node error", err.Error()) 62 | return err 63 | } 64 | encodeddevices := util.EncodeNodeDevices(*devices) 65 | annos[hygon.HandshakeAnnos] = "Reported " + time.Now().String() 66 | annos[hygon.RegisterAnnos] = encodeddevices 67 | klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String()) 68 | err = util.PatchNodeAnnotations(node, annos) 69 | 70 | if err != nil { 71 | klog.Errorln("patch node error", err.Error()) 72 | } 73 | return err 74 | } 75 | 76 | func (r *Plugin) WatchAndRegister() { 77 | klog.Infof("into WatchAndRegister") 78 | for { 79 | r.RefreshContainerDevices() 80 | err := r.RegistrInAnnotation() 81 | if err != nil { 82 | klog.Errorf("register error, %v", err) 83 | time.Sleep(time.Second * 5) 84 | } else { 85 | time.Sleep(time.Second * 30) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/allocator.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "strings" 19 | 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 21 | ) 22 | 23 | type Allocator interface { 24 | Allocate(available []uint, required []uint, size int) ([]uint, error) 25 | } 26 | 27 | func New(policy string, devs map[string]*cndev.Device) Allocator { 28 | model := cndev.GetDeviceModel(uint(0)) 29 | if strings.Contains(model, "MLU290") || model == "MLU370-M8" { 30 | return NewSpiderAllocator(policy, devs) 31 | } 32 | if model == "MLU370-X8" { 33 | return NewBoardAllocator(policy, devs) 34 | } 35 | return NewDefaultAllocator(policy, devs) 36 | } 37 | 38 | func contains(set []uint, dev uint) bool { 39 | for i := range set { 40 | if set[i] == dev { 41 | return true 42 | } 43 | } 44 | return false 45 | } 46 | 47 | func containsAll(set []uint, devs []uint) bool { 48 | for _, dev := range devs { 49 | if !contains(set, dev) { 50 | return false 51 | } 52 | } 53 | return true 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/allocator_suite_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "testing" 19 | 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cntopo/mock" 21 | "github.com/golang/mock/gomock" 22 | . "github.com/onsi/ginkgo" 23 | . "github.com/onsi/gomega" 24 | ) 25 | 26 | var ( 27 | cntopoMock *mock.Cntopo 28 | mockCtrl *gomock.Controller 29 | ) 30 | 31 | func TestAllocator(t *testing.T) { 32 | RegisterFailHandler(Fail) 33 | RunSpecs(t, "Allocator Suite") 34 | } 35 | 36 | var _ = BeforeSuite(func() { 37 | By("Bootstrap test environment") 38 | mockCtrl = gomock.NewController(GinkgoT()) 39 | cntopoMock = mock.NewCntopo(mockCtrl) 40 | }) 41 | 42 | var _ = AfterSuite(func() { 43 | By("Tear down the test environment") 44 | mockCtrl.Finish() 45 | }) 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/default.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "fmt" 19 | "log" 20 | "sort" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cntopo" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | ) 26 | 27 | type defaultAllocator struct { 28 | policy string 29 | cntopo cntopo.Cntopo 30 | devs map[string]*cndev.Device 31 | } 32 | 33 | func NewDefaultAllocator(policy string, devs map[string]*cndev.Device) Allocator { 34 | return &defaultAllocator{ 35 | policy: policy, 36 | cntopo: cntopo.New(), 37 | devs: devs, 38 | } 39 | } 40 | 41 | func (a *defaultAllocator) Allocate(available []uint, required []uint, size int) ([]uint, error) { 42 | 43 | rings, err := a.cntopo.GetRings(available, size) 44 | if err != nil { 45 | return nil, err 46 | } 47 | sort.Slice(rings, func(i int, j int) bool { 48 | return rings[i].NonConflictRingNum > rings[j].NonConflictRingNum 49 | }) 50 | 51 | if len(rings) == 0 { 52 | log.Println("found no rings") 53 | if a.policy != util.BestEffort && !a.sizeAlwaysFailsToFormRing(size) { 54 | return nil, fmt.Errorf("mode %s found no rings", a.policy) 55 | } 56 | return available[0:size], nil 57 | } 58 | 59 | return rings[0].Ordinals, nil 60 | } 61 | 62 | func (a *defaultAllocator) sizeAlwaysFailsToFormRing(size int) bool { 63 | return size%2 == 1 64 | } 65 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/cache.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 
2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package mlu 18 | 19 | import ( 20 | "context" 21 | "sync" 22 | 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 26 | ) 27 | 28 | type MLUDevice struct { 29 | dev pluginapi.Device 30 | handle *cndev.Device 31 | } 32 | 33 | type DeviceCache struct { 34 | cache []*MLUDevice 35 | stopCh chan interface{} 36 | unhealthy chan *pluginapi.Device 37 | notifyCh map[string]chan *pluginapi.Device 38 | mutex sync.Mutex 39 | } 40 | 41 | func NewDeviceCache() *DeviceCache { 42 | return &DeviceCache{ 43 | stopCh: make(chan interface{}), 44 | unhealthy: make(chan *pluginapi.Device), 45 | notifyCh: make(map[string]chan *pluginapi.Device), 46 | } 47 | } 48 | 49 | func (d *DeviceCache) AddNotifyChannel(name string, ch chan *pluginapi.Device) { 50 | d.mutex.Lock() 51 | defer d.mutex.Unlock() 52 | d.notifyCh[name] = ch 53 | } 54 | 55 | func (d *DeviceCache) RemoveNotifyChannel(name string) { 56 | d.mutex.Lock() 57 | defer d.mutex.Unlock() 58 | delete(d.notifyCh, name) 59 | } 60 | 61 | func (d *DeviceCache) Start() { 62 | d.cache = d.Devices() 63 | go d.CheckHealth(d.stopCh, d.cache, d.unhealthy) 64 | go d.notify() 65 | } 66 | 67 | func (d *DeviceCache) Stop() { 68 | close(d.stopCh) 69 | } 70 | 71 | func (d *DeviceCache) GetCache() []*MLUDevice { 72 | return d.cache 73 | } 74 | 
// notify forwards each device received on the unhealthy channel to every
// registered listener, after marking it Unhealthy, until the cache stops.
func (d *DeviceCache) notify() {
	for {
		select {
		case <-d.stopCh:
			return
		case dev := <-d.unhealthy:
			// Mark before broadcasting so all listeners observe the
			// updated health state.
			dev.Health = pluginapi.Unhealthy
			d.mutex.Lock()
			for _, ch := range d.notifyCh {
				ch <- dev
			}
			d.mutex.Unlock()
		}
	}
}

// Devices returns a list of devices from the GpuDeviceManager
func (d *DeviceCache) Devices() []*MLUDevice {
	n, err := cndev.GetDeviceCount()
	check(err)
	// Cap the number of reported devices at the configured limit.
	if n > util.DeviceLimit {
		n = util.DeviceLimit
	}

	var devs []*MLUDevice
	for i := uint(0); i < n; i++ {
		d, err := cndev.NewDeviceLite(i, false)
		check(err)

		// Health is left at its zero value here; health state is managed
		// by the CheckHealth/notify pipeline.
		devs = append(devs, &MLUDevice{
			dev:    pluginapi.Device{ID: d.UUID},
			handle: d,
		})
	}

	return devs
}

// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
func (d *DeviceCache) CheckHealth(stop <-chan interface{}, devices []*MLUDevice, unhealthy chan<- *pluginapi.Device) {
	// mlu.checkHealth...
	// NOTE(review): the stop channel parameter is not forwarded to
	// WatchUnhealthy (a background context is used instead) — presumably
	// cancellation is handled inside WatchUnhealthy; confirm.
	WatchUnhealthy(context.Background(), devices, unhealthy)
}
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cndev 16 | 17 | import ( 18 | "fmt" 19 | "log" 20 | "os" 21 | "sort" 22 | "testing" 23 | 24 | "github.com/stretchr/testify/assert" 25 | ) 26 | 27 | func TestMain(m *testing.M) { 28 | err := Init() 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | ret := m.Run() 33 | if ret != 0 { 34 | os.Exit(ret) 35 | } 36 | err = Release() 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | } 41 | 42 | func TestGetDeviceCount(t *testing.T) { 43 | count, err := GetDeviceCount() 44 | assert.NoError(t, err) 45 | assert.Equal(t, uint(8), count) 46 | } 47 | 48 | func TestGetDeviceModel(t *testing.T) { 49 | model := GetDeviceModel(uint(0)) 50 | assert.Equal(t, "MLU290", model) 51 | } 52 | 53 | func TestGetDeviceMemory(t *testing.T) { 54 | memory, err := GetDeviceMemory(uint(0)) 55 | assert.NoError(t, err) 56 | assert.Equal(t, uint(16*1024), memory) 57 | } 58 | 59 | func TestGetDeviceInfo(t *testing.T) { 60 | uuid, _, mb, path, err := getDeviceInfo(uint(1)) 61 | assert.NoError(t, err) 62 | assert.Equal(t, "/dev/cambricon_dev1", path) 63 | assert.Equal(t, "MLU-20001012-1916-0000-0000-000000000000", uuid) 64 | assert.Equal(t, fmt.Sprintf("%x", 1111111), mb) 65 | } 66 | 67 | func TestGetDeviceHealthState(t *testing.T) { 68 | health, err := getDeviceHealthState(uint(0), 1) 69 | assert.NoError(t, err) 70 | assert.Equal(t, 1, health) 71 | } 72 | 73 | func TestGetDevicePCIeInfo(t *testing.T) { 74 | pcie, err := getDevicePCIeInfo(uint(0)) 75 | assert.NoError(t, err) 76 | assert.Equal(t, 0, pcie.domain) 77 | assert.Equal(t, 12, pcie.bus) 78 | assert.Equal(t, 13, pcie.device) 79 | assert.Equal(t, 1, pcie.function) 80 | } 81 | 82 | func TestGetDeviceMLULinkDevs(t *testing.T) { 83 | devs, err := getDeviceMLULinkDevs(uint(0)) 84 | assert.NoError(t, err) 85 | assert.Equal(t, map[string]int{ 86 | "MLU-20001012-1916-0000-0000-000000000000": 1, 87 | 
// TestGetMLULinkGroups expects all 8 mock devices to fall into one
// MLULink group.
func TestGetMLULinkGroups(t *testing.T) {
	groups, err := GetMLULinkGroups()
	assert.NoError(t, err)
	// Sort each group so the comparison is deterministic regardless of
	// discovery order.
	for i := range groups {
		sort.Slice(groups[i], func(x, y int) bool {
			return groups[i][x] < groups[i][y]
		})
	}
	assert.Equal(t, [][]uint{{0, 1, 2, 3, 4, 5, 6, 7}}, groups)
}
// cndevInit dlopens libcndev.so into the process (keeping the handle for a
// later dlclose) and then initializes the library.
// Returns CNDEV_ERROR_UNINITIALIZED when the shared object cannot be loaded.
func (dl *dlhandles) cndevInit() C.cndevRet_t {
	// NOTE(review): the C string allocated by C.CString is never freed;
	// this runs once per process so the leak is bounded — confirm intent.
	handle := C.dlopen(C.CString("libcndev.so"), C.RTLD_LAZY|C.RTLD_GLOBAL)
	if handle == C.NULL {
		return C.CNDEV_ERROR_UNINITIALIZED
	}
	dl.handles = append(dl.handles, handle)
	return C.cndevInit(C.int(0))
}

// cndevRelease shuts the library down and dlcloses every handle opened by
// cndevInit. Any dlclose failure is reported as CNDEV_ERROR_UNKNOWN.
func (dl *dlhandles) cndevRelease() C.cndevRet_t {
	ret := C.cndevRelease()
	if ret != C.CNDEV_SUCCESS {
		return ret
	}

	// NOTE(review): dl.handles is not cleared after closing, so a second
	// Release would dlclose stale handles — confirm callers only release once.
	for _, handle := range dl.handles {
		err := C.dlclose(handle)
		if err != 0 {
			return C.CNDEV_ERROR_UNKNOWN
		}
	}
	return C.CNDEV_SUCCESS
}
14 | 15 | package cndev 16 | 17 | import ( 18 | "os" 19 | "testing" 20 | 21 | "github.com/stretchr/testify/assert" 22 | ) 23 | 24 | func TestGetPCIeID(t *testing.T) { 25 | d := &Device{ 26 | pcie: &pcie{ 27 | domain: 0, 28 | bus: 3, 29 | device: 15, 30 | function: 1, 31 | }, 32 | } 33 | id, err := d.GetPCIeID() 34 | assert.NoError(t, err) 35 | assert.Equal(t, "0000:03:0f.1", id) 36 | } 37 | 38 | func TestGetNumFromFile(t *testing.T) { 39 | path := "/tmp/device_plugin_cndev_ut" 40 | f, err := os.Create(path) 41 | assert.NoError(t, err) 42 | 43 | data := []byte("4\n") 44 | _, err = f.Write(data) 45 | assert.NoError(t, err) 46 | num, err := getNumFromFile(path) 47 | assert.NoError(t, err) 48 | assert.Equal(t, 4, num) 49 | 50 | err = f.Close() 51 | assert.NoError(t, err) 52 | err = os.Remove(path) 53 | assert.NoError(t, err) 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/cntopo/cntopo.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
package cntopo

import (
	"encoding/json"
	"os"
	"os/exec"
	"sync"
)

// cntopo shells out to the `cntopo` CLI. The embedded mutex serializes
// invocations within this process because fixed /tmp file names are shared
// between calls (it does not protect against other processes).
type cntopo struct {
	sync.Mutex
}

// Input is the request document written for `cntopo find`: a host name
// mapped to the requested device count and the candidate device whitelist.
type Input map[string][]struct {
	Size      int    `json:"num_devices"`
	WhiteList []uint `json:"white_dev_list"`
}

// Output mirrors the parts of cntopo's JSON result that we consume.
type Output []struct {
	Info struct {
		Ordinals []uint `json:"ordinal_list"`
	} `json:"info_by_host"`
	// The traffic is duplex, so this value is twice the number of rings,
	// except for the cases of less equal to 2 cards, that is,
	// "A>B>A" conflicts with "B>A>B", while "A>B>C>A" does not conflict with "A>C>B>A"
	NonConflictRings struct {
		Num int `json:"nonconflict_rings_num"`
	} `json:"nonconflict_rings"`
}

// Ring is a flattened view of one candidate ring: the device ordinals it
// uses and its non-conflicting ring count.
type Ring struct {
	Ordinals           []uint
	NonConflictRingNum int
}

// Cntopo abstracts ring discovery so callers and tests can mock it.
type Cntopo interface {
	GetRings(available []uint, size int) ([]Ring, error)
}

// New returns the CLI-backed implementation of Cntopo.
func New() Cntopo {
	return &cntopo{}
}

// GetRings asks the cntopo CLI for ring topologies of `size` devices drawn
// from `available`, by writing a request file, running `cntopo find`, and
// parsing the result file.
func (c *cntopo) GetRings(available []uint, size int) ([]Ring, error) {
	i := Input{
		"host_list": {
			{
				Size:      size,
				WhiteList: available,
			},
		},
	}
	b, err := json.Marshal(i)
	if err != nil {
		return nil, err
	}
	// Hold the lock across write -> exec -> read: the temp file paths are
	// fixed, so concurrent calls in this process would clobber each other.
	c.Lock()
	defer c.Unlock()
	err = os.WriteFile("/tmp/cntopo_input.json", b, 0666)
	if err != nil {
		return nil, err
	}
	err = exec.Command("sh", "-c", "cntopo find -I /tmp/cntopo_input.json -O /tmp/cntopo_output.json -R 1000000 -C").Run()
	if err != nil {
		return nil, err
	}
	j, err := os.ReadFile("/tmp/cntopo_output.json")
	if err != nil {
		return nil, err
	}
	var output Output
	err = json.Unmarshal(j, &output)
	if err != nil {
		return nil, err
	}
	// Flatten the CLI output into the package's Ring type.
	rings := []Ring{}
	for _, o := range output {
		rings = append(rings, Ring{
			NonConflictRingNum: o.NonConflictRings.Num,
			Ordinals:           o.Info.Ordinals,
		})
	}
	return rings, nil
}
47 | func (mr *CntopoMockRecorder) GetRings(arg0, arg1 interface{}) *gomock.Call { 48 | mr.mock.ctrl.T.Helper() 49 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRings", reflect.TypeOf((*Cntopo)(nil).GetRings), arg0, arg1) 50 | } 51 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/const.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
package mlu

import pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"

const (
	// serverSock is the unix socket this plugin serves the kubelet
	// device-plugin API on.
	serverSock = pluginapi.DevicePluginPath + "cambricon.sock"
	// mluLinkPolicyUnsatisfied is the reason string used when the MLULink
	// topology policy cannot be satisfied.
	mluLinkPolicyUnsatisfied = "mluLinkPolicyUnsatisfied"
	// retries is a retry count used elsewhere in the package.
	retries = 5

	// MLULink topology policy names (see Options.MLULinkPolicy).
	BestEffort string = "best-effort"
	Restricted string = "restricted"
	Guaranteed string = "guaranteed"

	// Device plugin operating modes (see Options.Mode).
	sriov         string = "sriov"
	envShare      string = "env-share"
	topologyAware string = "topology-aware"
	mluShare      string = "mlu-share"

	// Host device-node paths exposed to containers.
	mluMonitorDeviceName     = "/dev/cambricon_ctl"
	mluDeviceName            = "/dev/cambricon_dev"
	mluMsgqDeviceName        = "/dev/cambr-msgq"
	mluRPCDeviceName         = "/dev/cambr-rpc"
	mluCmsgDeviceName        = "/dev/cmsg_ctrl"
	mluIpcmDeviceName        = "/dev/cambricon_ipcm"
	mluCommuDeviceName       = "/dev/commu"
	mluUARTConsoleDeviceName = "/dev/ttyMS"
	mluRPMsgDir              = "/dev/cambricon/"
	mluSplitDeviceName       = "/dev/cambricon-split"

	// Resource names and container environment variables for mlu-share mode.
	mluMemResourceName       = "cambricon.com/mlumem"
	mluResourceCount         = "cambricon.com/mlunum"
	mluMemResourceAssumeTime = "CAMBRICON_MEM_ASSUME_TIME"
	// NOTE(review): "ASSIGHED" looks like a typo for "ASSIGNED", but the value
	// is an env-var name shared with other components at runtime, so it must
	// not be changed here in isolation.
	mluMemResourceAssigned = "CAMBRICON_MEM_ASSIGHED"
	mluMemSplitLimit       = "CAMBRICON_SPLIT_MEMS"
	mluMemSplitIndex       = "CAMBRICON_SPLIT_VISIBLE_DEVICES"
	mluMemSplitEnable      = "CAMBRICON_SPLIT_ENABLE"
	mluMemLock             = "cambricon.com/mlu-mem.lock"
	mluMemBinaryPath       = "/usr/bin/smlu-containerd"
)
package mlu

import (
	"log"
	"os"
	"strings"

	"4pd.io/k8s-vgpu/pkg/util"
	flags "github.com/jessevdk/go-flags"
)

// Options holds the command-line configuration of the MLU device plugin,
// parsed by go-flags from the struct tags below.
type Options struct {
	Mode               string `long:"mode" description:"device plugin mode" default:"default" choice:"default" choice:"sriov" choice:"env-share" choice:"topology-aware" choice:"mlu-share"`
	MLULinkPolicy      string `long:"mlulink-policy" description:"MLULink topology policy" default:"best-effort" choice:"best-effort" choice:"restricted" choice:"guaranteed"`
	VirtualizationNum  uint   `long:"virtualization-num" description:"the virtualization number for each MLU, used only in sriov mode or env-share mode" default:"1" env:"VIRTUALIZATION_NUM"`
	DisableHealthCheck bool   `long:"disable-health-check" description:"disable MLU health check"`
	NodeName           string `long:"node-name" description:"host node name" env:"NODE_NAME"`
	EnableConsole      bool   `long:"enable-console" description:"enable UART console device(/dev/ttyMS) in container"`
	EnableDeviceType   bool   `long:"enable-device-type" description:"enable device registration with type info"`
	CnmonPath          string `long:"cnmon-path" description:"host cnmon path"`
	SocketPath         string `long:"socket-path" description:"socket path for communication between deviceplugin and container runtime"`
}

// ParseFlags parses os.Args into Options, publishes the globally consumed
// settings to the util package, and exits the process on parse errors.
func ParseFlags() Options {
	// Compatibility shim: rewrite a single-dash "-mode..." argument into the
	// double-dash form expected by go-flags. Only the first match is rewritten.
	for index, arg := range os.Args {
		if strings.HasPrefix(arg, "-mode") {
			os.Args[index] = strings.Replace(arg, "-mode", "--mode", 1)
			break
		}
	}
	// Environment toggle to disable all health checking.
	if os.Getenv("DP_DISABLE_HEALTHCHECKS") == "all" {
		os.Args = append(os.Args, "--disable-health-check")
	}
	options := Options{}
	parser := flags.NewParser(&options, flags.Default)
	if _, err := parser.Parse(); err != nil {
		// go-flags already printed the message: exit 0 for --help,
		// non-zero for real parse errors.
		code := 1
		if fe, ok := err.(*flags.Error); ok {
			if fe.Type == flags.ErrHelp {
				code = 0
			}
		}
		os.Exit(code)
	}
	// Publish settings read by other packages.
	util.DeviceSplitCount = &options.VirtualizationNum
	util.RuntimeSocketFlag = options.SocketPath
	log.Printf("Parsed options: %v\n", options)
	return options
}
15 | */ 16 | 17 | package mlu 18 | 19 | import ( 20 | "fmt" 21 | "time" 22 | 23 | "k8s.io/klog/v2" 24 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 25 | 26 | "4pd.io/k8s-vgpu/pkg/api" 27 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 28 | "4pd.io/k8s-vgpu/pkg/device/cambricon" 29 | "4pd.io/k8s-vgpu/pkg/util" 30 | ) 31 | 32 | type DevListFunc func() []*pluginapi.Device 33 | 34 | type DeviceRegister struct { 35 | deviceCache *DeviceCache 36 | unhealthy chan *pluginapi.Device 37 | stopCh chan struct{} 38 | } 39 | 40 | func NewDeviceRegister(deviceCache *DeviceCache) *DeviceRegister { 41 | return &DeviceRegister{ 42 | deviceCache: deviceCache, 43 | unhealthy: make(chan *pluginapi.Device), 44 | stopCh: make(chan struct{}), 45 | } 46 | } 47 | 48 | func (r *DeviceRegister) Start(opt Options) { 49 | r.deviceCache.AddNotifyChannel("register", r.unhealthy) 50 | go r.WatchAndRegister(opt) 51 | } 52 | 53 | func (r *DeviceRegister) Stop() { 54 | close(r.stopCh) 55 | } 56 | 57 | func (r *DeviceRegister) apiDevices() *[]*api.DeviceInfo { 58 | devs := r.deviceCache.GetCache() 59 | res := make([]*api.DeviceInfo, 0, len(devs)) 60 | for i, dev := range devs { 61 | //klog.V(3).Infoln("ndev type=", ndev.Model) 62 | memory, _ := cndev.GetDeviceMemory(uint(i)) 63 | fmt.Println("mlu registered device id=", dev.dev.ID, "memory=", memory, "type=", cndev.GetDeviceModel(uint(i))) 64 | registeredmem := int32(memory) 65 | res = append(res, &api.DeviceInfo{ 66 | Id: dev.dev.ID, 67 | Count: int32(*util.DeviceSplitCount), 68 | Devmem: registeredmem, 69 | Devcore: 0, 70 | Numa: 0, 71 | Type: fmt.Sprintf("%v-%v", "MLU", cndev.GetDeviceModel(uint(i))), 72 | Health: dev.dev.Health == "healthy", 73 | }) 74 | } 75 | return &res 76 | } 77 | 78 | func (r *DeviceRegister) RegistrInAnnotation() error { 79 | devices := r.apiDevices() 80 | annos := make(map[string]string) 81 | node, err := util.GetNode(util.NodeName) 82 | if err != nil { 83 | klog.Errorln("get node error", err.Error()) 84 | 
return err 85 | } 86 | encodeddevices := util.EncodeNodeDevices(*devices) 87 | annos[cambricon.HandshakeAnnos] = "Reported " + time.Now().String() 88 | annos[cambricon.RegisterAnnos] = encodeddevices 89 | klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String()) 90 | err = util.PatchNodeAnnotations(node, annos) 91 | 92 | if err != nil { 93 | klog.Errorln("patch node error", err.Error()) 94 | } 95 | return err 96 | } 97 | 98 | func (r *DeviceRegister) WatchAndRegister(opt Options) { 99 | klog.Infof("into WatchAndRegister") 100 | for { 101 | err := r.RegistrInAnnotation() 102 | if err != nil { 103 | klog.Errorf("register error, %v", err) 104 | time.Sleep(time.Second * 5) 105 | } else { 106 | time.Sleep(time.Second * 30) 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package cdi 18 | 19 | // Interface provides the API to the 'cdi' package 20 | // 21 | //go:generate moq -stub -out api_mock.go . 
type Interface interface {
	CreateSpecFile() error
	QualifiedName(string, string) string
}

// New is a factory method that creates a CDI handler for creating CDI specs.
// When NVML is unavailable it degrades to a no-op null handler instead of
// failing.
func New(opts ...Option) (Interface, error) {
	infolib := info.New()

	hasNVML, _ := infolib.HasNvml()
	if !hasNVML {
		klog.Warning("No valid resources detected, creating a null CDI handler")
		return NewNullHandler(), nil
	}

	return newHandler(opts...)
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cdi 18 | 19 | import ( 20 | "k8s.io/klog/v2" 21 | ) 22 | 23 | type null struct{} 24 | 25 | var _ Interface = &null{} 26 | 27 | // NewNullHandler returns an instance of the 'cdi' interface that can 28 | // be used when CDI specs are not required. 29 | func NewNullHandler() Interface { 30 | return &null{} 31 | } 32 | 33 | // CreateSpecFile is a no-op for the null handler. 34 | func (n *null) CreateSpecFile() error { 35 | return nil 36 | } 37 | 38 | // QualifiedName is a no-op for the null handler. A error message is logged 39 | // inidicating this should never be called for the null handler. 40 | func (n *null) QualifiedName(class string, id string) string { 41 | klog.Error("cannot return a qualified CDI device name with the null CDI handler") 42 | return "" 43 | } 44 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
package cdi

import (
	"github.com/NVIDIA/go-nvlib/pkg/nvml"
)

// Option defines a function for passing options to the New() call
type Option func(*cdiHandler)

// WithEnabled provides an Option to set the enabled flag used by the 'cdi' interface
func WithEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.enabled = enabled
	}
}

// WithDriverRoot provides an Option to set the driver root used by the 'cdi' interface
func WithDriverRoot(root string) Option {
	return func(c *cdiHandler) {
		c.driverRoot = root
	}
}

// WithTargetDriverRoot provides an Option to set the target driver root used by the 'cdi' interface
func WithTargetDriverRoot(root string) Option {
	return func(c *cdiHandler) {
		c.targetDriverRoot = root
	}
}

// WithNvidiaCTKPath provides an Option to set the nvidia-ctk path used by the 'cdi' interface
func WithNvidiaCTKPath(path string) Option {
	return func(c *cdiHandler) {
		c.nvidiaCTKPath = path
	}
}

// WithNvml provides an Option to set the NVML library used by the 'cdi' interface
func WithNvml(nvml nvml.Interface) Option {
	return func(c *cdiHandler) {
		c.nvml = nvml
	}
}

// WithDeviceIDStrategy provides an Option to set the device ID strategy used by the 'cdi' interface
func WithDeviceIDStrategy(strategy string) Option {
	return func(c *cdiHandler) {
		c.deviceIDStrategy = strategy
	}
}

// WithVendor provides an Option to set the vendor used by the 'cdi' interface
func WithVendor(vendor string) Option {
	return func(c *cdiHandler) {
		c.vendor = vendor
	}
}

// WithGdsEnabled provides an Option to set whether a GDS CDI spec should be generated
func WithGdsEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.gdsEnabled = enabled
	}
}

// WithMofedEnabled provides an Option to set whether a MOFED CDI spec should be generated
func WithMofedEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.mofedEnabled = enabled
	}
}
22 | var version = "unknown" 23 | 24 | // gitCommit will be the hash that the binary was built from 25 | // and will be populated by the Makefile 26 | var gitCommit = "" 27 | 28 | // GetVersionParts returns the different version components 29 | func GetVersionParts() []string { 30 | v := []string{version} 31 | 32 | if gitCommit != "" { 33 | v = append(v, "commit: "+gitCommit) 34 | } 35 | 36 | return v 37 | } 38 | 39 | // GetVersionString returns the string representation of the version 40 | func GetVersionString(more ...string) string { 41 | v := append(GetVersionParts(), more...) 42 | return strings.Join(v, "\n") 43 | } 44 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 - 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | package mig 4 | 5 | import ( 6 | "bufio" 7 | "fmt" 8 | "os" 9 | 10 | "k8s.io/klog/v2" 11 | ) 12 | 13 | const ( 14 | nvidiaProcDriverPath = "/proc/driver/nvidia" 15 | nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities" 16 | 17 | nvcapsProcDriverPath = "/proc/driver/nvidia-caps" 18 | nvcapsMigMinorsPath = nvcapsProcDriverPath + "/mig-minors" 19 | nvcapsDevicePath = "/dev/nvidia-caps" 20 | ) 21 | 22 | // GetMigCapabilityDevicePaths returns a mapping of MIG capability path to device node path 23 | func GetMigCapabilityDevicePaths() (map[string]string, error) { 24 | // Open nvcapsMigMinorsPath for walking. 25 | // If the nvcapsMigMinorsPath does not exist, then we are not on a MIG 26 | // capable machine, so there is nothing to do. 
27 | // The format of this file is discussed in: 28 | // https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#unique_1576522674 29 | minorsFile, err := os.Open(nvcapsMigMinorsPath) 30 | if os.IsNotExist(err) { 31 | return nil, nil 32 | } 33 | if err != nil { 34 | return nil, fmt.Errorf("error opening MIG minors file: %v", err) 35 | } 36 | defer minorsFile.Close() 37 | 38 | // Define a function to process each each line of nvcapsMigMinorsPath 39 | processLine := func(line string) (string, int, error) { 40 | var gpu, gi, ci, migMinor int 41 | 42 | // Look for a CI access file 43 | n, _ := fmt.Sscanf(line, "gpu%d/gi%d/ci%d/access %d", &gpu, &gi, &ci, &migMinor) 44 | if n == 4 { 45 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/ci%d/access", gpu, gi, ci) 46 | return capPath, migMinor, nil 47 | } 48 | 49 | // Look for a GI access file 50 | n, _ = fmt.Sscanf(line, "gpu%d/gi%d/access %d", &gpu, &gi, &migMinor) 51 | if n == 3 { 52 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/access", gpu, gi) 53 | return capPath, migMinor, nil 54 | } 55 | 56 | // Look for the MIG config file 57 | n, _ = fmt.Sscanf(line, "config %d", &migMinor) 58 | if n == 1 { 59 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/config") 60 | return capPath, migMinor, nil 61 | } 62 | 63 | // Look for the MIG monitor file 64 | n, _ = fmt.Sscanf(line, "monitor %d", &migMinor) 65 | if n == 1 { 66 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/monitor") 67 | return capPath, migMinor, nil 68 | } 69 | 70 | return "", 0, fmt.Errorf("unparsable line: %v", line) 71 | } 72 | 73 | // Walk each line of nvcapsMigMinorsPath and construct a mapping of nvidia 74 | // capabilities path to device minor for that capability 75 | capsDevicePaths := make(map[string]string) 76 | scanner := bufio.NewScanner(minorsFile) 77 | for scanner.Scan() { 78 | capPath, migMinor, err := processLine(scanner.Text()) 79 | if err != nil { 80 | klog.Errorf("Skipping line in MIG 
minors file: %v", err) 81 | continue 82 | } 83 | capsDevicePaths[capPath] = fmt.Sprintf(nvcapsDevicePath+"/nvidia-cap%d", migMinor) 84 | } 85 | return capsDevicePaths, nil 86 | } 87 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package plugin 18 | 19 | import "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 20 | 21 | // Interface defines the API for the plugin package 22 | type Interface interface { 23 | Devices() rm.Devices 24 | Start() error 25 | Stop() error 26 | } 27 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package manager 18 | 19 | import "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 20 | 21 | // Interface defines the API for the plugin manager package 22 | type Interface interface { 23 | GetPlugins() ([]plugin.Interface, error) 24 | CreateCDISpecFile() error 25 | } 26 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 21 | ) 22 | 23 | type null struct{} 24 | 25 | // GetPlugins returns an empty set of Plugins for the null manager 26 | func (m *null) GetPlugins() ([]plugin.Interface, error) { 27 | return nil, nil 28 | } 29 | 30 | // CreateCDISpecFile creates the spec is a no-op for the null plugin 31 | func (m *null) CreateCDISpecFile() error { 32 | return nil 33 | } 34 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 24 | ) 25 | 26 | type nvmlmanager manager 27 | 28 | // GetPlugins returns the plugins associated with the NVML resources available on the node 29 | func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { 30 | rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) 31 | if err != nil { 32 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 33 | } 34 | 35 | var plugins []plugin.Interface 36 | for _, r := range rms { 37 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) 38 | } 39 | return plugins, nil 40 | } 41 | 42 | // CreateCDISpecFile creates forwards the request to the CDI handler 43 | func (m *nvmlmanager) CreateCDISpecFile() error { 44 | return m.cdiHandler.CreateSpecFile() 45 | } 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 21 | "4pd.io/k8s-vgpu/pkg/util" 22 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 23 | ) 24 | 25 | // Option is a function that configures a manager 26 | type Option func(*manager) 27 | 28 | // WithCDIEnabled sets whether CDI is enabled for the manager 29 | func WithCDIEnabled(enabled bool) Option { 30 | return func(m *manager) { 31 | m.cdiEnabled = enabled 32 | } 33 | } 34 | 35 | // WithCDIHandler sets the CDI handler for the manager 36 | func WithCDIHandler(handler cdi.Interface) Option { 37 | return func(m *manager) { 38 | m.cdiHandler = handler 39 | } 40 | } 41 | 42 | // WithNVML sets the NVML handler for the manager 43 | func WithNVML(nvmllib nvml.Interface) Option { 44 | return func(m *manager) { 45 | m.nvmllib = nvmllib 46 | } 47 | } 48 | 49 | // WithFailOnInitError sets whether the manager should fail on initialization errors 50 | func WithFailOnInitError(failOnInitError bool) Option { 51 | return func(m *manager) { 52 | m.failOnInitError = failOnInitError 53 | } 54 | } 55 | 56 | // WithMigStrategy sets the MIG strategy for the manager 57 | func WithMigStrategy(migStrategy string) Option { 58 | return func(m *manager) { 59 | m.migStrategy = migStrategy 60 | } 61 | } 62 | 63 | // WithConfig sets the config reference for the manager 64 | func WithConfig(config *util.DeviceConfig) Option { 65 | return func(m *manager) { 66 | m.config = config 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 24 | ) 25 | 26 | type tegramanager manager 27 | 28 | // GetPlugins returns the plugins associated with the NVML resources available on the node 29 | func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) { 30 | rms, err := rm.NewTegraResourceManagers(m.config) 31 | if err != nil { 32 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 33 | } 34 | 35 | var plugins []plugin.Interface 36 | for _, r := range rms { 37 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) 38 | } 39 | return plugins, nil 40 | } 41 | 42 | // CreateCDISpecFile creates the spec is a no-op for the tegra plugin 43 | func (m *tegramanager) CreateCDISpecFile() error { 44 | return nil 45 | } 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package plugin 18 | 19 | import "testing" 20 | 21 | func Test_parseNvidiaNumaInfo(t *testing.T) { 22 | 23 | tests := []struct { 24 | name string 25 | idx int 26 | nvidiaTopoStr string 27 | want int 28 | wantErr bool 29 | }{ 30 | { 31 | name: "single Tesla P4 NUMA", 32 | idx: 0, 33 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... 34 | ...`, 35 | want: 0, 36 | wantErr: false, 37 | }, 38 | { 39 | name: "two Tesla P4 NUMA topo with index 0", 40 | idx: 0, 41 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 42 | ...`, 43 | want: 0, 44 | wantErr: false, 45 | }, 46 | { 47 | name: "two Tesla P4 NUMA topo with index 1", 48 | idx: 1, 49 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 50 | ...`, 51 | want: 0, 52 | wantErr: false, 53 | }, 54 | } 55 | 56 | for _, tt := range tests { 57 | t.Run(tt.name, func(t *testing.T) { 58 | got, err := parseNvidiaNumaInfo(tt.idx, tt.nvidiaTopoStr) 59 | if (err != nil) != tt.wantErr { 60 | t.Errorf("parseNvidiaNumaInfo() error = %v, wantErr %v", err, tt.wantErr) 61 | return 62 | } 63 | if got != tt.want { 64 | t.Errorf("parseNvidiaNumaInfo() got = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "testing" 21 | 22 | spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" 23 | "github.com/stretchr/testify/require" 24 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 25 | ) 26 | 27 | func TestDeviceMapInsert(t *testing.T) { 28 | device0 := Device{Device: pluginapi.Device{ID: "0"}} 29 | device0withIndex := Device{Device: pluginapi.Device{ID: "0"}, Index: "index"} 30 | device1 := Device{Device: pluginapi.Device{ID: "1"}} 31 | 32 | testCases := []struct { 33 | description string 34 | deviceMap DeviceMap 35 | key string 36 | value *Device 37 | expectedDeviceMap DeviceMap 38 | }{ 39 | { 40 | description: "insert into empty map", 41 | deviceMap: make(DeviceMap), 42 | key: "resource", 43 | value: &device0, 44 | expectedDeviceMap: DeviceMap{ 45 | "resource": Devices{ 46 | "0": &device0, 47 | }, 48 | }, 49 | }, 50 | { 51 | description: "add to existing resource", 52 | deviceMap: DeviceMap{ 53 | "resource": Devices{ 54 | "0": &device0, 55 | }, 56 | }, 57 | key: "resource", 58 | value: &device1, 59 | expectedDeviceMap: DeviceMap{ 60 | "resource": Devices{ 61 | "0": &device0, 62 | "1": &device1, 63 | }, 64 | }, 65 | }, 66 | { 67 | description: "add new resource", 68 | deviceMap: DeviceMap{ 69 | "resource": Devices{ 70 | "0": &device0, 71 | }, 72 | }, 73 | key: "resource1", 74 | value: &device0, 75 | expectedDeviceMap: 
DeviceMap{ 76 | "resource": Devices{ 77 | "0": &device0, 78 | }, 79 | "resource1": Devices{ 80 | "0": &device0, 81 | }, 82 | }, 83 | }, 84 | { 85 | description: "overwrite existing device", 86 | deviceMap: DeviceMap{ 87 | "resource": Devices{ 88 | "0": &device0, 89 | }, 90 | }, 91 | key: "resource", 92 | value: &device0withIndex, 93 | expectedDeviceMap: DeviceMap{ 94 | "resource": Devices{ 95 | "0": &device0withIndex, 96 | }, 97 | }, 98 | }, 99 | } 100 | 101 | for _, tc := range testCases { 102 | t.Run(tc.description, func(t *testing.T) { 103 | tc.deviceMap.insert(spec.ResourceName(tc.key), tc.value) 104 | 105 | require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) 106 | }) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/require" 23 | ) 24 | 25 | func TestGetAdditionalXids(t *testing.T) { 26 | testCases := []struct { 27 | description string 28 | input string 29 | expected []uint64 30 | }{ 31 | { 32 | description: "Empty input", 33 | }, 34 | { 35 | description: "Only comma", 36 | input: ",", 37 | }, 38 | { 39 | description: "Non-integer input", 40 | input: "not-an-int", 41 | }, 42 | { 43 | description: "Single integer", 44 | input: "68", 45 | expected: []uint64{68}, 46 | }, 47 | { 48 | description: "Negative integer", 49 | input: "-68", 50 | }, 51 | { 52 | description: "Single integer with trailing spaces", 53 | input: "68 ", 54 | expected: []uint64{68}, 55 | }, 56 | { 57 | description: "Single integer followed by comma without trailing number", 58 | input: "68,", 59 | expected: []uint64{68}, 60 | }, 61 | { 62 | description: "Comma without preceding number followed by single integer", 63 | input: ",68", 64 | expected: []uint64{68}, 65 | }, 66 | { 67 | description: "Two comma-separated integers", 68 | input: "68,67", 69 | expected: []uint64{68, 67}, 70 | }, 71 | { 72 | description: "Two integers separated by non-integer", 73 | input: "68,not-an-int,67", 74 | expected: []uint64{68, 67}, 75 | }, 76 | } 77 | 78 | for _, tc := range testCases { 79 | t.Run(tc.description, func(t *testing.T) { 80 | xids := getAdditionalXids(tc.input) 81 | require.EqualValues(t, tc.expected, xids) 82 | }) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package rm 18 | 19 | // int8Slice wraps an []int8 with more functions. 20 | type int8Slice []int8 21 | 22 | // String turns a nil terminated int8Slice into a string 23 | func (s int8Slice) String() string { 24 | var b []byte 25 | for _, c := range s { 26 | if c == 0 { 27 | break 28 | } 29 | b = append(b, byte(c)) 30 | } 31 | return string(b) 32 | } 33 | 34 | // uintPtr returns a *uint from a uint32 35 | func uintPtr(c uint32) *uint { 36 | i := uint(c) 37 | return &i 38 | } 39 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 24 | "k8s.io/klog/v2" 25 | ) 26 | 27 | type nvmlResourceManager struct { 28 | resourceManager 29 | nvml nvml.Interface 30 | } 31 | 32 | var _ ResourceManager = (*nvmlResourceManager)(nil) 33 | 34 | // NewNVMLResourceManagers returns a set of ResourceManagers, one for each NVML resource in 'config'. 35 | func NewNVMLResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig) ([]ResourceManager, error) { 36 | ret := nvmllib.Init() 37 | if ret != nvml.SUCCESS { 38 | return nil, fmt.Errorf("failed to initialize NVML: %v", ret) 39 | } 40 | defer func() { 41 | ret := nvmllib.Shutdown() 42 | if ret != nvml.SUCCESS { 43 | klog.Infof("Error shutting down NVML: %v", ret) 44 | } 45 | }() 46 | 47 | deviceMap, err := NewDeviceMap(nvmllib, config) 48 | if err != nil { 49 | return nil, fmt.Errorf("error building device map: %v", err) 50 | } 51 | 52 | var rms []ResourceManager 53 | for resourceName, devices := range deviceMap { 54 | if len(devices) == 0 { 55 | continue 56 | } 57 | r := &nvmlResourceManager{ 58 | resourceManager: resourceManager{ 59 | config: config, 60 | resource: resourceName, 61 | devices: devices, 62 | }, 63 | nvml: nvmllib, 64 | } 65 | rms = append(rms, r) 66 | } 67 | 68 | return rms, nil 69 | } 70 | 71 | // GetPreferredAllocation runs an allocation algorithm over the inputs. 72 | // The algorithm chosen is based both on the incoming set of available devices and various config settings. 
73 | func (r *nvmlResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { 74 | return r.getPreferredAllocation(available, required, size) 75 | } 76 | 77 | // GetDevicePaths returns the required and optional device nodes for the requested resources 78 | func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string { 79 | paths := []string{ 80 | "/dev/nvidiactl", 81 | "/dev/nvidia-uvm", 82 | "/dev/nvidia-uvm-tools", 83 | "/dev/nvidia-modeset", 84 | } 85 | 86 | for _, p := range r.Devices().Subset(ids).GetPaths() { 87 | paths = append(paths, p) 88 | } 89 | 90 | return paths 91 | } 92 | 93 | // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices 94 | func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { 95 | return r.checkHealth(stop, r.devices, unhealthy) 96 | } 97 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | ) 24 | 25 | const ( 26 | tegraDeviceName = "tegra" 27 | ) 28 | 29 | // buildTegraDeviceMap creates a DeviceMap for the tegra devices in the sytesm. 30 | // NOTE: At present only a single tegra device is expected. 31 | func buildTegraDeviceMap(config *util.DeviceConfig) (DeviceMap, error) { 32 | devices := make(DeviceMap) 33 | 34 | name := tegraDeviceName 35 | i := 0 36 | for _, resource := range config.Resources.GPUs { 37 | if resource.Pattern.Matches(name) { 38 | index := fmt.Sprintf("%d", i) 39 | err := devices.setEntry(resource.Name, index, &tegraDevice{}) 40 | if err != nil { 41 | return nil, err 42 | } 43 | i++ 44 | } 45 | 46 | } 47 | return devices, nil 48 | } 49 | 50 | type tegraDevice struct{} 51 | 52 | var _ deviceInfo = (*tegraDevice)(nil) 53 | 54 | // GetUUID returns the UUID of the tegra device. 55 | // TODO: This is currently hardcoded to `tegra` 56 | func (d *tegraDevice) GetUUID() (string, error) { 57 | return tegraDeviceName, nil 58 | } 59 | 60 | // GetPaths returns the paths for a tegra device. 61 | // A tegra device does not have paths associated with it. 62 | func (d *tegraDevice) GetPaths() ([]string, error) { 63 | return nil, nil 64 | } 65 | 66 | // GetNumaNode always returns unsupported for a Tegra device 67 | func (d *tegraDevice) GetNumaNode() (bool, int, error) { 68 | return false, -1, nil 69 | } 70 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_manager.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | ) 24 | 25 | type tegraResourceManager struct { 26 | resourceManager 27 | } 28 | 29 | var _ ResourceManager = (*tegraResourceManager)(nil) 30 | 31 | // NewTegraResourceManagers returns a set of ResourceManagers for tegra resources 32 | func NewTegraResourceManagers(config *util.DeviceConfig) ([]ResourceManager, error) { 33 | deviceMap, err := buildTegraDeviceMap(config) 34 | if err != nil { 35 | return nil, fmt.Errorf("error building Tegra device map: %v", err) 36 | } 37 | 38 | deviceMap, err = updateDeviceMapWithReplicas(config, deviceMap) 39 | if err != nil { 40 | return nil, fmt.Errorf("error updating device map with replicas from config.sharing.timeSlicing.resources: %v", err) 41 | } 42 | 43 | var rms []ResourceManager 44 | for resourceName, devices := range deviceMap { 45 | if len(devices) == 0 { 46 | continue 47 | } 48 | r := &tegraResourceManager{ 49 | resourceManager: resourceManager{ 50 | config: config, 51 | resource: resourceName, 52 | devices: devices, 53 | }, 54 | } 55 | if len(devices) != 0 { 56 | rms = append(rms, r) 57 | } 58 | } 59 | 60 | return rms, nil 61 | } 62 | 63 | // GetPreferredAllocation returns a standard allocation for the Tegra resource manager. 
64 | func (r *tegraResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { 65 | return r.distributedAlloc(available, required, size) 66 | } 67 | 68 | // GetDevicePaths returns an empty slice for the tegraResourceManager 69 | func (r *tegraResourceManager) GetDevicePaths(ids []string) []string { 70 | return nil 71 | } 72 | 73 | // CheckHealth is disabled for the tegraResourceManager 74 | func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { 75 | return nil 76 | } 77 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package rm 18 | 19 | type wslDevice nvmlDevice 20 | 21 | var _ deviceInfo = (*wslDevice)(nil) 22 | 23 | // GetUUID returns the UUID of the device 24 | func (d wslDevice) GetUUID() (string, error) { 25 | return nvmlDevice(d).GetUUID() 26 | } 27 | 28 | // GetPaths returns the paths for a tegra device. 
// GetPaths returns the device paths for a WSL device. GPU access under WSL
// goes through the single /dev/dxg node rather than per-GPU device files.
func (d wslDevice) GetPaths() ([]string, error) {
	return []string{"/dev/dxg"}, nil
}

// GetNumaNode returns the NUMA node associated with the GPU device,
// delegating to the underlying nvmlDevice implementation.
func (d wslDevice) GetNumaNode() (bool, int, error) {
	return nvmlDevice(d).GetNumaNode()
}
cambricon.CambriconMLUCommonWord) 51 | DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord) 52 | } 53 | 54 | func PodAllocationTrySuccess(nodeName string, pod *v1.Pod) { 55 | refreshed, _ := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{}) 56 | annos := refreshed.Annotations[util.AssignedIDsToAllocateAnnotations] 57 | klog.Infoln("TrySuccess:", annos) 58 | for _, val := range DevicesToHandle { 59 | if strings.Contains(annos, val) { 60 | return 61 | } 62 | } 63 | klog.Infoln("AllDevicesAllocateSuccess releasing lock") 64 | PodAllocationSuccess(nodeName, pod) 65 | } 66 | 67 | func PodAllocationSuccess(nodeName string, pod *v1.Pod) { 68 | newannos := make(map[string]string) 69 | newannos[util.DeviceBindPhase] = util.DeviceBindSuccess 70 | err := util.PatchPodAnnotations(pod, newannos) 71 | if err != nil { 72 | klog.Errorf("patchPodAnnotations failed:%v", err.Error()) 73 | } 74 | err = nodelock.ReleaseNodeLock(nodeName) 75 | if err != nil { 76 | klog.Errorf("release lock failed:%v", err.Error()) 77 | } 78 | } 79 | 80 | func PodAllocationFailed(nodeName string, pod *v1.Pod) { 81 | newannos := make(map[string]string) 82 | newannos[util.DeviceBindPhase] = util.DeviceBindFailed 83 | err := util.PatchPodAnnotations(pod, newannos) 84 | if err != nil { 85 | klog.Errorf("patchPodAnnotations failed:%v", err.Error()) 86 | } 87 | err = nodelock.ReleaseNodeLock(nodeName) 88 | if err != nil { 89 | klog.Errorf("release lock failed:%v", err.Error()) 90 | } 91 | } 92 | 93 | func GlobalFlagSet() *flag.FlagSet { 94 | fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) 95 | for _, val := range devices { 96 | val.ParseConfig(fs) 97 | } 98 | fs.BoolVar(&DebugMode, "debug", false, "debug mode") 99 | klog.InitFlags(fs) 100 | return fs 101 | } 102 | -------------------------------------------------------------------------------- /pkg/k8sutil/client.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "k8s.io/client-go/kubernetes" 21 | "k8s.io/client-go/rest" 22 | "k8s.io/client-go/tools/clientcmd" 23 | "os" 24 | "path/filepath" 25 | ) 26 | 27 | // NewClient connects to an API server 28 | func NewClient() (kubernetes.Interface, error) { 29 | kubeConfig := os.Getenv("KUBECONFIG") 30 | if kubeConfig == "" { 31 | kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") 32 | } 33 | config, err := rest.InClusterConfig() 34 | if err != nil { 35 | config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) 36 | if err != nil { 37 | return nil, err 38 | } 39 | } 40 | client, err := kubernetes.NewForConfig(config) 41 | return client, err 42 | } 43 | -------------------------------------------------------------------------------- /pkg/k8sutil/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device" 21 | "4pd.io/k8s-vgpu/pkg/util" 22 | corev1 "k8s.io/api/core/v1" 23 | "k8s.io/klog/v2" 24 | ) 25 | 26 | func Resourcereqs(pod *corev1.Pod) (counts util.PodDeviceRequests) { 27 | counts = make(util.PodDeviceRequests, len(pod.Spec.Containers)) 28 | //Count Nvidia GPU 29 | for i := 0; i < len(pod.Spec.Containers); i++ { 30 | devices := device.GetDevices() 31 | counts[i] = make(util.ContainerDeviceRequests) 32 | for idx, val := range devices { 33 | request := val.GenerateResourceRequests(&pod.Spec.Containers[i]) 34 | if request.Nums > 0 { 35 | counts[i][idx] = val.GenerateResourceRequests(&pod.Spec.Containers[i]) 36 | } 37 | } 38 | } 39 | klog.InfoS("collect requestreqs", counts) 40 | return counts 41 | } 42 | 43 | func IsPodInTerminatedState(pod *corev1.Pod) bool { 44 | return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded 45 | } 46 | 47 | func AllContainersCreated(pod *corev1.Pod) bool { 48 | return len(pod.Status.ContainerStatuses) >= len(pod.Spec.Containers) 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/runtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // Runtime is an interface for a runtime shim. The Exec method accepts a list 20 | // of command line arguments, and returns an error / nil. 21 | type Runtime interface { 22 | Exec([]string) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "syscall" 23 | 24 | log "github.com/sirupsen/logrus" 25 | ) 26 | 27 | // SyscallExecRuntime wraps the path that a binary and defines the semanitcs for how to exec into it. 28 | // This can be used to wrap an OCI-compliant low-level runtime binary, allowing it to be used through the 29 | // Runtime internface. 
30 | type SyscallExecRuntime struct { 31 | logger *log.Logger 32 | path string 33 | // exec is used for testing. This defaults to syscall.Exec 34 | exec func(argv0 string, argv []string, envv []string) error 35 | } 36 | 37 | var _ Runtime = (*SyscallExecRuntime)(nil) 38 | 39 | // NewSyscallExecRuntime creates a SyscallExecRuntime for the specified path with the standard logger 40 | func NewSyscallExecRuntime(path string) (Runtime, error) { 41 | return NewSyscallExecRuntimeWithLogger(log.StandardLogger(), path) 42 | } 43 | 44 | // NewSyscallExecRuntimeWithLogger creates a SyscallExecRuntime for the specified logger and path 45 | func NewSyscallExecRuntimeWithLogger(logger *log.Logger, path string) (Runtime, error) { 46 | info, err := os.Stat(path) 47 | if err != nil { 48 | return nil, fmt.Errorf("invalid path '%v': %v", path, err) 49 | } 50 | if info.IsDir() || info.Mode()&0111 == 0 { 51 | return nil, fmt.Errorf("specified path '%v' is not an executable file", path) 52 | } 53 | 54 | shim := SyscallExecRuntime{ 55 | logger: logger, 56 | path: path, 57 | exec: syscall.Exec, 58 | } 59 | 60 | return &shim, nil 61 | } 62 | 63 | // Exec exces into the binary at the path from the SyscallExecRuntime struct, passing it the supplied arguments 64 | // after ensuring that the first argument is the path of the target binary. 65 | func (s SyscallExecRuntime) Exec(args []string) error { 66 | runtimeArgs := []string{s.path} 67 | if len(args) > 1 { 68 | runtimeArgs = append(runtimeArgs, args[1:]...) 69 | } 70 | 71 | err := s.exec(s.path, runtimeArgs, os.Environ()) 72 | if err != nil { 73 | return fmt.Errorf("could not exec '%v': %v", s.path, err) 74 | } 75 | 76 | // syscall.Exec is not expected to return. This is an error state regardless of whether 77 | // err is nil or not. 
78 | return fmt.Errorf("unexpected return from exec '%v'", s.path) 79 | } 80 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | package oci 17 | 18 | import ( 19 | "fmt" 20 | "strings" 21 | "testing" 22 | 23 | testlog "github.com/sirupsen/logrus/hooks/test" 24 | "github.com/stretchr/testify/require" 25 | ) 26 | 27 | func TestSyscallExecConstructor(t *testing.T) { 28 | r, err := NewSyscallExecRuntime("////an/invalid/path") 29 | require.Error(t, err) 30 | require.Nil(t, r) 31 | 32 | r, err = NewSyscallExecRuntime("/tmp") 33 | require.Error(t, err) 34 | require.Nil(t, r) 35 | 36 | r, err = NewSyscallExecRuntime("/dev/null") 37 | require.Error(t, err) 38 | require.Nil(t, r) 39 | 40 | r, err = NewSyscallExecRuntime("/bin/sh") 41 | require.NoError(t, err) 42 | 43 | f, ok := r.(*SyscallExecRuntime) 44 | require.True(t, ok) 45 | 46 | require.Equal(t, "/bin/sh", f.path) 47 | } 48 | 49 | func TestSyscallExecForwardsArgs(t *testing.T) { 50 | logger, _ := testlog.NewNullLogger() 51 | f := SyscallExecRuntime{ 52 | logger: logger, 53 | path: "runtime", 54 | } 55 | 56 | testCases := []struct { 57 | returnError error 58 | args []string 59 | errorPrefix string 60 | }{ 61 
| { 62 | returnError: nil, 63 | errorPrefix: "unexpected return from exec", 64 | }, 65 | { 66 | returnError: fmt.Errorf("error from exec"), 67 | errorPrefix: "could not exec", 68 | }, 69 | { 70 | returnError: nil, 71 | args: []string{"otherargv0"}, 72 | errorPrefix: "unexpected return from exec", 73 | }, 74 | { 75 | returnError: nil, 76 | args: []string{"otherargv0", "arg1", "arg2", "arg3"}, 77 | errorPrefix: "unexpected return from exec", 78 | }, 79 | } 80 | 81 | for i, tc := range testCases { 82 | execMock := WithMockExec(f, tc.returnError) 83 | 84 | err := execMock.Exec(tc.args) 85 | 86 | require.Errorf(t, err, "%d: %v", i, tc) 87 | require.Truef(t, strings.HasPrefix(err.Error(), tc.errorPrefix), "%d: %v", i, tc) 88 | if tc.returnError != nil { 89 | require.Truef(t, strings.HasSuffix(err.Error(), tc.returnError.Error()), "%d: %v", i, tc) 90 | } 91 | 92 | require.Equalf(t, f.path, execMock.argv0, "%d: %v", i, tc) 93 | require.Equalf(t, f.path, execMock.argv[0], "%d: %v", i, tc) 94 | 95 | require.LessOrEqualf(t, len(tc.args), len(execMock.argv), "%d: %v", i, tc) 96 | if len(tc.args) > 1 { 97 | require.Equalf(t, tc.args[1:], execMock.argv[1:], "%d: %v", i, tc) 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /pkg/oci/runtime_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // MockExecRuntime wraps a SyscallExecRuntime, intercepting the exec call for testing 20 | type MockExecRuntime struct { 21 | SyscallExecRuntime 22 | execMock 23 | } 24 | 25 | // WithMockExec wraps a specified SyscallExecRuntime with a mocked exec function for testing 26 | func WithMockExec(e SyscallExecRuntime, execResult error) *MockExecRuntime { 27 | m := MockExecRuntime{ 28 | SyscallExecRuntime: e, 29 | execMock: execMock{result: execResult}, 30 | } 31 | // overrdie the exec function to the mocked exec function. 32 | m.SyscallExecRuntime.exec = m.execMock.exec 33 | return &m 34 | } 35 | 36 | type execMock struct { 37 | argv0 string 38 | argv []string 39 | envv []string 40 | result error 41 | } 42 | 43 | func (m *execMock) exec(argv0 string, argv []string, envv []string) error { 44 | m.argv0 = argv0 45 | m.argv = argv 46 | m.envv = envv 47 | 48 | return m.result 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/spec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "encoding/json" 21 | "fmt" 22 | "os" 23 | 24 | oci "github.com/opencontainers/runtime-spec/specs-go" 25 | ) 26 | 27 | // SpecModifier is a function that accepts a pointer to an OCI Srec and returns an 28 | // error. The intention is that the function would modify the spec in-place. 29 | type SpecModifier func(*oci.Spec) error 30 | 31 | // Spec defines the operations to be performed on an OCI specification 32 | type Spec interface { 33 | Load() error 34 | Flush() error 35 | Modify(SpecModifier) error 36 | } 37 | 38 | type fileSpec struct { 39 | *oci.Spec 40 | path string 41 | } 42 | 43 | var _ Spec = (*fileSpec)(nil) 44 | 45 | // NewSpecFromFile creates an object that encapsulates a file-backed OCI spec. 46 | // This can be used to read from the file, modify the spec, and write to the 47 | // same file. 48 | func NewSpecFromFile(filepath string) Spec { 49 | oci := fileSpec{ 50 | path: filepath, 51 | } 52 | 53 | return &oci 54 | } 55 | 56 | // Load reads the contents of an OCI spec from file to be referenced internally. 57 | // The file is opened "read-only" 58 | func (s *fileSpec) Load() error { 59 | specFile, err := os.Open(s.path) 60 | if err != nil { 61 | return fmt.Errorf("error opening OCI specification file: %v", err) 62 | } 63 | defer specFile.Close() 64 | 65 | decoder := json.NewDecoder(specFile) 66 | 67 | var spec oci.Spec 68 | err = decoder.Decode(&spec) 69 | if err != nil { 70 | return fmt.Errorf("error reading OCI specification from file: %v", err) 71 | } 72 | 73 | s.Spec = &spec 74 | return nil 75 | } 76 | 77 | // Modify applies the specified SpecModifier to the stored OCI specification. 78 | func (s *fileSpec) Modify(f SpecModifier) error { 79 | if s.Spec == nil { 80 | return fmt.Errorf("no spec loaded for modification") 81 | } 82 | return f(s.Spec) 83 | } 84 | 85 | // Flush writes the stored OCI specification to the filepath specifed by the path member. 
86 | // The file is truncated upon opening, overwriting any existing contents. 87 | func (s fileSpec) Flush() error { 88 | specFile, err := os.Create(s.path) 89 | if err != nil { 90 | return fmt.Errorf("error opening OCI specification file: %v", err) 91 | } 92 | defer specFile.Close() 93 | 94 | encoder := json.NewEncoder(specFile) 95 | 96 | err = encoder.Encode(s.Spec) 97 | if err != nil { 98 | return fmt.Errorf("error writing OCI specification to file: %v", err) 99 | } 100 | 101 | return nil 102 | } 103 | -------------------------------------------------------------------------------- /pkg/oci/spec_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | oci "github.com/opencontainers/runtime-spec/specs-go" 21 | ) 22 | 23 | // MockSpec provides a simple mock for an OCI spec to be used in testing. 24 | // It also implements the SpecModifier interface. 
25 | type MockSpec struct { 26 | *oci.Spec 27 | MockLoad mockFunc 28 | MockFlush mockFunc 29 | MockModify mockFunc 30 | } 31 | 32 | var _ Spec = (*MockSpec)(nil) 33 | 34 | // NewMockSpec constructs a MockSpec to be used in testing as a Spec 35 | func NewMockSpec(spec *oci.Spec, flushResult error, modifyResult error) *MockSpec { 36 | s := MockSpec{ 37 | Spec: spec, 38 | MockFlush: mockFunc{result: flushResult}, 39 | MockModify: mockFunc{result: modifyResult}, 40 | } 41 | 42 | return &s 43 | } 44 | 45 | // Load invokes the mocked Load function to return the predefined error / result 46 | func (s *MockSpec) Load() error { 47 | return s.MockLoad.call() 48 | } 49 | 50 | // Flush invokes the mocked Load function to return the predefined error / result 51 | func (s *MockSpec) Flush() error { 52 | return s.MockFlush.call() 53 | } 54 | 55 | // Modify applies the specified SpecModifier to the spec and invokes the 56 | // mocked modify function to return the predefined error / result. 57 | func (s *MockSpec) Modify(f SpecModifier) error { 58 | f(s.Spec) 59 | return s.MockModify.call() 60 | } 61 | 62 | type mockFunc struct { 63 | Callcount int 64 | result error 65 | } 66 | 67 | func (m *mockFunc) call() error { 68 | m.Callcount++ 69 | return m.result 70 | } 71 | -------------------------------------------------------------------------------- /pkg/scheduler/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package config 18 | 19 | var ( 20 | HttpBind string 21 | SchedulerName string 22 | DefaultMem int32 23 | DefaultCores int32 24 | MetricsBindAddress string 25 | ) 26 | -------------------------------------------------------------------------------- /pkg/scheduler/nodes.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "fmt" 21 | "strings" 22 | "sync" 23 | 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | "k8s.io/klog/v2" 26 | ) 27 | 28 | type DeviceInfo struct { 29 | ID string 30 | Index uint 31 | Count int32 32 | Devmem int32 33 | Devcore int32 34 | Type string 35 | Numa int 36 | Health bool 37 | } 38 | 39 | type NodeInfo struct { 40 | ID string 41 | Devices []DeviceInfo 42 | } 43 | 44 | type DeviceUsageList []*util.DeviceUsage 45 | 46 | type NodeUsage struct { 47 | Devices DeviceUsageList 48 | } 49 | 50 | type nodeManager struct { 51 | nodes map[string]*NodeInfo 52 | mutex sync.RWMutex 53 | } 54 | 55 | func (m *nodeManager) init() { 56 | m.nodes = make(map[string]*NodeInfo) 57 | } 58 | 59 | func (m *nodeManager) addNode(nodeID string, nodeInfo *NodeInfo) { 60 | if nodeInfo == nil || len(nodeInfo.Devices) == 0 { 61 | return 62 | } 63 | m.mutex.Lock() 64 | defer m.mutex.Unlock() 65 | _, ok := m.nodes[nodeID] 66 | if ok { 67 | tmp := make([]DeviceInfo, 0, len(m.nodes[nodeID].Devices)+len(nodeInfo.Devices)) 68 | tmp = append(tmp, m.nodes[nodeID].Devices...) 69 | tmp = append(tmp, nodeInfo.Devices...) 
70 | m.nodes[nodeID].Devices = tmp 71 | } else { 72 | m.nodes[nodeID] = nodeInfo 73 | } 74 | } 75 | 76 | func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *NodeInfo) { 77 | m.mutex.Lock() 78 | defer m.mutex.Unlock() 79 | _, ok := m.nodes[nodeID] 80 | if ok { 81 | if m.nodes[nodeID].Devices == nil || len(m.nodes[nodeID].Devices) == 0 { 82 | return 83 | } 84 | klog.Infoln("before rm:", m.nodes[nodeID].Devices, "needs remove", nodeInfo.Devices) 85 | tmp := make([]DeviceInfo, 0, len(m.nodes[nodeID].Devices)-len(nodeInfo.Devices)) 86 | for _, val := range m.nodes[nodeID].Devices { 87 | found := false 88 | for _, rmval := range nodeInfo.Devices { 89 | if strings.Compare(val.ID, rmval.ID) == 0 { 90 | found = true 91 | break 92 | } 93 | } 94 | if !found && len(val.ID) > 0 { 95 | tmp = append(tmp, val) 96 | } 97 | } 98 | m.nodes[nodeID].Devices = tmp 99 | klog.Infoln("Rm Devices res:", m.nodes[nodeID].Devices) 100 | } 101 | } 102 | 103 | func (m *nodeManager) GetNode(nodeID string) (*NodeInfo, error) { 104 | m.mutex.RLock() 105 | defer m.mutex.RUnlock() 106 | if n, ok := m.nodes[nodeID]; ok { 107 | return n, nil 108 | } 109 | return &NodeInfo{}, fmt.Errorf("node %v not found", nodeID) 110 | } 111 | 112 | func (m *nodeManager) ListNodes() (map[string]*NodeInfo, error) { 113 | m.mutex.RLock() 114 | defer m.mutex.RUnlock() 115 | return m.nodes, nil 116 | } 117 | -------------------------------------------------------------------------------- /pkg/scheduler/pods.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "sync" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | corev1 "k8s.io/api/core/v1" 24 | k8stypes "k8s.io/apimachinery/pkg/types" 25 | "k8s.io/klog/v2" 26 | ) 27 | 28 | type podInfo struct { 29 | Namespace string 30 | Name string 31 | Uid k8stypes.UID 32 | NodeID string 33 | Devices util.PodDevices 34 | CtrIDs []string 35 | } 36 | 37 | type podManager struct { 38 | pods map[k8stypes.UID]*podInfo 39 | mutex sync.RWMutex 40 | } 41 | 42 | func (m *podManager) init() { 43 | m.pods = make(map[k8stypes.UID]*podInfo) 44 | } 45 | 46 | func (m *podManager) addPod(pod *corev1.Pod, nodeID string, devices util.PodDevices) { 47 | m.mutex.Lock() 48 | defer m.mutex.Unlock() 49 | _, ok := m.pods[pod.UID] 50 | if !ok { 51 | pi := &podInfo{Name: pod.Name, Uid: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices} 52 | m.pods[pod.UID] = pi 53 | klog.Infof("Pod added: Name: %s, Uid: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID) 54 | } 55 | } 56 | 57 | func (m *podManager) delPod(pod *corev1.Pod) { 58 | m.mutex.Lock() 59 | defer m.mutex.Unlock() 60 | pi, ok := m.pods[pod.UID] 61 | if ok { 62 | klog.Infof("Deleted pod %s with node ID %s", pi.Name, pi.NodeID) 63 | delete(m.pods, pod.UID) 64 | } 65 | } 66 | 67 | func (m *podManager) GetScheduledPods() (map[k8stypes.UID]*podInfo, error) { 68 | m.mutex.RLock() 69 | defer m.mutex.RUnlock() 70 | klog.Infof("Getting all scheduled pods with %d nums", len(m.pods)) 71 | return m.pods, nil 72 | } 73 | 
-------------------------------------------------------------------------------- /pkg/scheduler/scheduler_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "testing" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | "gotest.tools/v3/assert" 24 | corev1 "k8s.io/api/core/v1" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | ) 27 | 28 | func Test_getNodesUsage(t *testing.T) { 29 | nodeMage := nodeManager{} 30 | nodeMage.init() 31 | nodeMage.addNode("node1", &NodeInfo{ 32 | ID: "node1", 33 | Devices: []DeviceInfo{ 34 | { 35 | ID: "GPU0", 36 | Index: 0, 37 | Count: 10, 38 | Devmem: 1024, 39 | Devcore: 100, 40 | Numa: 1, 41 | Health: true, 42 | }, 43 | { 44 | ID: "GPU1", 45 | Index: 1, 46 | Count: 10, 47 | Devmem: 1024, 48 | Devcore: 100, 49 | Numa: 1, 50 | Health: true, 51 | }, 52 | }, 53 | }) 54 | podDevces := util.PodDevices{ 55 | "NVIDIA": util.PodSingleDevice{ 56 | []util.ContainerDevice{ 57 | { 58 | Idx: 0, 59 | UUID: "GPU0", 60 | Usedmem: 100, 61 | Usedcores: 10, 62 | }, 63 | }, 64 | }, 65 | } 66 | podMap := podManager{} 67 | podMap.init() 68 | podMap.addPod(&corev1.Pod{ 69 | ObjectMeta: metav1.ObjectMeta{ 70 | UID: "1111", 71 | Name: "test1", 72 | Namespace: "default", 73 | }, 74 | }, "node1", podDevces) 75 | podMap.addPod(&corev1.Pod{ 76 | 
ObjectMeta: metav1.ObjectMeta{ 77 | UID: "2222", 78 | Name: "test2", 79 | Namespace: "default", 80 | }, 81 | }, "node1", podDevces) 82 | s := Scheduler{ 83 | nodeManager: nodeMage, 84 | podManager: podMap, 85 | } 86 | nodes := make([]string, 0) 87 | nodes = append(nodes, "node1") 88 | cachenodeMap, _, err := s.getNodesUsage(&nodes, nil) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | assert.Equal(t, len(*cachenodeMap), 1) 93 | v, ok := (*cachenodeMap)["node1"] 94 | assert.Equal(t, ok, true) 95 | assert.Equal(t, len(v.Devices), 2) 96 | assert.Equal(t, v.Devices[0].Used, int32(2)) 97 | assert.Equal(t, v.Devices[0].Usedmem, int32(200)) 98 | assert.Equal(t, v.Devices[0].Usedcores, int32(20)) 99 | } 100 | -------------------------------------------------------------------------------- /pkg/scheduler/webhook.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "context" 21 | "encoding/json" 22 | "fmt" 23 | "net/http" 24 | 25 | "4pd.io/k8s-vgpu/pkg/device" 26 | "4pd.io/k8s-vgpu/pkg/scheduler/config" 27 | corev1 "k8s.io/api/core/v1" 28 | "k8s.io/apimachinery/pkg/runtime" 29 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 30 | "sigs.k8s.io/controller-runtime/pkg/webhook/admission" 31 | ) 32 | 33 | type webhook struct { 34 | decoder *admission.Decoder 35 | } 36 | 37 | func NewWebHook() (*admission.Webhook, error) { 38 | schema := runtime.NewScheme() 39 | if err := clientgoscheme.AddToScheme(schema); err != nil { 40 | return nil, err 41 | } 42 | decoder := admission.NewDecoder(schema) 43 | wh := &admission.Webhook{Handler: &webhook{decoder: decoder}} 44 | return wh, nil 45 | } 46 | 47 | func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Response { 48 | pod := &corev1.Pod{} 49 | err := h.decoder.Decode(req, pod) 50 | if err != nil { 51 | return admission.Errored(http.StatusBadRequest, err) 52 | } 53 | if len(pod.Spec.Containers) == 0 { 54 | return admission.Denied("pod has no containers") 55 | } 56 | //klog.V(1).Infof("hook %v pod %v/%v", req.UID, req.Namespace, req.Name) 57 | fmt.Printf("hook %v pod %v/%v", req.UID, req.Namespace, req.Name) 58 | hasResource := false 59 | for idx, ctr := range pod.Spec.Containers { 60 | c := &pod.Spec.Containers[idx] 61 | if ctr.SecurityContext != nil { 62 | if ctr.SecurityContext.Privileged != nil && *ctr.SecurityContext.Privileged { 63 | continue 64 | } 65 | } 66 | 67 | for _, val := range device.GetDevices() { 68 | hasResource = hasResource || val.MutateAdmission(c) 69 | } 70 | } 71 | 72 | if !hasResource { 73 | return admission.Allowed("no resource found") 74 | } 75 | if len(config.SchedulerName) > 0 { 76 | pod.Spec.SchedulerName = config.SchedulerName 77 | } 78 | marshaledPod, err := json.Marshal(pod) 79 | if err != nil { 80 | return admission.Errored(http.StatusInternalServerError, err) 81 | } 82 
| return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod) 83 | } 84 | -------------------------------------------------------------------------------- /pkg/util/client/client.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "k8s.io/client-go/kubernetes" 8 | "k8s.io/client-go/rest" 9 | "k8s.io/client-go/tools/clientcmd" 10 | "k8s.io/klog" 11 | ) 12 | 13 | var ( 14 | kubeClient kubernetes.Interface 15 | ) 16 | 17 | func init() { 18 | kubeClient, _ = NewClient() 19 | } 20 | 21 | func GetClient() kubernetes.Interface { 22 | return kubeClient 23 | } 24 | 25 | // NewClient connects to an API server 26 | func NewClient() (kubernetes.Interface, error) { 27 | kubeConfig := os.Getenv("KUBECONFIG") 28 | if kubeConfig == "" { 29 | kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") 30 | } 31 | config, err := rest.InClusterConfig() 32 | if err != nil { 33 | klog.Infoln("InClusterConfig failed", err.Error()) 34 | config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) 35 | if err != nil { 36 | klog.Errorln("BuildFromFlags failed", err.Error()) 37 | return nil, err 38 | } 39 | } 40 | client, err := kubernetes.NewForConfig(config) 41 | return client, err 42 | } 43 | -------------------------------------------------------------------------------- /pkg/util/util_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package util 18 | 19 | import ( 20 | "fmt" 21 | "testing" 22 | 23 | "gotest.tools/v3/assert" 24 | ) 25 | 26 | var inRequestDevices map[string]string 27 | 28 | func init() { 29 | inRequestDevices = make(map[string]string) 30 | inRequestDevices["NVIDIA"] = "hami.sh/vgpu-devices-to-allocate" 31 | } 32 | 33 | func TestEmptyContainerDevicesCoding(t *testing.T) { 34 | cd1 := ContainerDevices{} 35 | s := EncodeContainerDevices(cd1) 36 | fmt.Println(s) 37 | cd2, _ := DecodeContainerDevices(s) 38 | assert.DeepEqual(t, cd1, cd2) 39 | } 40 | 41 | func TestEmptyPodDeviceCoding(t *testing.T) { 42 | pd1 := PodDevices{} 43 | s := EncodePodDevices(inRequestDevices, pd1) 44 | fmt.Println(s) 45 | pd2, _ := DecodePodDevices(inRequestDevices, s) 46 | assert.DeepEqual(t, pd1, pd2) 47 | } 48 | 49 | func TestPodDevicesCoding(t *testing.T) { 50 | pd1 := PodDevices{ 51 | "NVIDIA": PodSingleDevice{ 52 | ContainerDevices{ 53 | ContainerDevice{0, "UUID1", "Type1", 1000, 30}, 54 | }, 55 | ContainerDevices{ 56 | ContainerDevice{0, "UUID1", "Type1", 1000, 30}, 57 | }, 58 | }, 59 | } 60 | s := EncodePodDevices(inRequestDevices, pd1) 61 | fmt.Println(s) 62 | pd2, _ := DecodePodDevices(inRequestDevices, s) 63 | assert.DeepEqual(t, pd1, pd2) 64 | } 65 | -------------------------------------------------------------------------------- /pkg/version/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "fmt" 21 | "github.com/spf13/cobra" 22 | ) 23 | 24 | var ( 25 | version string 26 | VersionCmd = &cobra.Command{ 27 | Use: "version", 28 | Short: "print version", 29 | Run: func(cmd *cobra.Command, args []string) { 30 | fmt.Println(Version()) 31 | }, 32 | } 33 | ) 34 | 35 | func Version() string { 36 | return version 37 | } 38 | -------------------------------------------------------------------------------- /version.mk: -------------------------------------------------------------------------------- 1 | GO=go 2 | GO111MODULE=on 3 | CMDS=scheduler vGPUmonitor 4 | DEVICES=nvidia 5 | OUTPUT_DIR=bin 6 | TARGET_ARCH=amd64 7 | GOLANG_IMAGE=golang:1.21-bullseye 8 | NVIDIA_IMAGE=nvidia/cuda:11.2.2-base-ubuntu20.04 9 | DEST_DIR=/usr/local/vgpu/ 10 | 11 | VERSION = v0.0.1 12 | IMG_NAME ="k8s-vgpu-scheduler" 13 | IMG_TAG="${IMG_NAME}:${VERSION}" --------------------------------------------------------------------------------