├── .github └── workflows │ └── main.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── AUTHORS ├── CHANGELOG.md ├── HAMi.jpg ├── LICENSE ├── MAINTAINERS.md ├── Makefile ├── README.md ├── README_cn.md ├── benchmarks └── ai-benchmark │ ├── Dockerfile │ ├── Hami │ └── ai-benchmark.yml │ └── Official-Nvidia-device-plugin │ ├── ai-benchmark.yml │ └── nvidia-device-plugin-official.yml ├── charts └── vgpu │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── device-plugin │ │ ├── configmap.yaml │ │ ├── daemonsethygon.yaml │ │ ├── daemonsetmlu.yaml │ │ ├── daemonsetnvidia.yaml │ │ ├── monitorrole.yaml │ │ ├── monitorrolebinding.yaml │ │ ├── monitorservice.yaml │ │ └── monitorserviceaccount.yaml │ └── scheduler │ │ ├── configmap.yaml │ │ ├── configmapnew.yaml │ │ ├── deployment.yaml │ │ ├── job-patch │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── job-createSecret.yaml │ │ ├── job-patchWebhook.yaml │ │ ├── psp.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml │ │ ├── rolebinding.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── webhook.yaml │ └── values.yaml ├── cmd ├── device-plugin │ ├── hygon │ │ └── main.go │ ├── mlu │ │ └── main.go │ └── nvidia │ │ ├── main.go │ │ ├── plugin-manager.go │ │ ├── vgpucfg.go │ │ └── watchers.go ├── scheduler │ ├── main.go │ └── metrics.go └── vGPUmonitor │ ├── build.sh │ ├── cudevshr.go │ ├── feedback.go │ ├── main.go │ ├── metrics.go │ ├── noderpc │ ├── noderpc.pb.go │ ├── noderpc.proto │ └── noderpc_grpc.pb.go │ ├── pathmonitor.go │ ├── pathmonitor_test.go │ ├── testcollector │ ├── main.go │ └── testcollector │ └── validation.go ├── docker ├── Dockerfile └── entrypoint.sh ├── docs ├── benchmark.md ├── benchmark_cn.md ├── cambricon-mlu-support.md ├── cambricon-mlu-support_cn.md ├── config.md ├── config_cn.md ├── dashboard.md ├── dashboard_cn.md ├── develop │ ├── design.md │ ├── imgs │ │ ├── flowchart.jpeg │ │ ├── offline_validation.png │ │ ├── 
protocol_pod.png │ │ └── protocol_register.png │ ├── protocol.md │ ├── roadmap.md │ └── tasklist.md ├── gpu-dashboard.json ├── hygon-dcu-support.md ├── hygon-dcu-support_cn.md └── offline-install.md ├── example.yaml ├── examples ├── hygon │ ├── default_use.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml ├── mlu │ ├── default_use.yaml │ ├── multi-pods.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml └── nvidia │ ├── default_use.yaml │ ├── default_use_legacy.yaml │ ├── example.yaml │ ├── mig_example.yaml │ ├── specify_card_type_not_use.yaml │ ├── specify_card_type_to_use.yaml │ ├── use_exclusive_card.yaml │ └── use_memory_fraction.yaml ├── go.mod ├── go.sum ├── hack ├── build.sh └── update-generated-api.sh ├── imgs ├── arch.png ├── benchmark.png ├── benchmark_inf.png ├── benchmark_train.png ├── example.png └── hard_limit.jpg ├── lib ├── mlu │ ├── cntopo │ ├── libcndev.so │ └── smlu-containerd └── nvidia │ ├── ld.so.preload │ └── libvgpu.so ├── pkg ├── api │ ├── device_register.go │ └── types.go ├── device-plugin │ ├── hygon │ │ └── dcu │ │ │ ├── amdgpu │ │ │ └── amdgpu.go │ │ │ ├── corealloc.go │ │ │ ├── corealloc_test.go │ │ │ ├── hwloc │ │ │ └── hwloc.go │ │ │ ├── register.go │ │ │ └── server.go │ ├── mlu │ │ ├── allocator │ │ │ ├── allocator.go │ │ │ ├── allocator_suite_test.go │ │ │ ├── board.go │ │ │ ├── board_test.go │ │ │ ├── default.go │ │ │ ├── spider.go │ │ │ └── spider_test.go │ │ ├── cache.go │ │ ├── cambricon.go │ │ ├── cndev │ │ │ ├── bindings.go │ │ │ ├── bindings_test.go │ │ │ ├── cndev.go │ │ │ ├── cndev_dl.go │ │ │ ├── cndev_test.go │ │ │ ├── include │ │ │ │ └── cndev.h │ │ │ └── mock │ │ │ │ ├── cJSON.c │ │ │ │ ├── cJSON.h │ │ │ │ ├── cndev.c │ │ │ │ └── main.c │ │ ├── cntopo │ │ │ ├── cntopo.go │ │ │ └── mock │ │ │ │ └── cntopo.go │ │ ├── const.go │ │ ├── options.go │ │ ├── podutils.go │ │ ├── register.go │ │ └── server.go │ └── nvidiadevice │ │ └── nvinternal │ │ ├── cdi │ │ ├── api.go │ 
│ ├── api_mock.go │ │ ├── cdi.go │ │ ├── factory.go │ │ ├── null.go │ │ └── options.go │ │ ├── info │ │ └── version.go │ │ ├── mig │ │ └── mig.go │ │ ├── plugin │ │ ├── api.go │ │ ├── manager │ │ │ ├── api.go │ │ │ ├── factory.go │ │ │ ├── null.go │ │ │ ├── nvml.go │ │ │ ├── options.go │ │ │ └── tegra.go │ │ ├── register.go │ │ ├── register_test.go │ │ ├── server.go │ │ └── server_test.go │ │ └── rm │ │ ├── allocate.go │ │ ├── device_map.go │ │ ├── device_map_test.go │ │ ├── devices.go │ │ ├── health.go │ │ ├── health_test.go │ │ ├── helper.go │ │ ├── nvml_devices.go │ │ ├── nvml_manager.go │ │ ├── rm.go │ │ ├── tegra_devices.go │ │ ├── tegra_manager.go │ │ └── wsl_devices.go ├── device │ ├── cambricon │ │ └── device.go │ ├── devices.go │ ├── hygon │ │ └── device.go │ ├── iluvatar │ │ └── device.go │ └── nvidia │ │ └── device.go ├── k8sutil │ ├── client.go │ └── pod.go ├── oci │ ├── runtime.go │ ├── runtime_exec.go │ ├── runtime_exec_test.go │ ├── runtime_mock.go │ ├── spec.go │ └── spec_mock.go ├── scheduler │ ├── config │ │ └── config.go │ ├── nodes.go │ ├── pods.go │ ├── routes │ │ └── route.go │ ├── scheduler.go │ ├── scheduler_test.go │ ├── score.go │ └── webhook.go ├── util │ ├── client │ │ └── client.go │ ├── nodelock │ │ └── nodelock.go │ ├── types.go │ ├── util.go │ └── util_test.go └── version │ └── version.go └── version.mk /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | tags: 10 | - v[0-9]+.[0-9]+.[0-9]+.[0-9]+ 11 | - v[0-9]+.[0-9]+.[0-9]+ 12 | - v[0-9]+.[0-9]+ 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | # A workflow run is made up of one or more jobs that can run sequentially or in 
parallel 18 | jobs: 19 | # This workflow contains a single job called "build" 20 | build: 21 | # The type of runner that the job will run on 22 | runs-on: ubuntu-latest 23 | 24 | # Steps represent a sequence of tasks that will be executed as part of the job 25 | steps: 26 | - name: Checkout 27 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 28 | uses: actions/checkout@v2 29 | 30 | 31 | - name: Setup Go environment 32 | uses: actions/setup-go@v5.0.0 33 | with: 34 | go-version: 1.21 35 | 36 | - name: Get branch name 37 | uses: nelonoel/branch-name@v1.0.1 38 | 39 | - name: Docker Login 40 | uses: docker/login-action@v1.10.0 41 | with: 42 | # Server address of Docker registry. If not set then will default to Docker Hub 43 | # registry: 4pdosc 44 | # Username used to log against the Docker registry 45 | username: ${{ secrets.DOCKERHUB_USERNAME }} 46 | # Password or personal access token used to log against the Docker registry 47 | password: ${{ secrets.DOCKERHUB_TOKEN }} 48 | 49 | - name: Set up Docker Buildx 50 | id: buildx 51 | uses: docker/setup-buildx-action@v1 52 | 53 | - run: make tidy 54 | # run: make proto 55 | - run: SHORT_VERSION="${BRANCH_NAME}" bash ./hack/build.sh 56 | 57 | - name: Publish Helm charts 58 | uses: stefanprodan/helm-gh-pages@master 59 | with: 60 | token: ${{ secrets.GITHUB_TOKEN }} 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | run_device_plugin.sh 3 | run_scheduler.sh 4 | device_plugin.sh 5 | libvgpu/build 6 | updateso.sh 7 | .idea 8 | vendor 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - build_image 3 | - deploy 4 | 5 | variables: 6 | IMAGE_NAME: k8s-vgpu 7 | 8 | .build_image: 9 | stage: build_image 
10 | image: '${DIND_IMAGE}' 11 | script: 12 | - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} 13 | - > 14 | docker build -t ${IMAGE_FULL_NAME} 15 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} 16 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} 17 | --build-arg VERSION=${VERSION} 18 | --build-arg GOPROXY=${GOPROXY} -f ./docker/Dockerfile . 19 | - docker push ${IMAGE_FULL_NAME} 20 | 21 | build_dev_image: 22 | extends: .build_image 23 | variables: 24 | IMAGE_TAG: ${CI_COMMIT_SHA} 25 | VERSION: ${CI_COMMIT_SHA} 26 | only: 27 | - master 28 | 29 | build_release_image: 30 | extends: .build_image 31 | variables: 32 | IMAGE_TAG: ${CI_COMMIT_TAG} 33 | VERSION: ${CI_COMMIT_TAG}-${CI_COMMIT_SHA} 34 | only: 35 | - tags 36 | 37 | .deploy: 38 | stage: deploy 39 | image: '${HELM_IMAGE}' 40 | variables: 41 | RELEASE_NAME: vgpu 42 | RELEASE_NAMESPACE: vgpu 43 | EXTRA_ARGS: '' 44 | script: 45 | - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} 46 | - > 47 | helm upgrade --install ${RELEASE_NAME} ./charts/vgpu 48 | -n ${RELEASE_NAMESPACE} 49 | --set scheduler.extender.image=${IMAGE_FULL_NAME} 50 | --set devicePlugin.image=${IMAGE_FULL_NAME} 51 | ${EXTRA_ARGS} 52 | 53 | deploy_develop: 54 | extends: .deploy 55 | variables: 56 | IMAGE_TAG: ${CI_COMMIT_SHA} 57 | environment: 58 | name: vgpu-develop 59 | only: 60 | - master 61 | tags: 62 | - deploy-test 63 | 64 | deploy_pre_product: 65 | extends: .deploy 66 | variables: 67 | IMAGE_TAG: ${CI_COMMIT_TAG} 68 | EXTRA_ARGS: "--wait --timeout=30m" 69 | environment: 70 | name: vgpu-develop 71 | only: 72 | - tags 73 | tags: 74 | - deploy-test 75 | 76 | deploy_product: 77 | extends: .deploy 78 | variables: 79 | IMAGE_TAG: ${CI_COMMIT_TAG} 80 | environment: 81 | name: vgpu-product 82 | only: 83 | - tags 84 | tags: 85 | - deploy-product 86 | when: manual 87 | 88 | -------------------------------------------------------------------------------- /.gitmodules:
-------------------------------------------------------------------------------- 1 | [submodule "libvgpu"] 2 | path = libvgpu 3 | url = https://github.com/Project-HAMi/HAMi-core.git 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | The following people, in alphabetical order, have either authored or signed 2 | off on commits in the HAMi repository: 3 | 4 | archlitchi limengxuan@4paradigm.com 5 | peizhaoyou peizhaoyou@4paradigm.com 6 | chaunceyjiang chaunceyjiang@gmail.com 7 | wawa0210 8 | whybeyoung 9 | gsakun 10 | CoderTH 11 | lengrongfu 12 | chaunceyjiang 13 | atttx123 14 | zhengbingxian 15 | 16 | 17 | -------------------------------------------------------------------------------- /HAMi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/HAMi.jpg -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Maintainers 2 | 3 | Please see the [AUTHORS](./AUTHORS) file for the full list of contributors to the project 4 | 5 | ## HAMi Committers 6 | 7 | | Maintainer | Employer | 8 | |---------------------------------------------------|-----------| 9 | | [Li Mengxuan](https://github.com/archlitchi) | 4Paradigm | 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ##### Global variables ##### 2 | include version.mk 3 | 4 | all: build 5 | 6 | docker: 7 | docker build \ 8 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 9 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 10 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 11 | --build-arg DEST_DIR=${DEST_DIR} \ 12 | .
-f=docker/Dockerfile -t ${IMG_TAG} 13 | 14 | tidy: 15 | $(GO) mod tidy 16 | 17 | proto: 18 | $(GO) get github.com/gogo/protobuf/protoc-gen-gofast@v1.3.2 19 | protoc --gofast_out=plugins=grpc:. ./pkg/api/*.proto 20 | 21 | build: $(CMDS) $(DEVICES) 22 | 23 | $(CMDS): 24 | $(GO) build -ldflags '-s -w -X 4pd.io/k8s-vgpu/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ 25 | 26 | $(DEVICES): 27 | $(GO) build -ldflags '-s -w -X 4pd.io/k8s-vgpu/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@-device-plugin ./cmd/device-plugin/$@ 28 | 29 | clean: 30 | $(GO) clean -r -x ./cmd/... 31 | -rm -rf $(OUTPUT_DIR) 32 | 33 | .PHONY: all build docker clean $(CMDS) 34 | -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.4.1-gpu 2 | 3 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils 4 | 5 | RUN pip install --upgrade pip 6 | 7 | RUN apt-get -y install git 8 | RUN git clone -b feat/transformer https://github.com/shiyoubun/ai-benchmark.git 9 | 10 | WORKDIR ai-benchmark 11 | RUN pip install -e . 
12 | 13 | ENTRYPOINT [ "python", "bin/ai-benchmark.py" ] 14 | -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Hami/ai-benchmark.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark 9 | spec: 10 | containers: 11 | - name: ai-benchmark 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | nvidia.com/gpumem-percentage: 50 17 | limits: 18 | nvidia.com/gpu: 1 19 | nvidia.com/gpumem-percentage: 50 20 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Official-Nvidia-device-plugin/ai-benchmark.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark 9 | spec: 10 | containers: 11 | - name: ai-benchmark 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | limits: 17 | nvidia.com/gpu: 1 18 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Official-Nvidia-device-plugin/nvidia-device-plugin-official.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: apps/v1 16 | kind: DaemonSet 17 | metadata: 18 | name: nvidia-device-plugin-daemonset 19 | namespace: kube-system 20 | spec: 21 | selector: 22 | matchLabels: 23 | name: nvidia-device-plugin-ds 24 | updateStrategy: 25 | type: RollingUpdate 26 | template: 27 | metadata: 28 | # This annotation is deprecated. Kept here for backward compatibility 29 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 30 | annotations: 31 | scheduler.alpha.kubernetes.io/critical-pod: "" 32 | labels: 33 | name: nvidia-device-plugin-ds 34 | spec: 35 | tolerations: 36 | # This toleration is deprecated. Kept here for backward compatibility 37 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 38 | - key: CriticalAddonsOnly 39 | operator: Exists 40 | - key: nvidia.com/gpu 41 | operator: Exists 42 | effect: NoSchedule 43 | # Mark this pod as a critical add-on; when enabled, the critical add-on 44 | # scheduler reserves resources for critical add-on pods so that they can 45 | # be rescheduled after a failure. 
46 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 47 | priorityClassName: "system-node-critical" 48 | containers: 49 | - image: nvcr.io/nvidia/k8s-device-plugin:v0.9.0 50 | name: nvidia-device-plugin-ctr 51 | args: ["--fail-on-init-error=false"] 52 | securityContext: 53 | allowPrivilegeEscalation: false 54 | capabilities: 55 | drop: ["ALL"] 56 | volumeMounts: 57 | - name: device-plugin 58 | mountPath: /var/lib/kubelet/device-plugins 59 | volumes: 60 | - name: device-plugin 61 | hostPath: 62 | path: /var/lib/kubelet/device-plugins 63 | 64 | -------------------------------------------------------------------------------- /charts/vgpu/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: vgpu 3 | version: 2.0.0 4 | kubeVersion: ">= 1.16.0" 5 | description: Heterogeneous AI Computing Virtualization Middleware 6 | keywords: 7 | - vgpu 8 | - gpu 9 | type: application 10 | maintainers: 11 | - name: limengxuan 12 | email: limengxuan@4paradigm.com 13 | appVersion: 0.0.2 -------------------------------------------------------------------------------- /charts/vgpu/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | ** Please be patient while the chart is being deployed ** 2 | Resource name: {{ .Values.resourceName }} 3 | 4 | -------------------------------------------------------------------------------- /charts/vgpu/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "4pd-vgpu.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 6 | {{- end -}} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "4pd-vgpu.fullname" -}} 14 | {{- if .Values.fullnameOverride -}} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride -}} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- else -}} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 22 | {{- end -}} 23 | {{- end -}} 24 | {{- end -}} 25 | 26 | {{/* 27 | The app name for Scheduler 28 | */}} 29 | {{- define "4pd-vgpu.scheduler" -}} 30 | {{- printf "%s-scheduler" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 31 | {{- end -}} 32 | 33 | {{/* 34 | The app name for DevicePlugin 35 | */}} 36 | {{- define "4pd-vgpu.device-plugin" -}} 37 | {{- printf "%s-device-plugin" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 38 | {{- end -}} 39 | 40 | {{/* 41 | The tls secret name for Scheduler 42 | */}} 43 | {{- define "4pd-vgpu.scheduler.tls" -}} 44 | {{- printf "%s-scheduler-tls" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 45 | {{- end -}} 46 | 47 | {{/* 48 | The webhook name 49 | */}} 50 | {{- define "4pd-vgpu.scheduler.webhook" -}} 51 | {{- printf "%s-webhook" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} 52 | {{- end -}} 53 | 54 | {{/* 55 | Create chart name and version as used by the chart label. 56 | */}} 57 | {{- define "4pd-vgpu.chart" -}} 58 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 59 | {{- end }} 60 | 61 | {{/* 62 | Common labels 63 | */}} 64 | {{- define "4pd-vgpu.labels" -}} 65 | helm.sh/chart: {{ include "4pd-vgpu.chart" . }} 66 | {{ include "4pd-vgpu.selectorLabels" . 
}} 67 | {{- if .Chart.AppVersion }} 68 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 69 | {{- end }} 70 | app.kubernetes.io/managed-by: {{ .Release.Service }} 71 | {{- end }} 72 | 73 | {{/* 74 | Selector labels 75 | */}} 76 | {{- define "4pd-vgpu.selectorLabels" -}} 77 | app.kubernetes.io/name: {{ include "4pd-vgpu.name" . }} 78 | app.kubernetes.io/instance: {{ .Release.Name }} 79 | {{- end }} 80 | 81 | {{/* 82 | Image registry secret name 83 | */}} 84 | {{- define "4pd-vgpu.imagePullSecrets" -}} 85 | imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 2 }} 86 | {{- end }} 87 | 88 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: 4pd-device-plugin 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.json: | 10 | { 11 | "nodeconfig": [ 12 | { 13 | "name": "m5-cloudinfra-online02", 14 | "devicememoryscaling": 1.8, 15 | "devicesplitcount": 10, 16 | "migstrategy":"none" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/daemonsethygon.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-hygon 5 | labels: 6 | app.kubernetes.io/component: 4pd-device-plugin-hygon 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | {{- with .Values.global.labels }} 9 | {{- toYaml . 
| nindent 4 }} 10 | {{- end }} 11 | {{- if .Values.global.annotations }} 12 | annotations: {{ toYaml .Values.global.annotations | nindent 4}} 13 | {{- end }} 14 | spec: 15 | selector: 16 | matchLabels: 17 | app.kubernetes.io/component: 4pd-device-plugin-hygon 18 | {{- include "4pd-vgpu.selectorLabels" . | nindent 6 }} 19 | template: 20 | metadata: 21 | labels: 22 | app.kubernetes.io/component: 4pd-device-plugin-hygon 23 | 4pd.io/webhook: ignore 24 | {{- include "4pd-vgpu.selectorLabels" . | nindent 8 }} 25 | {{- if .Values.devicePlugin.podAnnotations }} 26 | annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} 27 | {{- end }} 28 | spec: 29 | {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} 30 | serviceAccountName: {{ include "4pd-vgpu.device-plugin" . }} 31 | priorityClassName: system-node-critical 32 | hostPID: true 33 | hostNetwork: true 34 | containers: 35 | - name: dcu-device-plugin-ctr 36 | image: {{ .Values.devicePlugin.hygonimage }} 37 | imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} 38 | command: ["/hygon","-logtostderr=true","-stderrthreshold=INFO","-v=5"] 39 | env: 40 | - name: NodeName 41 | valueFrom: 42 | fieldRef: 43 | fieldPath: spec.nodeName 44 | - name: HOOK_PATH 45 | value: {{ .Values.devicePlugin.libPath }} 46 | - name: HYGONPATH 47 | value: {{ .Values.devicePlugin.hygondriver }} 48 | securityContext: 49 | privileged: true 50 | allowPrivilegeEscalation: true 51 | capabilities: 52 | drop: ["ALL"] 53 | add: ["SYS_ADMIN"] 54 | volumeMounts: 55 | - name: device-plugin 56 | mountPath: /var/lib/kubelet/device-plugins 57 | - name: deviceconfig 58 | mountPath: /config 59 | - name: sysinfo 60 | mountPath: /sys 61 | - name: lib 62 | mountPath: /usr/local/vgpu 63 | - name: hwpath 64 | mountPath: /usr/share/hwdata 65 | - name: hygonloc 66 | mountPath: /opt/hygondriver/ 67 | volumes: 68 | - name: device-plugin 69 | hostPath: 70 | path: {{ .Values.devicePlugin.pluginPath }} 71 | - name: sysinfo 72 | hostPath: 73 
| path: /sys 74 | - name: deviceconfig 75 | configMap: 76 | name: {{ template "4pd-vgpu.device-plugin" . }} 77 | - name: lib 78 | hostPath: 79 | path: {{ .Values.devicePlugin.libPath }} 80 | - name: hwpath 81 | hostPath: 82 | path: /usr/share/hwdata 83 | - name: hygonloc 84 | hostPath: 85 | path: {{ .Values.devicePlugin.hygondriver }} 86 | {{- if .Values.devicePlugin.hygonnodeSelector }} 87 | nodeSelector: {{ toYaml .Values.devicePlugin.hygonnodeSelector | nindent 8 }} 88 | {{- end }} 89 | {{- if .Values.devicePlugin.tolerations }} 90 | tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }} 91 | {{- end }} 92 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-monitor 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | verbs: 11 | - get 12 | - create 13 | - watch 14 | - list 15 | - update 16 | - patch 17 | - apiGroups: 18 | - "" 19 | resources: 20 | - nodes 21 | verbs: 22 | - get 23 | - update 24 | - list 25 | - patch 26 | 27 | 28 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: "4pd-device-plugin" 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | #name: cluster-admin 12 | name: {{ include "4pd-vgpu.device-plugin" . 
}}-monitor 13 | subjects: 14 | - kind: ServiceAccount 15 | name: {{ include "4pd-vgpu.device-plugin" . }} 16 | namespace: {{ .Release.Namespace | quote }} 17 | -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }}-monitor 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | {{- if .Values.scheduler.service.labels }} 9 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 10 | {{- end }} 11 | {{- if .Values.scheduler.service.annotations }} 12 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 13 | {{- end }} 14 | spec: 15 | externalTrafficPolicy: Local 16 | selector: 17 | app.kubernetes.io/component: 4pd-device-plugin 18 | type: NodePort 19 | ports: 20 | - name: monitorport 21 | port: {{ .Values.devicePlugin.service.httpPort }} 22 | targetPort: 9394 23 | nodePort: {{ .Values.devicePlugin.service.httpPort }} -------------------------------------------------------------------------------- /charts/vgpu/templates/device-plugin/monitorserviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.device-plugin" . }} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | app.kubernetes.io/component: "4pd-device-plugin" 8 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . 
}} 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.json: | 10 | { 11 | "kind": "Policy", 12 | "apiVersion": "v1", 13 | "extenders": [ 14 | { 15 | "urlPrefix": "https://127.0.0.1:443", 16 | "filterVerb": "filter", 17 | "bindVerb": "bind", 18 | "enableHttps": true, 19 | "weight": 1, 20 | "nodeCacheCapable": true, 21 | "httpTimeout": 30000000000, 22 | "tlsConfig": { 23 | "insecure": true 24 | }, 25 | "managedResources": [ 26 | { 27 | "name": "{{ .Values.resourceName }}", 28 | "ignoredByScheduler": true 29 | }, 30 | { 31 | "name": "{{ .Values.resourceMem }}", 32 | "ignoredByScheduler": true 33 | }, 34 | { 35 | "name": "{{ .Values.resourceCores }}", 36 | "ignoredByScheduler": true 37 | }, 38 | { 39 | "name": "{{ .Values.resourceMemPercentage }}", 40 | "ignoredByScheduler": true 41 | }, 42 | { 43 | "name": "{{ .Values.resourcePriority }}", 44 | "ignoredByScheduler": true 45 | }, 46 | { 47 | "name": "{{ .Values.mluResourceName }}", 48 | "ignoredByScheduler": true 49 | }, 50 | { 51 | "name": "{{ .Values.mluResourceMem }}", 52 | "ignoredByScheduler": true 53 | }, 54 | { 55 | "name": "{{ .Values.dcuResourceName }}", 56 | "ignoredByScheduler": true 57 | }, 58 | { 59 | "name": "{{ .Values.dcuResourceMem }}", 60 | "ignoredByScheduler": true 61 | }, 62 | { 63 | "name": "{{ .Values.dcuResourceCores }}", 64 | "ignoredByScheduler": true 65 | }, 66 | { 67 | "name": "{{ .Values.iluvatarResourceName }}", 68 | "ignoredByScheduler": true 69 | } 70 | ], 71 | "ignoreable": false 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/configmapnew.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . 
}}-newversion 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | data: 9 | config.yaml: | 10 | apiVersion: kubescheduler.config.k8s.io/v1beta2 11 | kind: KubeSchedulerConfiguration 12 | leaderElection: 13 | leaderElect: false 14 | profiles: 15 | - schedulerName: {{ .Values.schedulerName }} 16 | extenders: 17 | - urlPrefix: "https://127.0.0.1:443" 18 | filterVerb: filter 19 | bindVerb: bind 20 | nodeCacheCapable: true 21 | weight: 1 22 | httpTimeout: 30s 23 | enableHTTPS: true 24 | tlsConfig: 25 | insecure: true 26 | managedResources: 27 | - name: {{ .Values.resourceName }} 28 | ignoredByScheduler: true 29 | - name: {{ .Values.resourceMem }} 30 | ignoredByScheduler: true 31 | - name: {{ .Values.resourceCores }} 32 | ignoredByScheduler: true 33 | - name: {{ .Values.resourceMemPercentage }} 34 | ignoredByScheduler: true 35 | - name: {{ .Values.resourcePriority }} 36 | ignoredByScheduler: true 37 | - name: {{ .Values.mluResourceName }} 38 | ignoredByScheduler: true 39 | - name: {{ .Values.mluResourceMem }} 40 | ignoredByScheduler: true 41 | - name: {{ .Values.dcuResourceName }} 42 | ignoredByScheduler: true 43 | - name: {{ .Values.dcuResourceMem }} 44 | ignoredByScheduler: true 45 | - name: {{ .Values.dcuResourceCores }} 46 | ignoredByScheduler: true 47 | - name: {{ .Values.iluvatarResourceName }} 48 | ignoredByScheduler: true -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | rules: 12 | - apiGroups: 13 | - admissionregistration.k8s.io 14 | resources: 15 | #- validatingwebhookconfigurations 16 | - mutatingwebhookconfigurations 17 | verbs: 18 | - get 19 | - update 20 | {{- if .Values.podSecurityPolicy.enabled }} 21 | - apiGroups: ['extensions'] 22 | resources: ['podsecuritypolicies'] 23 | verbs: ['use'] 24 | resourceNames: 25 | - {{ include "4pd-vgpu.fullname" . }}-admission 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: {{ include "4pd-vgpu.fullname" . }}-admission 15 | subjects: 16 | - kind: ServiceAccount 17 | name: {{ include "4pd-vgpu.fullname" . }}-admission 18 | namespace: {{ .Release.Namespace | quote }} 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/job-createSecret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission-create 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | spec: 12 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 13 | # Alpha feature since k8s 1.12 14 | ttlSecondsAfterFinished: 0 15 | {{- end }} 16 | template: 17 | metadata: 18 | name: {{ include "4pd-vgpu.fullname" . }}-admission-create 19 | {{- if .Values.scheduler.patch.podAnnotations }} 20 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 21 | {{- end }} 22 | labels: 23 | {{- include "4pd-vgpu.labels" . | nindent 8 }} 24 | app.kubernetes.io/component: admission-webhook 25 | 4pd.io/webhook: ignore 26 | spec: 27 | {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} 28 | {{- if .Values.scheduler.patch.priorityClassName }} 29 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 30 | {{- end }} 31 | containers: 32 | - name: create 33 | {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} 34 | image: {{ .Values.scheduler.patch.imageNew }} 35 | {{- else }} 36 | image: {{ .Values.scheduler.patch.image }} 37 | {{- end }} 38 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 39 | args: 40 | - create 41 | - --cert-name=tls.crt 42 | - --key-name=tls.key 43 | {{- if .Values.scheduler.customWebhook.enabled }} 44 | - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "4pd-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.customWebhook.host}} 45 | {{- else }} 46 | - --host={{ printf "%s.%s.svc,127.0.0.1" (include "4pd-vgpu.scheduler" .) .Release.Namespace }} 47 | {{- end }} 48 | - --namespace={{ .Release.Namespace }} 49 | - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} 50 | restartPolicy: OnFailure 51 | serviceAccountName: {{ include "4pd-vgpu.fullname" . 
}}-admission 52 | {{- if .Values.scheduler.patch.nodeSelector }} 53 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 54 | {{- end }} 55 | {{- if .Values.scheduler.patch.tolerations }} 56 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 57 | {{- end }} 58 | securityContext: 59 | runAsNonRoot: true 60 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 61 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/job-patchWebhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission-patch 5 | annotations: 6 | "helm.sh/hook": post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | spec: 12 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 13 | # Alpha feature since k8s 1.12 14 | ttlSecondsAfterFinished: 0 15 | {{- end }} 16 | template: 17 | metadata: 18 | name: {{ include "4pd-vgpu.fullname" . }}-admission-patch 19 | {{- if .Values.scheduler.patch.podAnnotations }} 20 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 21 | {{- end }} 22 | labels: 23 | {{- include "4pd-vgpu.labels" . | nindent 8 }} 24 | app.kubernetes.io/component: admission-webhook 25 | 4pd.io/webhook: ignore 26 | spec: 27 | {{- include "4pd-vgpu.imagePullSecrets" . 
| nindent 6}} 28 | {{- if .Values.scheduler.patch.priorityClassName }} 29 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 30 | {{- end }} 31 | containers: 32 | - name: patch 33 | {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} 34 | image: {{ .Values.scheduler.patch.imageNew }} 35 | {{- else }} 36 | image: {{ .Values.scheduler.patch.image }} 37 | {{- end }} 38 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 39 | args: 40 | - patch 41 | - --webhook-name={{ include "4pd-vgpu.scheduler.webhook" . }} 42 | - --namespace={{ .Release.Namespace }} 43 | - --patch-validating=false 44 | - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} 45 | - --patch-failure-policy=Fail 46 | restartPolicy: OnFailure 47 | serviceAccountName: {{ include "4pd-vgpu.fullname" . }}-admission 48 | {{- if .Values.scheduler.patch.nodeSelector }} 49 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 50 | {{- end }} 51 | {{- if .Values.scheduler.patch.tolerations }} 52 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 53 | {{- end }} 54 | securityContext: 55 | runAsNonRoot: true 56 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 57 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.podSecurityPolicy.enabled }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | name: {{ include "4pd-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | spec: 13 | allowPrivilegeEscalation: false 14 | fsGroup: 15 | ranges: 16 | - max: 65535 17 | min: 1 18 | rule: MustRunAs 19 | requiredDropCapabilities: 20 | - ALL 21 | runAsUser: 22 | rule: MustRunAsNonRoot 23 | seLinux: 24 | rule: RunAsAny 25 | supplementalGroups: 26 | ranges: 27 | - max: 65535 28 | min: 1 29 | rule: MustRunAs 30 | volumes: 31 | - configMap 32 | - emptyDir 33 | - projected 34 | - secret 35 | - downwardAPI 36 | {{- end }} 37 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | rules: 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - secrets 16 | verbs: 17 | - get 18 | - create 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: Role 14 | name: {{ include "4pd-vgpu.fullname" . 
}}-admission 15 | subjects: 16 | - kind: ServiceAccount 17 | name: {{ include "4pd-vgpu.fullname" . }}-admission 18 | namespace: {{ .Release.Namespace | quote }} 19 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/job-patch/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.fullname" . }}-admission 5 | annotations: 6 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 7 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 8 | labels: 9 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 10 | app.kubernetes.io/component: admission-webhook 11 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: "4pd-scheduler" 7 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: cluster-admin 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "4pd-vgpu.scheduler" . }} 15 | namespace: {{ .Release.Namespace | quote }} 16 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: 4pd-scheduler 7 | {{- include "4pd-vgpu.labels" . 
| nindent 4 }} 8 | {{- if .Values.scheduler.service.labels }} 9 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 10 | {{- end }} 11 | {{- if .Values.scheduler.service.annotations }} 12 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 13 | {{- end }} 14 | spec: 15 | type: NodePort 16 | ports: 17 | - name: http 18 | port: {{ .Values.scheduler.service.httpPort }} 19 | targetPort: 443 20 | nodePort: {{ .Values.scheduler.service.schedulerPort }} 21 | protocol: TCP 22 | - name: monitor 23 | port: {{ .Values.scheduler.service.monitorPort }} 24 | targetPort: 9395 25 | nodePort: {{ .Values.scheduler.service.monitorPort }} 26 | protocol: TCP 27 | selector: 28 | app.kubernetes.io/component: 4pd-scheduler 29 | {{- include "4pd-vgpu.selectorLabels" . | nindent 4 }} 30 | 31 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler" . }} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | app.kubernetes.io/component: "4pd-scheduler" 8 | {{- include "4pd-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/vgpu/templates/scheduler/webhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | name: {{ include "4pd-vgpu.scheduler.webhook" . 
}} 5 | webhooks: 6 | - admissionReviewVersions: 7 | - v1beta1 8 | clientConfig: 9 | {{- if .Values.scheduler.customWebhook.enabled }} 10 | url: https://{{ .Values.scheduler.customWebhook.host}}:{{.Values.scheduler.customWebhook.port}}{{.Values.scheduler.customWebhook.path}} 11 | {{- else }} 12 | service: 13 | name: {{ include "4pd-vgpu.scheduler" . }} 14 | namespace: {{ .Release.Namespace }} 15 | path: /webhook 16 | port: {{ .Values.scheduler.service.httpPort }} 17 | {{- end }} 18 | failurePolicy: {{ .Values.scheduler.mutatingWebhookConfiguration.failurePolicy }} 19 | matchPolicy: Equivalent 20 | name: vgpu.4pd.io 21 | namespaceSelector: 22 | matchExpressions: 23 | - key: 4pd.io/webhook 24 | operator: NotIn 25 | values: 26 | - ignore 27 | objectSelector: 28 | matchExpressions: 29 | - key: 4pd.io/webhook 30 | operator: NotIn 31 | values: 32 | - ignore 33 | reinvocationPolicy: Never 34 | rules: 35 | - apiGroups: 36 | - "" 37 | apiVersions: 38 | - v1 39 | operations: 40 | - CREATE 41 | resources: 42 | - pods 43 | scope: '*' 44 | sideEffects: None 45 | timeoutSeconds: 10 46 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/plugin-manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 26 | spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" 27 | ) 28 | 29 | // NewPluginManager creates an NVML-based plugin manager 30 | func NewPluginManager(config *util.DeviceConfig) (manager.Interface, error) { 31 | var err error 32 | switch *config.Flags.MigStrategy { 33 | case spec.MigStrategyNone: 34 | case spec.MigStrategySingle: 35 | case spec.MigStrategyMixed: 36 | default: 37 | return nil, fmt.Errorf("unknown strategy: %v", *config.Flags.MigStrategy) 38 | } 39 | 40 | nvmllib := nvml.New() 41 | 42 | deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) 43 | if err != nil { 44 | return nil, fmt.Errorf("invalid device list strategy: %v", err) 45 | } 46 | 47 | cdiEnabled := deviceListStrategies.IsCDIEnabled() 48 | 49 | cdiHandler, err := cdi.New( 50 | cdi.WithEnabled(cdiEnabled), 51 | cdi.WithDriverRoot(*config.Flags.Plugin.ContainerDriverRoot), 52 | cdi.WithTargetDriverRoot(*config.Flags.NvidiaDriverRoot), 53 | cdi.WithNvidiaCTKPath(*config.Flags.Plugin.NvidiaCTKPath), 54 | cdi.WithNvml(nvmllib), 55 | cdi.WithDeviceIDStrategy(*config.Flags.Plugin.DeviceIDStrategy), 56 | cdi.WithVendor("k8s.device-plugin.nvidia.com"), 57 | cdi.WithGdsEnabled(*config.Flags.GDSEnabled), 58 | cdi.WithMofedEnabled(*config.Flags.MOFEDEnabled), 59 | ) 60 | if err != nil { 61 | return nil, fmt.Errorf("unable to create cdi handler: %v", err) 62 | } 63 | 64 | m, err := manager.New( 65 | manager.WithNVML(nvmllib), 66 | manager.WithCDIEnabled(cdiEnabled), 67 | manager.WithCDIHandler(cdiHandler), 68 | manager.WithConfig(config), 69 | manager.WithFailOnInitError(*config.Flags.FailOnInitError), 70 | manager.WithMigStrategy(*config.Flags.MigStrategy), 71 | 
) 72 | if err != nil { 73 | return nil, fmt.Errorf("unable to create plugin manager: %v", err) 74 | } 75 | 76 | if err := m.CreateCDISpecFile(); err != nil { 77 | return nil, fmt.Errorf("unable to create cdi spec file: %v", err) 78 | } 79 | 80 | return m, nil 81 | } 82 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/watchers.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | 23 | "github.com/fsnotify/fsnotify" 24 | ) 25 | 26 | func newFSWatcher(files ...string) (*fsnotify.Watcher, error) { 27 | watcher, err := fsnotify.NewWatcher() 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | for _, f := range files { 33 | err = watcher.Add(f) 34 | if err != nil { 35 | watcher.Close() 36 | return nil, err 37 | } 38 | } 39 | 40 | return watcher, nil 41 | } 42 | 43 | func newOSWatcher(sigs ...os.Signal) chan os.Signal { 44 | sigChan := make(chan os.Signal, 1) 45 | signal.Notify(sigChan, sigs...) 
46 | 47 | return sigChan 48 | } 49 | -------------------------------------------------------------------------------- /cmd/scheduler/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package main 17 | 18 | import ( 19 | "net/http" 20 | 21 | "4pd.io/k8s-vgpu/pkg/device" 22 | "4pd.io/k8s-vgpu/pkg/version" 23 | 24 | "4pd.io/k8s-vgpu/pkg/scheduler" 25 | "4pd.io/k8s-vgpu/pkg/scheduler/config" 26 | "4pd.io/k8s-vgpu/pkg/scheduler/routes" 27 | "4pd.io/k8s-vgpu/pkg/util" 28 | "github.com/julienschmidt/httprouter" 29 | "github.com/spf13/cobra" 30 | klog "k8s.io/klog/v2" 31 | ) 32 | 33 | //var version string 34 | 35 | var ( 36 | sher *scheduler.Scheduler 37 | tlsKeyFile string 38 | tlsCertFile string 39 | rootCmd = &cobra.Command{ 40 | Use: "scheduler", 41 | Short: "kubernetes vgpu scheduler", 42 | Run: func(cmd *cobra.Command, args []string) { 43 | start() 44 | }, 45 | } 46 | ) 47 | 48 | func init() { 49 | rootCmd.Flags().SortFlags = false 50 | rootCmd.PersistentFlags().SortFlags = false 51 | 52 | rootCmd.Flags().StringVar(&config.HttpBind, "http_bind", "127.0.0.1:8080", "http server bind address") 53 | rootCmd.Flags().StringVar(&tlsCertFile, "cert_file", "", "tls cert file") 54 | rootCmd.Flags().StringVar(&tlsKeyFile, "key_file", "", "tls key file") 55 | 
rootCmd.Flags().StringVar(&config.SchedulerName, "scheduler-name", "", "the name to be added to pod.spec.schedulerName if not empty") 56 | rootCmd.Flags().Int32Var(&config.DefaultMem, "default-mem", 0, "default gpu device memory to allocate") 57 | rootCmd.Flags().Int32Var(&config.DefaultCores, "default-cores", 0, "default gpu core percentage to allocate") 58 | rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)") 59 | rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet()) 60 | rootCmd.AddCommand(version.VersionCmd) 61 | rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags()) 62 | } 63 | 64 | func start() { 65 | sher = scheduler.NewScheduler() 66 | sher.Start() 67 | defer sher.Stop() 68 | 69 | // start monitor metrics 70 | go sher.RegisterFromNodeAnnotatons() 71 | go initmetrics(config.MetricsBindAddress) 72 | 73 | // start http server 74 | router := httprouter.New() 75 | router.POST("/filter", routes.PredicateRoute(sher)) 76 | router.POST("/bind", routes.Bind(sher)) 77 | router.POST("/webhook", routes.WebHookRoute()) 78 | klog.Info("listen on ", config.HttpBind) 79 | if len(tlsCertFile) == 0 || len(tlsKeyFile) == 0 { 80 | if err := http.ListenAndServe(config.HttpBind, router); err != nil { 81 | klog.Fatal("Listen and Serve error, ", err) 82 | } 83 | } else { 84 | if err := http.ListenAndServeTLS(config.HttpBind, tlsCertFile, tlsKeyFile, router); err != nil { 85 | klog.Fatal("Listen and Serve error, ", err) 86 | } 87 | } 88 | } 89 | 90 | func main() { 91 | if err := rootCmd.Execute(); err != nil { 92 | klog.Fatal(err) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/build.sh: -------------------------------------------------------------------------------- 1 | protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. 
--go-grpc_opt=paths=source_relative noderpc/noderpc.proto 2 | go build 3 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "k8s.io/klog" 5 | ) 6 | 7 | //var addr = flag.String("listen-address", ":9394", "The address to listen on for HTTP requests.") 8 | 9 | //const shared_directory = "/usr/local/vgpu/shared" 10 | 11 | func main() { 12 | 13 | if err := ValidateEnvVars(); err != nil { 14 | klog.Fatalf("Failed to validate environment variables: %v", err) 15 | } 16 | cgroupDriver = 0 17 | errchannel := make(chan error) 18 | go serveInfo(errchannel) 19 | go initmetrics() 20 | go watchAndFeedback() 21 | for { 22 | err := <-errchannel 23 | klog.Errorf("failed to serve: %v", err) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/noderpc/noderpc.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | syntax = "proto3"; 16 | 17 | option go_package = "gitlab.4pd.io/vGPUmonitor"; 18 | option java_multiple_files = true; 19 | option java_package = "io.grpc.examples.helloworld"; 20 | option java_outer_classname = "HelloWorldProto"; 21 | 22 | package pluginrpc; 23 | 24 | // The greeting service definition. 25 | service NodeVGPUInfo { 26 | // Sends a greeting 27 | rpc GetNodeVGPU (GetNodeVGPURequest) returns (GetNodeVGPUReply) {} 28 | } 29 | 30 | // The sharedProcs contains the sharedRegion 31 | message shrregProcSlotT { 32 | int32 pid = 1; 33 | repeated uint64 used = 2; 34 | int32 status = 3; 35 | } 36 | 37 | // The sharedRegionT struct is the main struct for monitoring vgpu 38 | message sharedRegionT { 39 | int32 initializedFlag = 1; 40 | uint32 ownerPid = 2; 41 | uint32 sem = 3; 42 | repeated uint64 limit = 4; 43 | repeated uint64 sm_limit = 5; 44 | repeated shrregProcSlotT procs = 6; 45 | } 46 | 47 | message podusage { 48 | string poduuid = 1; 49 | sharedRegionT podvgpuinfo = 2; 50 | } 51 | 52 | // The request message containing the user's name. 
53 | message GetNodeVGPURequest { 54 | string ctruuid = 1; 55 | } 56 | 57 | // The response message containing the greetings 58 | message GetNodeVGPUReply { 59 | string nodeid = 1; 60 | repeated podusage nodevgpuinfo = 2; 61 | } 62 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/pathmonitor_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | func TestIsVaildPod(t *testing.T) { 11 | pods := &v1.PodList{ 12 | Items: []v1.Pod{ 13 | { 14 | ObjectMeta: metav1.ObjectMeta{ 15 | UID: "123", 16 | }, 17 | }, 18 | { 19 | ObjectMeta: metav1.ObjectMeta{ 20 | UID: "456", 21 | }, 22 | }, 23 | }, 24 | } 25 | 26 | cases := []struct { 27 | name string 28 | expected bool 29 | }{ 30 | { 31 | name: "123", 32 | expected: true, 33 | }, 34 | { 35 | name: "789", 36 | expected: false, 37 | }, 38 | } 39 | 40 | for _, c := range cases { 41 | if got := isVaildPod(c.name, pods); got != c.expected { 42 | t.Errorf("isVaildPod(%q) == %v, want %v", c.name, got, c.expected) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/testcollector/testcollector: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/cmd/vGPUmonitor/testcollector/testcollector -------------------------------------------------------------------------------- /cmd/vGPUmonitor/validation.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | ) 7 | 8 | var requiredEnvVars = map[string]bool{ 9 | "HOOK_PATH": true, 10 | "OTHER_ENV_VAR": false, 11 | } 12 | 13 | func ValidateEnvVars() error { 14 | for envVar, required := 
range requiredEnvVars { 15 | _, exists := os.LookupEnv(envVar) 16 | if required && !exists { 17 | return fmt.Errorf("required environment variable %s not set", envVar) 18 | } 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG NVIDIA_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | 5 | FROM $GOLANG_IMAGE AS GOBUILD 6 | ADD . /k8s-vgpu 7 | ARG GOPROXY=https://goproxy.cn,direct 8 | RUN apt-get update && apt-get -y install libhwloc-dev libdrm-dev 9 | RUN cd /k8s-vgpu && make all 10 | 11 | FROM nvidia/cuda:12.2.0-base-ubuntu22.04 12 | ENV NVIDIA_DISABLE_REQUIRE="true" 13 | ENV NVIDIA_VISIBLE_DEVICES=all 14 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 15 | 16 | ARG VERSION 17 | LABEL version="$VERSION" 18 | LABEL maintainer="opensource@4paradigm.com" 19 | COPY ./LICENSE /k8s-vgpu/LICENSE 20 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 21 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 22 | COPY ./lib /k8s-vgpu/lib 23 | COPY ./lib/mlu/cntopo /usr/bin/ 24 | COPY ./lib/mlu/libcndev.so /usr/lib/ 25 | 26 | ENV PATH="/k8s-vgpu/bin:${PATH}" 27 | ARG DEST_DIR 28 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 29 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # if [ $1 == "device-plugin" ]; then 19 | # cp -f /k8s-vgpu/lib/* $DEST_DIR/vgpu 20 | # fi 21 | exec "$@" -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks 2 | 3 | Three instances from ai-benchmark have been used to evaluate vGPU-device-plugin performance as follows 4 | 5 | | Test Environment | description | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | Test instance | description | 13 | | ------------- | :---------------------------------------------------------: | 14 | | nvidia-device-plugin | k8s + nvidia k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,without virtual device memory | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,with virtual device memory | 17 | 18 | Test Cases: 19 | 20 | | test id | case | type | params | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | 
training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | Test Result: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | To reproduce: 38 | 39 | 1. install k8s-vGPU-scheduler, and configure it properly 40 | 2. run benchmark job 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3. View the result by using kubectl logs 47 | 48 | ``` 49 | $ kubectl logs [pod id] -------------------------------------------------------------------------------- /docs/benchmark_cn.md: -------------------------------------------------------------------------------- 1 | ## 性能测试 2 | 3 | 在测试报告中,我们一共在下面五种场景都执行了ai-benchmark 测试脚本,并汇总最终结果: 4 | 5 | | 测试环境 | 环境描述 | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | 测试名称 | 测试用例 | 13 | | -------- | :------------------------------------------------: | 14 | | Nvidia-device-plugin | k8s + nvidia官方k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,无虚拟显存 | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,高负载,开启虚拟显存 | 17 | 18 | 测试内容 19 | 20 | | test id | 名称 | 类型 | 参数 | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | 
batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | 测试结果: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | 测试步骤: 38 | 39 | 1. 安装nvidia-device-plugin,并配置相应的参数 40 | 2. 运行benchmark任务 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3. 通过kubectl logs 查看结果 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/cambricon-mlu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support cambricon.com/mlu by implementing most device-sharing features as for NVIDIA GPUs**, including: 4 | 5 | ***MLU sharing***: Each task can allocate a portion of MLU instead of a whole MLU card, thus MLU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: MLUs can be allocated with a certain device memory size on certain types (i.e. 370), and it is guaranteed that usage does not exceed that boundary. 8 | 9 | ***MLU Type Specification***: You can specify which type of MLU to use or to avoid for a certain task, by setting "cambricon.com/use-mlutype" or "cambricon.com/nouse-mlutype" annotations. 10 | 11 | ***Very Easy to use***: You don't need to modify your task yaml to use our scheduler. All your MLU jobs will be automatically supported after installation. The only thing you need to do is tag the MLU node. 
12 | 13 | ## Prerequisites 14 | 15 | * neuware-mlu370-driver > 4.15.10 16 | * cntoolkit > 2.5.3 17 | 18 | ## Enabling MLU-sharing Support 19 | 20 | * Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/4paradigm/k8s-vgpu-scheduler#enabling-vgpu-support-in-kubernetes) 21 | 22 | * Tag MLU node with the following command 23 | ``` 24 | kubectl label node {mlu-node} mlu=on 25 | ``` 26 | 27 | ## Running MLU jobs 28 | 29 | Cambricon MMLUs can now be requested by a container 30 | using the `cambricon.com/mlunum` and `cambricon.com/mlumem` resource type: 31 | 32 | ``` 33 | apiVersion: v1 34 | kind: Pod 35 | metadata: 36 | name: gpu-pod 37 | spec: 38 | containers: 39 | - name: ubuntu-container 40 | image: ubuntu:18.04 41 | command: ["bash", "-c", "sleep 86400"] 42 | resources: 43 | limits: 44 | cambricon.com/mlunum: 1 # requesting 1 MLU 45 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 46 | - name: ubuntu-container1 47 | image: ubuntu:18.04 48 | command: ["bash", "-c", "sleep 86400"] 49 | resources: 50 | limits: 51 | cambricon.com/mlunum: 1 # requesting 1 MLU 52 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 53 | ``` 54 | 55 | ## Notes 56 | 57 | 1. Mlu-sharing in init container is not supported, pods with "combricon.com/mlumem" in init container will never be scheduled. 58 | 59 | 2. Mlu-sharing with containerd is not supported, the container may not start successfully. 60 | 61 | 3. 
Mlu-sharing can only be applied on MLU-370 62 | -------------------------------------------------------------------------------- /docs/cambricon-mlu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值,注意只有MLU-370型号的MLU支持可配显存 8 | 9 | ***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU 10 | 11 | ***方便易用***: 部署本组件后,你只需要给MLU节点打上tag即可使用MLU复用功能 12 | 13 | 14 | ## 节点需求 15 | 16 | * neuware-mlu370-driver > 4.15.10 17 | * cntoolkit > 2.5.3 18 | 19 | ## 开启MLU复用 20 | 21 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md#kubernetes开启vgpu支持) 22 | 23 | * 使用以下指令,为MLU节点打上label 24 | ``` 25 | kubectl label node {mlu-node} mlu=on 26 | ``` 27 | 28 | ## 运行MLU任务 29 | 30 | ``` 31 | apiVersion: v1 32 | kind: Pod 33 | metadata: 34 | name: gpu-pod 35 | spec: 36 | containers: 37 | - name: ubuntu-container 38 | image: ubuntu:18.04 39 | command: ["bash", "-c", "sleep 86400"] 40 | resources: 41 | limits: 42 | cambricon.com/mlunum: 1 # requesting 1 MLU 43 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 44 | - name: ubuntu-container1 45 | image: ubuntu:18.04 46 | command: ["bash", "-c", "sleep 86400"] 47 | resources: 48 | limits: 49 | cambricon.com/mlunum: 1 # requesting 1 MLU 50 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory 51 | ``` 52 | 53 | ## 注意事项 54 | 55 | 1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 56 | 57 | 2. MLU复用功能目前不支持containerd,在containerd中使用会导致任务失败 58 | 59 | 3. 
只有MLU-370可以使用MLU复用功能 60 | -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | # Global Config 2 | 3 | you can customize your vGPU support by setting the following parameters using `-set`, for example 4 | 5 | ``` 6 | helm install vgpu-charts/vgpu vgpu --set devicePlugin.deviceMemoryScaling=5 ... 7 | ``` 8 | 9 | * `devicePlugin.service.schedulerPort:` 10 | Integer type, by default: 31998, scheduler webhook service nodePort. 11 | * `devicePlugin.deviceMemoryScaling:` 12 | Float type, by default: 1. The ratio for NVIDIA device memory scaling, can be greater than 1 (enable virtual device memory, experimental feature). For NVIDIA GPU with *M* memory, if we set `devicePlugin.deviceMemoryScaling` argument to *S*, vGPUs splitted by this GPU will totally get `S * M` memory in Kubernetes with our device plugin. 13 | * `devicePlugin.deviceSplitCount:` 14 | Integer type, by default: equals 10. Maximum tasks assigned to a simple GPU device. 15 | * `devicePlugin.migstrategy:` 16 | String type, "none" for ignoring MIG features or "mixed" for allocating MIG device by seperate resources. Default "none" 17 | * `devicePlugin.disablecorelimit:` 18 | String type, "true" for disable core limit, "false" for enable core limit, default: false 19 | * `scheduler.defaultMem:` 20 | Integer type, by default: 5000. The default device memory of the current task, in MB 21 | * `scheduler.defaultCores:` 22 | Integer type, by default: equals 0. Percentage of GPU cores reserved for the current task. If assigned to 0, it may fit in any GPU with enough device memory. If assigned to 100, it will use an entire GPU card exclusively. 
23 | * `resourceName:` 24 | String type, vgpu number resource name, default: "nvidia.com/gpu" 25 | * `resourceMem:` 26 | String type, vgpu memory size resource name, default: "nvidia.com/gpumem" 27 | * `resourceMemPercentage:` 28 | String type, vgpu memory fraction resource name, default: "nvidia.com/gpumem-percentage" 29 | * `resourceCores:` 30 | String type, vgpu cores resource name, default: "nvidia.com/cores" 31 | * `resourcePriority:` 32 | String type, vgpu task priority name, default: "nvidia.com/priority" 33 | 34 | # Container config envs 35 | 36 | * `GPU_CORE_UTILIZATION_POLICY:` 37 | String type, "default", "force", "disable" 38 | "default" means the dafault utilization policy 39 | "force" means the container will always limit the core utilization below "nvidia.com/gpucores" 40 | "disable" means the container will ignore the utilization limitation set by "nvidia.com/gpucores" during task execution 41 | 42 | * `ACTIVE_OOM_KILLER:` 43 | String type, "true","false" 44 | "true" means the task may be killed if exceeds the limitation set by "nvidia.com/gpumem" or "nvidia.com/gpumemory" 45 | "false" means the task will not be killed even it exceeds the limitation. 46 | 47 | 48 | -------------------------------------------------------------------------------- /docs/config_cn.md: -------------------------------------------------------------------------------- 1 | # 全局配置 2 | 3 | 你可以在安装过程中,通过`-set`来修改以下的客制化参数,例如: 4 | 5 | ``` 6 | helm install vgpu vgpu-charts/vgpu --set devicePlugin.deviceMemoryScaling=5 ... 
7 | ``` 8 | 9 | * `devicePlugin.deviceSplitCount:` 10 | 整数类型,预设值是10。GPU的分割数,每一张GPU都不能分配超过其配置数目的任务。若其配置为N的话,每个GPU上最多可以同时存在N个任务。 11 | * `devicePlugin.deviceMemoryScaling:` 12 | 浮点数类型,预设值是1。NVIDIA装置显存使用比例,可以大于1(启用虚拟显存,实验功能)。对于有*M*显存大小的NVIDIA GPU,如果我们配置`devicePlugin.deviceMemoryScaling`参数为*S*,在部署了我们装置插件的Kubenetes集群中,这张GPU分出的vGPU将总共包含 `S * M` 显存。 13 | * `devicePlugin.migStrategy:` 14 | 字符串类型,目前支持"none“与“mixed“两种工作方式,前者忽略MIG设备,后者使用专门的资源名称指定MIG设备,使用详情请参考mix_example.yaml,默认为"none" 15 | * `devicePlugin.disablecorelimit:` 16 | 字符串类型,"true"为关闭算力限制,"false"为启动算力限制,默认为"false" 17 | * `scheduler.defaultMem:` 18 | 整数类型,预设值为5000,表示不配置显存时使用的默认显存大小,单位为MB 19 | * `scheduler.defaultCores:` 20 | 整数类型(0-100),默认为0,表示默认为每个任务预留的百分比算力。若设置为0,则代表任务可能会被分配到任一满足显存需求的GPU中,若设置为100,代表该任务独享整张显卡 21 | * `resourceName:` 22 | 字符串类型, 申请vgpu个数的资源名, 默认: "nvidia.com/gpu" 23 | * `resourceMem:` 24 | 字符串类型, 申请vgpu显存大小资源名, 默认: "nvidia.com/gpumem" 25 | * `resourceMemPercentage:` 26 | 字符串类型,申请vgpu显存比例资源名,默认: "nvidia.com/gpumem-percentage" 27 | * `resourceCores:` 28 | 字符串类型, 申请vgpu算力资源名, 默认: "nvidia.com/cores" 29 | * `resourcePriority:` 30 | 字符串类型,表示申请任务的任务优先级,默认: "nvidia.com/priority" 31 | 32 | # 容器配置(在容器的环境变量中指定) 33 | 34 | * `GPU_CORE_UTILIZATION_POLICY:` 35 | 字符串类型,"default", "force", "disable" 36 | 代表容器算力限制策略, "default"为默认,"force"为强制限制算力,一般用于测试算力限制的功能,"disable"为忽略算力限制 37 | * `ACTIVE_OOM_KILLER:` 38 | 字符串类型,"true", "false" 39 | 代表容器是否会因为超用显存而被终止执行,"true"为会,"false"为不会 -------------------------------------------------------------------------------- /docs/dashboard.md: -------------------------------------------------------------------------------- 1 | ## Grafana Dashboard 2 | 3 | - You can load this dashboard json file [gpu-dashboard.json](./gpu-dashboard.json) 4 | 5 | - This dashboard also includes some NVIDIA DCGM metrics: 6 | 7 | [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) deploy:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` 8 | 9 | - use this 
prometheus custom metric configure: 10 | 11 | ```yaml 12 | - job_name: 'kubernetes-vgpu-exporter' 13 | kubernetes_sd_configs: 14 | - role: endpoints 15 | relabel_configs: 16 | - source_labels: [__meta_kubernetes_endpoints_name] 17 | regex: vgpu-device-plugin-monitor 18 | replacement: $1 19 | action: keep 20 | - source_labels: [__meta_kubernetes_pod_node_name] 21 | regex: (.*) 22 | target_label: node_name 23 | replacement: ${1} 24 | action: replace 25 | - source_labels: [__meta_kubernetes_pod_host_ip] 26 | regex: (.*) 27 | target_label: ip 28 | replacement: $1 29 | action: replace 30 | - job_name: 'kubernetes-dcgm-exporter' 31 | kubernetes_sd_configs: 32 | - role: endpoints 33 | relabel_configs: 34 | - source_labels: [__meta_kubernetes_endpoints_name] 35 | regex: dcgm-exporter 36 | replacement: $1 37 | action: keep 38 | - source_labels: [__meta_kubernetes_pod_node_name] 39 | regex: (.*) 40 | target_label: node_name 41 | replacement: ${1} 42 | action: replace 43 | - source_labels: [__meta_kubernetes_pod_host_ip] 44 | regex: (.*) 45 | target_label: ip 46 | replacement: $1 47 | action: replace 48 | ``` 49 | 50 | - reload promethues: 51 | 52 | ```bash 53 | curl -XPOST http://{promethuesServer}:{port}/-/reload 54 | ``` 55 | -------------------------------------------------------------------------------- /docs/dashboard_cn.md: -------------------------------------------------------------------------------- 1 | ## Grafana Dashboard 2 | 3 | - 你可以在 grafana 中导入此 [gpu-dashboard.json](./gpu-dashboard.json) 4 | - 此 dashboard 还包括一部分 NVIDIA DCGM 监控指标: 5 | 6 | [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter)部署:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` 7 | 8 | - 添加 prometheus 自定义的监控项: 9 | 10 | ```yaml 11 | - job_name: 'kubernetes-vgpu-exporter' 12 | kubernetes_sd_configs: 13 | - role: endpoints 14 | relabel_configs: 15 | - source_labels: [__meta_kubernetes_endpoints_name] 16 | regex: vgpu-device-plugin-monitor 17 | 
replacement: $1 18 | action: keep 19 | - source_labels: [__meta_kubernetes_pod_node_name] 20 | regex: (.*) 21 | target_label: node_name 22 | replacement: ${1} 23 | action: replace 24 | - source_labels: [__meta_kubernetes_pod_host_ip] 25 | regex: (.*) 26 | target_label: ip 27 | replacement: $1 28 | action: replace 29 | - job_name: 'kubernetes-dcgm-exporter' 30 | kubernetes_sd_configs: 31 | - role: endpoints 32 | relabel_configs: 33 | - source_labels: [__meta_kubernetes_endpoints_name] 34 | regex: dcgm-exporter 35 | replacement: $1 36 | action: keep 37 | - source_labels: [__meta_kubernetes_pod_node_name] 38 | regex: (.*) 39 | target_label: node_name 40 | replacement: ${1} 41 | action: replace 42 | - source_labels: [__meta_kubernetes_pod_host_ip] 43 | regex: (.*) 44 | target_label: ip 45 | replacement: $1 46 | action: replace 47 | ``` 48 | 49 | - 加载 promethues 配置: 50 | 51 | ```bash 52 | curl -XPOST http://{promethuesServer}:{port}/-/reload 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/develop/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | 4 | 5 | The architect of HAMi is shown in the figure above, It is organized in the form of "chart". 6 | 7 | - MutatingWebhook 8 | 9 | The MutatingWebhook checks the validity of each task, and set the "schedulerName" to "HAMi scheduler" if the resource requests have been recognized by HAMi 10 | If Not, the MutatingWebhook does nothing and pass this task to default-scheduler. 11 | 12 | - Scheduler 13 | 14 | HAMi support default kube-scheduler and volcano-scheduler, it implements an extender and register 'Filter' and 'Score' methods to deal with sharable devices. 15 | When a pod with sharable device request arrives, 'Filter' searches the cluster and returns a list of 'available' nodes. 'Score' scores each node 'Filter' returned, and pick the highest one to host the pod. 
It patches the schedule decision on corresponding pod annotations, for the detailed protocol, see protocol.md 16 | 17 | - DevicePlugin 18 | 19 | When the schedule decision is made, scheduler calls devicePlugin on that node to generate environment variables and mounts according to pod annotations. 20 | Please note that, the DP used here is a customized version, you need to install according to README document with that device. Most officaial DP will not fit in HAMi, and will result in unexpected behaviour 21 | 22 | - InContainer Control 23 | 24 | The implementation of in-container hard limit is different for diffent devices. For example, HAMi-Core is responsible for NVIDIA devices. libnvidia-control.so is responsible for iluvatar devices, etc. HAMi needs to pass the correct environment variables in order for it to operate. 25 | 26 | 27 | 28 | In summary, The flowchart of pod is descirbed as the figure above. 29 | -------------------------------------------------------------------------------- /docs/develop/imgs/flowchart.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/flowchart.jpeg -------------------------------------------------------------------------------- /docs/develop/imgs/offline_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/offline_validation.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/protocol_pod.png 
-------------------------------------------------------------------------------- /docs/develop/imgs/protocol_register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/docs/develop/imgs/protocol_register.png -------------------------------------------------------------------------------- /docs/develop/protocol.md: -------------------------------------------------------------------------------- 1 | # Protocol 2 | 3 | ## Device Register 4 | 5 | 6 | 7 | HAMi needs to know the spec of each AI devices in the cluster in order to schedule properly. During device registration, device-plugin needs to keep patching the spec of each device into node annotations every 30 seconds, in the format of the following: 8 | 9 | ``` 10 | HAMi.sh/node-handshake-{device-type}: Reported_{device_node_current_timestamp} 11 | HAMi.sh/node-register-{deivce-type}: {Device 1}:{Device2}:...:{Device N} 12 | ``` 13 | 14 | The definiation of each device is in the following format: 15 | ``` 16 | {Device UUID},{device split count},{device memory limit},{device core limit},{device type},{device numa},{healthy} 17 | ``` 18 | 19 | An example is shown below: 20 | ``` 21 | HAMi.sh/node-handshake-nvidia: Reported 2024-01-23 04:30:04.434037031 +0000 UTC m=+1104711.777756895 22 | HAMi.sh/node-handshake-mlu: Requesting_2024.01.10 04:06:57 23 | HAMi.sh/node-mlu-register: MLU-45013011-2257-0000-0000-000000000000,10,23308,0,MLU-MLU370-X4,0,false:MLU-54043011-2257-0000-0000-000000000000,10,23308,0, 24 | HAMi.sh/node-nvidia-register: GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true: 25 | 26 | ``` 27 | In this example, this node has two different AI devices, 2 Nvidia-V100 GPUs, and 2 Cambircon 370-X4 MLUs 28 | 29 | Note that a device node may become 
unavailable due to hardware or network failure, if a node hasn't registered in last 5 minutes, scheduler will mark that node as 'unavailable'. 30 | 31 | Since system clock on scheduler node and 'device' node may not align properly, scheduler node will patch the following device node annotations every 30s 32 | 33 | ``` 34 | HAMi.sh/node-handshake-{device-type}: Requesting_{scheduler_node_current_timestamp} 35 | ``` 36 | 37 | If HAMi.sh/node-handshake annotations remains in "Requesting_xxxx" and {scheduler current timestamp} > 5 mins + {scheduler timestamp in annotations}, then this device on that node will be marked "unavailable" in scheduler. 38 | 39 | 40 | ## Schedule Decision 41 | 42 | 43 | 44 | HAMi scheduler needs to patch schedule decisions into pod annotations, in the format of the following: 45 | 46 | ``` 47 | HAMi.sh/devices-to-allocate:{ctr1 request}:{ctr2 request}:...{Last ctr request}: 48 | HAMi.sh/device-node: {schedule decision node} 49 | HAMi.sh/device-schedule-time: {timestamp} 50 | ``` 51 | 52 | each container request is in the following format: 53 | 54 | ``` 55 | {device UUID},{device type keywork},{device memory request}:{device core request} 56 | ``` 57 | 58 | for example: 59 | 60 | A pod with 2 containers, first container requests 1 GPU with 3G device Memory, second container requests 1 GPU with 5G device Memory, then the patched annotations will be like the 61 | 62 | ``` 63 | HAMi.sh/devices-to-allocate: GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,3000,0:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,5000,0: 64 | HAMi.sh/vgpu-node: node67-4v100 65 | HAMi.sh/vgpu-time: 1705054796 66 | ``` 67 | 68 | -------------------------------------------------------------------------------- /docs/develop/roadmap.md: -------------------------------------------------------------------------------- 1 | # roadmap 2 | 3 | | feature | description | release | Example | Example expected behaviour | 4 | 
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------|------------| 5 | | Kubernetes schedule layer | Support Resource Quota for vgpu-memory | v3.2.0 | "requests.nvidia.com/gpu-memory: 30000" in ResourceQuota | Pods in this namespace can allocate up to 30G device memory in this namespace | 6 | | | Support Best-fit, idle-first, Numa-first Schedule Policy | v3.2.0 | add "scheduler policy configmap" | execute schedule policy according to configMap | 7 | | | Support k8s 1.28 version with compatable to v1.16 | v3.1.0 | | | 8 | | Add more Heterogeneous AI computing device | HuaWei Ascend Support | v3.1.0 | | | 9 | | | Iluvatar GPU support | v3.1.0 | | | 10 | | |Teco DPU Support | v3.2.0 | | | 11 | -------------------------------------------------------------------------------- /docs/develop/tasklist.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | 3 | ## Support Moore threads MTT S4000 4 | 5 | ``` 6 | resources: 7 | requests: 8 | mthreads.com/gpu: ${num} 9 | mthreads.com/vcuda-core: ${core} 10 | mthreads.com/vcuda-memory: ${mem} 11 | limits: 12 | mthreads.com/gpu: ${num} 13 | mthreads.com/vcuda-core: ${core} 14 | mthreads.com/vcuda-memory: ${mem} 15 | ``` 16 | 17 | ## Support Birentech Model 110 18 | 19 | ``` 20 | resources: 21 | requests: 22 | birentech.com/gpu: ${num} 23 | birentech.com/vcuda-core: ${core} 24 | birentech.com/vcuda-memory: ${mem} 25 | limits: 26 | birentech.com/gpu: ${num} 27 | birentech.com/vcuda-core: ${core} 28 | birentech.com/vcuda-memory: ${mem} 29 | ``` 30 | 31 | ## Support iluvatar MR-V100 32 | 33 | ``` 34 | resources: 35 | requests: 36 | iluvatar.ai/gpu: ${num} 37 | iluvatar.ai/vcuda-core: ${core} 38 | iluvatar.ai/vcuda-memory: ${mem} 39 | limits: 40 | iluvatar.ai/gpu: ${num} 41 | iluvatar.ai/vcuda-core: ${core} 42 | iluvatar.ai/vcuda-memory: ${mem} 43 | ``` 44 
| 45 | ## Support HuaWei Ascend 910B device 46 | 47 | ``` 48 | resources: 49 | requests: 50 | ascend.com/npu: ${num} 51 | ascend.com/npu-core: ${core} 52 | ascend.com/npu-mem: ${mem} 53 | limits: 54 | ascend.com/npu: ${num} 55 | ascend.com/npu-core: ${core} 56 | ascend.com/npu-mem: ${mem} 57 | ``` 58 | 59 | ## Support resourceQuota for Kubernetes 60 | 61 | Description: ResourceQuota is frequently used in kubernetes namespace. Since the number of virtual devices doesn't mean anything, we need to support the limitation in deviceMemory. 62 | 63 | For example, the following resourceQuota 64 | ``` 65 | cat < compute-resources.yaml 66 | apiVersion: v1 67 | kind: ResourceQuota 68 | metadata: 69 | name: compute-resources 70 | spec: 71 | hard: 72 | requests.cpu: "1" 73 | requests.memory: 1Gi 74 | limits.cpu: "2" 75 | limits.memory: 2Gi 76 | requests.nvidia.com/gpu-memory: 30000 77 | EOF 78 | ``` 79 | 80 | with the following command 81 | ``` 82 | kubectl create -f ./compute-resources.yaml--namespace=myspace 83 | ``` 84 | 85 | will limit the maxinum device memory allocated to namespace 'myspace' to 30G 86 | 87 | ## Support multiple schedule policies 88 | 89 | Description: HAMi needs to support multiple schedule policies, to provide meets the need in complex senarios, a pod can select a schedule policy in annotations field. 
90 | 91 | The effect of each schedule policy is shown in the table below 92 | 93 | | Schedule Policy | Effect | 94 | | -------- | ------- | 95 | | best-fit | the fewer device memory remains, the higher score | 96 | | idle-first | idle GPU has higher score | 97 | | numa-first | for multiple GPU allocations, GPUs on the same numa have higher score | 98 | 99 | 100 | For example, if a pod want to select a 'best-fit' schedule policy, it can specify .metadata.annotations as the code below: 101 | 102 | ``` 103 | apiVersion: v1 104 | kind: Pod 105 | metadata: 106 | name: gpu-pod 107 | annotations: 108 | nvidia.com/schedule-policy: "best-fit" 109 | spec: 110 | containers: 111 | - name: ubuntu-container 112 | image: ubuntu:18.04 113 | command:["bash","-c","sleep 86400"] 114 | resources: 115 | limits: 116 | nvidia.com/gpu: 2 # requesting 2 VGPUs 117 | ``` 118 | 119 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support hygon.com/dcu by implementing most device-sharing features as nvidia-GPU**, including: 4 | 5 | ***DCU sharing***: Each task can allocate a portion of DCU instead of a whole DCU card, thus DCU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: DCUs can be allocated with certain device memory size on certain type(i.e Z100) and have made it that it does not exceed the boundary. 8 | 9 | ***Device compute core limitation***: DCUs can be allocated with certain percentage of device core(i.e hygon.com/dcucores:60 indicate this container uses 60% compute cores of this device) 10 | 11 | ***DCU Type Specification***: You can specify which type of DCU to use or to avoid for a certain task, by setting "hygon.com/use-dcutype" or "hygon.com/nouse-dcutype" annotations. 
12 | 13 | ## Prerequisites 14 | 15 | * dtk driver with virtualization enabled(i.e dtk-22.10.1-vdcu), try the following command to see if your driver has virtualization ability 16 | 17 | ``` 18 | hdmcli -show-device-info 19 | ``` 20 | 21 | If this command can't be found, then you should contact your device provider to aquire a vdcu version of dtk driver. 22 | 23 | * The absolute path of dtk driver on each dcu node must be the same(i.e placed in /root/dtk-driver) 24 | 25 | ## Enabling DCU-sharing Support 26 | 27 | * Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/4paradigm/k8s-vgpu-scheduler#enabling-vgpu-support-in-kubernetes), please be note that, you should set your dtk driver directory using --set devicePlugin.hygondriver={your dtk driver path on each nodes}, for example: 28 | 29 | ``` 30 | helm install vgpu vgpu-charts/vgpu --set devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 31 | ``` 32 | 33 | * Tag DCU node with the following command 34 | ``` 35 | kubectl label node {dcu-node} dcu=on 36 | ``` 37 | 38 | ## Running DCU jobs 39 | 40 | Hygon DCUs can now be requested by a container 41 | using the `hygon.com/dcunum` , `hygon.com/dcumem` and `hygon.com/dcucores` resource type: 42 | 43 | ``` 44 | apiVersion: v1 45 | kind: Pod 46 | metadata: 47 | name: alexnet-tf-gpu-pod-mem 48 | labels: 49 | purpose: demo-tf-amdgpu 50 | spec: 51 | containers: 52 | - name: alexnet-tf-gpu-container 53 | image: pytorch:resnet50 54 | workingDir: /root 55 | command: ["sleep","infinity"] 56 | resources: 57 | limits: 58 | hygon.com/dcunum: 1 # requesting a GPU 59 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 60 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 61 | 62 | ``` 63 | 64 | ## Enable vDCU inside container 65 | 66 | You need to enable vDCU inside container in order to use it. 
67 | ``` 68 | source /opt/hygondriver/env.sh 69 | ``` 70 | 71 | check if you have successfully enabled vDCU by using following command 72 | 73 | ``` 74 | hdmcli -show-device-info 75 | ``` 76 | 77 | If you have an output like this, then you have successfully enabled vDCU inside container. 78 | 79 | ``` 80 | Device 0: 81 | Actual Device: 0 82 | Compute units: 60 83 | Global memory: 2097152000 bytes 84 | ``` 85 | 86 | Launch your DCU tasks like you usually do 87 | 88 | ## Notes 89 | 90 | 1. DCU-sharing in init container is not supported, pods with "hygon.com/dcumem" in init container will never be scheduled. 91 | 92 | 2. Only one vdcu can be aquired per container. If you want to mount multiple dcu devices, then you shouldn't set `hygon.com/dcumem` or `hygon.com/dcucores` 93 | 94 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU 12 | 13 | ## 节点需求 14 | 15 | * 带有虚拟化功能的dtk驱动(例如dtk-22.10.1-vdcu),相关组件可以在海光开发者社区获取,或联系您的设备提供商 16 | 17 | * 在宿主机上执行hdmcli -show-device-info获取设备信息,若能成功获取,则代表配置成功。若找不到指令,说明您安装的驱动不带有虚拟化功能,请联系厂商获取代虚拟化功能的dtk驱动 18 | 19 | * 需要将各个DCU节点上的dtk驱动路径放置在统一的绝对路径上,例如均放置在/root/dtk-driver 20 | 21 | ## 开启DCU复用 22 | 23 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md#kubernetes开启vgpu支持),需要注意的是,必须使用--set devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" 手动指定dtk驱动的绝对路径 24 | 25 | ``` 26 | helm install vgpu vgpu-charts/vgpu --set 
devicePlugin.hygondriver="/root/dcu-driver/dtk-22.10.1-vdcu" --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 27 | ``` 28 | 29 | * 使用以下指令,为DCU节点打上label 30 | ``` 31 | kubectl label node {dcu-node} dcu=on 32 | ``` 33 | 34 | ## 运行DCU任务 35 | 36 | ``` 37 | apiVersion: v1 38 | kind: Pod 39 | metadata: 40 | name: alexnet-tf-gpu-pod-mem 41 | labels: 42 | purpose: demo-tf-amdgpu 43 | spec: 44 | containers: 45 | - name: alexnet-tf-gpu-container 46 | image: pytorch:resnet50 47 | workingDir: /root 48 | command: ["sleep","infinity"] 49 | resources: 50 | limits: 51 | hygon.com/dcunum: 1 # requesting a GPU 52 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 53 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 54 | 55 | ``` 56 | 57 | ## 容器内开启虚拟DCU功能 58 | 59 | 使用vDCU首先需要激活虚拟环境 60 | ``` 61 | source /opt/hygondriver/env.sh 62 | ``` 63 | 64 | 随后,使用hdmcli指令查看虚拟设备是否已经激活 65 | ``` 66 | hdmcli -show-device-info 67 | ``` 68 | 69 | 若输出如下,则代表虚拟设备已经成功激活 70 | ``` 71 | Device 0: 72 | Actual Device: 0 73 | Compute units: 60 74 | Global memory: 2097152000 bytes 75 | ``` 76 | 77 | 接下来正常启动DCU任务即可 78 | 79 | ## 注意事项 80 | 81 | 1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 82 | 83 | 2. 每个容器最多只能使用一个虚拟DCU设备, 如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 84 | -------------------------------------------------------------------------------- /docs/offline-install.md: -------------------------------------------------------------------------------- 1 | # Offline-install Maunal 2 | 3 | For some cluster that don't have external web access, you can install HAMi by the following step: 4 | 5 | 1. Refer to [README.md](../README.md) until step 'Install and Uninstall' 6 | 7 | 2. copy the source of project into the master node in your cluster, placed in a path like "/root/HAMi" 8 | 9 | 3. 
pull the following images and save them into a '.tar' file, then move it into the master node in your cluster 10 | 11 | Image list: 12 | ``` 13 | 4pdosc/k8s-vdevice:{HAMi version} 14 | docker.io/jettech/kube-webhook-certgen:v1.5.2 15 | liangjw/kube-webhook-certgen:v1.1.1 16 | registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} 17 | ``` 18 | 19 | ``` 20 | docker pull {iamge} && docker save {image_name} -o {image_name}.tar 21 | ``` 22 | 23 | 4. Load these images using docker load, tag these images with your registry, and push them into your registry 24 | 25 | ``` 26 | docker load -i {image_name}.tar 27 | docker tag 4pdosc/k8s-vdevice:{HAMi version} {registry}/k8s-vdevice:{HAMi version} 28 | docker push {registry}/k8s-vdevice:{HAMi version} 29 | ``` 30 | 31 | 5. edit the following field in /root/HAMi/chart/vgpu/values.yaml to your image pushed 32 | 33 | ``` 34 | scheduler.kubeScheduler.image 35 | scheduler.extender.image 36 | scheduler.patch.image 37 | scheduler.patch.imageNew 38 | scheduler.devicePlugin.image 39 | scheduler.devicePlugin.monitorimage 40 | ``` 41 | 42 | 6. Execute the following command in your /root/HAMi/chart folder 43 | 44 | ``` 45 | helm install vgpu vgpu --set scheduler.kubeScheduler.imageTag={你的k8s server版本} -n kube-system 46 | ``` 47 | 48 | 7. 
Verify your installation 49 | 50 | execute the following command 51 | ``` 52 | kubectl get pods -n kube-system 53 | ``` 54 | 55 | If you can see both the 'device-plugin' and 'schduler' running, then HAMi is installed successfully, as the figure shown below: 56 | 57 | 58 | -------------------------------------------------------------------------------- /example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 1 # requesting 2 vGPUs 13 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer) 14 | -------------------------------------------------------------------------------- /examples/hygon/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | labels: 6 | purpose: demo-tf-amdgpu 7 | spec: 8 | containers: 9 | - name: alexnet-tf-gpu-container 10 | image: pytorch:resnet50 11 | workingDir: /root 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | hygon.com/dcunum: 1 # requesting a GPU 16 | hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory 17 | hygon.com/dcucores: 60 # each dcu use 60% of total compute cores 18 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/nouse-dcutype: "Z100L" # Specify the card type for this job, use comma to seperate, will not launch job on non-specified card 7 | #In this example, we don't want this container to run on 
Z100L 8 | purpose: demo-tf-amdgpu 9 | spec: 10 | containers: 11 | - name: alexnet-tf-gpu-container 12 | image: pytorch:resnet50 13 | workingDir: /root 14 | command: ["sleep","infinity"] 15 | resources: 16 | limits: 17 | hygon.com/dcunum: 1 # requesting a DCU 18 | hygon.com/dcumem: 2000 19 | hygon.com/dcucores: 60 20 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/use-dcutype: "Z100" # Specify the card type for this job, use comma to separate; the job will not launch on non-specified cards 7 | #In this example, we want to run this job on Z100 8 | labels: 9 | purpose: demo-tf-amdgpu 10 | spec: 11 | containers: 12 | - name: alexnet-tf-gpu-container 13 | image: pytorch:resnet50 14 | workingDir: /root 15 | command: ["sleep","infinity"] 16 | resources: 17 | limits: 18 | hygon.com/dcunum: 1 # requesting a DCU 19 | hygon.com/dcumem: 2000 20 | hygon.com/dcucores: 60 21 | -------------------------------------------------------------------------------- /examples/mlu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | cambricon.com/mlunum: 1 # requesting 1 MLU 13 | cambricon.com/mlumem: 10240 # requesting 10G MLU device memory -------------------------------------------------------------------------------- /examples/mlu/multi-pods.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 2 
9 | selector: # define how the deployment finds the pods it manages 10 | matchLabels: 11 | app: binpack-1 12 | template: # define the pods specifications 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | lifecycle: # required 23 | #postStart: # required 24 | # exec: # required 25 | # command: ["/bin/sh", "-c", "/usr/bin/smlu-containerd"] # required 26 | resources: 27 | limits: 28 | cambricon.com/mlunum: 1 29 | cambricon.com/mlumem: 10240 # set MLU device memory 30 | - name: c-2 31 | image: ubuntu:18.04 32 | command: ["sleep"] 33 | args: ["100000"] 34 | lifecycle: # required 35 | #postStart: # required 36 | # exec: # required 37 | # command: ["/bin/sh", "-c", "/usr/bin/smlu-containerd"] # required 38 | resources: 39 | limits: 40 | cambricon.com/mlunum: 1 41 | cambricon.com/mlumem: 10240 # set MLU device memory -------------------------------------------------------------------------------- /examples/mlu/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | cambricon.com/nouse-mlutype: "270,370" # Specify the blacklist card types for this job, use comma to separate; the job will not launch on the specified cards 7 | #In this example, we don't want this job to run on 270 or 370 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | cambricon.com/mlunum: 1 16 | cambricon.com/mlumem: 10240 17 | -------------------------------------------------------------------------------- /examples/mlu/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | cambricon.com/use-mlutype: "270" # Specify the card type for this job, use comma to separate, will not launch job on
non-specified card 7 | #In this example, we want to run this job on 270 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | cambricon.com/mlunum: 1 16 | cambricon.com/mlumem: 10240 17 | -------------------------------------------------------------------------------- /examples/nvidia/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory (Optional,Integer) 14 | nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/default_use_legacy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | -------------------------------------------------------------------------------- /examples/nvidia/example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | #nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory 14 | nvidia.com/gpumem-percentage: 50 #Each vGPU contains 50% 
device memory of that GPU. Cannot be used with nvidia.com/gpumem 15 | #nvidia.com/gpucores: 90 # Utilization limit of this vGPU is set to 90% of total GPU utilization 16 | #nvidia.com/priority: 0 # We only have two priority classes, 0(high) and 1(low), default: 1 17 | #The utilization of high priority task won't be limited to resourceCores unless sharing GPU node with other high priority tasks. 18 | #The utilization of low priority task won't be limited to resourceCores if no other tasks sharing its GPU. 19 | - name: ubuntu-container0 20 | image: ubuntu:18.04 21 | command: ["bash", "-c", "sleep 86400"] 22 | - name: ubuntu-container1 23 | image: ubuntu:18.04 24 | command: ["bash", "-c", "sleep 86400"] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 2 # requesting 2 vGPUs 28 | nvidia.com/gpumem: 2000 29 | #nvidia.com/gpucores: 90 30 | 31 | -------------------------------------------------------------------------------- /examples/nvidia/mig_example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/mig-3g.20gb: 1 # requesting one 3g.20gb MIG instance 13 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | nvidia.com/nouse-gputype: "1080,2080" # Specify the blacklist card type for this job, use comma to separate, will not launch job on specified card 7 | # In this job, we don't want our job to run on 1080(include 1080Ti) or 2080(include 2080Ti) type of card. 
8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 2 # requesting 2 vGPUs 16 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | nvidia.com/use-gputype: "A100,V100" # Specify the card type for this job, use comma to separate, will not launch job on non-specified card 7 | #In this example, we want to run this job on A100 or V100 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 2 # requesting 2 vGPUs 16 | -------------------------------------------------------------------------------- /examples/nvidia/use_exclusive_card.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem-percentage: 100 # Each vGPU contains 100% of the entire GPU device memory (Optional,Integer) 14 | nvidia.com/gpucores: 100 # Each vGPU uses 100% of the entire GPU cores(Optional,Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_memory_fraction.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | 
limits: 12 | nvidia.com/gpu: 2 # requesting 2 vGPUs 13 | nvidia.com/gpumem-percentage: 50 # Each vGPU contains 50% device memory of that GPU (Optional,Integer) 14 | nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) 15 | -------------------------------------------------------------------------------- /hack/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | set -e 18 | [[ -z ${SHORT_VERSION} ]] && SHORT_VERSION=$(git rev-parse --abbrev-ref HEAD) 19 | [[ -z ${COMMIT_CODE} ]] && COMMIT_CODE=$(git describe --abbrev=100 --always) 20 | 21 | export SHORT_VERSION 22 | export COMMIT_CODE 23 | export VERSION="${SHORT_VERSION}-${COMMIT_CODE}" 24 | export LATEST_VERSION="latest" 25 | export GOLANG_IMAGE="golang:1.21-bullseye" 26 | export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" 27 | export DEST_DIR="/usr/local" 28 | 29 | IMAGE=${IMAGE-"4pdosc/k8s-vdevice"} 30 | 31 | function go_build() { 32 | [[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}') 33 | make -j$J 34 | } 35 | 36 | function docker_build() { 37 | docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . 
38 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" 39 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" 40 | } 41 | 42 | function docker_push() { 43 | #docker push "${IMAGE}:${VERSION}" 44 | docker push "${IMAGE}:${SHORT_VERSION}" 45 | docker push "${IMAGE}:${LATEST_VERSION}" 46 | } 47 | 48 | go_build 49 | docker_build 50 | docker_push -------------------------------------------------------------------------------- /hack/update-generated-api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2021 peizhaoyou 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | ROOT_DIR=$(dirname "${BASH_SOURCE[0]}")/.. 
19 | protoc -I${ROOT_DIR} --gofast_out=plugins=grpc:${ROOT_DIR} ${ROOT_DIR}/pkg/api/*.proto -------------------------------------------------------------------------------- /imgs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/arch.png -------------------------------------------------------------------------------- /imgs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark.png -------------------------------------------------------------------------------- /imgs/benchmark_inf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark_inf.png -------------------------------------------------------------------------------- /imgs/benchmark_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/benchmark_train.png -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/example.png -------------------------------------------------------------------------------- /imgs/hard_limit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/imgs/hard_limit.jpg 
-------------------------------------------------------------------------------- /lib/mlu/cntopo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/cntopo -------------------------------------------------------------------------------- /lib/mlu/libcndev.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/libcndev.so -------------------------------------------------------------------------------- /lib/mlu/smlu-containerd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/mlu/smlu-containerd -------------------------------------------------------------------------------- /lib/nvidia/ld.so.preload: -------------------------------------------------------------------------------- 1 | /usr/local/vgpu/libvgpu.so -------------------------------------------------------------------------------- /lib/nvidia/libvgpu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/4paradigm/k8s-vgpu-scheduler/da7d4ed48ff11e58666f9b9fa225fc5891b65587/lib/nvidia/libvgpu.so -------------------------------------------------------------------------------- /pkg/api/device_register.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-gogo. DO NOT EDIT. 2 | // source: pkg/api/device_register.proto 3 | 4 | package api 5 | 6 | 7 | // Reference imports to suppress errors if they are not otherwise used. 
8 | 9 | // This is a compile-time assertion to ensure that this generated file 10 | // is compatible with the proto package it is being compiled against. 11 | // A compilation error at this line likely means your copy of the 12 | // proto package needs to be updated. 13 | type DeviceInfo struct { 14 | Index int 15 | Id string 16 | Count int32 17 | Devmem int32 18 | Devcore int32 19 | Type string 20 | Numa int 21 | Health bool 22 | } 23 | -------------------------------------------------------------------------------- /pkg/api/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// initCoreUsage returns an all-free core-usage mask for req cores,
// encoded as a hex string with one digit (covering 4 cores) per character.
func initCoreUsage(req int) string {
	res := ""
	for i := 0; i < req/4; i++ {
		res += "0"
	}
	return res
}

// addCoreUsage merges two hex-encoded core-usage masks by OR-ing them
// digit by digit and returns the merged mask, truncated to the shorter
// of the two inputs.
//
// Fixes over the previous version: the loop guard `tot[i] != 0` was
// always true for the bytes of a hex string (they are never NUL), and
// c[i] was indexed without a bounds check, so a shorter c caused an
// index-out-of-range panic; ParseInt errors were silently discarded.
func addCoreUsage(tot string, c string) (string, error) {
	res := ""
	for i := 0; i < len(tot) && i < len(c); i++ {
		left, err := strconv.ParseInt(string(tot[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, tot, err)
		}
		right, err := strconv.ParseInt(string(c[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, c, err)
		}
		res = fmt.Sprintf("%s%x", res, left|right)
	}
	fmt.Println("tot=", tot, "c=", c, "res=", res)
	return res, nil
}

// byteAlloc allocates up to req cores out of the 4 cores described by
// the hex digit b (a set bit marks a busy core). It returns the
// allocation bitmask for this digit and the number of requested cores
// still unallocated.
func byteAlloc(b int, req int) (int, int) {
	if req == 0 {
		return 0, 0
	}
	remains := req
	// Fixed-width 4-bit rendering of the digit; a '0' bit is a free core.
	bits := fmt.Sprintf("%04b", b)
	res := 0
	for i := 0; i < len(bits); i++ {
		res *= 2
		if bits[i] == '0' && remains > 0 {
			remains--
			res++
		}
	}
	return res, remains
}

// allocCoreUsage picks req free cores from the hex-encoded usage mask
// tot and returns the allocation as a mask of the same length.
//
// NOTE(review): as in the original, a request larger than the number of
// free cores returns a partial allocation without error — callers must
// check capacity beforehand; confirm before tightening this. The
// always-true `tot[i] != 0` guard was removed and parse errors are now
// reported instead of being ignored.
func allocCoreUsage(tot string, req int) (string, error) {
	res := ""
	remains := req
	for i := 0; i < len(tot); i++ {
		left, err := strconv.ParseInt(string(tot[i]), 16, 0)
		if err != nil {
			return "", fmt.Errorf("parsing digit %d of mask %q: %w", i, tot, err)
		}
		var alloc int
		alloc, remains = byteAlloc(int(left), remains)
		res = fmt.Sprintf("%s%x", res, alloc)
	}
	return res, nil
}
-------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/corealloc_test.go: -------------------------------------------------------------------------------- 1 | package dcu 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "gotest.tools/v3/assert" 8 | ) 9 | 10 | func TestInit(t *testing.T) { 11 | str := initCoreUsage(60) 12 | t.Log("str=", str) 13 | assert.Equal(t, strings.Compare(str, "000000000000000"), 0) 14 | } 15 | 16 | func TestAddCoreUsage(t *testing.T) { 17 | str := initCoreUsage(60) 18 | str1 := "abcde000ad00012" 19 | res, _ := addCoreUsage(str, str1) 20 | t.Log("res1=", res) 21 | assert.Equal(t, strings.Compare(res, str1), 0) 22 | str1 = "50200fff4000000" 23 | res, _ = addCoreUsage(res, str1) 24 | t.Log("res1=", res) 25 | assert.Equal(t, strings.Compare(res, "fbedefffed00012"), 0) 26 | } 27 | 28 | func TestAllocCoreUsage(t *testing.T) { 29 | str1 := "50200fff4000000" 30 | res, _ := allocCoreUsage(str1, 16) 31 | t.Log("res=", res) 32 | assert.Equal(t, strings.Compare(res, "afdfe0000000000"), 0) 33 | str1 = "abcde000ad00012" 34 | res, _ = allocCoreUsage(str1, 32) 35 | t.Log("res=", res) 36 | } 37 | -------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/hwloc/hwloc.go: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | **/ 16 | 17 | // Package hwloc is a collection of utility functions to get NUMA membership 18 | // of AMD GPU via the hwloc library 19 | package hwloc 20 | 21 | // #cgo pkg-config: hwloc 22 | // #include 23 | // #include 24 | import "C" 25 | import ( 26 | "fmt" 27 | "unsafe" 28 | ) 29 | 30 | func GetVersions() string { 31 | return fmt.Sprintf("hwloc: _VERSION: %s, _API_VERSION: %#08x, _COMPONENT_ABI: %d, Runtime: %#08x", 32 | C.HWLOC_VERSION, 33 | C.HWLOC_API_VERSION, 34 | C.HWLOC_COMPONENT_ABI, 35 | uint(C.hwloc_get_api_version())) 36 | } 37 | 38 | type Hwloc struct { 39 | topology C.hwloc_topology_t 40 | } 41 | 42 | func (h *Hwloc) Init() error { 43 | rc := C.hwloc_topology_init(&h.topology) 44 | if rc != 0 { 45 | return fmt.Errorf("Problem initializing hwloc topology rc: %d", rc) 46 | } 47 | 48 | rc = C.hwloc_topology_set_type_filter(h.topology, 49 | C.HWLOC_OBJ_PCI_DEVICE, 50 | C.HWLOC_TYPE_FILTER_KEEP_IMPORTANT) 51 | if rc != 0 { 52 | C.hwloc_topology_destroy(h.topology) 53 | return fmt.Errorf("Problem setting type filter rc: %d", rc) 54 | } 55 | 56 | rc = C.hwloc_topology_load(h.topology) 57 | if rc != 0 { 58 | C.hwloc_topology_destroy(h.topology) 59 | return fmt.Errorf("Problem loading topology rc: %d", rc) 60 | } 61 | 62 | return nil 63 | } 64 | 65 | func (h *Hwloc) Destroy() { 66 | C.hwloc_topology_destroy(h.topology) 67 | } 68 | 69 | func (h *Hwloc) GetNUMANodes(busid string) ([]uint64, error) { 70 | var gpu C.hwloc_obj_t 71 | var ancestor C.hwloc_obj_t 72 | 73 | busidstr := C.CString(busid) 74 | defer C.free(unsafe.Pointer(busidstr)) 75 | 76 | gpu = C.hwloc_get_pcidev_by_busidstring(h.topology, busidstr) 77 | if gpu == nil { 78 | return []uint64{}, 79 | fmt.Errorf("Fail to find GPU with bus ID: %s", busid) 80 | } 81 | ancestor = C.hwloc_get_non_io_ancestor_obj(h.topology, gpu) 82 | 83 | if ancestor == nil || ancestor.memory_arity <= 0 { 
84 | return []uint64{}, 85 | fmt.Errorf("No NUMA node found with bus ID: %s", busid) 86 | } 87 | 88 | var results []uint64 89 | nn := ancestor.memory_first_child 90 | 91 | for nn != nil { 92 | results = append(results, uint64(nn.logical_index)) 93 | nn = nn.next_sibling 94 | } 95 | 96 | return results, nil 97 | } 98 | -------------------------------------------------------------------------------- /pkg/device-plugin/hygon/dcu/register.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcu 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "time" 23 | 24 | "k8s.io/klog/v2" 25 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 26 | 27 | "4pd.io/k8s-vgpu/pkg/api" 28 | "4pd.io/k8s-vgpu/pkg/device/hygon" 29 | "4pd.io/k8s-vgpu/pkg/util" 30 | ) 31 | 32 | type DevListFunc func() []*pluginapi.Device 33 | 34 | func (r *Plugin) apiDevices() *[]*api.DeviceInfo { 35 | res := []*api.DeviceInfo{} 36 | for idx, val := range r.totalmem { 37 | if val > 0 { 38 | res = append(res, &api.DeviceInfo{ 39 | Index: idx, 40 | Id: "DCU-" + fmt.Sprint(idx), 41 | Count: 30, 42 | Devmem: int32(val), 43 | Devcore: 100, 44 | Numa: 0, 45 | Type: r.cardtype[idx], 46 | Health: true, 47 | }) 48 | } 49 | } 50 | return &res 51 | } 52 | 53 | func (r *Plugin) RegistrInAnnotation() error { 54 | devices := r.apiDevices() 55 | annos := make(map[string]string) 56 | if len(util.NodeName) == 0 { 57 | util.NodeName = os.Getenv("NodeName") 58 | } 59 | node, err := util.GetNode(util.NodeName) 60 | if err != nil { 61 | klog.Errorln("get node error", err.Error()) 62 | return err 63 | } 64 | encodeddevices := util.EncodeNodeDevices(*devices) 65 | annos[hygon.HandshakeAnnos] = "Reported " + time.Now().String() 66 | annos[hygon.RegisterAnnos] = encodeddevices 67 | klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String()) 68 | err = util.PatchNodeAnnotations(node, annos) 69 | 70 | if err != nil { 71 | klog.Errorln("patch node error", err.Error()) 72 | } 73 | return err 74 | } 75 | 76 | func (r *Plugin) WatchAndRegister() { 77 | klog.Infof("into WatchAndRegister") 78 | for { 79 | r.RefreshContainerDevices() 80 | err := r.RegistrInAnnotation() 81 | if err != nil { 82 | klog.Errorf("register error, %v", err) 83 | time.Sleep(time.Second * 5) 84 | } else { 85 | time.Sleep(time.Second * 30) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/allocator.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "strings" 19 | 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 21 | ) 22 | 23 | type Allocator interface { 24 | Allocate(available []uint, required []uint, size int) ([]uint, error) 25 | } 26 | 27 | func New(policy string, devs map[string]*cndev.Device) Allocator { 28 | model := cndev.GetDeviceModel(uint(0)) 29 | if strings.Contains(model, "MLU290") || model == "MLU370-M8" { 30 | return NewSpiderAllocator(policy, devs) 31 | } 32 | if model == "MLU370-X8" { 33 | return NewBoardAllocator(policy, devs) 34 | } 35 | return NewDefaultAllocator(policy, devs) 36 | } 37 | 38 | func contains(set []uint, dev uint) bool { 39 | for i := range set { 40 | if set[i] == dev { 41 | return true 42 | } 43 | } 44 | return false 45 | } 46 | 47 | func containsAll(set []uint, devs []uint) bool { 48 | for _, dev := range devs { 49 | if !contains(set, dev) { 50 | return false 51 | } 52 | } 53 | return true 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/allocator_suite_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "testing" 19 | 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cntopo/mock" 21 | "github.com/golang/mock/gomock" 22 | . "github.com/onsi/ginkgo" 23 | . "github.com/onsi/gomega" 24 | ) 25 | 26 | var ( 27 | cntopoMock *mock.Cntopo 28 | mockCtrl *gomock.Controller 29 | ) 30 | 31 | func TestAllocator(t *testing.T) { 32 | RegisterFailHandler(Fail) 33 | RunSpecs(t, "Allocator Suite") 34 | } 35 | 36 | var _ = BeforeSuite(func() { 37 | By("Bootstrap test environment") 38 | mockCtrl = gomock.NewController(GinkgoT()) 39 | cntopoMock = mock.NewCntopo(mockCtrl) 40 | }) 41 | 42 | var _ = AfterSuite(func() { 43 | By("Tear down the test environment") 44 | mockCtrl.Finish() 45 | }) 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/allocator/default.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package allocator 16 | 17 | import ( 18 | "fmt" 19 | "log" 20 | "sort" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cntopo" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | ) 26 | 27 | type defaultAllocator struct { 28 | policy string 29 | cntopo cntopo.Cntopo 30 | devs map[string]*cndev.Device 31 | } 32 | 33 | func NewDefaultAllocator(policy string, devs map[string]*cndev.Device) Allocator { 34 | return &defaultAllocator{ 35 | policy: policy, 36 | cntopo: cntopo.New(), 37 | devs: devs, 38 | } 39 | } 40 | 41 | func (a *defaultAllocator) Allocate(available []uint, required []uint, size int) ([]uint, error) { 42 | 43 | rings, err := a.cntopo.GetRings(available, size) 44 | if err != nil { 45 | return nil, err 46 | } 47 | sort.Slice(rings, func(i int, j int) bool { 48 | return rings[i].NonConflictRingNum > rings[j].NonConflictRingNum 49 | }) 50 | 51 | if len(rings) == 0 { 52 | log.Println("found no rings") 53 | if a.policy != util.BestEffort && !a.sizeAlwaysFailsToFormRing(size) { 54 | return nil, fmt.Errorf("mode %s found no rings", a.policy) 55 | } 56 | return available[0:size], nil 57 | } 58 | 59 | return rings[0].Ordinals, nil 60 | } 61 | 62 | func (a *defaultAllocator) sizeAlwaysFailsToFormRing(size int) bool { 63 | return size%2 == 1 64 | } 65 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/cache.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 
2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package mlu 18 | 19 | import ( 20 | "context" 21 | "sync" 22 | 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 26 | ) 27 | 28 | type MLUDevice struct { 29 | dev pluginapi.Device 30 | handle *cndev.Device 31 | } 32 | 33 | type DeviceCache struct { 34 | cache []*MLUDevice 35 | stopCh chan interface{} 36 | unhealthy chan *pluginapi.Device 37 | notifyCh map[string]chan *pluginapi.Device 38 | mutex sync.Mutex 39 | } 40 | 41 | func NewDeviceCache() *DeviceCache { 42 | return &DeviceCache{ 43 | stopCh: make(chan interface{}), 44 | unhealthy: make(chan *pluginapi.Device), 45 | notifyCh: make(map[string]chan *pluginapi.Device), 46 | } 47 | } 48 | 49 | func (d *DeviceCache) AddNotifyChannel(name string, ch chan *pluginapi.Device) { 50 | d.mutex.Lock() 51 | defer d.mutex.Unlock() 52 | d.notifyCh[name] = ch 53 | } 54 | 55 | func (d *DeviceCache) RemoveNotifyChannel(name string) { 56 | d.mutex.Lock() 57 | defer d.mutex.Unlock() 58 | delete(d.notifyCh, name) 59 | } 60 | 61 | func (d *DeviceCache) Start() { 62 | d.cache = d.Devices() 63 | go d.CheckHealth(d.stopCh, d.cache, d.unhealthy) 64 | go d.notify() 65 | } 66 | 67 | func (d *DeviceCache) Stop() { 68 | close(d.stopCh) 69 | } 70 | 71 | func (d *DeviceCache) GetCache() []*MLUDevice { 72 | return d.cache 73 | } 74 | 
// notify forwards each device received on the unhealthy channel to every
// registered listener, after marking it Unhealthy, until the cache stops.
func (d *DeviceCache) notify() {
	for {
		select {
		case <-d.stopCh:
			return
		case dev := <-d.unhealthy:
			// Mark before broadcasting so all listeners observe the
			// updated health state.
			dev.Health = pluginapi.Unhealthy
			d.mutex.Lock()
			for _, ch := range d.notifyCh {
				ch <- dev
			}
			d.mutex.Unlock()
		}
	}
}

// Devices returns a list of devices from the GpuDeviceManager
func (d *DeviceCache) Devices() []*MLUDevice {
	n, err := cndev.GetDeviceCount()
	check(err)
	// Cap the number of reported devices at the configured limit.
	if n > util.DeviceLimit {
		n = util.DeviceLimit
	}

	var devs []*MLUDevice
	for i := uint(0); i < n; i++ {
		d, err := cndev.NewDeviceLite(i, false)
		check(err)

		// Health is left at its zero value here; health state is managed
		// by the CheckHealth/notify pipeline.
		devs = append(devs, &MLUDevice{
			dev:    pluginapi.Device{ID: d.UUID},
			handle: d,
		})
	}

	return devs
}

// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
func (d *DeviceCache) CheckHealth(stop <-chan interface{}, devices []*MLUDevice, unhealthy chan<- *pluginapi.Device) {
	// mlu.checkHealth...
	// NOTE(review): the stop channel parameter is not forwarded to
	// WatchUnhealthy (a background context is used instead) — presumably
	// cancellation is handled inside WatchUnhealthy; confirm.
	WatchUnhealthy(context.Background(), devices, unhealthy)
}
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cndev 16 | 17 | import ( 18 | "fmt" 19 | "log" 20 | "os" 21 | "sort" 22 | "testing" 23 | 24 | "github.com/stretchr/testify/assert" 25 | ) 26 | 27 | func TestMain(m *testing.M) { 28 | err := Init() 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | ret := m.Run() 33 | if ret != 0 { 34 | os.Exit(ret) 35 | } 36 | err = Release() 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | } 41 | 42 | func TestGetDeviceCount(t *testing.T) { 43 | count, err := GetDeviceCount() 44 | assert.NoError(t, err) 45 | assert.Equal(t, uint(8), count) 46 | } 47 | 48 | func TestGetDeviceModel(t *testing.T) { 49 | model := GetDeviceModel(uint(0)) 50 | assert.Equal(t, "MLU290", model) 51 | } 52 | 53 | func TestGetDeviceMemory(t *testing.T) { 54 | memory, err := GetDeviceMemory(uint(0)) 55 | assert.NoError(t, err) 56 | assert.Equal(t, uint(16*1024), memory) 57 | } 58 | 59 | func TestGetDeviceInfo(t *testing.T) { 60 | uuid, _, mb, path, err := getDeviceInfo(uint(1)) 61 | assert.NoError(t, err) 62 | assert.Equal(t, "/dev/cambricon_dev1", path) 63 | assert.Equal(t, "MLU-20001012-1916-0000-0000-000000000000", uuid) 64 | assert.Equal(t, fmt.Sprintf("%x", 1111111), mb) 65 | } 66 | 67 | func TestGetDeviceHealthState(t *testing.T) { 68 | health, err := getDeviceHealthState(uint(0), 1) 69 | assert.NoError(t, err) 70 | assert.Equal(t, 1, health) 71 | } 72 | 73 | func TestGetDevicePCIeInfo(t *testing.T) { 74 | pcie, err := getDevicePCIeInfo(uint(0)) 75 | assert.NoError(t, err) 76 | assert.Equal(t, 0, pcie.domain) 77 | assert.Equal(t, 12, pcie.bus) 78 | assert.Equal(t, 13, pcie.device) 79 | assert.Equal(t, 1, pcie.function) 80 | } 81 | 82 | func TestGetDeviceMLULinkDevs(t *testing.T) { 83 | devs, err := getDeviceMLULinkDevs(uint(0)) 84 | assert.NoError(t, err) 85 | assert.Equal(t, map[string]int{ 86 | "MLU-20001012-1916-0000-0000-000000000000": 1, 87 | 
// TestGetMLULinkGroups expects all 8 mock devices to fall into one
// MLULink group.
func TestGetMLULinkGroups(t *testing.T) {
	groups, err := GetMLULinkGroups()
	assert.NoError(t, err)
	// Sort each group so the comparison is deterministic regardless of
	// discovery order.
	for i := range groups {
		sort.Slice(groups[i], func(x, y int) bool {
			return groups[i][x] < groups[i][y]
		})
	}
	assert.Equal(t, [][]uint{{0, 1, 2, 3, 4, 5, 6, 7}}, groups)
}
// cndevInit dlopens libcndev.so into the process (keeping the handle for a
// later dlclose) and then initializes the library.
// Returns CNDEV_ERROR_UNINITIALIZED when the shared object cannot be loaded.
func (dl *dlhandles) cndevInit() C.cndevRet_t {
	// NOTE(review): the C string allocated by C.CString is never freed;
	// this runs once per process so the leak is bounded — confirm intent.
	handle := C.dlopen(C.CString("libcndev.so"), C.RTLD_LAZY|C.RTLD_GLOBAL)
	if handle == C.NULL {
		return C.CNDEV_ERROR_UNINITIALIZED
	}
	dl.handles = append(dl.handles, handle)
	return C.cndevInit(C.int(0))
}

// cndevRelease shuts the library down and dlcloses every handle opened by
// cndevInit. Any dlclose failure is reported as CNDEV_ERROR_UNKNOWN.
func (dl *dlhandles) cndevRelease() C.cndevRet_t {
	ret := C.cndevRelease()
	if ret != C.CNDEV_SUCCESS {
		return ret
	}

	// NOTE(review): dl.handles is not cleared after closing, so a second
	// Release would dlclose stale handles — confirm callers only release once.
	for _, handle := range dl.handles {
		err := C.dlclose(handle)
		if err != 0 {
			return C.CNDEV_ERROR_UNKNOWN
		}
	}
	return C.CNDEV_SUCCESS
}
14 | 15 | package cndev 16 | 17 | import ( 18 | "os" 19 | "testing" 20 | 21 | "github.com/stretchr/testify/assert" 22 | ) 23 | 24 | func TestGetPCIeID(t *testing.T) { 25 | d := &Device{ 26 | pcie: &pcie{ 27 | domain: 0, 28 | bus: 3, 29 | device: 15, 30 | function: 1, 31 | }, 32 | } 33 | id, err := d.GetPCIeID() 34 | assert.NoError(t, err) 35 | assert.Equal(t, "0000:03:0f.1", id) 36 | } 37 | 38 | func TestGetNumFromFile(t *testing.T) { 39 | path := "/tmp/device_plugin_cndev_ut" 40 | f, err := os.Create(path) 41 | assert.NoError(t, err) 42 | 43 | data := []byte("4\n") 44 | _, err = f.Write(data) 45 | assert.NoError(t, err) 46 | num, err := getNumFromFile(path) 47 | assert.NoError(t, err) 48 | assert.Equal(t, 4, num) 49 | 50 | err = f.Close() 51 | assert.NoError(t, err) 52 | err = os.Remove(path) 53 | assert.NoError(t, err) 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/cntopo/cntopo.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
package cntopo

import (
	"encoding/json"
	"os"
	"os/exec"
	"sync"
)

// cntopo shells out to the `cntopo` CLI. The embedded mutex serializes
// invocations within this process because fixed /tmp file names are shared
// between calls (it does not protect against other processes).
type cntopo struct {
	sync.Mutex
}

// Input is the request document written for `cntopo find`: a host name
// mapped to the requested device count and the candidate device whitelist.
type Input map[string][]struct {
	Size      int    `json:"num_devices"`
	WhiteList []uint `json:"white_dev_list"`
}

// Output mirrors the parts of cntopo's JSON result that we consume.
type Output []struct {
	Info struct {
		Ordinals []uint `json:"ordinal_list"`
	} `json:"info_by_host"`
	// The traffic is duplex, so this value is twice the number of rings,
	// except for the cases of less equal to 2 cards, that is,
	// "A>B>A" conflicts with "B>A>B", while "A>B>C>A" does not conflict with "A>C>B>A"
	NonConflictRings struct {
		Num int `json:"nonconflict_rings_num"`
	} `json:"nonconflict_rings"`
}

// Ring is a flattened view of one candidate ring: the device ordinals it
// uses and its non-conflicting ring count.
type Ring struct {
	Ordinals           []uint
	NonConflictRingNum int
}

// Cntopo abstracts ring discovery so callers and tests can mock it.
type Cntopo interface {
	GetRings(available []uint, size int) ([]Ring, error)
}

// New returns the CLI-backed implementation of Cntopo.
func New() Cntopo {
	return &cntopo{}
}

// GetRings asks the cntopo CLI for ring topologies of `size` devices drawn
// from `available`, by writing a request file, running `cntopo find`, and
// parsing the result file.
func (c *cntopo) GetRings(available []uint, size int) ([]Ring, error) {
	i := Input{
		"host_list": {
			{
				Size:      size,
				WhiteList: available,
			},
		},
	}
	b, err := json.Marshal(i)
	if err != nil {
		return nil, err
	}
	// Hold the lock across write -> exec -> read: the temp file paths are
	// fixed, so concurrent calls in this process would clobber each other.
	c.Lock()
	defer c.Unlock()
	err = os.WriteFile("/tmp/cntopo_input.json", b, 0666)
	if err != nil {
		return nil, err
	}
	err = exec.Command("sh", "-c", "cntopo find -I /tmp/cntopo_input.json -O /tmp/cntopo_output.json -R 1000000 -C").Run()
	if err != nil {
		return nil, err
	}
	j, err := os.ReadFile("/tmp/cntopo_output.json")
	if err != nil {
		return nil, err
	}
	var output Output
	err = json.Unmarshal(j, &output)
	if err != nil {
		return nil, err
	}
	// Flatten the CLI output into the package's Ring type.
	rings := []Ring{}
	for _, o := range output {
		rings = append(rings, Ring{
			NonConflictRingNum: o.NonConflictRings.Num,
			Ordinals:           o.Info.Ordinals,
		})
	}
	return rings, nil
}
47 | func (mr *CntopoMockRecorder) GetRings(arg0, arg1 interface{}) *gomock.Call { 48 | mr.mock.ctrl.T.Helper() 49 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRings", reflect.TypeOf((*Cntopo)(nil).GetRings), arg0, arg1) 50 | } 51 | -------------------------------------------------------------------------------- /pkg/device-plugin/mlu/const.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Cambricon, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
package mlu

import pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"

const (
	// serverSock is the unix socket this plugin serves the kubelet
	// device-plugin API on.
	serverSock = pluginapi.DevicePluginPath + "cambricon.sock"
	// mluLinkPolicyUnsatisfied is the reason string used when the MLULink
	// topology policy cannot be satisfied.
	mluLinkPolicyUnsatisfied = "mluLinkPolicyUnsatisfied"
	// retries is a retry count used elsewhere in the package.
	retries = 5

	// MLULink topology policy names (see Options.MLULinkPolicy).
	BestEffort string = "best-effort"
	Restricted string = "restricted"
	Guaranteed string = "guaranteed"

	// Device plugin operating modes (see Options.Mode).
	sriov         string = "sriov"
	envShare      string = "env-share"
	topologyAware string = "topology-aware"
	mluShare      string = "mlu-share"

	// Host device-node paths exposed to containers.
	mluMonitorDeviceName     = "/dev/cambricon_ctl"
	mluDeviceName            = "/dev/cambricon_dev"
	mluMsgqDeviceName        = "/dev/cambr-msgq"
	mluRPCDeviceName         = "/dev/cambr-rpc"
	mluCmsgDeviceName        = "/dev/cmsg_ctrl"
	mluIpcmDeviceName        = "/dev/cambricon_ipcm"
	mluCommuDeviceName       = "/dev/commu"
	mluUARTConsoleDeviceName = "/dev/ttyMS"
	mluRPMsgDir              = "/dev/cambricon/"
	mluSplitDeviceName       = "/dev/cambricon-split"

	// Resource names and container environment variables for mlu-share mode.
	mluMemResourceName       = "cambricon.com/mlumem"
	mluResourceCount         = "cambricon.com/mlunum"
	mluMemResourceAssumeTime = "CAMBRICON_MEM_ASSUME_TIME"
	// NOTE(review): "ASSIGHED" looks like a typo for "ASSIGNED", but the value
	// is an env-var name shared with other components at runtime, so it must
	// not be changed here in isolation.
	mluMemResourceAssigned = "CAMBRICON_MEM_ASSIGHED"
	mluMemSplitLimit       = "CAMBRICON_SPLIT_MEMS"
	mluMemSplitIndex       = "CAMBRICON_SPLIT_VISIBLE_DEVICES"
	mluMemSplitEnable      = "CAMBRICON_SPLIT_ENABLE"
	mluMemLock             = "cambricon.com/mlu-mem.lock"
	mluMemBinaryPath       = "/usr/bin/smlu-containerd"
)
package mlu

import (
	"log"
	"os"
	"strings"

	"4pd.io/k8s-vgpu/pkg/util"
	flags "github.com/jessevdk/go-flags"
)

// Options holds the command-line configuration of the MLU device plugin,
// parsed by go-flags from the struct tags below.
type Options struct {
	Mode               string `long:"mode" description:"device plugin mode" default:"default" choice:"default" choice:"sriov" choice:"env-share" choice:"topology-aware" choice:"mlu-share"`
	MLULinkPolicy      string `long:"mlulink-policy" description:"MLULink topology policy" default:"best-effort" choice:"best-effort" choice:"restricted" choice:"guaranteed"`
	VirtualizationNum  uint   `long:"virtualization-num" description:"the virtualization number for each MLU, used only in sriov mode or env-share mode" default:"1" env:"VIRTUALIZATION_NUM"`
	DisableHealthCheck bool   `long:"disable-health-check" description:"disable MLU health check"`
	NodeName           string `long:"node-name" description:"host node name" env:"NODE_NAME"`
	EnableConsole      bool   `long:"enable-console" description:"enable UART console device(/dev/ttyMS) in container"`
	EnableDeviceType   bool   `long:"enable-device-type" description:"enable device registration with type info"`
	CnmonPath          string `long:"cnmon-path" description:"host cnmon path"`
	SocketPath         string `long:"socket-path" description:"socket path for communication between deviceplugin and container runtime"`
}

// ParseFlags parses os.Args into Options, publishes the globally consumed
// settings to the util package, and exits the process on parse errors.
func ParseFlags() Options {
	// Compatibility shim: rewrite a single-dash "-mode..." argument into the
	// double-dash form expected by go-flags. Only the first match is rewritten.
	for index, arg := range os.Args {
		if strings.HasPrefix(arg, "-mode") {
			os.Args[index] = strings.Replace(arg, "-mode", "--mode", 1)
			break
		}
	}
	// Environment toggle to disable all health checking.
	if os.Getenv("DP_DISABLE_HEALTHCHECKS") == "all" {
		os.Args = append(os.Args, "--disable-health-check")
	}
	options := Options{}
	parser := flags.NewParser(&options, flags.Default)
	if _, err := parser.Parse(); err != nil {
		// go-flags already printed the message: exit 0 for --help,
		// non-zero for real parse errors.
		code := 1
		if fe, ok := err.(*flags.Error); ok {
			if fe.Type == flags.ErrHelp {
				code = 0
			}
		}
		os.Exit(code)
	}
	// Publish settings read by other packages.
	util.DeviceSplitCount = &options.VirtualizationNum
	util.RuntimeSocketFlag = options.SocketPath
	log.Printf("Parsed options: %v\n", options)
	return options
}
15 | */ 16 | 17 | package mlu 18 | 19 | import ( 20 | "fmt" 21 | "time" 22 | 23 | "k8s.io/klog/v2" 24 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 25 | 26 | "4pd.io/k8s-vgpu/pkg/api" 27 | "4pd.io/k8s-vgpu/pkg/device-plugin/mlu/cndev" 28 | "4pd.io/k8s-vgpu/pkg/device/cambricon" 29 | "4pd.io/k8s-vgpu/pkg/util" 30 | ) 31 | 32 | type DevListFunc func() []*pluginapi.Device 33 | 34 | type DeviceRegister struct { 35 | deviceCache *DeviceCache 36 | unhealthy chan *pluginapi.Device 37 | stopCh chan struct{} 38 | } 39 | 40 | func NewDeviceRegister(deviceCache *DeviceCache) *DeviceRegister { 41 | return &DeviceRegister{ 42 | deviceCache: deviceCache, 43 | unhealthy: make(chan *pluginapi.Device), 44 | stopCh: make(chan struct{}), 45 | } 46 | } 47 | 48 | func (r *DeviceRegister) Start(opt Options) { 49 | r.deviceCache.AddNotifyChannel("register", r.unhealthy) 50 | go r.WatchAndRegister(opt) 51 | } 52 | 53 | func (r *DeviceRegister) Stop() { 54 | close(r.stopCh) 55 | } 56 | 57 | func (r *DeviceRegister) apiDevices() *[]*api.DeviceInfo { 58 | devs := r.deviceCache.GetCache() 59 | res := make([]*api.DeviceInfo, 0, len(devs)) 60 | for i, dev := range devs { 61 | //klog.V(3).Infoln("ndev type=", ndev.Model) 62 | memory, _ := cndev.GetDeviceMemory(uint(i)) 63 | fmt.Println("mlu registered device id=", dev.dev.ID, "memory=", memory, "type=", cndev.GetDeviceModel(uint(i))) 64 | registeredmem := int32(memory) 65 | res = append(res, &api.DeviceInfo{ 66 | Id: dev.dev.ID, 67 | Count: int32(*util.DeviceSplitCount), 68 | Devmem: registeredmem, 69 | Devcore: 0, 70 | Numa: 0, 71 | Type: fmt.Sprintf("%v-%v", "MLU", cndev.GetDeviceModel(uint(i))), 72 | Health: dev.dev.Health == "healthy", 73 | }) 74 | } 75 | return &res 76 | } 77 | 78 | func (r *DeviceRegister) RegistrInAnnotation() error { 79 | devices := r.apiDevices() 80 | annos := make(map[string]string) 81 | node, err := util.GetNode(util.NodeName) 82 | if err != nil { 83 | klog.Errorln("get node error", err.Error()) 84 | 
return err 85 | } 86 | encodeddevices := util.EncodeNodeDevices(*devices) 87 | annos[cambricon.HandshakeAnnos] = "Reported " + time.Now().String() 88 | annos[cambricon.RegisterAnnos] = encodeddevices 89 | klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String()) 90 | err = util.PatchNodeAnnotations(node, annos) 91 | 92 | if err != nil { 93 | klog.Errorln("patch node error", err.Error()) 94 | } 95 | return err 96 | } 97 | 98 | func (r *DeviceRegister) WatchAndRegister(opt Options) { 99 | klog.Infof("into WatchAndRegister") 100 | for { 101 | err := r.RegistrInAnnotation() 102 | if err != nil { 103 | klog.Errorf("register error, %v", err) 104 | time.Sleep(time.Second * 5) 105 | } else { 106 | time.Sleep(time.Second * 30) 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package cdi 18 | 19 | // Interface provides the API to the 'cdi' package 20 | // 21 | //go:generate moq -stub -out api_mock.go . 
type Interface interface {
	CreateSpecFile() error
	QualifiedName(string, string) string
}

// New is a factory method that creates a CDI handler for creating CDI specs.
// When NVML is unavailable it degrades to a no-op null handler instead of
// failing.
func New(opts ...Option) (Interface, error) {
	infolib := info.New()

	hasNVML, _ := infolib.HasNvml()
	if !hasNVML {
		klog.Warning("No valid resources detected, creating a null CDI handler")
		return NewNullHandler(), nil
	}

	return newHandler(opts...)
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cdi 18 | 19 | import ( 20 | "k8s.io/klog/v2" 21 | ) 22 | 23 | type null struct{} 24 | 25 | var _ Interface = &null{} 26 | 27 | // NewNullHandler returns an instance of the 'cdi' interface that can 28 | // be used when CDI specs are not required. 29 | func NewNullHandler() Interface { 30 | return &null{} 31 | } 32 | 33 | // CreateSpecFile is a no-op for the null handler. 34 | func (n *null) CreateSpecFile() error { 35 | return nil 36 | } 37 | 38 | // QualifiedName is a no-op for the null handler. A error message is logged 39 | // inidicating this should never be called for the null handler. 40 | func (n *null) QualifiedName(class string, id string) string { 41 | klog.Error("cannot return a qualified CDI device name with the null CDI handler") 42 | return "" 43 | } 44 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
package cdi

import (
	"github.com/NVIDIA/go-nvlib/pkg/nvml"
)

// Option defines a function for passing options to the New() call
type Option func(*cdiHandler)

// WithEnabled provides an Option to set the enabled flag used by the 'cdi' interface
func WithEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.enabled = enabled
	}
}

// WithDriverRoot provides an Option to set the driver root used by the 'cdi' interface
func WithDriverRoot(root string) Option {
	return func(c *cdiHandler) {
		c.driverRoot = root
	}
}

// WithTargetDriverRoot provides an Option to set the target driver root used by the 'cdi' interface
func WithTargetDriverRoot(root string) Option {
	return func(c *cdiHandler) {
		c.targetDriverRoot = root
	}
}

// WithNvidiaCTKPath provides an Option to set the nvidia-ctk path used by the 'cdi' interface
func WithNvidiaCTKPath(path string) Option {
	return func(c *cdiHandler) {
		c.nvidiaCTKPath = path
	}
}

// WithNvml provides an Option to set the NVML library used by the 'cdi' interface
func WithNvml(nvml nvml.Interface) Option {
	return func(c *cdiHandler) {
		c.nvml = nvml
	}
}

// WithDeviceIDStrategy provides an Option to set the device ID strategy used by the 'cdi' interface
func WithDeviceIDStrategy(strategy string) Option {
	return func(c *cdiHandler) {
		c.deviceIDStrategy = strategy
	}
}

// WithVendor provides an Option to set the vendor used by the 'cdi' interface
func WithVendor(vendor string) Option {
	return func(c *cdiHandler) {
		c.vendor = vendor
	}
}

// WithGdsEnabled provides an Option to set whether a GDS CDI spec should be generated
func WithGdsEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.gdsEnabled = enabled
	}
}

// WithMofedEnabled provides an Option to set whether a MOFED CDI spec should be generated
func WithMofedEnabled(enabled bool) Option {
	return func(c *cdiHandler) {
		c.mofedEnabled = enabled
	}
}
22 | var version = "unknown" 23 | 24 | // gitCommit will be the hash that the binary was built from 25 | // and will be populated by the Makefile 26 | var gitCommit = "" 27 | 28 | // GetVersionParts returns the different version components 29 | func GetVersionParts() []string { 30 | v := []string{version} 31 | 32 | if gitCommit != "" { 33 | v = append(v, "commit: "+gitCommit) 34 | } 35 | 36 | return v 37 | } 38 | 39 | // GetVersionString returns the string representation of the version 40 | func GetVersionString(more ...string) string { 41 | v := append(GetVersionParts(), more...) 42 | return strings.Join(v, "\n") 43 | } 44 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 - 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | package mig 4 | 5 | import ( 6 | "bufio" 7 | "fmt" 8 | "os" 9 | 10 | "k8s.io/klog/v2" 11 | ) 12 | 13 | const ( 14 | nvidiaProcDriverPath = "/proc/driver/nvidia" 15 | nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities" 16 | 17 | nvcapsProcDriverPath = "/proc/driver/nvidia-caps" 18 | nvcapsMigMinorsPath = nvcapsProcDriverPath + "/mig-minors" 19 | nvcapsDevicePath = "/dev/nvidia-caps" 20 | ) 21 | 22 | // GetMigCapabilityDevicePaths returns a mapping of MIG capability path to device node path 23 | func GetMigCapabilityDevicePaths() (map[string]string, error) { 24 | // Open nvcapsMigMinorsPath for walking. 25 | // If the nvcapsMigMinorsPath does not exist, then we are not on a MIG 26 | // capable machine, so there is nothing to do. 
27 | // The format of this file is discussed in: 28 | // https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#unique_1576522674 29 | minorsFile, err := os.Open(nvcapsMigMinorsPath) 30 | if os.IsNotExist(err) { 31 | return nil, nil 32 | } 33 | if err != nil { 34 | return nil, fmt.Errorf("error opening MIG minors file: %v", err) 35 | } 36 | defer minorsFile.Close() 37 | 38 | // Define a function to process each each line of nvcapsMigMinorsPath 39 | processLine := func(line string) (string, int, error) { 40 | var gpu, gi, ci, migMinor int 41 | 42 | // Look for a CI access file 43 | n, _ := fmt.Sscanf(line, "gpu%d/gi%d/ci%d/access %d", &gpu, &gi, &ci, &migMinor) 44 | if n == 4 { 45 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/ci%d/access", gpu, gi, ci) 46 | return capPath, migMinor, nil 47 | } 48 | 49 | // Look for a GI access file 50 | n, _ = fmt.Sscanf(line, "gpu%d/gi%d/access %d", &gpu, &gi, &migMinor) 51 | if n == 3 { 52 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/access", gpu, gi) 53 | return capPath, migMinor, nil 54 | } 55 | 56 | // Look for the MIG config file 57 | n, _ = fmt.Sscanf(line, "config %d", &migMinor) 58 | if n == 1 { 59 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/config") 60 | return capPath, migMinor, nil 61 | } 62 | 63 | // Look for the MIG monitor file 64 | n, _ = fmt.Sscanf(line, "monitor %d", &migMinor) 65 | if n == 1 { 66 | capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/monitor") 67 | return capPath, migMinor, nil 68 | } 69 | 70 | return "", 0, fmt.Errorf("unparsable line: %v", line) 71 | } 72 | 73 | // Walk each line of nvcapsMigMinorsPath and construct a mapping of nvidia 74 | // capabilities path to device minor for that capability 75 | capsDevicePaths := make(map[string]string) 76 | scanner := bufio.NewScanner(minorsFile) 77 | for scanner.Scan() { 78 | capPath, migMinor, err := processLine(scanner.Text()) 79 | if err != nil { 80 | klog.Errorf("Skipping line in MIG 
minors file: %v", err) 81 | continue 82 | } 83 | capsDevicePaths[capPath] = fmt.Sprintf(nvcapsDevicePath+"/nvidia-cap%d", migMinor) 84 | } 85 | return capsDevicePaths, nil 86 | } 87 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package plugin 18 | 19 | import "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 20 | 21 | // Interface defines the API for the plugin package 22 | type Interface interface { 23 | Devices() rm.Devices 24 | Start() error 25 | Stop() error 26 | } 27 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package manager 18 | 19 | import "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 20 | 21 | // Interface defines the API for the plugin manager package 22 | type Interface interface { 23 | GetPlugins() ([]plugin.Interface, error) 24 | CreateCDISpecFile() error 25 | } 26 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 21 | ) 22 | 23 | type null struct{} 24 | 25 | // GetPlugins returns an empty set of Plugins for the null manager 26 | func (m *null) GetPlugins() ([]plugin.Interface, error) { 27 | return nil, nil 28 | } 29 | 30 | // CreateCDISpecFile creates the spec is a no-op for the null plugin 31 | func (m *null) CreateCDISpecFile() error { 32 | return nil 33 | } 34 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 24 | ) 25 | 26 | type nvmlmanager manager 27 | 28 | // GetPlugins returns the plugins associated with the NVML resources available on the node 29 | func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { 30 | rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) 31 | if err != nil { 32 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 33 | } 34 | 35 | var plugins []plugin.Interface 36 | for _, r := range rms { 37 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) 38 | } 39 | return plugins, nil 40 | } 41 | 42 | // CreateCDISpecFile creates forwards the request to the CDI handler 43 | func (m *nvmlmanager) CreateCDISpecFile() error { 44 | return m.cdiHandler.CreateSpecFile() 45 | } 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 21 | "4pd.io/k8s-vgpu/pkg/util" 22 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 23 | ) 24 | 25 | // Option is a function that configures a manager 26 | type Option func(*manager) 27 | 28 | // WithCDIEnabled sets whether CDI is enabled for the manager 29 | func WithCDIEnabled(enabled bool) Option { 30 | return func(m *manager) { 31 | m.cdiEnabled = enabled 32 | } 33 | } 34 | 35 | // WithCDIHandler sets the CDI handler for the manager 36 | func WithCDIHandler(handler cdi.Interface) Option { 37 | return func(m *manager) { 38 | m.cdiHandler = handler 39 | } 40 | } 41 | 42 | // WithNVML sets the NVML handler for the manager 43 | func WithNVML(nvmllib nvml.Interface) Option { 44 | return func(m *manager) { 45 | m.nvmllib = nvmllib 46 | } 47 | } 48 | 49 | // WithFailOnInitError sets whether the manager should fail on initialization errors 50 | func WithFailOnInitError(failOnInitError bool) Option { 51 | return func(m *manager) { 52 | m.failOnInitError = failOnInitError 53 | } 54 | } 55 | 56 | // WithMigStrategy sets the MIG strategy for the manager 57 | func WithMigStrategy(migStrategy string) Option { 58 | return func(m *manager) { 59 | m.migStrategy = migStrategy 60 | } 61 | } 62 | 63 | // WithConfig sets the config reference for the manager 64 | func WithConfig(config *util.DeviceConfig) Option { 65 | return func(m *manager) { 66 | m.config = config 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package manager 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 23 | "4pd.io/k8s-vgpu/pkg/device-plugin/nvidiadevice/nvinternal/rm" 24 | ) 25 | 26 | type tegramanager manager 27 | 28 | // GetPlugins returns the plugins associated with the NVML resources available on the node 29 | func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) { 30 | rms, err := rm.NewTegraResourceManagers(m.config) 31 | if err != nil { 32 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 33 | } 34 | 35 | var plugins []plugin.Interface 36 | for _, r := range rms { 37 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) 38 | } 39 | return plugins, nil 40 | } 41 | 42 | // CreateCDISpecFile creates the spec is a no-op for the tegra plugin 43 | func (m *tegramanager) CreateCDISpecFile() error { 44 | return nil 45 | } 46 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package plugin 18 | 19 | import "testing" 20 | 21 | func Test_parseNvidiaNumaInfo(t *testing.T) { 22 | 23 | tests := []struct { 24 | name string 25 | idx int 26 | nvidiaTopoStr string 27 | want int 28 | wantErr bool 29 | }{ 30 | { 31 | name: "single Tesla P4 NUMA", 32 | idx: 0, 33 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... 34 | ...`, 35 | want: 0, 36 | wantErr: false, 37 | }, 38 | { 39 | name: "two Tesla P4 NUMA topo with index 0", 40 | idx: 0, 41 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 42 | ...`, 43 | want: 0, 44 | wantErr: false, 45 | }, 46 | { 47 | name: "two Tesla P4 NUMA topo with index 1", 48 | idx: 1, 49 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 50 | ...`, 51 | want: 0, 52 | wantErr: false, 53 | }, 54 | } 55 | 56 | for _, tt := range tests { 57 | t.Run(tt.name, func(t *testing.T) { 58 | got, err := parseNvidiaNumaInfo(tt.idx, tt.nvidiaTopoStr) 59 | if (err != nil) != tt.wantErr { 60 | t.Errorf("parseNvidiaNumaInfo() error = %v, wantErr %v", err, tt.wantErr) 61 | return 62 | } 63 | if got != tt.want { 64 | t.Errorf("parseNvidiaNumaInfo() got = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "testing" 21 | 22 | spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" 23 | "github.com/stretchr/testify/require" 24 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 25 | ) 26 | 27 | func TestDeviceMapInsert(t *testing.T) { 28 | device0 := Device{Device: pluginapi.Device{ID: "0"}} 29 | device0withIndex := Device{Device: pluginapi.Device{ID: "0"}, Index: "index"} 30 | device1 := Device{Device: pluginapi.Device{ID: "1"}} 31 | 32 | testCases := []struct { 33 | description string 34 | deviceMap DeviceMap 35 | key string 36 | value *Device 37 | expectedDeviceMap DeviceMap 38 | }{ 39 | { 40 | description: "insert into empty map", 41 | deviceMap: make(DeviceMap), 42 | key: "resource", 43 | value: &device0, 44 | expectedDeviceMap: DeviceMap{ 45 | "resource": Devices{ 46 | "0": &device0, 47 | }, 48 | }, 49 | }, 50 | { 51 | description: "add to existing resource", 52 | deviceMap: DeviceMap{ 53 | "resource": Devices{ 54 | "0": &device0, 55 | }, 56 | }, 57 | key: "resource", 58 | value: &device1, 59 | expectedDeviceMap: DeviceMap{ 60 | "resource": Devices{ 61 | "0": &device0, 62 | "1": &device1, 63 | }, 64 | }, 65 | }, 66 | { 67 | description: "add new resource", 68 | deviceMap: DeviceMap{ 69 | "resource": Devices{ 70 | "0": &device0, 71 | }, 72 | }, 73 | key: "resource1", 74 | value: &device0, 75 | expectedDeviceMap: 
DeviceMap{ 76 | "resource": Devices{ 77 | "0": &device0, 78 | }, 79 | "resource1": Devices{ 80 | "0": &device0, 81 | }, 82 | }, 83 | }, 84 | { 85 | description: "overwrite existing device", 86 | deviceMap: DeviceMap{ 87 | "resource": Devices{ 88 | "0": &device0, 89 | }, 90 | }, 91 | key: "resource", 92 | value: &device0withIndex, 93 | expectedDeviceMap: DeviceMap{ 94 | "resource": Devices{ 95 | "0": &device0withIndex, 96 | }, 97 | }, 98 | }, 99 | } 100 | 101 | for _, tc := range testCases { 102 | t.Run(tc.description, func(t *testing.T) { 103 | tc.deviceMap.insert(spec.ResourceName(tc.key), tc.value) 104 | 105 | require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) 106 | }) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/require" 23 | ) 24 | 25 | func TestGetAdditionalXids(t *testing.T) { 26 | testCases := []struct { 27 | description string 28 | input string 29 | expected []uint64 30 | }{ 31 | { 32 | description: "Empty input", 33 | }, 34 | { 35 | description: "Only comma", 36 | input: ",", 37 | }, 38 | { 39 | description: "Non-integer input", 40 | input: "not-an-int", 41 | }, 42 | { 43 | description: "Single integer", 44 | input: "68", 45 | expected: []uint64{68}, 46 | }, 47 | { 48 | description: "Negative integer", 49 | input: "-68", 50 | }, 51 | { 52 | description: "Single integer with trailing spaces", 53 | input: "68 ", 54 | expected: []uint64{68}, 55 | }, 56 | { 57 | description: "Single integer followed by comma without trailing number", 58 | input: "68,", 59 | expected: []uint64{68}, 60 | }, 61 | { 62 | description: "Comma without preceding number followed by single integer", 63 | input: ",68", 64 | expected: []uint64{68}, 65 | }, 66 | { 67 | description: "Two comma-separated integers", 68 | input: "68,67", 69 | expected: []uint64{68, 67}, 70 | }, 71 | { 72 | description: "Two integers separated by non-integer", 73 | input: "68,not-an-int,67", 74 | expected: []uint64{68, 67}, 75 | }, 76 | } 77 | 78 | for _, tc := range testCases { 79 | t.Run(tc.description, func(t *testing.T) { 80 | xids := getAdditionalXids(tc.input) 81 | require.EqualValues(t, tc.expected, xids) 82 | }) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package rm 18 | 19 | // int8Slice wraps an []int8 with more functions. 20 | type int8Slice []int8 21 | 22 | // String turns a nil terminated int8Slice into a string 23 | func (s int8Slice) String() string { 24 | var b []byte 25 | for _, c := range s { 26 | if c == 0 { 27 | break 28 | } 29 | b = append(b, byte(c)) 30 | } 31 | return string(b) 32 | } 33 | 34 | // uintPtr returns a *uint from a uint32 35 | func uintPtr(c uint32) *uint { 36 | i := uint(c) 37 | return &i 38 | } 39 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 24 | "k8s.io/klog/v2" 25 | ) 26 | 27 | type nvmlResourceManager struct { 28 | resourceManager 29 | nvml nvml.Interface 30 | } 31 | 32 | var _ ResourceManager = (*nvmlResourceManager)(nil) 33 | 34 | // NewNVMLResourceManagers returns a set of ResourceManagers, one for each NVML resource in 'config'. 35 | func NewNVMLResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig) ([]ResourceManager, error) { 36 | ret := nvmllib.Init() 37 | if ret != nvml.SUCCESS { 38 | return nil, fmt.Errorf("failed to initialize NVML: %v", ret) 39 | } 40 | defer func() { 41 | ret := nvmllib.Shutdown() 42 | if ret != nvml.SUCCESS { 43 | klog.Infof("Error shutting down NVML: %v", ret) 44 | } 45 | }() 46 | 47 | deviceMap, err := NewDeviceMap(nvmllib, config) 48 | if err != nil { 49 | return nil, fmt.Errorf("error building device map: %v", err) 50 | } 51 | 52 | var rms []ResourceManager 53 | for resourceName, devices := range deviceMap { 54 | if len(devices) == 0 { 55 | continue 56 | } 57 | r := &nvmlResourceManager{ 58 | resourceManager: resourceManager{ 59 | config: config, 60 | resource: resourceName, 61 | devices: devices, 62 | }, 63 | nvml: nvmllib, 64 | } 65 | rms = append(rms, r) 66 | } 67 | 68 | return rms, nil 69 | } 70 | 71 | // GetPreferredAllocation runs an allocation algorithm over the inputs. 72 | // The algorithm chosen is based both on the incoming set of available devices and various config settings. 
73 | func (r *nvmlResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { 74 | return r.getPreferredAllocation(available, required, size) 75 | } 76 | 77 | // GetDevicePaths returns the required and optional device nodes for the requested resources 78 | func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string { 79 | paths := []string{ 80 | "/dev/nvidiactl", 81 | "/dev/nvidia-uvm", 82 | "/dev/nvidia-uvm-tools", 83 | "/dev/nvidia-modeset", 84 | } 85 | 86 | for _, p := range r.Devices().Subset(ids).GetPaths() { 87 | paths = append(paths, p) 88 | } 89 | 90 | return paths 91 | } 92 | 93 | // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices 94 | func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { 95 | return r.checkHealth(stop, r.devices, unhealthy) 96 | } 97 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | ) 24 | 25 | const ( 26 | tegraDeviceName = "tegra" 27 | ) 28 | 29 | // buildTegraDeviceMap creates a DeviceMap for the tegra devices in the sytesm. 30 | // NOTE: At present only a single tegra device is expected. 31 | func buildTegraDeviceMap(config *util.DeviceConfig) (DeviceMap, error) { 32 | devices := make(DeviceMap) 33 | 34 | name := tegraDeviceName 35 | i := 0 36 | for _, resource := range config.Resources.GPUs { 37 | if resource.Pattern.Matches(name) { 38 | index := fmt.Sprintf("%d", i) 39 | err := devices.setEntry(resource.Name, index, &tegraDevice{}) 40 | if err != nil { 41 | return nil, err 42 | } 43 | i++ 44 | } 45 | 46 | } 47 | return devices, nil 48 | } 49 | 50 | type tegraDevice struct{} 51 | 52 | var _ deviceInfo = (*tegraDevice)(nil) 53 | 54 | // GetUUID returns the UUID of the tegra device. 55 | // TODO: This is currently hardcoded to `tegra` 56 | func (d *tegraDevice) GetUUID() (string, error) { 57 | return tegraDeviceName, nil 58 | } 59 | 60 | // GetPaths returns the paths for a tegra device. 61 | // A tegra device does not have paths associated with it. 62 | func (d *tegraDevice) GetPaths() ([]string, error) { 63 | return nil, nil 64 | } 65 | 66 | // GetNumaNode always returns unsupported for a Tegra device 67 | func (d *tegraDevice) GetNumaNode() (bool, int, error) { 68 | return false, -1, nil 69 | } 70 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_manager.go: -------------------------------------------------------------------------------- 1 | /** 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | **/ 16 | 17 | package rm 18 | 19 | import ( 20 | "fmt" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | ) 24 | 25 | type tegraResourceManager struct { 26 | resourceManager 27 | } 28 | 29 | var _ ResourceManager = (*tegraResourceManager)(nil) 30 | 31 | // NewTegraResourceManagers returns a set of ResourceManagers for tegra resources 32 | func NewTegraResourceManagers(config *util.DeviceConfig) ([]ResourceManager, error) { 33 | deviceMap, err := buildTegraDeviceMap(config) 34 | if err != nil { 35 | return nil, fmt.Errorf("error building Tegra device map: %v", err) 36 | } 37 | 38 | deviceMap, err = updateDeviceMapWithReplicas(config, deviceMap) 39 | if err != nil { 40 | return nil, fmt.Errorf("error updating device map with replicas from config.sharing.timeSlicing.resources: %v", err) 41 | } 42 | 43 | var rms []ResourceManager 44 | for resourceName, devices := range deviceMap { 45 | if len(devices) == 0 { 46 | continue 47 | } 48 | r := &tegraResourceManager{ 49 | resourceManager: resourceManager{ 50 | config: config, 51 | resource: resourceName, 52 | devices: devices, 53 | }, 54 | } 55 | if len(devices) != 0 { 56 | rms = append(rms, r) 57 | } 58 | } 59 | 60 | return rms, nil 61 | } 62 | 63 | // GetPreferredAllocation returns a standard allocation for the Tegra resource manager. 
64 | func (r *tegraResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { 65 | return r.distributedAlloc(available, required, size) 66 | } 67 | 68 | // GetDevicePaths returns an empty slice for the tegraResourceManager 69 | func (r *tegraResourceManager) GetDevicePaths(ids []string) []string { 70 | return nil 71 | } 72 | 73 | // CheckHealth is disabled for the tegraResourceManager 74 | func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { 75 | return nil 76 | } 77 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package rm 18 | 19 | type wslDevice nvmlDevice 20 | 21 | var _ deviceInfo = (*wslDevice)(nil) 22 | 23 | // GetUUID returns the UUID of the device 24 | func (d wslDevice) GetUUID() (string, error) { 25 | return nvmlDevice(d).GetUUID() 26 | } 27 | 28 | // GetPaths returns the paths for a tegra device. 
// GetPaths returns the device paths for a WSL device. GPU access under WSL
// goes through the single /dev/dxg node rather than per-GPU device files.
func (d wslDevice) GetPaths() ([]string, error) {
	return []string{"/dev/dxg"}, nil
}

// GetNumaNode returns the NUMA node associated with the GPU device,
// delegating to the underlying nvmlDevice implementation.
func (d wslDevice) GetNumaNode() (bool, int, error) {
	return nvmlDevice(d).GetNumaNode()
}
cambricon.CambriconMLUCommonWord) 51 | DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord) 52 | } 53 | 54 | func PodAllocationTrySuccess(nodeName string, pod *v1.Pod) { 55 | refreshed, _ := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{}) 56 | annos := refreshed.Annotations[util.AssignedIDsToAllocateAnnotations] 57 | klog.Infoln("TrySuccess:", annos) 58 | for _, val := range DevicesToHandle { 59 | if strings.Contains(annos, val) { 60 | return 61 | } 62 | } 63 | klog.Infoln("AllDevicesAllocateSuccess releasing lock") 64 | PodAllocationSuccess(nodeName, pod) 65 | } 66 | 67 | func PodAllocationSuccess(nodeName string, pod *v1.Pod) { 68 | newannos := make(map[string]string) 69 | newannos[util.DeviceBindPhase] = util.DeviceBindSuccess 70 | err := util.PatchPodAnnotations(pod, newannos) 71 | if err != nil { 72 | klog.Errorf("patchPodAnnotations failed:%v", err.Error()) 73 | } 74 | err = nodelock.ReleaseNodeLock(nodeName) 75 | if err != nil { 76 | klog.Errorf("release lock failed:%v", err.Error()) 77 | } 78 | } 79 | 80 | func PodAllocationFailed(nodeName string, pod *v1.Pod) { 81 | newannos := make(map[string]string) 82 | newannos[util.DeviceBindPhase] = util.DeviceBindFailed 83 | err := util.PatchPodAnnotations(pod, newannos) 84 | if err != nil { 85 | klog.Errorf("patchPodAnnotations failed:%v", err.Error()) 86 | } 87 | err = nodelock.ReleaseNodeLock(nodeName) 88 | if err != nil { 89 | klog.Errorf("release lock failed:%v", err.Error()) 90 | } 91 | } 92 | 93 | func GlobalFlagSet() *flag.FlagSet { 94 | fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) 95 | for _, val := range devices { 96 | val.ParseConfig(fs) 97 | } 98 | fs.BoolVar(&DebugMode, "debug", false, "debug mode") 99 | klog.InitFlags(fs) 100 | return fs 101 | } 102 | -------------------------------------------------------------------------------- /pkg/k8sutil/client.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "k8s.io/client-go/kubernetes" 21 | "k8s.io/client-go/rest" 22 | "k8s.io/client-go/tools/clientcmd" 23 | "os" 24 | "path/filepath" 25 | ) 26 | 27 | // NewClient connects to an API server 28 | func NewClient() (kubernetes.Interface, error) { 29 | kubeConfig := os.Getenv("KUBECONFIG") 30 | if kubeConfig == "" { 31 | kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") 32 | } 33 | config, err := rest.InClusterConfig() 34 | if err != nil { 35 | config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) 36 | if err != nil { 37 | return nil, err 38 | } 39 | } 40 | client, err := kubernetes.NewForConfig(config) 41 | return client, err 42 | } 43 | -------------------------------------------------------------------------------- /pkg/k8sutil/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "4pd.io/k8s-vgpu/pkg/device" 21 | "4pd.io/k8s-vgpu/pkg/util" 22 | corev1 "k8s.io/api/core/v1" 23 | "k8s.io/klog/v2" 24 | ) 25 | 26 | func Resourcereqs(pod *corev1.Pod) (counts util.PodDeviceRequests) { 27 | counts = make(util.PodDeviceRequests, len(pod.Spec.Containers)) 28 | //Count Nvidia GPU 29 | for i := 0; i < len(pod.Spec.Containers); i++ { 30 | devices := device.GetDevices() 31 | counts[i] = make(util.ContainerDeviceRequests) 32 | for idx, val := range devices { 33 | request := val.GenerateResourceRequests(&pod.Spec.Containers[i]) 34 | if request.Nums > 0 { 35 | counts[i][idx] = val.GenerateResourceRequests(&pod.Spec.Containers[i]) 36 | } 37 | } 38 | } 39 | klog.InfoS("collect requestreqs", counts) 40 | return counts 41 | } 42 | 43 | func IsPodInTerminatedState(pod *corev1.Pod) bool { 44 | return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded 45 | } 46 | 47 | func AllContainersCreated(pod *corev1.Pod) bool { 48 | return len(pod.Status.ContainerStatuses) >= len(pod.Spec.Containers) 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/runtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // Runtime is an interface for a runtime shim. The Exec method accepts a list 20 | // of command line arguments, and returns an error / nil. 21 | type Runtime interface { 22 | Exec([]string) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "syscall" 23 | 24 | log "github.com/sirupsen/logrus" 25 | ) 26 | 27 | // SyscallExecRuntime wraps the path that a binary and defines the semanitcs for how to exec into it. 28 | // This can be used to wrap an OCI-compliant low-level runtime binary, allowing it to be used through the 29 | // Runtime internface. 
30 | type SyscallExecRuntime struct { 31 | logger *log.Logger 32 | path string 33 | // exec is used for testing. This defaults to syscall.Exec 34 | exec func(argv0 string, argv []string, envv []string) error 35 | } 36 | 37 | var _ Runtime = (*SyscallExecRuntime)(nil) 38 | 39 | // NewSyscallExecRuntime creates a SyscallExecRuntime for the specified path with the standard logger 40 | func NewSyscallExecRuntime(path string) (Runtime, error) { 41 | return NewSyscallExecRuntimeWithLogger(log.StandardLogger(), path) 42 | } 43 | 44 | // NewSyscallExecRuntimeWithLogger creates a SyscallExecRuntime for the specified logger and path 45 | func NewSyscallExecRuntimeWithLogger(logger *log.Logger, path string) (Runtime, error) { 46 | info, err := os.Stat(path) 47 | if err != nil { 48 | return nil, fmt.Errorf("invalid path '%v': %v", path, err) 49 | } 50 | if info.IsDir() || info.Mode()&0111 == 0 { 51 | return nil, fmt.Errorf("specified path '%v' is not an executable file", path) 52 | } 53 | 54 | shim := SyscallExecRuntime{ 55 | logger: logger, 56 | path: path, 57 | exec: syscall.Exec, 58 | } 59 | 60 | return &shim, nil 61 | } 62 | 63 | // Exec exces into the binary at the path from the SyscallExecRuntime struct, passing it the supplied arguments 64 | // after ensuring that the first argument is the path of the target binary. 65 | func (s SyscallExecRuntime) Exec(args []string) error { 66 | runtimeArgs := []string{s.path} 67 | if len(args) > 1 { 68 | runtimeArgs = append(runtimeArgs, args[1:]...) 69 | } 70 | 71 | err := s.exec(s.path, runtimeArgs, os.Environ()) 72 | if err != nil { 73 | return fmt.Errorf("could not exec '%v': %v", s.path, err) 74 | } 75 | 76 | // syscall.Exec is not expected to return. This is an error state regardless of whether 77 | // err is nil or not. 
78 | return fmt.Errorf("unexpected return from exec '%v'", s.path) 79 | } 80 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | package oci 17 | 18 | import ( 19 | "fmt" 20 | "strings" 21 | "testing" 22 | 23 | testlog "github.com/sirupsen/logrus/hooks/test" 24 | "github.com/stretchr/testify/require" 25 | ) 26 | 27 | func TestSyscallExecConstructor(t *testing.T) { 28 | r, err := NewSyscallExecRuntime("////an/invalid/path") 29 | require.Error(t, err) 30 | require.Nil(t, r) 31 | 32 | r, err = NewSyscallExecRuntime("/tmp") 33 | require.Error(t, err) 34 | require.Nil(t, r) 35 | 36 | r, err = NewSyscallExecRuntime("/dev/null") 37 | require.Error(t, err) 38 | require.Nil(t, r) 39 | 40 | r, err = NewSyscallExecRuntime("/bin/sh") 41 | require.NoError(t, err) 42 | 43 | f, ok := r.(*SyscallExecRuntime) 44 | require.True(t, ok) 45 | 46 | require.Equal(t, "/bin/sh", f.path) 47 | } 48 | 49 | func TestSyscallExecForwardsArgs(t *testing.T) { 50 | logger, _ := testlog.NewNullLogger() 51 | f := SyscallExecRuntime{ 52 | logger: logger, 53 | path: "runtime", 54 | } 55 | 56 | testCases := []struct { 57 | returnError error 58 | args []string 59 | errorPrefix string 60 | }{ 61 
| { 62 | returnError: nil, 63 | errorPrefix: "unexpected return from exec", 64 | }, 65 | { 66 | returnError: fmt.Errorf("error from exec"), 67 | errorPrefix: "could not exec", 68 | }, 69 | { 70 | returnError: nil, 71 | args: []string{"otherargv0"}, 72 | errorPrefix: "unexpected return from exec", 73 | }, 74 | { 75 | returnError: nil, 76 | args: []string{"otherargv0", "arg1", "arg2", "arg3"}, 77 | errorPrefix: "unexpected return from exec", 78 | }, 79 | } 80 | 81 | for i, tc := range testCases { 82 | execMock := WithMockExec(f, tc.returnError) 83 | 84 | err := execMock.Exec(tc.args) 85 | 86 | require.Errorf(t, err, "%d: %v", i, tc) 87 | require.Truef(t, strings.HasPrefix(err.Error(), tc.errorPrefix), "%d: %v", i, tc) 88 | if tc.returnError != nil { 89 | require.Truef(t, strings.HasSuffix(err.Error(), tc.returnError.Error()), "%d: %v", i, tc) 90 | } 91 | 92 | require.Equalf(t, f.path, execMock.argv0, "%d: %v", i, tc) 93 | require.Equalf(t, f.path, execMock.argv[0], "%d: %v", i, tc) 94 | 95 | require.LessOrEqualf(t, len(tc.args), len(execMock.argv), "%d: %v", i, tc) 96 | if len(tc.args) > 1 { 97 | require.Equalf(t, tc.args[1:], execMock.argv[1:], "%d: %v", i, tc) 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /pkg/oci/runtime_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // MockExecRuntime wraps a SyscallExecRuntime, intercepting the exec call for testing 20 | type MockExecRuntime struct { 21 | SyscallExecRuntime 22 | execMock 23 | } 24 | 25 | // WithMockExec wraps a specified SyscallExecRuntime with a mocked exec function for testing 26 | func WithMockExec(e SyscallExecRuntime, execResult error) *MockExecRuntime { 27 | m := MockExecRuntime{ 28 | SyscallExecRuntime: e, 29 | execMock: execMock{result: execResult}, 30 | } 31 | // overrdie the exec function to the mocked exec function. 32 | m.SyscallExecRuntime.exec = m.execMock.exec 33 | return &m 34 | } 35 | 36 | type execMock struct { 37 | argv0 string 38 | argv []string 39 | envv []string 40 | result error 41 | } 42 | 43 | func (m *execMock) exec(argv0 string, argv []string, envv []string) error { 44 | m.argv0 = argv0 45 | m.argv = argv 46 | m.envv = envv 47 | 48 | return m.result 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/spec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "encoding/json" 21 | "fmt" 22 | "os" 23 | 24 | oci "github.com/opencontainers/runtime-spec/specs-go" 25 | ) 26 | 27 | // SpecModifier is a function that accepts a pointer to an OCI Srec and returns an 28 | // error. The intention is that the function would modify the spec in-place. 29 | type SpecModifier func(*oci.Spec) error 30 | 31 | // Spec defines the operations to be performed on an OCI specification 32 | type Spec interface { 33 | Load() error 34 | Flush() error 35 | Modify(SpecModifier) error 36 | } 37 | 38 | type fileSpec struct { 39 | *oci.Spec 40 | path string 41 | } 42 | 43 | var _ Spec = (*fileSpec)(nil) 44 | 45 | // NewSpecFromFile creates an object that encapsulates a file-backed OCI spec. 46 | // This can be used to read from the file, modify the spec, and write to the 47 | // same file. 48 | func NewSpecFromFile(filepath string) Spec { 49 | oci := fileSpec{ 50 | path: filepath, 51 | } 52 | 53 | return &oci 54 | } 55 | 56 | // Load reads the contents of an OCI spec from file to be referenced internally. 57 | // The file is opened "read-only" 58 | func (s *fileSpec) Load() error { 59 | specFile, err := os.Open(s.path) 60 | if err != nil { 61 | return fmt.Errorf("error opening OCI specification file: %v", err) 62 | } 63 | defer specFile.Close() 64 | 65 | decoder := json.NewDecoder(specFile) 66 | 67 | var spec oci.Spec 68 | err = decoder.Decode(&spec) 69 | if err != nil { 70 | return fmt.Errorf("error reading OCI specification from file: %v", err) 71 | } 72 | 73 | s.Spec = &spec 74 | return nil 75 | } 76 | 77 | // Modify applies the specified SpecModifier to the stored OCI specification. 78 | func (s *fileSpec) Modify(f SpecModifier) error { 79 | if s.Spec == nil { 80 | return fmt.Errorf("no spec loaded for modification") 81 | } 82 | return f(s.Spec) 83 | } 84 | 85 | // Flush writes the stored OCI specification to the filepath specifed by the path member. 
86 | // The file is truncated upon opening, overwriting any existing contents. 87 | func (s fileSpec) Flush() error { 88 | specFile, err := os.Create(s.path) 89 | if err != nil { 90 | return fmt.Errorf("error opening OCI specification file: %v", err) 91 | } 92 | defer specFile.Close() 93 | 94 | encoder := json.NewEncoder(specFile) 95 | 96 | err = encoder.Encode(s.Spec) 97 | if err != nil { 98 | return fmt.Errorf("error writing OCI specification to file: %v", err) 99 | } 100 | 101 | return nil 102 | } 103 | -------------------------------------------------------------------------------- /pkg/oci/spec_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | oci "github.com/opencontainers/runtime-spec/specs-go" 21 | ) 22 | 23 | // MockSpec provides a simple mock for an OCI spec to be used in testing. 24 | // It also implements the SpecModifier interface. 
25 | type MockSpec struct { 26 | *oci.Spec 27 | MockLoad mockFunc 28 | MockFlush mockFunc 29 | MockModify mockFunc 30 | } 31 | 32 | var _ Spec = (*MockSpec)(nil) 33 | 34 | // NewMockSpec constructs a MockSpec to be used in testing as a Spec 35 | func NewMockSpec(spec *oci.Spec, flushResult error, modifyResult error) *MockSpec { 36 | s := MockSpec{ 37 | Spec: spec, 38 | MockFlush: mockFunc{result: flushResult}, 39 | MockModify: mockFunc{result: modifyResult}, 40 | } 41 | 42 | return &s 43 | } 44 | 45 | // Load invokes the mocked Load function to return the predefined error / result 46 | func (s *MockSpec) Load() error { 47 | return s.MockLoad.call() 48 | } 49 | 50 | // Flush invokes the mocked Load function to return the predefined error / result 51 | func (s *MockSpec) Flush() error { 52 | return s.MockFlush.call() 53 | } 54 | 55 | // Modify applies the specified SpecModifier to the spec and invokes the 56 | // mocked modify function to return the predefined error / result. 57 | func (s *MockSpec) Modify(f SpecModifier) error { 58 | f(s.Spec) 59 | return s.MockModify.call() 60 | } 61 | 62 | type mockFunc struct { 63 | Callcount int 64 | result error 65 | } 66 | 67 | func (m *mockFunc) call() error { 68 | m.Callcount++ 69 | return m.result 70 | } 71 | -------------------------------------------------------------------------------- /pkg/scheduler/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package config 18 | 19 | var ( 20 | HttpBind string 21 | SchedulerName string 22 | DefaultMem int32 23 | DefaultCores int32 24 | MetricsBindAddress string 25 | ) 26 | -------------------------------------------------------------------------------- /pkg/scheduler/nodes.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "fmt" 21 | "strings" 22 | "sync" 23 | 24 | "4pd.io/k8s-vgpu/pkg/util" 25 | "k8s.io/klog/v2" 26 | ) 27 | 28 | type DeviceInfo struct { 29 | ID string 30 | Index uint 31 | Count int32 32 | Devmem int32 33 | Devcore int32 34 | Type string 35 | Numa int 36 | Health bool 37 | } 38 | 39 | type NodeInfo struct { 40 | ID string 41 | Devices []DeviceInfo 42 | } 43 | 44 | type DeviceUsageList []*util.DeviceUsage 45 | 46 | type NodeUsage struct { 47 | Devices DeviceUsageList 48 | } 49 | 50 | type nodeManager struct { 51 | nodes map[string]*NodeInfo 52 | mutex sync.RWMutex 53 | } 54 | 55 | func (m *nodeManager) init() { 56 | m.nodes = make(map[string]*NodeInfo) 57 | } 58 | 59 | func (m *nodeManager) addNode(nodeID string, nodeInfo *NodeInfo) { 60 | if nodeInfo == nil || len(nodeInfo.Devices) == 0 { 61 | return 62 | } 63 | m.mutex.Lock() 64 | defer m.mutex.Unlock() 65 | _, ok := m.nodes[nodeID] 66 | if ok { 67 | tmp := make([]DeviceInfo, 0, len(m.nodes[nodeID].Devices)+len(nodeInfo.Devices)) 68 | tmp = append(tmp, m.nodes[nodeID].Devices...) 69 | tmp = append(tmp, nodeInfo.Devices...) 
70 | m.nodes[nodeID].Devices = tmp 71 | } else { 72 | m.nodes[nodeID] = nodeInfo 73 | } 74 | } 75 | 76 | func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *NodeInfo) { 77 | m.mutex.Lock() 78 | defer m.mutex.Unlock() 79 | _, ok := m.nodes[nodeID] 80 | if ok { 81 | if m.nodes[nodeID].Devices == nil || len(m.nodes[nodeID].Devices) == 0 { 82 | return 83 | } 84 | klog.Infoln("before rm:", m.nodes[nodeID].Devices, "needs remove", nodeInfo.Devices) 85 | tmp := make([]DeviceInfo, 0, len(m.nodes[nodeID].Devices)-len(nodeInfo.Devices)) 86 | for _, val := range m.nodes[nodeID].Devices { 87 | found := false 88 | for _, rmval := range nodeInfo.Devices { 89 | if strings.Compare(val.ID, rmval.ID) == 0 { 90 | found = true 91 | break 92 | } 93 | } 94 | if !found && len(val.ID) > 0 { 95 | tmp = append(tmp, val) 96 | } 97 | } 98 | m.nodes[nodeID].Devices = tmp 99 | klog.Infoln("Rm Devices res:", m.nodes[nodeID].Devices) 100 | } 101 | } 102 | 103 | func (m *nodeManager) GetNode(nodeID string) (*NodeInfo, error) { 104 | m.mutex.RLock() 105 | defer m.mutex.RUnlock() 106 | if n, ok := m.nodes[nodeID]; ok { 107 | return n, nil 108 | } 109 | return &NodeInfo{}, fmt.Errorf("node %v not found", nodeID) 110 | } 111 | 112 | func (m *nodeManager) ListNodes() (map[string]*NodeInfo, error) { 113 | m.mutex.RLock() 114 | defer m.mutex.RUnlock() 115 | return m.nodes, nil 116 | } 117 | -------------------------------------------------------------------------------- /pkg/scheduler/pods.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "sync" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | corev1 "k8s.io/api/core/v1" 24 | k8stypes "k8s.io/apimachinery/pkg/types" 25 | "k8s.io/klog/v2" 26 | ) 27 | 28 | type podInfo struct { 29 | Namespace string 30 | Name string 31 | Uid k8stypes.UID 32 | NodeID string 33 | Devices util.PodDevices 34 | CtrIDs []string 35 | } 36 | 37 | type podManager struct { 38 | pods map[k8stypes.UID]*podInfo 39 | mutex sync.RWMutex 40 | } 41 | 42 | func (m *podManager) init() { 43 | m.pods = make(map[k8stypes.UID]*podInfo) 44 | } 45 | 46 | func (m *podManager) addPod(pod *corev1.Pod, nodeID string, devices util.PodDevices) { 47 | m.mutex.Lock() 48 | defer m.mutex.Unlock() 49 | _, ok := m.pods[pod.UID] 50 | if !ok { 51 | pi := &podInfo{Name: pod.Name, Uid: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices} 52 | m.pods[pod.UID] = pi 53 | klog.Infof("Pod added: Name: %s, Uid: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID) 54 | } 55 | } 56 | 57 | func (m *podManager) delPod(pod *corev1.Pod) { 58 | m.mutex.Lock() 59 | defer m.mutex.Unlock() 60 | pi, ok := m.pods[pod.UID] 61 | if ok { 62 | klog.Infof("Deleted pod %s with node ID %s", pi.Name, pi.NodeID) 63 | delete(m.pods, pod.UID) 64 | } 65 | } 66 | 67 | func (m *podManager) GetScheduledPods() (map[k8stypes.UID]*podInfo, error) { 68 | m.mutex.RLock() 69 | defer m.mutex.RUnlock() 70 | klog.Infof("Getting all scheduled pods with %d nums", len(m.pods)) 71 | return m.pods, nil 72 | } 73 | 
-------------------------------------------------------------------------------- /pkg/scheduler/scheduler_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "testing" 21 | 22 | "4pd.io/k8s-vgpu/pkg/util" 23 | "gotest.tools/v3/assert" 24 | corev1 "k8s.io/api/core/v1" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | ) 27 | 28 | func Test_getNodesUsage(t *testing.T) { 29 | nodeMage := nodeManager{} 30 | nodeMage.init() 31 | nodeMage.addNode("node1", &NodeInfo{ 32 | ID: "node1", 33 | Devices: []DeviceInfo{ 34 | { 35 | ID: "GPU0", 36 | Index: 0, 37 | Count: 10, 38 | Devmem: 1024, 39 | Devcore: 100, 40 | Numa: 1, 41 | Health: true, 42 | }, 43 | { 44 | ID: "GPU1", 45 | Index: 1, 46 | Count: 10, 47 | Devmem: 1024, 48 | Devcore: 100, 49 | Numa: 1, 50 | Health: true, 51 | }, 52 | }, 53 | }) 54 | podDevces := util.PodDevices{ 55 | "NVIDIA": util.PodSingleDevice{ 56 | []util.ContainerDevice{ 57 | { 58 | Idx: 0, 59 | UUID: "GPU0", 60 | Usedmem: 100, 61 | Usedcores: 10, 62 | }, 63 | }, 64 | }, 65 | } 66 | podMap := podManager{} 67 | podMap.init() 68 | podMap.addPod(&corev1.Pod{ 69 | ObjectMeta: metav1.ObjectMeta{ 70 | UID: "1111", 71 | Name: "test1", 72 | Namespace: "default", 73 | }, 74 | }, "node1", podDevces) 75 | podMap.addPod(&corev1.Pod{ 76 | 
ObjectMeta: metav1.ObjectMeta{ 77 | UID: "2222", 78 | Name: "test2", 79 | Namespace: "default", 80 | }, 81 | }, "node1", podDevces) 82 | s := Scheduler{ 83 | nodeManager: nodeMage, 84 | podManager: podMap, 85 | } 86 | nodes := make([]string, 0) 87 | nodes = append(nodes, "node1") 88 | cachenodeMap, _, err := s.getNodesUsage(&nodes, nil) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | assert.Equal(t, len(*cachenodeMap), 1) 93 | v, ok := (*cachenodeMap)["node1"] 94 | assert.Equal(t, ok, true) 95 | assert.Equal(t, len(v.Devices), 2) 96 | assert.Equal(t, v.Devices[0].Used, int32(2)) 97 | assert.Equal(t, v.Devices[0].Usedmem, int32(200)) 98 | assert.Equal(t, v.Devices[0].Usedcores, int32(20)) 99 | } 100 | -------------------------------------------------------------------------------- /pkg/scheduler/webhook.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "context" 21 | "encoding/json" 22 | "fmt" 23 | "net/http" 24 | 25 | "4pd.io/k8s-vgpu/pkg/device" 26 | "4pd.io/k8s-vgpu/pkg/scheduler/config" 27 | corev1 "k8s.io/api/core/v1" 28 | "k8s.io/apimachinery/pkg/runtime" 29 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 30 | "sigs.k8s.io/controller-runtime/pkg/webhook/admission" 31 | ) 32 | 33 | type webhook struct { 34 | decoder *admission.Decoder 35 | } 36 | 37 | func NewWebHook() (*admission.Webhook, error) { 38 | schema := runtime.NewScheme() 39 | if err := clientgoscheme.AddToScheme(schema); err != nil { 40 | return nil, err 41 | } 42 | decoder := admission.NewDecoder(schema) 43 | wh := &admission.Webhook{Handler: &webhook{decoder: decoder}} 44 | return wh, nil 45 | } 46 | 47 | func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Response { 48 | pod := &corev1.Pod{} 49 | err := h.decoder.Decode(req, pod) 50 | if err != nil { 51 | return admission.Errored(http.StatusBadRequest, err) 52 | } 53 | if len(pod.Spec.Containers) == 0 { 54 | return admission.Denied("pod has no containers") 55 | } 56 | //klog.V(1).Infof("hook %v pod %v/%v", req.UID, req.Namespace, req.Name) 57 | fmt.Printf("hook %v pod %v/%v", req.UID, req.Namespace, req.Name) 58 | hasResource := false 59 | for idx, ctr := range pod.Spec.Containers { 60 | c := &pod.Spec.Containers[idx] 61 | if ctr.SecurityContext != nil { 62 | if ctr.SecurityContext.Privileged != nil && *ctr.SecurityContext.Privileged { 63 | continue 64 | } 65 | } 66 | 67 | for _, val := range device.GetDevices() { 68 | hasResource = hasResource || val.MutateAdmission(c) 69 | } 70 | } 71 | 72 | if !hasResource { 73 | return admission.Allowed("no resource found") 74 | } 75 | if len(config.SchedulerName) > 0 { 76 | pod.Spec.SchedulerName = config.SchedulerName 77 | } 78 | marshaledPod, err := json.Marshal(pod) 79 | if err != nil { 80 | return admission.Errored(http.StatusInternalServerError, err) 81 | } 82 
| return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod) 83 | } 84 | -------------------------------------------------------------------------------- /pkg/util/client/client.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "k8s.io/client-go/kubernetes" 8 | "k8s.io/client-go/rest" 9 | "k8s.io/client-go/tools/clientcmd" 10 | "k8s.io/klog" 11 | ) 12 | 13 | var ( 14 | kubeClient kubernetes.Interface 15 | ) 16 | 17 | func init() { 18 | kubeClient, _ = NewClient() 19 | } 20 | 21 | func GetClient() kubernetes.Interface { 22 | return kubeClient 23 | } 24 | 25 | // NewClient connects to an API server 26 | func NewClient() (kubernetes.Interface, error) { 27 | kubeConfig := os.Getenv("KUBECONFIG") 28 | if kubeConfig == "" { 29 | kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") 30 | } 31 | config, err := rest.InClusterConfig() 32 | if err != nil { 33 | klog.Infoln("InClusterConfig failed", err.Error()) 34 | config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) 35 | if err != nil { 36 | klog.Errorln("BuildFromFlags failed", err.Error()) 37 | return nil, err 38 | } 39 | } 40 | client, err := kubernetes.NewForConfig(config) 41 | return client, err 42 | } 43 | -------------------------------------------------------------------------------- /pkg/util/util_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package util 18 | 19 | import ( 20 | "fmt" 21 | "testing" 22 | 23 | "gotest.tools/v3/assert" 24 | ) 25 | 26 | var inRequestDevices map[string]string 27 | 28 | func init() { 29 | inRequestDevices = make(map[string]string) 30 | inRequestDevices["NVIDIA"] = "hami.sh/vgpu-devices-to-allocate" 31 | } 32 | 33 | func TestEmptyContainerDevicesCoding(t *testing.T) { 34 | cd1 := ContainerDevices{} 35 | s := EncodeContainerDevices(cd1) 36 | fmt.Println(s) 37 | cd2, _ := DecodeContainerDevices(s) 38 | assert.DeepEqual(t, cd1, cd2) 39 | } 40 | 41 | func TestEmptyPodDeviceCoding(t *testing.T) { 42 | pd1 := PodDevices{} 43 | s := EncodePodDevices(inRequestDevices, pd1) 44 | fmt.Println(s) 45 | pd2, _ := DecodePodDevices(inRequestDevices, s) 46 | assert.DeepEqual(t, pd1, pd2) 47 | } 48 | 49 | func TestPodDevicesCoding(t *testing.T) { 50 | pd1 := PodDevices{ 51 | "NVIDIA": PodSingleDevice{ 52 | ContainerDevices{ 53 | ContainerDevice{0, "UUID1", "Type1", 1000, 30}, 54 | }, 55 | ContainerDevices{ 56 | ContainerDevice{0, "UUID1", "Type1", 1000, 30}, 57 | }, 58 | }, 59 | } 60 | s := EncodePodDevices(inRequestDevices, pd1) 61 | fmt.Println(s) 62 | pd2, _ := DecodePodDevices(inRequestDevices, s) 63 | assert.DeepEqual(t, pd1, pd2) 64 | } 65 | -------------------------------------------------------------------------------- /pkg/version/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2021 peizhaoyou 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "fmt" 21 | "github.com/spf13/cobra" 22 | ) 23 | 24 | var ( 25 | version string 26 | VersionCmd = &cobra.Command{ 27 | Use: "version", 28 | Short: "print version", 29 | Run: func(cmd *cobra.Command, args []string) { 30 | fmt.Println(Version()) 31 | }, 32 | } 33 | ) 34 | 35 | func Version() string { 36 | return version 37 | } 38 | -------------------------------------------------------------------------------- /version.mk: -------------------------------------------------------------------------------- 1 | GO=go 2 | GO111MODULE=on 3 | CMDS=scheduler vGPUmonitor 4 | DEVICES=nvidia 5 | OUTPUT_DIR=bin 6 | TARGET_ARCH=amd64 7 | GOLANG_IMAGE=golang:1.21-bullseye 8 | NVIDIA_IMAGE=nvidia/cuda:11.2.2-base-ubuntu20.04 9 | DEST_DIR=/usr/local/vgpu/ 10 | 11 | VERSION = v0.0.1 12 | IMG_NAME ="k8s-vgpu-scheduler" 13 | IMG_TAG="${IMG_NAME}:${VERSION}" --------------------------------------------------------------------------------