├── .github └── workflows │ ├── dev.yml │ └── go.yml ├── Dockerfile ├── Makefile ├── README.md ├── README_cn.md ├── ascend-device-plugin-2.4.0.yaml ├── ascend-device-plugin-2.4.1.yaml ├── cmd └── main.go ├── config.yaml ├── examples ├── ascendjob-310p.yaml └── ascendjob-910b.yaml ├── go.mod ├── go.sum ├── internal ├── manager │ └── manager.go ├── server │ └── server.go ├── vnpu.go └── wachers.go └── version └── version.go /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: CI 5 | 6 | on: 7 | pull_request: 8 | branches: [ "main" ] 9 | 10 | env: 11 | GO_VERSION: "1.22.5" 12 | 13 | jobs: 14 | golangci: 15 | name: lint 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-go@v5 20 | with: 21 | go-version: ${{ env.GO_VERSION }} 22 | - name: golangci-lint 23 | uses: golangci/golangci-lint-action@v6 24 | with: 25 | version: v1.60 26 | 27 | build: 28 | env: 29 | IMAGE_NAME: ${{ secrets.IMAGE_NAME || 'projecthami/ascend-device-plugin' }} 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - name: Get branch names. 
35 | id: branch-names 36 | uses: tj-actions/branch-names@v8 37 | 38 | - name: Set up QEMU 39 | uses: docker/setup-qemu-action@v3 40 | 41 | - name: Set up Docker Buildx 42 | uses: docker/setup-buildx-action@v3 43 | 44 | - name: Docker Login 45 | uses: docker/login-action@v3.1.0 46 | with: 47 | username: ${{ secrets.DOCKERHUB_TOKEN }} 48 | password: ${{ secrets.DOCKERHUB_PASSWD }} 49 | 50 | - name: Build and push 51 | uses: docker/build-push-action@v6 52 | with: 53 | platforms: linux/amd64,linux/arm64 54 | push: true 55 | build-args: | 56 | BASE_IMAGE=ubuntu:20.04 57 | GO_VERSION=${{ env.GO_VERSION }} 58 | VERSION=${{ steps.branch-names.outputs.current_branch || steps.branch-names.outputs.tag }}-${{ github.sha }} 59 | tags: ${{ env.IMAGE_NAME }}:dev 60 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: [ "master", "main" ] 9 | tags: 10 | - v[0-9]+.[0-9]+.[0-9]+.[0-9]+ 11 | - v[0-9]+.[0-9]+.[0-9]+ 12 | - v[0-9]+.[0-9]+ 13 | pull_request: 14 | branches: [ "master" ] 15 | 16 | env: 17 | GO_VERSION: "1.22.5" 18 | 19 | jobs: 20 | golangci: 21 | name: lint 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | - uses: actions/setup-go@v5 26 | with: 27 | go-version: ${{ env.GO_VERSION }} 28 | - name: golangci-lint 29 | uses: golangci/golangci-lint-action@v6 30 | with: 31 | version: v1.60 32 | 33 | build: 34 | env: 35 | IMAGE_NAME: ${{ secrets.IMAGE_NAME || 'projecthami/ascend-device-plugin' }} 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - name: Get branch names. 
41 | id: branch-names 42 | uses: tj-actions/branch-names@v8 43 | 44 | - name: Set up QEMU 45 | uses: docker/setup-qemu-action@v3 46 | 47 | - name: Set up Docker Buildx 48 | uses: docker/setup-buildx-action@v3 49 | 50 | - name: Docker Login 51 | uses: docker/login-action@v3.1.0 52 | with: 53 | username: ${{ secrets.DOCKERHUB_TOKEN }} 54 | password: ${{ secrets.DOCKERHUB_PASSWD }} 55 | 56 | - name: Build and push 57 | uses: docker/build-push-action@v6 58 | with: 59 | platforms: linux/amd64,linux/arm64 60 | push: true 61 | build-args: | 62 | BASE_IMAGE=ubuntu:20.04 63 | GO_VERSION=${{ env.GO_VERSION }} 64 | VERSION=${{ steps.branch-names.outputs.current_branch || steps.branch-names.outputs.tag }}-${{ github.sha }} 65 | tags: ${{ env.IMAGE_NAME }}:${{ steps.branch-names.outputs.current_branch || steps.branch-names.outputs.tag }} 66 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=ubuntu:20.04 2 | FROM $BASE_IMAGE AS build 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | RUN apt update -y && apt install -y gcc make wget software-properties-common 6 | RUN add-apt-repository ppa:longsleep/golang-backports 7 | RUN apt update 8 | RUN apt install -y golang-1.22 9 | ENV PATH=/usr/lib/go-1.22/bin:/usr/local/go/bin:/root/go/bin:$PATH 10 | ARG GOPROXY 11 | ENV GOPATH=/go 12 | ARG VERSION 13 | WORKDIR /build 14 | ADD . . 
15 | RUN --mount=type=cache,target=/go/pkg/mod \ 16 | make all 17 | 18 | FROM $BASE_IMAGE 19 | ENV LD_LIBRARY_PATH /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common 20 | COPY --from=build /build/ascend-device-plugin /usr/local/bin/ascend-device-plugin 21 | 22 | ENTRYPOINT ["ascend-device-plugin"] 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GO ?= go 2 | VERSION ?= unknown 3 | BUILDARGS ?= -ldflags '-s -w -X github.com/Project-HAMi/ascend-device-plugin/version.version=$(VERSION)' 4 | IMG_NAME = projecthami/ascend-device-plugin 5 | 6 | all: ascend-device-plugin 7 | 8 | tidy: 9 | $(GO) mod tidy 10 | 11 | docker: 12 | docker build \ 13 | --build-arg BASE_IMAGE=ubuntu:20.04 \ 14 | --build-arg GOPROXY=https://goproxy.cn,direct \ 15 | -t ${IMG_NAME}:${VERSION} . 16 | 17 | lint: tidy 18 | $(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 19 | golangci-lint run 20 | 21 | ascend-device-plugin: tidy 22 | $(GO) build $(BUILDARGS) -o ./ascend-device-plugin ./cmd/main.go 23 | 24 | clean: 25 | rm -rf ./ascend-device-plugin 26 | 27 | .PHONY: all clean -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ascend Device Plugin 2 | 3 | ## Introduction 4 | 5 | This Ascend device plugin is implemented for [HAMi](https://github.com/Project-HAMi/HAMi) scheduling. 6 | 7 | Memory slicing is supported based on virtualization templates; the smallest template that satisfies the memory request is automatically used.
For detailed information, check [template](./config.yaml) 8 | 9 | ## Prerequisites 10 | 11 | [ascend-docker-runtime](https://gitee.com/ascend/ascend-docker-runtime) 12 | 13 | ## Compile 14 | 15 | ```bash 16 | make all 17 | ``` 18 | 19 | ### Build 20 | 21 | ```bash 22 | docker buildx build -t $IMAGE_NAME . 23 | ``` 24 | 25 | ## Deployment 26 | 27 | Due to dependencies with HAMi, you need to set 28 | 29 | ``` 30 | devices.ascend.enabled=true 31 | ``` 32 | 33 | during HAMi installation. For more details, see 'devices' section in values.yaml. 34 | 35 | ```yaml 36 | devices: 37 | ascend: 38 | enabled: true 39 | image: "ascend-device-plugin:master" 40 | imagePullPolicy: IfNotPresent 41 | extraArgs: [] 42 | nodeSelector: 43 | ascend: "on" 44 | tolerations: [] 45 | resources: 46 | - huawei.com/Ascend910A 47 | - huawei.com/Ascend910A-memory 48 | - huawei.com/Ascend910B 49 | - huawei.com/Ascend910B-memory 50 | - huawei.com/Ascend310P 51 | - huawei.com/Ascend310P-memory 52 | ``` 53 | 54 | Note that the resources here (huawei.com/Ascend910A, huawei.com/Ascend910B, ...) are managed in the hami-scheduler-device ConfigMap, which defines three different templates (910A, 910B, 310P). 55 | 56 | Label your NPU nodes with 'ascend=on' 57 | 58 | ``` 59 | kubectl label node {ascend-node} ascend=on 60 | ``` 61 | 62 | Deploy ascend-device-plugin by running 63 | 64 | ```bash 65 | kubectl apply -f ascend-device-plugin.yaml 66 | ``` 67 | 68 | 69 | ## Usage 70 | 71 | You can allocate a slice of NPU by specifying both resource number and resource memory. For more examples, see [examples](./examples/) 72 | 73 | ```yaml 74 | ... 75 | containers: 76 | - name: npu_pod 77 | ... 78 | resources: 79 | limits: 80 | huawei.com/Ascend910B: "1" 81 | # if you don't specify Ascend910B-memory, it will use a whole NPU.
82 | huawei.com/Ascend910B-memory: "4096" 83 | ``` 84 | -------------------------------------------------------------------------------- /README_cn.md: -------------------------------------------------------------------------------- 1 | # Ascend Device Plugin 2 | 3 | ## 说明 4 | 5 | 基于[HAMi](https://github.com/Project-HAMi/HAMi)调度机制的ascend device plugin。 6 | 7 | 支持基于显存调度,显存是基于昇腾的虚拟化模板来切分的,会找到满足显存需求的最小模板来作为容器的显存。模版的具体信息参考[配置模版](./config.yaml) 8 | 9 | 启动容器依赖[ascend-docker-runtime](https://gitee.com/ascend/ascend-docker-runtime)。 10 | 11 | ## 编译 12 | 13 | ### 编译二进制文件 14 | 15 | ```bash 16 | make all 17 | ``` 18 | 19 | ### 编译镜像 20 | 21 | ```bash 22 | docker buildx build -t $IMAGE_NAME . 23 | ``` 24 | 25 | ## 部署 26 | 27 | 由于和HAMi的一些依赖关系,部署集成在HAMi的部署中,指定以下字段: 28 | 29 | ``` 30 | devices.ascend.enabled=true 31 | ``` 32 | 33 | 相关的每一种NPU设备的资源名,参考values.yaml中的以下字段,目前本组件支持3种型号的NPU切片(310p,910A,910B)若不需要修改的话可以直接使用以下的默认配置: 34 | 35 | ```yaml 36 | devices: 37 | ascend: 38 | enabled: true 39 | image: "ascend-device-plugin:master" 40 | imagePullPolicy: IfNotPresent 41 | extraArgs: [] 42 | nodeSelector: 43 | ascend: "on" 44 | tolerations: [] 45 | resources: 46 | - huawei.com/Ascend910A 47 | - huawei.com/Ascend910A-memory 48 | - huawei.com/Ascend910B 49 | - huawei.com/Ascend910B-memory 50 | - huawei.com/Ascend310P 51 | - huawei.com/Ascend310P-memory 52 | ``` 53 | 54 | 将集群中的NPU节点打上如下标签: 55 | 56 | ``` 57 | kubectl label node {ascend-node} ascend=on 58 | ``` 59 | 60 | 最后使用以下指令部署ascend-device-plugin 61 | 62 | ```bash 63 | kubectl apply -f ascend-device-plugin.yaml 64 | ``` 65 | 66 | ## 使用 67 | 68 | ```yaml 69 | ... 70 | containers: 71 | - name: npu_pod 72 | ... 
73 | resources: 74 | limits: 75 | huawei.com/Ascend910B: "1" 76 | # 不填写显存默认使用整张卡 77 | huawei.com/Ascend910B-memory: "4096" 78 | ``` 79 | -------------------------------------------------------------------------------- /ascend-device-plugin-2.4.0.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: hami-ascend 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods"] 9 | verbs: ["get", "list", "update", "watch", "patch"] 10 | - apiGroups: [""] 11 | resources: ["nodes"] 12 | verbs: ["get", "update", "patch"] 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRoleBinding 16 | metadata: 17 | name: hami-ascend 18 | subjects: 19 | - kind: ServiceAccount 20 | name: hami-ascend 21 | namespace: kube-system 22 | roleRef: 23 | kind: ClusterRole 24 | name: hami-ascend 25 | apiGroup: rbac.authorization.k8s.io 26 | --- 27 | apiVersion: v1 28 | kind: ServiceAccount 29 | metadata: 30 | name: hami-ascend 31 | namespace: kube-system 32 | labels: 33 | app.kubernetes.io/component: "hami-ascend" 34 | --- 35 | apiVersion: apps/v1 36 | kind: DaemonSet 37 | metadata: 38 | name: hami-ascend-device-plugin 39 | namespace: kube-system 40 | labels: 41 | app.kubernetes.io/component: hami-ascend-device-plugin 42 | spec: 43 | selector: 44 | matchLabels: 45 | app.kubernetes.io/component: hami-ascend-device-plugin 46 | hami.io/webhook: ignore 47 | template: 48 | metadata: 49 | labels: 50 | app.kubernetes.io/component: hami-ascend-device-plugin 51 | hami.io/webhook: ignore 52 | spec: 53 | priorityClassName: "system-node-critical" 54 | serviceAccountName: hami-ascend 55 | containers: 56 | - image: projecthami/ascend-device-plugin:main 57 | imagePullPolicy: IfNotPresent 58 | name: device-plugin 59 | resources: 60 | requests: 61 | memory: 500Mi 62 | cpu: 500m 63 | limits: 64 | memory: 500Mi 65 | cpu: 500m 66 | args: 67 | - --config_file 68 | - 
/ascend-config.yaml 69 | securityContext: 70 | privileged: true 71 | readOnlyRootFilesystem: false 72 | volumeMounts: 73 | - name: device-plugin 74 | mountPath: /var/lib/kubelet/device-plugins 75 | - name: pod-resource 76 | mountPath: /var/lib/kubelet/pod-resources 77 | - name: hiai-driver 78 | mountPath: /usr/local/Ascend/driver 79 | readOnly: true 80 | - name: log-path 81 | mountPath: /var/log/mindx-dl/devicePlugin 82 | - name: tmp 83 | mountPath: /tmp 84 | - name: ascend-config 85 | mountPath: /ascend-config.yaml 86 | subPath: ascend-config.yaml 87 | readOnly: true 88 | env: 89 | - name: NODE_NAME 90 | valueFrom: 91 | fieldRef: 92 | fieldPath: spec.nodeName 93 | volumes: 94 | - name: device-plugin 95 | hostPath: 96 | path: /var/lib/kubelet/device-plugins 97 | - name: pod-resource 98 | hostPath: 99 | path: /var/lib/kubelet/pod-resources 100 | - name: hiai-driver 101 | hostPath: 102 | path: /usr/local/Ascend/driver 103 | - name: log-path 104 | hostPath: 105 | path: /var/log/mindx-dl/devicePlugin 106 | type: Directory 107 | - name: tmp 108 | hostPath: 109 | path: /tmp 110 | - name: ascend-config 111 | configMap: 112 | name: hami-scheduler-device 113 | nodeSelector: 114 | ascend: "on" 115 | -------------------------------------------------------------------------------- /ascend-device-plugin-2.4.1.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: hami-ascend 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods"] 9 | verbs: ["get", "list", "update", "watch", "patch"] 10 | - apiGroups: [""] 11 | resources: ["nodes"] 12 | verbs: ["get", "update", "patch"] 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRoleBinding 16 | metadata: 17 | name: hami-ascend 18 | subjects: 19 | - kind: ServiceAccount 20 | name: hami-ascend 21 | namespace: kube-system 22 | roleRef: 23 | kind: ClusterRole 24 | name: hami-ascend 25 | 
apiGroup: rbac.authorization.k8s.io 26 | --- 27 | apiVersion: v1 28 | kind: ServiceAccount 29 | metadata: 30 | name: hami-ascend 31 | namespace: kube-system 32 | labels: 33 | app.kubernetes.io/component: "hami-ascend" 34 | --- 35 | apiVersion: apps/v1 36 | kind: DaemonSet 37 | metadata: 38 | name: hami-ascend-device-plugin 39 | namespace: kube-system 40 | labels: 41 | app.kubernetes.io/component: hami-ascend-device-plugin 42 | spec: 43 | selector: 44 | matchLabels: 45 | app.kubernetes.io/component: hami-ascend-device-plugin 46 | hami.io/webhook: ignore 47 | template: 48 | metadata: 49 | labels: 50 | app.kubernetes.io/component: hami-ascend-device-plugin 51 | hami.io/webhook: ignore 52 | spec: 53 | priorityClassName: "system-node-critical" 54 | serviceAccountName: hami-ascend 55 | containers: 56 | - image: projecthami/ascend-device-plugin:v1.0.1 57 | imagePullPolicy: IfNotPresent 58 | name: device-plugin 59 | resources: 60 | requests: 61 | memory: 500Mi 62 | cpu: 500m 63 | limits: 64 | memory: 500Mi 65 | cpu: 500m 66 | args: 67 | - --config_file 68 | - /device-config.yaml 69 | securityContext: 70 | privileged: true 71 | readOnlyRootFilesystem: false 72 | volumeMounts: 73 | - name: device-plugin 74 | mountPath: /var/lib/kubelet/device-plugins 75 | - name: pod-resource 76 | mountPath: /var/lib/kubelet/pod-resources 77 | - name: hiai-driver 78 | mountPath: /usr/local/Ascend/driver 79 | readOnly: true 80 | - name: log-path 81 | mountPath: /var/log/mindx-dl/devicePlugin 82 | - name: tmp 83 | mountPath: /tmp 84 | - name: ascend-config 85 | mountPath: /device-config.yaml 86 | subPath: device-config.yaml 87 | readOnly: true 88 | env: 89 | - name: NODE_NAME 90 | valueFrom: 91 | fieldRef: 92 | fieldPath: spec.nodeName 93 | volumes: 94 | - name: device-plugin 95 | hostPath: 96 | path: /var/lib/kubelet/device-plugins 97 | - name: pod-resource 98 | hostPath: 99 | path: /var/lib/kubelet/pod-resources 100 | - name: hiai-driver 101 | hostPath: 102 | path: /usr/local/Ascend/driver 
103 | - name: log-path 104 | hostPath: 105 | path: /var/log/mindx-dl/devicePlugin 106 | type: Directory 107 | - name: tmp 108 | hostPath: 109 | path: /tmp 110 | - name: ascend-config 111 | configMap: 112 | name: hami-scheduler-device 113 | nodeSelector: 114 | ascend: "on" 115 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 The HAMi Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "context" 21 | "flag" 22 | "fmt" 23 | "os" 24 | "syscall" 25 | 26 | "github.com/Project-HAMi/ascend-device-plugin/internal" 27 | "github.com/Project-HAMi/ascend-device-plugin/internal/manager" 28 | "github.com/Project-HAMi/ascend-device-plugin/internal/server" 29 | "github.com/Project-HAMi/ascend-device-plugin/version" 30 | "github.com/fsnotify/fsnotify" 31 | "huawei.com/npu-exporter/v6/common-utils/hwlog" 32 | "k8s.io/klog/v2" 33 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 34 | ) 35 | 36 | var ( 37 | hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") 38 | configFile = flag.String("config_file", "", "config file path") 39 | nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") 40 | ) 41 | 42 | func checkFlags() { 43 | version.CheckVersionFlag() 44 | if *configFile == "" { 45 | klog.Fatalf("config file not set, use --config_file to set config file path") 46 | } 47 | if *nodeName == "" { 48 | klog.Fatalf("node name not set, use --node_name or env NODE_NAME to set node name") 49 | } 50 | } 51 | 52 | func start(ps *server.PluginServer) error { 53 | klog.Info("Starting FS watcher.") 54 | watcher, err := internal.NewFSWatcher(v1beta1.DevicePluginPath) 55 | if err != nil { 56 | return fmt.Errorf("failed to create FS watcher: %v", err) 57 | } 58 | defer func(watcher *fsnotify.Watcher) { 59 | _ = watcher.Close() 60 | }(watcher) 61 | 62 | klog.Info("Starting OS watcher.") 63 | sigs := internal.NewOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 64 | 65 | var restarting bool 66 | //var restartTimeout <-chan time.Time 67 | restart: 68 | if restarting { 69 | err := ps.Stop() 70 | if err != nil { 71 | klog.Errorf("Failed to stop plugin server: %v", err) 72 | } 73 | } 74 | restarting = true 75 | klog.Info("Starting Plugins.") 76 | err = ps.Start() 77 | if err != nil { 78 | klog.Errorf("Failed to start 
plugin server: %v", err) 79 | return err 80 | } 81 | 82 | for { 83 | select { 84 | //case <-restartTimeout: 85 | // goto restart 86 | case event := <-watcher.Events: 87 | if event.Name == v1beta1.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { 88 | klog.Infof("inotify: %s created, restarting.", v1beta1.KubeletSocket) 89 | goto restart 90 | } 91 | case err := <-watcher.Errors: 92 | klog.Errorf("inotify: %s", err) 93 | case s := <-sigs: 94 | switch s { 95 | case syscall.SIGHUP: 96 | klog.Info("Received SIGHUP, restarting.") 97 | goto restart 98 | default: 99 | klog.Infof("Received signal \"%v\", shutting down.", s) 100 | goto exit 101 | } 102 | } 103 | } 104 | exit: 105 | err = ps.Stop() 106 | if err != nil { 107 | klog.Errorf("Failed to stop plugin server: %v", err) 108 | return err 109 | } 110 | return nil 111 | } 112 | 113 | func main() { 114 | klog.InitFlags(nil) 115 | flag.Parse() 116 | checkFlags() 117 | klog.Infof("version: %s", version.GetVersion()) 118 | klog.Infof("using config file: %s", *configFile) 119 | config := &hwlog.LogConfig{ 120 | OnlyToStdout: true, 121 | LogLevel: *hwLoglevel, 122 | } 123 | err := hwlog.InitRunLogger(config, context.Background()) 124 | if err != nil { 125 | klog.Fatalf("init huawei run logger failed, %v", err) 126 | } 127 | mgr, err := manager.NewAscendManager() 128 | if err != nil { 129 | klog.Fatalf("init AscendManager failed, error is %v", err) 130 | } 131 | err = mgr.LoadConfig(*configFile) 132 | if err != nil { 133 | klog.Fatalf("load config failed, error is %v", err) 134 | } 135 | server, err := server.NewPluginServer(mgr, *nodeName) 136 | if err != nil { 137 | klog.Fatalf("init PluginServer failed, error is %v", err) 138 | } 139 | 140 | err = start(server) 141 | if err != nil { 142 | klog.Fatalf("start PluginServer failed, error is %v", err) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /config.yaml: 
-------------------------------------------------------------------------------- 1 | vnpus: 2 | - chipName: 910B 3 | commonWord: Ascend910A 4 | resourceName: huawei.com/Ascend910A 5 | resourceMemoryName: huawei.com/Ascend910A-memory 6 | memoryAllocatable: 32768 7 | memoryCapacity: 32768 8 | aiCore: 30 9 | templates: 10 | - name: vir02 11 | memory: 2184 12 | aiCore: 2 13 | - name: vir04 14 | memory: 4369 15 | aiCore: 4 16 | - name: vir08 17 | memory: 8738 18 | aiCore: 8 19 | - name: vir16 20 | memory: 17476 21 | aiCore: 16 22 | - chipName: 910B3 23 | commonWord: Ascend910B 24 | resourceName: huawei.com/Ascend910B 25 | resourceMemoryName: huawei.com/Ascend910B-memory 26 | memoryAllocatable: 65536 27 | memoryCapacity: 65536 28 | aiCore: 20 29 | aiCPU: 7 30 | templates: 31 | - name: vir05_1c_16g 32 | memory: 16384 33 | aiCore: 5 34 | aiCPU: 1 35 | - name: vir10_3c_32g 36 | memory: 32768 37 | aiCore: 10 38 | aiCPU: 3 39 | - chipName: 310P3 40 | commonWord: Ascend310P 41 | resourceName: huawei.com/Ascend310P 42 | resourceMemoryName: huawei.com/Ascend310P-memory 43 | memoryAllocatable: 21527 44 | memoryCapacity: 24576 45 | aiCore: 8 46 | aiCPU: 7 47 | templates: 48 | - name: vir01 49 | memory: 3072 50 | aiCore: 1 51 | aiCPU: 1 52 | - name: vir02 53 | memory: 6144 54 | aiCore: 2 55 | aiCPU: 2 56 | - name: vir04 57 | memory: 12288 58 | aiCore: 4 59 | aiCPU: 4 60 | -------------------------------------------------------------------------------- /examples/ascendjob-310p.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend310p-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-pytorch:24.0.RC1-A2-1.11.0-ubuntu20.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend310P: 1 # requesting 1 NPU 13 | huawei.com/Ascend310P-memory: 2000 # requesting 2000m device memory
-------------------------------------------------------------------------------- /examples/ascendjob-910b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Project-HAMi/ascend-device-plugin 2 | 3 | go 1.22.2 4 | 5 | require ( 6 | github.com/Project-HAMi/HAMi v0.0.0 7 | github.com/fsnotify/fsnotify v1.7.0 8 | google.golang.org/grpc v1.63.2 9 | huawei.com/npu-exporter/v6 v6.0.0-RC3.b001 10 | k8s.io/api v0.29.3 11 | k8s.io/apimachinery v0.29.3 12 | k8s.io/klog/v2 v2.120.1 13 | k8s.io/kubelet v0.29.3 14 | ) 15 | 16 | require ( 17 | github.com/davecgh/go-spew v1.1.1 // indirect 18 | github.com/emicklei/go-restful/v3 v3.11.3 // indirect 19 | github.com/go-logr/logr v1.4.1 // indirect 20 | github.com/go-openapi/jsonpointer v0.20.2 // indirect 21 | github.com/go-openapi/jsonreference v0.20.4 // indirect 22 | github.com/go-openapi/swag v0.22.9 // indirect 23 | github.com/gogo/protobuf v1.3.2 // indirect 24 | github.com/golang/protobuf v1.5.4 // indirect 25 | github.com/google/gnostic-models v0.6.8 // indirect 26 | github.com/google/gofuzz v1.2.0 // indirect 27 | github.com/google/uuid v1.6.0 // indirect 28 | github.com/imdario/mergo v0.3.16 // indirect 29 | github.com/josharian/intern v1.0.0 // indirect 30 | github.com/json-iterator/go v1.1.12 // indirect 31 | github.com/mailru/easyjson v0.7.7 // indirect 32 | github.com/modern-go/concurrent 
v0.0.0-20180306012644-bacd9c7ef1dd // indirect 33 | github.com/modern-go/reflect2 v1.0.2 // indirect 34 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 35 | github.com/onsi/ginkgo/v2 v2.17.1 // indirect 36 | github.com/onsi/gomega v1.32.0 // indirect 37 | github.com/smartystreets/goconvey v1.7.2 // indirect 38 | github.com/spf13/pflag v1.0.5 // indirect 39 | golang.org/x/net v0.26.0 // indirect 40 | golang.org/x/oauth2 v0.17.0 // indirect 41 | golang.org/x/sys v0.21.0 // indirect 42 | golang.org/x/term v0.21.0 // indirect 43 | golang.org/x/text v0.16.0 // indirect 44 | golang.org/x/time v0.5.0 // indirect 45 | google.golang.org/appengine v1.6.8 // indirect 46 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect 47 | google.golang.org/protobuf v1.33.0 // indirect 48 | gopkg.in/inf.v0 v0.9.1 // indirect 49 | gopkg.in/yaml.v2 v2.4.0 // indirect 50 | gopkg.in/yaml.v3 v3.0.1 // indirect 51 | k8s.io/client-go v0.29.3 // indirect 52 | k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 // indirect 53 | k8s.io/utils v0.0.0-20240102154912-e7106e64919e // indirect 54 | sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect 55 | sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect 56 | sigs.k8s.io/yaml v1.4.0 // indirect 57 | ) 58 | 59 | replace ( 60 | github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6 61 | huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 62 | ) 63 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 h1:gmcdFAckl3OCubjk8Mz7jgYWBHm+7pzkmQ19/afghhY= 2 | gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3/go.mod h1:tQw2ukw5YzlXWJa5cDfY8TNcTiBieor69lsdHFEiMZ8= 3 | github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6 
h1:5SbvXn7H5spMTgCM4+sF6zm113WVCceUuOuwItkqELY= 4 | github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8= 5 | github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= 6 | github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= 7 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 9 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 10 | github.com/emicklei/go-restful/v3 v3.11.3 h1:yagOQz/38xJmcNeZJtrUcKjkHRltIaIFXKWeG1SkWGE= 11 | github.com/emicklei/go-restful/v3 v3.11.3/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= 12 | github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= 13 | github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= 14 | github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= 15 | github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 16 | github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= 17 | github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 18 | github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= 19 | github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= 20 | github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdXSSgNeAhojU= 21 | github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= 22 | github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE= 23 | github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE= 24 | 
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= 25 | github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= 26 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 27 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 28 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 29 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 30 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 31 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 32 | github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= 33 | github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= 34 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 35 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 36 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 37 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 38 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 39 | github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= 40 | github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 41 | github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= 42 | github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= 43 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 44 | github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 45 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= 46 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= 47 | github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= 48 | github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= 49 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 50 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 51 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 52 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 53 | github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= 54 | github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= 55 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 56 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 57 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 58 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 59 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 60 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 61 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 62 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 63 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 64 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= 65 | 
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 66 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= 67 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 68 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 69 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 70 | github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= 71 | github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= 72 | github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= 73 | github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= 74 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 75 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 76 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 77 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 78 | github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= 79 | github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= 80 | github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs= 81 | github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= 82 | github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= 83 | github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= 84 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 85 | github.com/spf13/pflag v1.0.5/go.mod 
h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 86 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 87 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 88 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 89 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 90 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 91 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 92 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 93 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 94 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 95 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 96 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 97 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 98 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 99 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 100 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 101 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 102 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 103 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 104 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod 
h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 105 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 106 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 107 | golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= 108 | golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= 109 | golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= 110 | golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= 111 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 112 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 113 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 114 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 115 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 116 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 117 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 118 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 119 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 120 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 121 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 122 | golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= 123 | golang.org/x/sys v0.21.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 124 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 125 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 126 | golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= 127 | golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= 128 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 129 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 130 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 131 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= 132 | golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= 133 | golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= 134 | golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= 135 | golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 136 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 137 | golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 138 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 139 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 140 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 141 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 142 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= 143 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod 
h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 144 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 145 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 146 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 147 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 148 | google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= 149 | google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= 150 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:cZGRis4/ot9uVm639a+rHCUaG0JJHEsdyzSQTMX+suY= 151 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= 152 | google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= 153 | google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= 154 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 155 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 156 | google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= 157 | google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= 158 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 159 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 160 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 161 | gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 162 | gopkg.in/inf.v0 v0.9.1/go.mod 
h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= 163 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 164 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 165 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 166 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 167 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 168 | gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= 169 | gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= 170 | k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw= 171 | k8s.io/api v0.29.3/go.mod h1:y2yg2NTyHUUkIoTC+phinTnEa3KFM6RZ3szxt014a80= 172 | k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU= 173 | k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU= 174 | k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg= 175 | k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0= 176 | k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= 177 | k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= 178 | k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 h1:02WBxjyRwX4rJdl3XlWVjFbXT/kAKCsipoM8hQY3Dwo= 179 | k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2/go.mod h1:B7Huvd1LKZtTYmY+nC6rnmN8lyGYT9lifBcPD5epL6k= 180 | k8s.io/kubelet v0.29.3 h1:X9h0ZHzc+eUeNTaksbN0ItHyvGhQ7Z0HPjnQD2oHdwU= 181 | k8s.io/kubelet v0.29.3/go.mod h1:jDiGuTkFOUynyBKzOoC1xRSWlgAZ9UPcTYeFyjr6vas= 182 | k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= 183 | k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 184 | sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= 185 | sigs.k8s.io/json 
v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= 186 | sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= 187 | sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= 188 | sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 189 | sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 190 | -------------------------------------------------------------------------------- /internal/manager/manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 The HAMi Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package manager 18 | 19 | import ( 20 | "fmt" 21 | "sort" 22 | 23 | "github.com/Project-HAMi/ascend-device-plugin/internal" 24 | "huawei.com/npu-exporter/v6/devmanager" 25 | "huawei.com/npu-exporter/v6/devmanager/dcmi" 26 | "k8s.io/klog/v2" 27 | ) 28 | 29 | type Device struct { 30 | UUID string 31 | LogicID int32 32 | PhyID int32 33 | CardID int32 34 | DeviceID int32 35 | Memory int64 36 | AICore int32 37 | Health bool 38 | } 39 | 40 | type AscendManager struct { 41 | mgr *devmanager.DeviceManager 42 | //nodeName string 43 | config internal.VNPUConfig 44 | devs []*Device 45 | } 46 | 47 | func NewAscendManager() (*AscendManager, error) { 48 | mgr, err := devmanager.AutoInit("") 49 | if err != nil { 50 | return nil, err 51 | } 52 | return &AscendManager{ 53 | mgr: mgr, 54 | devs: []*Device{}, 55 | }, nil 56 | } 57 | 58 | func (am *AscendManager) LoadConfig(path string) error { 59 | config, err := internal.LoadConfig(path) 60 | if err != nil { 61 | return err 62 | } 63 | chipInfo, err := am.mgr.GetValidChipInfo() 64 | if err != nil { 65 | return err 66 | } 67 | if chipInfo.Type != "Ascend" { 68 | return fmt.Errorf("chip type is not Ascend") 69 | } 70 | idx := -1 71 | for i, vnpu := range config.VNPUs { 72 | if vnpu.ChipName == chipInfo.Name { 73 | idx = i 74 | break 75 | } 76 | } 77 | if idx == -1 { 78 | return fmt.Errorf("can not find vnpu config for chip %s", chipInfo.Name) 79 | } 80 | am.config = config.VNPUs[idx] 81 | sort.Slice(am.config.Templates, func(i, j int) bool { 82 | return am.config.Templates[i].Memory < am.config.Templates[j].Memory 83 | }) 84 | klog.Infof("load config: %v", am.config) 85 | return nil 86 | } 87 | 88 | func (am *AscendManager) CommonWord() string { 89 | return am.config.CommonWord 90 | } 91 | 92 | func (am *AscendManager) ResourceName() string { 93 | return am.config.ResourceName 94 | } 95 | 96 | func (am *AscendManager) VDeviceCount() int { 97 | if len(am.config.Templates) == 0 { 98 | return 1 99 | } 100 | return 
int(am.config.MemoryAllocatable / am.config.Templates[0].Memory) 101 | } 102 | 103 | func (am *AscendManager) UpdateDevice() error { 104 | _, IDs, err := am.mgr.GetDeviceList() 105 | if err != nil { 106 | klog.Errorf("failed to get device list: %v", err) 107 | return err 108 | } 109 | 110 | am.devs = make([]*Device, 0, len(IDs)) 111 | for _, ID := range IDs { 112 | phyID, err := am.mgr.GetPhysicIDFromLogicID(ID) 113 | if err != nil { 114 | klog.Errorf("failed to get physic id from logic id: %v", err) 115 | return err 116 | } 117 | cardID, deviceID, err := am.mgr.GetCardIDDeviceID(ID) 118 | if err != nil { 119 | klog.Errorf("failed to get card id from device id: %v", err) 120 | return err 121 | } 122 | uuid, err := am.mgr.GetDieID(ID, dcmi.VDIE) 123 | if err != nil { 124 | klog.Errorf("failed to get uuid from device id: %v", err) 125 | return err 126 | } 127 | health, err := am.mgr.GetDeviceHealth(ID) 128 | if err != nil { 129 | klog.Errorf("failed to get device health: %v", err) 130 | return err 131 | } 132 | am.devs = append(am.devs, &Device{ 133 | UUID: uuid, 134 | LogicID: ID, 135 | PhyID: phyID, 136 | CardID: cardID, 137 | DeviceID: deviceID, 138 | Memory: am.config.MemoryAllocatable, 139 | AICore: am.config.AICore, 140 | Health: health == 0, 141 | }) 142 | } 143 | return nil 144 | } 145 | 146 | func (am *AscendManager) GetDevices() []*Device { 147 | return am.devs 148 | } 149 | 150 | func (am *AscendManager) GetDeviceByUUID(UUID string) *Device { 151 | for _, dev := range am.devs { 152 | if dev.UUID == UUID { 153 | return dev 154 | } 155 | } 156 | return nil 157 | } 158 | 159 | func (am *AscendManager) GetIDs() []int32 { 160 | _, IDs, err := am.mgr.GetDeviceList() 161 | if err != nil { 162 | return nil 163 | } 164 | return IDs 165 | } 166 | 167 | func (am *AscendManager) GetUnHealthIDs() []int32 { 168 | _, IDs, err := am.mgr.GetDeviceList() 169 | if err != nil { 170 | return nil 171 | } 172 | var unhealthy []int32 173 | for _, d := range IDs { 174 | 
healthCode, err := am.mgr.GetDeviceHealth(d) 175 | if err != nil { 176 | continue 177 | } 178 | if healthCode != 0 { 179 | unhealthy = append(unhealthy, d) 180 | } 181 | } 182 | return unhealthy 183 | } 184 | -------------------------------------------------------------------------------- /internal/server/server.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 The HAMi Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package server 18 | 19 | import ( 20 | "context" 21 | "flag" 22 | "fmt" 23 | "net" 24 | "os" 25 | "path" 26 | "time" 27 | 28 | "github.com/Project-HAMi/HAMi/pkg/device/ascend" 29 | "github.com/Project-HAMi/HAMi/pkg/util" 30 | "github.com/Project-HAMi/HAMi/pkg/util/nodelock" 31 | "github.com/Project-HAMi/ascend-device-plugin/internal/manager" 32 | "google.golang.org/grpc" 33 | "google.golang.org/grpc/credentials/insecure" 34 | v1 "k8s.io/api/core/v1" 35 | "k8s.io/apimachinery/pkg/util/json" 36 | "k8s.io/klog/v2" 37 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 38 | ) 39 | 40 | const ( 41 | // RegisterAnnos = "hami.io/node-register-ascend" 42 | // PodAllocAnno = "huawei.com/AscendDevices" 43 | NodeLockAscend = "hami.io/mutex.lock" 44 | ) 45 | 46 | var ( 47 | reportTimeOffset = flag.Int64("report_time_offset", 1, "report time offset") 48 | ) 49 | 50 | type PluginServer struct { 51 | nodeName string 52 | registerAnno string 53 | handshakeAnno string 54 | allocAnno string 55 | grpcServer *grpc.Server 56 | mgr *manager.AscendManager 57 | socket string 58 | stopCh chan interface{} 59 | healthCh chan int32 60 | } 61 | 62 | func NewPluginServer(mgr *manager.AscendManager, nodeName string) (*PluginServer, error) { 63 | return &PluginServer{ 64 | nodeName: nodeName, 65 | registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), 66 | handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), 67 | allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), 68 | grpcServer: grpc.NewServer(), 69 | mgr: mgr, 70 | socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), 71 | stopCh: make(chan interface{}), 72 | healthCh: make(chan int32), 73 | }, nil 74 | } 75 | 76 | func (ps *PluginServer) Start() error { 77 | ps.stopCh = make(chan interface{}) 78 | err := ps.mgr.UpdateDevice() 79 | if err != nil { 80 | return err 81 | } 82 | err = ps.serve() 83 | if err != nil { 84 | return err 85 | } 86 | err = 
ps.registerKubelet() 87 | if err != nil { 88 | return err 89 | } 90 | go ps.watchAndRegister() 91 | return nil 92 | } 93 | 94 | func (ps *PluginServer) Stop() error { 95 | close(ps.stopCh) 96 | ps.grpcServer.Stop() 97 | return nil 98 | } 99 | 100 | func (ps *PluginServer) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { 101 | ctx, cancel := context.WithTimeout(context.Background(), timeout) 102 | defer cancel() 103 | c, err := grpc.DialContext(ctx, unixSocketPath, 104 | grpc.WithTransportCredentials(insecure.NewCredentials()), 105 | grpc.WithBlock(), 106 | grpc.WithContextDialer(func(ctx2 context.Context, addr string) (net.Conn, error) { 107 | var d net.Dialer 108 | return d.DialContext(ctx2, "unix", addr) 109 | }), 110 | ) 111 | 112 | if err != nil { 113 | return nil, err 114 | } 115 | return c, nil 116 | } 117 | 118 | func (ps *PluginServer) serve() error { 119 | _ = os.Remove(ps.socket) 120 | sock, err := net.Listen("unix", ps.socket) 121 | if err != nil { 122 | return err 123 | } 124 | v1beta1.RegisterDevicePluginServer(ps.grpcServer, ps) 125 | resourceName := ps.mgr.ResourceName() 126 | go func() { 127 | lastCrashTime := time.Now() 128 | restartCount := 0 129 | for { 130 | klog.Infof("Starting GRPC server for '%s'", resourceName) 131 | err := ps.grpcServer.Serve(sock) 132 | if err == nil { 133 | break 134 | } 135 | 136 | klog.Infof("GRPC server for '%s' crashed with error: %v", resourceName, err) 137 | 138 | // restart if it has not been too often 139 | // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time 140 | if restartCount > 5 { 141 | // quit 142 | klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", resourceName) 143 | } 144 | timeSinceLastCrash := time.Since(lastCrashTime).Seconds() 145 | lastCrashTime = time.Now() 146 | if timeSinceLastCrash > 3600 { 147 | // it has been one hour since the last crash.. 
reset the count 148 | // to reflect on the frequency 149 | restartCount = 1 150 | } else { 151 | restartCount++ 152 | } 153 | } 154 | }() 155 | 156 | // Wait for server to start by launching a blocking connexion 157 | conn, err := ps.dial(ps.socket, 5*time.Second) 158 | if err != nil { 159 | return err 160 | } 161 | _ = conn.Close() 162 | 163 | return nil 164 | } 165 | 166 | func (ps *PluginServer) registerKubelet() error { 167 | conn, err := ps.dial(v1beta1.KubeletSocket, 5*time.Second) 168 | if err != nil { 169 | return err 170 | } 171 | defer func(conn *grpc.ClientConn) { 172 | _ = conn.Close() 173 | }(conn) 174 | client := v1beta1.NewRegistrationClient(conn) 175 | reqt := &v1beta1.RegisterRequest{ 176 | Version: v1beta1.Version, 177 | Endpoint: path.Base(ps.socket), 178 | ResourceName: ps.mgr.ResourceName(), 179 | Options: &v1beta1.DevicePluginOptions{ 180 | GetPreferredAllocationAvailable: false, 181 | }, 182 | } 183 | 184 | _, err = client.Register(context.Background(), reqt) 185 | if err != nil { 186 | return err 187 | } 188 | return nil 189 | } 190 | 191 | func (ps *PluginServer) registerHAMi() error { 192 | devs := ps.mgr.GetDevices() 193 | apiDevices := make([]*util.DeviceInfo, 0, len(devs)) 194 | // hami currently believes that the index starts from 0 and is continuous. 
195 | for i, dev := range devs { 196 | apiDevices = append(apiDevices, &util.DeviceInfo{ 197 | Index: uint(i), 198 | ID: dev.UUID, 199 | Count: int32(ps.mgr.VDeviceCount()), 200 | Devmem: int32(dev.Memory), 201 | Devcore: dev.AICore, 202 | Type: ps.mgr.CommonWord(), 203 | Numa: 0, 204 | Health: dev.Health, 205 | }) 206 | } 207 | annos := make(map[string]string) 208 | annos[ps.registerAnno] = util.MarshalNodeDevices(apiDevices) 209 | annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05") 210 | node, err := util.GetNode(ps.nodeName) 211 | if err != nil { 212 | return fmt.Errorf("get node %s error: %v", ps.nodeName, err) 213 | } 214 | err = util.PatchNodeAnnotations(node, annos) 215 | if err != nil { 216 | return fmt.Errorf("patch node %s annotations error: %v", ps.nodeName, err) 217 | } 218 | klog.V(5).Infof("patch node %s annotations: %v", ps.nodeName, annos) 219 | return nil 220 | } 221 | 222 | func (ps *PluginServer) watchAndRegister() { 223 | timer := time.After(1 * time.Second) 224 | for { 225 | select { 226 | case <-ps.stopCh: 227 | klog.Infof("stop watch and register") 228 | return 229 | case <-timer: 230 | } 231 | unhealthy := ps.mgr.GetUnHealthIDs() 232 | if len(unhealthy) > 0 { 233 | if err := ps.mgr.UpdateDevice(); err != nil { 234 | klog.Errorf("update device error: %v", err) 235 | timer = time.After(5 * time.Second) 236 | continue 237 | } 238 | ps.healthCh <- unhealthy[0] 239 | } 240 | err := ps.registerHAMi() 241 | if err != nil { 242 | klog.Errorf("register HAMi error: %v", err) 243 | timer = time.After(5 * time.Second) 244 | } else { 245 | klog.V(3).Infof("register HAMi success") 246 | timer = time.After(30 * time.Second) 247 | } 248 | } 249 | } 250 | 251 | func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, error) { 252 | anno, ok := pod.Annotations[ps.allocAnno] 253 | if !ok { 254 | return nil, nil, fmt.Errorf("annotation %s not set", 
"huawei.com/Ascend") 255 | } 256 | var rtInfo []ascend.RuntimeInfo 257 | err := json.Unmarshal([]byte(anno), &rtInfo) 258 | if err != nil { 259 | return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) 260 | } 261 | var IDs []int32 262 | var temps []string 263 | for _, info := range rtInfo { 264 | if info.UUID == "" { 265 | continue 266 | } 267 | d := ps.mgr.GetDeviceByUUID(info.UUID) 268 | if d == nil { 269 | return nil, nil, fmt.Errorf("unknown uuid: %s", info.UUID) 270 | } 271 | IDs = append(IDs, d.PhyID) 272 | temps = append(temps, info.Temp) 273 | } 274 | if len(IDs) == 0 { 275 | return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) 276 | } 277 | return IDs, temps, nil 278 | } 279 | 280 | func (ps *PluginServer) apiDevices() []*v1beta1.Device { 281 | devs := ps.mgr.GetDevices() 282 | devices := make([]*v1beta1.Device, 0, len(devs)) 283 | vCount := ps.mgr.VDeviceCount() 284 | for _, dev := range devs { 285 | health := v1beta1.Unhealthy 286 | if dev.Health { 287 | health = v1beta1.Healthy 288 | } 289 | for i := 0; i < vCount; i++ { 290 | device := v1beta1.Device{ 291 | ID: fmt.Sprintf("%s-%d", dev.UUID, i), 292 | Health: health, 293 | } 294 | devices = append(devices, &device) 295 | } 296 | } 297 | klog.V(5).Infof("api devices: %v", devices) 298 | return devices 299 | } 300 | 301 | func (ps *PluginServer) GetDevicePluginOptions(context.Context, *v1beta1.Empty) (*v1beta1.DevicePluginOptions, error) { 302 | return &v1beta1.DevicePluginOptions{}, nil 303 | } 304 | 305 | func (ps *PluginServer) ListAndWatch(e *v1beta1.Empty, s v1beta1.DevicePlugin_ListAndWatchServer) error { 306 | _ = s.Send(&v1beta1.ListAndWatchResponse{Devices: ps.apiDevices()}) 307 | for { 308 | select { 309 | case <-ps.stopCh: 310 | return nil 311 | case <-ps.healthCh: 312 | _ = s.Send(&v1beta1.ListAndWatchResponse{Devices: ps.apiDevices()}) 313 | } 314 | } 315 | } 316 | 317 | func (ps *PluginServer) GetPreferredAllocation(context.Context, 
*v1beta1.PreferredAllocationRequest) (*v1beta1.PreferredAllocationResponse, error) { 318 | return nil, fmt.Errorf("not supported") 319 | } 320 | 321 | func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) { 322 | klog.V(5).Infof("Allocate: %v", reqs) 323 | pod, err := util.GetPendingPod(ctx, ps.nodeName) 324 | if err != nil { 325 | klog.Errorf("get pending pod error: %v", err) 326 | lockerr := nodelock.ReleaseNodeLock(ps.nodeName, NodeLockAscend, pod, false) 327 | if lockerr != nil { 328 | klog.Errorf("failed to release lock:%s", err.Error()) 329 | } 330 | return nil, fmt.Errorf("get pending pod error: %v", err) 331 | } 332 | resp := v1beta1.ContainerAllocateResponse{} 333 | IDs, temps, err := ps.parsePodAnnotation(pod) 334 | if err != nil { 335 | lockerr := nodelock.ReleaseNodeLock(ps.nodeName, NodeLockAscend, pod, false) 336 | if lockerr != nil { 337 | klog.Errorf("failed to release lock:%s", err.Error()) 338 | } 339 | return nil, fmt.Errorf("parse pod annotation error: %v", err) 340 | } 341 | if len(IDs) == 0 { 342 | lockerr := nodelock.ReleaseNodeLock(ps.nodeName, NodeLockAscend, pod, false) 343 | if lockerr != nil { 344 | klog.Errorf("failed to release lock:%s", err.Error()) 345 | } 346 | return nil, fmt.Errorf("empty id from pod annotation") 347 | } 348 | ascendVisibleDevices := fmt.Sprintf("%d", IDs[0]) 349 | ascendVNPUSpec := "" 350 | for i := 1; i < len(IDs); i++ { 351 | ascendVisibleDevices = fmt.Sprintf("%s,%d", ascendVisibleDevices, IDs[i]) 352 | } 353 | for i := 0; i < len(temps); i++ { 354 | if temps[i] != "" { 355 | ascendVNPUSpec = temps[i] 356 | break 357 | } 358 | } 359 | resp.Envs = make(map[string]string) 360 | resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices 361 | if ascendVNPUSpec != "" { 362 | resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec 363 | } 364 | klog.V(5).Infof("allocate response: %v", resp) 365 | lockerr := nodelock.ReleaseNodeLock(ps.nodeName, 
NodeLockAscend, pod, true) 366 | if lockerr != nil { 367 | klog.Errorf("failed to release lock:%s", err.Error()) 368 | } 369 | return &v1beta1.AllocateResponse{ContainerResponses: []*v1beta1.ContainerAllocateResponse{&resp}}, nil 370 | } 371 | 372 | func (ps *PluginServer) PreStartContainer(context.Context, *v1beta1.PreStartContainerRequest) (*v1beta1.PreStartContainerResponse, error) { 373 | return &v1beta1.PreStartContainerResponse{}, nil 374 | } 375 | -------------------------------------------------------------------------------- /internal/vnpu.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package internal 18 | 19 | import ( 20 | "os" 21 | 22 | "k8s.io/apimachinery/pkg/util/yaml" 23 | ) 24 | 25 | type Template struct { 26 | Name string `json:"name"` 27 | Memory int64 `json:"memory"` 28 | AICore int32 `json:"aiCore,omitempty"` 29 | AICPU int32 `json:"aiCPU,omitempty"` 30 | } 31 | 32 | type VNPUConfig struct { 33 | CommonWord string `json:"commonWord"` 34 | ChipName string `json:"chipName"` 35 | ResourceName string `json:"resourceName"` 36 | ResourceMemoryName string `json:"resourceMemoryName"` 37 | MemoryAllocatable int64 `json:"memoryAllocatable"` 38 | MemoryCapacity int64 `json:"memoryCapacity"` 39 | AICore int32 `json:"aiCore"` 40 | AICPU int32 `json:"aiCPU"` 41 | Templates []Template `json:"templates"` 42 | } 43 | 44 | type Config struct { 45 | VNPUs []VNPUConfig `json:"vnpus"` 46 | } 47 | 48 | func LoadConfig(path string) (*Config, error) { 49 | data, err := os.ReadFile(path) 50 | if err != nil { 51 | return nil, err 52 | } 53 | var yamlData Config 54 | err = yaml.Unmarshal(data, &yamlData) 55 | if err != nil { 56 | return nil, err 57 | } 58 | return &yamlData, nil 59 | } 60 | -------------------------------------------------------------------------------- /internal/wachers.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 The HAMi Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package internal 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | 23 | "github.com/fsnotify/fsnotify" 24 | ) 25 | 26 | func NewFSWatcher(files ...string) (*fsnotify.Watcher, error) { 27 | watcher, err := fsnotify.NewWatcher() 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | for _, f := range files { 33 | err = watcher.Add(f) 34 | if err != nil { 35 | watcher.Close() 36 | return nil, err 37 | } 38 | } 39 | 40 | return watcher, nil 41 | } 42 | 43 | func NewOSWatcher(sigs ...os.Signal) chan os.Signal { 44 | sigChan := make(chan os.Signal, 1) 45 | signal.Notify(sigChan, sigs...) 46 | 47 | return sigChan 48 | } 49 | -------------------------------------------------------------------------------- /version/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 The HAMi Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "flag" 21 | "fmt" 22 | "os" 23 | ) 24 | 25 | var ( 26 | version string 27 | printVersion = flag.Bool("version", false, "print version") 28 | ) 29 | 30 | func GetVersion() string { 31 | return version 32 | } 33 | 34 | func CheckVersionFlag() { 35 | if *printVersion { 36 | fmt.Printf("version: %v\n", GetVersion()) 37 | os.Exit(0) 38 | } 39 | } 40 | --------------------------------------------------------------------------------