├── test
│   ├── deployment
│   │   ├── nvidia-container-runtime.yaml
│   │   ├── nvidia-tmp-pod.yaml
│   │   ├── nvidia-test-pod.yaml
│   │   └── qwen-mini.yaml
│   └── DEMO.md
├── readme
│   ├── DEPLOY.md
│   └── LOCAL.BUILD.md
├── Makefile
├── monitor
│   ├── monitor.sh
│   └── press_test.py
├── pkg
│   ├── device
│   │   ├── simulator.go
│   │   ├── device.go
│   │   ├── huawei.go
│   │   └── nvidia.go
│   ├── allocator
│   │   └── allocator.go
│   └── deviceplugin
│       └── server.go
├── Dockerfile
├── go.mod
├── cmd
│   └── main.go
├── README.md
└── manifests
    └── daemonset.yaml

/test/deployment/nvidia-container-runtime.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: node.k8s.io/v1
2 | kind: RuntimeClass
3 | metadata:
4 |   name: nvidia
5 | handler: nvidia  # points to nvidia-container-runtime
--------------------------------------------------------------------------------
/readme/DEPLOY.md:
--------------------------------------------------------------------------------
1 | 
2 | ### apply
3 | ```shell
4 | kubectl apply -f manifests/daemonset.yaml
5 | 
6 | ```
7 | 
8 | 
9 | ### delete the pod (the DaemonSet recreates it) & view logs
10 | 
11 | ```shell
12 | kubectl delete pod -l app=micro-device-plugin -n kube-system
13 | 
14 | kubectl logs -l app=micro-device-plugin -n kube-system --tail=-1
15 | 
16 | ```
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BINARY := micro-device-plugin
2 | IMAGE := your-registry/micro-device-plugin
3 | VERSION := v1.0.0
4 | 
5 | .PHONY: build
6 | build:
7 | 	go build -o bin/$(BINARY) ./cmd
8 | 
9 | .PHONY: docker-build
10 | docker-build:
11 | 	docker build -t $(IMAGE):$(VERSION) .
12 | 
13 | .PHONY: push
14 | push:
15 | 	docker push $(IMAGE):$(VERSION)
16 | 
17 | .PHONY: clean
18 | clean:
19 | 	rm -rf bin
--------------------------------------------------------------------------------
/monitor/monitor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | while true; do
4 |     clear
5 |     echo "===== $(date) ====="
6 |     echo "MIG device list:"
7 |     nvidia-smi mig -lgi
8 | 
9 |     echo -e "\nDevice 1 status:"
10 |     nvidia-smi -i MIG-GPU-0-1 -q | grep -A 5 "Utilization" | grep -v "Gpu"
11 | 
12 |     echo -e "\nDevice 2 status:"
13 |     nvidia-smi -i MIG-GPU-0-2 -q | grep -A 5 "Utilization" | grep -v "Gpu"
14 | 
15 |     sleep 1
16 | done
--------------------------------------------------------------------------------
/test/deployment/nvidia-tmp-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 |   name: nvidia-tmp-pod
5 | spec:
6 | 
7 |   runtimeClassName: nvidia   # use the NVIDIA runtime
8 |   restartPolicy: Never       # run once, then exit
9 |   containers:
10 |     - name: test-container
11 |       image: nvcr.io/nvidia/pytorch:24.05-py3
12 |       command: ["nvidia-smi"]
13 |       resources:
14 |         limits:
15 |           nvidia.com/microgpu: 1   # request 1 GPU
--------------------------------------------------------------------------------
/test/deployment/nvidia-test-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 |   name: nvidia-test-pod
5 | spec:
6 |   runtimeClassName: nvidia
7 |   restartPolicy: Never
8 |   containers:
9 |     - name: test-container
10 |       image: nvcr.io/nvidia/pytorch:24.05-py3
11 |       imagePullPolicy: IfNotPresent
12 |       # key point: run an infinite loop so the pod stays alive
13 |       command: ["/bin/sh", "-c"]
14 |       args: ["while true; do sleep 3600; done"]   # endless loop that wakes once per hour
15 |       resources:
16 |         limits:
17 |           nvidia.com/microgpu: 1 
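18 | # A minimal verification sketch (comments only, nothing below is applied by this
19 | # manifest): once the pod is Running, the plugin's Allocate call should have set
20 | # NVIDIA_VISIBLE_DEVICES to the granted device ID, which can be checked with e.g.
21 | #   kubectl exec nvidia-test-pod -- env | grep NVIDIA_VISIBLE_DEVICES
22 | #   kubectl exec nvidia-test-pod -- nvidia-smi -L
23 | # The UUID reported depends on the node's driver and MIG configuration.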
-------------------------------------------------------------------------------- /readme/LOCAL.BUILD.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### build 4 | ```shell 5 | 6 | 7 | git pull 8 | 9 | docker build --build-arg HTTP_PROXY=http://10.0.168.12:7890 --build-arg HTTPS_PROXY=http://10.0.168.12:7890 -t binyue/micro-device-plugin:v1.0.13 . 10 | 11 | 12 | 13 | ``` 14 | ### transfer to containerd 15 | 16 | ```shell 17 | 18 | docker save binyue/micro-device-plugin:v1.0.13 -o micro-device-plugin.tar 19 | 20 | sudo ctr -n k8s.io images import micro-device-plugin.tar 21 | 22 | sudo ctr -n k8s.io images ls |grep micro-device-plugin 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /pkg/device/simulator.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type SimulatorManager struct { 8 | lastDiscovery time.Time 9 | devices []GPUDevice 10 | } 11 | 12 | func (m *SimulatorManager) DiscoverGPUs() ([]GPUDevice, error) { 13 | return []GPUDevice{ 14 | &SimulatorDevice{id: "0", healthy: true}, 15 | &SimulatorDevice{id: "1", healthy: true}, 16 | &SimulatorDevice{id: "2", healthy: true}, 17 | }, nil 18 | } 19 | 20 | func (m *SimulatorManager) CheckHealth(deviceID string) bool { 21 | // 模拟 10% 的失败率 22 | return time.Now().UnixNano()%10 != 0 23 | } 24 | -------------------------------------------------------------------------------- /test/DEMO.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### deploy and see nvidia-smi 4 | ```shell 5 | kubectl apply -f deployment/nvidia-test-pod.yaml 6 | kubectl describe pod nvidia-test-pod 7 | kubectl delete pod nvidia-test-pod 8 | kubectl logs nvidia-test-pod --tail=-1 9 | kubectl exec -it nvidia-test-pod -- sh 10 | ``` 11 | 12 | ### cuda test 13 | ```shell 14 | python -c "import torch; print(torch.__version__, torch.cuda.is_available())" 15 | 16 | ``` 17 | 18 | 19 | ### mutil allocate test 20 | 21 | ```shell 22 | kubectl apply -f deployment/nvidia-tmp-pod.yaml 23 | kubectl describe pod nvidia-tmp-pod 24 | kubectl delete pod nvidia-tmp-pod 25 | kubectl logs nvidia-tmp-pod --tail=-1 26 | ``` -------------------------------------------------------------------------------- /pkg/device/device.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | // GPUDevice 表示GPU设备的接口 4 | type GPUDevice interface { 5 | ID() string 6 | IsHealthy() bool 7 | GetVendor() string 8 | GetPath() string 9 | IsMIG() bool // 新增:是否为MIG设备 10 | PhysicalID() string // 新增:物理GPU ID 11 | } 12 | 13 | // DeviceManager 设备管理器接口 14 | type DeviceManager interface { 15 | DiscoverGPUs() ([]GPUDevice, error) 16 | CheckHealth(deviceID string) bool 17 | } 18 | 19 | type SimulatorDevice struct { 20 | id string 21 | healthy bool 22 | } 23 | 24 | func (d *SimulatorDevice) IsMIG() bool { 25 | return false 26 | } 27 | 28 | func (d *SimulatorDevice) PhysicalID() string { 29 | return d.id 30 | } 31 | 32 | func (d *SimulatorDevice) ID() string { return d.id } 33 | func (d *SimulatorDevice) IsHealthy() bool { return d.healthy } 34 | func (d *SimulatorDevice) GetVendor() string { return "simulator" } 35 | func (d *SimulatorDevice) GetPath() string { return "/dev/sim_gpu" + d.id } 36 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | # 第一阶段:使用官方golang镜像构建 2 | FROM golang:1.24 AS builder 3 | 4 | # 设置工作目录 5 | WORKDIR /workspace 6 | COPY . . 7 | 8 | # 设置代理(通过构建参数) 9 | ARG HTTP_PROXY 10 | ARG HTTPS_PROXY 11 | ENV http_proxy=${HTTP_PROXY} 12 | ENV https_proxy=${HTTPS_PROXY} 13 | 14 | 15 | # 静态编译,确保零依赖 16 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ 17 | -a -installsuffix cgo \ 18 | -ldflags="-w -s" \ 19 | -o /usr/bin/micro-device-plugin ./cmd 20 | 21 | # 第二阶段:使用Ubuntu基础镜像 22 | FROM ubuntu:22.04 23 | 24 | 25 | # 安装必要的运行依赖 26 | RUN apt-get update && apt-get install -y --no-install-recommends \ 27 | ca-certificates \ 28 | curl \ 29 | kmod \ 30 | libunwind8 \ 31 | && rm -rf /var/lib/apt/lists/* 32 | 33 | # 设置环境变量(修复警告问题) 34 | ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu 35 | 36 | # 从构建阶段复制静态编译的二进制文件 37 | COPY --from=builder /usr/bin/micro-device-plugin /usr/bin/ 38 | 39 | # 健康检查 40 | HEALTHCHECK --interval=30s --timeout=10s \ 41 | CMD curl -f http://localhost:8080/health || exit 1 42 | 43 | # 容器入口点 44 | ENTRYPOINT ["/usr/bin/micro-device-plugin"] 45 | -------------------------------------------------------------------------------- /pkg/device/huawei.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | type HuaweiDevice struct { 11 | id string 12 | healthy bool 13 | } 14 | 15 | func (d *HuaweiDevice) IsMIG() bool { 16 | return false 17 | } 18 | 19 | func (d *HuaweiDevice) PhysicalID() string { 20 | return d.id 21 | } 22 | 23 | func (d *HuaweiDevice) ID() string { return d.id } 24 | func (d *HuaweiDevice) IsHealthy() bool { return d.healthy } 25 | func (d *HuaweiDevice) GetVendor() string { return "huawei" } 26 | func (d *HuaweiDevice) GetPath() string { return "/dev/davinci" + d.id } 27 | 28 | type HuaweiManager struct { 29 | lastDiscovery time.Time 30 | devices []GPUDevice 31 | discoverySync sync.Mutex 32 | } 33 | 34 | func (m *HuaweiManager) DiscoverGPUs() ([]GPUDevice, error) { 35 | m.discoverySync.Lock() 36 | defer m.discoverySync.Unlock() 37 | 38 | // 如果最近已经发现过设备,则使用缓存 39 | if time.Since(m.lastDiscovery) < 5*time.Minute && m.devices != nil { 40 | klog.V(4).Infof("Using cached Huawei devices (last discovery: %s)", m.lastDiscovery) 41 | return m.devices, nil 42 | } 43 | 44 | klog.Info("Discovering Huawei devices") 45 | 46 | // 实际生产环境中应使用华为NPU SDK调用 47 | // 这里为模拟实现 48 | devices := []GPUDevice{ 49 | &HuaweiDevice{id: "0", healthy: true}, 50 | &HuaweiDevice{id: "1", healthy: true}, 51 | } 52 | 53 | klog.Infof("Discovered %d Huawei devices", len(devices)) 54 | for _, d := range devices { 55 | klog.Infof("Huawei Device: ID=%s, Healthy=%v", d.ID(), d.IsHealthy()) 56 | } 57 | 58 | m.devices = devices 59 | m.lastDiscovery = time.Now() 60 | return devices, nil 61 | } 62 | 63 | func (m *HuaweiManager) CheckHealth(deviceID string) bool { 64 | // 实际生产环境中应使用华为NPU SDK的健康检查 65 | // 这里总是返回true作为模拟 66 | healthy := true 67 | klog.V(5).Infof("Checking health of Huawei device %s: %v", deviceID, healthy) 68 | return healthy 69 | } 70 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/benyuereal/micro-device-plugin 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | google.golang.org/grpc v1.68.1 7 | k8s.io/api v0.33.4 8 | k8s.io/apimachinery v0.33.4 9 | k8s.io/client-go v0.33.4 10 | 
k8s.io/klog/v2 v2.130.1 11 | k8s.io/kubelet v0.33.4 12 | ) 13 | 14 | require ( 15 | github.com/davecgh/go-spew v1.1.1 // indirect 16 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 17 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 18 | github.com/go-logr/logr v1.4.2 // indirect 19 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 20 | github.com/go-openapi/jsonreference v0.20.2 // indirect 21 | github.com/go-openapi/swag v0.23.0 // indirect 22 | github.com/gogo/protobuf v1.3.2 // indirect 23 | github.com/golang/protobuf v1.5.4 // indirect 24 | github.com/google/gnostic-models v0.6.9 // indirect 25 | github.com/google/go-cmp v0.7.0 // indirect 26 | github.com/google/uuid v1.6.0 // indirect 27 | github.com/josharian/intern v1.0.0 // indirect 28 | github.com/json-iterator/go v1.1.12 // indirect 29 | github.com/mailru/easyjson v0.7.7 // indirect 30 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 31 | github.com/modern-go/reflect2 v1.0.2 // indirect 32 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 33 | github.com/pkg/errors v0.9.1 // indirect 34 | github.com/x448/float16 v0.8.4 // indirect 35 | golang.org/x/net v0.38.0 // indirect 36 | golang.org/x/oauth2 v0.27.0 // indirect 37 | golang.org/x/sys v0.31.0 // indirect 38 | golang.org/x/term v0.30.0 // indirect 39 | golang.org/x/text v0.23.0 // indirect 40 | golang.org/x/time v0.9.0 // indirect 41 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 42 | google.golang.org/protobuf v1.36.5 // indirect 43 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 44 | gopkg.in/inf.v0 v0.9.1 // indirect 45 | gopkg.in/yaml.v3 v3.0.1 // indirect 46 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 47 | k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect 48 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 49 | sigs.k8s.io/randfill v1.0.0 // indirect 50 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 51 | sigs.k8s.io/yaml v1.4.0 // indirect 52 | ) 53 | -------------------------------------------------------------------------------- /monitor/press_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import threading 3 | import time 4 | import random 5 | import json # 添加 json 模块用于格式化输出 6 | 7 | # 目标URL 8 | url = "http://10.0.168.12:30080/generate" 9 | 10 | # 请求头 11 | headers = { 12 | "Content-Type": "application/json" 13 | } 14 | 15 | # 多个不同的提示词,增加多样性 16 | prompts = [ 17 | "请用简单的话解释量子计算", 18 | "详细描述深度学习的原理和应用", 19 | "解释Transformer模型在自然语言处理中的作用", 20 | "讲述人工智能的发展历史", 21 | "比较机器学习和深度学习的区别", 22 | "解释神经网络的基本原理", 23 | "描述计算机视觉的最新进展", 24 | "讲解自然语言处理中的注意力机制", 25 | "什么是强化学习?它有哪些应用?", 26 | "解释生成式对抗网络(GAN)的工作原理" 27 | ] 28 | 29 | # 发送请求的函数 30 | def send_request(thread_id): 31 | request_count = 0 32 | while True: 33 | try: 34 | # 随机选择一个提示词 35 | prompt = random.choice(prompts) 36 | 37 | data = { 38 | "inputs": f"<|im_start|>system\n你是一个AI助手<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n", 39 | "parameters": { 40 | "max_new_tokens": 512, # 增加生成的token数量 41 | "temperature": 0.7, 42 | "top_p": 0.9, 43 | "do_sample": True, 44 | "repetition_penalty": 1.1 45 | } 46 | } 47 | 48 | response = requests.post(url, json=data, headers=headers, timeout=30) 49 | request_count += 1 50 | 51 | if response.status_code == 200: 52 | # 获取并打印响应内容 53 | response_data = response.json() 54 | generated_text = response_data.get("generated_text", "") 55 | 56 | # 
截取部分文本,避免输出过长 57 | preview_text = generated_text[:100] + "..." if len(generated_text) > 100 else generated_text 58 | 59 | print(f"线程{thread_id} - 请求#{request_count}成功") 60 | print(f"生成文本: {preview_text}") 61 | print(f"详细响应: {json.dumps(response_data, indent=2, ensure_ascii=False)[:200]}...") # 打印部分JSON响应 62 | print("-" * 80) # 分隔线 63 | else: 64 | print(f"线程{thread_id} - 请求#{request_count}失败,状态码: {response.status_code}") 65 | print(f"错误响应: {response.text[:200]}") 66 | print("-" * 80) 67 | 68 | except Exception as e: 69 | print(f"线程{thread_id} - 请求异常: {e}") 70 | print("-" * 80) 71 | 72 | # 稍微延迟,避免过度请求 73 | time.sleep(0.1) 74 | 75 | # 创建多个线程并发请求 76 | threads = [] 77 | for i in range(20): # 创建20个并发线程 78 | t = threading.Thread(target=send_request, args=(i,)) 79 | t.daemon = True 80 | threads.append(t) 81 | t.start() 82 | 83 | # 保持脚本运行 84 | try: 85 | while True: 86 | time.sleep(1) 87 | except KeyboardInterrupt: 88 | print("停止压力测试") -------------------------------------------------------------------------------- /test/deployment/qwen-mini.yaml: -------------------------------------------------------------------------------- 1 | # 修改后的 qwen-statefulset-local-direct.yaml 2 | apiVersion: apps/v1 3 | kind: StatefulSet 4 | metadata: 5 | name: qwen-mini 6 | spec: 7 | serviceName: "qwen-service" 8 | replicas: 2 # 增加副本数 9 | podManagementPolicy: OrderedReady 10 | selector: 11 | matchLabels: 12 | app: qwen 13 | template: 14 | metadata: 15 | labels: 16 | app: qwen 17 | spec: 18 | # 新增拓扑分布约束 - 核心修改点 19 | topologySpreadConstraints: 20 | - maxSkew: 2 # 节点间最大Pod数差异 21 | topologyKey: kubernetes.io/hostname # 节点级别的拓扑域 22 | whenUnsatisfiable: DoNotSchedule # 不满足条件时不调度 23 | labelSelector: 24 | matchLabels: 25 | app: qwen 26 | 27 | runtimeClassName: nvidia 28 | containers: 29 | - name: text-generation 30 | image: ghcr.io/huggingface/text-generation-inference:1.4.1 31 | command: ["text-generation-launcher"] 32 | args: 33 | - "--model-id" 34 | - "/model" 35 | - "--num-shard" 36 | - "1" 37 | - "--port" 38 | - "8000" 39 | - "--quantize" 40 | - "bitsandbytes" 41 | env: 42 | - name: TRANSFORMERS_OFFLINE 43 | value: "1" 44 | - name: HF_HUB_OFFLINE 45 | value: "1" 46 | - name: NVIDIA_DISABLE_REQUIRE 47 | value: "1" 48 | # 添加就绪探针,确保Pod完全启动后才被认为是就绪 49 | readinessProbe: 50 | httpGet: 51 | path: /health 52 | port: 8000 53 | initialDelaySeconds: 30 54 | periodSeconds: 5 55 | failureThreshold: 10 56 | timeoutSeconds: 5 57 | # 添加存活探针,确保应用健康 58 | livenessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8000 62 | initialDelaySeconds: 60 63 | periodSeconds: 10 64 | failureThreshold: 3 65 | timeoutSeconds: 5 66 | resources: 67 | limits: 68 | nvidia.com/microgpu: 1 69 | ports: 70 | - containerPort: 8000 71 | volumeMounts: 72 | - name: model-storage 73 | mountPath: /model 74 | volumes: 75 | - name: model-storage 76 | hostPath: 77 | path: /home/Qwen1.5-0.5B-Chat 78 | type: Directory 79 | # StatefulSet更新策略 80 | updateStrategy: 81 | type: RollingUpdate 82 | rollingUpdate: 83 | partition: 0 # 确保有序更新 84 | 85 | --- 86 | apiVersion: v1 87 | kind: Service 88 | metadata: 89 | name: qwen-service 90 | spec: 91 | type: NodePort 92 | selector: 93 | app: qwen 94 | ports: 95 | - protocol: TCP 96 | port: 8000 97 | targetPort: 8000 98 | nodePort: 30080 -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "os" 7 | "os/signal" 8 | "sync" 9 | "syscall" 10 | "time" 11 
| 12 | "github.com/benyuereal/micro-device-plugin/pkg/device" 13 | "github.com/benyuereal/micro-device-plugin/pkg/deviceplugin" 14 | "k8s.io/klog/v2" 15 | ) 16 | 17 | func main() { 18 | klog.InitFlags(nil) 19 | defer klog.Flush() 20 | 21 | // 获取环境变量设置 22 | simulate := os.Getenv("SIMULATE") 23 | cdiEnabled := os.Getenv("CDI_ENABLED") == "true" 24 | cdiPrefix := os.Getenv("CDI_PREFIX") 25 | nodeName := os.Getenv("NODE_NAME") 26 | if cdiPrefix == "" { 27 | cdiPrefix = "micro.device" // 默认值 28 | } 29 | klog.Infof("Running in simulation mode: %s", simulate) 30 | 31 | // 初始化设备管理器 32 | var managers []struct { 33 | vendor string 34 | manager device.DeviceManager 35 | } 36 | 37 | // 添加模拟管理器 38 | if simulate != "" { 39 | managers = append(managers, struct { 40 | vendor string 41 | manager device.DeviceManager 42 | }{ 43 | vendor: "simulator", 44 | manager: &device.NVIDIAManager{}, 45 | }) 46 | } else { 47 | // 真实环境下的设备管理器 48 | managers = append(managers, struct { 49 | vendor string 50 | manager device.DeviceManager 51 | }{"nvidia", device.NewNVIDIAManager()}) 52 | managers = append(managers, struct { 53 | vendor string 54 | manager device.DeviceManager 55 | }{"huawei", &device.HuaweiManager{}}) 56 | } 57 | 58 | var servers []*deviceplugin.DevicePluginServer 59 | var wg sync.WaitGroup 60 | var serverMutex sync.Mutex 61 | 62 | ctx, cancel := context.WithCancel(context.Background()) 63 | 64 | // 为每个供应商启动插件 65 | for _, m := range managers { 66 | wg.Add(1) 67 | go func(vendor string, manager device.DeviceManager) { 68 | defer wg.Done() 69 | 70 | srv := deviceplugin.New(vendor, manager, cdiEnabled, cdiPrefix, nodeName) 71 | if err := srv.Start(ctx); err != nil { 72 | klog.Errorf("Failed to start %s device plugin: %v", vendor, err) 73 | return 74 | } 75 | 76 | serverMutex.Lock() 77 | servers = append(servers, srv) 78 | serverMutex.Unlock() 79 | 80 | // 后台运行健康检查 81 | go srv.HealthCheck(ctx, 30*time.Second) 82 | }(m.vendor, m.manager) 83 | } 84 | 85 | // 健康检查路由 86 | http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { 87 | w.WriteHeader(http.StatusOK) 88 | }) 89 | go func() { 90 | if err := http.ListenAndServe(":8080", nil); err != nil { 91 | klog.Fatalf("Health check server failed: %v", err) 92 | } 93 | }() 94 | klog.Info("Health check server started on :8080") 95 | 96 | // 等待终止信号 97 | signalChan := make(chan os.Signal, 1) 98 | signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) 99 | <-signalChan 100 | klog.Info("Received termination signal, shutting down...") 101 | 102 | // 关闭所有插件 103 | cancel() 104 | for _, srv := range servers { 105 | srv.Stop() 106 | } 107 | 108 | klog.Info("All device plugins stopped. Exiting.") 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Micro GPU Device Plugin for Kubernetes 2 | 3 | ## 概述 4 | 这是一个支持多GPU资源限制的Kubernetes设备插件,特别优化了对NVIDIA MIG设备的支持。它能够在Kubernetes集群中自动发现、管理和分配GPU资源,包括完整的GPU设备和MIG分区。 5 | 6 | ## 核心特性 7 | - ✅ 完整的GPU设备发现与管理 8 | - ✅ NVIDIA MIG设备支持(自动分区与配置) 9 | - ✅ 设备健康检查与监控 10 | - ⛔️ CDI(Container Device Interface)支持 11 | - ✅ 资源回收与自动清理机制 12 | 13 | ## 前提条件 14 | ### 1. 
Kubernetes 集群 15 | - Kubernetes 1.20+ 版本 16 | - kubectl 配置完成 17 | 18 | ### 2 Containerd 配置 19 | 在 `/etc/containerd/config.toml` 中添加: 20 | 21 | ```toml 22 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] 23 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] 24 | runtime_type = "io.containerd.runc.v2" 25 | privileged_without_host_devices = false 26 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] 27 | BinaryName = "/usr/bin/nvidia-container-runtime" 28 | ``` 29 | ### 3 runclass配置 30 | ```yaml 31 | apiVersion: node.k8s.io/v1 32 | kind: RuntimeClass 33 | metadata: 34 | name: nvidia 35 | handler: nvidia # 指向 nvidia-container-runtime 36 | ``` 37 | 38 | ### 4. **GPU MIG 设置**: 39 | ```yaml 40 | ### 启用mig 41 | sudo nvidia-smi -mig 1 42 | ### 创建 MIG 设备 (例如 3g.20gb 配置) 43 | sudo nvidia-smi mig -cgi 9 -C 44 | ``` 45 | 46 | ## 🚀 快速开始 47 | 48 | ### 部署设备插件 49 | ```yaml 50 | kubectl apply -f manifests/daemonset.yaml 51 | ``` 52 | 53 | ### 验证部署 54 | ```shell 55 | kubectl get pods -n kube-system -l app=micro-device-plugin 56 | 57 | kubectl logs -n kube-system -l app=micro-device-plugin --tail=50 58 | ``` 59 | 60 | ### 测试示例 61 | ```yaml 62 | apiVersion: v1 63 | kind: Pod 64 | metadata: 65 | name: nvidia-test-pod 66 | spec: 67 | runtimeClassName: nvidia 68 | restartPolicy: Never 69 | containers: 70 | - name: test-container 71 | image: nvcr.io/nvidia/pytorch:24.05-py3 72 | imagePullPolicy: IfNotPresent 73 | # 关键修改:启动无限循环命令 74 | command: ["/bin/sh", "-c"] 75 | args: ["while true; do sleep 3600; done"] # 每小时唤醒一次的永久循环 76 | resources: 77 | limits: 78 | nvidia.com/microgpu: 1 79 | ``` 80 | 81 | ### 部署测试应用: 82 | 83 | ```shell 84 | 85 | kubectl apply -f deployment/nvidia-test-pod.yaml 86 | kubectl describe pod nvidia-test-pod 87 | kubectl logs nvidia-test-pod --tail=-1 88 | kubectl exec -it nvidia-test-pod -- sh 89 | 90 | ``` 91 | 92 | ## 📊 功能特性 93 | - 支持 NVIDIA GPU 和 MIG 设备管理 94 | - 自动健康检查和设备回收 95 | - CDI 设备注入支持 96 | - 拓扑感知调度优化 97 | - 多实例 GPU 资源切分 98 | 99 | ## 🛠 构建与部署 100 | 101 | 102 | ```shell 103 | docker build -t your-registry/micro-device-plugin:v1.0.0 . 
104 | docker push your-registry/micro-device-plugin:v1.0.0 105 | ``` 106 | 107 | ## 部署到kubernetes 108 | 109 | ```shell 110 | kubectl apply -f manifests/daemonset.yaml 111 | ``` 112 | 113 | ## 🔧 配置选项 114 | | 环境变量 | 默认值 | 描述 | 115 | |---------|--------|------| 116 | | `ENABLE_MIG` | `false` | 启用 MIG 管理 | 117 | | `MIG_PROFILE` | `3g.20gb` | MIG 切分配置 | 118 | | `MIG_INSTANCE_COUNT` | `0` | MIG 实例数量 (0=自动计算) | 119 | | `SKIP_CONFIGURED` | `true` | 跳过已配置的 MIG 设备 | 120 | | `CDI_ENABLED` | `false` | 启用 CDI 设备注入 | 121 | | `CDI_PREFIX` | `micro.device` | CDI 设备前缀 | -------------------------------------------------------------------------------- /pkg/allocator/allocator.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | import ( 4 | "errors" 5 | "sync" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | // Allocator 设备资源分配器接口 11 | type Allocator interface { 12 | Allocate(ids []string, podUID string) error // 增加podUID参数 13 | Deallocate(ids []string) 14 | GetAllocatedDevices() []string 15 | CleanupOrphanedDevices(map[string]bool) 16 | GetPodUID(deviceID string) string // 修改为 string 参数 17 | GetAllocationMap() map[string]string 18 | IsAvailable(id string) bool // 新增方法 19 | } 20 | 21 | // SimpleAllocator 简单的内存分配器实现 22 | type SimpleAllocator struct { 23 | mu sync.RWMutex 24 | allocated map[string]bool // 已分配设备ID 25 | deviceToPod map[string]string // 新增:设备到 Pod 的映射 26 | } 27 | 28 | func NewSimpleAllocator() *SimpleAllocator { 29 | return &SimpleAllocator{ 30 | allocated: make(map[string]bool), 31 | deviceToPod: make(map[string]string), 32 | } 33 | } 34 | 35 | // Allocate 分配设备资源 36 | func (a *SimpleAllocator) Allocate(ids []string, podUID string) error { 37 | a.mu.Lock() 38 | defer a.mu.Unlock() 39 | 40 | // 首先检查所有设备是否可用 41 | for _, id := range ids { 42 | if _, exists := a.allocated[id]; exists { 43 | return ErrDeviceAlreadyAllocated 44 | } 45 | } 46 | 47 | // 然后分配设备 48 | for _, id := range ids { 49 | a.allocated[id] = true 50 | klog.Infof("Device allocated: %s", id) 51 | } 52 | 53 | for _, id := range ids { 54 | a.allocated[id] = true 55 | a.deviceToPod[id] = podUID // 记录设备到 Pod 的映射 56 | klog.Infof("Device allocated: %s to pod %s", id, podUID) 57 | } 58 | 59 | return nil 60 | } 61 | 62 | // 新增方法:获取设备对应的 Pod UID 63 | func (a *SimpleAllocator) GetPodUID(deviceID string) string { 64 | a.mu.RLock() 65 | defer a.mu.RUnlock() 66 | return a.deviceToPod[deviceID] 67 | } 68 | 69 | // Deallocate 释放设备资源 70 | func (a *SimpleAllocator) Deallocate(ids []string) { 71 | a.mu.Lock() 72 | defer a.mu.Unlock() 73 | 74 | for _, id := range ids { 75 | if _, exists := a.allocated[id]; exists { 76 | delete(a.allocated, id) 77 | delete(a.deviceToPod, id) // 清理映射关系 78 | klog.Infof("Device deallocated: %s", id) 79 | } 80 | } 81 | } 82 | 83 | // GetAllocatedDevices 获取所有已分配设备 84 | func (a *SimpleAllocator) GetAllocatedDevices() []string { 85 | a.mu.RLock() 86 | defer a.mu.RUnlock() 87 | 88 | devices := make([]string, 0, len(a.allocated)) 89 | for id := range a.allocated { 90 | devices = append(devices, id) 91 | } 92 | return devices 93 | } 94 | func (a *SimpleAllocator) CleanupOrphanedDevices(discoveredIDs map[string]bool) { 95 | a.mu.Lock() 96 | defer a.mu.Unlock() 97 | 98 | for id := range a.allocated { 99 | if !discoveredIDs[id] { 100 | delete(a.allocated, id) 101 | klog.Warningf("Cleaned orphaned device: %s", id) 102 | } 103 | } 104 | } 105 | 106 | // GetAllocationMap 返回设备分配状态的副本 107 | func (a *SimpleAllocator) GetAllocationMap() map[string]string { 108 | a.mu.RLock() 109 | defer 
a.mu.RUnlock() 110 | 111 | // 返回深拷贝防止并发修改 112 | result := make(map[string]string) 113 | for k, v := range a.deviceToPod { 114 | result[k] = v 115 | } 116 | return result 117 | } 118 | 119 | // IsAvailable 检查设备是否可用(未被分配) 120 | func (a *SimpleAllocator) IsAvailable(deviceID string) bool { 121 | a.mu.RLock() 122 | defer a.mu.RUnlock() 123 | _, exists := a.allocated[deviceID] 124 | return !exists // 如果存在表示已分配,不可用 125 | } 126 | 127 | // 错误定义 128 | var ( 129 | ErrDeviceAlreadyAllocated = errors.New("device already allocated") 130 | ) 131 | -------------------------------------------------------------------------------- /manifests/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: micro-device-plugin 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods"] 8 | verbs: ["list", "watch", "get"] # 增加get权限 9 | 10 | --- 11 | apiVersion: rbac.authorization.k8s.io/v1 12 | kind: ClusterRoleBinding 13 | metadata: 14 | name: micro-device-plugin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: default 18 | namespace: kube-system 19 | roleRef: 20 | kind: ClusterRole 21 | name: micro-device-plugin 22 | apiGroup: rbac.authorization.k8s.io 23 | --- 24 | # 原有DaemonSet保持不变 25 | apiVersion: apps/v1 26 | kind: DaemonSet 27 | metadata: 28 | name: micro-device-plugin 29 | namespace: kube-system 30 | labels: 31 | app: micro-device-plugin 32 | spec: 33 | selector: 34 | matchLabels: 35 | app: micro-device-plugin 36 | template: 37 | metadata: 38 | labels: 39 | app: micro-device-plugin 40 | spec: 41 | hostNetwork: true 42 | containers: 43 | - name: plugin 44 | image: binyue/micro-device-plugin:v1.0.13 # 更新版本 45 | imagePullPolicy: IfNotPresent 46 | securityContext: 47 | privileged: true 48 | resources: 49 | limits: 50 | cpu: 100m 51 | memory: 128Mi 52 | env: 53 | - name: POD_NAME 54 | valueFrom: 55 | fieldRef: 56 | fieldPath: metadata.name 57 | - name: POD_NAMESPACE 58 | valueFrom: 59 | fieldRef: 60 | fieldPath: metadata.namespace 61 | - name: POD_UID 62 | valueFrom: 63 | fieldRef: 64 | fieldPath: metadata.uid 65 | - name: NODE_NAME 66 | valueFrom: 67 | fieldRef: 68 | fieldPath: spec.nodeName 69 | - name: LD_LIBRARY_PATH 70 | value: /usr/lib/x86_64-linux-gnu:/host-lib 71 | # 启用MIG配置 72 | - name: ENABLE_MIG 73 | value: "true" 74 | # 设置切分策略(默认3g.20gb) 75 | - name: MIG_PROFILE 76 | value: "3g.20gb" 77 | # 新增实例数量配置 (可选) 78 | - name: MIG_INSTANCE_COUNT 79 | value: "2" # 显式指定创建4个实例 80 | 81 | # 是否跳过已切分的设备 82 | - name: SKIP_CONFIGURED 83 | value: "true" 84 | - name: CDI_ENABLED 85 | value: "false" # 启用CDI 86 | - name: CDI_PREFIX 87 | value: "micro.device" # CDI前缀 88 | volumeMounts: 89 | - name: device-plugin 90 | mountPath: /var/lib/kubelet/device-plugins 91 | - name: dev 92 | mountPath: /dev 93 | - name: nvidia-bin 94 | mountPath: /host-driver/nvidia-smi 95 | subPath: nvidia-smi 96 | - name: nvidia-lib 97 | mountPath: /host-lib 98 | - name: dev-caps 99 | mountPath: /dev/nvidia-caps 100 | - name: cdi-dir # 新增卷 101 | mountPath: /etc/cdi 102 | 103 | volumes: 104 | - name: device-plugin 105 | hostPath: 106 | path: /var/lib/kubelet/device-plugins 107 | type: DirectoryOrCreate 108 | - name: dev 109 | hostPath: 110 | path: /dev 111 | - name: nvidia-bin 112 | hostPath: 113 | path: /usr/bin 114 | - name: nvidia-lib 115 | hostPath: 116 | path: /usr/lib/x86_64-linux-gnu 117 | - name: dev-caps 118 | hostPath: 119 | path: /dev/nvidia-caps 120 | type: Directory 121 | - name: cdi-dir # 新增卷定义 122 | 
hostPath: 123 | path: /etc/cdi 124 | type: DirectoryOrCreate 125 | -------------------------------------------------------------------------------- /pkg/deviceplugin/server.go: -------------------------------------------------------------------------------- 1 | package deviceplugin 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "os" 8 | "path" 9 | "strings" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/benyuereal/micro-device-plugin/pkg/allocator" 14 | "github.com/benyuereal/micro-device-plugin/pkg/device" 15 | "google.golang.org/grpc" 16 | 17 | corev1 "k8s.io/api/core/v1" 18 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | "k8s.io/client-go/kubernetes" 20 | "k8s.io/client-go/rest" 21 | "k8s.io/klog/v2" 22 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 23 | ) 24 | 25 | const ( 26 | socketPrefix = "microui.sock" 27 | kubeletSocket = pluginapi.KubeletSocket 28 | restartDelay = 5 * time.Second 29 | ) 30 | 31 | type DevicePluginServer struct { 32 | vendor string 33 | resource string 34 | socket string 35 | stop chan struct{} 36 | healthChan chan string 37 | allocator allocator.Allocator 38 | manager device.DeviceManager 39 | server *grpc.Server 40 | lastDeviceState map[string]string // 使用字符串记录健康状态 41 | deviceMap map[string]device.GPUDevice // 设备ID到设备对象的映射 42 | cdiEnabled bool 43 | cdiPrefix string // 添加CDI前缀配置 44 | kubeClient *kubernetes.Clientset // 新增 Kubernetes 客户端 45 | nodeName string // 新增节点名称 46 | } 47 | 48 | func New(vendor string, manager device.DeviceManager, cdiEnabled bool, cdiPrefix string, nodeName string) *DevicePluginServer { 49 | // 创建 Kubernetes 客户端 50 | config, _ := rest.InClusterConfig() 51 | kubeClient, _ := kubernetes.NewForConfig(config) 52 | return &DevicePluginServer{ 53 | vendor: vendor, 54 | resource: vendor + ".com/microgpu", 55 | socket: path.Join(pluginapi.DevicePluginPath, socketPrefix+"."+vendor), 56 | stop: make(chan struct{}), 57 | healthChan: make(chan string, 1), 58 | manager: manager, 59 | allocator: allocator.NewSimpleAllocator(), 60 | lastDeviceState: make(map[string]string), 61 | deviceMap: make(map[string]device.GPUDevice), 62 | cdiEnabled: cdiEnabled, 63 | cdiPrefix: cdiPrefix, 64 | kubeClient: kubeClient, 65 | nodeName: nodeName, 66 | } 67 | } 68 | 69 | // ListAndWatch 实现设备插件服务 70 | func (s *DevicePluginServer) ListAndWatch(_ *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error { 71 | klog.Infof("Starting ListAndWatch for %s device plugin", s.vendor) 72 | 73 | // 初始设备列表 74 | if err := s.updateDeviceList(stream); err != nil { 75 | return err 76 | } 77 | 78 | // 定时更新和健康检查 79 | ticker := time.NewTicker(10 * time.Second) 80 | defer ticker.Stop() 81 | 82 | for { 83 | select { 84 | case <-ticker.C: 85 | klog.V(5).Infof("Periodic device list update for %s", s.vendor) 86 | if err := s.updateDeviceList(stream); err != nil { 87 | return err 88 | } 89 | case id := <-s.healthChan: 90 | klog.Warningf("Device %s health status changed, updating device list", id) 91 | if err := s.updateDeviceList(stream); err != nil { 92 | return err 93 | } 94 | case <-s.stop: 95 | klog.Infof("Stopping ListAndWatch for %s device plugin", s.vendor) 96 | return nil 97 | } 98 | } 99 | } 100 | 101 | func (s *DevicePluginServer) updateDeviceList(stream pluginapi.DevicePlugin_ListAndWatchServer) error { 102 | devices, err := s.manager.DiscoverGPUs() 103 | if err != nil { 104 | klog.Errorf("Failed to discover devices: %v", err) 105 | return fmt.Errorf("failed to discover devices: %v", err) 106 | } 107 | // 新增:清理已消失设备的分配状态 108 | discoveredIDs := 
make(map[string]bool) 109 | for _, d := range devices { 110 | discoveredIDs[d.ID()] = true 111 | } 112 | s.allocator.CleanupOrphanedDevices(discoveredIDs) 113 | 114 | // 修复:在更新设备列表时重建deviceMap 115 | newDeviceMap := make(map[string]device.GPUDevice) 116 | for _, d := range devices { 117 | newDeviceMap[d.ID()] = d 118 | } 119 | s.deviceMap = newDeviceMap 120 | klog.Infof("Discovered %d new devices, deviceMap %v", len(newDeviceMap), newDeviceMap) 121 | 122 | deviceList := make([]*pluginapi.Device, len(devices)) 123 | healthStatusCount := map[string]int{ 124 | pluginapi.Healthy: 0, 125 | pluginapi.Unhealthy: 0} 126 | 127 | for i, d := range devices { 128 | // 更新设备健康状态 129 | healthy := s.manager.CheckHealth(d.ID()) 130 | state := pluginapi.Healthy 131 | if !healthy { 132 | state = pluginapi.Unhealthy 133 | } 134 | healthStatusCount[state]++ 135 | 136 | // 记录状态变化 137 | if prevState, exists := s.lastDeviceState[d.ID()]; exists && prevState != state { 138 | klog.Infof("Device %s health changed from %s to %s", d.ID(), prevState, state) 139 | } 140 | s.lastDeviceState[d.ID()] = state 141 | 142 | deviceList[i] = &pluginapi.Device{ 143 | ID: d.ID(), 144 | Health: state, 145 | } 146 | } 147 | 148 | klog.Infof("Updating device list for %s: %d devices (%d healthy, %d unhealthy)", 149 | s.vendor, len(deviceList), healthStatusCount[pluginapi.Healthy], healthStatusCount[pluginapi.Unhealthy]) 150 | 151 | return stream.Send(&pluginapi.ListAndWatchResponse{Devices: deviceList}) 152 | } 153 | 154 | // Allocate 设备分配实现 - 生产级MIG支持 155 | func (s *DevicePluginServer) Allocate(ctx context.Context, req *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 156 | klog.Infof("Received Allocate request for %s: %v", s.resource, req.ContainerRequests) 157 | response := pluginapi.AllocateResponse{} 158 | 159 | // 修复:从请求的注解中获取 Pod UID(Kubernetes 标准方式) 160 | // 方法1: 尝试从环境变量获取 Pod 信息 161 | podName := os.Getenv("POD_NAME") 162 | podNamespace := os.Getenv("POD_NAMESPACE") 163 | podUID := "" 164 | if podName != "" && podNamespace != "" { 165 | pod, err := s.kubeClient.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{}) 166 | if err != nil { 167 | klog.Warningf("Failed to get pod %s/%s: %v", podNamespace, podName, err) 168 | } else { 169 | podUID = string(pod.UID) 170 | klog.Infof("Found pod UID via API: %s", podUID) 171 | } 172 | } 173 | 174 | for _, containerReq := range req.ContainerRequests { 175 | containerResp := new(pluginapi.ContainerAllocateResponse) 176 | 177 | // 获取 Pod UI 178 | // 尝试分配这些设备 179 | // 在分配设备前检查设备是否可用 180 | for _, devID := range containerReq.DevicesIDs { 181 | if !s.allocator.IsAvailable(devID) { 182 | // 如果设备已被分配但Pod不存在,清除错误状态 183 | if !s.isPodActive(s.allocator.GetPodUID(devID)) { 184 | s.allocator.Deallocate([]string{devID}) 185 | } else { 186 | return nil, fmt.Errorf("device %s is already allocated", devID) 187 | } 188 | } 189 | } 190 | 191 | if err := s.allocator.Allocate(containerReq.DevicesIDs, podUID); err != nil { 192 | klog.Errorf("Allocation failed for devices %v: %v", containerReq.DevicesIDs, err) 193 | return nil, fmt.Errorf("allocation failed: %v", err) 194 | } 195 | 196 | // ================= 核心环境变量设置 ================= 197 | envs := make(map[string]string) 198 | 199 | // 关键修改:使用物理索引而非设备ID 200 | envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(containerReq.DevicesIDs, ",") 201 | envs["NVIDIA_DRIVER_CAPABILITIES"] = "compute,utility,video,graphics" 202 | envs["NVIDIA_DISABLE_REQUIRE"] = "1" 203 | envs["NVIDIA_REQUIRE_MIG"] = "1" 204 | 205 | containerResp.Envs = envs 206 
| 207 | // 打印环境变量用于调试 208 | for k, v := range containerResp.Envs { 209 | klog.Infof("Setting env: %s=%s", k, v) 210 | } 211 | 212 | // 添加 CDI 设备注入 213 | if s.cdiEnabled { 214 | cdiDevices := make([]string, len(containerReq.DevicesIDs)) 215 | for i, id := range containerReq.DevicesIDs { 216 | cdiDevices[i] = fmt.Sprintf("%s/%s=%s", s.cdiPrefix, s.vendor, id) 217 | } 218 | containerResp.CDIDevices = []*pluginapi.CDIDevice{ 219 | { 220 | Name: strings.Join(cdiDevices, ","), 221 | }, 222 | } 223 | } 224 | 225 | response.ContainerResponses = append(response.ContainerResponses, containerResp) 226 | } 227 | 228 | klog.Infof("Allocation successful for %s, req :%v, resp: %v", s.resource, req.ContainerRequests, 229 | response.ContainerResponses) 230 | return &response, nil 231 | } 232 | 233 | func (s *DevicePluginServer) isMIGDevice(id string) bool { 234 | devices, _ := s.manager.DiscoverGPUs() 235 | for _, d := range devices { 236 | if d.ID() == id && d.IsMIG() { 237 | return true 238 | } 239 | } 240 | return false 241 | } 242 | 243 | // GetDevicePluginOptions 插件选项 244 | func (s *DevicePluginServer) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 245 | return &pluginapi.DevicePluginOptions{ 246 | PreStartRequired: false, 247 | }, nil 248 | } 249 | 250 | // PreStartContainer 容器启动前预处理(可选) 251 | func (s *DevicePluginServer) PreStartContainer(ctx context.Context, req *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 252 | return &pluginapi.PreStartContainerResponse{}, nil 253 | } 254 | 255 | // GetPreferredAllocation 分配偏好(可选) 256 | func (s *DevicePluginServer) GetPreferredAllocation(ctx context.Context, req *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { 257 | return &pluginapi.PreferredAllocationResponse{}, nil 258 | } 259 | 260 | // *********** 服务管理方法 *********** 261 | 262 | // Start 启动设备插件服务 263 | func (s *DevicePluginServer) Start(ctx context.Context) error { 264 | klog.Infof("Starting %s device plugin", s.vendor) 265 | 266 | // 启动资源回收器(每 30 秒运行一次) 267 | go s.ResourceRecycler(ctx, 30*time.Second) // 共享主流程上下文 268 | // 如果是NVIDIA设备,配置MIG 269 | if nvidiaManager, ok := s.manager.(*device.NVIDIAManager); ok { 270 | nvidiaManager.ConfigureMIG() 271 | } 272 | 273 | // 确保插件目录存在 274 | if err := os.MkdirAll(pluginapi.DevicePluginPath, 0755); err != nil { 275 | klog.Errorf("Failed to create device plugin directory: %v", err) 276 | return fmt.Errorf("failed to create device plugin directory: %v", err) 277 | } 278 | 279 | // 清理现有的socket文件 280 | if err := syscall.Unlink(s.socket); err != nil && !os.IsNotExist(err) { 281 | klog.Errorf("Failed to unlink socket: %v", err) 282 | return fmt.Errorf("failed to unlink socket: %v", err) 283 | } 284 | 285 | // 创建监听 286 | lis, err := net.Listen("unix", s.socket) 287 | if err != nil { 288 | klog.Errorf("Failed to listen on socket: %v", err) 289 | return fmt.Errorf("failed to listen on socket: %v", err) 290 | } 291 | 292 | // 创建gRPC服务 293 | s.server = grpc.NewServer() 294 | pluginapi.RegisterDevicePluginServer(s.server, s) 295 | 296 | // 启动gRPC服务 297 | go func() { 298 | klog.Infof("Starting %s device plugin server at: %s", s.vendor, s.socket) 299 | if err := s.server.Serve(lis); err != nil { 300 | klog.Fatalf("Device plugin server failed: %v", err) 301 | } 302 | }() 303 | 304 | // 等待服务器启动 305 | connCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 306 | defer cancel() 307 | 308 | if err := waitForSocket(connCtx, 
s.socket); err != nil { 309 | klog.Errorf("Failed to start gRPC server: %v", err) 310 | return fmt.Errorf("failed to start gRPC server: %v", err) 311 | } 312 | 313 | // 注册到kubelet 314 | if err := s.registerWithKubelet(); err != nil { 315 | klog.Errorf("Failed to register with kubelet: %v", err) 316 | return fmt.Errorf("failed to register with kubelet: %v", err) 317 | } 318 | 319 | klog.Infof("%s device plugin started and registered with resource name %s", s.vendor, s.resource) 320 | 321 | return nil 322 | } 323 | 324 | // Stop 停止设备插件 325 | func (s *DevicePluginServer) Stop() { 326 | klog.Infof("Stopping %s device plugin", s.vendor) 327 | close(s.stop) 328 | if s.server != nil { 329 | s.server.Stop() 330 | } 331 | } 332 | 333 | // HealthCheck 后台健康检查 334 | func (s *DevicePluginServer) HealthCheck(ctx context.Context, interval time.Duration) { 335 | klog.Infof("Starting health check for %s plugin with interval %v", s.vendor, interval) 336 | ticker := time.NewTicker(interval) 337 | defer ticker.Stop() 338 | 339 | for { 340 | select { 341 | case <-ticker.C: 342 | devices, err := s.manager.DiscoverGPUs() 343 | if err != nil { 344 | klog.Errorf("Failed to discover devices during health check: %v", err) 345 | continue 346 | } 347 | 348 | for _, d := range devices { 349 | currentHealth := d.IsHealthy() 350 | actualHealth := s.manager.CheckHealth(d.ID()) 351 | 352 | if currentHealth != actualHealth { 353 | klog.Warningf("Device %s health status changed from %v to %v", d.ID(), currentHealth, actualHealth) 354 | s.healthChan <- d.ID() 355 | } 356 | } 357 | case <-ctx.Done(): 358 | klog.Infof("Stopping health check for %s plugin", s.vendor) 359 | return 360 | } 361 | } 362 | } 363 | 364 | // *********** 辅助方法 *********** 365 | 366 | func (s *DevicePluginServer) registerWithKubelet() error { 367 | klog.Infof("Registering with kubelet at %s", kubeletSocket) 368 | 369 | conn, err := grpc.Dial(kubeletSocket, grpc.WithInsecure(), 370 | grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { 371 | return (&net.Dialer{}).DialContext(ctx, "unix", addr) 372 | }), 373 | ) 374 | 375 | if err != nil { 376 | return fmt.Errorf("failed to connect to kubelet: %v", err) 377 | } 378 | defer conn.Close() 379 | 380 | client := pluginapi.NewRegistrationClient(conn) 381 | req := &pluginapi.RegisterRequest{ 382 | Version: pluginapi.Version, 383 | Endpoint: path.Base(s.socket), 384 | ResourceName: s.resource, 385 | } 386 | 387 | _, err = client.Register(context.Background(), req) 388 | return err 389 | } 390 | 391 | func waitForSocket(ctx context.Context, socket string) error { 392 | klog.V(4).Infof("Waiting for socket %s to be ready", socket) 393 | 394 | for { 395 | select { 396 | case <-ctx.Done(): 397 | return ctx.Err() 398 | default: 399 | if conn, err := net.Dial("unix", socket); err == nil { 400 | conn.Close() 401 | klog.V(4).Infof("Socket %s is ready", socket) 402 | return nil 403 | } 404 | time.Sleep(restartDelay) 405 | } 406 | } 407 | } 408 | 409 | // 新增方法:资源回收器 410 | func (s *DevicePluginServer) ResourceRecycler(ctx context.Context, interval time.Duration) { 411 | klog.Infof("Starting resource recycler for %s plugin", s.vendor) 412 | ticker := time.NewTicker(interval) 413 | defer ticker.Stop() 414 | 415 | for { 416 | select { 417 | case <-ticker.C: 418 | 419 | allocatedMap := s.allocator.GetAllocationMap() // 获取设备到 Pod 的映射 420 | if len(allocatedMap) == 0 { 421 | continue 422 | } 423 | 424 | // 检查已分配设备对应的 Pod 425 | var toRelease []string 426 | for deviceID, podUID := range allocatedMap { 427 
| if podUID == "" { 428 | toRelease = append(toRelease, deviceID) // 无主设备直接释放 429 | continue 430 | } 431 | 432 | // 检查 Pod 状态:只有非活动状态(终止/完成)才释放 433 | if !s.isPodActive(podUID) { 434 | toRelease = append(toRelease, deviceID) 435 | klog.Infof("Marking device %s for release (pod %s is inactive)", deviceID, podUID) 436 | } 437 | } 438 | 439 | // 释放资源 440 | if len(toRelease) > 0 { 441 | s.allocator.Deallocate(toRelease) 442 | klog.Infof("Released %d orphaned devices, deivce %v", len(toRelease), toRelease) 443 | } 444 | 445 | case <-ctx.Done(): 446 | klog.Infof("Stopping resource recycler for %s plugin", s.vendor) 447 | return 448 | } 449 | } 450 | } 451 | 452 | // isPodActive 检查 Pod 是否处于活动状态(非终止/完成) 453 | func (s *DevicePluginServer) isPodActive(podUID string) bool { 454 | if podUID == "" { 455 | return false 456 | } 457 | pod, err := s.kubeClient.CoreV1().Pods("").Get(context.Background(), "", metav1.GetOptions{}) 458 | if err != nil { 459 | klog.Warningf("Failed to get pod with UID %s: %v", podUID, err) 460 | return false // 默认按非活动处理 461 | } 462 | if pod.DeletionTimestamp != nil { 463 | return false // 正在终止,视为非活动 464 | } 465 | 466 | // 活动状态:Running 或 Pending 467 | if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodPending { 468 | return true 469 | } 470 | // 非活动状态:Succeeded(完成)、Failed(失败)或正在删除(DeletionTimestamp 非空) 471 | return false 472 | } 473 | -------------------------------------------------------------------------------- /pkg/device/nvidia.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "k8s.io/klog/v2" 14 | ) 15 | 16 | type NVIDIADevice struct { 17 | id string 18 | deviceIndex string // 系统设备索引 19 | physicalID string // 物理GPU ID 20 | migEnabled bool // 是否为MIG设备 21 | profile string // MIG配置类型 22 | healthy bool 23 | } 24 | 25 | func (d *NVIDIADevice) ID() string { return d.id } 26 | func (d *NVIDIADevice) IsHealthy() bool { return d.healthy } 27 | func (d *NVIDIADevice) GetVendor() string { return "nvidia" } 28 | 29 | // device/nvidia.go 30 | func (d *NVIDIADevice) GetPath() string { 31 | if d.migEnabled { 32 | // 生成设备节点名称(如 nvidia-cap12) 33 | return fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%s", d.physicalID) 34 | } 35 | return "/dev/nvidia" + d.physicalID 36 | } 37 | func (d *NVIDIADevice) IsMIG() bool { return d.migEnabled } 38 | func (d *NVIDIADevice) PhysicalID() string { // 对于MIG设备返回物理GPU索引(如"0") 39 | if d.migEnabled { 40 | return d.physicalID 41 | } 42 | return d.deviceIndex 43 | } 44 | func (d *NVIDIADevice) Profile() string { return d.profile } 45 | 46 | type NVIDIAManager struct { 47 | lastDiscovery time.Time 48 | devices []GPUDevice 49 | deviceMap map[string]*NVIDIADevice // 设备ID到设备对象的映射 50 | discoverySync sync.Mutex 51 | migManager *MIGManager 52 | } 53 | 54 | // 初始化MIG管理器 55 | func NewNVIDIAManager() *NVIDIAManager { 56 | return &NVIDIAManager{ 57 | migManager: NewMIGManager(), 58 | deviceMap: make(map[string]*NVIDIADevice), 59 | } 60 | } 61 | 62 | // 获取nvidia-smi的路径 63 | func getNvidiaSmiPath() string { 64 | if customPath := os.Getenv("NVIDIA_SMI_PATH"); customPath != "" { 65 | klog.V(4).Infof("Using custom NVIDIA-SMI path: %s", customPath) 66 | return customPath 67 | } 68 | return "/host-driver/nvidia-smi" 69 | } 70 | 71 | // 确保命令使用正确的库路径 72 | func runNvidiaSmiCommand(args ...string) ([]byte, error) { 73 | cmd := exec.Command(getNvidiaSmiPath(), args...) 
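	// The env overrides below let the host-mounted nvidia-smi (see getNvidiaSmiPath)
	// resolve its driver libraries from /host-lib, which manifests/daemonset.yaml mounts
	// from the node's /usr/lib/x86_64-linux-gnu, so the plugin image does not have to
	// ship the NVIDIA driver itself.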
74 | cmd.Env = append(os.Environ(), 75 | "LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/host-lib", 76 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 77 | ) 78 | klog.Infof("Executing NVIDIA-SMI command: %v", cmd.Args) 79 | return cmd.CombinedOutput() 80 | } 81 | 82 | func (m *NVIDIAManager) DiscoverGPUs() ([]GPUDevice, error) { 83 | m.discoverySync.Lock() 84 | defer m.discoverySync.Unlock() 85 | 86 | // 使用缓存机制 87 | if time.Since(m.lastDiscovery) < 5*time.Minute && m.devices != nil { 88 | klog.V(4).Infof("Using cached NVIDIA devices (last discovery: %s)", m.lastDiscovery) 89 | return m.devices, nil 90 | } 91 | 92 | klog.Info("Discovering NVIDIA devices") 93 | 94 | // 重置设备映射 95 | m.deviceMap = make(map[string]*NVIDIADevice) 96 | var devices []GPUDevice 97 | 98 | // 步骤1: 获取所有GPU设备列表 99 | out, err := runNvidiaSmiCommand("--query-gpu=index,uuid,memory.total,mig.mode.current", "--format=csv,noheader") 100 | if err != nil { 101 | klog.Errorf("Failed to discover NVIDIA GPUs: %v", err) 102 | return nil, err 103 | } 104 | 105 | lines := strings.Split(strings.TrimSpace(string(out)), "\n") 106 | for _, line := range lines { 107 | fields := strings.Split(line, ",") 108 | if len(fields) < 4 { 109 | continue 110 | } 111 | 112 | gpuIndex := strings.TrimSpace(fields[0]) 113 | gpuUUID := strings.TrimSpace(fields[1]) 114 | migMode := strings.TrimSpace(fields[3]) 115 | 116 | // 步骤2: 检查MIG模式 117 | if migMode == "Enabled" && os.Getenv("ENABLE_MIG") == "true" { 118 | 119 | // 获取MIG设备 120 | migDevices, err := m.discoverMIGDevices(gpuIndex) 121 | if err != nil { 122 | klog.Errorf("Failed to discover MIG devices for GPU %s: %v", gpuIndex, err) 123 | continue 124 | } 125 | devices = append(devices, migDevices...) 126 | } else { 127 | // 普通GPU设备 128 | device := &NVIDIADevice{ 129 | id: gpuUUID, 130 | deviceIndex: gpuIndex, 131 | physicalID: gpuIndex, 132 | migEnabled: false, 133 | healthy: true, 134 | } 135 | devices = append(devices, device) 136 | m.deviceMap[gpuUUID] = device 137 | } 138 | } 139 | 140 | klog.Infof("Discovered %d NVIDIA devices", len(devices)) 141 | for _, d := range devices { 142 | nvDevice := d.(*NVIDIADevice) 143 | klog.Infof("NVIDIA Device: ID=%s, Index=%s, MIG=%v, Profile=%s", 144 | nvDevice.ID(), nvDevice.deviceIndex, nvDevice.IsMIG(), nvDevice.Profile()) 145 | } 146 | 147 | m.devices = devices 148 | m.lastDiscovery = time.Now() 149 | return devices, nil 150 | } 151 | 152 | // 发现MIG设备 153 | func (m *NVIDIAManager) discoverMIGDevices(gpuIndex string) ([]GPUDevice, error) { 154 | var devices []GPUDevice 155 | 156 | // 查询GPU实例(GPU Instances) 157 | out, err := runNvidiaSmiCommand("mig", "-lgi", "-i", gpuIndex) 158 | output := strings.TrimSpace(string(out)) 159 | 160 | // 处理无GPU实例的情况 161 | if strings.Contains(output, "No GPU instances found") { 162 | klog.Infof("No MIG GPU instances found on GPU %s", gpuIndex) 163 | return devices, nil 164 | } 165 | 166 | if err != nil { 167 | klog.Errorf("Failed to query GPU instances for GPU %s: %v", gpuIndex, err) 168 | return nil, err 169 | } 170 | 171 | uuids, err := m.getMIGDeviceUUIDs(gpuIndex) 172 | 173 | for index, uuid := range uuids { 174 | // 创建设备ID: GPUIndex-GI-CI 175 | 176 | klog.Infof("Device ID: %s", uuid) 177 | device := &NVIDIADevice{ 178 | id: uuid, 179 | deviceIndex: string(rune(index)), // 使用GPU实例ID作为设备索引 180 | physicalID: gpuIndex, 181 | migEnabled: true, 182 | profile: "3g.20gb", 183 | healthy: true, 184 | } 185 | klog.Infof("device: %v", device) 186 | devices = append(devices, device) 187 | m.deviceMap[uuid] = device 188 
| 189 | klog.Infof("Found device: %v", device) 190 | } 191 | 192 | return devices, nil 193 | } 194 | 195 | // 获取指定GPU上的MIG设备UUID 196 | func (m *NVIDIAManager) getMIGDeviceUUIDs(gpuIndex string) ([]string, error) { 197 | // 使用nvidia-smi -L命令获取所有GPU信息 198 | out, err := runNvidiaSmiCommand("-L") 199 | if err != nil { 200 | return nil, fmt.Errorf("failed to get MIG UUIDs: %v", err) 201 | } 202 | 203 | output := strings.TrimSpace(string(out)) 204 | lines := strings.Split(output, "\n") 205 | 206 | var uuids []string 207 | currentGPU := "" 208 | 209 | for _, line := range lines { 210 | // 匹配GPU行 211 | if strings.HasPrefix(line, "GPU "+gpuIndex+":") { 212 | currentGPU = gpuIndex 213 | continue 214 | } 215 | 216 | // 匹配MIG设备行 217 | if currentGPU == gpuIndex && strings.Contains(line, "MIG") && strings.Contains(line, "UUID") { 218 | parts := strings.Split(line, "UUID:") 219 | if len(parts) >= 2 { 220 | uuid := strings.TrimSpace(parts[1]) 221 | // 移除末尾的括号 222 | uuid = strings.TrimSuffix(uuid, ")") 223 | uuids = append(uuids, uuid) 224 | } 225 | } 226 | } 227 | 228 | klog.Infof("Found %d MIG UUIDs for GPU %s: %v", len(uuids), gpuIndex, uuids) 229 | return uuids, nil 230 | } 231 | 232 | func (m *NVIDIAManager) getProfileName(profileID string) (string, error) { 233 | // 查询所有可用profile 234 | out, err := runNvidiaSmiCommand("mig", "-lgip") 235 | if err != nil { 236 | return "", err 237 | } 238 | 239 | lines := strings.Split(strings.TrimSpace(string(out)), "\n") 240 | for _, line := range lines { 241 | if strings.Contains(line, profileID) { 242 | // 示例行: " 19 4 4 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1g.10gb" 243 | fields := strings.Fields(line) 244 | if len(fields) > 0 { 245 | // 最后一个字段是profile名称 246 | return fields[len(fields)-1], nil 247 | } 248 | } 249 | } 250 | return "unknown", fmt.Errorf("profile not found for ID %s", profileID) 251 | } 252 | 253 | // 健康检查 254 | func (m *NVIDIAManager) CheckHealth(deviceID string) bool { 255 | klog.V(5).Infof("Checking health of NVIDIA device %s", deviceID) 256 | 257 | // 从设备映射中获取设备 258 | device, exists := m.deviceMap[deviceID] 259 | if !exists { 260 | klog.Warningf("Device %s not found in device map", deviceID) 261 | return false 262 | } 263 | 264 | // 对于MIG设备,检查其物理GPU的健康 265 | targetID := deviceID 266 | if device.IsMIG() { 267 | targetID = device.PhysicalID() 268 | } 269 | 270 | // 使用更通用的健康检查方式 271 | out, err := runNvidiaSmiCommand("-i", targetID, "--query-gpu=utilization.gpu", "--format=csv,noheader") 272 | if err != nil { 273 | klog.Errorf("Failed to check health for NVIDIA device %s: %v", targetID, err) 274 | return false 275 | } 276 | 277 | // 如果能够获取到GPU利用率数据,则认为设备健康 278 | utilization := strings.TrimSpace(string(out)) 279 | if utilization != "" { 280 | klog.V(4).Infof("NVIDIA device %s is healthy (utilization: %s%%)", targetID, utilization) 281 | return true 282 | } 283 | 284 | return false 285 | } 286 | 287 | // MIG管理功能 288 | func (m *NVIDIAManager) ConfigureMIG() { 289 | klog.Info("Configuring MIG devices") 290 | m.migManager.Configure() 291 | } 292 | 293 | // MIG管理器 294 | type MIGManager struct { 295 | enabled bool 296 | profile string 297 | skipConfigured bool 298 | instanceCount int // 每个GPU上要创建的实例数 299 | gpuMemory uint64 // GPU显存大小(MB) 300 | } 301 | 302 | func NewMIGManager() *MIGManager { 303 | enabled := os.Getenv("ENABLE_MIG") == "true" 304 | profile := os.Getenv("MIG_PROFILE") 305 | if profile == "" { 306 | profile = "3g.20gb" // 默认20GB切分策略 307 | } 308 | 309 | skipConfigured := os.Getenv("SKIP_CONFIGURED") == "true" 310 | 311 | // 
读取实例数量配置 312 | instanceCount := 0 // 0表示自动计算 313 | if countStr := os.Getenv("MIG_INSTANCE_COUNT"); countStr != "" { 314 | if count, err := strconv.Atoi(countStr); err == nil { 315 | instanceCount = count 316 | } 317 | } 318 | 319 | return &MIGManager{ 320 | enabled: enabled, 321 | profile: profile, 322 | skipConfigured: skipConfigured, 323 | instanceCount: instanceCount, 324 | } 325 | } 326 | 327 | func (m *MIGManager) Configure() { 328 | 329 | klog.Info("MIG configuration is in process ") 330 | 331 | if !m.enabled { 332 | klog.Info("MIG configuration is disabled") 333 | return 334 | } 335 | 336 | klog.Infof("Starting MIG configuration with profile: %s", m.profile) 337 | 338 | // 检查设备是否支持MIG 339 | if supported, err := m.isMigSupported(); err != nil { 340 | klog.Errorf("Failed to check MIG support: %v", err) 341 | return 342 | } else if !supported { 343 | klog.Warning("MIG is not supported on this device. Skipping MIG configuration.") 344 | return 345 | } 346 | 347 | // 2. 创建MIG设备 fixme 先不要创建设备 348 | //if err := m.createMIGDevices(); err != nil { 349 | // klog.Errorf("Failed to create MIG devices: %v", err) 350 | //} 351 | } 352 | 353 | // 检查设备是否支持MIG 354 | func (m *MIGManager) isMigSupported() (bool, error) { 355 | // 检查MIG支持状态 356 | out, err := runNvidiaSmiCommand("mig", "-lgip") 357 | output := strings.TrimSpace(string(out)) 358 | 359 | // 先检查特定不支持信息 360 | if strings.Contains(output, "No MIG-supported devices found") { 361 | klog.V(4).Info("MIG not supported: No MIG-supported devices found") 362 | return false, nil 363 | } 364 | 365 | // 检查其他不支持情况 366 | if strings.Contains(output, "not supported") { 367 | klog.V(4).Infof("MIG not supported: %s", output) 368 | return false, nil 369 | } 370 | 371 | // 处理命令错误 372 | if err != nil { 373 | klog.V(4).Infof("MIG command failed: %s", output) 374 | return false, fmt.Errorf("MIG command failed: %v", err) 375 | } 376 | 377 | // 检查有效输出(应该包含设备信息) 378 | if len(output) > 0 && !strings.Contains(output, "error") { 379 | klog.V(4).Infof("MIG supported devices found: %s", output) 380 | return true, nil 381 | } 382 | 383 | klog.V(4).Infof("Unknown MIG support status: %s", output) 384 | return false, nil 385 | } 386 | 387 | func (m *MIGManager) enableMIGMode() error { 388 | out, err := runNvidiaSmiCommand("--enable-mig") 389 | if err != nil { 390 | return err 391 | } 392 | klog.V(4).Infof("MIG enable output: %s", string(out)) 393 | return nil 394 | } 395 | 396 | // 获取GPU显存大小 397 | func (m *MIGManager) getGPUMemory(gpuIndex string) (uint64, error) { 398 | out, err := runNvidiaSmiCommand("-i", gpuIndex, "--query-gpu=memory.total", "--format=csv,noheader,nounits") 399 | if err != nil { 400 | return 0, err 401 | } 402 | 403 | memoryStr := strings.TrimSpace(string(out)) 404 | memoryMB, err := strconv.ParseUint(memoryStr, 10, 64) 405 | if err != nil { 406 | return 0, fmt.Errorf("failed to parse GPU memory: %v", err) 407 | } 408 | 409 | return memoryMB, nil 410 | } 411 | 412 | // 从profile中提取显存需求 (GB) 413 | func (m *MIGManager) getProfileMemoryReq() uint64 { 414 | parts := strings.Split(m.profile, ".") 415 | if len(parts) < 2 { 416 | return 0 417 | } 418 | 419 | memPart := parts[1] 420 | if strings.HasSuffix(memPart, "gb") { 421 | memPart = strings.TrimSuffix(memPart, "gb") 422 | } else if strings.HasSuffix(memPart, "g") { 423 | memPart = strings.TrimSuffix(memPart, "g") 424 | } 425 | 426 | memGB, err := strconv.ParseUint(memPart, 10, 64) 427 | if err != nil { 428 | klog.Warningf("Failed to parse memory requirement from profile %s: %v", m.profile, err) 429 | return 0 
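		// (A zero return propagates to createMIGDevices, where maxInstances stays 0,
		// createCount ends up 0 and the GPU is skipped with "Cannot determine instance
		// count", so a malformed MIG_PROFILE never triggers instance creation.)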
430 |     }
431 | 
432 |     return memGB * 1024 // convert GB to MB
433 | }
434 | 
435 | /*
436 |  * Reference: NVIDIA MIG User Guide
437 |  * https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html
438 |  */
439 | func (m *MIGManager) createMIGDevices() error {
440 |     // List all GPU indexes on the node
441 |     out, err := runNvidiaSmiCommand("--query-gpu=index", "--format=csv,noheader")
442 |     if err != nil {
443 |         return err
444 |     }
445 | 
446 |     gpuIndexes := regexp.MustCompile(`\d+`).FindAllString(string(out), -1)
447 |     for _, index := range gpuIndexes {
448 |         // Check whether MIG mode is already enabled
449 |         out, err := runNvidiaSmiCommand("-i", index, "--query-gpu=mig.mode.current", "--format=csv,noheader")
450 |         if err != nil {
451 |             klog.Errorf("Failed to check MIG status for GPU %s: %v", index, err)
452 |             continue
453 |         }
454 | 
455 |         currentMode := strings.TrimSpace(string(out))
456 |         if currentMode != "Enabled" {
457 |             // Enable MIG mode; a GPU reset may be required before the change takes effect
458 |             if _, err := runNvidiaSmiCommand("-i", index, "-mig", "1"); err != nil {
459 |                 klog.Errorf("Failed to enable MIG for GPU %s: %v", index, err)
460 |                 continue
461 |             }
462 |             klog.Infof("Enabled MIG mode for GPU %s", index)
463 |         } else {
464 |             klog.Infof("GPU %s already in MIG mode", index)
465 |         }
466 | 
467 |         // Check for existing MIG devices
468 |         count, err := m.getMIGDeviceCount(index)
469 |         if err != nil {
470 |             klog.Errorf("Failed to get MIG device count for GPU %s: %v", index, err)
471 |             continue
472 |         }
473 | 
474 |         // Skip creation if the GPU is already partitioned and skipConfigured is set
475 |         if count > 0 && m.skipConfigured {
476 |             klog.Infof("Skipping GPU %s (already has %d MIG devices)", index, count)
477 |             continue
478 |         }
479 | 
480 |         // Otherwise destroy the existing compute and GPU instances first
481 |         if count > 0 {
482 |             klog.Infof("Destroying existing MIG devices on GPU %s", index)
483 |             if _, err := runNvidiaSmiCommand("mig", "-i", index, "-dci"); err != nil {
484 |                 klog.Errorf("Failed to destroy compute instances on GPU %s: %v", index, err)
485 |             }
486 |             if _, err := runNvidiaSmiCommand("mig", "-i", index, "-dgi"); err != nil {
487 |                 klog.Errorf("Failed to destroy GPU instances on GPU %s: %v", index, err)
488 |             }
489 |             time.Sleep(2 * time.Second) // wait for resources to be released
490 |         }
491 | 
492 |         // Query the GPU's total memory
493 |         totalMemory, err := m.getGPUMemory(index)
494 |         if err != nil {
495 |             klog.Errorf("Failed to get GPU memory for %s: %v", index, err)
496 |             continue
497 |         }
498 | 
499 |         // Estimate the maximum instance count from memory (an upper bound; the driver may allow fewer)
500 |         profileMem := m.getProfileMemoryReq()
501 |         maxInstances := 0
502 | 
503 |         if profileMem > 0 {
504 |             maxInstances = int(totalMemory / profileMem)
505 |             if maxInstances == 0 {
506 |                 klog.Warningf("GPU %s has insufficient memory (%dMB) for profile %s (%dMB required)",
507 |                     index, totalMemory, m.profile, profileMem)
508 |                 continue
509 |             }
510 |         }
511 | 
512 |         // Decide how many instances to create
513 |         createCount := maxInstances
514 |         if m.instanceCount > 0 {
515 |             if m.instanceCount > maxInstances {
516 |                 klog.Warningf("Requested %d instances exceeds maximum %d for GPU %s",
517 |                     m.instanceCount, maxInstances, index)
518 |                 createCount = maxInstances
519 |             } else {
520 |                 createCount = m.instanceCount
521 |             }
522 |         }
523 | 
524 |         if createCount == 0 {
525 |             klog.Errorf("Cannot determine instance count for GPU %s", index)
526 |             continue
527 |         }
528 | 
529 |         klog.Infof("Creating %d MIG device(s) with profile %s on GPU %s", createCount, m.profile, index)
530 | 
531 |         profileID, err := getProfileID(m.profile)
532 |         if err != nil {
533 |             klog.Errorf("Failed to get profile ID: %v", err)
534 |             continue
535 |         }
536 | 
537 |         // === fix: create all instances with a single nvidia-smi invocation ===
538 |         // Build a comma-separated ID list (e.g. "9,9" for 2 instances)
539 |         ids := make([]string, createCount)
540 |         for i := 0; i < createCount; i++ {
541 |             ids[i] = strconv.Itoa(profileID)
542 |         }
543 |         profileArg := strings.Join(ids, ",")
544 | 
545 |         // Create the GPU instances (and their compute instances via -C) in one call, scoped to this GPU
546 |         _, err = runNvidiaSmiCommand("mig", "-i", index, "-cgi", profileArg, "-C")
547 |         if err != nil {
548 |             klog.Errorf("Failed to create %d MIG devices on GPU %s: %v", createCount, index, err)
549 |         } else {
550 |             klog.Infof("Successfully created %d MIG devices on GPU %s", createCount, index)
551 |         }
552 |         // === end fix ===
553 |     }
554 | 
555 |     return nil
556 | }
557 | 
558 | func getProfileID(profileName string) (int, error) {
559 |     out, err := runNvidiaSmiCommand("mig", "-lgip")
560 |     if err != nil {
561 |         return 0, err
562 |     }
563 | 
564 |     // Regex matching profile rows such as "|   0  MIG 1g.5gb        19 ... |"
565 |     re := regexp.MustCompile(`\|\s+\d+\s+MIG\s+(\S+)\s+(\d+)`)
566 | 
567 |     lines := strings.Split(strings.TrimSpace(string(out)), "\n")
568 |     for _, line := range lines {
569 |         klog.V(4).Infof("nvidia-smi mig -lgip line: %s", line)
570 |         // Skip non-profile rows (table borders, headers, etc.)
571 |         if !strings.Contains(line, "MIG") || !strings.Contains(line, "|") {
572 |             continue
573 |         }
574 | 
575 |         matches := re.FindStringSubmatch(line)
576 |         if len(matches) > 2 {
577 |             name := matches[1]
578 |             idStr := matches[2]
579 | 
580 |             if name == profileName {
581 |                 profileID, err := strconv.Atoi(idStr)
582 |                 if err != nil {
583 |                     klog.Warningf("Invalid profile ID format: %s", idStr)
584 |                     continue
585 |                 }
586 |                 klog.Infof("Found profile %s with ID %d", profileName, profileID)
587 |                 return profileID, nil
588 |             }
589 |         }
590 |     }
591 |     return 0, fmt.Errorf("profile not found: %s", profileName)
592 | }
593 | 
594 | // getMIGDeviceCount returns the number of GPU instances currently configured on the given GPU.
595 | func (m *MIGManager) getMIGDeviceCount(gpuIndex string) (int, error) {
596 |     out, err := runNvidiaSmiCommand("mig", "-lgi", "-i", gpuIndex)
597 |     output := string(out)
598 | 
599 |     // Handle the no-MIG-devices case
600 |     if strings.Contains(output, "No GPU instances found") ||
601 |         strings.Contains(output, "Not Found") ||
602 |         strings.Contains(output, "No devices were found") {
603 |         klog.Infof("No MIG instances found on GPU %s", gpuIndex)
604 |         return 0, nil
605 |     }
606 | 
607 |     if err != nil {
608 |         // The command can fail simply because no instances exist; treat that as zero
609 |         if strings.Contains(err.Error(), "exit status 255") &&
610 |             (strings.Contains(output, "No GPU instances found") ||
611 |                 strings.Contains(output, "Not Found")) {
612 |             klog.Infof("No MIG devices on GPU %s (ignoring error)", gpuIndex)
613 |             return 0, nil
614 |         }
615 |         return 0, fmt.Errorf("nvidia-smi MIG query failed: %v, output: %s", err, output)
616 |     }
617 | 
618 |     // Count the GPU-instance rows in the table output. Data rows look like
619 |     // "|   0  MIG 3g.20gb     9    1    4:4   |"; matching them explicitly
620 |     // keeps table borders and header lines from being counted as devices.
621 |     dataRow := regexp.MustCompile(`\|\s+\d+\s+MIG\s+\S+`)
622 |     count := 0
623 |     for _, line := range strings.Split(output, "\n") {
624 |         if dataRow.MatchString(line) {
625 |             count++
626 |         }
627 |     }
628 | 
629 |     return count, nil
630 | }
631 | 
--------------------------------------------------------------------------------
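Note: the profile-ID lookup and the single `-cgi` call above both depend on the tabular output of `nvidia-smi mig -lgip`. The standalone Go sketch below is not part of the repository, and its sample table is illustrative only; it shows how the same regex used by `getProfileID` maps a profile name such as `1g.5gb` to the numeric ID that `nvidia-smi mig -cgi` expects.

```go
// Standalone sketch: map an `nvidia-smi mig -lgip` table row to the numeric
// profile ID passed to `nvidia-smi mig -cgi`. The sample table is illustrative,
// not captured from a real device.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

const sampleLGIP = `
+-----------------------------------------------------------------------------+
| GPU instance profiles:                                                      |
| GPU   Name             ID    Instances   Memory     P2P    SM    DEC   ENC  |
|                              Free/Total   GiB              CE    JPEG  OFA  |
|=============================================================================|
|   0  MIG 1g.5gb        19     7/7        4.75       No     14     0     0   |
|   0  MIG 2g.10gb       14     3/3        9.75       No     28     1     0   |
|   0  MIG 3g.20gb        9     2/2        19.62      No     42     2     0   |
+-----------------------------------------------------------------------------+
`

// profileRow is the same pattern getProfileID uses above.
var profileRow = regexp.MustCompile(`\|\s+\d+\s+MIG\s+(\S+)\s+(\d+)`)

// lookupProfileID returns the ID column for the named profile, if present.
func lookupProfileID(table, profileName string) (string, bool) {
	for _, line := range strings.Split(table, "\n") {
		if m := profileRow.FindStringSubmatch(line); len(m) > 2 && m[1] == profileName {
			return m[2], true
		}
	}
	return "", false
}

func main() {
	if id, ok := lookupProfileID(sampleLGIP, "1g.5gb"); ok {
		// Two instances of this profile would then be requested in one call:
		// nvidia-smi mig -i 0 -cgi <id>,<id> -C
		fmt.Printf("profile 1g.5gb -> ID %s\n", id)
	}
}
```

Running the sketch prints `profile 1g.5gb -> ID 19`, matching the ID column of the sample table.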