├── test
│   ├── deployment
│   │   ├── nvidia-container-runtime.yaml
│   │   ├── nvidia-tmp-pod.yaml
│   │   ├── nvidia-test-pod.yaml
│   │   └── qwen-mini.yaml
│   └── DEMO.md
├── readme
│   ├── DEPLOY.md
│   └── LOCAL.BUILD.md
├── Makefile
├── monitor
│   ├── monitor.sh
│   └── press_test.py
├── pkg
│   ├── device
│   │   ├── simulator.go
│   │   ├── device.go
│   │   ├── huawei.go
│   │   └── nvidia.go
│   ├── allocator
│   │   └── allocator.go
│   └── deviceplugin
│       └── server.go
├── Dockerfile
├── go.mod
├── cmd
│   └── main.go
├── README.md
└── manifests
    └── daemonset.yaml

/test/deployment/nvidia-container-runtime.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: node.k8s.io/v1
2 | kind: RuntimeClass
3 | metadata:
4 |   name: nvidia
5 | handler: nvidia  # points to nvidia-container-runtime
--------------------------------------------------------------------------------
/readme/DEPLOY.md:
--------------------------------------------------------------------------------
1 | 
2 | ### apply
3 | ```shell
4 | kubectl apply -f manifests/daemonset.yaml
5 | 
6 | ```
7 | 
8 | 
9 | ### delete the pod (the DaemonSet recreates it) & view logs
10 | 
11 | ```shell
12 | kubectl delete pod -l app=micro-device-plugin -n kube-system
13 | 
14 | kubectl logs -l app=micro-device-plugin -n kube-system --tail=-1
15 | 
16 | ```
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BINARY := micro-device-plugin
2 | IMAGE := your-registry/micro-device-plugin
3 | VERSION := v1.0.0
4 | 
5 | .PHONY: build
6 | build:
7 | 	go build -o bin/$(BINARY) ./cmd
8 | 
9 | .PHONY: docker-build
10 | docker-build:
11 | 	docker build -t $(IMAGE):$(VERSION) .
12 | 
13 | .PHONY: push
14 | push:
15 | 	docker push $(IMAGE):$(VERSION)
16 | 
17 | .PHONY: clean
18 | clean:
19 | 	rm -rf bin
--------------------------------------------------------------------------------
/monitor/monitor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | while true; do
4 |     clear
5 |     echo "===== $(date) ====="
6 |     echo "MIG device list:"
7 |     nvidia-smi mig -lgi
8 | 
9 |     echo -e "\nDevice 1 status:"
10 |     nvidia-smi -i MIG-GPU-0-1 -q | grep -A 5 "Utilization" | grep -v "Gpu"
11 | 
12 |     echo -e "\nDevice 2 status:"
13 |     nvidia-smi -i MIG-GPU-0-2 -q | grep -A 5 "Utilization" | grep -v "Gpu"
14 | 
15 |     sleep 1
16 | done
--------------------------------------------------------------------------------
/test/deployment/nvidia-tmp-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 |   name: nvidia-tmp-pod
5 | spec:
6 | 
7 |   runtimeClassName: nvidia   # use the NVIDIA runtime
8 |   restartPolicy: Never       # run once, then exit
9 |   containers:
10 |     - name: test-container
11 |       image: nvcr.io/nvidia/pytorch:24.05-py3
12 |       command: ["nvidia-smi"]
13 |       resources:
14 |         limits:
15 |           nvidia.com/microgpu: 1   # request 1 GPU
--------------------------------------------------------------------------------
/test/deployment/nvidia-test-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 |   name: nvidia-test-pod
5 | spec:
6 |   runtimeClassName: nvidia
7 |   restartPolicy: Never
8 |   containers:
9 |     - name: test-container
10 |       image: nvcr.io/nvidia/pytorch:24.05-py3
11 |       imagePullPolicy: IfNotPresent
12 |       # key point: run an infinite loop so the pod stays alive
13 |       command: ["/bin/sh", "-c"]
14 |       args: ["while true; do sleep 3600; done"]   # endless loop that wakes once per hour
15 |       resources:
16 |         limits:
17 |           nvidia.com/microgpu: 1 
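18 | # A minimal verification sketch (comments only, nothing below is applied by this
19 | # manifest): once the pod is Running, the plugin's Allocate call should have set
20 | # NVIDIA_VISIBLE_DEVICES to the granted device ID, which can be checked with e.g.
21 | #   kubectl exec nvidia-test-pod -- env | grep NVIDIA_VISIBLE_DEVICES
22 | #   kubectl exec nvidia-test-pod -- nvidia-smi -L
23 | # The UUID reported depends on the node's driver and MIG configuration.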
-------------------------------------------------------------------------------- /readme/LOCAL.BUILD.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### build 4 | ```shell 5 | 6 | 7 | git pull 8 | 9 | docker build --build-arg HTTP_PROXY=http://10.0.168.12:7890 --build-arg HTTPS_PROXY=http://10.0.168.12:7890 -t binyue/micro-device-plugin:v1.0.13 . 10 | 11 | 12 | 13 | ``` 14 | ### transfer to containerd 15 | 16 | ```shell 17 | 18 | docker save binyue/micro-device-plugin:v1.0.13 -o micro-device-plugin.tar 19 | 20 | sudo ctr -n k8s.io images import micro-device-plugin.tar 21 | 22 | sudo ctr -n k8s.io images ls |grep micro-device-plugin 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /pkg/device/simulator.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type SimulatorManager struct { 8 | lastDiscovery time.Time 9 | devices []GPUDevice 10 | } 11 | 12 | func (m *SimulatorManager) DiscoverGPUs() ([]GPUDevice, error) { 13 | return []GPUDevice{ 14 | &SimulatorDevice{id: "0", healthy: true}, 15 | &SimulatorDevice{id: "1", healthy: true}, 16 | &SimulatorDevice{id: "2", healthy: true}, 17 | }, nil 18 | } 19 | 20 | func (m *SimulatorManager) CheckHealth(deviceID string) bool { 21 | // 模拟 10% 的失败率 22 | return time.Now().UnixNano()%10 != 0 23 | } 24 | -------------------------------------------------------------------------------- /test/DEMO.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### deploy and see nvidia-smi 4 | ```shell 5 | kubectl apply -f deployment/nvidia-test-pod.yaml 6 | kubectl describe pod nvidia-test-pod 7 | kubectl delete pod nvidia-test-pod 8 | kubectl logs nvidia-test-pod --tail=-1 9 | kubectl exec -it nvidia-test-pod -- sh 10 | ``` 11 | 12 | ### cuda test 13 | ```shell 14 | python -c "import torch; print(torch.__version__, torch.cuda.is_available())" 15 | 16 | ``` 17 | 18 | 19 | ### mutil allocate test 20 | 21 | ```shell 22 | kubectl apply -f deployment/nvidia-tmp-pod.yaml 23 | kubectl describe pod nvidia-tmp-pod 24 | kubectl delete pod nvidia-tmp-pod 25 | kubectl logs nvidia-tmp-pod --tail=-1 26 | ``` -------------------------------------------------------------------------------- /pkg/device/device.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | // GPUDevice 表示GPU设备的接口 4 | type GPUDevice interface { 5 | ID() string 6 | IsHealthy() bool 7 | GetVendor() string 8 | GetPath() string 9 | IsMIG() bool // 新增:是否为MIG设备 10 | PhysicalID() string // 新增:物理GPU ID 11 | } 12 | 13 | // DeviceManager 设备管理器接口 14 | type DeviceManager interface { 15 | DiscoverGPUs() ([]GPUDevice, error) 16 | CheckHealth(deviceID string) bool 17 | } 18 | 19 | type SimulatorDevice struct { 20 | id string 21 | healthy bool 22 | } 23 | 24 | func (d *SimulatorDevice) IsMIG() bool { 25 | return false 26 | } 27 | 28 | func (d *SimulatorDevice) PhysicalID() string { 29 | return d.id 30 | } 31 | 32 | func (d *SimulatorDevice) ID() string { return d.id } 33 | func (d *SimulatorDevice) IsHealthy() bool { return d.healthy } 34 | func (d *SimulatorDevice) GetVendor() string { return "simulator" } 35 | func (d *SimulatorDevice) GetPath() string { return "/dev/sim_gpu" + d.id } 36 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | # 第一阶段:使用官方golang镜像构建 2 | FROM golang:1.24 AS builder 3 | 4 | # 设置工作目录 5 | WORKDIR /workspace 6 | COPY . . 7 | 8 | # 设置代理(通过构建参数) 9 | ARG HTTP_PROXY 10 | ARG HTTPS_PROXY 11 | ENV http_proxy=${HTTP_PROXY} 12 | ENV https_proxy=${HTTPS_PROXY} 13 | 14 | 15 | # 静态编译,确保零依赖 16 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ 17 | -a -installsuffix cgo \ 18 | -ldflags="-w -s" \ 19 | -o /usr/bin/micro-device-plugin ./cmd 20 | 21 | # 第二阶段:使用Ubuntu基础镜像 22 | FROM ubuntu:22.04 23 | 24 | 25 | # 安装必要的运行依赖 26 | RUN apt-get update && apt-get install -y --no-install-recommends \ 27 | ca-certificates \ 28 | curl \ 29 | kmod \ 30 | libunwind8 \ 31 | && rm -rf /var/lib/apt/lists/* 32 | 33 | # 设置环境变量(修复警告问题) 34 | ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu 35 | 36 | # 从构建阶段复制静态编译的二进制文件 37 | COPY --from=builder /usr/bin/micro-device-plugin /usr/bin/ 38 | 39 | # 健康检查 40 | HEALTHCHECK --interval=30s --timeout=10s \ 41 | CMD curl -f http://localhost:8080/health || exit 1 42 | 43 | # 容器入口点 44 | ENTRYPOINT ["/usr/bin/micro-device-plugin"] 45 | -------------------------------------------------------------------------------- /pkg/device/huawei.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | type HuaweiDevice struct { 11 | id string 12 | healthy bool 13 | } 14 | 15 | func (d *HuaweiDevice) IsMIG() bool { 16 | return false 17 | } 18 | 19 | func (d *HuaweiDevice) PhysicalID() string { 20 | return d.id 21 | } 22 | 23 | func (d *HuaweiDevice) ID() string { return d.id } 24 | func (d *HuaweiDevice) IsHealthy() bool { return d.healthy } 25 | func (d *HuaweiDevice) GetVendor() string { return "huawei" } 26 | func (d *HuaweiDevice) GetPath() string { return "/dev/davinci" + d.id } 27 | 28 | type HuaweiManager struct { 29 | lastDiscovery time.Time 30 | devices []GPUDevice 31 | discoverySync sync.Mutex 32 | } 33 | 34 | func (m *HuaweiManager) DiscoverGPUs() ([]GPUDevice, error) { 35 | m.discoverySync.Lock() 36 | defer m.discoverySync.Unlock() 37 | 38 | // 如果最近已经发现过设备,则使用缓存 39 | if time.Since(m.lastDiscovery) < 5*time.Minute && m.devices != nil { 40 | klog.V(4).Infof("Using cached Huawei devices (last discovery: %s)", m.lastDiscovery) 41 | return m.devices, nil 42 | } 43 | 44 | klog.Info("Discovering Huawei devices") 45 | 46 | // 实际生产环境中应使用华为NPU SDK调用 47 | // 这里为模拟实现 48 | devices := []GPUDevice{ 49 | &HuaweiDevice{id: "0", healthy: true}, 50 | &HuaweiDevice{id: "1", healthy: true}, 51 | } 52 | 53 | klog.Infof("Discovered %d Huawei devices", len(devices)) 54 | for _, d := range devices { 55 | klog.Infof("Huawei Device: ID=%s, Healthy=%v", d.ID(), d.IsHealthy()) 56 | } 57 | 58 | m.devices = devices 59 | m.lastDiscovery = time.Now() 60 | return devices, nil 61 | } 62 | 63 | func (m *HuaweiManager) CheckHealth(deviceID string) bool { 64 | // 实际生产环境中应使用华为NPU SDK的健康检查 65 | // 这里总是返回true作为模拟 66 | healthy := true 67 | klog.V(5).Infof("Checking health of Huawei device %s: %v", deviceID, healthy) 68 | return healthy 69 | } 70 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/benyuereal/micro-device-plugin 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | google.golang.org/grpc v1.68.1 7 | k8s.io/api v0.33.4 8 | k8s.io/apimachinery v0.33.4 9 | k8s.io/client-go v0.33.4 10 | 
k8s.io/klog/v2 v2.130.1 11 | k8s.io/kubelet v0.33.4 12 | ) 13 | 14 | require ( 15 | github.com/davecgh/go-spew v1.1.1 // indirect 16 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 17 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 18 | github.com/go-logr/logr v1.4.2 // indirect 19 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 20 | github.com/go-openapi/jsonreference v0.20.2 // indirect 21 | github.com/go-openapi/swag v0.23.0 // indirect 22 | github.com/gogo/protobuf v1.3.2 // indirect 23 | github.com/golang/protobuf v1.5.4 // indirect 24 | github.com/google/gnostic-models v0.6.9 // indirect 25 | github.com/google/go-cmp v0.7.0 // indirect 26 | github.com/google/uuid v1.6.0 // indirect 27 | github.com/josharian/intern v1.0.0 // indirect 28 | github.com/json-iterator/go v1.1.12 // indirect 29 | github.com/mailru/easyjson v0.7.7 // indirect 30 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 31 | github.com/modern-go/reflect2 v1.0.2 // indirect 32 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 33 | github.com/pkg/errors v0.9.1 // indirect 34 | github.com/x448/float16 v0.8.4 // indirect 35 | golang.org/x/net v0.38.0 // indirect 36 | golang.org/x/oauth2 v0.27.0 // indirect 37 | golang.org/x/sys v0.31.0 // indirect 38 | golang.org/x/term v0.30.0 // indirect 39 | golang.org/x/text v0.23.0 // indirect 40 | golang.org/x/time v0.9.0 // indirect 41 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 42 | google.golang.org/protobuf v1.36.5 // indirect 43 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 44 | gopkg.in/inf.v0 v0.9.1 // indirect 45 | gopkg.in/yaml.v3 v3.0.1 // indirect 46 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 47 | k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect 48 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 49 | sigs.k8s.io/randfill v1.0.0 // indirect 50 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 51 | sigs.k8s.io/yaml v1.4.0 // indirect 52 | ) 53 | -------------------------------------------------------------------------------- /monitor/press_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import threading 3 | import time 4 | import random 5 | import json # 添加 json 模块用于格式化输出 6 | 7 | # 目标URL 8 | url = "http://10.0.168.12:30080/generate" 9 | 10 | # 请求头 11 | headers = { 12 | "Content-Type": "application/json" 13 | } 14 | 15 | # 多个不同的提示词,增加多样性 16 | prompts = [ 17 | "请用简单的话解释量子计算", 18 | "详细描述深度学习的原理和应用", 19 | "解释Transformer模型在自然语言处理中的作用", 20 | "讲述人工智能的发展历史", 21 | "比较机器学习和深度学习的区别", 22 | "解释神经网络的基本原理", 23 | "描述计算机视觉的最新进展", 24 | "讲解自然语言处理中的注意力机制", 25 | "什么是强化学习?它有哪些应用?", 26 | "解释生成式对抗网络(GAN)的工作原理" 27 | ] 28 | 29 | # 发送请求的函数 30 | def send_request(thread_id): 31 | request_count = 0 32 | while True: 33 | try: 34 | # 随机选择一个提示词 35 | prompt = random.choice(prompts) 36 | 37 | data = { 38 | "inputs": f"<|im_start|>system\n你是一个AI助手<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n", 39 | "parameters": { 40 | "max_new_tokens": 512, # 增加生成的token数量 41 | "temperature": 0.7, 42 | "top_p": 0.9, 43 | "do_sample": True, 44 | "repetition_penalty": 1.1 45 | } 46 | } 47 | 48 | response = requests.post(url, json=data, headers=headers, timeout=30) 49 | request_count += 1 50 | 51 | if response.status_code == 200: 52 | # 获取并打印响应内容 53 | response_data = response.json() 54 | generated_text = response_data.get("generated_text", "") 55 | 56 | # 
截取部分文本,避免输出过长 57 | preview_text = generated_text[:100] + "..." if len(generated_text) > 100 else generated_text 58 | 59 | print(f"线程{thread_id} - 请求#{request_count}成功") 60 | print(f"生成文本: {preview_text}") 61 | print(f"详细响应: {json.dumps(response_data, indent=2, ensure_ascii=False)[:200]}...") # 打印部分JSON响应 62 | print("-" * 80) # 分隔线 63 | else: 64 | print(f"线程{thread_id} - 请求#{request_count}失败,状态码: {response.status_code}") 65 | print(f"错误响应: {response.text[:200]}") 66 | print("-" * 80) 67 | 68 | except Exception as e: 69 | print(f"线程{thread_id} - 请求异常: {e}") 70 | print("-" * 80) 71 | 72 | # 稍微延迟,避免过度请求 73 | time.sleep(0.1) 74 | 75 | # 创建多个线程并发请求 76 | threads = [] 77 | for i in range(20): # 创建20个并发线程 78 | t = threading.Thread(target=send_request, args=(i,)) 79 | t.daemon = True 80 | threads.append(t) 81 | t.start() 82 | 83 | # 保持脚本运行 84 | try: 85 | while True: 86 | time.sleep(1) 87 | except KeyboardInterrupt: 88 | print("停止压力测试") -------------------------------------------------------------------------------- /test/deployment/qwen-mini.yaml: -------------------------------------------------------------------------------- 1 | # 修改后的 qwen-statefulset-local-direct.yaml 2 | apiVersion: apps/v1 3 | kind: StatefulSet 4 | metadata: 5 | name: qwen-mini 6 | spec: 7 | serviceName: "qwen-service" 8 | replicas: 2 # 增加副本数 9 | podManagementPolicy: OrderedReady 10 | selector: 11 | matchLabels: 12 | app: qwen 13 | template: 14 | metadata: 15 | labels: 16 | app: qwen 17 | spec: 18 | # 新增拓扑分布约束 - 核心修改点 19 | topologySpreadConstraints: 20 | - maxSkew: 2 # 节点间最大Pod数差异 21 | topologyKey: kubernetes.io/hostname # 节点级别的拓扑域 22 | whenUnsatisfiable: DoNotSchedule # 不满足条件时不调度 23 | labelSelector: 24 | matchLabels: 25 | app: qwen 26 | 27 | runtimeClassName: nvidia 28 | containers: 29 | - name: text-generation 30 | image: ghcr.io/huggingface/text-generation-inference:1.4.1 31 | command: ["text-generation-launcher"] 32 | args: 33 | - "--model-id" 34 | - "/model" 35 | - "--num-shard" 36 | - "1" 37 | - "--port" 38 | - "8000" 39 | - "--quantize" 40 | - "bitsandbytes" 41 | env: 42 | - name: TRANSFORMERS_OFFLINE 43 | value: "1" 44 | - name: HF_HUB_OFFLINE 45 | value: "1" 46 | - name: NVIDIA_DISABLE_REQUIRE 47 | value: "1" 48 | # 添加就绪探针,确保Pod完全启动后才被认为是就绪 49 | readinessProbe: 50 | httpGet: 51 | path: /health 52 | port: 8000 53 | initialDelaySeconds: 30 54 | periodSeconds: 5 55 | failureThreshold: 10 56 | timeoutSeconds: 5 57 | # 添加存活探针,确保应用健康 58 | livenessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8000 62 | initialDelaySeconds: 60 63 | periodSeconds: 10 64 | failureThreshold: 3 65 | timeoutSeconds: 5 66 | resources: 67 | limits: 68 | nvidia.com/microgpu: 1 69 | ports: 70 | - containerPort: 8000 71 | volumeMounts: 72 | - name: model-storage 73 | mountPath: /model 74 | volumes: 75 | - name: model-storage 76 | hostPath: 77 | path: /home/Qwen1.5-0.5B-Chat 78 | type: Directory 79 | # StatefulSet更新策略 80 | updateStrategy: 81 | type: RollingUpdate 82 | rollingUpdate: 83 | partition: 0 # 确保有序更新 84 | 85 | --- 86 | apiVersion: v1 87 | kind: Service 88 | metadata: 89 | name: qwen-service 90 | spec: 91 | type: NodePort 92 | selector: 93 | app: qwen 94 | ports: 95 | - protocol: TCP 96 | port: 8000 97 | targetPort: 8000 98 | nodePort: 30080 -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "os" 7 | "os/signal" 8 | "sync" 9 | "syscall" 10 | "time" 11 
| 12 | "github.com/benyuereal/micro-device-plugin/pkg/device" 13 | "github.com/benyuereal/micro-device-plugin/pkg/deviceplugin" 14 | "k8s.io/klog/v2" 15 | ) 16 | 17 | func main() { 18 | klog.InitFlags(nil) 19 | defer klog.Flush() 20 | 21 | // 获取环境变量设置 22 | simulate := os.Getenv("SIMULATE") 23 | cdiEnabled := os.Getenv("CDI_ENABLED") == "true" 24 | cdiPrefix := os.Getenv("CDI_PREFIX") 25 | nodeName := os.Getenv("NODE_NAME") 26 | if cdiPrefix == "" { 27 | cdiPrefix = "micro.device" // 默认值 28 | } 29 | klog.Infof("Running in simulation mode: %s", simulate) 30 | 31 | // 初始化设备管理器 32 | var managers []struct { 33 | vendor string 34 | manager device.DeviceManager 35 | } 36 | 37 | // 添加模拟管理器 38 | if simulate != "" { 39 | managers = append(managers, struct { 40 | vendor string 41 | manager device.DeviceManager 42 | }{ 43 | vendor: "simulator", 44 | manager: &device.NVIDIAManager{}, 45 | }) 46 | } else { 47 | // 真实环境下的设备管理器 48 | managers = append(managers, struct { 49 | vendor string 50 | manager device.DeviceManager 51 | }{"nvidia", device.NewNVIDIAManager()}) 52 | managers = append(managers, struct { 53 | vendor string 54 | manager device.DeviceManager 55 | }{"huawei", &device.HuaweiManager{}}) 56 | } 57 | 58 | var servers []*deviceplugin.DevicePluginServer 59 | var wg sync.WaitGroup 60 | var serverMutex sync.Mutex 61 | 62 | ctx, cancel := context.WithCancel(context.Background()) 63 | 64 | // 为每个供应商启动插件 65 | for _, m := range managers { 66 | wg.Add(1) 67 | go func(vendor string, manager device.DeviceManager) { 68 | defer wg.Done() 69 | 70 | srv := deviceplugin.New(vendor, manager, cdiEnabled, cdiPrefix, nodeName) 71 | if err := srv.Start(ctx); err != nil { 72 | klog.Errorf("Failed to start %s device plugin: %v", vendor, err) 73 | return 74 | } 75 | 76 | serverMutex.Lock() 77 | servers = append(servers, srv) 78 | serverMutex.Unlock() 79 | 80 | // 后台运行健康检查 81 | go srv.HealthCheck(ctx, 30*time.Second) 82 | }(m.vendor, m.manager) 83 | } 84 | 85 | // 健康检查路由 86 | http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { 87 | w.WriteHeader(http.StatusOK) 88 | }) 89 | go func() { 90 | if err := http.ListenAndServe(":8080", nil); err != nil { 91 | klog.Fatalf("Health check server failed: %v", err) 92 | } 93 | }() 94 | klog.Info("Health check server started on :8080") 95 | 96 | // 等待终止信号 97 | signalChan := make(chan os.Signal, 1) 98 | signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) 99 | <-signalChan 100 | klog.Info("Received termination signal, shutting down...") 101 | 102 | // 关闭所有插件 103 | cancel() 104 | for _, srv := range servers { 105 | srv.Stop() 106 | } 107 | 108 | klog.Info("All device plugins stopped. Exiting.") 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Micro GPU Device Plugin for Kubernetes 2 | 3 | ## 概述 4 | 这是一个支持多GPU资源限制的Kubernetes设备插件,特别优化了对NVIDIA MIG设备的支持。它能够在Kubernetes集群中自动发现、管理和分配GPU资源,包括完整的GPU设备和MIG分区。 5 | 6 | ## 核心特性 7 | - ✅ 完整的GPU设备发现与管理 8 | - ✅ NVIDIA MIG设备支持(自动分区与配置) 9 | - ✅ 设备健康检查与监控 10 | - ⛔️ CDI(Container Device Interface)支持 11 | - ✅ 资源回收与自动清理机制 12 | 13 | ## 前提条件 14 | ### 1. 
Kubernetes 集群 15 | - Kubernetes 1.20+ 版本 16 | - kubectl 配置完成 17 | 18 | ### 2 Containerd 配置 19 | 在 `/etc/containerd/config.toml` 中添加: 20 | 21 | ```toml 22 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] 23 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] 24 | runtime_type = "io.containerd.runc.v2" 25 | privileged_without_host_devices = false 26 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] 27 | BinaryName = "/usr/bin/nvidia-container-runtime" 28 | ``` 29 | ### 3 runclass配置 30 | ```yaml 31 | apiVersion: node.k8s.io/v1 32 | kind: RuntimeClass 33 | metadata: 34 | name: nvidia 35 | handler: nvidia # 指向 nvidia-container-runtime 36 | ``` 37 | 38 | ### 4. **GPU MIG 设置**: 39 | ```yaml 40 | ### 启用mig 41 | sudo nvidia-smi -mig 1 42 | ### 创建 MIG 设备 (例如 3g.20gb 配置) 43 | sudo nvidia-smi mig -cgi 9 -C 44 | ``` 45 | 46 | ## 🚀 快速开始 47 | 48 | ### 部署设备插件 49 | ```yaml 50 | kubectl apply -f manifests/daemonset.yaml 51 | ``` 52 | 53 | ### 验证部署 54 | ```shell 55 | kubectl get pods -n kube-system -l app=micro-device-plugin 56 | 57 | kubectl logs -n kube-system -l app=micro-device-plugin --tail=50 58 | ``` 59 | 60 | ### 测试示例 61 | ```yaml 62 | apiVersion: v1 63 | kind: Pod 64 | metadata: 65 | name: nvidia-test-pod 66 | spec: 67 | runtimeClassName: nvidia 68 | restartPolicy: Never 69 | containers: 70 | - name: test-container 71 | image: nvcr.io/nvidia/pytorch:24.05-py3 72 | imagePullPolicy: IfNotPresent 73 | # 关键修改:启动无限循环命令 74 | command: ["/bin/sh", "-c"] 75 | args: ["while true; do sleep 3600; done"] # 每小时唤醒一次的永久循环 76 | resources: 77 | limits: 78 | nvidia.com/microgpu: 1 79 | ``` 80 | 81 | ### 部署测试应用: 82 | 83 | ```shell 84 | 85 | kubectl apply -f deployment/nvidia-test-pod.yaml 86 | kubectl describe pod nvidia-test-pod 87 | kubectl logs nvidia-test-pod --tail=-1 88 | kubectl exec -it nvidia-test-pod -- sh 89 | 90 | ``` 91 | 92 | ## 📊 功能特性 93 | - 支持 NVIDIA GPU 和 MIG 设备管理 94 | - 自动健康检查和设备回收 95 | - CDI 设备注入支持 96 | - 拓扑感知调度优化 97 | - 多实例 GPU 资源切分 98 | 99 | ## 🛠 构建与部署 100 | 101 | 102 | ```shell 103 | docker build -t your-registry/micro-device-plugin:v1.0.0 . 
104 | docker push your-registry/micro-device-plugin:v1.0.0 105 | ``` 106 | 107 | ## 部署到kubernetes 108 | 109 | ```shell 110 | kubectl apply -f manifests/daemonset.yaml 111 | ``` 112 | 113 | ## 🔧 配置选项 114 | | 环境变量 | 默认值 | 描述 | 115 | |---------|--------|------| 116 | | `ENABLE_MIG` | `false` | 启用 MIG 管理 | 117 | | `MIG_PROFILE` | `3g.20gb` | MIG 切分配置 | 118 | | `MIG_INSTANCE_COUNT` | `0` | MIG 实例数量 (0=自动计算) | 119 | | `SKIP_CONFIGURED` | `true` | 跳过已配置的 MIG 设备 | 120 | | `CDI_ENABLED` | `false` | 启用 CDI 设备注入 | 121 | | `CDI_PREFIX` | `micro.device` | CDI 设备前缀 | -------------------------------------------------------------------------------- /pkg/allocator/allocator.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | import ( 4 | "errors" 5 | "sync" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | // Allocator 设备资源分配器接口 11 | type Allocator interface { 12 | Allocate(ids []string, podUID string) error // 增加podUID参数 13 | Deallocate(ids []string) 14 | GetAllocatedDevices() []string 15 | CleanupOrphanedDevices(map[string]bool) 16 | GetPodUID(deviceID string) string // 修改为 string 参数 17 | GetAllocationMap() map[string]string 18 | IsAvailable(id string) bool // 新增方法 19 | } 20 | 21 | // SimpleAllocator 简单的内存分配器实现 22 | type SimpleAllocator struct { 23 | mu sync.RWMutex 24 | allocated map[string]bool // 已分配设备ID 25 | deviceToPod map[string]string // 新增:设备到 Pod 的映射 26 | } 27 | 28 | func NewSimpleAllocator() *SimpleAllocator { 29 | return &SimpleAllocator{ 30 | allocated: make(map[string]bool), 31 | deviceToPod: make(map[string]string), 32 | } 33 | } 34 | 35 | // Allocate 分配设备资源 36 | func (a *SimpleAllocator) Allocate(ids []string, podUID string) error { 37 | a.mu.Lock() 38 | defer a.mu.Unlock() 39 | 40 | // 首先检查所有设备是否可用 41 | for _, id := range ids { 42 | if _, exists := a.allocated[id]; exists { 43 | return ErrDeviceAlreadyAllocated 44 | } 45 | } 46 | 47 | // 然后分配设备 48 | for _, id := range ids { 49 | a.allocated[id] = true 50 | klog.Infof("Device allocated: %s", id) 51 | } 52 | 53 | for _, id := range ids { 54 | a.allocated[id] = true 55 | a.deviceToPod[id] = podUID // 记录设备到 Pod 的映射 56 | klog.Infof("Device allocated: %s to pod %s", id, podUID) 57 | } 58 | 59 | return nil 60 | } 61 | 62 | // 新增方法:获取设备对应的 Pod UID 63 | func (a *SimpleAllocator) GetPodUID(deviceID string) string { 64 | a.mu.RLock() 65 | defer a.mu.RUnlock() 66 | return a.deviceToPod[deviceID] 67 | } 68 | 69 | // Deallocate 释放设备资源 70 | func (a *SimpleAllocator) Deallocate(ids []string) { 71 | a.mu.Lock() 72 | defer a.mu.Unlock() 73 | 74 | for _, id := range ids { 75 | if _, exists := a.allocated[id]; exists { 76 | delete(a.allocated, id) 77 | delete(a.deviceToPod, id) // 清理映射关系 78 | klog.Infof("Device deallocated: %s", id) 79 | } 80 | } 81 | } 82 | 83 | // GetAllocatedDevices 获取所有已分配设备 84 | func (a *SimpleAllocator) GetAllocatedDevices() []string { 85 | a.mu.RLock() 86 | defer a.mu.RUnlock() 87 | 88 | devices := make([]string, 0, len(a.allocated)) 89 | for id := range a.allocated { 90 | devices = append(devices, id) 91 | } 92 | return devices 93 | } 94 | func (a *SimpleAllocator) CleanupOrphanedDevices(discoveredIDs map[string]bool) { 95 | a.mu.Lock() 96 | defer a.mu.Unlock() 97 | 98 | for id := range a.allocated { 99 | if !discoveredIDs[id] { 100 | delete(a.allocated, id) 101 | klog.Warningf("Cleaned orphaned device: %s", id) 102 | } 103 | } 104 | } 105 | 106 | // GetAllocationMap 返回设备分配状态的副本 107 | func (a *SimpleAllocator) GetAllocationMap() map[string]string { 108 | a.mu.RLock() 109 | defer 
a.mu.RUnlock() 110 | 111 | // 返回深拷贝防止并发修改 112 | result := make(map[string]string) 113 | for k, v := range a.deviceToPod { 114 | result[k] = v 115 | } 116 | return result 117 | } 118 | 119 | // IsAvailable 检查设备是否可用(未被分配) 120 | func (a *SimpleAllocator) IsAvailable(deviceID string) bool { 121 | a.mu.RLock() 122 | defer a.mu.RUnlock() 123 | _, exists := a.allocated[deviceID] 124 | return !exists // 如果存在表示已分配,不可用 125 | } 126 | 127 | // 错误定义 128 | var ( 129 | ErrDeviceAlreadyAllocated = errors.New("device already allocated") 130 | ) 131 | -------------------------------------------------------------------------------- /manifests/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: micro-device-plugin 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods"] 8 | verbs: ["list", "watch", "get"] # 增加get权限 9 | 10 | --- 11 | apiVersion: rbac.authorization.k8s.io/v1 12 | kind: ClusterRoleBinding 13 | metadata: 14 | name: micro-device-plugin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: default 18 | namespace: kube-system 19 | roleRef: 20 | kind: ClusterRole 21 | name: micro-device-plugin 22 | apiGroup: rbac.authorization.k8s.io 23 | --- 24 | # 原有DaemonSet保持不变 25 | apiVersion: apps/v1 26 | kind: DaemonSet 27 | metadata: 28 | name: micro-device-plugin 29 | namespace: kube-system 30 | labels: 31 | app: micro-device-plugin 32 | spec: 33 | selector: 34 | matchLabels: 35 | app: micro-device-plugin 36 | template: 37 | metadata: 38 | labels: 39 | app: micro-device-plugin 40 | spec: 41 | hostNetwork: true 42 | containers: 43 | - name: plugin 44 | image: binyue/micro-device-plugin:v1.0.13 # 更新版本 45 | imagePullPolicy: IfNotPresent 46 | securityContext: 47 | privileged: true 48 | resources: 49 | limits: 50 | cpu: 100m 51 | memory: 128Mi 52 | env: 53 | - name: POD_NAME 54 | valueFrom: 55 | fieldRef: 56 | fieldPath: metadata.name 57 | - name: POD_NAMESPACE 58 | valueFrom: 59 | fieldRef: 60 | fieldPath: metadata.namespace 61 | - name: POD_UID 62 | valueFrom: 63 | fieldRef: 64 | fieldPath: metadata.uid 65 | - name: NODE_NAME 66 | valueFrom: 67 | fieldRef: 68 | fieldPath: spec.nodeName 69 | - name: LD_LIBRARY_PATH 70 | value: /usr/lib/x86_64-linux-gnu:/host-lib 71 | # 启用MIG配置 72 | - name: ENABLE_MIG 73 | value: "true" 74 | # 设置切分策略(默认3g.20gb) 75 | - name: MIG_PROFILE 76 | value: "3g.20gb" 77 | # 新增实例数量配置 (可选) 78 | - name: MIG_INSTANCE_COUNT 79 | value: "2" # 显式指定创建4个实例 80 | 81 | # 是否跳过已切分的设备 82 | - name: SKIP_CONFIGURED 83 | value: "true" 84 | - name: CDI_ENABLED 85 | value: "false" # 启用CDI 86 | - name: CDI_PREFIX 87 | value: "micro.device" # CDI前缀 88 | volumeMounts: 89 | - name: device-plugin 90 | mountPath: /var/lib/kubelet/device-plugins 91 | - name: dev 92 | mountPath: /dev 93 | - name: nvidia-bin 94 | mountPath: /host-driver/nvidia-smi 95 | subPath: nvidia-smi 96 | - name: nvidia-lib 97 | mountPath: /host-lib 98 | - name: dev-caps 99 | mountPath: /dev/nvidia-caps 100 | - name: cdi-dir # 新增卷 101 | mountPath: /etc/cdi 102 | 103 | volumes: 104 | - name: device-plugin 105 | hostPath: 106 | path: /var/lib/kubelet/device-plugins 107 | type: DirectoryOrCreate 108 | - name: dev 109 | hostPath: 110 | path: /dev 111 | - name: nvidia-bin 112 | hostPath: 113 | path: /usr/bin 114 | - name: nvidia-lib 115 | hostPath: 116 | path: /usr/lib/x86_64-linux-gnu 117 | - name: dev-caps 118 | hostPath: 119 | path: /dev/nvidia-caps 120 | type: Directory 121 | - name: cdi-dir # 新增卷定义 122 | 
hostPath: 123 | path: /etc/cdi 124 | type: DirectoryOrCreate 125 | -------------------------------------------------------------------------------- /pkg/deviceplugin/server.go: -------------------------------------------------------------------------------- 1 | package deviceplugin 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "os" 8 | "path" 9 | "strings" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/benyuereal/micro-device-plugin/pkg/allocator" 14 | "github.com/benyuereal/micro-device-plugin/pkg/device" 15 | "google.golang.org/grpc" 16 | 17 | corev1 "k8s.io/api/core/v1" 18 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | "k8s.io/client-go/kubernetes" 20 | "k8s.io/client-go/rest" 21 | "k8s.io/klog/v2" 22 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 23 | ) 24 | 25 | const ( 26 | socketPrefix = "microui.sock" 27 | kubeletSocket = pluginapi.KubeletSocket 28 | restartDelay = 5 * time.Second 29 | ) 30 | 31 | type DevicePluginServer struct { 32 | vendor string 33 | resource string 34 | socket string 35 | stop chan struct{} 36 | healthChan chan string 37 | allocator allocator.Allocator 38 | manager device.DeviceManager 39 | server *grpc.Server 40 | lastDeviceState map[string]string // 使用字符串记录健康状态 41 | deviceMap map[string]device.GPUDevice // 设备ID到设备对象的映射 42 | cdiEnabled bool 43 | cdiPrefix string // 添加CDI前缀配置 44 | kubeClient *kubernetes.Clientset // 新增 Kubernetes 客户端 45 | nodeName string // 新增节点名称 46 | } 47 | 48 | func New(vendor string, manager device.DeviceManager, cdiEnabled bool, cdiPrefix string, nodeName string) *DevicePluginServer { 49 | // 创建 Kubernetes 客户端 50 | config, _ := rest.InClusterConfig() 51 | kubeClient, _ := kubernetes.NewForConfig(config) 52 | return &DevicePluginServer{ 53 | vendor: vendor, 54 | resource: vendor + ".com/microgpu", 55 | socket: path.Join(pluginapi.DevicePluginPath, socketPrefix+"."+vendor), 56 | stop: make(chan struct{}), 57 | healthChan: make(chan string, 1), 58 | manager: manager, 59 | allocator: allocator.NewSimpleAllocator(), 60 | lastDeviceState: make(map[string]string), 61 | deviceMap: make(map[string]device.GPUDevice), 62 | cdiEnabled: cdiEnabled, 63 | cdiPrefix: cdiPrefix, 64 | kubeClient: kubeClient, 65 | nodeName: nodeName, 66 | } 67 | } 68 | 69 | // ListAndWatch 实现设备插件服务 70 | func (s *DevicePluginServer) ListAndWatch(_ *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error { 71 | klog.Infof("Starting ListAndWatch for %s device plugin", s.vendor) 72 | 73 | // 初始设备列表 74 | if err := s.updateDeviceList(stream); err != nil { 75 | return err 76 | } 77 | 78 | // 定时更新和健康检查 79 | ticker := time.NewTicker(10 * time.Second) 80 | defer ticker.Stop() 81 | 82 | for { 83 | select { 84 | case <-ticker.C: 85 | klog.V(5).Infof("Periodic device list update for %s", s.vendor) 86 | if err := s.updateDeviceList(stream); err != nil { 87 | return err 88 | } 89 | case id := <-s.healthChan: 90 | klog.Warningf("Device %s health status changed, updating device list", id) 91 | if err := s.updateDeviceList(stream); err != nil { 92 | return err 93 | } 94 | case <-s.stop: 95 | klog.Infof("Stopping ListAndWatch for %s device plugin", s.vendor) 96 | return nil 97 | } 98 | } 99 | } 100 | 101 | func (s *DevicePluginServer) updateDeviceList(stream pluginapi.DevicePlugin_ListAndWatchServer) error { 102 | devices, err := s.manager.DiscoverGPUs() 103 | if err != nil { 104 | klog.Errorf("Failed to discover devices: %v", err) 105 | return fmt.Errorf("failed to discover devices: %v", err) 106 | } 107 | // 新增:清理已消失设备的分配状态 108 | discoveredIDs := 
make(map[string]bool) 109 | for _, d := range devices { 110 | discoveredIDs[d.ID()] = true 111 | } 112 | s.allocator.CleanupOrphanedDevices(discoveredIDs) 113 | 114 | // 修复:在更新设备列表时重建deviceMap 115 | newDeviceMap := make(map[string]device.GPUDevice) 116 | for _, d := range devices { 117 | newDeviceMap[d.ID()] = d 118 | } 119 | s.deviceMap = newDeviceMap 120 | klog.Infof("Discovered %d new devices, deviceMap %v", len(newDeviceMap), newDeviceMap) 121 | 122 | deviceList := make([]*pluginapi.Device, len(devices)) 123 | healthStatusCount := map[string]int{ 124 | pluginapi.Healthy: 0, 125 | pluginapi.Unhealthy: 0} 126 | 127 | for i, d := range devices { 128 | // 更新设备健康状态 129 | healthy := s.manager.CheckHealth(d.ID()) 130 | state := pluginapi.Healthy 131 | if !healthy { 132 | state = pluginapi.Unhealthy 133 | } 134 | healthStatusCount[state]++ 135 | 136 | // 记录状态变化 137 | if prevState, exists := s.lastDeviceState[d.ID()]; exists && prevState != state { 138 | klog.Infof("Device %s health changed from %s to %s", d.ID(), prevState, state) 139 | } 140 | s.lastDeviceState[d.ID()] = state 141 | 142 | deviceList[i] = &pluginapi.Device{ 143 | ID: d.ID(), 144 | Health: state, 145 | } 146 | } 147 | 148 | klog.Infof("Updating device list for %s: %d devices (%d healthy, %d unhealthy)", 149 | s.vendor, len(deviceList), healthStatusCount[pluginapi.Healthy], healthStatusCount[pluginapi.Unhealthy]) 150 | 151 | return stream.Send(&pluginapi.ListAndWatchResponse{Devices: deviceList}) 152 | } 153 | 154 | // Allocate 设备分配实现 - 生产级MIG支持 155 | func (s *DevicePluginServer) Allocate(ctx context.Context, req *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 156 | klog.Infof("Received Allocate request for %s: %v", s.resource, req.ContainerRequests) 157 | response := pluginapi.AllocateResponse{} 158 | 159 | // 修复:从请求的注解中获取 Pod UID(Kubernetes 标准方式) 160 | // 方法1: 尝试从环境变量获取 Pod 信息 161 | podName := os.Getenv("POD_NAME") 162 | podNamespace := os.Getenv("POD_NAMESPACE") 163 | podUID := "" 164 | if podName != "" && podNamespace != "" { 165 | pod, err := s.kubeClient.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{}) 166 | if err != nil { 167 | klog.Warningf("Failed to get pod %s/%s: %v", podNamespace, podName, err) 168 | } else { 169 | podUID = string(pod.UID) 170 | klog.Infof("Found pod UID via API: %s", podUID) 171 | } 172 | } 173 | 174 | for _, containerReq := range req.ContainerRequests { 175 | containerResp := new(pluginapi.ContainerAllocateResponse) 176 | 177 | // 获取 Pod UI 178 | // 尝试分配这些设备 179 | // 在分配设备前检查设备是否可用 180 | for _, devID := range containerReq.DevicesIDs { 181 | if !s.allocator.IsAvailable(devID) { 182 | // 如果设备已被分配但Pod不存在,清除错误状态 183 | if !s.isPodActive(s.allocator.GetPodUID(devID)) { 184 | s.allocator.Deallocate([]string{devID}) 185 | } else { 186 | return nil, fmt.Errorf("device %s is already allocated", devID) 187 | } 188 | } 189 | } 190 | 191 | if err := s.allocator.Allocate(containerReq.DevicesIDs, podUID); err != nil { 192 | klog.Errorf("Allocation failed for devices %v: %v", containerReq.DevicesIDs, err) 193 | return nil, fmt.Errorf("allocation failed: %v", err) 194 | } 195 | 196 | // ================= 核心环境变量设置 ================= 197 | envs := make(map[string]string) 198 | 199 | // 关键修改:使用物理索引而非设备ID 200 | envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(containerReq.DevicesIDs, ",") 201 | envs["NVIDIA_DRIVER_CAPABILITIES"] = "compute,utility,video,graphics" 202 | envs["NVIDIA_DISABLE_REQUIRE"] = "1" 203 | envs["NVIDIA_REQUIRE_MIG"] = "1" 204 | 205 | containerResp.Envs = envs 206 
| 207 | // 打印环境变量用于调试 208 | for k, v := range containerResp.Envs { 209 | klog.Infof("Setting env: %s=%s", k, v) 210 | } 211 | 212 | // 添加 CDI 设备注入 213 | if s.cdiEnabled { 214 | cdiDevices := make([]string, len(containerReq.DevicesIDs)) 215 | for i, id := range containerReq.DevicesIDs { 216 | cdiDevices[i] = fmt.Sprintf("%s/%s=%s", s.cdiPrefix, s.vendor, id) 217 | } 218 | containerResp.CDIDevices = []*pluginapi.CDIDevice{ 219 | { 220 | Name: strings.Join(cdiDevices, ","), 221 | }, 222 | } 223 | } 224 | 225 | response.ContainerResponses = append(response.ContainerResponses, containerResp) 226 | } 227 | 228 | klog.Infof("Allocation successful for %s, req :%v, resp: %v", s.resource, req.ContainerRequests, 229 | response.ContainerResponses) 230 | return &response, nil 231 | } 232 | 233 | func (s *DevicePluginServer) isMIGDevice(id string) bool { 234 | devices, _ := s.manager.DiscoverGPUs() 235 | for _, d := range devices { 236 | if d.ID() == id && d.IsMIG() { 237 | return true 238 | } 239 | } 240 | return false 241 | } 242 | 243 | // GetDevicePluginOptions 插件选项 244 | func (s *DevicePluginServer) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 245 | return &pluginapi.DevicePluginOptions{ 246 | PreStartRequired: false, 247 | }, nil 248 | } 249 | 250 | // PreStartContainer 容器启动前预处理(可选) 251 | func (s *DevicePluginServer) PreStartContainer(ctx context.Context, req *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 252 | return &pluginapi.PreStartContainerResponse{}, nil 253 | } 254 | 255 | // GetPreferredAllocation 分配偏好(可选) 256 | func (s *DevicePluginServer) GetPreferredAllocation(ctx context.Context, req *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { 257 | return &pluginapi.PreferredAllocationResponse{}, nil 258 | } 259 | 260 | // *********** 服务管理方法 *********** 261 | 262 | // Start 启动设备插件服务 263 | func (s *DevicePluginServer) Start(ctx context.Context) error { 264 | klog.Infof("Starting %s device plugin", s.vendor) 265 | 266 | // 启动资源回收器(每 30 秒运行一次) 267 | go s.ResourceRecycler(ctx, 30*time.Second) // 共享主流程上下文 268 | // 如果是NVIDIA设备,配置MIG 269 | if nvidiaManager, ok := s.manager.(*device.NVIDIAManager); ok { 270 | nvidiaManager.ConfigureMIG() 271 | } 272 | 273 | // 确保插件目录存在 274 | if err := os.MkdirAll(pluginapi.DevicePluginPath, 0755); err != nil { 275 | klog.Errorf("Failed to create device plugin directory: %v", err) 276 | return fmt.Errorf("failed to create device plugin directory: %v", err) 277 | } 278 | 279 | // 清理现有的socket文件 280 | if err := syscall.Unlink(s.socket); err != nil && !os.IsNotExist(err) { 281 | klog.Errorf("Failed to unlink socket: %v", err) 282 | return fmt.Errorf("failed to unlink socket: %v", err) 283 | } 284 | 285 | // 创建监听 286 | lis, err := net.Listen("unix", s.socket) 287 | if err != nil { 288 | klog.Errorf("Failed to listen on socket: %v", err) 289 | return fmt.Errorf("failed to listen on socket: %v", err) 290 | } 291 | 292 | // 创建gRPC服务 293 | s.server = grpc.NewServer() 294 | pluginapi.RegisterDevicePluginServer(s.server, s) 295 | 296 | // 启动gRPC服务 297 | go func() { 298 | klog.Infof("Starting %s device plugin server at: %s", s.vendor, s.socket) 299 | if err := s.server.Serve(lis); err != nil { 300 | klog.Fatalf("Device plugin server failed: %v", err) 301 | } 302 | }() 303 | 304 | // 等待服务器启动 305 | connCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 306 | defer cancel() 307 | 308 | if err := waitForSocket(connCtx, 
s.socket); err != nil { 309 | klog.Errorf("Failed to start gRPC server: %v", err) 310 | return fmt.Errorf("failed to start gRPC server: %v", err) 311 | } 312 | 313 | // 注册到kubelet 314 | if err := s.registerWithKubelet(); err != nil { 315 | klog.Errorf("Failed to register with kubelet: %v", err) 316 | return fmt.Errorf("failed to register with kubelet: %v", err) 317 | } 318 | 319 | klog.Infof("%s device plugin started and registered with resource name %s", s.vendor, s.resource) 320 | 321 | return nil 322 | } 323 | 324 | // Stop 停止设备插件 325 | func (s *DevicePluginServer) Stop() { 326 | klog.Infof("Stopping %s device plugin", s.vendor) 327 | close(s.stop) 328 | if s.server != nil { 329 | s.server.Stop() 330 | } 331 | } 332 | 333 | // HealthCheck 后台健康检查 334 | func (s *DevicePluginServer) HealthCheck(ctx context.Context, interval time.Duration) { 335 | klog.Infof("Starting health check for %s plugin with interval %v", s.vendor, interval) 336 | ticker := time.NewTicker(interval) 337 | defer ticker.Stop() 338 | 339 | for { 340 | select { 341 | case <-ticker.C: 342 | devices, err := s.manager.DiscoverGPUs() 343 | if err != nil { 344 | klog.Errorf("Failed to discover devices during health check: %v", err) 345 | continue 346 | } 347 | 348 | for _, d := range devices { 349 | currentHealth := d.IsHealthy() 350 | actualHealth := s.manager.CheckHealth(d.ID()) 351 | 352 | if currentHealth != actualHealth { 353 | klog.Warningf("Device %s health status changed from %v to %v", d.ID(), currentHealth, actualHealth) 354 | s.healthChan <- d.ID() 355 | } 356 | } 357 | case <-ctx.Done(): 358 | klog.Infof("Stopping health check for %s plugin", s.vendor) 359 | return 360 | } 361 | } 362 | } 363 | 364 | // *********** 辅助方法 *********** 365 | 366 | func (s *DevicePluginServer) registerWithKubelet() error { 367 | klog.Infof("Registering with kubelet at %s", kubeletSocket) 368 | 369 | conn, err := grpc.Dial(kubeletSocket, grpc.WithInsecure(), 370 | grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { 371 | return (&net.Dialer{}).DialContext(ctx, "unix", addr) 372 | }), 373 | ) 374 | 375 | if err != nil { 376 | return fmt.Errorf("failed to connect to kubelet: %v", err) 377 | } 378 | defer conn.Close() 379 | 380 | client := pluginapi.NewRegistrationClient(conn) 381 | req := &pluginapi.RegisterRequest{ 382 | Version: pluginapi.Version, 383 | Endpoint: path.Base(s.socket), 384 | ResourceName: s.resource, 385 | } 386 | 387 | _, err = client.Register(context.Background(), req) 388 | return err 389 | } 390 | 391 | func waitForSocket(ctx context.Context, socket string) error { 392 | klog.V(4).Infof("Waiting for socket %s to be ready", socket) 393 | 394 | for { 395 | select { 396 | case <-ctx.Done(): 397 | return ctx.Err() 398 | default: 399 | if conn, err := net.Dial("unix", socket); err == nil { 400 | conn.Close() 401 | klog.V(4).Infof("Socket %s is ready", socket) 402 | return nil 403 | } 404 | time.Sleep(restartDelay) 405 | } 406 | } 407 | } 408 | 409 | // 新增方法:资源回收器 410 | func (s *DevicePluginServer) ResourceRecycler(ctx context.Context, interval time.Duration) { 411 | klog.Infof("Starting resource recycler for %s plugin", s.vendor) 412 | ticker := time.NewTicker(interval) 413 | defer ticker.Stop() 414 | 415 | for { 416 | select { 417 | case <-ticker.C: 418 | 419 | allocatedMap := s.allocator.GetAllocationMap() // 获取设备到 Pod 的映射 420 | if len(allocatedMap) == 0 { 421 | continue 422 | } 423 | 424 | // 检查已分配设备对应的 Pod 425 | var toRelease []string 426 | for deviceID, podUID := range allocatedMap { 427 
| if podUID == "" { 428 | toRelease = append(toRelease, deviceID) // 无主设备直接释放 429 | continue 430 | } 431 | 432 | // 检查 Pod 状态:只有非活动状态(终止/完成)才释放 433 | if !s.isPodActive(podUID) { 434 | toRelease = append(toRelease, deviceID) 435 | klog.Infof("Marking device %s for release (pod %s is inactive)", deviceID, podUID) 436 | } 437 | } 438 | 439 | // 释放资源 440 | if len(toRelease) > 0 { 441 | s.allocator.Deallocate(toRelease) 442 | klog.Infof("Released %d orphaned devices, deivce %v", len(toRelease), toRelease) 443 | } 444 | 445 | case <-ctx.Done(): 446 | klog.Infof("Stopping resource recycler for %s plugin", s.vendor) 447 | return 448 | } 449 | } 450 | } 451 | 452 | // isPodActive 检查 Pod 是否处于活动状态(非终止/完成) 453 | func (s *DevicePluginServer) isPodActive(podUID string) bool { 454 | if podUID == "" { 455 | return false 456 | } 457 | pod, err := s.kubeClient.CoreV1().Pods("").Get(context.Background(), "", metav1.GetOptions{}) 458 | if err != nil { 459 | klog.Warningf("Failed to get pod with UID %s: %v", podUID, err) 460 | return false // 默认按非活动处理 461 | } 462 | if pod.DeletionTimestamp != nil { 463 | return false // 正在终止,视为非活动 464 | } 465 | 466 | // 活动状态:Running 或 Pending 467 | if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodPending { 468 | return true 469 | } 470 | // 非活动状态:Succeeded(完成)、Failed(失败)或正在删除(DeletionTimestamp 非空) 471 | return false 472 | } 473 | -------------------------------------------------------------------------------- /pkg/device/nvidia.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "k8s.io/klog/v2" 14 | ) 15 | 16 | type NVIDIADevice struct { 17 | id string 18 | deviceIndex string // 系统设备索引 19 | physicalID string // 物理GPU ID 20 | migEnabled bool // 是否为MIG设备 21 | profile string // MIG配置类型 22 | healthy bool 23 | } 24 | 25 | func (d *NVIDIADevice) ID() string { return d.id } 26 | func (d *NVIDIADevice) IsHealthy() bool { return d.healthy } 27 | func (d *NVIDIADevice) GetVendor() string { return "nvidia" } 28 | 29 | // device/nvidia.go 30 | func (d *NVIDIADevice) GetPath() string { 31 | if d.migEnabled { 32 | // 生成设备节点名称(如 nvidia-cap12) 33 | return fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%s", d.physicalID) 34 | } 35 | return "/dev/nvidia" + d.physicalID 36 | } 37 | func (d *NVIDIADevice) IsMIG() bool { return d.migEnabled } 38 | func (d *NVIDIADevice) PhysicalID() string { // 对于MIG设备返回物理GPU索引(如"0") 39 | if d.migEnabled { 40 | return d.physicalID 41 | } 42 | return d.deviceIndex 43 | } 44 | func (d *NVIDIADevice) Profile() string { return d.profile } 45 | 46 | type NVIDIAManager struct { 47 | lastDiscovery time.Time 48 | devices []GPUDevice 49 | deviceMap map[string]*NVIDIADevice // 设备ID到设备对象的映射 50 | discoverySync sync.Mutex 51 | migManager *MIGManager 52 | } 53 | 54 | // 初始化MIG管理器 55 | func NewNVIDIAManager() *NVIDIAManager { 56 | return &NVIDIAManager{ 57 | migManager: NewMIGManager(), 58 | deviceMap: make(map[string]*NVIDIADevice), 59 | } 60 | } 61 | 62 | // 获取nvidia-smi的路径 63 | func getNvidiaSmiPath() string { 64 | if customPath := os.Getenv("NVIDIA_SMI_PATH"); customPath != "" { 65 | klog.V(4).Infof("Using custom NVIDIA-SMI path: %s", customPath) 66 | return customPath 67 | } 68 | return "/host-driver/nvidia-smi" 69 | } 70 | 71 | // 确保命令使用正确的库路径 72 | func runNvidiaSmiCommand(args ...string) ([]byte, error) { 73 | cmd := exec.Command(getNvidiaSmiPath(), args...) 
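	// The env overrides below let the host-mounted nvidia-smi (see getNvidiaSmiPath)
	// resolve its driver libraries from /host-lib, which manifests/daemonset.yaml mounts
	// from the node's /usr/lib/x86_64-linux-gnu, so the plugin image does not have to
	// ship the NVIDIA driver itself.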
74 | cmd.Env = append(os.Environ(), 75 | "LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/host-lib", 76 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 77 | ) 78 | klog.Infof("Executing NVIDIA-SMI command: %v", cmd.Args) 79 | return cmd.CombinedOutput() 80 | } 81 | 82 | func (m *NVIDIAManager) DiscoverGPUs() ([]GPUDevice, error) { 83 | m.discoverySync.Lock() 84 | defer m.discoverySync.Unlock() 85 | 86 | // 使用缓存机制 87 | if time.Since(m.lastDiscovery) < 5*time.Minute && m.devices != nil { 88 | klog.V(4).Infof("Using cached NVIDIA devices (last discovery: %s)", m.lastDiscovery) 89 | return m.devices, nil 90 | } 91 | 92 | klog.Info("Discovering NVIDIA devices") 93 | 94 | // 重置设备映射 95 | m.deviceMap = make(map[string]*NVIDIADevice) 96 | var devices []GPUDevice 97 | 98 | // 步骤1: 获取所有GPU设备列表 99 | out, err := runNvidiaSmiCommand("--query-gpu=index,uuid,memory.total,mig.mode.current", "--format=csv,noheader") 100 | if err != nil { 101 | klog.Errorf("Failed to discover NVIDIA GPUs: %v", err) 102 | return nil, err 103 | } 104 | 105 | lines := strings.Split(strings.TrimSpace(string(out)), "\n") 106 | for _, line := range lines { 107 | fields := strings.Split(line, ",") 108 | if len(fields) < 4 { 109 | continue 110 | } 111 | 112 | gpuIndex := strings.TrimSpace(fields[0]) 113 | gpuUUID := strings.TrimSpace(fields[1]) 114 | migMode := strings.TrimSpace(fields[3]) 115 | 116 | // 步骤2: 检查MIG模式 117 | if migMode == "Enabled" && os.Getenv("ENABLE_MIG") == "true" { 118 | 119 | // 获取MIG设备 120 | migDevices, err := m.discoverMIGDevices(gpuIndex) 121 | if err != nil { 122 | klog.Errorf("Failed to discover MIG devices for GPU %s: %v", gpuIndex, err) 123 | continue 124 | } 125 | devices = append(devices, migDevices...) 126 | } else { 127 | // 普通GPU设备 128 | device := &NVIDIADevice{ 129 | id: gpuUUID, 130 | deviceIndex: gpuIndex, 131 | physicalID: gpuIndex, 132 | migEnabled: false, 133 | healthy: true, 134 | } 135 | devices = append(devices, device) 136 | m.deviceMap[gpuUUID] = device 137 | } 138 | } 139 | 140 | klog.Infof("Discovered %d NVIDIA devices", len(devices)) 141 | for _, d := range devices { 142 | nvDevice := d.(*NVIDIADevice) 143 | klog.Infof("NVIDIA Device: ID=%s, Index=%s, MIG=%v, Profile=%s", 144 | nvDevice.ID(), nvDevice.deviceIndex, nvDevice.IsMIG(), nvDevice.Profile()) 145 | } 146 | 147 | m.devices = devices 148 | m.lastDiscovery = time.Now() 149 | return devices, nil 150 | } 151 | 152 | // 发现MIG设备 153 | func (m *NVIDIAManager) discoverMIGDevices(gpuIndex string) ([]GPUDevice, error) { 154 | var devices []GPUDevice 155 | 156 | // 查询GPU实例(GPU Instances) 157 | out, err := runNvidiaSmiCommand("mig", "-lgi", "-i", gpuIndex) 158 | output := strings.TrimSpace(string(out)) 159 | 160 | // 处理无GPU实例的情况 161 | if strings.Contains(output, "No GPU instances found") { 162 | klog.Infof("No MIG GPU instances found on GPU %s", gpuIndex) 163 | return devices, nil 164 | } 165 | 166 | if err != nil { 167 | klog.Errorf("Failed to query GPU instances for GPU %s: %v", gpuIndex, err) 168 | return nil, err 169 | } 170 | 171 | uuids, err := m.getMIGDeviceUUIDs(gpuIndex) 172 | 173 | for index, uuid := range uuids { 174 | // 创建设备ID: GPUIndex-GI-CI 175 | 176 | klog.Infof("Device ID: %s", uuid) 177 | device := &NVIDIADevice{ 178 | id: uuid, 179 | deviceIndex: string(rune(index)), // 使用GPU实例ID作为设备索引 180 | physicalID: gpuIndex, 181 | migEnabled: true, 182 | profile: "3g.20gb", 183 | healthy: true, 184 | } 185 | klog.Infof("device: %v", device) 186 | devices = append(devices, device) 187 | m.deviceMap[uuid] = device 188 
| 189 | klog.Infof("Found device: %v", device) 190 | } 191 | 192 | return devices, nil 193 | } 194 | 195 | // 获取指定GPU上的MIG设备UUID 196 | func (m *NVIDIAManager) getMIGDeviceUUIDs(gpuIndex string) ([]string, error) { 197 | // 使用nvidia-smi -L命令获取所有GPU信息 198 | out, err := runNvidiaSmiCommand("-L") 199 | if err != nil { 200 | return nil, fmt.Errorf("failed to get MIG UUIDs: %v", err) 201 | } 202 | 203 | output := strings.TrimSpace(string(out)) 204 | lines := strings.Split(output, "\n") 205 | 206 | var uuids []string 207 | currentGPU := "" 208 | 209 | for _, line := range lines { 210 | // 匹配GPU行 211 | if strings.HasPrefix(line, "GPU "+gpuIndex+":") { 212 | currentGPU = gpuIndex 213 | continue 214 | } 215 | 216 | // 匹配MIG设备行 217 | if currentGPU == gpuIndex && strings.Contains(line, "MIG") && strings.Contains(line, "UUID") { 218 | parts := strings.Split(line, "UUID:") 219 | if len(parts) >= 2 { 220 | uuid := strings.TrimSpace(parts[1]) 221 | // 移除末尾的括号 222 | uuid = strings.TrimSuffix(uuid, ")") 223 | uuids = append(uuids, uuid) 224 | } 225 | } 226 | } 227 | 228 | klog.Infof("Found %d MIG UUIDs for GPU %s: %v", len(uuids), gpuIndex, uuids) 229 | return uuids, nil 230 | } 231 | 232 | func (m *NVIDIAManager) getProfileName(profileID string) (string, error) { 233 | // 查询所有可用profile 234 | out, err := runNvidiaSmiCommand("mig", "-lgip") 235 | if err != nil { 236 | return "", err 237 | } 238 | 239 | lines := strings.Split(strings.TrimSpace(string(out)), "\n") 240 | for _, line := range lines { 241 | if strings.Contains(line, profileID) { 242 | // 示例行: " 19 4 4 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1g.10gb" 243 | fields := strings.Fields(line) 244 | if len(fields) > 0 { 245 | // 最后一个字段是profile名称 246 | return fields[len(fields)-1], nil 247 | } 248 | } 249 | } 250 | return "unknown", fmt.Errorf("profile not found for ID %s", profileID) 251 | } 252 | 253 | // 健康检查 254 | func (m *NVIDIAManager) CheckHealth(deviceID string) bool { 255 | klog.V(5).Infof("Checking health of NVIDIA device %s", deviceID) 256 | 257 | // 从设备映射中获取设备 258 | device, exists := m.deviceMap[deviceID] 259 | if !exists { 260 | klog.Warningf("Device %s not found in device map", deviceID) 261 | return false 262 | } 263 | 264 | // 对于MIG设备,检查其物理GPU的健康 265 | targetID := deviceID 266 | if device.IsMIG() { 267 | targetID = device.PhysicalID() 268 | } 269 | 270 | // 使用更通用的健康检查方式 271 | out, err := runNvidiaSmiCommand("-i", targetID, "--query-gpu=utilization.gpu", "--format=csv,noheader") 272 | if err != nil { 273 | klog.Errorf("Failed to check health for NVIDIA device %s: %v", targetID, err) 274 | return false 275 | } 276 | 277 | // 如果能够获取到GPU利用率数据,则认为设备健康 278 | utilization := strings.TrimSpace(string(out)) 279 | if utilization != "" { 280 | klog.V(4).Infof("NVIDIA device %s is healthy (utilization: %s%%)", targetID, utilization) 281 | return true 282 | } 283 | 284 | return false 285 | } 286 | 287 | // MIG管理功能 288 | func (m *NVIDIAManager) ConfigureMIG() { 289 | klog.Info("Configuring MIG devices") 290 | m.migManager.Configure() 291 | } 292 | 293 | // MIG管理器 294 | type MIGManager struct { 295 | enabled bool 296 | profile string 297 | skipConfigured bool 298 | instanceCount int // 每个GPU上要创建的实例数 299 | gpuMemory uint64 // GPU显存大小(MB) 300 | } 301 | 302 | func NewMIGManager() *MIGManager { 303 | enabled := os.Getenv("ENABLE_MIG") == "true" 304 | profile := os.Getenv("MIG_PROFILE") 305 | if profile == "" { 306 | profile = "3g.20gb" // 默认20GB切分策略 307 | } 308 | 309 | skipConfigured := os.Getenv("SKIP_CONFIGURED") == "true" 310 | 311 | // 
读取实例数量配置 312 | instanceCount := 0 // 0表示自动计算 313 | if countStr := os.Getenv("MIG_INSTANCE_COUNT"); countStr != "" { 314 | if count, err := strconv.Atoi(countStr); err == nil { 315 | instanceCount = count 316 | } 317 | } 318 | 319 | return &MIGManager{ 320 | enabled: enabled, 321 | profile: profile, 322 | skipConfigured: skipConfigured, 323 | instanceCount: instanceCount, 324 | } 325 | } 326 | 327 | func (m *MIGManager) Configure() { 328 | 329 | klog.Info("MIG configuration is in process ") 330 | 331 | if !m.enabled { 332 | klog.Info("MIG configuration is disabled") 333 | return 334 | } 335 | 336 | klog.Infof("Starting MIG configuration with profile: %s", m.profile) 337 | 338 | // 检查设备是否支持MIG 339 | if supported, err := m.isMigSupported(); err != nil { 340 | klog.Errorf("Failed to check MIG support: %v", err) 341 | return 342 | } else if !supported { 343 | klog.Warning("MIG is not supported on this device. Skipping MIG configuration.") 344 | return 345 | } 346 | 347 | // 2. 创建MIG设备 fixme 先不要创建设备 348 | //if err := m.createMIGDevices(); err != nil { 349 | // klog.Errorf("Failed to create MIG devices: %v", err) 350 | //} 351 | } 352 | 353 | // 检查设备是否支持MIG 354 | func (m *MIGManager) isMigSupported() (bool, error) { 355 | // 检查MIG支持状态 356 | out, err := runNvidiaSmiCommand("mig", "-lgip") 357 | output := strings.TrimSpace(string(out)) 358 | 359 | // 先检查特定不支持信息 360 | if strings.Contains(output, "No MIG-supported devices found") { 361 | klog.V(4).Info("MIG not supported: No MIG-supported devices found") 362 | return false, nil 363 | } 364 | 365 | // 检查其他不支持情况 366 | if strings.Contains(output, "not supported") { 367 | klog.V(4).Infof("MIG not supported: %s", output) 368 | return false, nil 369 | } 370 | 371 | // 处理命令错误 372 | if err != nil { 373 | klog.V(4).Infof("MIG command failed: %s", output) 374 | return false, fmt.Errorf("MIG command failed: %v", err) 375 | } 376 | 377 | // 检查有效输出(应该包含设备信息) 378 | if len(output) > 0 && !strings.Contains(output, "error") { 379 | klog.V(4).Infof("MIG supported devices found: %s", output) 380 | return true, nil 381 | } 382 | 383 | klog.V(4).Infof("Unknown MIG support status: %s", output) 384 | return false, nil 385 | } 386 | 387 | func (m *MIGManager) enableMIGMode() error { 388 | out, err := runNvidiaSmiCommand("--enable-mig") 389 | if err != nil { 390 | return err 391 | } 392 | klog.V(4).Infof("MIG enable output: %s", string(out)) 393 | return nil 394 | } 395 | 396 | // 获取GPU显存大小 397 | func (m *MIGManager) getGPUMemory(gpuIndex string) (uint64, error) { 398 | out, err := runNvidiaSmiCommand("-i", gpuIndex, "--query-gpu=memory.total", "--format=csv,noheader,nounits") 399 | if err != nil { 400 | return 0, err 401 | } 402 | 403 | memoryStr := strings.TrimSpace(string(out)) 404 | memoryMB, err := strconv.ParseUint(memoryStr, 10, 64) 405 | if err != nil { 406 | return 0, fmt.Errorf("failed to parse GPU memory: %v", err) 407 | } 408 | 409 | return memoryMB, nil 410 | } 411 | 412 | // 从profile中提取显存需求 (GB) 413 | func (m *MIGManager) getProfileMemoryReq() uint64 { 414 | parts := strings.Split(m.profile, ".") 415 | if len(parts) < 2 { 416 | return 0 417 | } 418 | 419 | memPart := parts[1] 420 | if strings.HasSuffix(memPart, "gb") { 421 | memPart = strings.TrimSuffix(memPart, "gb") 422 | } else if strings.HasSuffix(memPart, "g") { 423 | memPart = strings.TrimSuffix(memPart, "g") 424 | } 425 | 426 | memGB, err := strconv.ParseUint(memPart, 10, 64) 427 | if err != nil { 428 | klog.Warningf("Failed to parse memory requirement from profile %s: %v", m.profile, err) 429 | return 0 
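		// (A zero return propagates to createMIGDevices, where maxInstances stays 0,
		// createCount ends up 0 and the GPU is skipped with "Cannot determine instance
		// count", so a malformed MIG_PROFILE never triggers instance creation.)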
430 |     }
431 | 
432 |     return memGB * 1024 // convert GB to MB
433 | }
434 | 
435 | /*
436 |  * Reference: NVIDIA MIG User Guide
437 |  * https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html
438 |  */
439 | func (m *MIGManager) createMIGDevices() error {
440 |     // List all GPU indexes on the node
441 |     out, err := runNvidiaSmiCommand("--query-gpu=index", "--format=csv,noheader")
442 |     if err != nil {
443 |         return err
444 |     }
445 | 
446 |     gpuIndexes := regexp.MustCompile(`\d+`).FindAllString(string(out), -1)
447 |     for _, index := range gpuIndexes {
448 |         // Check whether MIG mode is already enabled
449 |         out, err := runNvidiaSmiCommand("-i", index, "--query-gpu=mig.mode.current", "--format=csv,noheader")
450 |         if err != nil {
451 |             klog.Errorf("Failed to check MIG status for GPU %s: %v", index, err)
452 |             continue
453 |         }
454 | 
455 |         currentMode := strings.TrimSpace(string(out))
456 |         if currentMode != "Enabled" {
457 |             // Enable MIG mode; a GPU reset may be required before the change takes effect
458 |             if _, err := runNvidiaSmiCommand("-i", index, "-mig", "1"); err != nil {
459 |                 klog.Errorf("Failed to enable MIG for GPU %s: %v", index, err)
460 |                 continue
461 |             }
462 |             klog.Infof("Enabled MIG mode for GPU %s", index)
463 |         } else {
464 |             klog.Infof("GPU %s already in MIG mode", index)
465 |         }
466 | 
467 |         // Check for existing MIG devices
468 |         count, err := m.getMIGDeviceCount(index)
469 |         if err != nil {
470 |             klog.Errorf("Failed to get MIG device count for GPU %s: %v", index, err)
471 |             continue
472 |         }
473 | 
474 |         // Skip creation if the GPU is already partitioned and skipConfigured is set
475 |         if count > 0 && m.skipConfigured {
476 |             klog.Infof("Skipping GPU %s (already has %d MIG devices)", index, count)
477 |             continue
478 |         }
479 | 
480 |         // Otherwise destroy the existing compute and GPU instances first
481 |         if count > 0 {
482 |             klog.Infof("Destroying existing MIG devices on GPU %s", index)
483 |             if _, err := runNvidiaSmiCommand("mig", "-i", index, "-dci"); err != nil {
484 |                 klog.Errorf("Failed to destroy compute instances on GPU %s: %v", index, err)
485 |             }
486 |             if _, err := runNvidiaSmiCommand("mig", "-i", index, "-dgi"); err != nil {
487 |                 klog.Errorf("Failed to destroy GPU instances on GPU %s: %v", index, err)
488 |             }
489 |             time.Sleep(2 * time.Second) // wait for resources to be released
490 |         }
491 | 
492 |         // Query the GPU's total memory
493 |         totalMemory, err := m.getGPUMemory(index)
494 |         if err != nil {
495 |             klog.Errorf("Failed to get GPU memory for %s: %v", index, err)
496 |             continue
497 |         }
498 | 
499 |         // Estimate the maximum instance count from memory (an upper bound; the driver may allow fewer)
500 |         profileMem := m.getProfileMemoryReq()
501 |         maxInstances := 0
502 | 
503 |         if profileMem > 0 {
504 |             maxInstances = int(totalMemory / profileMem)
505 |             if maxInstances == 0 {
506 |                 klog.Warningf("GPU %s has insufficient memory (%dMB) for profile %s (%dMB required)",
507 |                     index, totalMemory, m.profile, profileMem)
508 |                 continue
509 |             }
510 |         }
511 | 
512 |         // Decide how many instances to create
513 |         createCount := maxInstances
514 |         if m.instanceCount > 0 {
515 |             if m.instanceCount > maxInstances {
516 |                 klog.Warningf("Requested %d instances exceeds maximum %d for GPU %s",
517 |                     m.instanceCount, maxInstances, index)
518 |                 createCount = maxInstances
519 |             } else {
520 |                 createCount = m.instanceCount
521 |             }
522 |         }
523 | 
524 |         if createCount == 0 {
525 |             klog.Errorf("Cannot determine instance count for GPU %s", index)
526 |             continue
527 |         }
528 | 
529 |         klog.Infof("Creating %d MIG device(s) with profile %s on GPU %s", createCount, m.profile, index)
530 | 
531 |         profileID, err := getProfileID(m.profile)
532 |         if err != nil {
533 |             klog.Errorf("Failed to get profile ID: %v", err)
534 |             continue
535 |         }
536 | 
537 |         // === fix: create all instances with a single nvidia-smi invocation ===
538 |         // Build a comma-separated ID list (e.g. "9,9" for 2 instances)
539 |         ids := make([]string, createCount)
540 |         for i := 0; i < createCount; i++ {
541 |             ids[i] = strconv.Itoa(profileID)
542 |         }
543 |         profileArg := strings.Join(ids, ",")
544 | 
545 |         // Create the GPU instances (and their compute instances via -C) in one call, scoped to this GPU
546 |         _, err = runNvidiaSmiCommand("mig", "-i", index, "-cgi", profileArg, "-C")
547 |         if err != nil {
548 |             klog.Errorf("Failed to create %d MIG devices on GPU %s: %v", createCount, index, err)
549 |         } else {
550 |             klog.Infof("Successfully created %d MIG devices on GPU %s", createCount, index)
551 |         }
552 |         // === end fix ===
553 |     }
554 | 
555 |     return nil
556 | }
557 | 
558 | func getProfileID(profileName string) (int, error) {
559 |     out, err := runNvidiaSmiCommand("mig", "-lgip")
560 |     if err != nil {
561 |         return 0, err
562 |     }
563 | 
564 |     // Regex matching profile rows such as "|   0  MIG 1g.5gb        19 ... |"
565 |     re := regexp.MustCompile(`\|\s+\d+\s+MIG\s+(\S+)\s+(\d+)`)
566 | 
567 |     lines := strings.Split(strings.TrimSpace(string(out)), "\n")
568 |     for _, line := range lines {
569 |         klog.V(4).Infof("nvidia-smi mig -lgip line: %s", line)
570 |         // Skip non-profile rows (table borders, headers, etc.)
571 |         if !strings.Contains(line, "MIG") || !strings.Contains(line, "|") {
572 |             continue
573 |         }
574 | 
575 |         matches := re.FindStringSubmatch(line)
576 |         if len(matches) > 2 {
577 |             name := matches[1]
578 |             idStr := matches[2]
579 | 
580 |             if name == profileName {
581 |                 profileID, err := strconv.Atoi(idStr)
582 |                 if err != nil {
583 |                     klog.Warningf("Invalid profile ID format: %s", idStr)
584 |                     continue
585 |                 }
586 |                 klog.Infof("Found profile %s with ID %d", profileName, profileID)
587 |                 return profileID, nil
588 |             }
589 |         }
590 |     }
591 |     return 0, fmt.Errorf("profile not found: %s", profileName)
592 | }
593 | 
594 | // getMIGDeviceCount returns the number of GPU instances currently configured on the given GPU.
595 | func (m *MIGManager) getMIGDeviceCount(gpuIndex string) (int, error) {
596 |     out, err := runNvidiaSmiCommand("mig", "-lgi", "-i", gpuIndex)
597 |     output := string(out)
598 | 
599 |     // Handle the no-MIG-devices case
600 |     if strings.Contains(output, "No GPU instances found") ||
601 |         strings.Contains(output, "Not Found") ||
602 |         strings.Contains(output, "No devices were found") {
603 |         klog.Infof("No MIG instances found on GPU %s", gpuIndex)
604 |         return 0, nil
605 |     }
606 | 
607 |     if err != nil {
608 |         // The command can fail simply because no instances exist; treat that as zero
609 |         if strings.Contains(err.Error(), "exit status 255") &&
610 |             (strings.Contains(output, "No GPU instances found") ||
611 |                 strings.Contains(output, "Not Found")) {
612 |             klog.Infof("No MIG devices on GPU %s (ignoring error)", gpuIndex)
613 |             return 0, nil
614 |         }
615 |         return 0, fmt.Errorf("nvidia-smi MIG query failed: %v, output: %s", err, output)
616 |     }
617 | 
618 |     // Count the GPU-instance rows in the table output. Data rows look like
619 |     // "|   0  MIG 3g.20gb     9    1    4:4   |"; matching them explicitly
620 |     // keeps table borders and header lines from being counted as devices.
621 |     dataRow := regexp.MustCompile(`\|\s+\d+\s+MIG\s+\S+`)
622 |     count := 0
623 |     for _, line := range strings.Split(output, "\n") {
624 |         if dataRow.MatchString(line) {
625 |             count++
626 |         }
627 |     }
628 | 
629 |     return count, nil
630 | }
631 | 
--------------------------------------------------------------------------------
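Note: the profile-ID lookup and the single `-cgi` call above both depend on the tabular output of `nvidia-smi mig -lgip`. The standalone Go sketch below is not part of the repository, and its sample table is illustrative only; it shows how the same regex used by `getProfileID` maps a profile name such as `1g.5gb` to the numeric ID that `nvidia-smi mig -cgi` expects.

```go
// Standalone sketch: map an `nvidia-smi mig -lgip` table row to the numeric
// profile ID passed to `nvidia-smi mig -cgi`. The sample table is illustrative,
// not captured from a real device.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

const sampleLGIP = `
+-----------------------------------------------------------------------------+
| GPU instance profiles:                                                      |
| GPU   Name             ID    Instances   Memory     P2P    SM    DEC   ENC  |
|                              Free/Total   GiB              CE    JPEG  OFA  |
|=============================================================================|
|   0  MIG 1g.5gb        19     7/7        4.75       No     14     0     0   |
|   0  MIG 2g.10gb       14     3/3        9.75       No     28     1     0   |
|   0  MIG 3g.20gb        9     2/2        19.62      No     42     2     0   |
+-----------------------------------------------------------------------------+
`

// profileRow is the same pattern getProfileID uses above.
var profileRow = regexp.MustCompile(`\|\s+\d+\s+MIG\s+(\S+)\s+(\d+)`)

// lookupProfileID returns the ID column for the named profile, if present.
func lookupProfileID(table, profileName string) (string, bool) {
	for _, line := range strings.Split(table, "\n") {
		if m := profileRow.FindStringSubmatch(line); len(m) > 2 && m[1] == profileName {
			return m[2], true
		}
	}
	return "", false
}

func main() {
	if id, ok := lookupProfileID(sampleLGIP, "1g.5gb"); ok {
		// Two instances of this profile would then be requested in one call:
		// nvidia-smi mig -i 0 -cgi <id>,<id> -C
		fmt.Printf("profile 1g.5gb -> ID %s\n", id)
	}
}
```

Running the sketch prints `profile 1g.5gb -> ID 19`, matching the ID column of the sample table.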